This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
threads 1.61
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
125/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
129 OP(rn) == EVAL || \
cca55fe3
JP
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 134)
ee9b8eae 135#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 136
ee9b8eae
YO
137#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
138
139#if 0
140/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
145
146#else
147/* ... so we use this as its faster. */
148#define IS_TEXT(rn) ( OP(rn)==EXACT )
149#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
151
152#endif
e2d8ce26 153
a84d97b6
HS
154/*
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157*/
cca55fe3 158#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 162 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 163 else if (type == PLUS) \
cca55fe3 164 rn = NEXTOPER(rn); \
3dab1dad 165 else if (type == IFMATCH) \
a84d97b6 166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 167 else rn += NEXT_OFF(rn); \
3dab1dad 168 } \
5f80c4cf 169} STMT_END
74750237 170
c476f425 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
97aff369 177 dVAR;
a3b680e6 178 const int retval = PL_savestack_ix;
b1ce53c5 179#define REGCP_PAREN_ELEMS 4
a3b680e6 180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 181 int p;
40a82448 182 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 183
e49a9654
IH
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
186
6bda09f9 187#define REGCP_OTHER_ELEMS 8
4b3c1a47 188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 189
3280af22 190 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 191/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
3280af22 194 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 195 SSPUSHINT(p);
e7707071 196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regendp[p]
201 ));
a0d0e21e 202 }
b1ce53c5 203/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
3280af22
NIS
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
a01268b5 208 SSPUSHINT(*PL_reglastcloseparen);
3280af22 209 SSPUSHPTR(PL_reginput);
41123dfd
JH
210#define REGCP_FRAME_ELEMS 2
211/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 215
a0d0e21e
LW
216 return retval;
217}
218
c277df42 219/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
220#define REGCP_SET(cp) \
221 DEBUG_STATE_r( \
ab3bbdeb 222 PerlIO_printf(Perl_debug_log, \
e4f74956 223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
224 (IV)PL_savestack_ix)); \
225 cp = PL_savestack_ix
c3464db5 226
ab3bbdeb 227#define REGCP_UNWIND(cp) \
e4f74956 228 DEBUG_STATE_r( \
ab3bbdeb 229 if (cp != PL_savestack_ix) \
e4f74956
YO
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
232 (IV)(cp), (IV)PL_savestack_ix)); \
233 regcpblow(cp)
c277df42 234
76e3520e 235STATIC char *
097eb12c 236S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 237{
97aff369 238 dVAR;
3b6647e0 239 U32 i;
a0d0e21e 240 char *input;
b1ce53c5 241
a3621e74
YO
242 GET_RE_DEBUG_FLAGS_DECL;
243
b1ce53c5 244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 245 i = SSPOPINT;
b1ce53c5
JH
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 248 input = (char *) SSPOPPTR;
a01268b5 249 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
6bda09f9
YO
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 254
6bda09f9 255
b1ce53c5 256 /* Now restore the parentheses context. */
41123dfd
JH
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 259 I32 tmps;
097eb12c 260 U32 paren = (U32)SSPOPINT;
3280af22 261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
262 PL_regstartp[paren] = SSPOPINT;
263 tmps = SSPOPINT;
3280af22
NIS
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
e7707071 266 DEBUG_BUFFERS_r(
c3464db5 267 PerlIO_printf(Perl_debug_log,
b900a521 268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 269 (UV)paren, (IV)PL_regstartp[paren],
b900a521 270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 271 (IV)PL_regendp[paren],
3280af22 272 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 273 );
a0d0e21e 274 }
e7707071 275 DEBUG_BUFFERS_r(
bb7a0f54 276 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 277 PerlIO_printf(Perl_debug_log,
faccc32b 278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
280 }
281 );
daf18116 282#if 1
dafc8851
JH
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
292 * --jhi */
3b6647e0 293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c
AL
294 if (i > PL_regsize)
295 PL_regstartp[i] = -1;
296 PL_regendp[i] = -1;
a0d0e21e 297 }
dafc8851 298#endif
a0d0e21e
LW
299 return input;
300}
301
02db2b7b 302#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 303
a687059c 304/*
e50aee73 305 * pregexec and friends
a687059c
LW
306 */
307
76234dfb 308#ifndef PERL_IN_XSUB_RE
a687059c 309/*
c277df42 310 - pregexec - match a regexp against a string
a687059c 311 */
c277df42 312I32
864dbfa3 313Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
315/* strend: pointer to null at end of string */
316/* strbeg: real beginning of string */
317/* minend: end of match must be >=minend after stringarg. */
318/* nosave: For optimizations. */
319{
320 return
9041c2e3 321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
322 nosave ? 0 : REXEC_COPY_STR);
323}
76234dfb 324#endif
22e551b9 325
9041c2e3 326/*
cad2e5aa
JH
327 * Need to implement the following flags for reg_anch:
328 *
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
330 * USE_INTUIT_ML
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
334 * INTUIT_ONCE_ML
335 *
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
338 */
339
340/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
341
3f7c398e 342/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
344
345/* XXXX We assume that strpos is strbeg unless sv. */
346
6eb5f6b9
JH
347/* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
40d049e4
YO
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
355
356*/
6eb5f6b9 357
2c2d71f5
JH
358/* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
30944b6d
IZ
361 regtry() should be needed.
362
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
365 b) Fixed substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 368 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
369 string which does not contradict any of them.
370 */
2c2d71f5 371
6eb5f6b9
JH
372/* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
375
cad2e5aa
JH
376char *
377Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
379{
97aff369 380 dVAR;
b7953727 381 register I32 start_shift = 0;
cad2e5aa 382 /* Should be nonnegative! */
b7953727 383 register I32 end_shift = 0;
2c2d71f5
JH
384 register char *s;
385 register SV *check;
a1933d95 386 char *strbeg;
cad2e5aa 387 char *t;
1de06328 388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 389 I32 ml_anch;
bd61b366
SS
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
bbe252da 392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 393 RXi_GET_DECL(prog,progi);
30944b6d 394#ifdef DEBUGGING
890ce7af 395 const char * const i_strpos = strpos;
30944b6d 396#endif
a3621e74
YO
397
398 GET_RE_DEBUG_FLAGS_DECL;
399
a30b2f1f 400 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 401
bbe252da 402 if (prog->extflags & RXf_UTF8) {
b8d68ded
JH
403 PL_reg_flags |= RF_utf8;
404 }
ab3bbdeb
YO
405 DEBUG_EXECUTE_r(
406 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
2a782b5b 409 );
cad2e5aa 410
c344f387
JH
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 414 "String too short... [re_intuit_start]\n"));
cad2e5aa 415 goto fail;
2c2d71f5 416 }
1de06328 417
a1933d95 418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 419 PL_regeol = strend;
33b8afdf
JH
420 if (do_utf8) {
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
424 } else {
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
428 }
1de06328 429 if (check == &PL_sv_undef) {
a3621e74 430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 431 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
432 goto fail;
433 }
bbe252da
YO
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 437 && !multiline ) ); /* Check after \n? */
cad2e5aa 438
7e25d62c 439 if (!ml_anch) {
bbe252da
YO
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
443 && sv && !SvROK(sv)
444 && (strpos != strbeg)) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
446 goto fail;
447 }
448 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 449 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 450 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
451 I32 slen;
452
1aa99e6b 453 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 454
653099ff
GS
455 if (SvTAIL(check)) {
456 slen = SvCUR(check); /* >= 1 */
cad2e5aa 457
9041c2e3 458 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 459 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 461 goto fail_finish;
cad2e5aa
JH
462 }
463 /* Now should match s[0..slen-2] */
464 slen--;
3f7c398e 465 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 466 || (slen > 1
3f7c398e 467 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 468 report_neq:
a3621e74 469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
470 goto fail_finish;
471 }
cad2e5aa 472 }
3f7c398e 473 else if (*SvPVX_const(check) != *s
653099ff 474 || ((slen = SvCUR(check)) > 1
3f7c398e 475 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 476 goto report_neq;
c315bfe8 477 check_at = s;
2c2d71f5 478 goto success_at_start;
7e25d62c 479 }
cad2e5aa 480 }
2c2d71f5 481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 482 s = strpos;
2c2d71f5 483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
484 end_shift = prog->check_end_shift;
485
2c2d71f5 486 if (!ml_anch) {
a3b680e6 487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 488 - (SvTAIL(check) != 0);
a3b680e6 489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
490
491 if (end_shift < eshift)
492 end_shift = eshift;
493 }
cad2e5aa 494 }
2c2d71f5 495 else { /* Can match at random position */
cad2e5aa
JH
496 ml_anch = 0;
497 s = strpos;
1de06328
YO
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
500
501 /* end shift should be non negative here */
cad2e5aa
JH
502 }
503
bcdf7404 504#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 505 if (end_shift < 0)
1de06328 506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 507 (IV)end_shift, prog->precomp);
2c2d71f5
JH
508#endif
509
2c2d71f5
JH
510 restart:
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
513
514 {
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
520 }
6bda09f9 521 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
525 (IV)srch_end_shift,
526 (IV)prog->check_end_shift);
527 });
528
cad2e5aa 529 if (flags & REXEC_SCREAM) {
cad2e5aa 530 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 531 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 532
2c2d71f5
JH
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
85c508c3 535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 536 && SvTAIL(check) ))
9041c2e3 537 s = screaminstr(sv, check,
1de06328 538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 539 else
2c2d71f5 540 goto fail_finish;
4addbd3b
HS
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
3f7c398e 543 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
1de06328
YO
547 else {
548 U8* start_point;
549 U8* end_point;
bbe252da 550 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
553 } else {
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
556 }
6bda09f9 557 DEBUG_OPTIMISE_MORE_r({
56570a2c 558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 559 (int)(end_point - start_point),
fc8cd66c 560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
561 start_point);
562 });
563
564 s = fbm_instr( start_point, end_point,
7fba1cd6 565 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
566 }
567 }
cad2e5aa
JH
568 /* Update the count-of-usability, remove useless subpatterns,
569 unshift s. */
2c2d71f5 570
ab3bbdeb
YO
571 DEBUG_EXECUTE_r({
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 575 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
578 quoted,
579 RE_SV_TAIL(check),
580 (s ? " at offset " : "...\n") );
581 });
2c2d71f5
JH
582
583 if (!s)
584 goto fail_finish;
2c2d71f5 585 /* Finish the diagnostic message */
a3621e74 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 587
1de06328
YO
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
591 point. I think. :-(
592 */
593
594
595
596 check_at=s;
597
598
2c2d71f5
JH
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
605 */
606
1de06328
YO
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
609 {
30944b6d 610 /* Take into account the "other" substring. */
2c2d71f5
JH
611 /* XXXX May be hopelessly wrong for UTF... */
612 if (!other_last)
6eb5f6b9 613 other_last = strpos;
33b8afdf 614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
615 do_other_anchored:
616 {
890ce7af
AL
617 char * const last = HOP3c(s, -start_shift, strbeg);
618 char *last1, *last2;
be8e71aa 619 char * const saved_s = s;
33b8afdf 620 SV* must;
2c2d71f5 621
2c2d71f5
JH
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 624 && (!do_utf8
0ce71af7 625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 626 && t > strpos)))
6f207bd3 627 NOOP;
2c2d71f5
JH
628 else
629 t = strpos;
1aa99e6b 630 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
631 if (t < other_last) /* These positions already checked */
632 t = other_last;
1aa99e6b 633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
634 if (last < last1)
635 last1 = last;
1de06328
YO
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
639 dmq.
640 */
641
2c2d71f5 642 /* On end-of-str: see comment below. */
33b8afdf
JH
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
645 s = (char*)NULL;
1de06328 646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
647 }
648 else
649 s = fbm_instr(
650 (unsigned char*)t,
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
653 must,
7fba1cd6 654 multiline ? FBMrf_MULTILINE : 0
33b8afdf 655 );
ab3bbdeb
YO
656 DEBUG_EXECUTE_r({
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 660 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
661 quoted, RE_SV_TAIL(must));
662 });
663
664
2c2d71f5
JH
665 if (!s) {
666 if (last1 >= last2) {
a3621e74 667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
668 ", giving up...\n"));
669 goto fail_finish;
670 }
a3621e74 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 672 ", trying floating at offset %ld...\n",
be8e71aa 673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
2c2d71f5
JH
676 goto restart;
677 }
678 else {
a3621e74 679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 680 (long)(s - i_strpos)));
1aa99e6b
IH
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
be8e71aa 683 s = saved_s;
2c2d71f5
JH
684 if (t == strpos)
685 goto try_at_start;
2c2d71f5
JH
686 goto try_at_offset;
687 }
30944b6d 688 }
2c2d71f5
JH
689 }
690 else { /* Take into account the floating substring. */
33b8afdf 691 char *last, *last1;
be8e71aa 692 char * const saved_s = s;
33b8afdf
JH
693 SV* must;
694
695 t = HOP3c(s, -start_shift, strbeg);
696 last1 = last =
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
701 if (s < other_last)
702 s = other_last;
2c2d71f5 703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
709 s = (char*)NULL;
1de06328 710 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
711 }
712 else
2c2d71f5 713 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
714 (unsigned char*)last + SvCUR(must)
715 - (SvTAIL(must)!=0),
7fba1cd6 716 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
717 DEBUG_EXECUTE_r({
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 721 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
722 quoted, RE_SV_TAIL(must));
723 });
33b8afdf
JH
724 if (!s) {
725 if (last1 == last) {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
727 ", giving up...\n"));
728 goto fail_finish;
2c2d71f5 729 }
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 731 ", trying anchored starting at offset %ld...\n",
be8e71aa 732 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
733 other_last = last;
734 s = HOP3c(t, 1, strend);
735 goto restart;
736 }
737 else {
a3621e74 738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
be8e71aa 741 s = saved_s;
33b8afdf
JH
742 if (t == strpos)
743 goto try_at_start;
744 goto try_at_offset;
745 }
2c2d71f5 746 }
cad2e5aa 747 }
2c2d71f5 748
1de06328 749
9ef43ace 750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 751
6bda09f9 752 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
757 (IV)(s-strpos),
758 (IV)(t-strpos),
759 (IV)(t-s),
760 (IV)(strend-strpos)
761 )
762 );
763
2c2d71f5 764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 765 && (!do_utf8
9ef43ace 766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
767 && t > strpos)))
768 {
2c2d71f5
JH
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
771 try_at_offset:
cad2e5aa 772 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
779 find_anchor:
2c2d71f5 780 while (t < strend - prog->minlen) {
cad2e5aa 781 if (*t == '\n') {
4ee3650e 782 if (t < check_at - prog->check_offset_min) {
33b8afdf 783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
30944b6d
IZ
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
790 */
9041c2e3 791 strpos = t + 1;
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
794 goto do_other_anchored;
795 }
4ee3650e
GS
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
cad2e5aa 798 s = t + 1;
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
801 goto set_useful;
802 }
4ee3650e
GS
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
a3621e74 806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 808 other_last = strpos = s = t + 1;
cad2e5aa
JH
809 goto restart;
810 }
811 t++;
812 }
a3621e74 813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 814 PL_colors[0], PL_colors[1]));
2c2d71f5 815 goto fail_finish;
cad2e5aa 816 }
f5952150 817 else {
a3621e74 818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 819 PL_colors[0], PL_colors[1]));
f5952150 820 }
cad2e5aa
JH
821 s = t;
822 set_useful:
33b8afdf 823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
824 }
825 else {
f5952150 826 /* The found string does not prohibit matching at strpos,
2c2d71f5 827 - no optimization of calling REx engine can be performed,
f5952150
GS
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
2c2d71f5
JH
830 try_at_start:
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
05b4157f 833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 834 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 835 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 836 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 837 {
cad2e5aa
JH
838 t = strpos;
839 goto find_anchor;
840 }
a3621e74 841 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 844 );
2c2d71f5 845 success_at_start:
bbe252da 846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
847 && (do_utf8 ? (
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
851 ) : (
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
855 )))
66e933ab 856 {
cad2e5aa 857 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
cad2e5aa 865 s = strpos;
3cf5c195
IZ
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
6eb5f6b9 868 other heuristics. */
bbe252da 869 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
870 }
871 else
872 s = strpos;
873 }
874
6eb5f6b9
JH
875 /* Last resort... */
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
f8fc2ecf 879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 891 : 1);
1de06328
YO
892 char * endpos;
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
897 else
898 endpos= strend;
899
70685ca0
JH
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 902
6eb5f6b9 903 t = s;
f8fc2ecf 904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
905 if (!s) {
906#ifdef DEBUGGING
cbbf8932 907 const char *what = NULL;
6eb5f6b9
JH
908#endif
909 if (endpos == strend) {
a3621e74 910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
911 "Could not match STCLASS...\n") );
912 goto fail;
913 }
a3621e74 914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 915 "This position contradicts STCLASS...\n") );
bbe252da 916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 917 goto fail;
6eb5f6b9 918 /* Contradict one of substrings */
33b8afdf
JH
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 921 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 922 hop_and_restart:
1aa99e6b 923 s = HOP3c(t, 1, strend);
66e933ab
GS
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
a3621e74 926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
927 "Could not match STCLASS...\n") );
928 goto fail;
929 }
5e39e1e5
HS
930 if (!check)
931 goto giveup;
a3621e74 932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 933 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
934 what, (long)(s + start_shift - i_strpos)) );
935 goto restart;
936 }
66e933ab 937 /* Have both, check_string is floating */
6eb5f6b9
JH
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
9041c2e3 941 s = check_at;
5e39e1e5
HS
942 if (!check)
943 goto giveup;
a3621e74 944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 945 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
948 }
60e71179
GS
949 /* Another way we could have checked stclass at the
950 current position only: */
951 if (ml_anch) {
952 s = t = t + 1;
5e39e1e5
HS
953 if (!check)
954 goto giveup;
a3621e74 955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 956 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 958 goto try_at_offset;
66e933ab 959 }
33b8afdf 960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 961 goto fail;
6eb5f6b9
JH
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
a3621e74 965 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
966 goto hop_and_restart;
967 }
b7953727 968 if (t != s) {
a3621e74 969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 970 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
971 (long)(t - i_strpos), (long)(s - i_strpos))
972 );
973 }
974 else {
a3621e74 975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
976 "Does not contradict STCLASS...\n");
977 );
978 }
6eb5f6b9 979 }
5e39e1e5 980 giveup:
a3621e74 981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 984 return s;
2c2d71f5
JH
985
986 fail_finish: /* Substring not found */
33b8afdf
JH
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 989 fail:
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 991 PL_colors[4], PL_colors[5]));
bd61b366 992 return NULL;
cad2e5aa 993}
9661b544 994
786e8c11 995
3b0527fe 996
55eed653
NC
997#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1003 foldlen -= len; \
1004 uscan += len; \
1005 len=0; \
1006 } else { \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1011 } \
1012 break; \
1013 case trie_utf8: \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1015 break; \
1016 case trie_plain: \
1017 uvc = (UV)*uc; \
1018 len = 1; \
1019 } \
1020 \
1021 if (uvc < 256) { \
1022 charid = trie->charmap[ uvc ]; \
1023 } \
1024 else { \
1025 charid = 0; \
55eed653
NC
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1028 (char*)&uvc, sizeof(UV), 0); \
1029 if (svpp) \
1030 charid = (U16)SvIV(*svpp); \
1031 } \
1032 } \
1033} STMT_END
1034
1035#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1036 if ( (CoNd) \
1037 && (ln == len || \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
24b23f37 1040 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1041 goto got_it; \
1042 else { \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1046 if ( f != c \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1051 m, \
1052 NULL, ln, (bool)UTF)) \
24b23f37 1053 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1054 goto got_it; \
1055 } \
1056 s += len
1057
1058#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1059STMT_START { \
1060 while (s <= e) { \
1061 if ( (CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1063 ? ibcmp(s, m, ln) \
1064 : ibcmp_locale(s, m, ln))) \
24b23f37 1065 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1066 goto got_it; \
1067 s++; \
1068 } \
1069} STMT_END
1070
1071#define REXEC_FBC_UTF8_SCAN(CoDe) \
1072STMT_START { \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1074 CoDe \
1075 s += uskip; \
1076 } \
1077} STMT_END
1078
1079#define REXEC_FBC_SCAN(CoDe) \
1080STMT_START { \
1081 while (s < strend) { \
1082 CoDe \
1083 s++; \
1084 } \
1085} STMT_END
1086
1087#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088REXEC_FBC_UTF8_SCAN( \
1089 if (CoNd) { \
24b23f37 1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1091 goto got_it; \
1092 else \
1093 tmp = doevery; \
1094 } \
1095 else \
1096 tmp = 1; \
1097)
1098
1099#define REXEC_FBC_CLASS_SCAN(CoNd) \
1100REXEC_FBC_SCAN( \
1101 if (CoNd) { \
24b23f37 1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1103 goto got_it; \
1104 else \
1105 tmp = doevery; \
1106 } \
1107 else \
1108 tmp = 1; \
1109)
1110
1111#define REXEC_FBC_TRYIT \
24b23f37 1112if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1113 goto got_it
1114
1115#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1116 if (do_utf8) { \
1117 UtFpReLoAd; \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1119 } \
1120 else { \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1122 } \
1123 break
1124
1125#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1127 if (do_utf8) { \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1129 } \
1130 else { \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1132 } \
1133 break
1134
786e8c11
YO
1135#define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1137
1138/* We know what class REx starts with. Try to find this position... */
1139/* if reginfo is NULL, its a dryrun */
1140/* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1142
3c3eec57 1143STATIC char *
07be1b83 1144S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1145 const char *strend, regmatch_info *reginfo)
a687059c 1146{
27da23d5 1147 dVAR;
bbe252da 1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1149 char *m;
d8093b23 1150 STRLEN ln;
5dab1207 1151 STRLEN lnc;
078c425b 1152 register STRLEN uskip;
d8093b23
G
1153 unsigned int c1;
1154 unsigned int c2;
6eb5f6b9
JH
1155 char *e;
1156 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1157 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf
YO
1158 RXi_GET_DECL(prog,progi);
1159
6eb5f6b9
JH
1160 /* We know what class it must start with. */
1161 switch (OP(c)) {
6eb5f6b9 1162 case ANYOF:
388cc4de 1163 if (do_utf8) {
4cadc6a9 1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1167 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1168 }
1169 else {
1170 while (s < strend) {
1171 STRLEN skip = 1;
1172
32fc9b6a 1173 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
24b23f37 1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1179 goto got_it;
1180 else
1181 tmp = doevery;
1182 }
1183 else
1184 tmp = 1;
1185 s += skip;
1186 }
a0d0e21e 1187 }
6eb5f6b9 1188 break;
f33976b4 1189 case CANY:
4cadc6a9 1190 REXEC_FBC_SCAN(
24b23f37 1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1192 goto got_it;
1193 else
1194 tmp = doevery;
4cadc6a9 1195 );
f33976b4 1196 break;
6eb5f6b9 1197 case EXACTF:
5dab1207
NIS
1198 m = STRING(c);
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1201 if (UTF) {
a2a2844f 1202 STRLEN ulen1, ulen2;
5dab1207 1203 U8 *sm = (U8 *) m;
89ebb4a3
JH
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1207
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1210
89ebb4a3 1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1212 0, uniflags);
89ebb4a3 1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1214 0, uniflags);
5dab1207
NIS
1215 lnc = 0;
1216 while (sm < ((U8 *) m + ln)) {
1217 lnc++;
1218 sm += UTF8SKIP(sm);
1219 }
1aa99e6b
IH
1220 }
1221 else {
1222 c1 = *(U8*)m;
1223 c2 = PL_fold[c1];
1224 }
6eb5f6b9
JH
1225 goto do_exactf;
1226 case EXACTFL:
5dab1207
NIS
1227 m = STRING(c);
1228 ln = STR_LEN(c);
1229 lnc = (I32) ln;
d8093b23 1230 c1 = *(U8*)m;
6eb5f6b9
JH
1231 c2 = PL_fold_locale[c1];
1232 do_exactf:
db12adc6 1233 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1234
3b0527fe 1235 if (!reginfo && e < s)
6eb5f6b9 1236 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1237
60a8b682
JH
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
7f16dd3d
JH
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
60a8b682 1248
1aa99e6b 1249 if (do_utf8) {
575cac57 1250 UV c, f;
89ebb4a3 1251 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1252 STRLEN len, foldlen;
4ad0818d 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1254 if (c1 == c2) {
5dab1207
NIS
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1aa99e6b 1257 while (s <= e) {
89ebb4a3 1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1259 uniflags);
4cadc6a9 1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1261 }
09091399
JH
1262 }
1263 else {
1aa99e6b 1264 while (s <= e) {
89ebb4a3 1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1266 uniflags);
80aecb99 1267
60a8b682 1268 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
880bd946
JH
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1278
4cadc6a9 1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1280 }
09091399 1281 }
1aa99e6b
IH
1282 }
1283 else {
1284 if (c1 == c2)
4cadc6a9 1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1286 else
4cadc6a9 1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1288 }
1289 break;
bbce6d69 1290 case BOUNDL:
3280af22 1291 PL_reg_flags |= RF_tainted;
bbce6d69 1292 /* FALL THROUGH */
a0d0e21e 1293 case BOUND:
ffc61ed2 1294 if (do_utf8) {
12d33761 1295 if (s == PL_bostr)
ffc61ed2
JH
1296 tmp = '\n';
1297 else {
6136c704 1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1300 }
1301 tmp = ((OP(c) == BOUND ?
9041c2e3 1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1303 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1304 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1305 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1307 isALNUM_LC_utf8((U8*)s)))
1308 {
1309 tmp = !tmp;
4cadc6a9 1310 REXEC_FBC_TRYIT;
a687059c 1311 }
4cadc6a9 1312 );
a0d0e21e 1313 }
667bb95a 1314 else {
12d33761 1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1317 REXEC_FBC_SCAN(
ffc61ed2
JH
1318 if (tmp ==
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1320 tmp = !tmp;
4cadc6a9 1321 REXEC_FBC_TRYIT;
a0ed51b3 1322 }
4cadc6a9 1323 );
a0ed51b3 1324 }
24b23f37 1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1326 goto got_it;
1327 break;
bbce6d69 1328 case NBOUNDL:
3280af22 1329 PL_reg_flags |= RF_tainted;
bbce6d69 1330 /* FALL THROUGH */
a0d0e21e 1331 case NBOUND:
ffc61ed2 1332 if (do_utf8) {
12d33761 1333 if (s == PL_bostr)
ffc61ed2
JH
1334 tmp = '\n';
1335 else {
6136c704 1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1338 }
1339 tmp = ((OP(c) == NBOUND ?
9041c2e3 1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1341 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1342 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1343 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1345 isALNUM_LC_utf8((U8*)s)))
1346 tmp = !tmp;
4cadc6a9
YO
1347 else REXEC_FBC_TRYIT;
1348 );
a0d0e21e 1349 }
667bb95a 1350 else {
12d33761 1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1354 REXEC_FBC_SCAN(
ffc61ed2
JH
1355 if (tmp ==
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1357 tmp = !tmp;
4cadc6a9
YO
1358 else REXEC_FBC_TRYIT;
1359 );
a0ed51b3 1360 }
24b23f37 1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1362 goto got_it;
1363 break;
a0d0e21e 1364 case ALNUM:
4cadc6a9
YO
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1368 isALNUM(*s)
1369 );
bbce6d69 1370 case ALNUML:
4cadc6a9
YO
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1373 isALNUM_LC(*s)
1374 );
a0d0e21e 1375 case NALNUM:
4cadc6a9
YO
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1379 !isALNUM(*s)
1380 );
bbce6d69 1381 case NALNUML:
4cadc6a9
YO
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1384 !isALNUM_LC(*s)
1385 );
a0d0e21e 1386 case SPACE:
4cadc6a9
YO
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1390 isSPACE(*s)
1391 );
bbce6d69 1392 case SPACEL:
4cadc6a9
YO
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1395 isSPACE_LC(*s)
1396 );
a0d0e21e 1397 case NSPACE:
4cadc6a9
YO
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1401 !isSPACE(*s)
1402 );
bbce6d69 1403 case NSPACEL:
4cadc6a9
YO
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1406 !isSPACE_LC(*s)
1407 );
a0d0e21e 1408 case DIGIT:
4cadc6a9
YO
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1412 isDIGIT(*s)
1413 );
b8c5462f 1414 case DIGITL:
4cadc6a9
YO
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1417 isDIGIT_LC(*s)
1418 );
a0d0e21e 1419 case NDIGIT:
4cadc6a9
YO
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1423 !isDIGIT(*s)
1424 );
b8c5462f 1425 case NDIGITL:
4cadc6a9
YO
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1428 !isDIGIT_LC(*s)
1429 );
1de06328
YO
1430 case AHOCORASICKC:
1431 case AHOCORASICK:
07be1b83
YO
1432 {
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1436 : trie_plain;
1437 /* what trie are we using right now */
1438 reg_ac_data *aho
f8fc2ecf 1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1440 reg_trie_data *trie
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
55eed653 1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
07be1b83
YO
1443
1444 const char *last_start = strend - trie->minlen;
6148ee25 1445#ifdef DEBUGGING
07be1b83 1446 const char *real_start = s;
6148ee25 1447#endif
07be1b83 1448 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1449 SV *sv_points;
1450 U8 **points; /* map of where we were in the input string
786e8c11 1451 when reading a given char. For ASCII this
be8e71aa
YO
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
f9e705e8 1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1456 U8 *bitmap=NULL;
1457
07be1b83
YO
1458
1459 GET_RE_DEBUG_FLAGS_DECL;
1460
be8e71aa
YO
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1464 ENTER;
1465 SAVETMPS;
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1474 {
786e8c11
YO
1475 if (trie->bitmap)
1476 bitmap=(U8*)trie->bitmap;
1477 else
1478 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1479 }
786e8c11
YO
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1486 starting char.
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1494 if we have not.
1495
1496 */
07be1b83
YO
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1499 U8 *uc = (U8*)s;
1500 U16 charid = 0;
1501 U32 base = 1;
1502 U32 state = 1;
1503 UV uvc = 0;
1504 STRLEN len = 0;
1505 STRLEN foldlen = 0;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
786e8c11
YO
1508#ifdef DEBUGGING
1509 U32 accepted_word= 0;
1510#endif
07be1b83
YO
1511 U32 pointpos = 0;
1512
1513 while ( state && uc <= (U8*)strend ) {
1514 int failed=0;
786e8c11
YO
1515 U32 word = aho->states[ state ].wordnum;
1516
1de06328
YO
1517 if( state==1 ) {
1518 if ( bitmap ) {
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1525 }
1526 );
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1528 uc++;
786e8c11 1529 }
1de06328 1530 s= (char *)uc;
786e8c11 1531 }
786e8c11
YO
1532 if (uc >(U8*)last_start) break;
1533 }
1534
1535 if ( word ) {
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
07be1b83 1539 leftmost= lpos;
786e8c11 1540 }
07be1b83 1541 if (base==0) break;
786e8c11 1542
07be1b83
YO
1543 }
1544 points[pointpos++ % maxlen]= uc;
55eed653
NC
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1547 foldbuf, uniflags);
786e8c11
YO
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1550 s, do_utf8 );
07be1b83 1551 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1552 " Charid:%3u CP:%4"UVxf" ",
1553 charid, uvc);
1554 });
07be1b83
YO
1555
1556 do {
6148ee25 1557#ifdef DEBUGGING
786e8c11 1558 word = aho->states[ state ].wordnum;
6148ee25 1559#endif
07be1b83
YO
1560 base = aho->states[ state ].trans.base;
1561
786e8c11
YO
1562 DEBUG_TRIE_EXECUTE_r({
1563 if (failed)
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1565 s, do_utf8 );
07be1b83 1566 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1570 });
07be1b83
YO
1571 if ( base ) {
1572 U32 tmp;
1573 if (charid &&
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1576 < trie->lasttrans)
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1581 {
786e8c11
YO
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1584 state = tmp;
1585 break;
1586 }
1587 else {
786e8c11
YO
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1590 failed = 1;
1591 state = aho->fail[state];
07be1b83
YO
1592 }
1593 }
1594 else {
1595 /* we must be accepting here */
786e8c11
YO
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1598 failed = 1;
07be1b83
YO
1599 break;
1600 }
1601 } while(state);
786e8c11 1602 uc += len;
07be1b83
YO
1603 if (failed) {
1604 if (leftmost)
1605 break;
786e8c11 1606 if (!state) state = 1;
07be1b83
YO
1607 }
1608 }
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1613 leftmost = lpos;
786e8c11 1614 }
07be1b83 1615 }
07be1b83
YO
1616 if (leftmost) {
1617 s = (char*)leftmost;
786e8c11
YO
1618 DEBUG_TRIE_EXECUTE_r({
1619 PerlIO_printf(
70685ca0
JH
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1622 );
1623 });
24b23f37 1624 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1625 FREETMPS;
1626 LEAVE;
07be1b83 1627 goto got_it;
be8e71aa 1628 }
07be1b83 1629 s = HOPc(s,1);
786e8c11
YO
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1632 });
07be1b83 1633 } else {
786e8c11
YO
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1636 break;
1637 }
1638 }
be8e71aa
YO
1639 FREETMPS;
1640 LEAVE;
07be1b83
YO
1641 }
1642 break;
b3c9acc1 1643 default:
3c3eec57
GS
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1645 break;
d6a28714 1646 }
6eb5f6b9
JH
1647 return 0;
1648 got_it:
1649 return s;
1650}
1651
e6aad7ab 1652static void
fae667d5
YO
1653S_swap_match_buff (pTHX_ regexp *prog) {
1654 I32 *t;
1655 RXi_GET_DECL(prog,progi);
1656
1657 if (!progi->swap) {
1658 /* We have to be careful. If the previous successful match
1659 was from this regex we don't want a subsequent paritally
1660 successful match to clobber the old results.
1661 So when we detect this possibility we add a swap buffer
1662 to the re, and switch the buffer each match. If we fail
1663 we switch it back, otherwise we leave it swapped.
1664 */
1665 Newxz(progi->swap, 1, regexp_paren_ofs);
1666 /* no need to copy these */
1667 Newxz(progi->swap->startp, prog->nparens + 1, I32);
1668 Newxz(progi->swap->endp, prog->nparens + 1, I32);
1669 }
1670 t = progi->swap->startp;
1671 progi->swap->startp = prog->startp;
1672 prog->startp = t;
1673 t = progi->swap->endp;
1674 progi->swap->endp = prog->endp;
1675 prog->endp = t;
1676}
1677
1678
6eb5f6b9
JH
1679/*
1680 - regexec_flags - match a regexp against a string
1681 */
1682I32
1683Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1684 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1685/* strend: pointer to null at end of string */
1686/* strbeg: real beginning of string */
1687/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1688/* data: May be used for some additional optimizations.
1689 Currently its only used, with a U32 cast, for transmitting
1690 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1691/* nosave: For optimizations. */
1692{
97aff369 1693 dVAR;
24b23f37 1694 /*register*/ char *s;
6eb5f6b9 1695 register regnode *c;
24b23f37 1696 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1697 I32 minlen; /* must match at least this many chars */
1698 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1699 I32 end_shift = 0; /* Same for the end. */ /* CC */
1700 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1701 char *scream_olds = NULL;
3dab1dad 1702 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1703 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1704 I32 multiline;
f8fc2ecf 1705 RXi_GET_DECL(prog,progi);
3b0527fe 1706 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1707 bool swap_on_fail = 0;
a3621e74
YO
1708
1709 GET_RE_DEBUG_FLAGS_DECL;
1710
9d4ba2ae 1711 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1712
1713 /* Be paranoid... */
1714 if (prog == NULL || startpos == NULL) {
1715 Perl_croak(aTHX_ "NULL regexp parameter");
1716 return 0;
1717 }
1718
bbe252da 1719 multiline = prog->extflags & RXf_PMf_MULTILINE;
3b0527fe 1720 reginfo.prog = prog;
2757e526 1721
bac06658 1722 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1723 DEBUG_EXECUTE_r(
1724 debug_start_match(prog, do_utf8, startpos, strend,
1725 "Matching");
1726 );
bac06658 1727
6eb5f6b9 1728 minlen = prog->minlen;
1de06328
YO
1729
1730 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1732 "String too short [regexec_flags]...\n"));
1733 goto phooey;
1aa99e6b 1734 }
6eb5f6b9 1735
1de06328 1736
6eb5f6b9 1737 /* Check validity of program. */
f8fc2ecf 1738 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1739 Perl_croak(aTHX_ "corrupted regexp program");
1740 }
1741
1742 PL_reg_flags = 0;
1743 PL_reg_eval_set = 0;
1744 PL_reg_maxiter = 0;
1745
bbe252da 1746 if (prog->extflags & RXf_UTF8)
6eb5f6b9
JH
1747 PL_reg_flags |= RF_utf8;
1748
1749 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1750 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1751 PL_bostr = strbeg;
3b0527fe 1752 reginfo.sv = sv;
6eb5f6b9
JH
1753
1754 /* Mark end of line for $ (and such) */
1755 PL_regeol = strend;
1756
1757 /* see how far we have to get to not match where we matched before */
3b0527fe 1758 reginfo.till = startpos+minend;
6eb5f6b9 1759
6eb5f6b9
JH
1760 /* If there is a "must appear" string, look for it. */
1761 s = startpos;
1762
bbe252da 1763 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1764 MAGIC *mg;
1765
1766 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1767 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1768 else if (sv && SvTYPE(sv) >= SVt_PVMG
1769 && SvMAGIC(sv)
14befaf4
DM
1770 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1771 && mg->mg_len >= 0) {
3b0527fe 1772 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1773 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1774 if (s > reginfo.ganch)
6eb5f6b9 1775 goto phooey;
58e23c8d 1776 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1777 }
1778 }
58e23c8d 1779 else if (data) {
70685ca0 1780 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1781 } else /* pos() not defined */
3b0527fe 1782 reginfo.ganch = strbeg;
6eb5f6b9 1783 }
c74340f9 1784 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
fae667d5
YO
1785 swap_on_fail = 1;
1786 swap_match_buff(prog); /* do we need a save destructor here for
1787 eval dies? */
c74340f9 1788 }
a0714e2c 1789 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1790 re_scream_pos_data d;
1791
1792 d.scream_olds = &scream_olds;
1793 d.scream_pos = &scream_pos;
1794 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1795 if (!s) {
a3621e74 1796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1797 goto phooey; /* not present */
3fa9c3d7 1798 }
6eb5f6b9
JH
1799 }
1800
1de06328 1801
6eb5f6b9
JH
1802
1803 /* Simplest case: anchored match need be tried only once. */
1804 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1805 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1806 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1807 goto got_it;
bbe252da
YO
1808 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1809 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1810 {
1811 char *end;
1812
1813 if (minlen)
1814 dontbother = minlen - 1;
1aa99e6b 1815 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1816 /* for multiline we only have to try after newlines */
33b8afdf 1817 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1818 if (s == startpos)
1819 goto after_try;
1820 while (1) {
24b23f37 1821 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1822 goto got_it;
1823 after_try:
1824 if (s >= end)
1825 goto phooey;
bbe252da 1826 if (prog->extflags & RXf_USE_INTUIT) {
6eb5f6b9
JH
1827 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1828 if (!s)
1829 goto phooey;
1830 }
1831 else
1832 s++;
1833 }
1834 } else {
1835 if (s > startpos)
1836 s--;
1837 while (s < end) {
1838 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1839 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1840 goto got_it;
1841 }
1842 }
1843 }
1844 }
1845 goto phooey;
bbe252da 1846 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1847 {
1848 /* the warning about reginfo.ganch being used without intialization
bbe252da 1849 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1850 and we only enter this block when the same bit is set. */
58e23c8d
YO
1851 char *tmp_s = reginfo.ganch - prog->gofs;
1852 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1853 goto got_it;
1854 goto phooey;
1855 }
1856
1857 /* Messy cases: unanchored match. */
bbe252da 1858 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1859 /* we have /x+whatever/ */
1860 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1861 char ch;
bf93d4cc
GS
1862#ifdef DEBUGGING
1863 int did_match = 0;
1864#endif
33b8afdf
JH
1865 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1867 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1868
1aa99e6b 1869 if (do_utf8) {
4cadc6a9 1870 REXEC_FBC_SCAN(
6eb5f6b9 1871 if (*s == ch) {
a3621e74 1872 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1873 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1874 s += UTF8SKIP(s);
1875 while (s < strend && *s == ch)
1876 s += UTF8SKIP(s);
1877 }
4cadc6a9 1878 );
6eb5f6b9
JH
1879 }
1880 else {
4cadc6a9 1881 REXEC_FBC_SCAN(
6eb5f6b9 1882 if (*s == ch) {
a3621e74 1883 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1884 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1885 s++;
1886 while (s < strend && *s == ch)
1887 s++;
1888 }
4cadc6a9 1889 );
6eb5f6b9 1890 }
a3621e74 1891 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1892 PerlIO_printf(Perl_debug_log,
b7953727
JH
1893 "Did not find anchored character...\n")
1894 );
6eb5f6b9 1895 }
a0714e2c
SS
1896 else if (prog->anchored_substr != NULL
1897 || prog->anchored_utf8 != NULL
1898 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1899 && prog->float_max_offset < strend - s)) {
1900 SV *must;
1901 I32 back_max;
1902 I32 back_min;
1903 char *last;
6eb5f6b9 1904 char *last1; /* Last position checked before */
bf93d4cc
GS
1905#ifdef DEBUGGING
1906 int did_match = 0;
1907#endif
33b8afdf
JH
1908 if (prog->anchored_substr || prog->anchored_utf8) {
1909 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1910 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1911 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1912 back_max = back_min = prog->anchored_offset;
1913 } else {
1914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1915 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1916 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1917 back_max = prog->float_max_offset;
1918 back_min = prog->float_min_offset;
1919 }
1de06328
YO
1920
1921
33b8afdf
JH
1922 if (must == &PL_sv_undef)
1923 /* could not downgrade utf8 check substring, so must fail */
1924 goto phooey;
1925
1de06328
YO
1926 if (back_min<0) {
1927 last = strend;
1928 } else {
1929 last = HOP3c(strend, /* Cannot start after this */
1930 -(I32)(CHR_SVLEN(must)
1931 - (SvTAIL(must) != 0) + back_min), strbeg);
1932 }
6eb5f6b9
JH
1933 if (s > PL_bostr)
1934 last1 = HOPc(s, -1);
1935 else
1936 last1 = s - 1; /* bogus */
1937
a0288114 1938 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1939 check_substr==must. */
1940 scream_pos = -1;
1941 dontbother = end_shift;
1942 strend = HOPc(strend, -dontbother);
1943 while ( (s <= last) &&
9041c2e3 1944 ((flags & REXEC_SCREAM)
1de06328 1945 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1946 end_shift, &scream_pos, 0))
1de06328 1947 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1948 (unsigned char*)strend, must,
7fba1cd6 1949 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1950 /* we may be pointing at the wrong string */
1951 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1952 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1953 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1954 if (HOPc(s, -back_max) > last1) {
1955 last1 = HOPc(s, -back_min);
1956 s = HOPc(s, -back_max);
1957 }
1958 else {
52657f30 1959 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1960
1961 last1 = HOPc(s, -back_min);
52657f30 1962 s = t;
6eb5f6b9 1963 }
1aa99e6b 1964 if (do_utf8) {
6eb5f6b9 1965 while (s <= last1) {
24b23f37 1966 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1967 goto got_it;
1968 s += UTF8SKIP(s);
1969 }
1970 }
1971 else {
1972 while (s <= last1) {
24b23f37 1973 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1974 goto got_it;
1975 s++;
1976 }
1977 }
1978 }
ab3bbdeb
YO
1979 DEBUG_EXECUTE_r(if (!did_match) {
1980 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1981 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1982 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1983 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1984 ? "anchored" : "floating"),
ab3bbdeb
YO
1985 quoted, RE_SV_TAIL(must));
1986 });
6eb5f6b9
JH
1987 goto phooey;
1988 }
f8fc2ecf 1989 else if ( (c = progi->regstclass) ) {
f14c76ed 1990 if (minlen) {
f8fc2ecf 1991 const OPCODE op = OP(progi->regstclass);
66e933ab 1992 /* don't bother with what can't match */
786e8c11 1993 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1994 strend = HOPc(strend, -(minlen - 1));
1995 }
a3621e74 1996 DEBUG_EXECUTE_r({
be8e71aa 1997 SV * const prop = sv_newmortal();
32fc9b6a 1998 regprop(prog, prop, c);
0df25f3d 1999 {
02daf0ab 2000 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2001 s,strend-s,60);
0df25f3d 2002 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2003 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2004 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2005 quoted, (int)(strend - s));
0df25f3d 2006 }
ffc61ed2 2007 });
3b0527fe 2008 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2009 goto got_it;
07be1b83 2010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2011 }
2012 else {
2013 dontbother = 0;
a0714e2c 2014 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2015 /* Trim the end. */
d6a28714 2016 char *last;
33b8afdf
JH
2017 SV* float_real;
2018
2019 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2020 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2021 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2022
2023 if (flags & REXEC_SCREAM) {
33b8afdf 2024 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2025 end_shift, &scream_pos, 1); /* last one */
2026 if (!last)
ffc61ed2 2027 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2028 /* we may be pointing at the wrong string */
2029 else if (RX_MATCH_COPIED(prog))
3f7c398e 2030 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2031 }
d6a28714
JH
2032 else {
2033 STRLEN len;
cfd0369c 2034 const char * const little = SvPV_const(float_real, len);
d6a28714 2035
33b8afdf 2036 if (SvTAIL(float_real)) {
d6a28714
JH
2037 if (memEQ(strend - len + 1, little, len - 1))
2038 last = strend - len + 1;
7fba1cd6 2039 else if (!multiline)
9041c2e3 2040 last = memEQ(strend - len, little, len)
bd61b366 2041 ? strend - len : NULL;
b8c5462f 2042 else
d6a28714
JH
2043 goto find_last;
2044 } else {
2045 find_last:
9041c2e3 2046 if (len)
d6a28714 2047 last = rninstr(s, strend, little, little + len);
b8c5462f 2048 else
a0288114 2049 last = strend; /* matching "$" */
b8c5462f 2050 }
b8c5462f 2051 }
bf93d4cc 2052 if (last == NULL) {
6bda09f9
YO
2053 DEBUG_EXECUTE_r(
2054 PerlIO_printf(Perl_debug_log,
2055 "%sCan't trim the tail, match fails (should not happen)%s\n",
2056 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2057 goto phooey; /* Should not happen! */
2058 }
d6a28714
JH
2059 dontbother = strend - last + prog->float_min_offset;
2060 }
2061 if (minlen && (dontbother < minlen))
2062 dontbother = minlen - 1;
2063 strend -= dontbother; /* this one's always in bytes! */
2064 /* We don't know much -- general case. */
1aa99e6b 2065 if (do_utf8) {
d6a28714 2066 for (;;) {
24b23f37 2067 if (regtry(&reginfo, &s))
d6a28714
JH
2068 goto got_it;
2069 if (s >= strend)
2070 break;
b8c5462f 2071 s += UTF8SKIP(s);
d6a28714
JH
2072 };
2073 }
2074 else {
2075 do {
24b23f37 2076 if (regtry(&reginfo, &s))
d6a28714
JH
2077 goto got_it;
2078 } while (s++ < strend);
2079 }
2080 }
2081
2082 /* Failure. */
2083 goto phooey;
2084
2085got_it:
2086 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2087
2088 if (PL_reg_eval_set) {
2089 /* Preserve the current value of $^R */
2090 if (oreplsv != GvSV(PL_replgv))
2091 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2092 restored, the value remains
2093 the same. */
4f639d21 2094 restore_pos(aTHX_ prog);
d6a28714 2095 }
81714fb9
YO
2096 if (prog->paren_names)
2097 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2098
2099 /* make sure $`, $&, $', and $digit will work later */
2100 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2101 RX_MATCH_COPY_FREE(prog);
d6a28714 2102 if (flags & REXEC_COPY_STR) {
be8e71aa 2103 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2104#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2105 if ((SvIsCOW(sv)
2106 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2107 if (DEBUG_C_TEST) {
2108 PerlIO_printf(Perl_debug_log,
2109 "Copy on write: regexp capture, type %d\n",
2110 (int) SvTYPE(sv));
2111 }
2112 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2113 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2114 assert (SvPOKp(prog->saved_copy));
2115 } else
2116#endif
2117 {
2118 RX_MATCH_COPIED_on(prog);
2119 s = savepvn(strbeg, i);
2120 prog->subbeg = s;
2121 }
d6a28714 2122 prog->sublen = i;
d6a28714
JH
2123 }
2124 else {
2125 prog->subbeg = strbeg;
2126 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2127 }
2128 }
9041c2e3 2129
d6a28714
JH
2130 return 1;
2131
2132phooey:
a3621e74 2133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2134 PL_colors[4], PL_colors[5]));
d6a28714 2135 if (PL_reg_eval_set)
4f639d21 2136 restore_pos(aTHX_ prog);
fae667d5 2137 if (swap_on_fail)
c74340f9 2138 /* we failed :-( roll it back */
fae667d5
YO
2139 swap_match_buff(prog);
2140
d6a28714
JH
2141 return 0;
2142}
2143
6bda09f9 2144
d6a28714
JH
2145/*
2146 - regtry - try match at specific point
2147 */
2148STATIC I32 /* 0 failure, 1 success */
24b23f37 2149S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2150{
97aff369 2151 dVAR;
d6a28714
JH
2152 register I32 *sp;
2153 register I32 *ep;
2154 CHECKPOINT lastcp;
3b0527fe 2155 regexp *prog = reginfo->prog;
f8fc2ecf 2156 RXi_GET_DECL(prog,progi);
a3621e74 2157 GET_RE_DEBUG_FLAGS_DECL;
24b23f37 2158 reginfo->cutpoint=NULL;
d6a28714 2159
bbe252da 2160 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2161 MAGIC *mg;
2162
2163 PL_reg_eval_set = RS_init;
a3621e74 2164 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2165 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2166 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2167 ));
ea8d6ae1 2168 SAVESTACK_CXPOS();
d6a28714
JH
2169 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2170 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2171 SAVETMPS;
2172 /* Apparently this is not needed, judging by wantarray. */
e8347627 2173 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2174 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2175
3b0527fe 2176 if (reginfo->sv) {
d6a28714 2177 /* Make $_ available to executed code. */
3b0527fe 2178 if (reginfo->sv != DEFSV) {
59f00321 2179 SAVE_DEFSV;
3b0527fe 2180 DEFSV = reginfo->sv;
b8c5462f 2181 }
d6a28714 2182
3b0527fe
DM
2183 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2184 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2185 /* prepare for quick setting of pos */
d300d9fa 2186#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2187 if (SvIsCOW(reginfo->sv))
2188 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2189#endif
3dab1dad 2190 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2191 &PL_vtbl_mglob, NULL, 0);
d6a28714 2192 mg->mg_len = -1;
b8c5462f 2193 }
d6a28714
JH
2194 PL_reg_magic = mg;
2195 PL_reg_oldpos = mg->mg_len;
4f639d21 2196 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2197 }
09687e5a 2198 if (!PL_reg_curpm) {
a02a5408 2199 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2200#ifdef USE_ITHREADS
2201 {
be8e71aa 2202 SV* const repointer = newSViv(0);
577e12cc 2203 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2204 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2205 av_push(PL_regex_padav,repointer);
2206 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2207 PL_regex_pad = AvARRAY(PL_regex_padav);
2208 }
2209#endif
2210 }
aaa362c4 2211 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2212 PL_reg_oldcurpm = PL_curpm;
2213 PL_curpm = PL_reg_curpm;
2214 if (RX_MATCH_COPIED(prog)) {
2215 /* Here is a serious problem: we cannot rewrite subbeg,
2216 since it may be needed if this match fails. Thus
2217 $` inside (?{}) could fail... */
2218 PL_reg_oldsaved = prog->subbeg;
2219 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2220#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2221 PL_nrs = prog->saved_copy;
2222#endif
d6a28714
JH
2223 RX_MATCH_COPIED_off(prog);
2224 }
2225 else
bd61b366 2226 PL_reg_oldsaved = NULL;
d6a28714
JH
2227 prog->subbeg = PL_bostr;
2228 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2229 }
24b23f37
YO
2230 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2231 prog->startp[0] = *startpos - PL_bostr;
2232 PL_reginput = *startpos;
d6a28714 2233 PL_reglastparen = &prog->lastparen;
a01268b5 2234 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2235 prog->lastparen = 0;
03994de8 2236 prog->lastcloseparen = 0;
d6a28714 2237 PL_regsize = 0;
6bda09f9
YO
2238 PL_regstartp = prog->startp;
2239 PL_regendp = prog->endp;
d6a28714
JH
2240 if (PL_reg_start_tmpl <= prog->nparens) {
2241 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2242 if(PL_reg_start_tmp)
2243 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2244 else
a02a5408 2245 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2246 }
2247
2248 /* XXXX What this code is doing here?!!! There should be no need
2249 to do this again and again, PL_reglastparen should take care of
3dd2943c 2250 this! --ilya*/
dafc8851
JH
2251
2252 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2253 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2254 * PL_reglastparen), is not needed at all by the test suite
2255 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2256 * enough, for building DynaLoader, or otherwise this
2257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2258 * will happen. Meanwhile, this code *is* needed for the
2259 * above-mentioned test suite tests to succeed. The common theme
2260 * on those tests seems to be returning null fields from matches.
2261 * --jhi */
dafc8851 2262#if 1
c74340f9
YO
2263 sp = PL_regstartp;
2264 ep = PL_regendp;
d6a28714 2265 if (prog->nparens) {
097eb12c 2266 register I32 i;
eb160463 2267 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2268 *++sp = -1;
2269 *++ep = -1;
2270 }
2271 }
dafc8851 2272#endif
02db2b7b 2273 REGCP_SET(lastcp);
f8fc2ecf 2274 if (regmatch(reginfo, progi->program + 1)) {
c74340f9 2275 PL_regendp[0] = PL_reginput - PL_bostr;
d6a28714
JH
2276 return 1;
2277 }
24b23f37
YO
2278 if (reginfo->cutpoint)
2279 *startpos= reginfo->cutpoint;
02db2b7b 2280 REGCP_UNWIND(lastcp);
d6a28714
JH
2281 return 0;
2282}
2283
02db2b7b 2284
8ba1375e
MJD
2285#define sayYES goto yes
2286#define sayNO goto no
262b90c4 2287#define sayNO_SILENT goto no_silent
8ba1375e 2288
f9f4320a
YO
2289/* we dont use STMT_START/END here because it leads to
2290 "unreachable code" warnings, which are bogus, but distracting. */
2291#define CACHEsayNO \
c476f425
DM
2292 if (ST.cache_mask) \
2293 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2294 sayNO
3298f257 2295
a3621e74 2296/* this is used to determine how far from the left messages like
265c4333
YO
2297 'failed...' are printed. It should be set such that messages
2298 are inline with the regop output that created them.
a3621e74 2299*/
265c4333 2300#define REPORT_CODE_OFF 32
a3621e74
YO
2301
2302
2303/* Make sure there is a test for this +1 options in re_tests */
2304#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2305
40a82448
DM
2306#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2307#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2308
86545054
DM
2309#define SLAB_FIRST(s) (&(s)->states[0])
2310#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2311
5d9a96ca
DM
2312/* grab a new slab and return the first slot in it */
2313
2314STATIC regmatch_state *
2315S_push_slab(pTHX)
2316{
a35a87e7 2317#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2318 dMY_CXT;
2319#endif
5d9a96ca
DM
2320 regmatch_slab *s = PL_regmatch_slab->next;
2321 if (!s) {
2322 Newx(s, 1, regmatch_slab);
2323 s->prev = PL_regmatch_slab;
2324 s->next = NULL;
2325 PL_regmatch_slab->next = s;
2326 }
2327 PL_regmatch_slab = s;
86545054 2328 return SLAB_FIRST(s);
5d9a96ca 2329}
5b47454d 2330
95b24440 2331
40a82448
DM
2332/* push a new state then goto it */
2333
2334#define PUSH_STATE_GOTO(state, node) \
2335 scan = node; \
2336 st->resume_state = state; \
2337 goto push_state;
2338
2339/* push a new state with success backtracking, then goto it */
2340
2341#define PUSH_YES_STATE_GOTO(state, node) \
2342 scan = node; \
2343 st->resume_state = state; \
2344 goto push_yes_state;
2345
aa283a38 2346
aa283a38 2347
d6a28714 2348/*
95b24440 2349
bf1f174e
DM
2350regmatch() - main matching routine
2351
2352This is basically one big switch statement in a loop. We execute an op,
2353set 'next' to point the next op, and continue. If we come to a point which
2354we may need to backtrack to on failure such as (A|B|C), we push a
2355backtrack state onto the backtrack stack. On failure, we pop the top
2356state, and re-enter the loop at the state indicated. If there are no more
2357states to pop, we return failure.
2358
2359Sometimes we also need to backtrack on success; for example /A+/, where
2360after successfully matching one A, we need to go back and try to
2361match another one; similarly for lookahead assertions: if the assertion
2362completes successfully, we backtrack to the state just before the assertion
2363and then carry on. In these cases, the pushed state is marked as
2364'backtrack on success too'. This marking is in fact done by a chain of
2365pointers, each pointing to the previous 'yes' state. On success, we pop to
2366the nearest yes state, discarding any intermediate failure-only states.
2367Sometimes a yes state is pushed just to force some cleanup code to be
2368called at the end of a successful match or submatch; e.g. (??{$re}) uses
2369it to free the inner regex.
2370
2371Note that failure backtracking rewinds the cursor position, while
2372success backtracking leaves it alone.
2373
2374A pattern is complete when the END op is executed, while a subpattern
2375such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2376ops trigger the "pop to last yes state if any, otherwise return true"
2377behaviour.
2378
2379A common convention in this function is to use A and B to refer to the two
2380subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2381the subpattern to be matched possibly multiple times, while B is the entire
2382rest of the pattern. Variable and state names reflect this convention.
2383
2384The states in the main switch are the union of ops and failure/success of
2385substates associated with with that op. For example, IFMATCH is the op
2386that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2387'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2388successfully matched A and IFMATCH_A_fail is a state saying that we have
2389just failed to match A. Resume states always come in pairs. The backtrack
2390state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2391at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2392on success or failure.
2393
2394The struct that holds a backtracking state is actually a big union, with
2395one variant for each major type of op. The variable st points to the
2396top-most backtrack struct. To make the code clearer, within each
2397block of code we #define ST to alias the relevant union.
2398
2399Here's a concrete example of a (vastly oversimplified) IFMATCH
2400implementation:
2401
2402 switch (state) {
2403 ....
2404
2405#define ST st->u.ifmatch
2406
2407 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2408 ST.foo = ...; // some state we wish to save
95b24440 2409 ...
bf1f174e
DM
2410 // push a yes backtrack state with a resume value of
2411 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2412 // first node of A:
2413 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2414 // NOTREACHED
2415
2416 case IFMATCH_A: // we have successfully executed A; now continue with B
2417 next = B;
2418 bar = ST.foo; // do something with the preserved value
2419 break;
2420
2421 case IFMATCH_A_fail: // A failed, so the assertion failed
2422 ...; // do some housekeeping, then ...
2423 sayNO; // propagate the failure
2424
2425#undef ST
95b24440 2426
bf1f174e
DM
2427 ...
2428 }
95b24440 2429
bf1f174e
DM
2430For any old-timers reading this who are familiar with the old recursive
2431approach, the code above is equivalent to:
95b24440 2432
bf1f174e
DM
2433 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2434 {
2435 int foo = ...
95b24440 2436 ...
bf1f174e
DM
2437 if (regmatch(A)) {
2438 next = B;
2439 bar = foo;
2440 break;
95b24440 2441 }
bf1f174e
DM
2442 ...; // do some housekeeping, then ...
2443 sayNO; // propagate the failure
95b24440 2444 }
bf1f174e
DM
2445
2446The topmost backtrack state, pointed to by st, is usually free. If you
2447want to claim it, populate any ST.foo fields in it with values you wish to
2448save, then do one of
2449
2450 PUSH_STATE_GOTO(resume_state, node);
2451 PUSH_YES_STATE_GOTO(resume_state, node);
2452
2453which sets that backtrack state's resume value to 'resume_state', pushes a
2454new free entry to the top of the backtrack stack, then goes to 'node'.
2455On backtracking, the free slot is popped, and the saved state becomes the
2456new free state. An ST.foo field in this new top state can be temporarily
2457accessed to retrieve values, but once the main loop is re-entered, it
2458becomes available for reuse.
2459
2460Note that the depth of the backtrack stack constantly increases during the
2461left-to-right execution of the pattern, rather than going up and down with
2462the pattern nesting. For example the stack is at its maximum at Z at the
2463end of the pattern, rather than at X in the following:
2464
2465 /(((X)+)+)+....(Y)+....Z/
2466
2467The only exceptions to this are lookahead/behind assertions and the cut,
2468(?>A), which pop all the backtrack states associated with A before
2469continuing.
2470
2471Bascktrack state structs are allocated in slabs of about 4K in size.
2472PL_regmatch_state and st always point to the currently active state,
2473and PL_regmatch_slab points to the slab currently containing
2474PL_regmatch_state. The first time regmatch() is called, the first slab is
2475allocated, and is never freed until interpreter destruction. When the slab
2476is full, a new one is allocated and chained to the end. At exit from
2477regmatch(), slabs allocated since entry are freed.
2478
2479*/
95b24440 2480
40a82448 2481
5bc10b2c 2482#define DEBUG_STATE_pp(pp) \
265c4333 2483 DEBUG_STATE_r({ \
5bc10b2c
DM
2484 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2485 PerlIO_printf(Perl_debug_log, \
5d458dd8 2486 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2487 depth*2, "", \
13d6edb4 2488 PL_reg_name[st->resume_state], \
5d458dd8
YO
2489 ((st==yes_state||st==mark_state) ? "[" : ""), \
2490 ((st==yes_state) ? "Y" : ""), \
2491 ((st==mark_state) ? "M" : ""), \
2492 ((st==yes_state||st==mark_state) ? "]" : "") \
2493 ); \
265c4333 2494 });
5bc10b2c 2495
40a82448 2496
3dab1dad 2497#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2498
3df15adc 2499#ifdef DEBUGGING
5bc10b2c 2500
ab3bbdeb
YO
2501STATIC void
2502S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2503 const char *start, const char *end, const char *blurb)
2504{
bbe252da 2505 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
ab3bbdeb
YO
2506 if (!PL_colorset)
2507 reginitcolors();
2508 {
2509 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2510 prog->precomp, prog->prelen, 60);
2511
2512 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2513 start, end - start, 60);
2514
2515 PerlIO_printf(Perl_debug_log,
2516 "%s%s REx%s %s against %s\n",
2517 PL_colors[4], blurb, PL_colors[5], s0, s1);
2518
2519 if (do_utf8||utf8_pat)
1de06328
YO
2520 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2521 utf8_pat ? "pattern" : "",
2522 utf8_pat && do_utf8 ? " and " : "",
2523 do_utf8 ? "string" : ""
ab3bbdeb
YO
2524 );
2525 }
2526}
3df15adc
YO
2527
2528STATIC void
786e8c11
YO
2529S_dump_exec_pos(pTHX_ const char *locinput,
2530 const regnode *scan,
2531 const char *loc_regeol,
2532 const char *loc_bostr,
2533 const char *loc_reg_starttry,
2534 const bool do_utf8)
07be1b83 2535{
786e8c11 2536 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2537 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2538 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2539 /* The part of the string before starttry has one color
2540 (pref0_len chars), between starttry and current
2541 position another one (pref_len - pref0_len chars),
2542 after the current position the third one.
2543 We assume that pref0_len <= pref_len, otherwise we
2544 decrease pref0_len. */
786e8c11
YO
2545 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2546 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2547 int pref0_len;
2548
2549 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2550 pref_len++;
786e8c11
YO
2551 pref0_len = pref_len - (locinput - loc_reg_starttry);
2552 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2553 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2554 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2555 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2556 l--;
2557 if (pref0_len < 0)
2558 pref0_len = 0;
2559 if (pref0_len > pref_len)
2560 pref0_len = pref_len;
2561 {
3df15adc 2562 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2563
ab3bbdeb 2564 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2565 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2566
ab3bbdeb 2567 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2568 (locinput - pref_len + pref0_len),
1de06328 2569 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2570
ab3bbdeb 2571 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2572 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2573
1de06328 2574 const STRLEN tlen=len0+len1+len2;
3df15adc 2575 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2576 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2577 (IV)(locinput - loc_bostr),
07be1b83 2578 len0, s0,
07be1b83 2579 len1, s1,
07be1b83 2580 (docolor ? "" : "> <"),
07be1b83 2581 len2, s2,
f9f4320a 2582 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2583 "");
2584 }
2585}
3df15adc 2586
07be1b83
YO
2587#endif
2588
0a4db386
YO
2589/* reg_check_named_buff_matched()
2590 * Checks to see if a named buffer has matched. The data array of
2591 * buffer numbers corresponding to the buffer is expected to reside
2592 * in the regexp->data->data array in the slot stored in the ARG() of
2593 * node involved. Note that this routine doesn't actually care about the
2594 * name, that information is not preserved from compilation to execution.
2595 * Returns the index of the leftmost defined buffer with the given name
2596 * or 0 if non of the buffers matched.
2597 */
2598STATIC I32
2599S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2600 I32 n;
f8fc2ecf
YO
2601 RXi_GET_DECL(rex,rexi);
2602 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
0a4db386
YO
2603 I32 *nums=(I32*)SvPVX(sv_dat);
2604 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2605 if ((I32)*PL_reglastparen >= nums[n] &&
2606 PL_regendp[nums[n]] != -1)
2607 {
2608 return nums[n];
2609 }
2610 }
2611 return 0;
2612}
2613
d6a28714 2614STATIC I32 /* 0 failure, 1 success */
24b23f37 2615S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2616{
a35a87e7 2617#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2618 dMY_CXT;
2619#endif
27da23d5 2620 dVAR;
95b24440 2621 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2622 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2623
3b0527fe 2624 regexp *rex = reginfo->prog;
f8fc2ecf
YO
2625 RXi_GET_DECL(rex,rexi);
2626
5d9a96ca
DM
2627 regmatch_slab *orig_slab;
2628 regmatch_state *orig_state;
a3621e74 2629
5d9a96ca
DM
2630 /* the current state. This is a cached copy of PL_regmatch_state */
2631 register regmatch_state *st;
95b24440 2632
5d9a96ca
DM
2633 /* cache heavy used fields of st in registers */
2634 register regnode *scan;
2635 register regnode *next;
438e9bae 2636 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2637 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2638 register char *locinput = PL_reginput;
5d9a96ca 2639 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2640
b69b0499 2641 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2642 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2643 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2644 const U32 max_nochange_depth =
2645 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2646 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2647
77cb431f
DM
2648 regmatch_state *yes_state = NULL; /* state to pop to on success of
2649 subpattern */
e2e6a0f1
YO
2650 /* mark_state piggy backs on the yes_state logic so that when we unwind
2651 the stack on success we can update the mark_state as we go */
2652 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5d458dd8 2653
faec1544 2654 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2655 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2656 U32 state_num;
5d458dd8
YO
2657 bool no_final = 0; /* prevent failure from backtracking? */
2658 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2659 char *startpoint = PL_reginput;
5d458dd8
YO
2660 SV *popmark = NULL; /* are we looking for a mark? */
2661 SV *sv_commit = NULL; /* last mark name seen in failure */
2662 SV *sv_yes_mark = NULL; /* last mark name we have seen
2663 during a successfull match */
2664 U32 lastopen = 0; /* last open we saw */
2665 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
58e23c8d 2666
5d458dd8 2667
24d3c4a9
DM
2668 /* these three flags are set by various ops to signal information to
2669 * the very next op. They have a useful lifetime of exactly one loop
2670 * iteration, and are not preserved or restored by state pushes/pops
2671 */
2672 bool sw = 0; /* the condition value in (?(cond)a|b) */
2673 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2674 int logical = 0; /* the following EVAL is:
2675 0: (?{...})
2676 1: (?(?{...})X|Y)
2677 2: (??{...})
2678 or the following IFMATCH/UNLESSM is:
2679 false: plain (?=foo)
2680 true: used as a condition: (?(?=foo))
2681 */
2682
95b24440 2683#ifdef DEBUGGING
e68ec53f 2684 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2685#endif
2686
3b57cd43 2687 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2688 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2689 }));
5d9a96ca
DM
2690 /* on first ever call to regmatch, allocate first slab */
2691 if (!PL_regmatch_slab) {
2692 Newx(PL_regmatch_slab, 1, regmatch_slab);
2693 PL_regmatch_slab->prev = NULL;
2694 PL_regmatch_slab->next = NULL;
86545054 2695 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2696 }
2697
2698 /* remember current high-water mark for exit */
2699 /* XXX this should be done with SAVE* instead */
2700 orig_slab = PL_regmatch_slab;
2701 orig_state = PL_regmatch_state;
2702
2703 /* grab next free state slot */
2704 st = ++PL_regmatch_state;
86545054 2705 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2706 st = PL_regmatch_state = S_push_slab(aTHX);
2707
d6a28714
JH
2708 /* Note that nextchr is a byte even in UTF */
2709 nextchr = UCHARAT(locinput);
2710 scan = prog;
2711 while (scan != NULL) {
8ba1375e 2712
a3621e74 2713 DEBUG_EXECUTE_r( {
6136c704 2714 SV * const prop = sv_newmortal();
1de06328 2715 regnode *rnext=regnext(scan);
786e8c11 2716 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2717 regprop(rex, prop, scan);
07be1b83
YO
2718
2719 PerlIO_printf(Perl_debug_log,
2720 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2721 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2722 SvPVX_const(prop),
1de06328 2723 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2724 0 : (IV)(rnext - rexi->program));
2a782b5b 2725 });
d6a28714
JH
2726
2727 next = scan + NEXT_OFF(scan);
2728 if (next == scan)
2729 next = NULL;
40a82448 2730 state_num = OP(scan);
d6a28714 2731
40a82448
DM
2732 reenter_switch:
2733 switch (state_num) {
d6a28714 2734 case BOL:
7fba1cd6 2735 if (locinput == PL_bostr)
d6a28714 2736 {
3b0527fe 2737 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2738 break;
2739 }
d6a28714
JH
2740 sayNO;
2741 case MBOL:
12d33761
HS
2742 if (locinput == PL_bostr ||
2743 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2744 {
b8c5462f
JH
2745 break;
2746 }
d6a28714
JH
2747 sayNO;
2748 case SBOL:
c2a73568 2749 if (locinput == PL_bostr)
b8c5462f 2750 break;
d6a28714
JH
2751 sayNO;
2752 case GPOS:
3b0527fe 2753 if (locinput == reginfo->ganch)
d6a28714
JH
2754 break;
2755 sayNO;
ee9b8eae
YO
2756
2757 case KEEPS:
2758 /* update the startpoint */
2759 st->u.keeper.val = PL_regstartp[0];
2760 PL_reginput = locinput;
2761 PL_regstartp[0] = locinput - PL_bostr;
2762 PUSH_STATE_GOTO(KEEPS_next, next);
2763 /*NOT-REACHED*/
2764 case KEEPS_next_fail:
2765 /* rollback the start point change */
2766 PL_regstartp[0] = st->u.keeper.val;
2767 sayNO_SILENT;
2768 /*NOT-REACHED*/
d6a28714 2769 case EOL:
d6a28714
JH
2770 goto seol;
2771 case MEOL:
d6a28714 2772 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2773 sayNO;
b8c5462f 2774 break;
d6a28714
JH
2775 case SEOL:
2776 seol:
2777 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2778 sayNO;
d6a28714 2779 if (PL_regeol - locinput > 1)
b8c5462f 2780 sayNO;
b8c5462f 2781 break;
d6a28714
JH
2782 case EOS:
2783 if (PL_regeol != locinput)
b8c5462f 2784 sayNO;
d6a28714 2785 break;
ffc61ed2 2786 case SANY:
d6a28714 2787 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2788 sayNO;
f33976b4
DB
2789 if (do_utf8) {
2790 locinput += PL_utf8skip[nextchr];
2791 if (locinput > PL_regeol)
2792 sayNO;
2793 nextchr = UCHARAT(locinput);
2794 }
2795 else
2796 nextchr = UCHARAT(++locinput);
2797 break;
2798 case CANY:
2799 if (!nextchr && locinput >= PL_regeol)
2800 sayNO;
b8c5462f 2801 nextchr = UCHARAT(++locinput);
a0d0e21e 2802 break;
ffc61ed2 2803 case REG_ANY:
1aa99e6b
IH
2804 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2805 sayNO;
2806 if (do_utf8) {
b8c5462f 2807 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2808 if (locinput > PL_regeol)
2809 sayNO;
a0ed51b3 2810 nextchr = UCHARAT(locinput);
a0ed51b3 2811 }
1aa99e6b
IH
2812 else
2813 nextchr = UCHARAT(++locinput);
a0ed51b3 2814 break;
166ba7cd
DM
2815
2816#undef ST
2817#define ST st->u.trie
786e8c11
YO
2818 case TRIEC:
2819 /* In this case the charclass data is available inline so
2820 we can fail fast without a lot of extra overhead.
2821 */
2822 if (scan->flags == EXACT || !do_utf8) {
2823 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2824 DEBUG_EXECUTE_r(
2825 PerlIO_printf(Perl_debug_log,
2826 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2827 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2828 );
2829 sayNO_SILENT;
2830 /* NOTREACHED */
2831 }
2832 }
2833 /* FALL THROUGH */
5b47454d 2834 case TRIE:
3dab1dad 2835 {
07be1b83 2836 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2837 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2838 trie_type = do_utf8 ?
2839 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2840 : trie_plain;
2841
2842 /* what trie are we using right now */
be8e71aa 2843 reg_trie_data * const trie
f8fc2ecf 2844 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
55eed653 2845 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
3dab1dad 2846 U32 state = trie->startstate;
166ba7cd 2847
3dab1dad
YO
2848 if (trie->bitmap && trie_type != trie_utf8_fold &&
2849 !TRIE_BITMAP_TEST(trie,*locinput)
2850 ) {
2851 if (trie->states[ state ].wordnum) {
2852 DEBUG_EXECUTE_r(
2853 PerlIO_printf(Perl_debug_log,
2854 "%*s %smatched empty string...%s\n",
5bc10b2c 2855 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2856 );
2857 break;
2858 } else {
2859 DEBUG_EXECUTE_r(
2860 PerlIO_printf(Perl_debug_log,
786e8c11 2861 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2862 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2863 );
2864 sayNO_SILENT;
2865 }
2866 }
166ba7cd 2867
786e8c11
YO
2868 {
2869 U8 *uc = ( U8* )locinput;
2870
2871 STRLEN len = 0;
2872 STRLEN foldlen = 0;
2873 U8 *uscan = (U8*)NULL;
2874 STRLEN bufflen=0;
2875 SV *sv_accept_buff = NULL;
2876 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2877
2878 ST.accepted = 0; /* how many accepting states we have seen */
2879 ST.B = next;
2880 ST.jump = trie->jump;
786e8c11 2881 ST.me = scan;
07be1b83
YO
2882 /*
2883 traverse the TRIE keeping track of all accepting states
2884 we transition through until we get to a failing node.
2885 */
2886
a3621e74 2887 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2888 U32 base = trie->states[ state ].trans.base;
f9f4320a 2889 UV uvc = 0;
786e8c11
YO
2890 U16 charid;
2891 /* We use charid to hold the wordnum as we don't use it
2892 for charid until after we have done the wordnum logic.
2893 We define an alias just so that the wordnum logic reads
2894 more naturally. */
2895
2896#define got_wordnum charid
2897 got_wordnum = trie->states[ state ].wordnum;
2898
2899 if ( got_wordnum ) {
2900 if ( ! ST.accepted ) {
5b47454d
DM
2901 ENTER;
2902 SAVETMPS;
2903 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2904 sv_accept_buff=newSV(bufflen *
2905 sizeof(reg_trie_accepted) - 1);
786e8c11 2906 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2907 SvPOK_on(sv_accept_buff);
2908 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2909 SAVETMPS;
2910 ST.accept_buff =
5b47454d
DM
2911 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2912 }
786e8c11 2913 do {
166ba7cd 2914 if (ST.accepted >= bufflen) {
5b47454d 2915 bufflen *= 2;
166ba7cd 2916 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2917 SvGROW(sv_accept_buff,
2918 bufflen * sizeof(reg_trie_accepted));
2919 }
2920 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2921 + sizeof(reg_trie_accepted));
a3621e74 2922
786e8c11
YO
2923
2924 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2925 ST.accept_buff[ST.accepted].endpos = uc;
2926 ++ST.accepted;
2927 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2928 }
2929#undef got_wordnum
a3621e74 2930
07be1b83 2931 DEBUG_TRIE_EXECUTE_r({
786e8c11 2932 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2933 PerlIO_printf( Perl_debug_log,
786e8c11 2934 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2935 2+depth * 2, "", PL_colors[4],
786e8c11 2936 (UV)state, (UV)ST.accepted );
07be1b83 2937 });
a3621e74
YO
2938
2939 if ( base ) {
55eed653
NC
2940 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2941 uscan, len, uvc, charid, foldlen,
2942 foldbuf, uniflags);
a3621e74 2943
5b47454d
DM
2944 if (charid &&
2945 (base + charid > trie->uniquecharcount )
2946 && (base + charid - 1 - trie->uniquecharcount
2947 < trie->lasttrans)
2948 && trie->trans[base + charid - 1 -
2949 trie->uniquecharcount].check == state)
2950 {
2951 state = trie->trans[base + charid - 1 -
2952 trie->uniquecharcount ].next;
2953 }
2954 else {
2955 state = 0;
2956 }
2957 uc += len;
2958
2959 }
2960 else {
a3621e74
YO
2961 state = 0;
2962 }
2963 DEBUG_TRIE_EXECUTE_r(
e4584336 2964 PerlIO_printf( Perl_debug_log,
786e8c11 2965 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2966 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2967 );
2968 }
166ba7cd 2969 if (!ST.accepted )
a3621e74 2970 sayNO;
a3621e74 2971
166ba7cd
DM
2972 DEBUG_EXECUTE_r(
2973 PerlIO_printf( Perl_debug_log,
2974 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2975 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2976 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2977 );
786e8c11 2978 }}
fae667d5
YO
2979 goto trie_first_try; /* jump into the fail handler */
2980 /* NOTREACHED */
166ba7cd 2981 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
2982 if ( ST.jump) {
2983 REGCP_UNWIND(ST.cp);
2984 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2985 PL_regendp[n] = -1;
2986 *PL_reglastparen = n;
2987 }
2988 trie_first_try:
5d458dd8
YO
2989 if (do_cutgroup) {
2990 do_cutgroup = 0;
2991 no_final = 0;
2992 }
fae667d5
YO
2993
2994 if ( ST.jump) {
2995 ST.lastparen = *PL_reglastparen;
2996 REGCP_SET(ST.cp);
2997 }
166ba7cd
DM
2998 if ( ST.accepted == 1 ) {
2999 /* only one choice left - just continue */
3000 DEBUG_EXECUTE_r({
2b8b4781
NC
3001 AV *const trie_words
3002 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3003 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3004 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3005 SV *sv= tmp ? sv_newmortal() : NULL;
3006
166ba7cd
DM
3007 PerlIO_printf( Perl_debug_log,
3008 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3009 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3010 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3011 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3012 PL_colors[0], PL_colors[1],
3013 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3014 )
3015 : "not compiled under -Dr",
166ba7cd
DM
3016 PL_colors[5] );
3017 });
3018 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3019 /* in this case we free tmps/leave before we call regmatch
3020 as we wont be using accept_buff again. */
5d458dd8 3021
166ba7cd
DM
3022 locinput = PL_reginput;
3023 nextchr = UCHARAT(locinput);
5d458dd8
YO
3024 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3025 scan = ST.B;
3026 else
3027 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3028 if (!has_cutgroup) {
3029 FREETMPS;
3030 LEAVE;
3031 } else {
3032 ST.accepted--;
3033 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3034 }
786e8c11 3035
166ba7cd
DM
3036 continue; /* execute rest of RE */
3037 }
fae667d5
YO
3038
3039 if ( !ST.accepted-- ) {
5d458dd8
YO
3040 DEBUG_EXECUTE_r({
3041 PerlIO_printf( Perl_debug_log,
3042 "%*s %sTRIE failed...%s\n",
3043 REPORT_CODE_OFF+depth*2, "",
3044 PL_colors[4],
3045 PL_colors[5] );
3046 });
166ba7cd
DM
3047 FREETMPS;
3048 LEAVE;
5d458dd8 3049 sayNO_SILENT;
fae667d5
YO
3050 /*NOTREACHED*/
3051 }
166ba7cd 3052
a3621e74 3053 /*
166ba7cd
DM
3054 There are at least two accepting states left. Presumably
3055 the number of accepting states is going to be low,
3056 typically two. So we simply scan through to find the one
3057 with lowest wordnum. Once we find it, we swap the last
3058 state into its place and decrement the size. We then try to
3059 match the rest of the pattern at the point where the word
3060 ends. If we succeed, control just continues along the
3061 regex; if we fail we return here to try the next accepting
3062 state
3063 */
a3621e74 3064
166ba7cd
DM
3065 {
3066 U32 best = 0;
3067 U32 cur;
3068 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3069 DEBUG_TRIE_EXECUTE_r(
f2278c82 3070 PerlIO_printf( Perl_debug_log,
166ba7cd 3071 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3072 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3073 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3074 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3075 );
3076
3077 if (ST.accept_buff[cur].wordnum <
3078 ST.accept_buff[best].wordnum)
3079 best = cur;
a3621e74 3080 }
166ba7cd
DM
3081
3082 DEBUG_EXECUTE_r({
2b8b4781
NC
3083 AV *const trie_words
3084 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3085 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3086 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3087 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3088 ST.B :
7f69552c 3089 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3090 SV *sv= tmp ? sv_newmortal() : NULL;
3091
265c4333
YO
3092 PerlIO_printf( Perl_debug_log,
3093 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3094 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3095 ST.accept_buff[best].wordnum,
de734bd5
A
3096 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3097 PL_colors[0], PL_colors[1],
3098 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3099 ) : "not compiled under -Dr",
265c4333 3100 REG_NODE_NUM(nextop),
166ba7cd
DM
3101 PL_colors[5] );
3102 });
3103
3104 if ( best<ST.accepted ) {
3105 reg_trie_accepted tmp = ST.accept_buff[ best ];
3106 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3107 ST.accept_buff[ ST.accepted ] = tmp;
3108 best = ST.accepted;
a3621e74 3109 }
166ba7cd 3110 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3111 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3112 scan = ST.B;
786e8c11
YO
3113 /* NOTREACHED */
3114 } else {
5d458dd8 3115 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
786e8c11
YO
3116 /* NOTREACHED */
3117 }
5d458dd8
YO
3118 if (has_cutgroup) {
3119 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3120 /* NOTREACHED */
3121 } else {
3122 PUSH_STATE_GOTO(TRIE_next, scan);
3123 /* NOTREACHED */
3124 }
786e8c11 3125 /* NOTREACHED */
166ba7cd 3126 }
166ba7cd 3127 /* NOTREACHED */
5d458dd8
YO
3128 case TRIE_next:
3129 FREETMPS;
3130 LEAVE;
3131 sayYES;
166ba7cd
DM
3132#undef ST
3133
95b24440
DM
3134 case EXACT: {
3135 char *s = STRING(scan);
24d3c4a9 3136 ln = STR_LEN(scan);
eb160463 3137 if (do_utf8 != UTF) {
bc517b45 3138 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3139 char *l = locinput;
24d3c4a9 3140 const char * const e = s + ln;
a72c7584 3141
5ff6fc6d
JH
3142 if (do_utf8) {
3143 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3144 while (s < e) {
a3b680e6 3145 STRLEN ulen;
1aa99e6b 3146 if (l >= PL_regeol)
5ff6fc6d
JH
3147 sayNO;
3148 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3149 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3150 uniflags))
5ff6fc6d 3151 sayNO;
bc517b45 3152 l += ulen;
5ff6fc6d 3153 s ++;
1aa99e6b 3154 }
5ff6fc6d
JH
3155 }
3156 else {
3157 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3158 while (s < e) {
a3b680e6 3159 STRLEN ulen;
1aa99e6b
IH
3160 if (l >= PL_regeol)
3161 sayNO;
5ff6fc6d 3162 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3163 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3164 uniflags))
1aa99e6b 3165 sayNO;
bc517b45 3166 s += ulen;
a72c7584 3167 l ++;
1aa99e6b 3168 }
5ff6fc6d 3169 }
1aa99e6b
IH
3170 locinput = l;
3171 nextchr = UCHARAT(locinput);
3172 break;
3173 }
bc517b45 3174 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3175 /* Inline the first character, for speed. */
3176 if (UCHARAT(s) != nextchr)
3177 sayNO;
24d3c4a9 3178 if (PL_regeol - locinput < ln)
d6a28714 3179 sayNO;
24d3c4a9 3180 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3181 sayNO;
24d3c4a9 3182 locinput += ln;
d6a28714
JH
3183 nextchr = UCHARAT(locinput);
3184 break;
95b24440 3185 }
d6a28714 3186 case EXACTFL:
b8c5462f
JH
3187 PL_reg_flags |= RF_tainted;
3188 /* FALL THROUGH */
95b24440 3189 case EXACTF: {
be8e71aa 3190 char * const s = STRING(scan);
24d3c4a9 3191 ln = STR_LEN(scan);
d6a28714 3192
d07ddd77
JH
3193 if (do_utf8 || UTF) {
3194 /* Either target or the pattern are utf8. */
be8e71aa 3195 const char * const l = locinput;
d07ddd77 3196 char *e = PL_regeol;
bc517b45 3197
24d3c4a9 3198 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3199 l, &e, 0, do_utf8)) {
5486206c
JH
3200 /* One more case for the sharp s:
3201 * pack("U0U*", 0xDF) =~ /ss/i,
3202 * the 0xC3 0x9F are the UTF-8
3203 * byte sequence for the U+00DF. */
3204 if (!(do_utf8 &&
3205 toLOWER(s[0]) == 's' &&
24d3c4a9 3206 ln >= 2 &&
5486206c
JH
3207 toLOWER(s[1]) == 's' &&
3208 (U8)l[0] == 0xC3 &&
3209 e - l >= 2 &&
3210 (U8)l[1] == 0x9F))
3211 sayNO;
3212 }
d07ddd77
JH
3213 locinput = e;
3214 nextchr = UCHARAT(locinput);
3215 break;
a0ed51b3 3216 }
d6a28714 3217
bc517b45
JH
3218 /* Neither the target and the pattern are utf8. */
3219
d6a28714
JH
3220 /* Inline the first character, for speed. */
3221 if (UCHARAT(s) != nextchr &&
3222 UCHARAT(s) != ((OP(scan) == EXACTF)
3223 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3224 sayNO;
24d3c4a9 3225 if (PL_regeol - locinput < ln)
b8c5462f 3226 sayNO;
24d3c4a9
DM
3227 if (ln > 1 && (OP(scan) == EXACTF
3228 ? ibcmp(s, locinput, ln)
3229 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3230 sayNO;
24d3c4a9 3231 locinput += ln;
d6a28714 3232 nextchr = UCHARAT(locinput);
a0d0e21e 3233 break;
95b24440 3234 }
d6a28714 3235 case ANYOF:
ffc61ed2 3236 if (do_utf8) {
9e55ce06
JH
3237 STRLEN inclasslen = PL_regeol - locinput;
3238
32fc9b6a 3239 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3240 goto anyof_fail;
ffc61ed2
JH
3241 if (locinput >= PL_regeol)
3242 sayNO;
0f0076b4 3243 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3244 nextchr = UCHARAT(locinput);
e0f9d4a8 3245 break;
ffc61ed2
JH
3246 }
3247 else {
3248 if (nextchr < 0)
3249 nextchr = UCHARAT(locinput);
32fc9b6a 3250 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3251 goto anyof_fail;
ffc61ed2
JH
3252 if (!nextchr && locinput >= PL_regeol)
3253 sayNO;
3254 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3255 break;
3256 }
262b90c4 3257 anyof_fail:
e0f9d4a8
JH
3258 /* If we might have the case of the German sharp s
3259 * in a casefolding Unicode character class. */
3260
ebc501f0
JH
3261 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3262 locinput += SHARP_S_SKIP;
e0f9d4a8 3263 nextchr = UCHARAT(locinput);
ffc61ed2 3264 }
e0f9d4a8
JH
3265 else
3266 sayNO;
b8c5462f 3267 break;
d6a28714 3268 case ALNUML:
b8c5462f
JH
3269 PL_reg_flags |= RF_tainted;
3270 /* FALL THROUGH */
d6a28714 3271 case ALNUM:
b8c5462f 3272 if (!nextchr)
4633a7c4 3273 sayNO;
ffc61ed2 3274 if (do_utf8) {
1a4fad37 3275 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3276 if (!(OP(scan) == ALNUM
bb7a0f54 3277 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3278 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3279 {
3280 sayNO;
a0ed51b3 3281 }
b8c5462f 3282 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3283 nextchr = UCHARAT(locinput);
3284 break;
3285 }
ffc61ed2 3286 if (!(OP(scan) == ALNUM
d6a28714 3287 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3288 sayNO;
b8c5462f 3289 nextchr = UCHARAT(++locinput);
a0d0e21e 3290 break;
d6a28714 3291 case NALNUML:
b8c5462f
JH
3292 PL_reg_flags |= RF_tainted;
3293 /* FALL THROUGH */
d6a28714
JH
3294 case NALNUM:
3295 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3296 sayNO;
ffc61ed2 3297 if (do_utf8) {
1a4fad37 3298 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3299 if (OP(scan) == NALNUM
bb7a0f54 3300 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3301 : isALNUM_LC_utf8((U8*)locinput))
3302 {
b8c5462f 3303 sayNO;
d6a28714 3304 }
b8c5462f
JH
3305 locinput += PL_utf8skip[nextchr];
3306 nextchr = UCHARAT(locinput);
3307 break;
3308 }
ffc61ed2 3309 if (OP(scan) == NALNUM
d6a28714 3310 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3311 sayNO;
76e3520e 3312 nextchr = UCHARAT(++locinput);
a0d0e21e 3313 break;
d6a28714
JH
3314 case BOUNDL:
3315 case NBOUNDL:
3280af22 3316 PL_reg_flags |= RF_tainted;
bbce6d69 3317 /* FALL THROUGH */
d6a28714
JH
3318 case BOUND:
3319 case NBOUND:
3320 /* was last char in word? */
ffc61ed2 3321 if (do_utf8) {
12d33761 3322 if (locinput == PL_bostr)
24d3c4a9 3323 ln = '\n';
ffc61ed2 3324 else {
a3b680e6 3325 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3326
24d3c4a9 3327 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3328 }
3329 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3330 ln = isALNUM_uni(ln);
1a4fad37 3331 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3332 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3333 }
3334 else {
24d3c4a9 3335 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3336 n = isALNUM_LC_utf8((U8*)locinput);
3337 }
a0ed51b3 3338 }
d6a28714 3339 else {
24d3c4a9 3340 ln = (locinput != PL_bostr) ?
12d33761 3341 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3342 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3343 ln = isALNUM(ln);
ffc61ed2
JH
3344 n = isALNUM(nextchr);
3345 }
3346 else {
24d3c4a9 3347 ln = isALNUM_LC(ln);
ffc61ed2
JH
3348 n = isALNUM_LC(nextchr);
3349 }
d6a28714 3350 }
24d3c4a9 3351 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3352 OP(scan) == BOUNDL))
3353 sayNO;
a0ed51b3 3354 break;
d6a28714 3355 case SPACEL:
3280af22 3356 PL_reg_flags |= RF_tainted;
bbce6d69 3357 /* FALL THROUGH */
d6a28714 3358 case SPACE:
9442cb0e 3359 if (!nextchr)
4633a7c4 3360 sayNO;
1aa99e6b 3361 if (do_utf8) {
fd400ab9 3362 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3363 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3364 if (!(OP(scan) == SPACE
bb7a0f54 3365 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3366 : isSPACE_LC_utf8((U8*)locinput)))
3367 {
3368 sayNO;
3369 }
3370 locinput += PL_utf8skip[nextchr];
3371 nextchr = UCHARAT(locinput);
3372 break;
d6a28714 3373 }
ffc61ed2
JH
3374 if (!(OP(scan) == SPACE
3375 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3376 sayNO;
3377 nextchr = UCHARAT(++locinput);
3378 }
3379 else {
3380 if (!(OP(scan) == SPACE
3381 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3382 sayNO;
3383 nextchr = UCHARAT(++locinput);
a0ed51b3 3384 }
a0ed51b3 3385 break;
d6a28714 3386 case NSPACEL:
3280af22 3387 PL_reg_flags |= RF_tainted;
bbce6d69 3388 /* FALL THROUGH */
d6a28714 3389 case NSPACE:
9442cb0e 3390 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3391 sayNO;
1aa99e6b 3392 if (do_utf8) {
1a4fad37 3393 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3394 if (OP(scan) == NSPACE
bb7a0f54 3395 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3396 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3397 {
3398 sayNO;
3399 }
3400 locinput += PL_utf8skip[nextchr];
3401 nextchr = UCHARAT(locinput);
3402 break;
a0ed51b3 3403 }
ffc61ed2 3404 if (OP(scan) == NSPACE
d6a28714 3405 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3406 sayNO;
76e3520e 3407 nextchr = UCHARAT(++locinput);
a0d0e21e 3408 break;
d6a28714 3409 case DIGITL:
a0ed51b3
LW
3410 PL_reg_flags |= RF_tainted;
3411 /* FALL THROUGH */
d6a28714 3412 case DIGIT:
9442cb0e 3413 if (!nextchr)
a0ed51b3 3414 sayNO;
1aa99e6b 3415 if (do_utf8) {
1a4fad37 3416 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3417 if (!(OP(scan) == DIGIT
bb7a0f54 3418 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3419 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3420 {
a0ed51b3 3421 sayNO;
dfe13c55 3422 }
6f06b55f 3423 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3424 nextchr = UCHARAT(locinput);
3425 break;
3426 }
ffc61ed2 3427 if (!(OP(scan) == DIGIT
9442cb0e 3428 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3429 sayNO;
3430 nextchr = UCHARAT(++locinput);
3431 break;
d6a28714 3432 case NDIGITL:
b8c5462f
JH
3433 PL_reg_flags |= RF_tainted;
3434 /* FALL THROUGH */
d6a28714 3435 case NDIGIT:
9442cb0e 3436 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3437 sayNO;
1aa99e6b 3438 if (do_utf8) {
1a4fad37 3439 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3440 if (OP(scan) == NDIGIT
bb7a0f54 3441 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3442 : isDIGIT_LC_utf8((U8*)locinput))
3443 {
a0ed51b3 3444 sayNO;
9442cb0e 3445 }
6f06b55f 3446 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3447 nextchr = UCHARAT(locinput);
3448 break;
3449 }
ffc61ed2 3450 if (OP(scan) == NDIGIT
9442cb0e 3451 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3452 sayNO;
3453 nextchr = UCHARAT(++locinput);
3454 break;
3455 case CLUMP:
b7c83a7e 3456 if (locinput >= PL_regeol)
a0ed51b3 3457 sayNO;
b7c83a7e 3458 if (do_utf8) {
1a4fad37 3459 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3460 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3461 sayNO;
3462 locinput += PL_utf8skip[nextchr];
3463 while (locinput < PL_regeol &&
3464 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3465 locinput += UTF8SKIP(locinput);
3466 if (locinput > PL_regeol)
3467 sayNO;
eb08e2da
JH
3468 }
3469 else
3470 locinput++;
a0ed51b3
LW
3471 nextchr = UCHARAT(locinput);
3472 break;
81714fb9
YO
3473
3474 case NREFFL:
3475 {
3476 char *s;
ff1157ca 3477 char type;
81714fb9
YO
3478 PL_reg_flags |= RF_tainted;
3479 /* FALL THROUGH */
3480 case NREF:
3481 case NREFF:
ff1157ca 3482 type = OP(scan);
0a4db386
YO
3483 n = reg_check_named_buff_matched(rex,scan);
3484
3485 if ( n ) {
3486 type = REF + ( type - NREF );
3487 goto do_ref;
3488 } else {
81714fb9 3489 sayNO;
0a4db386
YO
3490 }
3491 /* unreached */
c8756f30 3492 case REFFL:
3280af22 3493 PL_reg_flags |= RF_tainted;
c8756f30 3494 /* FALL THROUGH */
c277df42 3495 case REF:
81714fb9 3496 case REFF:
c277df42 3497 n = ARG(scan); /* which paren pair */
81714fb9
YO
3498 type = OP(scan);
3499 do_ref:
24d3c4a9 3500 ln = PL_regstartp[n];
2c2d71f5 3501 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3502 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3503 sayNO; /* Do not match unless seen CLOSEn. */
24d3c4a9 3504 if (ln == PL_regendp[n])
a0d0e21e 3505 break;
a0ed51b3 3506
24d3c4a9 3507 s = PL_bostr + ln;
81714fb9 3508 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3509 char *l = locinput;
a3b680e6 3510 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3511 /*
3512 * Note that we can't do the "other character" lookup trick as
3513 * in the 8-bit case (no pun intended) because in Unicode we
3514 * have to map both upper and title case to lower case.
3515 */
81714fb9 3516 if (type == REFF) {
a0ed51b3 3517 while (s < e) {
a3b680e6
AL
3518 STRLEN ulen1, ulen2;
3519 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3520 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3521
a0ed51b3
LW
3522 if (l >= PL_regeol)
3523 sayNO;
a2a2844f
JH
3524 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3525 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3526 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3527 sayNO;
a2a2844f
JH
3528 s += ulen1;
3529 l += ulen2;
a0ed51b3
LW
3530 }
3531 }
3532 locinput = l;
3533 nextchr = UCHARAT(locinput);
3534 break;
3535 }
3536
a0d0e21e 3537 /* Inline the first character, for speed. */
76e3520e 3538 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3539 (type == REF ||
3540 (UCHARAT(s) != (type == REFF
3541 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3542 sayNO;
24d3c4a9
DM
3543 ln = PL_regendp[n] - ln;
3544 if (locinput + ln > PL_regeol)
4633a7c4 3545 sayNO;
81714fb9 3546 if (ln > 1 && (type == REF
24d3c4a9 3547 ? memNE(s, locinput, ln)
81714fb9 3548 : (type == REFF
24d3c4a9
DM
3549 ? ibcmp(s, locinput, ln)
3550 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3551 sayNO;
24d3c4a9 3552 locinput += ln;
76e3520e 3553 nextchr = UCHARAT(locinput);
a0d0e21e 3554 break;
81714fb9 3555 }
a0d0e21e 3556 case NOTHING:
c277df42 3557 case TAIL:
a0d0e21e
LW
3558 break;
3559 case BACK:
3560 break;
40a82448
DM
3561
3562#undef ST
3563#define ST st->u.eval
c277df42 3564 {
c277df42 3565 SV *ret;
6bda09f9 3566 regexp *re;
f8fc2ecf 3567 regexp_internal *rei;
1a147d38
YO
3568 regnode *startpoint;
3569
3570 case GOSTART:
e7707071
YO
3571 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3572 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3573 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3574 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3575 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3576 Perl_croak(aTHX_
3577 "Pattern subroutine nesting without pos change"
3578 " exceeded limit in regex");
6bda09f9
YO
3579 } else {
3580 nochange_depth = 0;
1a147d38 3581 }
6bda09f9 3582 re = rex;
f8fc2ecf 3583 rei = rexi;
6bda09f9 3584 (void)ReREFCNT_inc(rex);
1a147d38 3585 if (OP(scan)==GOSUB) {
6bda09f9
YO
3586 startpoint = scan + ARG2L(scan);
3587 ST.close_paren = ARG(scan);
3588 } else {
f8fc2ecf 3589 startpoint = rei->program+1;
6bda09f9
YO
3590 ST.close_paren = 0;
3591 }
3592 goto eval_recurse_doit;
3593 /* NOTREACHED */
3594 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3595 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3596 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3597 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3598 } else {
3599 nochange_depth = 0;
3600 }
8e5e9ebe 3601 {
4aabdb9b
DM
3602 /* execute the code in the {...} */
3603 dSP;
6136c704 3604 SV ** const before = SP;
4aabdb9b
DM
3605 OP_4tree * const oop = PL_op;
3606 COP * const ocurcop = PL_curcop;
3607 PAD *old_comppad;
4aabdb9b
DM
3608
3609 n = ARG(scan);
f8fc2ecf 3610 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3611 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3612 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3613 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4aabdb9b
DM
3614 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3615
2bf803e2
YO
3616 if (sv_yes_mark) {
3617 SV *sv_mrk = get_sv("REGMARK", 1);
3618 sv_setsv(sv_mrk, sv_yes_mark);
3619 }
3620
8e5e9ebe
RGS
3621 CALLRUNOPS(aTHX); /* Scalar context. */
3622 SPAGAIN;
3623 if (SP == before)
075aa684 3624 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3625 else {
3626 ret = POPs;
3627 PUTBACK;
3628 }
4aabdb9b
DM
3629
3630 PL_op = oop;
3631 PAD_RESTORE_LOCAL(old_comppad);
3632 PL_curcop = ocurcop;
24d3c4a9 3633 if (!logical) {
4aabdb9b
DM
3634 /* /(?{...})/ */
3635 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3636 break;
3637 }
8e5e9ebe 3638 }
24d3c4a9
DM
3639 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3640 logical = 0;
4aabdb9b 3641 {
4f639d21
DM
3642 /* extract RE object from returned value; compiling if
3643 * necessary */
3644
6136c704 3645 MAGIC *mg = NULL;
be8e71aa 3646 const SV *sv;
faf82a0b
AE
3647 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3648 mg = mg_find(sv, PERL_MAGIC_qr);
3649 else if (SvSMAGICAL(ret)) {
3650 if (SvGMAGICAL(ret))
3651 sv_unmagic(ret, PERL_MAGIC_qr);
3652 else
3653 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3654 }
faf82a0b 3655
0f5d15d6
IZ
3656 if (mg) {
3657 re = (regexp *)mg->mg_obj;
df0003d4 3658 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3659 }
3660 else {
3661 STRLEN len;
6136c704 3662 const char * const t = SvPV_const(ret, len);
0f5d15d6 3663 PMOP pm;
a3b680e6 3664 const I32 osize = PL_regsize;
0f5d15d6 3665
5fcd1c1b 3666 Zero(&pm, 1, PMOP);
4aabdb9b 3667 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3668 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3669 if (!(SvFLAGS(ret)
faf82a0b
AE
3670 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3671 | SVs_GMG)))
14befaf4
DM
3672 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3673 PERL_MAGIC_qr,0,0);
0f5d15d6 3674 PL_regsize = osize;
0f5d15d6 3675 }
4aabdb9b 3676 }
f8fc2ecf 3677 rei = RXi_GET(re);
6bda09f9
YO
3678 DEBUG_EXECUTE_r(
3679 debug_start_match(re, do_utf8, locinput, PL_regeol,
3680 "Matching embedded");
3681 );
f8fc2ecf 3682 startpoint = rei->program + 1;
1a147d38 3683 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3684 /* borrowed from regtry */
3685 if (PL_reg_start_tmpl <= re->nparens) {
3686 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3687 if(PL_reg_start_tmp)
3688 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3689 else
3690 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3691 }
aa283a38 3692
1a147d38 3693 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3694 /* run the pattern returned from (??{...}) */
40a82448
DM
3695 ST.cp = regcppush(0); /* Save *all* the positions. */
3696 REGCP_SET(ST.lastcp);
6bda09f9 3697
1a147d38
YO
3698 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3699 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
6bda09f9 3700
4aabdb9b
DM
3701 *PL_reglastparen = 0;
3702 *PL_reglastcloseparen = 0;
4aabdb9b 3703 PL_reginput = locinput;
ae0beba1 3704 PL_regsize = 0;
4aabdb9b
DM
3705
3706 /* XXXX This is too dramatic a measure... */
3707 PL_reg_maxiter = 0;
3708
faec1544 3709 ST.toggle_reg_flags = PL_reg_flags;
bbe252da 3710 if (re->extflags & RXf_UTF8)
faec1544
DM
3711 PL_reg_flags |= RF_utf8;
3712 else
3713 PL_reg_flags &= ~RF_utf8;
3714 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3715
40a82448 3716 ST.prev_rex = rex;
faec1544 3717 ST.prev_curlyx = cur_curlyx;
aa283a38 3718 rex = re;
f8fc2ecf 3719 rexi = rei;
faec1544 3720 cur_curlyx = NULL;
40a82448 3721 ST.B = next;
faec1544
DM
3722 ST.prev_eval = cur_eval;
3723 cur_eval = st;
faec1544 3724 /* now continue from first node in postoned RE */
6bda09f9 3725 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3726 /* NOTREACHED */
a0ed51b3 3727 }
24d3c4a9
DM
3728 /* logical is 1, /(?(?{...})X|Y)/ */
3729 sw = (bool)SvTRUE(ret);
3730 logical = 0;
c277df42
IZ
3731 break;
3732 }
40a82448 3733
faec1544
DM
3734 case EVAL_AB: /* cleanup after a successful (??{A})B */
3735 /* note: this is called twice; first after popping B, then A */
3736 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3737 ReREFCNT_dec(rex);
3738 rex = ST.prev_rex;
f8fc2ecf 3739 rexi = RXi_GET(rex);
faec1544
DM
3740 regcpblow(ST.cp);
3741 cur_eval = ST.prev_eval;
3742 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3743 /* XXXX This is too dramatic a measure... */
3744 PL_reg_maxiter = 0;
e7707071 3745 if ( nochange_depth )
4b196cd4 3746 nochange_depth--;
262b90c4 3747 sayYES;
40a82448 3748
40a82448 3749
faec1544
DM
3750 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3751 /* note: this is called twice; first after popping B, then A */
3752 PL_reg_flags ^= ST.toggle_reg_flags;
3753 ReREFCNT_dec(rex);
3754 rex = ST.prev_rex;
f8fc2ecf 3755 rexi = RXi_GET(rex);
40a82448
DM
3756 PL_reginput = locinput;
3757 REGCP_UNWIND(ST.lastcp);
3758 regcppop(rex);
faec1544
DM
3759 cur_eval = ST.prev_eval;
3760 cur_curlyx = ST.prev_curlyx;
3761 /* XXXX This is too dramatic a measure... */
3762 PL_reg_maxiter = 0;
e7707071 3763 if ( nochange_depth )
4b196cd4 3764 nochange_depth--;
40a82448 3765 sayNO_SILENT;
40a82448
DM
3766#undef ST
3767
a0d0e21e 3768 case OPEN:
c277df42 3769 n = ARG(scan); /* which paren pair */
3280af22
NIS
3770 PL_reg_start_tmp[n] = locinput;
3771 if (n > PL_regsize)
3772 PL_regsize = n;
e2e6a0f1 3773 lastopen = n;
a0d0e21e
LW
3774 break;
3775 case CLOSE:
c277df42 3776 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3777 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3778 PL_regendp[n] = locinput - PL_bostr;
7f69552c
YO
3779 /*if (n > PL_regsize)
3780 PL_regsize = n;*/
3b6647e0 3781 if (n > *PL_reglastparen)
3280af22 3782 *PL_reglastparen = n;
a01268b5 3783 *PL_reglastcloseparen = n;
3b6647e0 3784 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3785 goto fake_end;
3786 }
a0d0e21e 3787 break;
e2e6a0f1
YO
3788 case ACCEPT:
3789 if (ARG(scan)){
3790 regnode *cursor;
3791 for (cursor=scan;
3792 cursor && OP(cursor)!=END;
3793 cursor=regnext(cursor))
3794 {
3795 if ( OP(cursor)==CLOSE ){
3796 n = ARG(cursor);
3797 if ( n <= lastopen ) {
3798 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3799 PL_regendp[n] = locinput - PL_bostr;
3800 /*if (n > PL_regsize)
3801 PL_regsize = n;*/
3b6647e0 3802 if (n > *PL_reglastparen)
e2e6a0f1
YO
3803 *PL_reglastparen = n;
3804 *PL_reglastcloseparen = n;
3b6647e0
RB
3805 if ( n == ARG(scan) || (cur_eval &&
3806 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3807 break;
3808 }
3809 }
3810 }
3811 }
3812 goto fake_end;
3813 /*NOTREACHED*/
c277df42
IZ
3814 case GROUPP:
3815 n = ARG(scan); /* which paren pair */
3b6647e0 3816 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42 3817 break;
0a4db386
YO
3818 case NGROUPP:
3819 /* reg_check_named_buff_matched returns 0 for no match */
3820 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3821 break;
1a147d38 3822 case INSUBP:
0a4db386 3823 n = ARG(scan);
3b6647e0 3824 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3825 break;
3826 case DEFINEP:
3827 sw = 0;
3828 break;
c277df42 3829 case IFTHEN:
2c2d71f5 3830 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3831 if (sw)
c277df42
IZ
3832 next = NEXTOPER(NEXTOPER(scan));
3833 else {
3834 next = scan + ARG(scan);
3835 if (OP(next) == IFTHEN) /* Fake one. */
3836 next = NEXTOPER(NEXTOPER(next));
3837 }
3838 break;
3839 case LOGICAL:
24d3c4a9 3840 logical = scan->flags;
c277df42 3841 break;
c476f425 3842
2ab05381 3843/*******************************************************************
2ab05381 3844
c476f425
DM
3845The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3846pattern, where A and B are subpatterns. (For simple A, CURLYM or
3847STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3848
c476f425 3849A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3850
c476f425
DM
3851On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3852state, which contains the current count, initialised to -1. It also sets
3853cur_curlyx to point to this state, with any previous value saved in the
3854state block.
2ab05381 3855
c476f425
DM
3856CURLYX then jumps straight to the WHILEM op, rather than executing A,
3857since the pattern may possibly match zero times (i.e. it's a while {} loop
3858rather than a do {} while loop).
2ab05381 3859
c476f425
DM
3860Each entry to WHILEM represents a successful match of A. The count in the
3861CURLYX block is incremented, another WHILEM state is pushed, and execution
3862passes to A or B depending on greediness and the current count.
2ab05381 3863
c476f425
DM
3864For example, if matching against the string a1a2a3b (where the aN are
3865substrings that match /A/), then the match progresses as follows: (the
3866pushed states are interspersed with the bits of strings matched so far):
2ab05381 3867
c476f425
DM
3868 <CURLYX cnt=-1>
3869 <CURLYX cnt=0><WHILEM>
3870 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3871 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3872 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3873 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3874
c476f425
DM
3875(Contrast this with something like CURLYM, which maintains only a single
3876backtrack state:
2ab05381 3877
c476f425
DM
3878 <CURLYM cnt=0> a1
3879 a1 <CURLYM cnt=1> a2
3880 a1 a2 <CURLYM cnt=2> a3
3881 a1 a2 a3 <CURLYM cnt=3> b
3882)
2ab05381 3883
c476f425
DM
3884Each WHILEM state block marks a point to backtrack to upon partial failure
3885of A or B, and also contains some minor state data related to that
3886iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3887overall state, such as the count, and pointers to the A and B ops.
2ab05381 3888
c476f425
DM
3889This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3890must always point to the *current* CURLYX block, the rules are:
2ab05381 3891
c476f425
DM
3892When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3893and set cur_curlyx to point the new block.
2ab05381 3894
c476f425
DM
3895When popping the CURLYX block after a successful or unsuccessful match,
3896restore the previous cur_curlyx.
2ab05381 3897
c476f425
DM
3898When WHILEM is about to execute B, save the current cur_curlyx, and set it
3899to the outer one saved in the CURLYX block.
2ab05381 3900
c476f425
DM
3901When popping the WHILEM block after a successful or unsuccessful B match,
3902restore the previous cur_curlyx.
2ab05381 3903
c476f425
DM
3904Here's an example for the pattern (AI* BI)*BO
3905I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3906
c476f425
DM
3907cur_
3908curlyx backtrack stack
3909------ ---------------
3910NULL
3911CO <CO prev=NULL> <WO>
3912CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3913CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3914NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3915
c476f425
DM
3916At this point the pattern succeeds, and we work back down the stack to
3917clean up, restoring as we go:
95b24440 3918
c476f425
DM
3919CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3920CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3921CO <CO prev=NULL> <WO>
3922NULL
a0374537 3923
c476f425
DM
3924*******************************************************************/
3925
3926#define ST st->u.curlyx
3927
3928 case CURLYX: /* start of /A*B/ (for complex A) */
3929 {
3930 /* No need to save/restore up to this paren */
3931 I32 parenfloor = scan->flags;
3932
3933 assert(next); /* keep Coverity happy */
3934 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3935 next += ARG(next);
3936
3937 /* XXXX Probably it is better to teach regpush to support
3938 parenfloor > PL_regsize... */
3939 if (parenfloor > (I32)*PL_reglastparen)
3940 parenfloor = *PL_reglastparen; /* Pessimization... */
3941
3942 ST.prev_curlyx= cur_curlyx;
3943 cur_curlyx = st;
3944 ST.cp = PL_savestack_ix;
3945
3946 /* these fields contain the state of the current curly.
3947 * they are accessed by subsequent WHILEMs */
3948 ST.parenfloor = parenfloor;
3949 ST.min = ARG1(scan);
3950 ST.max = ARG2(scan);
3951 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3952 ST.B = next;
24d3c4a9
DM
3953 ST.minmod = minmod;
3954 minmod = 0;
c476f425
DM
3955 ST.count = -1; /* this will be updated by WHILEM */
3956 ST.lastloc = NULL; /* this will be updated by WHILEM */
3957
3958 PL_reginput = locinput;
3959 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 3960 /* NOTREACHED */
c476f425 3961 }
a0d0e21e 3962
c476f425 3963 case CURLYX_end: /* just finished matching all of A*B */
f0852a51
YO
3964 if (PL_reg_eval_set){
3965 SV *pres= GvSV(PL_replgv);
3966 SvREFCNT_inc(pres);
3967 regcpblow(ST.cp);
3968 sv_setsv(GvSV(PL_replgv), pres);
3969 SvREFCNT_dec(pres);
3970 } else {
3971 regcpblow(ST.cp);
3972 }
c476f425
DM
3973 cur_curlyx = ST.prev_curlyx;
3974 sayYES;
3975 /* NOTREACHED */
a0d0e21e 3976
c476f425
DM
3977 case CURLYX_end_fail: /* just failed to match all of A*B */
3978 regcpblow(ST.cp);
3979 cur_curlyx = ST.prev_curlyx;
3980 sayNO;
3981 /* NOTREACHED */
4633a7c4 3982
a0d0e21e 3983
c476f425
DM
3984#undef ST
3985#define ST st->u.whilem
3986
3987 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3988 {
3989 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
3990 I32 n;
3991 assert(cur_curlyx); /* keep Coverity happy */
3992 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3993 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3994 ST.cache_offset = 0;
3995 ST.cache_mask = 0;
3996
3997 PL_reginput = locinput;
3998
3999 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4000 "%*s whilem: matched %ld out of %ld..%ld\n",
4001 REPORT_CODE_OFF+depth*2, "", (long)n,
4002 (long)cur_curlyx->u.curlyx.min,
4003 (long)cur_curlyx->u.curlyx.max)
4004 );
a0d0e21e 4005
c476f425 4006 /* First just match a string of min A's. */
a0d0e21e 4007
c476f425
DM
4008 if (n < cur_curlyx->u.curlyx.min) {
4009 cur_curlyx->u.curlyx.lastloc = locinput;
4010 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4011 /* NOTREACHED */
4012 }
4013
4014 /* If degenerate A matches "", assume A done. */
4015
4016 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4017 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4018 "%*s whilem: empty match detected, trying continuation...\n",
4019 REPORT_CODE_OFF+depth*2, "")
4020 );
4021 goto do_whilem_B_max;
4022 }
4023
4024 /* super-linear cache processing */
4025
4026 if (scan->flags) {
a0d0e21e 4027
2c2d71f5 4028 if (!PL_reg_maxiter) {
c476f425
DM
4029 /* start the countdown: Postpone detection until we
4030 * know the match is not *that* much linear. */
2c2d71f5 4031 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4032 /* possible overflow for long strings and many CURLYX's */
4033 if (PL_reg_maxiter < 0)
4034 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4035 PL_reg_leftiter = PL_reg_maxiter;
4036 }
c476f425 4037
2c2d71f5 4038 if (PL_reg_leftiter-- == 0) {
c476f425 4039 /* initialise cache */
3298f257 4040 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4041 if (PL_reg_poscache) {
eb160463 4042 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4043 Renew(PL_reg_poscache, size, char);
4044 PL_reg_poscache_size = size;
4045 }
4046 Zero(PL_reg_poscache, size, char);
4047 }
4048 else {
4049 PL_reg_poscache_size = size;
a02a5408 4050 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4051 }
c476f425
DM
4052 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4053 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4054 PL_colors[4], PL_colors[5])
4055 );
2c2d71f5 4056 }
c476f425 4057
2c2d71f5 4058 if (PL_reg_leftiter < 0) {
c476f425
DM
4059 /* have we already failed at this position? */
4060 I32 offset, mask;
4061 offset = (scan->flags & 0xf) - 1
4062 + (locinput - PL_bostr) * (scan->flags>>4);
4063 mask = 1 << (offset % 8);
4064 offset /= 8;
4065 if (PL_reg_poscache[offset] & mask) {
4066 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4067 "%*s whilem: (cache) already tried at this position...\n",
4068 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4069 );
3298f257 4070 sayNO; /* cache records failure */
2c2d71f5 4071 }
c476f425
DM
4072 ST.cache_offset = offset;
4073 ST.cache_mask = mask;
2c2d71f5 4074 }
c476f425 4075 }
2c2d71f5 4076
c476f425 4077 /* Prefer B over A for minimal matching. */
a687059c 4078
c476f425
DM
4079 if (cur_curlyx->u.curlyx.minmod) {
4080 ST.save_curlyx = cur_curlyx;
4081 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4082 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4083 REGCP_SET(ST.lastcp);
4084 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4085 /* NOTREACHED */
4086 }
a0d0e21e 4087
c476f425
DM
4088 /* Prefer A over B for maximal matching. */
4089
4090 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4091 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4092 cur_curlyx->u.curlyx.lastloc = locinput;
4093 REGCP_SET(ST.lastcp);
4094 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4095 /* NOTREACHED */
4096 }
4097 goto do_whilem_B_max;
4098 }
4099 /* NOTREACHED */
4100
4101 case WHILEM_B_min: /* just matched B in a minimal match */
4102 case WHILEM_B_max: /* just matched B in a maximal match */
4103 cur_curlyx = ST.save_curlyx;
4104 sayYES;
4105 /* NOTREACHED */
4106
4107 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4108 cur_curlyx = ST.save_curlyx;
4109 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4110 cur_curlyx->u.curlyx.count--;
4111 CACHEsayNO;
4112 /* NOTREACHED */
4113
4114 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4115 REGCP_UNWIND(ST.lastcp);
4116 regcppop(rex);
4117 /* FALL THROUGH */
4118 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4119 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4120 cur_curlyx->u.curlyx.count--;
4121 CACHEsayNO;
4122 /* NOTREACHED */
4123
4124 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4125 REGCP_UNWIND(ST.lastcp);
4126 regcppop(rex); /* Restore some previous $<digit>s? */
4127 PL_reginput = locinput;
4128 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4129 "%*s whilem: failed, trying continuation...\n",
4130 REPORT_CODE_OFF+depth*2, "")
4131 );
4132 do_whilem_B_max:
4133 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4134 && ckWARN(WARN_REGEXP)
4135 && !(PL_reg_flags & RF_warned))
4136 {
4137 PL_reg_flags |= RF_warned;
4138 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4139 "Complex regular subexpression recursion",
4140 REG_INFTY - 1);
4141 }
4142
4143 /* now try B */
4144 ST.save_curlyx = cur_curlyx;
4145 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4146 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4147 /* NOTREACHED */
4148
4149 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4150 cur_curlyx = ST.save_curlyx;
4151 REGCP_UNWIND(ST.lastcp);
4152 regcppop(rex);
4153
4154 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4155 /* Maximum greed exceeded */
4156 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4157 && ckWARN(WARN_REGEXP)
4158 && !(PL_reg_flags & RF_warned))
4159 {
3280af22 4160 PL_reg_flags |= RF_warned;
c476f425
DM
4161 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4162 "%s limit (%d) exceeded",
4163 "Complex regular subexpression recursion",
4164 REG_INFTY - 1);
a0d0e21e 4165 }
c476f425 4166 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4167 CACHEsayNO;
a0d0e21e 4168 }
c476f425
DM
4169
4170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4171 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4172 );
4173 /* Try grabbing another A and see if it helps. */
4174 PL_reginput = locinput;
4175 cur_curlyx->u.curlyx.lastloc = locinput;
4176 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4177 REGCP_SET(ST.lastcp);
4178 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4179 /* NOTREACHED */
40a82448
DM
4180
4181#undef ST
4182#define ST st->u.branch
4183
4184 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4185 next = scan + ARG(scan);
4186 if (next == scan)
4187 next = NULL;
40a82448
DM
4188 scan = NEXTOPER(scan);
4189 /* FALL THROUGH */
c277df42 4190
40a82448
DM
4191 case BRANCH: /* /(...|A|...)/ */
4192 scan = NEXTOPER(scan); /* scan now points to inner node */
5d458dd8
YO
4193 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4194 && !has_cutgroup)
4195 {
40a82448
DM
4196 /* last branch; skip state push and jump direct to node */
4197 continue;
5d458dd8 4198 }
40a82448
DM
4199 ST.lastparen = *PL_reglastparen;
4200 ST.next_branch = next;
4201 REGCP_SET(ST.cp);
4202 PL_reginput = locinput;
02db2b7b 4203
40a82448 4204 /* Now go into the branch */
5d458dd8
YO
4205 if (has_cutgroup) {
4206 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4207 } else {
4208 PUSH_STATE_GOTO(BRANCH_next, scan);
4209 }
40a82448 4210 /* NOTREACHED */
5d458dd8
YO
4211 case CUTGROUP:
4212 PL_reginput = locinput;
4213 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
f8fc2ecf 4214 (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4215 PUSH_STATE_GOTO(CUTGROUP_next,next);
4216 /* NOTREACHED */
4217 case CUTGROUP_next_fail:
4218 do_cutgroup = 1;
4219 no_final = 1;
4220 if (st->u.mark.mark_name)
4221 sv_commit = st->u.mark.mark_name;
4222 sayNO;
4223 /* NOTREACHED */
4224 case BRANCH_next:
4225 sayYES;
4226 /* NOTREACHED */
40a82448 4227 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4228 if (do_cutgroup) {
4229 do_cutgroup = 0;
4230 no_final = 0;
4231 }
40a82448
DM
4232 REGCP_UNWIND(ST.cp);
4233 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4234 PL_regendp[n] = -1;
4235 *PL_reglastparen = n;
0a4db386 4236 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4237 scan = ST.next_branch;
4238 /* no more branches? */
5d458dd8
YO
4239 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4240 DEBUG_EXECUTE_r({
4241 PerlIO_printf( Perl_debug_log,
4242 "%*s %sBRANCH failed...%s\n",
4243 REPORT_CODE_OFF+depth*2, "",
4244 PL_colors[4],
4245 PL_colors[5] );
4246 });
4247 sayNO_SILENT;
4248 }
40a82448
DM
4249 continue; /* execute next BRANCH[J] op */
4250 /* NOTREACHED */
4251
a0d0e21e 4252 case MINMOD:
24d3c4a9 4253 minmod = 1;
a0d0e21e 4254 break;
40a82448
DM
4255
4256#undef ST
4257#define ST st->u.curlym
4258
4259 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4260
4261 /* This is an optimisation of CURLYX that enables us to push
4262 * only a single backtracking state, no matter now many matches
4263 * there are in {m,n}. It relies on the pattern being constant
4264 * length, with no parens to influence future backrefs
4265 */
4266
4267 ST.me = scan;
dc45a647 4268 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4269
4270 /* if paren positive, emulate an OPEN/CLOSE around A */
4271 if (ST.me->flags) {
3b6647e0 4272 U32 paren = ST.me->flags;
40a82448
DM
4273 if (paren > PL_regsize)
4274 PL_regsize = paren;
3b6647e0 4275 if (paren > *PL_reglastparen)
40a82448 4276 *PL_reglastparen = paren;
c277df42 4277 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4278 }
40a82448
DM
4279 ST.A = scan;
4280 ST.B = next;
4281 ST.alen = 0;
4282 ST.count = 0;
24d3c4a9
DM
4283 ST.minmod = minmod;
4284 minmod = 0;
40a82448
DM
4285 ST.c1 = CHRTEST_UNINIT;
4286 REGCP_SET(ST.cp);
6407bf3b 4287
40a82448
DM
4288 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4289 goto curlym_do_B;
4290
4291 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4292 PL_reginput = locinput;
40a82448
DM
4293 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4294 /* NOTREACHED */
5f80c4cf 4295
40a82448
DM
4296 case CURLYM_A: /* we've just matched an A */
4297 locinput = st->locinput;
4298 nextchr = UCHARAT(locinput);
4299
4300 ST.count++;
4301 /* after first match, determine A's length: u.curlym.alen */
4302 if (ST.count == 1) {
4303 if (PL_reg_match_utf8) {
4304 char *s = locinput;
4305 while (s < PL_reginput) {
4306 ST.alen++;
4307 s += UTF8SKIP(s);
4308 }
4309 }
4310 else {
4311 ST.alen = PL_reginput - locinput;
4312 }
4313 if (ST.alen == 0)
4314 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4315 }
0cadcf80
DM
4316 DEBUG_EXECUTE_r(
4317 PerlIO_printf(Perl_debug_log,
40a82448 4318 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4319 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4320 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4321 );
4322
40a82448 4323 locinput = PL_reginput;
0a4db386
YO
4324
4325 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4326 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4327 goto fake_end;
4328
4329 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4330 goto curlym_do_A; /* try to match another A */
4331 goto curlym_do_B; /* try to match B */
5f80c4cf 4332
40a82448
DM
4333 case CURLYM_A_fail: /* just failed to match an A */
4334 REGCP_UNWIND(ST.cp);
0a4db386
YO
4335
4336 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4337 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4338 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4339 sayNO;
0cadcf80 4340
40a82448
DM
4341 curlym_do_B: /* execute the B in /A{m,n}B/ */
4342 PL_reginput = locinput;
4343 if (ST.c1 == CHRTEST_UNINIT) {
4344 /* calculate c1 and c2 for possible match of 1st char
4345 * following curly */
4346 ST.c1 = ST.c2 = CHRTEST_VOID;
4347 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4348 regnode *text_node = ST.B;
4349 if (! HAS_TEXT(text_node))
4350 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4351 /* this used to be
4352
4353 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4354
4355 But the former is redundant in light of the latter.
4356
4357 if this changes back then the macro for
4358 IS_TEXT and friends need to change.
4359 */
4360 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4361 {
ee9b8eae 4362
40a82448
DM
4363 ST.c1 = (U8)*STRING(text_node);
4364 ST.c2 =
ee9b8eae 4365 (IS_TEXTF(text_node))
40a82448 4366 ? PL_fold[ST.c1]
ee9b8eae 4367 : (IS_TEXTFL(text_node))
40a82448
DM
4368 ? PL_fold_locale[ST.c1]
4369 : ST.c1;
c277df42 4370 }
c277df42 4371 }
40a82448
DM
4372 }
4373
4374 DEBUG_EXECUTE_r(
4375 PerlIO_printf(Perl_debug_log,
4376 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4377 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4378 "", (IV)ST.count)
4379 );
4380 if (ST.c1 != CHRTEST_VOID
4381 && UCHARAT(PL_reginput) != ST.c1
4382 && UCHARAT(PL_reginput) != ST.c2)
4383 {
4384 /* simulate B failing */
3e901dc0
YO
4385 DEBUG_OPTIMISE_r(
4386 PerlIO_printf(Perl_debug_log,
4387 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4388 (int)(REPORT_CODE_OFF+(depth*2)),"",
4389 (IV)ST.c1,(IV)ST.c2
4390 ));
40a82448
DM
4391 state_num = CURLYM_B_fail;
4392 goto reenter_switch;
4393 }
4394
4395 if (ST.me->flags) {
4396 /* mark current A as captured */
4397 I32 paren = ST.me->flags;
4398 if (ST.count) {
4399 PL_regstartp[paren]
4400 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4401 PL_regendp[paren] = PL_reginput - PL_bostr;
0a4db386 4402 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4403 }
40a82448
DM
4404 else
4405 PL_regendp[paren] = -1;
0a4db386 4406 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4407 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4408 {
4409 if (ST.count)
4410 goto fake_end;
4411 else
4412 sayNO;
4413 }
c277df42 4414 }
0a4db386 4415
40a82448 4416 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4417 /* NOTREACHED */
40a82448
DM
4418
4419 case CURLYM_B_fail: /* just failed to match a B */
4420 REGCP_UNWIND(ST.cp);
4421 if (ST.minmod) {
4422 if (ST.count == ARG2(ST.me) /* max */)
4423 sayNO;
4424 goto curlym_do_A; /* try to match a further A */
4425 }
4426 /* backtrack one A */
4427 if (ST.count == ARG1(ST.me) /* min */)
4428 sayNO;
4429 ST.count--;
4430 locinput = HOPc(locinput, -ST.alen);
4431 goto curlym_do_B; /* try to match B */
4432
c255a977
DM
4433#undef ST
4434#define ST st->u.curly
40a82448 4435
c255a977
DM
4436#define CURLY_SETPAREN(paren, success) \
4437 if (paren) { \
4438 if (success) { \
4439 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4440 PL_regendp[paren] = locinput - PL_bostr; \
0a4db386 4441 *PL_reglastcloseparen = paren; \
c255a977
DM
4442 } \
4443 else \
4444 PL_regendp[paren] = -1; \
4445 }
4446
4447 case STAR: /* /A*B/ where A is width 1 */
4448 ST.paren = 0;
4449 ST.min = 0;
4450 ST.max = REG_INFTY;
a0d0e21e
LW
4451 scan = NEXTOPER(scan);
4452 goto repeat;
c255a977
DM
4453 case PLUS: /* /A+B/ where A is width 1 */
4454 ST.paren = 0;
4455 ST.min = 1;
4456 ST.max = REG_INFTY;
c277df42 4457 scan = NEXTOPER(scan);
c255a977
DM
4458 goto repeat;
4459 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4460 ST.paren = scan->flags; /* Which paren to set */
4461 if (ST.paren > PL_regsize)
4462 PL_regsize = ST.paren;
3b6647e0 4463 if (ST.paren > *PL_reglastparen)
c255a977
DM
4464 *PL_reglastparen = ST.paren;
4465 ST.min = ARG1(scan); /* min to match */
4466 ST.max = ARG2(scan); /* max to match */
0a4db386 4467 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4468 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4469 ST.min=1;
4470 ST.max=1;
4471 }
c255a977
DM
4472 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4473 goto repeat;
4474 case CURLY: /* /A{m,n}B/ where A is width 1 */
4475 ST.paren = 0;
4476 ST.min = ARG1(scan); /* min to match */
4477 ST.max = ARG2(scan); /* max to match */
4478 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4479 repeat:
a0d0e21e
LW
4480 /*
4481 * Lookahead to avoid useless match attempts
4482 * when we know what character comes next.
c255a977 4483 *
5f80c4cf
JP
4484 * Used to only do .*x and .*?x, but now it allows
4485 * for )'s, ('s and (?{ ... })'s to be in the way
4486 * of the quantifier and the EXACT-like node. -- japhy
4487 */
4488
c255a977
DM
4489 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4490 sayNO;
cca55fe3 4491 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4492 U8 *s;
4493 regnode *text_node = next;
4494
3dab1dad
YO
4495 if (! HAS_TEXT(text_node))
4496 FIND_NEXT_IMPT(text_node);
5f80c4cf 4497
9e137952 4498 if (! HAS_TEXT(text_node))
c255a977 4499 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4500 else {
ee9b8eae 4501 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4502 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4503 goto assume_ok_easy;
cca55fe3 4504 }
be8e71aa
YO
4505 else
4506 s = (U8*)STRING(text_node);
ee9b8eae
YO
4507
4508 /* Currently we only get here when
4509
4510 PL_rekind[OP(text_node)] == EXACT
4511
4512 if this changes back then the macro for IS_TEXT and
4513 friends need to change. */
5f80c4cf 4514 if (!UTF) {
c255a977 4515 ST.c2 = ST.c1 = *s;
ee9b8eae 4516 if (IS_TEXTF(text_node))
c255a977 4517 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4518 else if (IS_TEXTFL(text_node))
c255a977 4519 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4520 }
5f80c4cf 4521 else { /* UTF */
ee9b8eae 4522 if (IS_TEXTF(text_node)) {
a2a2844f 4523 STRLEN ulen1, ulen2;
89ebb4a3
JH
4524 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4525 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4526
4527 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4528 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4529#ifdef EBCDIC
4530 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4531 ckWARN(WARN_UTF8) ?
4532 0 : UTF8_ALLOW_ANY);
4533 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4534 ckWARN(WARN_UTF8) ?
4535 0 : UTF8_ALLOW_ANY);
4536#else
c255a977 4537 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4538 uniflags);
c255a977 4539 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4540 uniflags);
4541#endif
5f80c4cf
JP
4542 }
4543 else {
c255a977 4544 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4545 uniflags);
5f80c4cf 4546 }
1aa99e6b
IH
4547 }
4548 }
bbce6d69 4549 }
a0d0e21e 4550 else
c255a977 4551 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4552 assume_ok_easy:
c255a977
DM
4553
4554 ST.A = scan;
4555 ST.B = next;
3280af22 4556 PL_reginput = locinput;
24d3c4a9
DM
4557 if (minmod) {
4558 minmod = 0;
e2e6a0f1 4559 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4560 sayNO;
c255a977 4561 ST.count = ST.min;
a0ed51b3 4562 locinput = PL_reginput;
c255a977
DM
4563 REGCP_SET(ST.cp);
4564 if (ST.c1 == CHRTEST_VOID)
4565 goto curly_try_B_min;
4566
4567 ST.oldloc = locinput;
4568
4569 /* set ST.maxpos to the furthest point along the
4570 * string that could possibly match */
4571 if (ST.max == REG_INFTY) {
4572 ST.maxpos = PL_regeol - 1;
4573 if (do_utf8)
4574 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4575 ST.maxpos--;
4576 }
4577 else if (do_utf8) {
4578 int m = ST.max - ST.min;
4579 for (ST.maxpos = locinput;
4580 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4581 ST.maxpos += UTF8SKIP(ST.maxpos);
4582 }
4583 else {
4584 ST.maxpos = locinput + ST.max - ST.min;
4585 if (ST.maxpos >= PL_regeol)
4586 ST.maxpos = PL_regeol - 1;
4587 }
4588 goto curly_try_B_min_known;
4589
4590 }
4591 else {
e2e6a0f1 4592 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4593 locinput = PL_reginput;
4594 if (ST.count < ST.min)
4595 sayNO;
4596 if ((ST.count > ST.min)
4597 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4598 {
4599 /* A{m,n} must come at the end of the string, there's
4600 * no point in backing off ... */
4601 ST.min = ST.count;
4602 /* ...except that $ and \Z can match before *and* after
4603 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4604 We may back off by one in this case. */
4605 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4606 ST.min--;
4607 }
4608 REGCP_SET(ST.cp);
4609 goto curly_try_B_max;
4610 }
4611 /* NOTREACHED */
4612
4613
4614 case CURLY_B_min_known_fail:
4615 /* failed to find B in a non-greedy match where c1,c2 valid */
4616 if (ST.paren && ST.count)
4617 PL_regendp[ST.paren] = -1;
4618
4619 PL_reginput = locinput; /* Could be reset... */
4620 REGCP_UNWIND(ST.cp);
4621 /* Couldn't or didn't -- move forward. */
4622 ST.oldloc = locinput;
4623 if (do_utf8)
4624 locinput += UTF8SKIP(locinput);
4625 else
4626 locinput++;
4627 ST.count++;
4628 curly_try_B_min_known:
4629 /* find the next place where 'B' could work, then call B */
4630 {
4631 int n;
4632 if (do_utf8) {
4633 n = (ST.oldloc == locinput) ? 0 : 1;
4634 if (ST.c1 == ST.c2) {
4635 STRLEN len;
4636 /* set n to utf8_distance(oldloc, locinput) */
4637 while (locinput <= ST.maxpos &&
4638 utf8n_to_uvchr((U8*)locinput,
4639 UTF8_MAXBYTES, &len,
4640 uniflags) != (UV)ST.c1) {
4641 locinput += len;
4642 n++;
4643 }
1aa99e6b
IH
4644 }
4645 else {
c255a977
DM
4646 /* set n to utf8_distance(oldloc, locinput) */
4647 while (locinput <= ST.maxpos) {
4648 STRLEN len;
4649 const UV c = utf8n_to_uvchr((U8*)locinput,
4650 UTF8_MAXBYTES, &len,
4651 uniflags);
4652 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4653 break;
4654 locinput += len;
4655 n++;
1aa99e6b 4656 }
0fe9bf95
IZ
4657 }
4658 }
c255a977
DM
4659 else {
4660 if (ST.c1 == ST.c2) {
4661 while (locinput <= ST.maxpos &&
4662 UCHARAT(locinput) != ST.c1)
4663 locinput++;
bbce6d69 4664 }
c255a977
DM
4665 else {
4666 while (locinput <= ST.maxpos
4667 && UCHARAT(locinput) != ST.c1
4668 && UCHARAT(locinput) != ST.c2)
4669 locinput++;
a0ed51b3 4670 }
c255a977
DM
4671 n = locinput - ST.oldloc;
4672 }
4673 if (locinput > ST.maxpos)
4674 sayNO;
4675 /* PL_reginput == oldloc now */
4676 if (n) {
4677 ST.count += n;
e2e6a0f1 4678 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4679 sayNO;
a0d0e21e 4680 }
c255a977
DM
4681 PL_reginput = locinput;
4682 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4683 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4684 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4685 goto fake_end;
4686 }
c255a977 4687 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4688 }
c255a977
DM
4689 /* NOTREACHED */
4690
4691
4692 case CURLY_B_min_fail:
4693 /* failed to find B in a non-greedy match where c1,c2 invalid */
4694 if (ST.paren && ST.count)
4695 PL_regendp[ST.paren] = -1;
4696
4697 REGCP_UNWIND(ST.cp);
4698 /* failed -- move forward one */
4699 PL_reginput = locinput;
e2e6a0f1 4700 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4701 ST.count++;
a0ed51b3 4702 locinput = PL_reginput;
c255a977
DM
4703 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4704 ST.count > 0)) /* count overflow ? */
15272685 4705 {
c255a977
DM
4706 curly_try_B_min:
4707 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4708 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4709 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4710 goto fake_end;
4711 }
c255a977 4712 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4713 }
4714 }
4633a7c4 4715 sayNO;
c255a977
DM
4716 /* NOTREACHED */
4717
4718
4719 curly_try_B_max:
4720 /* a successful greedy match: now try to match B */
40d049e4 4721 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4722 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4723 goto fake_end;
4724 }
c255a977
DM
4725 {
4726 UV c = 0;
4727 if (ST.c1 != CHRTEST_VOID)
4728 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4729 UTF8_MAXBYTES, 0, uniflags)
466787eb 4730 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4731 /* If it could work, try it. */
4732 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4733 CURLY_SETPAREN(ST.paren, ST.count);
4734 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4735 /* NOTREACHED */
4736 }
4737 }
4738 /* FALL THROUGH */
4739 case CURLY_B_max_fail:
4740 /* failed to find B in a greedy match */
4741 if (ST.paren && ST.count)
4742 PL_regendp[ST.paren] = -1;
4743
4744 REGCP_UNWIND(ST.cp);
4745 /* back up. */
4746 if (--ST.count < ST.min)
4747 sayNO;
4748 PL_reginput = locinput = HOPc(locinput, -1);
4749 goto curly_try_B_max;
4750
4751#undef ST
4752
a0d0e21e 4753 case END:
6bda09f9 4754 fake_end:
faec1544
DM
4755 if (cur_eval) {
4756 /* we've just finished A in /(??{A})B/; now continue with B */
4757 I32 tmpix;
faec1544
DM
4758 st->u.eval.toggle_reg_flags
4759 = cur_eval->u.eval.toggle_reg_flags;
4760 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4761
4762 st->u.eval.prev_rex = rex; /* inner */
f8fc2ecf
YO
4763 rex = cur_eval->u.eval.prev_rex; /* outer */
4764 rexi = RXi_GET(rex);
faec1544
DM
4765 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4766 ReREFCNT_inc(rex);
4767 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4768 REGCP_SET(st->u.eval.lastcp);
4769 PL_reginput = locinput;
4770
4771 /* Restore parens of the outer rex without popping the
4772 * savestack */
4773 tmpix = PL_savestack_ix;
4774 PL_savestack_ix = cur_eval->u.eval.lastcp;
4775 regcppop(rex);
4776 PL_savestack_ix = tmpix;
4777
4778 st->u.eval.prev_eval = cur_eval;
4779 cur_eval = cur_eval->u.eval.prev_eval;
4780 DEBUG_EXECUTE_r(
2a49f0f5
JH
4781 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4782 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4783 if ( nochange_depth )
4784 nochange_depth--;
4785
4786 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4787 st->u.eval.prev_eval->u.eval.B); /* match B */
4788 }
4789
3b0527fe 4790 if (locinput < reginfo->till) {
a3621e74 4791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4792 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4793 PL_colors[4],
4794 (long)(locinput - PL_reg_starttry),
3b0527fe 4795 (long)(reginfo->till - PL_reg_starttry),
7821416a 4796 PL_colors[5]));
58e23c8d 4797
262b90c4 4798 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4799 }
4800 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4801 sayYES; /* Success! */
dad79028
DM
4802
4803 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4804 DEBUG_EXECUTE_r(
4805 PerlIO_printf(Perl_debug_log,
4806 "%*s %ssubpattern success...%s\n",
5bc10b2c 4807 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4808 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4809 sayYES; /* Success! */
dad79028 4810
40a82448
DM
4811#undef ST
4812#define ST st->u.ifmatch
4813
4814 case SUSPEND: /* (?>A) */
4815 ST.wanted = 1;
9fe1d20c 4816 PL_reginput = locinput;
9041c2e3 4817 goto do_ifmatch;
dad79028 4818
40a82448
DM
4819 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4820 ST.wanted = 0;
dad79028
DM
4821 goto ifmatch_trivial_fail_test;
4822
40a82448
DM
4823 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4824 ST.wanted = 1;
dad79028 4825 ifmatch_trivial_fail_test:
a0ed51b3 4826 if (scan->flags) {
52657f30 4827 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4828 if (!s) {
4829 /* trivial fail */
24d3c4a9
DM
4830 if (logical) {
4831 logical = 0;
4832 sw = 1 - (bool)ST.wanted;
dad79028 4833 }
40a82448 4834 else if (ST.wanted)
dad79028
DM
4835 sayNO;
4836 next = scan + ARG(scan);
4837 if (next == scan)
4838 next = NULL;
4839 break;
4840 }
efb30f32 4841 PL_reginput = s;
a0ed51b3
LW
4842 }
4843 else
4844 PL_reginput = locinput;
4845
c277df42 4846 do_ifmatch:
40a82448 4847 ST.me = scan;
24d3c4a9 4848 ST.logical = logical;
40a82448
DM
4849 /* execute body of (?...A) */
4850 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4851 /* NOTREACHED */
4852
4853 case IFMATCH_A_fail: /* body of (?...A) failed */
4854 ST.wanted = !ST.wanted;
4855 /* FALL THROUGH */
4856
4857 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4858 if (ST.logical) {
4859 sw = (bool)ST.wanted;
40a82448
DM
4860 }
4861 else if (!ST.wanted)
4862 sayNO;
4863
4864 if (OP(ST.me) == SUSPEND)
4865 locinput = PL_reginput;
4866 else {
4867 locinput = PL_reginput = st->locinput;
4868 nextchr = UCHARAT(locinput);
4869 }
4870 scan = ST.me + ARG(ST.me);
4871 if (scan == ST.me)
4872 scan = NULL;
4873 continue; /* execute B */
4874
4875#undef ST
dad79028 4876
c277df42 4877 case LONGJMP:
c277df42
IZ
4878 next = scan + ARG(scan);
4879 if (next == scan)
4880 next = NULL;
a0d0e21e 4881 break;
54612592 4882 case COMMIT:
e2e6a0f1
YO
4883 reginfo->cutpoint = PL_regeol;
4884 /* FALLTHROUGH */
5d458dd8 4885 case PRUNE:
24b23f37 4886 PL_reginput = locinput;
e2e6a0f1 4887 if (!scan->flags)
f8fc2ecf 4888 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
54612592
YO
4889 PUSH_STATE_GOTO(COMMIT_next,next);
4890 /* NOTREACHED */
4891 case COMMIT_next_fail:
4892 no_final = 1;
4893 /* FALLTHROUGH */
7f69552c
YO
4894 case OPFAIL:
4895 sayNO;
e2e6a0f1
YO
4896 /* NOTREACHED */
4897
4898#define ST st->u.mark
4899 case MARKPOINT:
4900 ST.prev_mark = mark_state;
5d458dd8 4901 ST.mark_name = sv_commit = sv_yes_mark
f8fc2ecf 4902 = (SV*)rexi->data->data[ ARG( scan ) ];
e2e6a0f1
YO
4903 mark_state = st;
4904 ST.mark_loc = PL_reginput = locinput;
4905 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4906 /* NOTREACHED */
4907 case MARKPOINT_next:
4908 mark_state = ST.prev_mark;
4909 sayYES;
4910 /* NOTREACHED */
4911 case MARKPOINT_next_fail:
5d458dd8 4912 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
4913 {
4914 if (ST.mark_loc > startpoint)
4915 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4916 popmark = NULL; /* we found our mark */
4917 sv_commit = ST.mark_name;
4918
4919 DEBUG_EXECUTE_r({
5d458dd8 4920 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
4921 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4922 REPORT_CODE_OFF+depth*2, "",
be2597df 4923 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
4924 });
4925 }
4926 mark_state = ST.prev_mark;
5d458dd8
YO
4927 sv_yes_mark = mark_state ?
4928 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
4929 sayNO;
4930 /* NOTREACHED */
5d458dd8
YO
4931 case SKIP:
4932 PL_reginput = locinput;
4933 if (scan->flags) {
2bf803e2 4934 /* (*SKIP) : if we fail we cut here*/
5d458dd8 4935 ST.mark_name = NULL;
e2e6a0f1 4936 ST.mark_loc = locinput;
5d458dd8
YO
4937 PUSH_STATE_GOTO(SKIP_next,next);
4938 } else {
2bf803e2 4939 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
4940 otherwise do nothing. Meaning we need to scan
4941 */
4942 regmatch_state *cur = mark_state;
f8fc2ecf 4943 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4944
4945 while (cur) {
4946 if ( sv_eq( cur->u.mark.mark_name,
4947 find ) )
4948 {
4949 ST.mark_name = find;
4950 PUSH_STATE_GOTO( SKIP_next, next );
4951 }
4952 cur = cur->u.mark.prev_mark;
4953 }
e2e6a0f1 4954 }
2bf803e2 4955 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
4956 break;
4957 case SKIP_next_fail:
4958 if (ST.mark_name) {
4959 /* (*CUT:NAME) - Set up to search for the name as we
4960 collapse the stack*/
4961 popmark = ST.mark_name;
4962 } else {
4963 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
4964 if (ST.mark_loc > startpoint)
4965 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
4966 /* but we set sv_commit to latest mark_name if there
4967 is one so they can test to see how things lead to this
4968 cut */
4969 if (mark_state)
4970 sv_commit=mark_state->u.mark.mark_name;
4971 }
e2e6a0f1
YO
4972 no_final = 1;
4973 sayNO;
4974 /* NOTREACHED */
4975#undef ST
4976
a0d0e21e 4977 default:
b900a521 4978 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4979 PTR2UV(scan), OP(scan));
cea2e8a9 4980 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
4981
4982 } /* end switch */
95b24440 4983
5d458dd8
YO
4984 /* switch break jumps here */
4985 scan = next; /* prepare to execute the next op and ... */
4986 continue; /* ... jump back to the top, reusing st */
95b24440
DM
4987 /* NOTREACHED */
4988
40a82448
DM
4989 push_yes_state:
4990 /* push a state that backtracks on success */
4991 st->u.yes.prev_yes_state = yes_state;
4992 yes_state = st;
4993 /* FALL THROUGH */
4994 push_state:
4995 /* push a new regex state, then continue at scan */
4996 {
4997 regmatch_state *newst;
4998
24b23f37
YO
4999 DEBUG_STACK_r({
5000 regmatch_state *cur = st;
5001 regmatch_state *curyes = yes_state;
5002 int curd = depth;
5003 regmatch_slab *slab = PL_regmatch_slab;
5004 for (;curd > -1;cur--,curd--) {
5005 if (cur < SLAB_FIRST(slab)) {
5006 slab = slab->prev;
5007 cur = SLAB_LAST(slab);
5008 }
5009 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5010 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5011 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5012 (curyes == cur) ? "yes" : ""
5013 );
5014 if (curyes == cur)
5015 curyes = cur->u.yes.prev_yes_state;
5016 }
5017 } else
5018 DEBUG_STATE_pp("push")
5019 );
40a82448 5020 depth++;
40a82448
DM
5021 st->locinput = locinput;
5022 newst = st+1;
5023 if (newst > SLAB_LAST(PL_regmatch_slab))
5024 newst = S_push_slab(aTHX);
5025 PL_regmatch_state = newst;
786e8c11 5026
40a82448
DM
5027 locinput = PL_reginput;
5028 nextchr = UCHARAT(locinput);
5029 st = newst;
5030 continue;
5031 /* NOTREACHED */
5032 }
a0d0e21e 5033 }
a687059c 5034
a0d0e21e
LW
5035 /*
5036 * We get here only if there's trouble -- normally "case END" is
5037 * the terminating point.
5038 */
cea2e8a9 5039 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5040 /*NOTREACHED*/
4633a7c4
LW
5041 sayNO;
5042
262b90c4 5043yes:
77cb431f
DM
5044 if (yes_state) {
5045 /* we have successfully completed a subexpression, but we must now
5046 * pop to the state marked by yes_state and continue from there */
77cb431f 5047 assert(st != yes_state);
5bc10b2c
DM
5048#ifdef DEBUGGING
5049 while (st != yes_state) {
5050 st--;
5051 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5052 PL_regmatch_slab = PL_regmatch_slab->prev;
5053 st = SLAB_LAST(PL_regmatch_slab);
5054 }
e2e6a0f1 5055 DEBUG_STATE_r({
54612592
YO
5056 if (no_final) {
5057 DEBUG_STATE_pp("pop (no final)");
5058 } else {
5059 DEBUG_STATE_pp("pop (yes)");
5060 }
e2e6a0f1 5061 });
5bc10b2c
DM
5062 depth--;
5063 }
5064#else
77cb431f
DM
5065 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5066 || yes_state > SLAB_LAST(PL_regmatch_slab))
5067 {
5068 /* not in this slab, pop slab */
5069 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5070 PL_regmatch_slab = PL_regmatch_slab->prev;
5071 st = SLAB_LAST(PL_regmatch_slab);
5072 }
5073 depth -= (st - yes_state);
5bc10b2c 5074#endif
77cb431f
DM
5075 st = yes_state;
5076 yes_state = st->u.yes.prev_yes_state;
5077 PL_regmatch_state = st;
24b23f37 5078
5d458dd8
YO
5079 if (no_final) {
5080 locinput= st->locinput;
5081 nextchr = UCHARAT(locinput);
5082 }
54612592 5083 state_num = st->resume_state + no_final;
24d3c4a9 5084 goto reenter_switch;
77cb431f
DM
5085 }
5086
a3621e74 5087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5088 PL_colors[4], PL_colors[5]));
02db2b7b 5089
95b24440 5090 result = 1;
aa283a38 5091 goto final_exit;
4633a7c4
LW
5092
5093no:
a3621e74 5094 DEBUG_EXECUTE_r(
7821416a 5095 PerlIO_printf(Perl_debug_log,
786e8c11 5096 "%*s %sfailed...%s\n",
5bc10b2c 5097 REPORT_CODE_OFF+depth*2, "",
786e8c11 5098 PL_colors[4], PL_colors[5])
7821416a 5099 );
aa283a38 5100
262b90c4 5101no_silent:
54612592
YO
5102 if (no_final) {
5103 if (yes_state) {
5104 goto yes;
5105 } else {
5106 goto final_exit;
5107 }
5108 }
aa283a38
DM
5109 if (depth) {
5110 /* there's a previous state to backtrack to */
40a82448
DM
5111 st--;
5112 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5113 PL_regmatch_slab = PL_regmatch_slab->prev;
5114 st = SLAB_LAST(PL_regmatch_slab);
5115 }
5116 PL_regmatch_state = st;
40a82448
DM
5117 locinput= st->locinput;
5118 nextchr = UCHARAT(locinput);
5119
5bc10b2c
DM
5120 DEBUG_STATE_pp("pop");
5121 depth--;
262b90c4
DM
5122 if (yes_state == st)
5123 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5124
24d3c4a9
DM
5125 state_num = st->resume_state + 1; /* failure = success + 1 */
5126 goto reenter_switch;
95b24440 5127 }
24d3c4a9 5128 result = 0;
aa283a38 5129
262b90c4 5130 final_exit:
bbe252da 5131 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5132 SV *sv_err = get_sv("REGERROR", 1);
5133 SV *sv_mrk = get_sv("REGMARK", 1);
5134 if (result) {
e2e6a0f1 5135 sv_commit = &PL_sv_no;
5d458dd8
YO
5136 if (!sv_yes_mark)
5137 sv_yes_mark = &PL_sv_yes;
5138 } else {
5139 if (!sv_commit)
5140 sv_commit = &PL_sv_yes;
5141 sv_yes_mark = &PL_sv_no;
5142 }
5143 sv_setsv(sv_err, sv_commit);
5144 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5145 }
5d9a96ca
DM
5146 /* restore original high-water mark */
5147 PL_regmatch_slab = orig_slab;
5148 PL_regmatch_state = orig_state;
5149
5150 /* free all slabs above current one */
5151 if (orig_slab->next) {
c4fd8992 5152 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
5153 orig_slab->next = NULL;
5154 while (sl) {
c4fd8992 5155 regmatch_slab * const osl = sl;
5d9a96ca 5156 sl = sl->next;
ad65c075 5157 Safefree(osl);
5d9a96ca
DM
5158 }
5159 }
5160
95b24440 5161 return result;
a687059c
LW
5162}
5163
5164/*
5165 - regrepeat - repeatedly match something simple, report how many
5166 */
5167/*
5168 * [This routine now assumes that it will only match on things of length 1.
5169 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5170 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5171 */
76e3520e 5172STATIC I32
e2e6a0f1 5173S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5174{
27da23d5 5175 dVAR;
a0d0e21e 5176 register char *scan;
a0d0e21e 5177 register I32 c;
3280af22 5178 register char *loceol = PL_regeol;
a0ed51b3 5179 register I32 hardcount = 0;
53c4c00c 5180 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5181#ifndef DEBUGGING
5182 PERL_UNUSED_ARG(depth);
5183#endif
a0d0e21e 5184
3280af22 5185 scan = PL_reginput;
faf11cac
HS
5186 if (max == REG_INFTY)
5187 max = I32_MAX;
5188 else if (max < loceol - scan)
7f596f4c 5189 loceol = scan + max;
a0d0e21e 5190 switch (OP(p)) {
22c35a8c 5191 case REG_ANY:
1aa99e6b 5192 if (do_utf8) {
ffc61ed2 5193 loceol = PL_regeol;
1aa99e6b 5194 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5195 scan += UTF8SKIP(scan);
5196 hardcount++;
5197 }
5198 } else {
5199 while (scan < loceol && *scan != '\n')
5200 scan++;
a0ed51b3
LW
5201 }
5202 break;
ffc61ed2 5203 case SANY:
def8e4ea
JH
5204 if (do_utf8) {
5205 loceol = PL_regeol;
a0804c9e 5206 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5207 scan += UTF8SKIP(scan);
5208 hardcount++;
5209 }
5210 }
5211 else
5212 scan = loceol;
a0ed51b3 5213 break;
f33976b4
DB
5214 case CANY:
5215 scan = loceol;
5216 break;
090f7165
JH
5217 case EXACT: /* length of string is 1 */
5218 c = (U8)*STRING(p);
5219 while (scan < loceol && UCHARAT(scan) == c)
5220 scan++;
bbce6d69 5221 break;
5222 case EXACTF: /* length of string is 1 */
cd439c50 5223 c = (U8)*STRING(p);
bbce6d69 5224 while (scan < loceol &&
22c35a8c 5225 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5226 scan++;
5227 break;
5228 case EXACTFL: /* length of string is 1 */
3280af22 5229 PL_reg_flags |= RF_tainted;
cd439c50 5230 c = (U8)*STRING(p);
bbce6d69 5231 while (scan < loceol &&
22c35a8c 5232 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5233 scan++;
5234 break;
5235 case ANYOF:
ffc61ed2
JH
5236 if (do_utf8) {
5237 loceol = PL_regeol;
cfc92286 5238 while (hardcount < max && scan < loceol &&
32fc9b6a 5239 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5240 scan += UTF8SKIP(scan);
5241 hardcount++;
5242 }
5243 } else {
32fc9b6a 5244 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5245 scan++;
5246 }
a0d0e21e
LW
5247 break;
5248 case ALNUM:
1aa99e6b 5249 if (do_utf8) {
ffc61ed2 5250 loceol = PL_regeol;
1a4fad37 5251 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5252 while (hardcount < max && scan < loceol &&
3568d838 5253 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5254 scan += UTF8SKIP(scan);
5255 hardcount++;
5256 }
5257 } else {
5258 while (scan < loceol && isALNUM(*scan))
5259 scan++;
a0ed51b3
LW
5260 }
5261 break;
bbce6d69 5262 case ALNUML:
3280af22 5263 PL_reg_flags |= RF_tainted;
1aa99e6b 5264 if (do_utf8) {
ffc61ed2 5265 loceol = PL_regeol;
1aa99e6b
IH
5266 while (hardcount < max && scan < loceol &&
5267 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5268 scan += UTF8SKIP(scan);
5269 hardcount++;
5270 }
5271 } else {
5272 while (scan < loceol && isALNUM_LC(*scan))
5273 scan++;
a0ed51b3
LW
5274 }
5275 break;
a0d0e21e 5276 case NALNUM:
1aa99e6b 5277 if (do_utf8) {
ffc61ed2 5278 loceol = PL_regeol;
1a4fad37 5279 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5280 while (hardcount < max && scan < loceol &&
3568d838 5281 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5282 scan += UTF8SKIP(scan);
5283 hardcount++;
5284 }
5285 } else {
5286 while (scan < loceol && !isALNUM(*scan))
5287 scan++;
a0ed51b3
LW
5288 }
5289 break;
bbce6d69 5290 case NALNUML:
3280af22 5291 PL_reg_flags |= RF_tainted;
1aa99e6b 5292 if (do_utf8) {
ffc61ed2 5293 loceol = PL_regeol;
1aa99e6b
IH
5294 while (hardcount < max && scan < loceol &&
5295 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5296 scan += UTF8SKIP(scan);
5297 hardcount++;
5298 }
5299 } else {
5300 while (scan < loceol && !isALNUM_LC(*scan))
5301 scan++;
a0ed51b3
LW
5302 }
5303 break;
a0d0e21e 5304 case SPACE:
1aa99e6b 5305 if (do_utf8) {
ffc61ed2 5306 loceol = PL_regeol;
1a4fad37 5307 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5308 while (hardcount < max && scan < loceol &&
3568d838
JH
5309 (*scan == ' ' ||
5310 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5311 scan += UTF8SKIP(scan);
5312 hardcount++;
5313 }
5314 } else {
5315 while (scan < loceol && isSPACE(*scan))
5316 scan++;
a0ed51b3
LW
5317 }
5318 break;
bbce6d69 5319 case SPACEL:
3280af22 5320 PL_reg_flags |= RF_tainted;
1aa99e6b 5321 if (do_utf8) {
ffc61ed2 5322 loceol = PL_regeol;
1aa99e6b 5323 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5324 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5325 scan += UTF8SKIP(scan);
5326 hardcount++;
5327 }
5328 } else {
5329 while (scan < loceol && isSPACE_LC(*scan))
5330 scan++;
a0ed51b3
LW
5331 }
5332 break;
a0d0e21e 5333 case NSPACE:
1aa99e6b 5334 if (do_utf8) {
ffc61ed2 5335 loceol = PL_regeol;
1a4fad37 5336 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5337 while (hardcount < max && scan < loceol &&
3568d838
JH
5338 !(*scan == ' ' ||
5339 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5340 scan += UTF8SKIP(scan);
5341 hardcount++;
5342 }
5343 } else {
5344 while (scan < loceol && !isSPACE(*scan))
5345 scan++;
5346 break;
a0ed51b3 5347 }
bbce6d69 5348 case NSPACEL:
3280af22 5349 PL_reg_flags |= RF_tainted;
1aa99e6b 5350 if (do_utf8) {
ffc61ed2 5351 loceol = PL_regeol;
1aa99e6b 5352 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5353 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5354 scan += UTF8SKIP(scan);
5355 hardcount++;
5356 }
5357 } else {
5358 while (scan < loceol && !isSPACE_LC(*scan))
5359 scan++;
a0ed51b3
LW
5360 }
5361 break;
a0d0e21e 5362 case DIGIT:
1aa99e6b 5363 if (do_utf8) {
ffc61ed2 5364 loceol = PL_regeol;
1a4fad37 5365 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5366 while (hardcount < max && scan < loceol &&
3568d838 5367 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5368 scan += UTF8SKIP(scan);
5369 hardcount++;
5370 }
5371 } else {
5372 while (scan < loceol && isDIGIT(*scan))
5373 scan++;
a0ed51b3
LW
5374 }
5375 break;
a0d0e21e 5376 case NDIGIT:
1aa99e6b 5377 if (do_utf8) {
ffc61ed2 5378 loceol = PL_regeol;
1a4fad37 5379 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5380 while (hardcount < max && scan < loceol &&
3568d838 5381 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5382 scan += UTF8SKIP(scan);
5383 hardcount++;
5384 }
5385 } else {
5386 while (scan < loceol && !isDIGIT(*scan))
5387 scan++;
a0ed51b3
LW
5388 }
5389 break;
a0d0e21e
LW
5390 default: /* Called on something of 0 width. */
5391 break; /* So match right here or not at all. */
5392 }
a687059c 5393
a0ed51b3
LW
5394 if (hardcount)
5395 c = hardcount;
5396 else
5397 c = scan - PL_reginput;
3280af22 5398 PL_reginput = scan;
a687059c 5399
a3621e74 5400 DEBUG_r({
e68ec53f 5401 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5402 DEBUG_EXECUTE_r({
e68ec53f
YO
5403 SV * const prop = sv_newmortal();
5404 regprop(prog, prop, p);
5405 PerlIO_printf(Perl_debug_log,
be8e71aa 5406 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5407 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5408 });
be8e71aa 5409 });
9041c2e3 5410
a0d0e21e 5411 return(c);
a687059c
LW
5412}
5413
c277df42 5414
be8e71aa 5415#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5416/*
ffc61ed2
JH
5417- regclass_swash - prepare the utf8 swash
5418*/
5419
5420SV *
32fc9b6a 5421Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5422{
97aff369 5423 dVAR;
9e55ce06
JH
5424 SV *sw = NULL;
5425 SV *si = NULL;
5426 SV *alt = NULL;
f8fc2ecf
YO
5427 RXi_GET_DECL(prog,progi);
5428 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5429
4f639d21 5430 if (data && data->count) {
a3b680e6 5431 const U32 n = ARG(node);
ffc61ed2 5432
4f639d21
DM
5433 if (data->what[n] == 's') {
5434 SV * const rv = (SV*)data->data[n];
890ce7af 5435 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5436 SV **const ary = AvARRAY(av);
9e55ce06 5437 SV **a, **b;
9041c2e3 5438
711a919c 5439 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5440 * documentation of these array elements. */
5441
b11f357e 5442 si = *ary;
8f7f7219 5443 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5444 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5445
ffc61ed2
JH
5446 if (a)
5447 sw = *a;
5448 else if (si && doinit) {
5449 sw = swash_init("utf8", "", si, 1, 0);
5450 (void)av_store(av, 1, sw);
5451 }
9e55ce06
JH
5452 if (b)
5453 alt = *b;
ffc61ed2
JH
5454 }
5455 }
5456
9e55ce06
JH
5457 if (listsvp)
5458 *listsvp = si;
5459 if (altsvp)
5460 *altsvp = alt;
ffc61ed2
JH
5461
5462 return sw;
5463}
76234dfb 5464#endif
ffc61ed2
JH
5465
5466/*
ba7b4546 5467 - reginclass - determine if a character falls into a character class
832705d4
JH
5468
5469 The n is the ANYOF regnode, the p is the target string, lenp
5470 is pointer to the maximum length of how far to go in the p
5471 (if the lenp is zero, UTF8SKIP(p) is used),
5472 do_utf8 tells whether the target string is in UTF-8.
5473
bbce6d69 5474 */
5475
76e3520e 5476STATIC bool
32fc9b6a 5477S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5478{
27da23d5 5479 dVAR;
a3b680e6 5480 const char flags = ANYOF_FLAGS(n);
bbce6d69 5481 bool match = FALSE;
cc07378b 5482 UV c = *p;
ae9ddab8 5483 STRLEN len = 0;
9e55ce06 5484 STRLEN plen;
1aa99e6b 5485
19f67299
TS
5486 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5487 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5488 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5489 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5490 if (len == (STRLEN)-1)
5491 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5492 }
bbce6d69 5493
0f0076b4 5494 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5495 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5496 if (lenp)
5497 *lenp = 0;
ffc61ed2 5498 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5499 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5500 match = TRUE;
bbce6d69 5501 }
3568d838 5502 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5503 match = TRUE;
ffc61ed2 5504 if (!match) {
9e55ce06 5505 AV *av;
32fc9b6a 5506 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5507
5508 if (sw) {
3568d838 5509 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5510 match = TRUE;
5511 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5512 if (!match && lenp && av) {
5513 I32 i;
9e55ce06 5514 for (i = 0; i <= av_len(av); i++) {
890ce7af 5515 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5516 STRLEN len;
890ce7af 5517 const char * const s = SvPV_const(sv, len);
9e55ce06 5518
061b10df 5519 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5520 *lenp = len;
5521 match = TRUE;
5522 break;
5523 }
5524 }
5525 }
5526 if (!match) {
89ebb4a3 5527 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5528 STRLEN tmplen;
5529
9e55ce06
JH
5530 to_utf8_fold(p, tmpbuf, &tmplen);
5531 if (swash_fetch(sw, tmpbuf, do_utf8))
5532 match = TRUE;
5533 }
ffc61ed2
JH
5534 }
5535 }
bbce6d69 5536 }
9e55ce06 5537 if (match && lenp && *lenp == 0)
0f0076b4 5538 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5539 }
1aa99e6b 5540 if (!match && c < 256) {
ffc61ed2
JH
5541 if (ANYOF_BITMAP_TEST(n, c))
5542 match = TRUE;
5543 else if (flags & ANYOF_FOLD) {
eb160463 5544 U8 f;
a0ed51b3 5545
ffc61ed2
JH
5546 if (flags & ANYOF_LOCALE) {
5547 PL_reg_flags |= RF_tainted;
5548 f = PL_fold_locale[c];
5549 }
5550 else
5551 f = PL_fold[c];
5552 if (f != c && ANYOF_BITMAP_TEST(n, f))
5553 match = TRUE;
5554 }
5555
5556 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5557 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5558 if (
5559 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5560 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5561 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5562 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5563 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5564 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5565 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5566 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5567 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5568 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5569 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5570 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5571 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5572 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5573 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5574 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5575 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5576 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5577 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5578 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5579 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5580 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5581 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5582 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5583 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5584 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5585 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5586 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5587 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5588 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5589 ) /* How's that for a conditional? */
5590 {
5591 match = TRUE;
5592 }
a0ed51b3 5593 }
a0ed51b3
LW
5594 }
5595
a0ed51b3
LW
5596 return (flags & ANYOF_INVERT) ? !match : match;
5597}
161b471a 5598
dfe13c55 5599STATIC U8 *
0ce71af7 5600S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5601{
97aff369 5602 dVAR;
a0ed51b3 5603 if (off >= 0) {
1aa99e6b 5604 while (off-- && s < lim) {
ffc61ed2 5605 /* XXX could check well-formedness here */
a0ed51b3 5606 s += UTF8SKIP(s);
ffc61ed2 5607 }
a0ed51b3
LW
5608 }
5609 else {
1de06328
YO
5610 while (off++ && s > lim) {
5611 s--;
5612 if (UTF8_IS_CONTINUED(*s)) {
5613 while (s > lim && UTF8_IS_CONTINUATION(*s))
5614 s--;
a0ed51b3 5615 }
1de06328 5616 /* XXX could check well-formedness here */
a0ed51b3
LW
5617 }
5618 }
5619 return s;
5620}
161b471a 5621
f9f4320a
YO
5622#ifdef XXX_dmq
5623/* there are a bunch of places where we use two reghop3's that should
5624 be replaced with this routine. but since thats not done yet
5625 we ifdef it out - dmq
5626*/
dfe13c55 5627STATIC U8 *
1de06328
YO
5628S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5629{
5630 dVAR;
5631 if (off >= 0) {
5632 while (off-- && s < rlim) {
5633 /* XXX could check well-formedness here */
5634 s += UTF8SKIP(s);
5635 }
5636 }
5637 else {
5638 while (off++ && s > llim) {
5639 s--;
5640 if (UTF8_IS_CONTINUED(*s)) {
5641 while (s > llim && UTF8_IS_CONTINUATION(*s))
5642 s--;
5643 }
5644 /* XXX could check well-formedness here */
5645 }
5646 }
5647 return s;
5648}
f9f4320a 5649#endif
1de06328
YO
5650
5651STATIC U8 *
0ce71af7 5652S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5653{
97aff369 5654 dVAR;
a0ed51b3 5655 if (off >= 0) {
1aa99e6b 5656 while (off-- && s < lim) {
ffc61ed2 5657 /* XXX could check well-formedness here */
a0ed51b3 5658 s += UTF8SKIP(s);
ffc61ed2 5659 }
a0ed51b3 5660 if (off >= 0)
3dab1dad 5661 return NULL;
a0ed51b3
LW
5662 }
5663 else {
1de06328
YO
5664 while (off++ && s > lim) {
5665 s--;
5666 if (UTF8_IS_CONTINUED(*s)) {
5667 while (s > lim && UTF8_IS_CONTINUATION(*s))
5668 s--;
a0ed51b3 5669 }
1de06328 5670 /* XXX could check well-formedness here */
a0ed51b3
LW
5671 }
5672 if (off <= 0)
3dab1dad 5673 return NULL;
a0ed51b3
LW
5674 }
5675 return s;
5676}
51371543 5677
51371543 5678static void
acfe0abc 5679restore_pos(pTHX_ void *arg)
51371543 5680{
97aff369 5681 dVAR;
097eb12c 5682 regexp * const rex = (regexp *)arg;
51371543
GS
5683 if (PL_reg_eval_set) {
5684 if (PL_reg_oldsaved) {
4f639d21
DM
5685 rex->subbeg = PL_reg_oldsaved;
5686 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5687#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5688 rex->saved_copy = PL_nrs;
ed252734 5689#endif
4f639d21 5690 RX_MATCH_COPIED_on(rex);
51371543
GS
5691 }
5692 PL_reg_magic->mg_len = PL_reg_oldpos;
5693 PL_reg_eval_set = 0;
5694 PL_curpm = PL_reg_oldcurpm;
5695 }
5696}
33b8afdf
JH
5697
5698STATIC void
5699S_to_utf8_substr(pTHX_ register regexp *prog)
5700{
a1cac82e
NC
5701 int i = 1;
5702 do {
5703 if (prog->substrs->data[i].substr
5704 && !prog->substrs->data[i].utf8_substr) {
5705 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5706 prog->substrs->data[i].utf8_substr = sv;
5707 sv_utf8_upgrade(sv);
610460f9
NC
5708 if (SvVALID(prog->substrs->data[i].substr)) {
5709 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5710 if (flags & FBMcf_TAIL) {
5711 /* Trim the trailing \n that fbm_compile added last
5712 time. */
5713 SvCUR_set(sv, SvCUR(sv) - 1);
5714 /* Whilst this makes the SV technically "invalid" (as its
5715 buffer is no longer followed by "\0") when fbm_compile()
5716 adds the "\n" back, a "\0" is restored. */
5717 }
5718 fbm_compile(sv, flags);
5719 }
a1cac82e
NC
5720 if (prog->substrs->data[i].substr == prog->check_substr)
5721 prog->check_utf8 = sv;
5722 }
5723 } while (i--);
33b8afdf
JH
5724}
5725
5726STATIC void
5727S_to_byte_substr(pTHX_ register regexp *prog)
5728{
97aff369 5729 dVAR;
a1cac82e
NC
5730 int i = 1;
5731 do {
5732 if (prog->substrs->data[i].utf8_substr
5733 && !prog->substrs->data[i].substr) {
5734 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5735 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
5736 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5737 const U8 flags
5738 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5739 if (flags & FBMcf_TAIL) {
5740 /* Trim the trailing \n that fbm_compile added last
5741 time. */
5742 SvCUR_set(sv, SvCUR(sv) - 1);
5743 }
5744 fbm_compile(sv, flags);
5745 }
a1cac82e
NC
5746 } else {
5747 SvREFCNT_dec(sv);
5748 sv = &PL_sv_undef;
5749 }
5750 prog->substrs->data[i].substr = sv;
5751 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5752 prog->check_substr = sv;
33b8afdf 5753 }
a1cac82e 5754 } while (i--);
33b8afdf 5755}
66610fdd
RGS
5756
5757/*
5758 * Local variables:
5759 * c-indentation-style: bsd
5760 * c-basic-offset: 4
5761 * indent-tabs-mode: t
5762 * End:
5763 *
37442d52
RGS
5764 * ex: set ts=8 sts=4 sw=4 noet:
5765 */