This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
add p55.t MAD tests, and convert renamed mad/nomad into a module
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
125/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || \
128 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
129 OP(rn) == EVAL || \
cca55fe3
JP
130 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
131 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 132 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 133 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 134)
ee9b8eae 135#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 136
ee9b8eae
YO
137#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
138
139#if 0
140/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
141 we don't need this definition. */
142#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
143#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
144#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
145
146#else
147/* ... so we use this as its faster. */
148#define IS_TEXT(rn) ( OP(rn)==EXACT )
149#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
150#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
151
152#endif
e2d8ce26 153
a84d97b6
HS
154/*
155 Search for mandatory following text node; for lookahead, the text must
156 follow but for lookbehind (rn->flags != 0) we skip to the next step.
157*/
cca55fe3 158#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
159 while (JUMPABLE(rn)) { \
160 const OPCODE type = OP(rn); \
161 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 162 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 163 else if (type == PLUS) \
cca55fe3 164 rn = NEXTOPER(rn); \
3dab1dad 165 else if (type == IFMATCH) \
a84d97b6 166 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 167 else rn += NEXT_OFF(rn); \
3dab1dad 168 } \
5f80c4cf 169} STMT_END
74750237 170
c476f425 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
97aff369 177 dVAR;
a3b680e6 178 const int retval = PL_savestack_ix;
b1ce53c5 179#define REGCP_PAREN_ELEMS 4
a3b680e6 180 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 181 int p;
40a82448 182 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 183
e49a9654
IH
184 if (paren_elems_to_push < 0)
185 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
186
6bda09f9 187#define REGCP_OTHER_ELEMS 8
4b3c1a47 188 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 189
3280af22 190 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 191/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
192 SSPUSHINT(PL_regendp[p]);
193 SSPUSHINT(PL_regstartp[p]);
3280af22 194 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 195 SSPUSHINT(p);
e7707071 196 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
197 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
198 (UV)p, (IV)PL_regstartp[p],
199 (IV)(PL_reg_start_tmp[p] - PL_bostr),
200 (IV)PL_regendp[p]
201 ));
a0d0e21e 202 }
b1ce53c5 203/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
204 SSPUSHPTR(PL_regstartp);
205 SSPUSHPTR(PL_regendp);
3280af22
NIS
206 SSPUSHINT(PL_regsize);
207 SSPUSHINT(*PL_reglastparen);
a01268b5 208 SSPUSHINT(*PL_reglastcloseparen);
3280af22 209 SSPUSHPTR(PL_reginput);
41123dfd
JH
210#define REGCP_FRAME_ELEMS 2
211/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
212 * are needed for the regexp context stack bookkeeping. */
213 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 214 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 215
a0d0e21e
LW
216 return retval;
217}
218
c277df42 219/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
220#define REGCP_SET(cp) \
221 DEBUG_STATE_r( \
ab3bbdeb 222 PerlIO_printf(Perl_debug_log, \
e4f74956 223 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
224 (IV)PL_savestack_ix)); \
225 cp = PL_savestack_ix
c3464db5 226
ab3bbdeb 227#define REGCP_UNWIND(cp) \
e4f74956 228 DEBUG_STATE_r( \
ab3bbdeb 229 if (cp != PL_savestack_ix) \
e4f74956
YO
230 PerlIO_printf(Perl_debug_log, \
231 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
232 (IV)(cp), (IV)PL_savestack_ix)); \
233 regcpblow(cp)
c277df42 234
76e3520e 235STATIC char *
097eb12c 236S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 237{
97aff369 238 dVAR;
3b6647e0 239 U32 i;
a0d0e21e 240 char *input;
b1ce53c5 241
a3621e74
YO
242 GET_RE_DEBUG_FLAGS_DECL;
243
b1ce53c5 244 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 245 i = SSPOPINT;
b1ce53c5
JH
246 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
247 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 248 input = (char *) SSPOPPTR;
a01268b5 249 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
250 *PL_reglastparen = SSPOPINT;
251 PL_regsize = SSPOPINT;
6bda09f9
YO
252 PL_regendp=(I32 *) SSPOPPTR;
253 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 254
6bda09f9 255
b1ce53c5 256 /* Now restore the parentheses context. */
41123dfd
JH
257 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
258 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 259 I32 tmps;
097eb12c 260 U32 paren = (U32)SSPOPINT;
3280af22 261 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
262 PL_regstartp[paren] = SSPOPINT;
263 tmps = SSPOPINT;
3280af22
NIS
264 if (paren <= *PL_reglastparen)
265 PL_regendp[paren] = tmps;
e7707071 266 DEBUG_BUFFERS_r(
c3464db5 267 PerlIO_printf(Perl_debug_log,
b900a521 268 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 269 (UV)paren, (IV)PL_regstartp[paren],
b900a521 270 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 271 (IV)PL_regendp[paren],
3280af22 272 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 273 );
a0d0e21e 274 }
e7707071 275 DEBUG_BUFFERS_r(
bb7a0f54 276 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 277 PerlIO_printf(Perl_debug_log,
faccc32b 278 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 279 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
280 }
281 );
daf18116 282#if 1
dafc8851
JH
283 /* It would seem that the similar code in regtry()
284 * already takes care of this, and in fact it is in
285 * a better location to since this code can #if 0-ed out
286 * but the code in regtry() is needed or otherwise tests
287 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
288 * (as of patchlevel 7877) will fail. Then again,
289 * this code seems to be necessary or otherwise
290 * building DynaLoader will fail:
291 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
292 * --jhi */
3b6647e0 293 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c
AL
294 if (i > PL_regsize)
295 PL_regstartp[i] = -1;
296 PL_regendp[i] = -1;
a0d0e21e 297 }
dafc8851 298#endif
a0d0e21e
LW
299 return input;
300}
301
02db2b7b 302#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 303
a687059c 304/*
e50aee73 305 * pregexec and friends
a687059c
LW
306 */
307
76234dfb 308#ifndef PERL_IN_XSUB_RE
a687059c 309/*
c277df42 310 - pregexec - match a regexp against a string
a687059c 311 */
c277df42 312I32
864dbfa3 313Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 314 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
315/* strend: pointer to null at end of string */
316/* strbeg: real beginning of string */
317/* minend: end of match must be >=minend after stringarg. */
318/* nosave: For optimizations. */
319{
320 return
9041c2e3 321 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
322 nosave ? 0 : REXEC_COPY_STR);
323}
76234dfb 324#endif
22e551b9 325
9041c2e3 326/*
cad2e5aa
JH
327 * Need to implement the following flags for reg_anch:
328 *
329 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
330 * USE_INTUIT_ML
331 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
332 * INTUIT_AUTORITATIVE_ML
333 * INTUIT_ONCE_NOML - Intuit can match in one location only.
334 * INTUIT_ONCE_ML
335 *
336 * Another flag for this function: SECOND_TIME (so that float substrs
337 * with giant delta may be not rechecked).
338 */
339
340/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
341
3f7c398e 342/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
343 Otherwise, only SvCUR(sv) is used to get strbeg. */
344
345/* XXXX We assume that strpos is strbeg unless sv. */
346
6eb5f6b9
JH
347/* XXXX Some places assume that there is a fixed substring.
348 An update may be needed if optimizer marks as "INTUITable"
349 RExen without fixed substrings. Similarly, it is assumed that
350 lengths of all the strings are no more than minlen, thus they
351 cannot come from lookahead.
40d049e4
YO
352 (Or minlen should take into account lookahead.)
353 NOTE: Some of this comment is not correct. minlen does now take account
354 of lookahead/behind. Further research is required. -- demerphq
355
356*/
6eb5f6b9 357
2c2d71f5
JH
358/* A failure to find a constant substring means that there is no need to make
359 an expensive call to REx engine, thus we celebrate a failure. Similarly,
360 finding a substring too deep into the string means that less calls to
30944b6d
IZ
361 regtry() should be needed.
362
363 REx compiler's optimizer found 4 possible hints:
364 a) Anchored substring;
365 b) Fixed substring;
366 c) Whether we are anchored (beginning-of-line or \G);
367 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 368 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
369 string which does not contradict any of them.
370 */
2c2d71f5 371
6eb5f6b9
JH
372/* Most of decisions we do here should have been done at compile time.
373 The nodes of the REx which we used for the search should have been
374 deleted from the finite automaton. */
375
cad2e5aa
JH
376char *
377Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
378 char *strend, U32 flags, re_scream_pos_data *data)
379{
97aff369 380 dVAR;
b7953727 381 register I32 start_shift = 0;
cad2e5aa 382 /* Should be nonnegative! */
b7953727 383 register I32 end_shift = 0;
2c2d71f5
JH
384 register char *s;
385 register SV *check;
a1933d95 386 char *strbeg;
cad2e5aa 387 char *t;
1de06328 388 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 389 I32 ml_anch;
bd61b366
SS
390 register char *other_last = NULL; /* other substr checked before this */
391 char *check_at = NULL; /* check substr found at this pos */
bbe252da 392 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 393 RXi_GET_DECL(prog,progi);
30944b6d 394#ifdef DEBUGGING
890ce7af 395 const char * const i_strpos = strpos;
30944b6d 396#endif
a3621e74
YO
397
398 GET_RE_DEBUG_FLAGS_DECL;
399
a30b2f1f 400 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 401
bbe252da 402 if (prog->extflags & RXf_UTF8) {
b8d68ded
JH
403 PL_reg_flags |= RF_utf8;
404 }
ab3bbdeb
YO
405 DEBUG_EXECUTE_r(
406 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
407 sv ? "Guessing start of match in sv for"
408 : "Guessing start of match in string for");
2a782b5b 409 );
cad2e5aa 410
c344f387
JH
411 /* CHR_DIST() would be more correct here but it makes things slow. */
412 if (prog->minlen > strend - strpos) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 414 "String too short... [re_intuit_start]\n"));
cad2e5aa 415 goto fail;
2c2d71f5 416 }
1de06328 417
a1933d95 418 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 419 PL_regeol = strend;
33b8afdf
JH
420 if (do_utf8) {
421 if (!prog->check_utf8 && prog->check_substr)
422 to_utf8_substr(prog);
423 check = prog->check_utf8;
424 } else {
425 if (!prog->check_substr && prog->check_utf8)
426 to_byte_substr(prog);
427 check = prog->check_substr;
428 }
1de06328 429 if (check == &PL_sv_undef) {
a3621e74 430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 431 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
432 goto fail;
433 }
bbe252da
YO
434 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
435 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
436 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 437 && !multiline ) ); /* Check after \n? */
cad2e5aa 438
7e25d62c 439 if (!ml_anch) {
bbe252da
YO
440 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
441 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 442 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
443 && sv && !SvROK(sv)
444 && (strpos != strbeg)) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
446 goto fail;
447 }
448 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 449 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 450 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
451 I32 slen;
452
1aa99e6b 453 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 454
653099ff
GS
455 if (SvTAIL(check)) {
456 slen = SvCUR(check); /* >= 1 */
cad2e5aa 457
9041c2e3 458 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 459 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 461 goto fail_finish;
cad2e5aa
JH
462 }
463 /* Now should match s[0..slen-2] */
464 slen--;
3f7c398e 465 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 466 || (slen > 1
3f7c398e 467 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 468 report_neq:
a3621e74 469 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
470 goto fail_finish;
471 }
cad2e5aa 472 }
3f7c398e 473 else if (*SvPVX_const(check) != *s
653099ff 474 || ((slen = SvCUR(check)) > 1
3f7c398e 475 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 476 goto report_neq;
c315bfe8 477 check_at = s;
2c2d71f5 478 goto success_at_start;
7e25d62c 479 }
cad2e5aa 480 }
2c2d71f5 481 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 482 s = strpos;
2c2d71f5 483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
484 end_shift = prog->check_end_shift;
485
2c2d71f5 486 if (!ml_anch) {
a3b680e6 487 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 488 - (SvTAIL(check) != 0);
a3b680e6 489 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
490
491 if (end_shift < eshift)
492 end_shift = eshift;
493 }
cad2e5aa 494 }
2c2d71f5 495 else { /* Can match at random position */
cad2e5aa
JH
496 ml_anch = 0;
497 s = strpos;
1de06328
YO
498 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
499 end_shift = prog->check_end_shift;
500
501 /* end shift should be non negative here */
cad2e5aa
JH
502 }
503
bcdf7404 504#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 505 if (end_shift < 0)
1de06328 506 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 507 (IV)end_shift, prog->precomp);
2c2d71f5
JH
508#endif
509
2c2d71f5
JH
510 restart:
511 /* Find a possible match in the region s..strend by looking for
512 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
513
514 {
515 I32 srch_start_shift = start_shift;
516 I32 srch_end_shift = end_shift;
517 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
518 srch_end_shift -= ((strbeg - s) - srch_start_shift);
519 srch_start_shift = strbeg - s;
520 }
6bda09f9 521 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
522 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
523 (IV)prog->check_offset_min,
524 (IV)srch_start_shift,
525 (IV)srch_end_shift,
526 (IV)prog->check_end_shift);
527 });
528
cad2e5aa 529 if (flags & REXEC_SCREAM) {
cad2e5aa 530 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 531 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 532
2c2d71f5
JH
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
85c508c3 535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 536 && SvTAIL(check) ))
9041c2e3 537 s = screaminstr(sv, check,
1de06328 538 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 539 else
2c2d71f5 540 goto fail_finish;
4addbd3b
HS
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
3f7c398e 543 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
1de06328
YO
547 else {
548 U8* start_point;
549 U8* end_point;
bbe252da 550 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
551 start_point= (U8*)(s + srch_start_shift);
552 end_point= (U8*)(strend - srch_end_shift);
553 } else {
554 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
555 end_point= HOP3(strend, -srch_end_shift, strbeg);
556 }
6bda09f9 557 DEBUG_OPTIMISE_MORE_r({
56570a2c 558 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 559 (int)(end_point - start_point),
fc8cd66c 560 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
561 start_point);
562 });
563
564 s = fbm_instr( start_point, end_point,
7fba1cd6 565 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
566 }
567 }
cad2e5aa
JH
568 /* Update the count-of-usability, remove useless subpatterns,
569 unshift s. */
2c2d71f5 570
ab3bbdeb
YO
571 DEBUG_EXECUTE_r({
572 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
573 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
574 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 575 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
576 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
577 ? "anchored" : "floating"),
578 quoted,
579 RE_SV_TAIL(check),
580 (s ? " at offset " : "...\n") );
581 });
2c2d71f5
JH
582
583 if (!s)
584 goto fail_finish;
2c2d71f5 585 /* Finish the diagnostic message */
a3621e74 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 587
1de06328
YO
588 /* XXX dmq: first branch is for positive lookbehind...
589 Our check string is offset from the beginning of the pattern.
590 So we need to do any stclass tests offset forward from that
591 point. I think. :-(
592 */
593
594
595
596 check_at=s;
597
598
2c2d71f5
JH
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 602 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
605 */
606
1de06328
YO
607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
608 : (prog->float_substr && prog->anchored_substr))
609 {
30944b6d 610 /* Take into account the "other" substring. */
2c2d71f5
JH
611 /* XXXX May be hopelessly wrong for UTF... */
612 if (!other_last)
6eb5f6b9 613 other_last = strpos;
33b8afdf 614 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
615 do_other_anchored:
616 {
890ce7af
AL
617 char * const last = HOP3c(s, -start_shift, strbeg);
618 char *last1, *last2;
be8e71aa 619 char * const saved_s = s;
33b8afdf 620 SV* must;
2c2d71f5 621
2c2d71f5
JH
622 t = s - prog->check_offset_max;
623 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 624 && (!do_utf8
0ce71af7 625 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 626 && t > strpos)))
6f207bd3 627 NOOP;
2c2d71f5
JH
628 else
629 t = strpos;
1aa99e6b 630 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
631 if (t < other_last) /* These positions already checked */
632 t = other_last;
1aa99e6b 633 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
634 if (last < last1)
635 last1 = last;
1de06328
YO
636 /* XXXX It is not documented what units *_offsets are in.
637 We assume bytes, but this is clearly wrong.
638 Meaning this code needs to be carefully reviewed for errors.
639 dmq.
640 */
641
2c2d71f5 642 /* On end-of-str: see comment below. */
33b8afdf
JH
643 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
644 if (must == &PL_sv_undef) {
645 s = (char*)NULL;
1de06328 646 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
647 }
648 else
649 s = fbm_instr(
650 (unsigned char*)t,
651 HOP3(HOP3(last1, prog->anchored_offset, strend)
652 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
653 must,
7fba1cd6 654 multiline ? FBMrf_MULTILINE : 0
33b8afdf 655 );
ab3bbdeb
YO
656 DEBUG_EXECUTE_r({
657 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
658 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
659 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 660 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
661 quoted, RE_SV_TAIL(must));
662 });
663
664
2c2d71f5
JH
665 if (!s) {
666 if (last1 >= last2) {
a3621e74 667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
668 ", giving up...\n"));
669 goto fail_finish;
670 }
a3621e74 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 672 ", trying floating at offset %ld...\n",
be8e71aa 673 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
674 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
675 s = HOP3c(last, 1, strend);
2c2d71f5
JH
676 goto restart;
677 }
678 else {
a3621e74 679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 680 (long)(s - i_strpos)));
1aa99e6b
IH
681 t = HOP3c(s, -prog->anchored_offset, strbeg);
682 other_last = HOP3c(s, 1, strend);
be8e71aa 683 s = saved_s;
2c2d71f5
JH
684 if (t == strpos)
685 goto try_at_start;
2c2d71f5
JH
686 goto try_at_offset;
687 }
30944b6d 688 }
2c2d71f5
JH
689 }
690 else { /* Take into account the floating substring. */
33b8afdf 691 char *last, *last1;
be8e71aa 692 char * const saved_s = s;
33b8afdf
JH
693 SV* must;
694
695 t = HOP3c(s, -start_shift, strbeg);
696 last1 = last =
697 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
698 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
699 last = HOP3c(t, prog->float_max_offset, strend);
700 s = HOP3c(t, prog->float_min_offset, strend);
701 if (s < other_last)
702 s = other_last;
2c2d71f5 703 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
704 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
705 /* fbm_instr() takes into account exact value of end-of-str
706 if the check is SvTAIL(ed). Since false positives are OK,
707 and end-of-str is not later than strend we are OK. */
708 if (must == &PL_sv_undef) {
709 s = (char*)NULL;
1de06328 710 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
711 }
712 else
2c2d71f5 713 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
714 (unsigned char*)last + SvCUR(must)
715 - (SvTAIL(must)!=0),
7fba1cd6 716 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
717 DEBUG_EXECUTE_r({
718 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
719 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
720 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 721 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
722 quoted, RE_SV_TAIL(must));
723 });
33b8afdf
JH
724 if (!s) {
725 if (last1 == last) {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
727 ", giving up...\n"));
728 goto fail_finish;
2c2d71f5 729 }
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 731 ", trying anchored starting at offset %ld...\n",
be8e71aa 732 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
733 other_last = last;
734 s = HOP3c(t, 1, strend);
735 goto restart;
736 }
737 else {
a3621e74 738 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
739 (long)(s - i_strpos)));
740 other_last = s; /* Fix this later. --Hugo */
be8e71aa 741 s = saved_s;
33b8afdf
JH
742 if (t == strpos)
743 goto try_at_start;
744 goto try_at_offset;
745 }
2c2d71f5 746 }
cad2e5aa 747 }
2c2d71f5 748
1de06328 749
9ef43ace 750 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 751
6bda09f9 752 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
753 PerlIO_printf(Perl_debug_log,
754 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
755 (IV)prog->check_offset_min,
756 (IV)prog->check_offset_max,
757 (IV)(s-strpos),
758 (IV)(t-strpos),
759 (IV)(t-s),
760 (IV)(strend-strpos)
761 )
762 );
763
2c2d71f5 764 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 765 && (!do_utf8
9ef43ace 766 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
767 && t > strpos)))
768 {
2c2d71f5
JH
769 /* Fixed substring is found far enough so that the match
770 cannot start at strpos. */
771 try_at_offset:
cad2e5aa 772 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
773 /* Eventually fbm_*() should handle this, but often
774 anchored_offset is not 0, so this check will not be wasted. */
775 /* XXXX In the code below we prefer to look for "^" even in
776 presence of anchored substrings. And we search even
777 beyond the found float position. These pessimizations
778 are historical artefacts only. */
779 find_anchor:
2c2d71f5 780 while (t < strend - prog->minlen) {
cad2e5aa 781 if (*t == '\n') {
4ee3650e 782 if (t < check_at - prog->check_offset_min) {
33b8afdf 783 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
784 /* Since we moved from the found position,
785 we definitely contradict the found anchored
30944b6d
IZ
786 substr. Due to the above check we do not
787 contradict "check" substr.
788 Thus we can arrive here only if check substr
789 is float. Redo checking for "other"=="fixed".
790 */
9041c2e3 791 strpos = t + 1;
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 793 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
794 goto do_other_anchored;
795 }
4ee3650e
GS
796 /* We don't contradict the found floating substring. */
797 /* XXXX Why not check for STCLASS? */
cad2e5aa 798 s = t + 1;
a3621e74 799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 800 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
801 goto set_useful;
802 }
4ee3650e
GS
803 /* Position contradicts check-string */
804 /* XXXX probably better to look for check-string
805 than for "\n", so one should lower the limit for t? */
a3621e74 806 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 807 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 808 other_last = strpos = s = t + 1;
cad2e5aa
JH
809 goto restart;
810 }
811 t++;
812 }
a3621e74 813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 814 PL_colors[0], PL_colors[1]));
2c2d71f5 815 goto fail_finish;
cad2e5aa 816 }
f5952150 817 else {
a3621e74 818 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 819 PL_colors[0], PL_colors[1]));
f5952150 820 }
cad2e5aa
JH
821 s = t;
822 set_useful:
33b8afdf 823 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
824 }
825 else {
f5952150 826 /* The found string does not prohibit matching at strpos,
2c2d71f5 827 - no optimization of calling REx engine can be performed,
f5952150
GS
828 unless it was an MBOL and we are not after MBOL,
829 or a future STCLASS check will fail this. */
2c2d71f5
JH
830 try_at_start:
831 /* Even in this situation we may use MBOL flag if strpos is offset
832 wrt the start of the string. */
05b4157f 833 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 834 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 835 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 836 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 837 {
cad2e5aa
JH
838 t = strpos;
839 goto find_anchor;
840 }
a3621e74 841 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 842 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 843 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 844 );
2c2d71f5 845 success_at_start:
bbe252da 846 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
847 && (do_utf8 ? (
848 prog->check_utf8 /* Could be deleted already */
849 && --BmUSEFUL(prog->check_utf8) < 0
850 && (prog->check_utf8 == prog->float_utf8)
851 ) : (
852 prog->check_substr /* Could be deleted already */
853 && --BmUSEFUL(prog->check_substr) < 0
854 && (prog->check_substr == prog->float_substr)
855 )))
66e933ab 856 {
cad2e5aa 857 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 858 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
859 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
860 if (do_utf8 ? prog->check_substr : prog->check_utf8)
861 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
862 prog->check_substr = prog->check_utf8 = NULL; /* disable */
863 prog->float_substr = prog->float_utf8 = NULL; /* clear */
864 check = NULL; /* abort */
cad2e5aa 865 s = strpos;
3cf5c195
IZ
866 /* XXXX This is a remnant of the old implementation. It
867 looks wasteful, since now INTUIT can use many
6eb5f6b9 868 other heuristics. */
bbe252da 869 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
870 }
871 else
872 s = strpos;
873 }
874
6eb5f6b9
JH
875 /* Last resort... */
876 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
877 /* trie stclasses are too expensive to use here, we are better off to
878 leave it to regmatch itself */
f8fc2ecf 879 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
880 /* minlen == 0 is possible if regstclass is \b or \B,
881 and the fixed substr is ''$.
882 Since minlen is already taken into account, s+1 is before strend;
883 accidentally, minlen >= 1 guaranties no false positives at s + 1
884 even for \b or \B. But (minlen? 1 : 0) below assumes that
885 regstclass does not come from lookahead... */
886 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
887 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
888 const U8* const str = (U8*)STRING(progi->regstclass);
889 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
890 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 891 : 1);
1de06328
YO
892 char * endpos;
893 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
894 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
895 else if (prog->float_substr || prog->float_utf8)
896 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
897 else
898 endpos= strend;
899
70685ca0
JH
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
901 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 902
6eb5f6b9 903 t = s;
f8fc2ecf 904 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
905 if (!s) {
906#ifdef DEBUGGING
cbbf8932 907 const char *what = NULL;
6eb5f6b9
JH
908#endif
909 if (endpos == strend) {
a3621e74 910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
911 "Could not match STCLASS...\n") );
912 goto fail;
913 }
a3621e74 914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 915 "This position contradicts STCLASS...\n") );
bbe252da 916 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 917 goto fail;
6eb5f6b9 918 /* Contradict one of substrings */
33b8afdf
JH
919 if (prog->anchored_substr || prog->anchored_utf8) {
920 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 921 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 922 hop_and_restart:
1aa99e6b 923 s = HOP3c(t, 1, strend);
66e933ab
GS
924 if (s + start_shift + end_shift > strend) {
925 /* XXXX Should be taken into account earlier? */
a3621e74 926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
927 "Could not match STCLASS...\n") );
928 goto fail;
929 }
5e39e1e5
HS
930 if (!check)
931 goto giveup;
a3621e74 932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 933 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
934 what, (long)(s + start_shift - i_strpos)) );
935 goto restart;
936 }
66e933ab 937 /* Have both, check_string is floating */
6eb5f6b9
JH
938 if (t + start_shift >= check_at) /* Contradicts floating=check */
939 goto retry_floating_check;
940 /* Recheck anchored substring, but not floating... */
9041c2e3 941 s = check_at;
5e39e1e5
HS
942 if (!check)
943 goto giveup;
a3621e74 944 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 945 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
946 (long)(other_last - i_strpos)) );
947 goto do_other_anchored;
948 }
60e71179
GS
949 /* Another way we could have checked stclass at the
950 current position only: */
951 if (ml_anch) {
952 s = t = t + 1;
5e39e1e5
HS
953 if (!check)
954 goto giveup;
a3621e74 955 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 956 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 957 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 958 goto try_at_offset;
66e933ab 959 }
33b8afdf 960 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 961 goto fail;
6eb5f6b9
JH
962 /* Check is floating subtring. */
963 retry_floating_check:
964 t = check_at - start_shift;
a3621e74 965 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
966 goto hop_and_restart;
967 }
b7953727 968 if (t != s) {
a3621e74 969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 970 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
971 (long)(t - i_strpos), (long)(s - i_strpos))
972 );
973 }
974 else {
a3621e74 975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
976 "Does not contradict STCLASS...\n");
977 );
978 }
6eb5f6b9 979 }
5e39e1e5 980 giveup:
a3621e74 981 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
982 PL_colors[4], (check ? "Guessed" : "Giving up"),
983 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 984 return s;
2c2d71f5
JH
985
986 fail_finish: /* Substring not found */
33b8afdf
JH
987 if (prog->check_substr || prog->check_utf8) /* could be removed already */
988 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 989 fail:
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 991 PL_colors[4], PL_colors[5]));
bd61b366 992 return NULL;
cad2e5aa 993}
9661b544 994
786e8c11 995
3b0527fe 996
55eed653
NC
997#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
998uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
999 switch (trie_type) { \
1000 case trie_utf8_fold: \
1001 if ( foldlen>0 ) { \
1002 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1003 foldlen -= len; \
1004 uscan += len; \
1005 len=0; \
1006 } else { \
1007 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1008 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1009 foldlen -= UNISKIP( uvc ); \
1010 uscan = foldbuf + UNISKIP( uvc ); \
1011 } \
1012 break; \
1013 case trie_utf8: \
1014 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1015 break; \
1016 case trie_plain: \
1017 uvc = (UV)*uc; \
1018 len = 1; \
1019 } \
1020 \
1021 if (uvc < 256) { \
1022 charid = trie->charmap[ uvc ]; \
1023 } \
1024 else { \
1025 charid = 0; \
55eed653
NC
1026 if (widecharmap) { \
1027 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1028 (char*)&uvc, sizeof(UV), 0); \
1029 if (svpp) \
1030 charid = (U16)SvIV(*svpp); \
1031 } \
1032 } \
1033} STMT_END
1034
1035#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1036 if ( (CoNd) \
1037 && (ln == len || \
1038 ibcmp_utf8(s, NULL, 0, do_utf8, \
1039 m, NULL, ln, (bool)UTF)) \
24b23f37 1040 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1041 goto got_it; \
1042 else { \
1043 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1044 uvchr_to_utf8(tmpbuf, c); \
1045 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1046 if ( f != c \
1047 && (f == c1 || f == c2) \
1048 && (ln == foldlen || \
1049 !ibcmp_utf8((char *) foldbuf, \
1050 NULL, foldlen, do_utf8, \
1051 m, \
1052 NULL, ln, (bool)UTF)) \
24b23f37 1053 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1054 goto got_it; \
1055 } \
1056 s += len
1057
1058#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1059STMT_START { \
1060 while (s <= e) { \
1061 if ( (CoNd) \
1062 && (ln == 1 || !(OP(c) == EXACTF \
1063 ? ibcmp(s, m, ln) \
1064 : ibcmp_locale(s, m, ln))) \
24b23f37 1065 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1066 goto got_it; \
1067 s++; \
1068 } \
1069} STMT_END
1070
1071#define REXEC_FBC_UTF8_SCAN(CoDe) \
1072STMT_START { \
1073 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1074 CoDe \
1075 s += uskip; \
1076 } \
1077} STMT_END
1078
1079#define REXEC_FBC_SCAN(CoDe) \
1080STMT_START { \
1081 while (s < strend) { \
1082 CoDe \
1083 s++; \
1084 } \
1085} STMT_END
1086
1087#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1088REXEC_FBC_UTF8_SCAN( \
1089 if (CoNd) { \
24b23f37 1090 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1091 goto got_it; \
1092 else \
1093 tmp = doevery; \
1094 } \
1095 else \
1096 tmp = 1; \
1097)
1098
1099#define REXEC_FBC_CLASS_SCAN(CoNd) \
1100REXEC_FBC_SCAN( \
1101 if (CoNd) { \
24b23f37 1102 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1103 goto got_it; \
1104 else \
1105 tmp = doevery; \
1106 } \
1107 else \
1108 tmp = 1; \
1109)
1110
1111#define REXEC_FBC_TRYIT \
24b23f37 1112if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1113 goto got_it
1114
1115#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1116 if (do_utf8) { \
1117 UtFpReLoAd; \
1118 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1119 } \
1120 else { \
1121 REXEC_FBC_CLASS_SCAN(CoNd); \
1122 } \
1123 break
1124
1125#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1126 PL_reg_flags |= RF_tainted; \
1127 if (do_utf8) { \
1128 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1129 } \
1130 else { \
1131 REXEC_FBC_CLASS_SCAN(CoNd); \
1132 } \
1133 break
1134
786e8c11
YO
1135#define DUMP_EXEC_POS(li,s,doutf8) \
1136 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1137
1138/* We know what class REx starts with. Try to find this position... */
1139/* if reginfo is NULL, its a dryrun */
1140/* annoyingly all the vars in this routine have different names from their counterparts
1141 in regmatch. /grrr */
1142
3c3eec57 1143STATIC char *
07be1b83 1144S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1145 const char *strend, regmatch_info *reginfo)
a687059c 1146{
27da23d5 1147 dVAR;
bbe252da 1148 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1149 char *m;
d8093b23 1150 STRLEN ln;
5dab1207 1151 STRLEN lnc;
078c425b 1152 register STRLEN uskip;
d8093b23
G
1153 unsigned int c1;
1154 unsigned int c2;
6eb5f6b9
JH
1155 char *e;
1156 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1157 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf
YO
1158 RXi_GET_DECL(prog,progi);
1159
6eb5f6b9
JH
1160 /* We know what class it must start with. */
1161 switch (OP(c)) {
6eb5f6b9 1162 case ANYOF:
388cc4de 1163 if (do_utf8) {
4cadc6a9 1164 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1165 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1166 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1167 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1168 }
1169 else {
1170 while (s < strend) {
1171 STRLEN skip = 1;
1172
32fc9b6a 1173 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1174 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1175 /* The assignment of 2 is intentional:
1176 * for the folded sharp s, the skip is 2. */
1177 (skip = SHARP_S_SKIP))) {
24b23f37 1178 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1179 goto got_it;
1180 else
1181 tmp = doevery;
1182 }
1183 else
1184 tmp = 1;
1185 s += skip;
1186 }
a0d0e21e 1187 }
6eb5f6b9 1188 break;
f33976b4 1189 case CANY:
4cadc6a9 1190 REXEC_FBC_SCAN(
24b23f37 1191 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1192 goto got_it;
1193 else
1194 tmp = doevery;
4cadc6a9 1195 );
f33976b4 1196 break;
6eb5f6b9 1197 case EXACTF:
5dab1207
NIS
1198 m = STRING(c);
1199 ln = STR_LEN(c); /* length to match in octets/bytes */
1200 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1201 if (UTF) {
a2a2844f 1202 STRLEN ulen1, ulen2;
5dab1207 1203 U8 *sm = (U8 *) m;
89ebb4a3
JH
1204 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1205 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1207
1208 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1209 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1210
89ebb4a3 1211 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1212 0, uniflags);
89ebb4a3 1213 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1214 0, uniflags);
5dab1207
NIS
1215 lnc = 0;
1216 while (sm < ((U8 *) m + ln)) {
1217 lnc++;
1218 sm += UTF8SKIP(sm);
1219 }
1aa99e6b
IH
1220 }
1221 else {
1222 c1 = *(U8*)m;
1223 c2 = PL_fold[c1];
1224 }
6eb5f6b9
JH
1225 goto do_exactf;
1226 case EXACTFL:
5dab1207
NIS
1227 m = STRING(c);
1228 ln = STR_LEN(c);
1229 lnc = (I32) ln;
d8093b23 1230 c1 = *(U8*)m;
6eb5f6b9
JH
1231 c2 = PL_fold_locale[c1];
1232 do_exactf:
db12adc6 1233 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1234
3b0527fe 1235 if (!reginfo && e < s)
6eb5f6b9 1236 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1237
60a8b682
JH
1238 /* The idea in the EXACTF* cases is to first find the
1239 * first character of the EXACTF* node and then, if
1240 * necessary, case-insensitively compare the full
1241 * text of the node. The c1 and c2 are the first
1242 * characters (though in Unicode it gets a bit
1243 * more complicated because there are more cases
7f16dd3d
JH
1244 * than just upper and lower: one needs to use
1245 * the so-called folding case for case-insensitive
1246 * matching (called "loose matching" in Unicode).
1247 * ibcmp_utf8() will do just that. */
60a8b682 1248
1aa99e6b 1249 if (do_utf8) {
575cac57 1250 UV c, f;
89ebb4a3 1251 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1252 STRLEN len, foldlen;
4ad0818d 1253 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1254 if (c1 == c2) {
5dab1207
NIS
1255 /* Upper and lower of 1st char are equal -
1256 * probably not a "letter". */
1aa99e6b 1257 while (s <= e) {
89ebb4a3 1258 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1259 uniflags);
4cadc6a9 1260 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1261 }
09091399
JH
1262 }
1263 else {
1aa99e6b 1264 while (s <= e) {
89ebb4a3 1265 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1266 uniflags);
80aecb99 1267
60a8b682 1268 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1269 * Note that not all the possible combinations
1270 * are handled here: some of them are handled
1271 * by the standard folding rules, and some of
1272 * them (the character class or ANYOF cases)
1273 * are handled during compiletime in
1274 * regexec.c:S_regclass(). */
880bd946
JH
1275 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1276 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1277 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1278
4cadc6a9 1279 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1280 }
09091399 1281 }
1aa99e6b
IH
1282 }
1283 else {
1284 if (c1 == c2)
4cadc6a9 1285 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1286 else
4cadc6a9 1287 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1288 }
1289 break;
bbce6d69 1290 case BOUNDL:
3280af22 1291 PL_reg_flags |= RF_tainted;
bbce6d69 1292 /* FALL THROUGH */
a0d0e21e 1293 case BOUND:
ffc61ed2 1294 if (do_utf8) {
12d33761 1295 if (s == PL_bostr)
ffc61ed2
JH
1296 tmp = '\n';
1297 else {
6136c704 1298 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1299 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1300 }
1301 tmp = ((OP(c) == BOUND ?
9041c2e3 1302 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1303 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1304 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1305 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1306 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1307 isALNUM_LC_utf8((U8*)s)))
1308 {
1309 tmp = !tmp;
4cadc6a9 1310 REXEC_FBC_TRYIT;
a687059c 1311 }
4cadc6a9 1312 );
a0d0e21e 1313 }
667bb95a 1314 else {
12d33761 1315 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1316 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1317 REXEC_FBC_SCAN(
ffc61ed2
JH
1318 if (tmp ==
1319 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1320 tmp = !tmp;
4cadc6a9 1321 REXEC_FBC_TRYIT;
a0ed51b3 1322 }
4cadc6a9 1323 );
a0ed51b3 1324 }
24b23f37 1325 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1326 goto got_it;
1327 break;
bbce6d69 1328 case NBOUNDL:
3280af22 1329 PL_reg_flags |= RF_tainted;
bbce6d69 1330 /* FALL THROUGH */
a0d0e21e 1331 case NBOUND:
ffc61ed2 1332 if (do_utf8) {
12d33761 1333 if (s == PL_bostr)
ffc61ed2
JH
1334 tmp = '\n';
1335 else {
6136c704 1336 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1337 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1338 }
1339 tmp = ((OP(c) == NBOUND ?
9041c2e3 1340 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1341 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1342 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1343 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1344 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1345 isALNUM_LC_utf8((U8*)s)))
1346 tmp = !tmp;
4cadc6a9
YO
1347 else REXEC_FBC_TRYIT;
1348 );
a0d0e21e 1349 }
667bb95a 1350 else {
12d33761 1351 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1352 tmp = ((OP(c) == NBOUND ?
1353 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1354 REXEC_FBC_SCAN(
ffc61ed2
JH
1355 if (tmp ==
1356 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1357 tmp = !tmp;
4cadc6a9
YO
1358 else REXEC_FBC_TRYIT;
1359 );
a0ed51b3 1360 }
24b23f37 1361 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1362 goto got_it;
1363 break;
a0d0e21e 1364 case ALNUM:
4cadc6a9
YO
1365 REXEC_FBC_CSCAN_PRELOAD(
1366 LOAD_UTF8_CHARCLASS_ALNUM(),
1367 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1368 isALNUM(*s)
1369 );
bbce6d69 1370 case ALNUML:
4cadc6a9
YO
1371 REXEC_FBC_CSCAN_TAINT(
1372 isALNUM_LC_utf8((U8*)s),
1373 isALNUM_LC(*s)
1374 );
a0d0e21e 1375 case NALNUM:
4cadc6a9
YO
1376 REXEC_FBC_CSCAN_PRELOAD(
1377 LOAD_UTF8_CHARCLASS_ALNUM(),
1378 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1379 !isALNUM(*s)
1380 );
bbce6d69 1381 case NALNUML:
4cadc6a9
YO
1382 REXEC_FBC_CSCAN_TAINT(
1383 !isALNUM_LC_utf8((U8*)s),
1384 !isALNUM_LC(*s)
1385 );
a0d0e21e 1386 case SPACE:
4cadc6a9
YO
1387 REXEC_FBC_CSCAN_PRELOAD(
1388 LOAD_UTF8_CHARCLASS_SPACE(),
1389 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1390 isSPACE(*s)
1391 );
bbce6d69 1392 case SPACEL:
4cadc6a9
YO
1393 REXEC_FBC_CSCAN_TAINT(
1394 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1395 isSPACE_LC(*s)
1396 );
a0d0e21e 1397 case NSPACE:
4cadc6a9
YO
1398 REXEC_FBC_CSCAN_PRELOAD(
1399 LOAD_UTF8_CHARCLASS_SPACE(),
1400 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1401 !isSPACE(*s)
1402 );
bbce6d69 1403 case NSPACEL:
4cadc6a9
YO
1404 REXEC_FBC_CSCAN_TAINT(
1405 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1406 !isSPACE_LC(*s)
1407 );
a0d0e21e 1408 case DIGIT:
4cadc6a9
YO
1409 REXEC_FBC_CSCAN_PRELOAD(
1410 LOAD_UTF8_CHARCLASS_DIGIT(),
1411 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1412 isDIGIT(*s)
1413 );
b8c5462f 1414 case DIGITL:
4cadc6a9
YO
1415 REXEC_FBC_CSCAN_TAINT(
1416 isDIGIT_LC_utf8((U8*)s),
1417 isDIGIT_LC(*s)
1418 );
a0d0e21e 1419 case NDIGIT:
4cadc6a9
YO
1420 REXEC_FBC_CSCAN_PRELOAD(
1421 LOAD_UTF8_CHARCLASS_DIGIT(),
1422 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1423 !isDIGIT(*s)
1424 );
b8c5462f 1425 case NDIGITL:
4cadc6a9
YO
1426 REXEC_FBC_CSCAN_TAINT(
1427 !isDIGIT_LC_utf8((U8*)s),
1428 !isDIGIT_LC(*s)
1429 );
1de06328
YO
1430 case AHOCORASICKC:
1431 case AHOCORASICK:
07be1b83
YO
1432 {
1433 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1434 trie_type = do_utf8 ?
1435 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1436 : trie_plain;
1437 /* what trie are we using right now */
1438 reg_ac_data *aho
f8fc2ecf 1439 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1440 reg_trie_data *trie
1441 = (reg_trie_data*)progi->data->data[ aho->trie ];
55eed653 1442 HV *widecharmap = (HV*) progi->data->data[ aho->trie + 1 ];
07be1b83
YO
1443
1444 const char *last_start = strend - trie->minlen;
6148ee25 1445#ifdef DEBUGGING
07be1b83 1446 const char *real_start = s;
6148ee25 1447#endif
07be1b83 1448 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1449 SV *sv_points;
1450 U8 **points; /* map of where we were in the input string
786e8c11 1451 when reading a given char. For ASCII this
be8e71aa
YO
1452 is unnecessary overhead as the relationship
1453 is always 1:1, but for unicode, especially
1454 case folded unicode this is not true. */
f9e705e8 1455 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1456 U8 *bitmap=NULL;
1457
07be1b83
YO
1458
1459 GET_RE_DEBUG_FLAGS_DECL;
1460
be8e71aa
YO
1461 /* We can't just allocate points here. We need to wrap it in
1462 * an SV so it gets freed properly if there is a croak while
1463 * running the match */
1464 ENTER;
1465 SAVETMPS;
1466 sv_points=newSV(maxlen * sizeof(U8 *));
1467 SvCUR_set(sv_points,
1468 maxlen * sizeof(U8 *));
1469 SvPOK_on(sv_points);
1470 sv_2mortal(sv_points);
1471 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1472 if ( trie_type != trie_utf8_fold
1473 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1474 {
786e8c11
YO
1475 if (trie->bitmap)
1476 bitmap=(U8*)trie->bitmap;
1477 else
1478 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1479 }
786e8c11
YO
1480 /* this is the Aho-Corasick algorithm modified a touch
1481 to include special handling for long "unknown char"
1482 sequences. The basic idea being that we use AC as long
1483 as we are dealing with a possible matching char, when
1484 we encounter an unknown char (and we have not encountered
1485 an accepting state) we scan forward until we find a legal
1486 starting char.
1487 AC matching is basically that of trie matching, except
1488 that when we encounter a failing transition, we fall back
1489 to the current states "fail state", and try the current char
1490 again, a process we repeat until we reach the root state,
1491 state 1, or a legal transition. If we fail on the root state
1492 then we can either terminate if we have reached an accepting
1493 state previously, or restart the entire process from the beginning
1494 if we have not.
1495
1496 */
07be1b83
YO
1497 while (s <= last_start) {
1498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1499 U8 *uc = (U8*)s;
1500 U16 charid = 0;
1501 U32 base = 1;
1502 U32 state = 1;
1503 UV uvc = 0;
1504 STRLEN len = 0;
1505 STRLEN foldlen = 0;
1506 U8 *uscan = (U8*)NULL;
1507 U8 *leftmost = NULL;
786e8c11
YO
1508#ifdef DEBUGGING
1509 U32 accepted_word= 0;
1510#endif
07be1b83
YO
1511 U32 pointpos = 0;
1512
1513 while ( state && uc <= (U8*)strend ) {
1514 int failed=0;
786e8c11
YO
1515 U32 word = aho->states[ state ].wordnum;
1516
1de06328
YO
1517 if( state==1 ) {
1518 if ( bitmap ) {
1519 DEBUG_TRIE_EXECUTE_r(
1520 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 (char *)uc, do_utf8 );
1523 PerlIO_printf( Perl_debug_log,
1524 " Scanning for legal start char...\n");
1525 }
1526 );
1527 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1528 uc++;
786e8c11 1529 }
1de06328 1530 s= (char *)uc;
786e8c11 1531 }
786e8c11
YO
1532 if (uc >(U8*)last_start) break;
1533 }
1534
1535 if ( word ) {
1536 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1537 if (!leftmost || lpos < leftmost) {
1538 DEBUG_r(accepted_word=word);
07be1b83 1539 leftmost= lpos;
786e8c11 1540 }
07be1b83 1541 if (base==0) break;
786e8c11 1542
07be1b83
YO
1543 }
1544 points[pointpos++ % maxlen]= uc;
55eed653
NC
1545 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1546 uscan, len, uvc, charid, foldlen,
1547 foldbuf, uniflags);
786e8c11
YO
1548 DEBUG_TRIE_EXECUTE_r({
1549 dump_exec_pos( (char *)uc, c, strend, real_start,
1550 s, do_utf8 );
07be1b83 1551 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1552 " Charid:%3u CP:%4"UVxf" ",
1553 charid, uvc);
1554 });
07be1b83
YO
1555
1556 do {
6148ee25 1557#ifdef DEBUGGING
786e8c11 1558 word = aho->states[ state ].wordnum;
6148ee25 1559#endif
07be1b83
YO
1560 base = aho->states[ state ].trans.base;
1561
786e8c11
YO
1562 DEBUG_TRIE_EXECUTE_r({
1563 if (failed)
1564 dump_exec_pos( (char *)uc, c, strend, real_start,
1565 s, do_utf8 );
07be1b83 1566 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1567 "%sState: %4"UVxf", word=%"UVxf,
1568 failed ? " Fail transition to " : "",
1569 (UV)state, (UV)word);
1570 });
07be1b83
YO
1571 if ( base ) {
1572 U32 tmp;
1573 if (charid &&
1574 (base + charid > trie->uniquecharcount )
1575 && (base + charid - 1 - trie->uniquecharcount
1576 < trie->lasttrans)
1577 && trie->trans[base + charid - 1 -
1578 trie->uniquecharcount].check == state
1579 && (tmp=trie->trans[base + charid - 1 -
1580 trie->uniquecharcount ].next))
1581 {
786e8c11
YO
1582 DEBUG_TRIE_EXECUTE_r(
1583 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1584 state = tmp;
1585 break;
1586 }
1587 else {
786e8c11
YO
1588 DEBUG_TRIE_EXECUTE_r(
1589 PerlIO_printf( Perl_debug_log," - fail\n"));
1590 failed = 1;
1591 state = aho->fail[state];
07be1b83
YO
1592 }
1593 }
1594 else {
1595 /* we must be accepting here */
786e8c11
YO
1596 DEBUG_TRIE_EXECUTE_r(
1597 PerlIO_printf( Perl_debug_log," - accepting\n"));
1598 failed = 1;
07be1b83
YO
1599 break;
1600 }
1601 } while(state);
786e8c11 1602 uc += len;
07be1b83
YO
1603 if (failed) {
1604 if (leftmost)
1605 break;
786e8c11 1606 if (!state) state = 1;
07be1b83
YO
1607 }
1608 }
1609 if ( aho->states[ state ].wordnum ) {
1610 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1611 if (!leftmost || lpos < leftmost) {
1612 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1613 leftmost = lpos;
786e8c11 1614 }
07be1b83 1615 }
07be1b83
YO
1616 if (leftmost) {
1617 s = (char*)leftmost;
786e8c11
YO
1618 DEBUG_TRIE_EXECUTE_r({
1619 PerlIO_printf(
70685ca0
JH
1620 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1621 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1622 );
1623 });
24b23f37 1624 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1625 FREETMPS;
1626 LEAVE;
07be1b83 1627 goto got_it;
be8e71aa 1628 }
07be1b83 1629 s = HOPc(s,1);
786e8c11
YO
1630 DEBUG_TRIE_EXECUTE_r({
1631 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1632 });
07be1b83 1633 } else {
786e8c11
YO
1634 DEBUG_TRIE_EXECUTE_r(
1635 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1636 break;
1637 }
1638 }
be8e71aa
YO
1639 FREETMPS;
1640 LEAVE;
07be1b83
YO
1641 }
1642 break;
b3c9acc1 1643 default:
3c3eec57
GS
1644 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1645 break;
d6a28714 1646 }
6eb5f6b9
JH
1647 return 0;
1648 got_it:
1649 return s;
1650}
1651
e6aad7ab 1652static void
fae667d5
YO
1653S_swap_match_buff (pTHX_ regexp *prog) {
1654 I32 *t;
fae667d5 1655
28d8d7f4 1656 if (!prog->swap) {
fae667d5
YO
1657 /* We have to be careful. If the previous successful match
1658 was from this regex we don't want a subsequent paritally
1659 successful match to clobber the old results.
1660 So when we detect this possibility we add a swap buffer
1661 to the re, and switch the buffer each match. If we fail
1662 we switch it back, otherwise we leave it swapped.
1663 */
28d8d7f4 1664 Newxz(prog->swap, 1, regexp_paren_ofs);
fae667d5 1665 /* no need to copy these */
7b734803
NC
1666 Newxz(prog->swap->startp, 2 * (prog->nparens + 1), I32);
1667 prog->swap->endp = prog->swap->startp + prog->nparens + 1;
fae667d5 1668 }
28d8d7f4
YO
1669 t = prog->swap->startp;
1670 prog->swap->startp = prog->startp;
fae667d5 1671 prog->startp = t;
28d8d7f4
YO
1672 t = prog->swap->endp;
1673 prog->swap->endp = prog->endp;
fae667d5
YO
1674 prog->endp = t;
1675}
1676
1677
6eb5f6b9
JH
1678/*
1679 - regexec_flags - match a regexp against a string
1680 */
1681I32
1682Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1683 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1684/* strend: pointer to null at end of string */
1685/* strbeg: real beginning of string */
1686/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1687/* data: May be used for some additional optimizations.
1688 Currently its only used, with a U32 cast, for transmitting
1689 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1690/* nosave: For optimizations. */
1691{
97aff369 1692 dVAR;
24b23f37 1693 /*register*/ char *s;
6eb5f6b9 1694 register regnode *c;
24b23f37 1695 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1696 I32 minlen; /* must match at least this many chars */
1697 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1698 I32 end_shift = 0; /* Same for the end. */ /* CC */
1699 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1700 char *scream_olds = NULL;
3dab1dad 1701 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1702 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1703 I32 multiline;
f8fc2ecf 1704 RXi_GET_DECL(prog,progi);
3b0527fe 1705 regmatch_info reginfo; /* create some info to pass to regtry etc */
fae667d5 1706 bool swap_on_fail = 0;
a3621e74
YO
1707
1708 GET_RE_DEBUG_FLAGS_DECL;
1709
9d4ba2ae 1710 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1711
1712 /* Be paranoid... */
1713 if (prog == NULL || startpos == NULL) {
1714 Perl_croak(aTHX_ "NULL regexp parameter");
1715 return 0;
1716 }
1717
bbe252da 1718 multiline = prog->extflags & RXf_PMf_MULTILINE;
3b0527fe 1719 reginfo.prog = prog;
2757e526 1720
bac06658 1721 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1722 DEBUG_EXECUTE_r(
1723 debug_start_match(prog, do_utf8, startpos, strend,
1724 "Matching");
1725 );
bac06658 1726
6eb5f6b9 1727 minlen = prog->minlen;
1de06328
YO
1728
1729 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1731 "String too short [regexec_flags]...\n"));
1732 goto phooey;
1aa99e6b 1733 }
6eb5f6b9 1734
1de06328 1735
6eb5f6b9 1736 /* Check validity of program. */
f8fc2ecf 1737 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1738 Perl_croak(aTHX_ "corrupted regexp program");
1739 }
1740
1741 PL_reg_flags = 0;
1742 PL_reg_eval_set = 0;
1743 PL_reg_maxiter = 0;
1744
bbe252da 1745 if (prog->extflags & RXf_UTF8)
6eb5f6b9
JH
1746 PL_reg_flags |= RF_utf8;
1747
1748 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1749 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1750 PL_bostr = strbeg;
3b0527fe 1751 reginfo.sv = sv;
6eb5f6b9
JH
1752
1753 /* Mark end of line for $ (and such) */
1754 PL_regeol = strend;
1755
1756 /* see how far we have to get to not match where we matched before */
3b0527fe 1757 reginfo.till = startpos+minend;
6eb5f6b9 1758
6eb5f6b9
JH
1759 /* If there is a "must appear" string, look for it. */
1760 s = startpos;
1761
bbe252da 1762 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1763 MAGIC *mg;
1764
1765 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
58e23c8d 1766 reginfo.ganch = startpos + prog->gofs;
6eb5f6b9
JH
1767 else if (sv && SvTYPE(sv) >= SVt_PVMG
1768 && SvMAGIC(sv)
14befaf4
DM
1769 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1770 && mg->mg_len >= 0) {
3b0527fe 1771 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
bbe252da 1772 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1773 if (s > reginfo.ganch)
6eb5f6b9 1774 goto phooey;
58e23c8d 1775 s = reginfo.ganch - prog->gofs;
6eb5f6b9
JH
1776 }
1777 }
58e23c8d 1778 else if (data) {
70685ca0 1779 reginfo.ganch = strbeg + PTR2UV(data);
58e23c8d 1780 } else /* pos() not defined */
3b0527fe 1781 reginfo.ganch = strbeg;
6eb5f6b9 1782 }
c74340f9 1783 if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) {
fae667d5
YO
1784 swap_on_fail = 1;
1785 swap_match_buff(prog); /* do we need a save destructor here for
1786 eval dies? */
c74340f9 1787 }
a0714e2c 1788 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1789 re_scream_pos_data d;
1790
1791 d.scream_olds = &scream_olds;
1792 d.scream_pos = &scream_pos;
1793 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1794 if (!s) {
a3621e74 1795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1796 goto phooey; /* not present */
3fa9c3d7 1797 }
6eb5f6b9
JH
1798 }
1799
1de06328 1800
6eb5f6b9
JH
1801
1802 /* Simplest case: anchored match need be tried only once. */
1803 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1804 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1805 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 1806 goto got_it;
bbe252da
YO
1807 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1808 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
1809 {
1810 char *end;
1811
1812 if (minlen)
1813 dontbother = minlen - 1;
1aa99e6b 1814 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1815 /* for multiline we only have to try after newlines */
33b8afdf 1816 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1817 if (s == startpos)
1818 goto after_try;
1819 while (1) {
24b23f37 1820 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1821 goto got_it;
1822 after_try:
1823 if (s >= end)
1824 goto phooey;
bbe252da 1825 if (prog->extflags & RXf_USE_INTUIT) {
6eb5f6b9
JH
1826 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1827 if (!s)
1828 goto phooey;
1829 }
1830 else
1831 s++;
1832 }
1833 } else {
1834 if (s > startpos)
1835 s--;
1836 while (s < end) {
1837 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 1838 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1839 goto got_it;
1840 }
1841 }
1842 }
1843 }
1844 goto phooey;
bbe252da 1845 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
1846 {
1847 /* the warning about reginfo.ganch being used without intialization
bbe252da 1848 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 1849 and we only enter this block when the same bit is set. */
58e23c8d
YO
1850 char *tmp_s = reginfo.ganch - prog->gofs;
1851 if (regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
1852 goto got_it;
1853 goto phooey;
1854 }
1855
1856 /* Messy cases: unanchored match. */
bbe252da 1857 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
1858 /* we have /x+whatever/ */
1859 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1860 char ch;
bf93d4cc
GS
1861#ifdef DEBUGGING
1862 int did_match = 0;
1863#endif
33b8afdf
JH
1864 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1866 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1867
1aa99e6b 1868 if (do_utf8) {
4cadc6a9 1869 REXEC_FBC_SCAN(
6eb5f6b9 1870 if (*s == ch) {
a3621e74 1871 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1872 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1873 s += UTF8SKIP(s);
1874 while (s < strend && *s == ch)
1875 s += UTF8SKIP(s);
1876 }
4cadc6a9 1877 );
6eb5f6b9
JH
1878 }
1879 else {
4cadc6a9 1880 REXEC_FBC_SCAN(
6eb5f6b9 1881 if (*s == ch) {
a3621e74 1882 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 1883 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
1884 s++;
1885 while (s < strend && *s == ch)
1886 s++;
1887 }
4cadc6a9 1888 );
6eb5f6b9 1889 }
a3621e74 1890 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1891 PerlIO_printf(Perl_debug_log,
b7953727
JH
1892 "Did not find anchored character...\n")
1893 );
6eb5f6b9 1894 }
a0714e2c
SS
1895 else if (prog->anchored_substr != NULL
1896 || prog->anchored_utf8 != NULL
1897 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1898 && prog->float_max_offset < strend - s)) {
1899 SV *must;
1900 I32 back_max;
1901 I32 back_min;
1902 char *last;
6eb5f6b9 1903 char *last1; /* Last position checked before */
bf93d4cc
GS
1904#ifdef DEBUGGING
1905 int did_match = 0;
1906#endif
33b8afdf
JH
1907 if (prog->anchored_substr || prog->anchored_utf8) {
1908 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1909 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1910 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1911 back_max = back_min = prog->anchored_offset;
1912 } else {
1913 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1914 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1915 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1916 back_max = prog->float_max_offset;
1917 back_min = prog->float_min_offset;
1918 }
1de06328
YO
1919
1920
33b8afdf
JH
1921 if (must == &PL_sv_undef)
1922 /* could not downgrade utf8 check substring, so must fail */
1923 goto phooey;
1924
1de06328
YO
1925 if (back_min<0) {
1926 last = strend;
1927 } else {
1928 last = HOP3c(strend, /* Cannot start after this */
1929 -(I32)(CHR_SVLEN(must)
1930 - (SvTAIL(must) != 0) + back_min), strbeg);
1931 }
6eb5f6b9
JH
1932 if (s > PL_bostr)
1933 last1 = HOPc(s, -1);
1934 else
1935 last1 = s - 1; /* bogus */
1936
a0288114 1937 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1938 check_substr==must. */
1939 scream_pos = -1;
1940 dontbother = end_shift;
1941 strend = HOPc(strend, -dontbother);
1942 while ( (s <= last) &&
9041c2e3 1943 ((flags & REXEC_SCREAM)
1de06328 1944 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1945 end_shift, &scream_pos, 0))
1de06328 1946 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1947 (unsigned char*)strend, must,
7fba1cd6 1948 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1949 /* we may be pointing at the wrong string */
1950 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1951 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1952 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1953 if (HOPc(s, -back_max) > last1) {
1954 last1 = HOPc(s, -back_min);
1955 s = HOPc(s, -back_max);
1956 }
1957 else {
52657f30 1958 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1959
1960 last1 = HOPc(s, -back_min);
52657f30 1961 s = t;
6eb5f6b9 1962 }
1aa99e6b 1963 if (do_utf8) {
6eb5f6b9 1964 while (s <= last1) {
24b23f37 1965 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1966 goto got_it;
1967 s += UTF8SKIP(s);
1968 }
1969 }
1970 else {
1971 while (s <= last1) {
24b23f37 1972 if (regtry(&reginfo, &s))
6eb5f6b9
JH
1973 goto got_it;
1974 s++;
1975 }
1976 }
1977 }
ab3bbdeb
YO
1978 DEBUG_EXECUTE_r(if (!did_match) {
1979 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1980 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1981 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1982 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1983 ? "anchored" : "floating"),
ab3bbdeb
YO
1984 quoted, RE_SV_TAIL(must));
1985 });
6eb5f6b9
JH
1986 goto phooey;
1987 }
f8fc2ecf 1988 else if ( (c = progi->regstclass) ) {
f14c76ed 1989 if (minlen) {
f8fc2ecf 1990 const OPCODE op = OP(progi->regstclass);
66e933ab 1991 /* don't bother with what can't match */
786e8c11 1992 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1993 strend = HOPc(strend, -(minlen - 1));
1994 }
a3621e74 1995 DEBUG_EXECUTE_r({
be8e71aa 1996 SV * const prop = sv_newmortal();
32fc9b6a 1997 regprop(prog, prop, c);
0df25f3d 1998 {
02daf0ab 1999 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2000 s,strend-s,60);
0df25f3d 2001 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2002 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2003 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2004 quoted, (int)(strend - s));
0df25f3d 2005 }
ffc61ed2 2006 });
3b0527fe 2007 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2008 goto got_it;
07be1b83 2009 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2010 }
2011 else {
2012 dontbother = 0;
a0714e2c 2013 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2014 /* Trim the end. */
d6a28714 2015 char *last;
33b8afdf
JH
2016 SV* float_real;
2017
2018 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2019 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2020 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2021
2022 if (flags & REXEC_SCREAM) {
33b8afdf 2023 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2024 end_shift, &scream_pos, 1); /* last one */
2025 if (!last)
ffc61ed2 2026 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2027 /* we may be pointing at the wrong string */
2028 else if (RX_MATCH_COPIED(prog))
3f7c398e 2029 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2030 }
d6a28714
JH
2031 else {
2032 STRLEN len;
cfd0369c 2033 const char * const little = SvPV_const(float_real, len);
d6a28714 2034
33b8afdf 2035 if (SvTAIL(float_real)) {
d6a28714
JH
2036 if (memEQ(strend - len + 1, little, len - 1))
2037 last = strend - len + 1;
7fba1cd6 2038 else if (!multiline)
9041c2e3 2039 last = memEQ(strend - len, little, len)
bd61b366 2040 ? strend - len : NULL;
b8c5462f 2041 else
d6a28714
JH
2042 goto find_last;
2043 } else {
2044 find_last:
9041c2e3 2045 if (len)
d6a28714 2046 last = rninstr(s, strend, little, little + len);
b8c5462f 2047 else
a0288114 2048 last = strend; /* matching "$" */
b8c5462f 2049 }
b8c5462f 2050 }
bf93d4cc 2051 if (last == NULL) {
6bda09f9
YO
2052 DEBUG_EXECUTE_r(
2053 PerlIO_printf(Perl_debug_log,
2054 "%sCan't trim the tail, match fails (should not happen)%s\n",
2055 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2056 goto phooey; /* Should not happen! */
2057 }
d6a28714
JH
2058 dontbother = strend - last + prog->float_min_offset;
2059 }
2060 if (minlen && (dontbother < minlen))
2061 dontbother = minlen - 1;
2062 strend -= dontbother; /* this one's always in bytes! */
2063 /* We don't know much -- general case. */
1aa99e6b 2064 if (do_utf8) {
d6a28714 2065 for (;;) {
24b23f37 2066 if (regtry(&reginfo, &s))
d6a28714
JH
2067 goto got_it;
2068 if (s >= strend)
2069 break;
b8c5462f 2070 s += UTF8SKIP(s);
d6a28714
JH
2071 };
2072 }
2073 else {
2074 do {
24b23f37 2075 if (regtry(&reginfo, &s))
d6a28714
JH
2076 goto got_it;
2077 } while (s++ < strend);
2078 }
2079 }
2080
2081 /* Failure. */
2082 goto phooey;
2083
2084got_it:
2085 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2086
2087 if (PL_reg_eval_set) {
2088 /* Preserve the current value of $^R */
2089 if (oreplsv != GvSV(PL_replgv))
2090 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2091 restored, the value remains
2092 the same. */
4f639d21 2093 restore_pos(aTHX_ prog);
d6a28714 2094 }
81714fb9
YO
2095 if (prog->paren_names)
2096 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2097
2098 /* make sure $`, $&, $', and $digit will work later */
2099 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2100 RX_MATCH_COPY_FREE(prog);
d6a28714 2101 if (flags & REXEC_COPY_STR) {
be8e71aa 2102 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2103#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2104 if ((SvIsCOW(sv)
2105 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2106 if (DEBUG_C_TEST) {
2107 PerlIO_printf(Perl_debug_log,
2108 "Copy on write: regexp capture, type %d\n",
2109 (int) SvTYPE(sv));
2110 }
2111 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2112 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2113 assert (SvPOKp(prog->saved_copy));
2114 } else
2115#endif
2116 {
2117 RX_MATCH_COPIED_on(prog);
2118 s = savepvn(strbeg, i);
2119 prog->subbeg = s;
2120 }
d6a28714 2121 prog->sublen = i;
d6a28714
JH
2122 }
2123 else {
2124 prog->subbeg = strbeg;
2125 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2126 }
2127 }
9041c2e3 2128
d6a28714
JH
2129 return 1;
2130
2131phooey:
a3621e74 2132 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2133 PL_colors[4], PL_colors[5]));
d6a28714 2134 if (PL_reg_eval_set)
4f639d21 2135 restore_pos(aTHX_ prog);
fae667d5 2136 if (swap_on_fail)
c74340f9 2137 /* we failed :-( roll it back */
fae667d5
YO
2138 swap_match_buff(prog);
2139
d6a28714
JH
2140 return 0;
2141}
2142
6bda09f9 2143
d6a28714
JH
2144/*
2145 - regtry - try match at specific point
2146 */
2147STATIC I32 /* 0 failure, 1 success */
24b23f37 2148S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2149{
97aff369 2150 dVAR;
d6a28714 2151 CHECKPOINT lastcp;
3b0527fe 2152 regexp *prog = reginfo->prog;
f8fc2ecf 2153 RXi_GET_DECL(prog,progi);
a3621e74 2154 GET_RE_DEBUG_FLAGS_DECL;
24b23f37 2155 reginfo->cutpoint=NULL;
d6a28714 2156
bbe252da 2157 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2158 MAGIC *mg;
2159
2160 PL_reg_eval_set = RS_init;
a3621e74 2161 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2162 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2163 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2164 ));
ea8d6ae1 2165 SAVESTACK_CXPOS();
d6a28714
JH
2166 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2167 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2168 SAVETMPS;
2169 /* Apparently this is not needed, judging by wantarray. */
e8347627 2170 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2171 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2172
3b0527fe 2173 if (reginfo->sv) {
d6a28714 2174 /* Make $_ available to executed code. */
3b0527fe 2175 if (reginfo->sv != DEFSV) {
59f00321 2176 SAVE_DEFSV;
3b0527fe 2177 DEFSV = reginfo->sv;
b8c5462f 2178 }
d6a28714 2179
3b0527fe
DM
2180 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2181 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2182 /* prepare for quick setting of pos */
d300d9fa 2183#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2184 if (SvIsCOW(reginfo->sv))
2185 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2186#endif
3dab1dad 2187 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2188 &PL_vtbl_mglob, NULL, 0);
d6a28714 2189 mg->mg_len = -1;
b8c5462f 2190 }
d6a28714
JH
2191 PL_reg_magic = mg;
2192 PL_reg_oldpos = mg->mg_len;
4f639d21 2193 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2194 }
09687e5a 2195 if (!PL_reg_curpm) {
a02a5408 2196 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2197#ifdef USE_ITHREADS
2198 {
be8e71aa 2199 SV* const repointer = newSViv(0);
577e12cc 2200 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2201 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2202 av_push(PL_regex_padav,repointer);
2203 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2204 PL_regex_pad = AvARRAY(PL_regex_padav);
2205 }
2206#endif
2207 }
aaa362c4 2208 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2209 PL_reg_oldcurpm = PL_curpm;
2210 PL_curpm = PL_reg_curpm;
2211 if (RX_MATCH_COPIED(prog)) {
2212 /* Here is a serious problem: we cannot rewrite subbeg,
2213 since it may be needed if this match fails. Thus
2214 $` inside (?{}) could fail... */
2215 PL_reg_oldsaved = prog->subbeg;
2216 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2217#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2218 PL_nrs = prog->saved_copy;
2219#endif
d6a28714
JH
2220 RX_MATCH_COPIED_off(prog);
2221 }
2222 else
bd61b366 2223 PL_reg_oldsaved = NULL;
d6a28714
JH
2224 prog->subbeg = PL_bostr;
2225 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2226 }
24b23f37
YO
2227 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2228 prog->startp[0] = *startpos - PL_bostr;
2229 PL_reginput = *startpos;
d6a28714 2230 PL_reglastparen = &prog->lastparen;
a01268b5 2231 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2232 prog->lastparen = 0;
03994de8 2233 prog->lastcloseparen = 0;
d6a28714 2234 PL_regsize = 0;
6bda09f9
YO
2235 PL_regstartp = prog->startp;
2236 PL_regendp = prog->endp;
d6a28714
JH
2237 if (PL_reg_start_tmpl <= prog->nparens) {
2238 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2239 if(PL_reg_start_tmp)
2240 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2241 else
a02a5408 2242 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2243 }
2244
2245 /* XXXX What this code is doing here?!!! There should be no need
2246 to do this again and again, PL_reglastparen should take care of
3dd2943c 2247 this! --ilya*/
dafc8851
JH
2248
2249 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2250 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2251 * PL_reglastparen), is not needed at all by the test suite
2252 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2253 * enough, for building DynaLoader, or otherwise this
2254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2255 * will happen. Meanwhile, this code *is* needed for the
2256 * above-mentioned test suite tests to succeed. The common theme
2257 * on those tests seems to be returning null fields from matches.
2258 * --jhi */
dafc8851 2259#if 1
d6a28714 2260 if (prog->nparens) {
f8cc3bc6
NC
2261 I32 *sp = PL_regstartp;
2262 I32 *ep = PL_regendp;
097eb12c 2263 register I32 i;
eb160463 2264 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2265 *++sp = -1;
2266 *++ep = -1;
2267 }
2268 }
dafc8851 2269#endif
02db2b7b 2270 REGCP_SET(lastcp);
f8fc2ecf 2271 if (regmatch(reginfo, progi->program + 1)) {
c74340f9 2272 PL_regendp[0] = PL_reginput - PL_bostr;
d6a28714
JH
2273 return 1;
2274 }
24b23f37
YO
2275 if (reginfo->cutpoint)
2276 *startpos= reginfo->cutpoint;
02db2b7b 2277 REGCP_UNWIND(lastcp);
d6a28714
JH
2278 return 0;
2279}
2280
02db2b7b 2281
8ba1375e
MJD
2282#define sayYES goto yes
2283#define sayNO goto no
262b90c4 2284#define sayNO_SILENT goto no_silent
8ba1375e 2285
f9f4320a
YO
2286/* we dont use STMT_START/END here because it leads to
2287 "unreachable code" warnings, which are bogus, but distracting. */
2288#define CACHEsayNO \
c476f425
DM
2289 if (ST.cache_mask) \
2290 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2291 sayNO
3298f257 2292
a3621e74 2293/* this is used to determine how far from the left messages like
265c4333
YO
2294 'failed...' are printed. It should be set such that messages
2295 are inline with the regop output that created them.
a3621e74 2296*/
265c4333 2297#define REPORT_CODE_OFF 32
a3621e74
YO
2298
2299
2300/* Make sure there is a test for this +1 options in re_tests */
2301#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2302
40a82448
DM
2303#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2304#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2305
86545054
DM
2306#define SLAB_FIRST(s) (&(s)->states[0])
2307#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2308
5d9a96ca
DM
2309/* grab a new slab and return the first slot in it */
2310
2311STATIC regmatch_state *
2312S_push_slab(pTHX)
2313{
a35a87e7 2314#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2315 dMY_CXT;
2316#endif
5d9a96ca
DM
2317 regmatch_slab *s = PL_regmatch_slab->next;
2318 if (!s) {
2319 Newx(s, 1, regmatch_slab);
2320 s->prev = PL_regmatch_slab;
2321 s->next = NULL;
2322 PL_regmatch_slab->next = s;
2323 }
2324 PL_regmatch_slab = s;
86545054 2325 return SLAB_FIRST(s);
5d9a96ca 2326}
5b47454d 2327
95b24440 2328
40a82448
DM
2329/* push a new state then goto it */
2330
2331#define PUSH_STATE_GOTO(state, node) \
2332 scan = node; \
2333 st->resume_state = state; \
2334 goto push_state;
2335
2336/* push a new state with success backtracking, then goto it */
2337
2338#define PUSH_YES_STATE_GOTO(state, node) \
2339 scan = node; \
2340 st->resume_state = state; \
2341 goto push_yes_state;
2342
aa283a38 2343
aa283a38 2344
d6a28714 2345/*
95b24440 2346
bf1f174e
DM
2347regmatch() - main matching routine
2348
2349This is basically one big switch statement in a loop. We execute an op,
2350set 'next' to point the next op, and continue. If we come to a point which
2351we may need to backtrack to on failure such as (A|B|C), we push a
2352backtrack state onto the backtrack stack. On failure, we pop the top
2353state, and re-enter the loop at the state indicated. If there are no more
2354states to pop, we return failure.
2355
2356Sometimes we also need to backtrack on success; for example /A+/, where
2357after successfully matching one A, we need to go back and try to
2358match another one; similarly for lookahead assertions: if the assertion
2359completes successfully, we backtrack to the state just before the assertion
2360and then carry on. In these cases, the pushed state is marked as
2361'backtrack on success too'. This marking is in fact done by a chain of
2362pointers, each pointing to the previous 'yes' state. On success, we pop to
2363the nearest yes state, discarding any intermediate failure-only states.
2364Sometimes a yes state is pushed just to force some cleanup code to be
2365called at the end of a successful match or submatch; e.g. (??{$re}) uses
2366it to free the inner regex.
2367
2368Note that failure backtracking rewinds the cursor position, while
2369success backtracking leaves it alone.
2370
2371A pattern is complete when the END op is executed, while a subpattern
2372such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2373ops trigger the "pop to last yes state if any, otherwise return true"
2374behaviour.
2375
2376A common convention in this function is to use A and B to refer to the two
2377subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2378the subpattern to be matched possibly multiple times, while B is the entire
2379rest of the pattern. Variable and state names reflect this convention.
2380
2381The states in the main switch are the union of ops and failure/success of
2382substates associated with with that op. For example, IFMATCH is the op
2383that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2384'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2385successfully matched A and IFMATCH_A_fail is a state saying that we have
2386just failed to match A. Resume states always come in pairs. The backtrack
2387state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2388at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2389on success or failure.
2390
2391The struct that holds a backtracking state is actually a big union, with
2392one variant for each major type of op. The variable st points to the
2393top-most backtrack struct. To make the code clearer, within each
2394block of code we #define ST to alias the relevant union.
2395
2396Here's a concrete example of a (vastly oversimplified) IFMATCH
2397implementation:
2398
2399 switch (state) {
2400 ....
2401
2402#define ST st->u.ifmatch
2403
2404 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2405 ST.foo = ...; // some state we wish to save
95b24440 2406 ...
bf1f174e
DM
2407 // push a yes backtrack state with a resume value of
2408 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2409 // first node of A:
2410 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2411 // NOTREACHED
2412
2413 case IFMATCH_A: // we have successfully executed A; now continue with B
2414 next = B;
2415 bar = ST.foo; // do something with the preserved value
2416 break;
2417
2418 case IFMATCH_A_fail: // A failed, so the assertion failed
2419 ...; // do some housekeeping, then ...
2420 sayNO; // propagate the failure
2421
2422#undef ST
95b24440 2423
bf1f174e
DM
2424 ...
2425 }
95b24440 2426
bf1f174e
DM
2427For any old-timers reading this who are familiar with the old recursive
2428approach, the code above is equivalent to:
95b24440 2429
bf1f174e
DM
2430 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2431 {
2432 int foo = ...
95b24440 2433 ...
bf1f174e
DM
2434 if (regmatch(A)) {
2435 next = B;
2436 bar = foo;
2437 break;
95b24440 2438 }
bf1f174e
DM
2439 ...; // do some housekeeping, then ...
2440 sayNO; // propagate the failure
95b24440 2441 }
bf1f174e
DM
2442
2443The topmost backtrack state, pointed to by st, is usually free. If you
2444want to claim it, populate any ST.foo fields in it with values you wish to
2445save, then do one of
2446
2447 PUSH_STATE_GOTO(resume_state, node);
2448 PUSH_YES_STATE_GOTO(resume_state, node);
2449
2450which sets that backtrack state's resume value to 'resume_state', pushes a
2451new free entry to the top of the backtrack stack, then goes to 'node'.
2452On backtracking, the free slot is popped, and the saved state becomes the
2453new free state. An ST.foo field in this new top state can be temporarily
2454accessed to retrieve values, but once the main loop is re-entered, it
2455becomes available for reuse.
2456
2457Note that the depth of the backtrack stack constantly increases during the
2458left-to-right execution of the pattern, rather than going up and down with
2459the pattern nesting. For example the stack is at its maximum at Z at the
2460end of the pattern, rather than at X in the following:
2461
2462 /(((X)+)+)+....(Y)+....Z/
2463
2464The only exceptions to this are lookahead/behind assertions and the cut,
2465(?>A), which pop all the backtrack states associated with A before
2466continuing.
2467
2468Bascktrack state structs are allocated in slabs of about 4K in size.
2469PL_regmatch_state and st always point to the currently active state,
2470and PL_regmatch_slab points to the slab currently containing
2471PL_regmatch_state. The first time regmatch() is called, the first slab is
2472allocated, and is never freed until interpreter destruction. When the slab
2473is full, a new one is allocated and chained to the end. At exit from
2474regmatch(), slabs allocated since entry are freed.
2475
2476*/
95b24440 2477
40a82448 2478
5bc10b2c 2479#define DEBUG_STATE_pp(pp) \
265c4333 2480 DEBUG_STATE_r({ \
5bc10b2c
DM
2481 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2482 PerlIO_printf(Perl_debug_log, \
5d458dd8 2483 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2484 depth*2, "", \
13d6edb4 2485 PL_reg_name[st->resume_state], \
5d458dd8
YO
2486 ((st==yes_state||st==mark_state) ? "[" : ""), \
2487 ((st==yes_state) ? "Y" : ""), \
2488 ((st==mark_state) ? "M" : ""), \
2489 ((st==yes_state||st==mark_state) ? "]" : "") \
2490 ); \
265c4333 2491 });
5bc10b2c 2492
40a82448 2493
3dab1dad 2494#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2495
3df15adc 2496#ifdef DEBUGGING
5bc10b2c 2497
ab3bbdeb
YO
2498STATIC void
2499S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2500 const char *start, const char *end, const char *blurb)
2501{
bbe252da 2502 const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0;
ab3bbdeb
YO
2503 if (!PL_colorset)
2504 reginitcolors();
2505 {
2506 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2507 prog->precomp, prog->prelen, 60);
2508
2509 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2510 start, end - start, 60);
2511
2512 PerlIO_printf(Perl_debug_log,
2513 "%s%s REx%s %s against %s\n",
2514 PL_colors[4], blurb, PL_colors[5], s0, s1);
2515
2516 if (do_utf8||utf8_pat)
1de06328
YO
2517 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2518 utf8_pat ? "pattern" : "",
2519 utf8_pat && do_utf8 ? " and " : "",
2520 do_utf8 ? "string" : ""
ab3bbdeb
YO
2521 );
2522 }
2523}
3df15adc
YO
2524
2525STATIC void
786e8c11
YO
2526S_dump_exec_pos(pTHX_ const char *locinput,
2527 const regnode *scan,
2528 const char *loc_regeol,
2529 const char *loc_bostr,
2530 const char *loc_reg_starttry,
2531 const bool do_utf8)
07be1b83 2532{
786e8c11 2533 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2534 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2535 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2536 /* The part of the string before starttry has one color
2537 (pref0_len chars), between starttry and current
2538 position another one (pref_len - pref0_len chars),
2539 after the current position the third one.
2540 We assume that pref0_len <= pref_len, otherwise we
2541 decrease pref0_len. */
786e8c11
YO
2542 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2543 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2544 int pref0_len;
2545
2546 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2547 pref_len++;
786e8c11
YO
2548 pref0_len = pref_len - (locinput - loc_reg_starttry);
2549 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2550 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2551 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2553 l--;
2554 if (pref0_len < 0)
2555 pref0_len = 0;
2556 if (pref0_len > pref_len)
2557 pref0_len = pref_len;
2558 {
3df15adc 2559 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2560
ab3bbdeb 2561 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2562 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2563
ab3bbdeb 2564 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2565 (locinput - pref_len + pref0_len),
1de06328 2566 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2567
ab3bbdeb 2568 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2569 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2570
1de06328 2571 const STRLEN tlen=len0+len1+len2;
3df15adc 2572 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2573 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2574 (IV)(locinput - loc_bostr),
07be1b83 2575 len0, s0,
07be1b83 2576 len1, s1,
07be1b83 2577 (docolor ? "" : "> <"),
07be1b83 2578 len2, s2,
f9f4320a 2579 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2580 "");
2581 }
2582}
3df15adc 2583
07be1b83
YO
2584#endif
2585
0a4db386
YO
2586/* reg_check_named_buff_matched()
2587 * Checks to see if a named buffer has matched. The data array of
2588 * buffer numbers corresponding to the buffer is expected to reside
2589 * in the regexp->data->data array in the slot stored in the ARG() of
2590 * node involved. Note that this routine doesn't actually care about the
2591 * name, that information is not preserved from compilation to execution.
2592 * Returns the index of the leftmost defined buffer with the given name
2593 * or 0 if non of the buffers matched.
2594 */
2595STATIC I32
2596S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2597 I32 n;
f8fc2ecf
YO
2598 RXi_GET_DECL(rex,rexi);
2599 SV *sv_dat=(SV*)rexi->data->data[ ARG( scan ) ];
0a4db386
YO
2600 I32 *nums=(I32*)SvPVX(sv_dat);
2601 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2602 if ((I32)*PL_reglastparen >= nums[n] &&
2603 PL_regendp[nums[n]] != -1)
2604 {
2605 return nums[n];
2606 }
2607 }
2608 return 0;
2609}
2610
28d8d7f4
YO
2611#define SETREX(Re1,Re2) \
2612 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2613 Re1 = (Re2)
2614
d6a28714 2615STATIC I32 /* 0 failure, 1 success */
24b23f37 2616S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2617{
a35a87e7 2618#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2619 dMY_CXT;
2620#endif
27da23d5 2621 dVAR;
95b24440 2622 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2623 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2624
3b0527fe 2625 regexp *rex = reginfo->prog;
f8fc2ecf
YO
2626 RXi_GET_DECL(rex,rexi);
2627
5d9a96ca
DM
2628 regmatch_slab *orig_slab;
2629 regmatch_state *orig_state;
a3621e74 2630
5d9a96ca
DM
2631 /* the current state. This is a cached copy of PL_regmatch_state */
2632 register regmatch_state *st;
95b24440 2633
5d9a96ca
DM
2634 /* cache heavy used fields of st in registers */
2635 register regnode *scan;
2636 register regnode *next;
438e9bae 2637 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2638 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2639 register char *locinput = PL_reginput;
5d9a96ca 2640 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2641
b69b0499 2642 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2643 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2644 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2645 const U32 max_nochange_depth =
2646 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2647 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2648
77cb431f
DM
2649 regmatch_state *yes_state = NULL; /* state to pop to on success of
2650 subpattern */
e2e6a0f1
YO
2651 /* mark_state piggy backs on the yes_state logic so that when we unwind
2652 the stack on success we can update the mark_state as we go */
2653 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5d458dd8 2654
faec1544 2655 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2656 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2657 U32 state_num;
5d458dd8
YO
2658 bool no_final = 0; /* prevent failure from backtracking? */
2659 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2660 char *startpoint = PL_reginput;
5d458dd8
YO
2661 SV *popmark = NULL; /* are we looking for a mark? */
2662 SV *sv_commit = NULL; /* last mark name seen in failure */
2663 SV *sv_yes_mark = NULL; /* last mark name we have seen
2664 during a successfull match */
2665 U32 lastopen = 0; /* last open we saw */
2666 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
58e23c8d 2667
5d458dd8 2668
24d3c4a9
DM
2669 /* these three flags are set by various ops to signal information to
2670 * the very next op. They have a useful lifetime of exactly one loop
2671 * iteration, and are not preserved or restored by state pushes/pops
2672 */
2673 bool sw = 0; /* the condition value in (?(cond)a|b) */
2674 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2675 int logical = 0; /* the following EVAL is:
2676 0: (?{...})
2677 1: (?(?{...})X|Y)
2678 2: (??{...})
2679 or the following IFMATCH/UNLESSM is:
2680 false: plain (?=foo)
2681 true: used as a condition: (?(?=foo))
2682 */
2683
95b24440 2684#ifdef DEBUGGING
e68ec53f 2685 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2686#endif
2687
3b57cd43 2688 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2689 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2690 }));
5d9a96ca
DM
2691 /* on first ever call to regmatch, allocate first slab */
2692 if (!PL_regmatch_slab) {
2693 Newx(PL_regmatch_slab, 1, regmatch_slab);
2694 PL_regmatch_slab->prev = NULL;
2695 PL_regmatch_slab->next = NULL;
86545054 2696 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2697 }
2698
2699 /* remember current high-water mark for exit */
2700 /* XXX this should be done with SAVE* instead */
2701 orig_slab = PL_regmatch_slab;
2702 orig_state = PL_regmatch_state;
2703
2704 /* grab next free state slot */
2705 st = ++PL_regmatch_state;
86545054 2706 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2707 st = PL_regmatch_state = S_push_slab(aTHX);
2708
d6a28714
JH
2709 /* Note that nextchr is a byte even in UTF */
2710 nextchr = UCHARAT(locinput);
2711 scan = prog;
2712 while (scan != NULL) {
8ba1375e 2713
a3621e74 2714 DEBUG_EXECUTE_r( {
6136c704 2715 SV * const prop = sv_newmortal();
1de06328 2716 regnode *rnext=regnext(scan);
786e8c11 2717 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2718 regprop(rex, prop, scan);
07be1b83
YO
2719
2720 PerlIO_printf(Perl_debug_log,
2721 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2722 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2723 SvPVX_const(prop),
1de06328 2724 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2725 0 : (IV)(rnext - rexi->program));
2a782b5b 2726 });
d6a28714
JH
2727
2728 next = scan + NEXT_OFF(scan);
2729 if (next == scan)
2730 next = NULL;
40a82448 2731 state_num = OP(scan);
d6a28714 2732
40a82448
DM
2733 reenter_switch:
2734 switch (state_num) {
d6a28714 2735 case BOL:
7fba1cd6 2736 if (locinput == PL_bostr)
d6a28714 2737 {
3b0527fe 2738 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2739 break;
2740 }
d6a28714
JH
2741 sayNO;
2742 case MBOL:
12d33761
HS
2743 if (locinput == PL_bostr ||
2744 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2745 {
b8c5462f
JH
2746 break;
2747 }
d6a28714
JH
2748 sayNO;
2749 case SBOL:
c2a73568 2750 if (locinput == PL_bostr)
b8c5462f 2751 break;
d6a28714
JH
2752 sayNO;
2753 case GPOS:
3b0527fe 2754 if (locinput == reginfo->ganch)
d6a28714
JH
2755 break;
2756 sayNO;
ee9b8eae
YO
2757
2758 case KEEPS:
2759 /* update the startpoint */
2760 st->u.keeper.val = PL_regstartp[0];
2761 PL_reginput = locinput;
2762 PL_regstartp[0] = locinput - PL_bostr;
2763 PUSH_STATE_GOTO(KEEPS_next, next);
2764 /*NOT-REACHED*/
2765 case KEEPS_next_fail:
2766 /* rollback the start point change */
2767 PL_regstartp[0] = st->u.keeper.val;
2768 sayNO_SILENT;
2769 /*NOT-REACHED*/
d6a28714 2770 case EOL:
d6a28714
JH
2771 goto seol;
2772 case MEOL:
d6a28714 2773 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2774 sayNO;
b8c5462f 2775 break;
d6a28714
JH
2776 case SEOL:
2777 seol:
2778 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2779 sayNO;
d6a28714 2780 if (PL_regeol - locinput > 1)
b8c5462f 2781 sayNO;
b8c5462f 2782 break;
d6a28714
JH
2783 case EOS:
2784 if (PL_regeol != locinput)
b8c5462f 2785 sayNO;
d6a28714 2786 break;
ffc61ed2 2787 case SANY:
d6a28714 2788 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2789 sayNO;
f33976b4
DB
2790 if (do_utf8) {
2791 locinput += PL_utf8skip[nextchr];
2792 if (locinput > PL_regeol)
2793 sayNO;
2794 nextchr = UCHARAT(locinput);
2795 }
2796 else
2797 nextchr = UCHARAT(++locinput);
2798 break;
2799 case CANY:
2800 if (!nextchr && locinput >= PL_regeol)
2801 sayNO;
b8c5462f 2802 nextchr = UCHARAT(++locinput);
a0d0e21e 2803 break;
ffc61ed2 2804 case REG_ANY:
1aa99e6b
IH
2805 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2806 sayNO;
2807 if (do_utf8) {
b8c5462f 2808 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2809 if (locinput > PL_regeol)
2810 sayNO;
a0ed51b3 2811 nextchr = UCHARAT(locinput);
a0ed51b3 2812 }
1aa99e6b
IH
2813 else
2814 nextchr = UCHARAT(++locinput);
a0ed51b3 2815 break;
166ba7cd
DM
2816
2817#undef ST
2818#define ST st->u.trie
786e8c11
YO
2819 case TRIEC:
2820 /* In this case the charclass data is available inline so
2821 we can fail fast without a lot of extra overhead.
2822 */
2823 if (scan->flags == EXACT || !do_utf8) {
2824 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2825 DEBUG_EXECUTE_r(
2826 PerlIO_printf(Perl_debug_log,
2827 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2828 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2829 );
2830 sayNO_SILENT;
2831 /* NOTREACHED */
2832 }
2833 }
2834 /* FALL THROUGH */
5b47454d 2835 case TRIE:
3dab1dad 2836 {
07be1b83 2837 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2838 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2839 trie_type = do_utf8 ?
2840 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2841 : trie_plain;
2842
2843 /* what trie are we using right now */
be8e71aa 2844 reg_trie_data * const trie
f8fc2ecf 2845 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
55eed653 2846 HV * widecharmap = (HV *)rexi->data->data[ ARG( scan ) + 1 ];
3dab1dad 2847 U32 state = trie->startstate;
166ba7cd 2848
3dab1dad
YO
2849 if (trie->bitmap && trie_type != trie_utf8_fold &&
2850 !TRIE_BITMAP_TEST(trie,*locinput)
2851 ) {
2852 if (trie->states[ state ].wordnum) {
2853 DEBUG_EXECUTE_r(
2854 PerlIO_printf(Perl_debug_log,
2855 "%*s %smatched empty string...%s\n",
5bc10b2c 2856 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2857 );
2858 break;
2859 } else {
2860 DEBUG_EXECUTE_r(
2861 PerlIO_printf(Perl_debug_log,
786e8c11 2862 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2863 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2864 );
2865 sayNO_SILENT;
2866 }
2867 }
166ba7cd 2868
786e8c11
YO
2869 {
2870 U8 *uc = ( U8* )locinput;
2871
2872 STRLEN len = 0;
2873 STRLEN foldlen = 0;
2874 U8 *uscan = (U8*)NULL;
2875 STRLEN bufflen=0;
2876 SV *sv_accept_buff = NULL;
2877 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2878
2879 ST.accepted = 0; /* how many accepting states we have seen */
2880 ST.B = next;
2881 ST.jump = trie->jump;
786e8c11 2882 ST.me = scan;
07be1b83
YO
2883 /*
2884 traverse the TRIE keeping track of all accepting states
2885 we transition through until we get to a failing node.
2886 */
2887
a3621e74 2888 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2889 U32 base = trie->states[ state ].trans.base;
f9f4320a 2890 UV uvc = 0;
786e8c11
YO
2891 U16 charid;
2892 /* We use charid to hold the wordnum as we don't use it
2893 for charid until after we have done the wordnum logic.
2894 We define an alias just so that the wordnum logic reads
2895 more naturally. */
2896
2897#define got_wordnum charid
2898 got_wordnum = trie->states[ state ].wordnum;
2899
2900 if ( got_wordnum ) {
2901 if ( ! ST.accepted ) {
5b47454d
DM
2902 ENTER;
2903 SAVETMPS;
2904 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2905 sv_accept_buff=newSV(bufflen *
2906 sizeof(reg_trie_accepted) - 1);
786e8c11 2907 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2908 SvPOK_on(sv_accept_buff);
2909 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2910 SAVETMPS;
2911 ST.accept_buff =
5b47454d
DM
2912 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2913 }
786e8c11 2914 do {
166ba7cd 2915 if (ST.accepted >= bufflen) {
5b47454d 2916 bufflen *= 2;
166ba7cd 2917 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2918 SvGROW(sv_accept_buff,
2919 bufflen * sizeof(reg_trie_accepted));
2920 }
2921 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2922 + sizeof(reg_trie_accepted));
a3621e74 2923
786e8c11
YO
2924
2925 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2926 ST.accept_buff[ST.accepted].endpos = uc;
2927 ++ST.accepted;
2928 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2929 }
2930#undef got_wordnum
a3621e74 2931
07be1b83 2932 DEBUG_TRIE_EXECUTE_r({
786e8c11 2933 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2934 PerlIO_printf( Perl_debug_log,
786e8c11 2935 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2936 2+depth * 2, "", PL_colors[4],
786e8c11 2937 (UV)state, (UV)ST.accepted );
07be1b83 2938 });
a3621e74
YO
2939
2940 if ( base ) {
55eed653
NC
2941 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
2942 uscan, len, uvc, charid, foldlen,
2943 foldbuf, uniflags);
a3621e74 2944
5b47454d
DM
2945 if (charid &&
2946 (base + charid > trie->uniquecharcount )
2947 && (base + charid - 1 - trie->uniquecharcount
2948 < trie->lasttrans)
2949 && trie->trans[base + charid - 1 -
2950 trie->uniquecharcount].check == state)
2951 {
2952 state = trie->trans[base + charid - 1 -
2953 trie->uniquecharcount ].next;
2954 }
2955 else {
2956 state = 0;
2957 }
2958 uc += len;
2959
2960 }
2961 else {
a3621e74
YO
2962 state = 0;
2963 }
2964 DEBUG_TRIE_EXECUTE_r(
e4584336 2965 PerlIO_printf( Perl_debug_log,
786e8c11 2966 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2967 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2968 );
2969 }
166ba7cd 2970 if (!ST.accepted )
a3621e74 2971 sayNO;
a3621e74 2972
166ba7cd
DM
2973 DEBUG_EXECUTE_r(
2974 PerlIO_printf( Perl_debug_log,
2975 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2976 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2977 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2978 );
786e8c11 2979 }}
fae667d5
YO
2980 goto trie_first_try; /* jump into the fail handler */
2981 /* NOTREACHED */
166ba7cd 2982 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
2983 if ( ST.jump) {
2984 REGCP_UNWIND(ST.cp);
2985 for (n = *PL_reglastparen; n > ST.lastparen; n--)
2986 PL_regendp[n] = -1;
2987 *PL_reglastparen = n;
2988 }
2989 trie_first_try:
5d458dd8
YO
2990 if (do_cutgroup) {
2991 do_cutgroup = 0;
2992 no_final = 0;
2993 }
fae667d5
YO
2994
2995 if ( ST.jump) {
2996 ST.lastparen = *PL_reglastparen;
2997 REGCP_SET(ST.cp);
2998 }
166ba7cd
DM
2999 if ( ST.accepted == 1 ) {
3000 /* only one choice left - just continue */
3001 DEBUG_EXECUTE_r({
2b8b4781
NC
3002 AV *const trie_words
3003 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3004 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3005 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3006 SV *sv= tmp ? sv_newmortal() : NULL;
3007
166ba7cd
DM
3008 PerlIO_printf( Perl_debug_log,
3009 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3010 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3011 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3012 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3013 PL_colors[0], PL_colors[1],
3014 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3015 )
3016 : "not compiled under -Dr",
166ba7cd
DM
3017 PL_colors[5] );
3018 });
3019 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3020 /* in this case we free tmps/leave before we call regmatch
3021 as we wont be using accept_buff again. */
5d458dd8 3022
166ba7cd
DM
3023 locinput = PL_reginput;
3024 nextchr = UCHARAT(locinput);
5d458dd8
YO
3025 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3026 scan = ST.B;
3027 else
3028 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3029 if (!has_cutgroup) {
3030 FREETMPS;
3031 LEAVE;
3032 } else {
3033 ST.accepted--;
3034 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3035 }
786e8c11 3036
166ba7cd
DM
3037 continue; /* execute rest of RE */
3038 }
fae667d5
YO
3039
3040 if ( !ST.accepted-- ) {
5d458dd8
YO
3041 DEBUG_EXECUTE_r({
3042 PerlIO_printf( Perl_debug_log,
3043 "%*s %sTRIE failed...%s\n",
3044 REPORT_CODE_OFF+depth*2, "",
3045 PL_colors[4],
3046 PL_colors[5] );
3047 });
166ba7cd
DM
3048 FREETMPS;
3049 LEAVE;
5d458dd8 3050 sayNO_SILENT;
fae667d5
YO
3051 /*NOTREACHED*/
3052 }
166ba7cd 3053
a3621e74 3054 /*
166ba7cd
DM
3055 There are at least two accepting states left. Presumably
3056 the number of accepting states is going to be low,
3057 typically two. So we simply scan through to find the one
3058 with lowest wordnum. Once we find it, we swap the last
3059 state into its place and decrement the size. We then try to
3060 match the rest of the pattern at the point where the word
3061 ends. If we succeed, control just continues along the
3062 regex; if we fail we return here to try the next accepting
3063 state
3064 */
a3621e74 3065
166ba7cd
DM
3066 {
3067 U32 best = 0;
3068 U32 cur;
3069 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3070 DEBUG_TRIE_EXECUTE_r(
f2278c82 3071 PerlIO_printf( Perl_debug_log,
166ba7cd 3072 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3073 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3074 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3075 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3076 );
3077
3078 if (ST.accept_buff[cur].wordnum <
3079 ST.accept_buff[best].wordnum)
3080 best = cur;
a3621e74 3081 }
166ba7cd
DM
3082
3083 DEBUG_EXECUTE_r({
2b8b4781
NC
3084 AV *const trie_words
3085 = (AV *) rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET];
3086 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3087 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3088 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3089 ST.B :
7f69552c 3090 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3091 SV *sv= tmp ? sv_newmortal() : NULL;
3092
265c4333
YO
3093 PerlIO_printf( Perl_debug_log,
3094 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3095 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3096 ST.accept_buff[best].wordnum,
de734bd5
A
3097 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3098 PL_colors[0], PL_colors[1],
3099 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3100 ) : "not compiled under -Dr",
265c4333 3101 REG_NODE_NUM(nextop),
166ba7cd
DM
3102 PL_colors[5] );
3103 });
3104
3105 if ( best<ST.accepted ) {
3106 reg_trie_accepted tmp = ST.accept_buff[ best ];
3107 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3108 ST.accept_buff[ ST.accepted ] = tmp;
3109 best = ST.accepted;
a3621e74 3110 }
166ba7cd 3111 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3112 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3113 scan = ST.B;
786e8c11
YO
3114 /* NOTREACHED */
3115 } else {
5d458dd8 3116 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
786e8c11
YO
3117 /* NOTREACHED */
3118 }
5d458dd8
YO
3119 if (has_cutgroup) {
3120 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3121 /* NOTREACHED */
3122 } else {
3123 PUSH_STATE_GOTO(TRIE_next, scan);
3124 /* NOTREACHED */
3125 }
786e8c11 3126 /* NOTREACHED */
166ba7cd 3127 }
166ba7cd 3128 /* NOTREACHED */
5d458dd8
YO
3129 case TRIE_next:
3130 FREETMPS;
3131 LEAVE;
3132 sayYES;
166ba7cd
DM
3133#undef ST
3134
95b24440
DM
3135 case EXACT: {
3136 char *s = STRING(scan);
24d3c4a9 3137 ln = STR_LEN(scan);
eb160463 3138 if (do_utf8 != UTF) {
bc517b45 3139 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3140 char *l = locinput;
24d3c4a9 3141 const char * const e = s + ln;
a72c7584 3142
5ff6fc6d
JH
3143 if (do_utf8) {
3144 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3145 while (s < e) {
a3b680e6 3146 STRLEN ulen;
1aa99e6b 3147 if (l >= PL_regeol)
5ff6fc6d
JH
3148 sayNO;
3149 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3150 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3151 uniflags))
5ff6fc6d 3152 sayNO;
bc517b45 3153 l += ulen;
5ff6fc6d 3154 s ++;
1aa99e6b 3155 }
5ff6fc6d
JH
3156 }
3157 else {
3158 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3159 while (s < e) {
a3b680e6 3160 STRLEN ulen;
1aa99e6b
IH
3161 if (l >= PL_regeol)
3162 sayNO;
5ff6fc6d 3163 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3164 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3165 uniflags))
1aa99e6b 3166 sayNO;
bc517b45 3167 s += ulen;
a72c7584 3168 l ++;
1aa99e6b 3169 }
5ff6fc6d 3170 }
1aa99e6b
IH
3171 locinput = l;
3172 nextchr = UCHARAT(locinput);
3173 break;
3174 }
bc517b45 3175 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3176 /* Inline the first character, for speed. */
3177 if (UCHARAT(s) != nextchr)
3178 sayNO;
24d3c4a9 3179 if (PL_regeol - locinput < ln)
d6a28714 3180 sayNO;
24d3c4a9 3181 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3182 sayNO;
24d3c4a9 3183 locinput += ln;
d6a28714
JH
3184 nextchr = UCHARAT(locinput);
3185 break;
95b24440 3186 }
d6a28714 3187 case EXACTFL:
b8c5462f
JH
3188 PL_reg_flags |= RF_tainted;
3189 /* FALL THROUGH */
95b24440 3190 case EXACTF: {
be8e71aa 3191 char * const s = STRING(scan);
24d3c4a9 3192 ln = STR_LEN(scan);
d6a28714 3193
d07ddd77
JH
3194 if (do_utf8 || UTF) {
3195 /* Either target or the pattern are utf8. */
be8e71aa 3196 const char * const l = locinput;
d07ddd77 3197 char *e = PL_regeol;
bc517b45 3198
24d3c4a9 3199 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3200 l, &e, 0, do_utf8)) {
5486206c
JH
3201 /* One more case for the sharp s:
3202 * pack("U0U*", 0xDF) =~ /ss/i,
3203 * the 0xC3 0x9F are the UTF-8
3204 * byte sequence for the U+00DF. */
3205 if (!(do_utf8 &&
3206 toLOWER(s[0]) == 's' &&
24d3c4a9 3207 ln >= 2 &&
5486206c
JH
3208 toLOWER(s[1]) == 's' &&
3209 (U8)l[0] == 0xC3 &&
3210 e - l >= 2 &&
3211 (U8)l[1] == 0x9F))
3212 sayNO;
3213 }
d07ddd77
JH
3214 locinput = e;
3215 nextchr = UCHARAT(locinput);
3216 break;
a0ed51b3 3217 }
d6a28714 3218
bc517b45
JH
3219 /* Neither the target and the pattern are utf8. */
3220
d6a28714
JH
3221 /* Inline the first character, for speed. */
3222 if (UCHARAT(s) != nextchr &&
3223 UCHARAT(s) != ((OP(scan) == EXACTF)
3224 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3225 sayNO;
24d3c4a9 3226 if (PL_regeol - locinput < ln)
b8c5462f 3227 sayNO;
24d3c4a9
DM
3228 if (ln > 1 && (OP(scan) == EXACTF
3229 ? ibcmp(s, locinput, ln)
3230 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3231 sayNO;
24d3c4a9 3232 locinput += ln;
d6a28714 3233 nextchr = UCHARAT(locinput);
a0d0e21e 3234 break;
95b24440 3235 }
d6a28714 3236 case ANYOF:
ffc61ed2 3237 if (do_utf8) {
9e55ce06
JH
3238 STRLEN inclasslen = PL_regeol - locinput;
3239
32fc9b6a 3240 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3241 goto anyof_fail;
ffc61ed2
JH
3242 if (locinput >= PL_regeol)
3243 sayNO;
0f0076b4 3244 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3245 nextchr = UCHARAT(locinput);
e0f9d4a8 3246 break;
ffc61ed2
JH
3247 }
3248 else {
3249 if (nextchr < 0)
3250 nextchr = UCHARAT(locinput);
32fc9b6a 3251 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3252 goto anyof_fail;
ffc61ed2
JH
3253 if (!nextchr && locinput >= PL_regeol)
3254 sayNO;
3255 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3256 break;
3257 }
262b90c4 3258 anyof_fail:
e0f9d4a8
JH
3259 /* If we might have the case of the German sharp s
3260 * in a casefolding Unicode character class. */
3261
ebc501f0
JH
3262 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3263 locinput += SHARP_S_SKIP;
e0f9d4a8 3264 nextchr = UCHARAT(locinput);
ffc61ed2 3265 }
e0f9d4a8
JH
3266 else
3267 sayNO;
b8c5462f 3268 break;
d6a28714 3269 case ALNUML:
b8c5462f
JH
3270 PL_reg_flags |= RF_tainted;
3271 /* FALL THROUGH */
d6a28714 3272 case ALNUM:
b8c5462f 3273 if (!nextchr)
4633a7c4 3274 sayNO;
ffc61ed2 3275 if (do_utf8) {
1a4fad37 3276 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3277 if (!(OP(scan) == ALNUM
bb7a0f54 3278 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3279 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3280 {
3281 sayNO;
a0ed51b3 3282 }
b8c5462f 3283 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3284 nextchr = UCHARAT(locinput);
3285 break;
3286 }
ffc61ed2 3287 if (!(OP(scan) == ALNUM
d6a28714 3288 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3289 sayNO;
b8c5462f 3290 nextchr = UCHARAT(++locinput);
a0d0e21e 3291 break;
d6a28714 3292 case NALNUML:
b8c5462f
JH
3293 PL_reg_flags |= RF_tainted;
3294 /* FALL THROUGH */
d6a28714
JH
3295 case NALNUM:
3296 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3297 sayNO;
ffc61ed2 3298 if (do_utf8) {
1a4fad37 3299 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3300 if (OP(scan) == NALNUM
bb7a0f54 3301 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3302 : isALNUM_LC_utf8((U8*)locinput))
3303 {
b8c5462f 3304 sayNO;
d6a28714 3305 }
b8c5462f
JH
3306 locinput += PL_utf8skip[nextchr];
3307 nextchr = UCHARAT(locinput);
3308 break;
3309 }
ffc61ed2 3310 if (OP(scan) == NALNUM
d6a28714 3311 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3312 sayNO;
76e3520e 3313 nextchr = UCHARAT(++locinput);
a0d0e21e 3314 break;
d6a28714
JH
3315 case BOUNDL:
3316 case NBOUNDL:
3280af22 3317 PL_reg_flags |= RF_tainted;
bbce6d69 3318 /* FALL THROUGH */
d6a28714
JH
3319 case BOUND:
3320 case NBOUND:
3321 /* was last char in word? */
ffc61ed2 3322 if (do_utf8) {
12d33761 3323 if (locinput == PL_bostr)
24d3c4a9 3324 ln = '\n';
ffc61ed2 3325 else {
a3b680e6 3326 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3327
24d3c4a9 3328 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3329 }
3330 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3331 ln = isALNUM_uni(ln);
1a4fad37 3332 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3333 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3334 }
3335 else {
24d3c4a9 3336 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3337 n = isALNUM_LC_utf8((U8*)locinput);
3338 }
a0ed51b3 3339 }
d6a28714 3340 else {
24d3c4a9 3341 ln = (locinput != PL_bostr) ?
12d33761 3342 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3343 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3344 ln = isALNUM(ln);
ffc61ed2
JH
3345 n = isALNUM(nextchr);
3346 }
3347 else {
24d3c4a9 3348 ln = isALNUM_LC(ln);
ffc61ed2
JH
3349 n = isALNUM_LC(nextchr);
3350 }
d6a28714 3351 }
24d3c4a9 3352 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3353 OP(scan) == BOUNDL))
3354 sayNO;
a0ed51b3 3355 break;
d6a28714 3356 case SPACEL:
3280af22 3357 PL_reg_flags |= RF_tainted;
bbce6d69 3358 /* FALL THROUGH */
d6a28714 3359 case SPACE:
9442cb0e 3360 if (!nextchr)
4633a7c4 3361 sayNO;
1aa99e6b 3362 if (do_utf8) {
fd400ab9 3363 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3364 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3365 if (!(OP(scan) == SPACE
bb7a0f54 3366 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3367 : isSPACE_LC_utf8((U8*)locinput)))
3368 {
3369 sayNO;
3370 }
3371 locinput += PL_utf8skip[nextchr];
3372 nextchr = UCHARAT(locinput);
3373 break;
d6a28714 3374 }
ffc61ed2
JH
3375 if (!(OP(scan) == SPACE
3376 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3377 sayNO;
3378 nextchr = UCHARAT(++locinput);
3379 }
3380 else {
3381 if (!(OP(scan) == SPACE
3382 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3383 sayNO;
3384 nextchr = UCHARAT(++locinput);
a0ed51b3 3385 }
a0ed51b3 3386 break;
d6a28714 3387 case NSPACEL:
3280af22 3388 PL_reg_flags |= RF_tainted;
bbce6d69 3389 /* FALL THROUGH */
d6a28714 3390 case NSPACE:
9442cb0e 3391 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3392 sayNO;
1aa99e6b 3393 if (do_utf8) {
1a4fad37 3394 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3395 if (OP(scan) == NSPACE
bb7a0f54 3396 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3397 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3398 {
3399 sayNO;
3400 }
3401 locinput += PL_utf8skip[nextchr];
3402 nextchr = UCHARAT(locinput);
3403 break;
a0ed51b3 3404 }
ffc61ed2 3405 if (OP(scan) == NSPACE
d6a28714 3406 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3407 sayNO;
76e3520e 3408 nextchr = UCHARAT(++locinput);
a0d0e21e 3409 break;
d6a28714 3410 case DIGITL:
a0ed51b3
LW
3411 PL_reg_flags |= RF_tainted;
3412 /* FALL THROUGH */
d6a28714 3413 case DIGIT:
9442cb0e 3414 if (!nextchr)
a0ed51b3 3415 sayNO;
1aa99e6b 3416 if (do_utf8) {
1a4fad37 3417 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3418 if (!(OP(scan) == DIGIT
bb7a0f54 3419 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3420 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3421 {
a0ed51b3 3422 sayNO;
dfe13c55 3423 }
6f06b55f 3424 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3425 nextchr = UCHARAT(locinput);
3426 break;
3427 }
ffc61ed2 3428 if (!(OP(scan) == DIGIT
9442cb0e 3429 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3430 sayNO;
3431 nextchr = UCHARAT(++locinput);
3432 break;
d6a28714 3433 case NDIGITL:
b8c5462f
JH
3434 PL_reg_flags |= RF_tainted;
3435 /* FALL THROUGH */
d6a28714 3436 case NDIGIT:
9442cb0e 3437 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3438 sayNO;
1aa99e6b 3439 if (do_utf8) {
1a4fad37 3440 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3441 if (OP(scan) == NDIGIT
bb7a0f54 3442 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3443 : isDIGIT_LC_utf8((U8*)locinput))
3444 {
a0ed51b3 3445 sayNO;
9442cb0e 3446 }
6f06b55f 3447 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3448 nextchr = UCHARAT(locinput);
3449 break;
3450 }
ffc61ed2 3451 if (OP(scan) == NDIGIT
9442cb0e 3452 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3453 sayNO;
3454 nextchr = UCHARAT(++locinput);
3455 break;
3456 case CLUMP:
b7c83a7e 3457 if (locinput >= PL_regeol)
a0ed51b3 3458 sayNO;
b7c83a7e 3459 if (do_utf8) {
1a4fad37 3460 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3461 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3462 sayNO;
3463 locinput += PL_utf8skip[nextchr];
3464 while (locinput < PL_regeol &&
3465 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3466 locinput += UTF8SKIP(locinput);
3467 if (locinput > PL_regeol)
3468 sayNO;
eb08e2da
JH
3469 }
3470 else
3471 locinput++;
a0ed51b3
LW
3472 nextchr = UCHARAT(locinput);
3473 break;
81714fb9
YO
3474
3475 case NREFFL:
3476 {
3477 char *s;
ff1157ca 3478 char type;
81714fb9
YO
3479 PL_reg_flags |= RF_tainted;
3480 /* FALL THROUGH */
3481 case NREF:
3482 case NREFF:
ff1157ca 3483 type = OP(scan);
0a4db386
YO
3484 n = reg_check_named_buff_matched(rex,scan);
3485
3486 if ( n ) {
3487 type = REF + ( type - NREF );
3488 goto do_ref;
3489 } else {
81714fb9 3490 sayNO;
0a4db386
YO
3491 }
3492 /* unreached */
c8756f30 3493 case REFFL:
3280af22 3494 PL_reg_flags |= RF_tainted;
c8756f30 3495 /* FALL THROUGH */
c277df42 3496 case REF:
81714fb9 3497 case REFF:
c277df42 3498 n = ARG(scan); /* which paren pair */
81714fb9
YO
3499 type = OP(scan);
3500 do_ref:
24d3c4a9 3501 ln = PL_regstartp[n];
2c2d71f5 3502 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3503 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3504 sayNO; /* Do not match unless seen CLOSEn. */
24d3c4a9 3505 if (ln == PL_regendp[n])
a0d0e21e 3506 break;
a0ed51b3 3507
24d3c4a9 3508 s = PL_bostr + ln;
81714fb9 3509 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3510 char *l = locinput;
a3b680e6 3511 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3512 /*
3513 * Note that we can't do the "other character" lookup trick as
3514 * in the 8-bit case (no pun intended) because in Unicode we
3515 * have to map both upper and title case to lower case.
3516 */
81714fb9 3517 if (type == REFF) {
a0ed51b3 3518 while (s < e) {
a3b680e6
AL
3519 STRLEN ulen1, ulen2;
3520 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3521 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3522
a0ed51b3
LW
3523 if (l >= PL_regeol)
3524 sayNO;
a2a2844f
JH
3525 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3526 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3527 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3528 sayNO;
a2a2844f
JH
3529 s += ulen1;
3530 l += ulen2;
a0ed51b3
LW
3531 }
3532 }
3533 locinput = l;
3534 nextchr = UCHARAT(locinput);
3535 break;
3536 }
3537
a0d0e21e 3538 /* Inline the first character, for speed. */
76e3520e 3539 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3540 (type == REF ||
3541 (UCHARAT(s) != (type == REFF
3542 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3543 sayNO;
24d3c4a9
DM
3544 ln = PL_regendp[n] - ln;
3545 if (locinput + ln > PL_regeol)
4633a7c4 3546 sayNO;
81714fb9 3547 if (ln > 1 && (type == REF
24d3c4a9 3548 ? memNE(s, locinput, ln)
81714fb9 3549 : (type == REFF
24d3c4a9
DM
3550 ? ibcmp(s, locinput, ln)
3551 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3552 sayNO;
24d3c4a9 3553 locinput += ln;
76e3520e 3554 nextchr = UCHARAT(locinput);
a0d0e21e 3555 break;
81714fb9 3556 }
a0d0e21e 3557 case NOTHING:
c277df42 3558 case TAIL:
a0d0e21e
LW
3559 break;
3560 case BACK:
3561 break;
40a82448
DM
3562
3563#undef ST
3564#define ST st->u.eval
c277df42 3565 {
c277df42 3566 SV *ret;
6bda09f9 3567 regexp *re;
f8fc2ecf 3568 regexp_internal *rei;
1a147d38
YO
3569 regnode *startpoint;
3570
3571 case GOSTART:
e7707071
YO
3572 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3573 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3574 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3575 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3576 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3577 Perl_croak(aTHX_
3578 "Pattern subroutine nesting without pos change"
3579 " exceeded limit in regex");
6bda09f9
YO
3580 } else {
3581 nochange_depth = 0;
1a147d38 3582 }
6bda09f9 3583 re = rex;
f8fc2ecf 3584 rei = rexi;
6bda09f9 3585 (void)ReREFCNT_inc(rex);
1a147d38 3586 if (OP(scan)==GOSUB) {
6bda09f9
YO
3587 startpoint = scan + ARG2L(scan);
3588 ST.close_paren = ARG(scan);
3589 } else {
f8fc2ecf 3590 startpoint = rei->program+1;
6bda09f9
YO
3591 ST.close_paren = 0;
3592 }
3593 goto eval_recurse_doit;
3594 /* NOTREACHED */
3595 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3596 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3597 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3598 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3599 } else {
3600 nochange_depth = 0;
3601 }
8e5e9ebe 3602 {
4aabdb9b
DM
3603 /* execute the code in the {...} */
3604 dSP;
6136c704 3605 SV ** const before = SP;
4aabdb9b
DM
3606 OP_4tree * const oop = PL_op;
3607 COP * const ocurcop = PL_curcop;
3608 PAD *old_comppad;
4aabdb9b
DM
3609
3610 n = ARG(scan);
f8fc2ecf 3611 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3612 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3613 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3614 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4aabdb9b
DM
3615 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3616
2bf803e2
YO
3617 if (sv_yes_mark) {
3618 SV *sv_mrk = get_sv("REGMARK", 1);
3619 sv_setsv(sv_mrk, sv_yes_mark);
3620 }
3621
8e5e9ebe
RGS
3622 CALLRUNOPS(aTHX); /* Scalar context. */
3623 SPAGAIN;
3624 if (SP == before)
075aa684 3625 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3626 else {
3627 ret = POPs;
3628 PUTBACK;
3629 }
4aabdb9b
DM
3630
3631 PL_op = oop;
3632 PAD_RESTORE_LOCAL(old_comppad);
3633 PL_curcop = ocurcop;
24d3c4a9 3634 if (!logical) {
4aabdb9b
DM
3635 /* /(?{...})/ */
3636 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3637 break;
3638 }
8e5e9ebe 3639 }
24d3c4a9
DM
3640 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3641 logical = 0;
4aabdb9b 3642 {
4f639d21
DM
3643 /* extract RE object from returned value; compiling if
3644 * necessary */
3645
6136c704 3646 MAGIC *mg = NULL;
be8e71aa 3647 const SV *sv;
faf82a0b
AE
3648 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3649 mg = mg_find(sv, PERL_MAGIC_qr);
3650 else if (SvSMAGICAL(ret)) {
3651 if (SvGMAGICAL(ret))
3652 sv_unmagic(ret, PERL_MAGIC_qr);
3653 else
3654 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3655 }
faf82a0b 3656
0f5d15d6 3657 if (mg) {
28d8d7f4 3658 re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
0f5d15d6
IZ
3659 }
3660 else {
3661 STRLEN len;
6136c704 3662 const char * const t = SvPV_const(ret, len);
0f5d15d6 3663 PMOP pm;
a3b680e6 3664 const I32 osize = PL_regsize;
0f5d15d6 3665
5fcd1c1b 3666 Zero(&pm, 1, PMOP);
4aabdb9b 3667 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3668 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3669 if (!(SvFLAGS(ret)
faf82a0b
AE
3670 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3671 | SVs_GMG)))
14befaf4
DM
3672 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3673 PERL_MAGIC_qr,0,0);
0f5d15d6 3674 PL_regsize = osize;
0f5d15d6 3675 }
4aabdb9b 3676 }
28d8d7f4
YO
3677 RX_MATCH_COPIED_off(re);
3678 re->subbeg = rex->subbeg;
3679 re->sublen = rex->sublen;
f8fc2ecf 3680 rei = RXi_GET(re);
6bda09f9
YO
3681 DEBUG_EXECUTE_r(
3682 debug_start_match(re, do_utf8, locinput, PL_regeol,
3683 "Matching embedded");
3684 );
f8fc2ecf 3685 startpoint = rei->program + 1;
1a147d38 3686 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
3687 /* borrowed from regtry */
3688 if (PL_reg_start_tmpl <= re->nparens) {
3689 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3690 if(PL_reg_start_tmp)
3691 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3692 else
3693 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 3694 }
aa283a38 3695
1a147d38 3696 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 3697 /* run the pattern returned from (??{...}) */
40a82448
DM
3698 ST.cp = regcppush(0); /* Save *all* the positions. */
3699 REGCP_SET(ST.lastcp);
6bda09f9 3700
1a147d38
YO
3701 PL_regstartp = re->startp; /* essentially NOOP on GOSUB */
3702 PL_regendp = re->endp; /* essentially NOOP on GOSUB */
6bda09f9 3703
4aabdb9b
DM
3704 *PL_reglastparen = 0;
3705 *PL_reglastcloseparen = 0;
4aabdb9b 3706 PL_reginput = locinput;
ae0beba1 3707 PL_regsize = 0;
4aabdb9b
DM
3708
3709 /* XXXX This is too dramatic a measure... */
3710 PL_reg_maxiter = 0;
3711
faec1544 3712 ST.toggle_reg_flags = PL_reg_flags;
bbe252da 3713 if (re->extflags & RXf_UTF8)
faec1544
DM
3714 PL_reg_flags |= RF_utf8;
3715 else
3716 PL_reg_flags &= ~RF_utf8;
3717 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3718
40a82448 3719 ST.prev_rex = rex;
faec1544 3720 ST.prev_curlyx = cur_curlyx;
28d8d7f4 3721 SETREX(rex,re);
f8fc2ecf 3722 rexi = rei;
faec1544 3723 cur_curlyx = NULL;
40a82448 3724 ST.B = next;
faec1544
DM
3725 ST.prev_eval = cur_eval;
3726 cur_eval = st;
faec1544 3727 /* now continue from first node in postoned RE */
6bda09f9 3728 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3729 /* NOTREACHED */
a0ed51b3 3730 }
24d3c4a9
DM
3731 /* logical is 1, /(?(?{...})X|Y)/ */
3732 sw = (bool)SvTRUE(ret);
3733 logical = 0;
c277df42
IZ
3734 break;
3735 }
40a82448 3736
faec1544
DM
3737 case EVAL_AB: /* cleanup after a successful (??{A})B */
3738 /* note: this is called twice; first after popping B, then A */
3739 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448 3740 ReREFCNT_dec(rex);
28d8d7f4 3741 SETREX(rex,ST.prev_rex);
f8fc2ecf 3742 rexi = RXi_GET(rex);
faec1544
DM
3743 regcpblow(ST.cp);
3744 cur_eval = ST.prev_eval;
3745 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3746 /* XXXX This is too dramatic a measure... */
3747 PL_reg_maxiter = 0;
e7707071 3748 if ( nochange_depth )
4b196cd4 3749 nochange_depth--;
262b90c4 3750 sayYES;
40a82448 3751
40a82448 3752
faec1544
DM
3753 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3754 /* note: this is called twice; first after popping B, then A */
3755 PL_reg_flags ^= ST.toggle_reg_flags;
3756 ReREFCNT_dec(rex);
28d8d7f4 3757 SETREX(rex,ST.prev_rex);
f8fc2ecf 3758 rexi = RXi_GET(rex);
40a82448
DM
3759 PL_reginput = locinput;
3760 REGCP_UNWIND(ST.lastcp);
3761 regcppop(rex);
faec1544
DM
3762 cur_eval = ST.prev_eval;
3763 cur_curlyx = ST.prev_curlyx;
3764 /* XXXX This is too dramatic a measure... */
3765 PL_reg_maxiter = 0;
e7707071 3766 if ( nochange_depth )
4b196cd4 3767 nochange_depth--;
40a82448 3768 sayNO_SILENT;
40a82448
DM
3769#undef ST
3770
a0d0e21e 3771 case OPEN:
c277df42 3772 n = ARG(scan); /* which paren pair */
3280af22
NIS
3773 PL_reg_start_tmp[n] = locinput;
3774 if (n > PL_regsize)
3775 PL_regsize = n;
e2e6a0f1 3776 lastopen = n;
a0d0e21e
LW
3777 break;
3778 case CLOSE:
c277df42 3779 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3780 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3781 PL_regendp[n] = locinput - PL_bostr;
7f69552c
YO
3782 /*if (n > PL_regsize)
3783 PL_regsize = n;*/
3b6647e0 3784 if (n > *PL_reglastparen)
3280af22 3785 *PL_reglastparen = n;
a01268b5 3786 *PL_reglastcloseparen = n;
3b6647e0 3787 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
3788 goto fake_end;
3789 }
a0d0e21e 3790 break;
e2e6a0f1
YO
3791 case ACCEPT:
3792 if (ARG(scan)){
3793 regnode *cursor;
3794 for (cursor=scan;
3795 cursor && OP(cursor)!=END;
3796 cursor=regnext(cursor))
3797 {
3798 if ( OP(cursor)==CLOSE ){
3799 n = ARG(cursor);
3800 if ( n <= lastopen ) {
3801 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3802 PL_regendp[n] = locinput - PL_bostr;
3803 /*if (n > PL_regsize)
3804 PL_regsize = n;*/
3b6647e0 3805 if (n > *PL_reglastparen)
e2e6a0f1
YO
3806 *PL_reglastparen = n;
3807 *PL_reglastcloseparen = n;
3b6647e0
RB
3808 if ( n == ARG(scan) || (cur_eval &&
3809 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
3810 break;
3811 }
3812 }
3813 }
3814 }
3815 goto fake_end;
3816 /*NOTREACHED*/
c277df42
IZ
3817 case GROUPP:
3818 n = ARG(scan); /* which paren pair */
3b6647e0 3819 sw = (bool)(*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42 3820 break;
0a4db386
YO
3821 case NGROUPP:
3822 /* reg_check_named_buff_matched returns 0 for no match */
3823 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3824 break;
1a147d38 3825 case INSUBP:
0a4db386 3826 n = ARG(scan);
3b6647e0 3827 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
3828 break;
3829 case DEFINEP:
3830 sw = 0;
3831 break;
c277df42 3832 case IFTHEN:
2c2d71f5 3833 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3834 if (sw)
c277df42
IZ
3835 next = NEXTOPER(NEXTOPER(scan));
3836 else {
3837 next = scan + ARG(scan);
3838 if (OP(next) == IFTHEN) /* Fake one. */
3839 next = NEXTOPER(NEXTOPER(next));
3840 }
3841 break;
3842 case LOGICAL:
24d3c4a9 3843 logical = scan->flags;
c277df42 3844 break;
c476f425 3845
2ab05381 3846/*******************************************************************
2ab05381 3847
c476f425
DM
3848The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3849pattern, where A and B are subpatterns. (For simple A, CURLYM or
3850STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3851
c476f425 3852A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3853
c476f425
DM
3854On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3855state, which contains the current count, initialised to -1. It also sets
3856cur_curlyx to point to this state, with any previous value saved in the
3857state block.
2ab05381 3858
c476f425
DM
3859CURLYX then jumps straight to the WHILEM op, rather than executing A,
3860since the pattern may possibly match zero times (i.e. it's a while {} loop
3861rather than a do {} while loop).
2ab05381 3862
c476f425
DM
3863Each entry to WHILEM represents a successful match of A. The count in the
3864CURLYX block is incremented, another WHILEM state is pushed, and execution
3865passes to A or B depending on greediness and the current count.
2ab05381 3866
c476f425
DM
3867For example, if matching against the string a1a2a3b (where the aN are
3868substrings that match /A/), then the match progresses as follows: (the
3869pushed states are interspersed with the bits of strings matched so far):
2ab05381 3870
c476f425
DM
3871 <CURLYX cnt=-1>
3872 <CURLYX cnt=0><WHILEM>
3873 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3874 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3875 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3876 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3877
c476f425
DM
3878(Contrast this with something like CURLYM, which maintains only a single
3879backtrack state:
2ab05381 3880
c476f425
DM
3881 <CURLYM cnt=0> a1
3882 a1 <CURLYM cnt=1> a2
3883 a1 a2 <CURLYM cnt=2> a3
3884 a1 a2 a3 <CURLYM cnt=3> b
3885)
2ab05381 3886
c476f425
DM
3887Each WHILEM state block marks a point to backtrack to upon partial failure
3888of A or B, and also contains some minor state data related to that
3889iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3890overall state, such as the count, and pointers to the A and B ops.
2ab05381 3891
c476f425
DM
3892This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3893must always point to the *current* CURLYX block, the rules are:
2ab05381 3894
c476f425
DM
3895When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3896and set cur_curlyx to point the new block.
2ab05381 3897
c476f425
DM
3898When popping the CURLYX block after a successful or unsuccessful match,
3899restore the previous cur_curlyx.
2ab05381 3900
c476f425
DM
3901When WHILEM is about to execute B, save the current cur_curlyx, and set it
3902to the outer one saved in the CURLYX block.
2ab05381 3903
c476f425
DM
3904When popping the WHILEM block after a successful or unsuccessful B match,
3905restore the previous cur_curlyx.
2ab05381 3906
c476f425
DM
3907Here's an example for the pattern (AI* BI)*BO
3908I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3909
c476f425
DM
3910cur_
3911curlyx backtrack stack
3912------ ---------------
3913NULL
3914CO <CO prev=NULL> <WO>
3915CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3916CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3917NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3918
c476f425
DM
3919At this point the pattern succeeds, and we work back down the stack to
3920clean up, restoring as we go:
95b24440 3921
c476f425
DM
3922CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3923CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3924CO <CO prev=NULL> <WO>
3925NULL
a0374537 3926
c476f425
DM
3927*******************************************************************/
3928
3929#define ST st->u.curlyx
3930
3931 case CURLYX: /* start of /A*B/ (for complex A) */
3932 {
3933 /* No need to save/restore up to this paren */
3934 I32 parenfloor = scan->flags;
3935
3936 assert(next); /* keep Coverity happy */
3937 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3938 next += ARG(next);
3939
3940 /* XXXX Probably it is better to teach regpush to support
3941 parenfloor > PL_regsize... */
3942 if (parenfloor > (I32)*PL_reglastparen)
3943 parenfloor = *PL_reglastparen; /* Pessimization... */
3944
3945 ST.prev_curlyx= cur_curlyx;
3946 cur_curlyx = st;
3947 ST.cp = PL_savestack_ix;
3948
3949 /* these fields contain the state of the current curly.
3950 * they are accessed by subsequent WHILEMs */
3951 ST.parenfloor = parenfloor;
3952 ST.min = ARG1(scan);
3953 ST.max = ARG2(scan);
3954 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3955 ST.B = next;
24d3c4a9
DM
3956 ST.minmod = minmod;
3957 minmod = 0;
c476f425
DM
3958 ST.count = -1; /* this will be updated by WHILEM */
3959 ST.lastloc = NULL; /* this will be updated by WHILEM */
3960
3961 PL_reginput = locinput;
3962 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 3963 /* NOTREACHED */
c476f425 3964 }
a0d0e21e 3965
c476f425 3966 case CURLYX_end: /* just finished matching all of A*B */
f0852a51
YO
3967 if (PL_reg_eval_set){
3968 SV *pres= GvSV(PL_replgv);
3969 SvREFCNT_inc(pres);
3970 regcpblow(ST.cp);
3971 sv_setsv(GvSV(PL_replgv), pres);
3972 SvREFCNT_dec(pres);
3973 } else {
3974 regcpblow(ST.cp);
3975 }
c476f425
DM
3976 cur_curlyx = ST.prev_curlyx;
3977 sayYES;
3978 /* NOTREACHED */
a0d0e21e 3979
c476f425
DM
3980 case CURLYX_end_fail: /* just failed to match all of A*B */
3981 regcpblow(ST.cp);
3982 cur_curlyx = ST.prev_curlyx;
3983 sayNO;
3984 /* NOTREACHED */
4633a7c4 3985
a0d0e21e 3986
c476f425
DM
3987#undef ST
3988#define ST st->u.whilem
3989
3990 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3991 {
3992 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
3993 I32 n;
3994 assert(cur_curlyx); /* keep Coverity happy */
3995 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3996 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3997 ST.cache_offset = 0;
3998 ST.cache_mask = 0;
3999
4000 PL_reginput = locinput;
4001
4002 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4003 "%*s whilem: matched %ld out of %ld..%ld\n",
4004 REPORT_CODE_OFF+depth*2, "", (long)n,
4005 (long)cur_curlyx->u.curlyx.min,
4006 (long)cur_curlyx->u.curlyx.max)
4007 );
a0d0e21e 4008
c476f425 4009 /* First just match a string of min A's. */
a0d0e21e 4010
c476f425
DM
4011 if (n < cur_curlyx->u.curlyx.min) {
4012 cur_curlyx->u.curlyx.lastloc = locinput;
4013 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4014 /* NOTREACHED */
4015 }
4016
4017 /* If degenerate A matches "", assume A done. */
4018
4019 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4020 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4021 "%*s whilem: empty match detected, trying continuation...\n",
4022 REPORT_CODE_OFF+depth*2, "")
4023 );
4024 goto do_whilem_B_max;
4025 }
4026
4027 /* super-linear cache processing */
4028
4029 if (scan->flags) {
a0d0e21e 4030
2c2d71f5 4031 if (!PL_reg_maxiter) {
c476f425
DM
4032 /* start the countdown: Postpone detection until we
4033 * know the match is not *that* much linear. */
2c2d71f5 4034 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4035 /* possible overflow for long strings and many CURLYX's */
4036 if (PL_reg_maxiter < 0)
4037 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4038 PL_reg_leftiter = PL_reg_maxiter;
4039 }
c476f425 4040
2c2d71f5 4041 if (PL_reg_leftiter-- == 0) {
c476f425 4042 /* initialise cache */
3298f257 4043 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4044 if (PL_reg_poscache) {
eb160463 4045 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4046 Renew(PL_reg_poscache, size, char);
4047 PL_reg_poscache_size = size;
4048 }
4049 Zero(PL_reg_poscache, size, char);
4050 }
4051 else {
4052 PL_reg_poscache_size = size;
a02a5408 4053 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4054 }
c476f425
DM
4055 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4056 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4057 PL_colors[4], PL_colors[5])
4058 );
2c2d71f5 4059 }
c476f425 4060
2c2d71f5 4061 if (PL_reg_leftiter < 0) {
c476f425
DM
4062 /* have we already failed at this position? */
4063 I32 offset, mask;
4064 offset = (scan->flags & 0xf) - 1
4065 + (locinput - PL_bostr) * (scan->flags>>4);
4066 mask = 1 << (offset % 8);
4067 offset /= 8;
4068 if (PL_reg_poscache[offset] & mask) {
4069 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4070 "%*s whilem: (cache) already tried at this position...\n",
4071 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4072 );
3298f257 4073 sayNO; /* cache records failure */
2c2d71f5 4074 }
c476f425
DM
4075 ST.cache_offset = offset;
4076 ST.cache_mask = mask;
2c2d71f5 4077 }
c476f425 4078 }
2c2d71f5 4079
c476f425 4080 /* Prefer B over A for minimal matching. */
a687059c 4081
c476f425
DM
4082 if (cur_curlyx->u.curlyx.minmod) {
4083 ST.save_curlyx = cur_curlyx;
4084 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4085 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4086 REGCP_SET(ST.lastcp);
4087 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4088 /* NOTREACHED */
4089 }
a0d0e21e 4090
c476f425
DM
4091 /* Prefer A over B for maximal matching. */
4092
4093 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4094 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4095 cur_curlyx->u.curlyx.lastloc = locinput;
4096 REGCP_SET(ST.lastcp);
4097 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4098 /* NOTREACHED */
4099 }
4100 goto do_whilem_B_max;
4101 }
4102 /* NOTREACHED */
4103
4104 case WHILEM_B_min: /* just matched B in a minimal match */
4105 case WHILEM_B_max: /* just matched B in a maximal match */
4106 cur_curlyx = ST.save_curlyx;
4107 sayYES;
4108 /* NOTREACHED */
4109
4110 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4111 cur_curlyx = ST.save_curlyx;
4112 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4113 cur_curlyx->u.curlyx.count--;
4114 CACHEsayNO;
4115 /* NOTREACHED */
4116
4117 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4118 REGCP_UNWIND(ST.lastcp);
4119 regcppop(rex);
4120 /* FALL THROUGH */
4121 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4122 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4123 cur_curlyx->u.curlyx.count--;
4124 CACHEsayNO;
4125 /* NOTREACHED */
4126
4127 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4128 REGCP_UNWIND(ST.lastcp);
4129 regcppop(rex); /* Restore some previous $<digit>s? */
4130 PL_reginput = locinput;
4131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4132 "%*s whilem: failed, trying continuation...\n",
4133 REPORT_CODE_OFF+depth*2, "")
4134 );
4135 do_whilem_B_max:
4136 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4137 && ckWARN(WARN_REGEXP)
4138 && !(PL_reg_flags & RF_warned))
4139 {
4140 PL_reg_flags |= RF_warned;
4141 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4142 "Complex regular subexpression recursion",
4143 REG_INFTY - 1);
4144 }
4145
4146 /* now try B */
4147 ST.save_curlyx = cur_curlyx;
4148 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4149 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4150 /* NOTREACHED */
4151
4152 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4153 cur_curlyx = ST.save_curlyx;
4154 REGCP_UNWIND(ST.lastcp);
4155 regcppop(rex);
4156
4157 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4158 /* Maximum greed exceeded */
4159 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4160 && ckWARN(WARN_REGEXP)
4161 && !(PL_reg_flags & RF_warned))
4162 {
3280af22 4163 PL_reg_flags |= RF_warned;
c476f425
DM
4164 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4165 "%s limit (%d) exceeded",
4166 "Complex regular subexpression recursion",
4167 REG_INFTY - 1);
a0d0e21e 4168 }
c476f425 4169 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4170 CACHEsayNO;
a0d0e21e 4171 }
c476f425
DM
4172
4173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4174 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4175 );
4176 /* Try grabbing another A and see if it helps. */
4177 PL_reginput = locinput;
4178 cur_curlyx->u.curlyx.lastloc = locinput;
4179 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4180 REGCP_SET(ST.lastcp);
4181 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4182 /* NOTREACHED */
40a82448
DM
4183
4184#undef ST
4185#define ST st->u.branch
4186
4187 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4188 next = scan + ARG(scan);
4189 if (next == scan)
4190 next = NULL;
40a82448
DM
4191 scan = NEXTOPER(scan);
4192 /* FALL THROUGH */
c277df42 4193
40a82448
DM
4194 case BRANCH: /* /(...|A|...)/ */
4195 scan = NEXTOPER(scan); /* scan now points to inner node */
5d458dd8
YO
4196 if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4197 && !has_cutgroup)
4198 {
40a82448
DM
4199 /* last branch; skip state push and jump direct to node */
4200 continue;
5d458dd8 4201 }
40a82448
DM
4202 ST.lastparen = *PL_reglastparen;
4203 ST.next_branch = next;
4204 REGCP_SET(ST.cp);
4205 PL_reginput = locinput;
02db2b7b 4206
40a82448 4207 /* Now go into the branch */
5d458dd8
YO
4208 if (has_cutgroup) {
4209 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4210 } else {
4211 PUSH_STATE_GOTO(BRANCH_next, scan);
4212 }
40a82448 4213 /* NOTREACHED */
5d458dd8
YO
4214 case CUTGROUP:
4215 PL_reginput = locinput;
4216 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
f8fc2ecf 4217 (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4218 PUSH_STATE_GOTO(CUTGROUP_next,next);
4219 /* NOTREACHED */
4220 case CUTGROUP_next_fail:
4221 do_cutgroup = 1;
4222 no_final = 1;
4223 if (st->u.mark.mark_name)
4224 sv_commit = st->u.mark.mark_name;
4225 sayNO;
4226 /* NOTREACHED */
4227 case BRANCH_next:
4228 sayYES;
4229 /* NOTREACHED */
40a82448 4230 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4231 if (do_cutgroup) {
4232 do_cutgroup = 0;
4233 no_final = 0;
4234 }
40a82448
DM
4235 REGCP_UNWIND(ST.cp);
4236 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4237 PL_regendp[n] = -1;
4238 *PL_reglastparen = n;
0a4db386 4239 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4240 scan = ST.next_branch;
4241 /* no more branches? */
5d458dd8
YO
4242 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4243 DEBUG_EXECUTE_r({
4244 PerlIO_printf( Perl_debug_log,
4245 "%*s %sBRANCH failed...%s\n",
4246 REPORT_CODE_OFF+depth*2, "",
4247 PL_colors[4],
4248 PL_colors[5] );
4249 });
4250 sayNO_SILENT;
4251 }
40a82448
DM
4252 continue; /* execute next BRANCH[J] op */
4253 /* NOTREACHED */
4254
a0d0e21e 4255 case MINMOD:
24d3c4a9 4256 minmod = 1;
a0d0e21e 4257 break;
40a82448
DM
4258
4259#undef ST
4260#define ST st->u.curlym
4261
4262 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4263
4264 /* This is an optimisation of CURLYX that enables us to push
4265 * only a single backtracking state, no matter now many matches
4266 * there are in {m,n}. It relies on the pattern being constant
4267 * length, with no parens to influence future backrefs
4268 */
4269
4270 ST.me = scan;
dc45a647 4271 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4272
4273 /* if paren positive, emulate an OPEN/CLOSE around A */
4274 if (ST.me->flags) {
3b6647e0 4275 U32 paren = ST.me->flags;
40a82448
DM
4276 if (paren > PL_regsize)
4277 PL_regsize = paren;
3b6647e0 4278 if (paren > *PL_reglastparen)
40a82448 4279 *PL_reglastparen = paren;
c277df42 4280 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4281 }
40a82448
DM
4282 ST.A = scan;
4283 ST.B = next;
4284 ST.alen = 0;
4285 ST.count = 0;
24d3c4a9
DM
4286 ST.minmod = minmod;
4287 minmod = 0;
40a82448
DM
4288 ST.c1 = CHRTEST_UNINIT;
4289 REGCP_SET(ST.cp);
6407bf3b 4290
40a82448
DM
4291 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4292 goto curlym_do_B;
4293
4294 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4295 PL_reginput = locinput;
40a82448
DM
4296 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4297 /* NOTREACHED */
5f80c4cf 4298
40a82448
DM
4299 case CURLYM_A: /* we've just matched an A */
4300 locinput = st->locinput;
4301 nextchr = UCHARAT(locinput);
4302
4303 ST.count++;
4304 /* after first match, determine A's length: u.curlym.alen */
4305 if (ST.count == 1) {
4306 if (PL_reg_match_utf8) {
4307 char *s = locinput;
4308 while (s < PL_reginput) {
4309 ST.alen++;
4310 s += UTF8SKIP(s);
4311 }
4312 }
4313 else {
4314 ST.alen = PL_reginput - locinput;
4315 }
4316 if (ST.alen == 0)
4317 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4318 }
0cadcf80
DM
4319 DEBUG_EXECUTE_r(
4320 PerlIO_printf(Perl_debug_log,
40a82448 4321 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4322 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4323 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4324 );
4325
40a82448 4326 locinput = PL_reginput;
0a4db386
YO
4327
4328 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4329 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4330 goto fake_end;
4331
4332 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4333 goto curlym_do_A; /* try to match another A */
4334 goto curlym_do_B; /* try to match B */
5f80c4cf 4335
40a82448
DM
4336 case CURLYM_A_fail: /* just failed to match an A */
4337 REGCP_UNWIND(ST.cp);
0a4db386
YO
4338
4339 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4340 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4341 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4342 sayNO;
0cadcf80 4343
40a82448
DM
4344 curlym_do_B: /* execute the B in /A{m,n}B/ */
4345 PL_reginput = locinput;
4346 if (ST.c1 == CHRTEST_UNINIT) {
4347 /* calculate c1 and c2 for possible match of 1st char
4348 * following curly */
4349 ST.c1 = ST.c2 = CHRTEST_VOID;
4350 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4351 regnode *text_node = ST.B;
4352 if (! HAS_TEXT(text_node))
4353 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4354 /* this used to be
4355
4356 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4357
4358 But the former is redundant in light of the latter.
4359
4360 if this changes back then the macro for
4361 IS_TEXT and friends need to change.
4362 */
4363 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4364 {
ee9b8eae 4365
40a82448
DM
4366 ST.c1 = (U8)*STRING(text_node);
4367 ST.c2 =
ee9b8eae 4368 (IS_TEXTF(text_node))
40a82448 4369 ? PL_fold[ST.c1]
ee9b8eae 4370 : (IS_TEXTFL(text_node))
40a82448
DM
4371 ? PL_fold_locale[ST.c1]
4372 : ST.c1;
c277df42 4373 }
c277df42 4374 }
40a82448
DM
4375 }
4376
4377 DEBUG_EXECUTE_r(
4378 PerlIO_printf(Perl_debug_log,
4379 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4380 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4381 "", (IV)ST.count)
4382 );
4383 if (ST.c1 != CHRTEST_VOID
4384 && UCHARAT(PL_reginput) != ST.c1
4385 && UCHARAT(PL_reginput) != ST.c2)
4386 {
4387 /* simulate B failing */
3e901dc0
YO
4388 DEBUG_OPTIMISE_r(
4389 PerlIO_printf(Perl_debug_log,
4390 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4391 (int)(REPORT_CODE_OFF+(depth*2)),"",
4392 (IV)ST.c1,(IV)ST.c2
4393 ));
40a82448
DM
4394 state_num = CURLYM_B_fail;
4395 goto reenter_switch;
4396 }
4397
4398 if (ST.me->flags) {
4399 /* mark current A as captured */
4400 I32 paren = ST.me->flags;
4401 if (ST.count) {
4402 PL_regstartp[paren]
4403 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4404 PL_regendp[paren] = PL_reginput - PL_bostr;
0a4db386 4405 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4406 }
40a82448
DM
4407 else
4408 PL_regendp[paren] = -1;
0a4db386 4409 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4410 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4411 {
4412 if (ST.count)
4413 goto fake_end;
4414 else
4415 sayNO;
4416 }
c277df42 4417 }
0a4db386 4418
40a82448 4419 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4420 /* NOTREACHED */
40a82448
DM
4421
4422 case CURLYM_B_fail: /* just failed to match a B */
4423 REGCP_UNWIND(ST.cp);
4424 if (ST.minmod) {
4425 if (ST.count == ARG2(ST.me) /* max */)
4426 sayNO;
4427 goto curlym_do_A; /* try to match a further A */
4428 }
4429 /* backtrack one A */
4430 if (ST.count == ARG1(ST.me) /* min */)
4431 sayNO;
4432 ST.count--;
4433 locinput = HOPc(locinput, -ST.alen);
4434 goto curlym_do_B; /* try to match B */
4435
c255a977
DM
4436#undef ST
4437#define ST st->u.curly
40a82448 4438
c255a977
DM
4439#define CURLY_SETPAREN(paren, success) \
4440 if (paren) { \
4441 if (success) { \
4442 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4443 PL_regendp[paren] = locinput - PL_bostr; \
0a4db386 4444 *PL_reglastcloseparen = paren; \
c255a977
DM
4445 } \
4446 else \
4447 PL_regendp[paren] = -1; \
4448 }
4449
4450 case STAR: /* /A*B/ where A is width 1 */
4451 ST.paren = 0;
4452 ST.min = 0;
4453 ST.max = REG_INFTY;
a0d0e21e
LW
4454 scan = NEXTOPER(scan);
4455 goto repeat;
c255a977
DM
4456 case PLUS: /* /A+B/ where A is width 1 */
4457 ST.paren = 0;
4458 ST.min = 1;
4459 ST.max = REG_INFTY;
c277df42 4460 scan = NEXTOPER(scan);
c255a977
DM
4461 goto repeat;
4462 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4463 ST.paren = scan->flags; /* Which paren to set */
4464 if (ST.paren > PL_regsize)
4465 PL_regsize = ST.paren;
3b6647e0 4466 if (ST.paren > *PL_reglastparen)
c255a977
DM
4467 *PL_reglastparen = ST.paren;
4468 ST.min = ARG1(scan); /* min to match */
4469 ST.max = ARG2(scan); /* max to match */
0a4db386 4470 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4471 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4472 ST.min=1;
4473 ST.max=1;
4474 }
c255a977
DM
4475 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4476 goto repeat;
4477 case CURLY: /* /A{m,n}B/ where A is width 1 */
4478 ST.paren = 0;
4479 ST.min = ARG1(scan); /* min to match */
4480 ST.max = ARG2(scan); /* max to match */
4481 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4482 repeat:
a0d0e21e
LW
4483 /*
4484 * Lookahead to avoid useless match attempts
4485 * when we know what character comes next.
c255a977 4486 *
5f80c4cf
JP
4487 * Used to only do .*x and .*?x, but now it allows
4488 * for )'s, ('s and (?{ ... })'s to be in the way
4489 * of the quantifier and the EXACT-like node. -- japhy
4490 */
4491
c255a977
DM
4492 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4493 sayNO;
cca55fe3 4494 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4495 U8 *s;
4496 regnode *text_node = next;
4497
3dab1dad
YO
4498 if (! HAS_TEXT(text_node))
4499 FIND_NEXT_IMPT(text_node);
5f80c4cf 4500
9e137952 4501 if (! HAS_TEXT(text_node))
c255a977 4502 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4503 else {
ee9b8eae 4504 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4505 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4506 goto assume_ok_easy;
cca55fe3 4507 }
be8e71aa
YO
4508 else
4509 s = (U8*)STRING(text_node);
ee9b8eae
YO
4510
4511 /* Currently we only get here when
4512
4513 PL_rekind[OP(text_node)] == EXACT
4514
4515 if this changes back then the macro for IS_TEXT and
4516 friends need to change. */
5f80c4cf 4517 if (!UTF) {
c255a977 4518 ST.c2 = ST.c1 = *s;
ee9b8eae 4519 if (IS_TEXTF(text_node))
c255a977 4520 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4521 else if (IS_TEXTFL(text_node))
c255a977 4522 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4523 }
5f80c4cf 4524 else { /* UTF */
ee9b8eae 4525 if (IS_TEXTF(text_node)) {
a2a2844f 4526 STRLEN ulen1, ulen2;
89ebb4a3
JH
4527 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4528 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4529
4530 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4531 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4532#ifdef EBCDIC
4533 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4534 ckWARN(WARN_UTF8) ?
4535 0 : UTF8_ALLOW_ANY);
4536 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4537 ckWARN(WARN_UTF8) ?
4538 0 : UTF8_ALLOW_ANY);
4539#else
c255a977 4540 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4541 uniflags);
c255a977 4542 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4543 uniflags);
4544#endif
5f80c4cf
JP
4545 }
4546 else {
c255a977 4547 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4548 uniflags);
5f80c4cf 4549 }
1aa99e6b
IH
4550 }
4551 }
bbce6d69 4552 }
a0d0e21e 4553 else
c255a977 4554 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4555 assume_ok_easy:
c255a977
DM
4556
4557 ST.A = scan;
4558 ST.B = next;
3280af22 4559 PL_reginput = locinput;
24d3c4a9
DM
4560 if (minmod) {
4561 minmod = 0;
e2e6a0f1 4562 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4563 sayNO;
c255a977 4564 ST.count = ST.min;
a0ed51b3 4565 locinput = PL_reginput;
c255a977
DM
4566 REGCP_SET(ST.cp);
4567 if (ST.c1 == CHRTEST_VOID)
4568 goto curly_try_B_min;
4569
4570 ST.oldloc = locinput;
4571
4572 /* set ST.maxpos to the furthest point along the
4573 * string that could possibly match */
4574 if (ST.max == REG_INFTY) {
4575 ST.maxpos = PL_regeol - 1;
4576 if (do_utf8)
4577 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4578 ST.maxpos--;
4579 }
4580 else if (do_utf8) {
4581 int m = ST.max - ST.min;
4582 for (ST.maxpos = locinput;
4583 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4584 ST.maxpos += UTF8SKIP(ST.maxpos);
4585 }
4586 else {
4587 ST.maxpos = locinput + ST.max - ST.min;
4588 if (ST.maxpos >= PL_regeol)
4589 ST.maxpos = PL_regeol - 1;
4590 }
4591 goto curly_try_B_min_known;
4592
4593 }
4594 else {
e2e6a0f1 4595 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4596 locinput = PL_reginput;
4597 if (ST.count < ST.min)
4598 sayNO;
4599 if ((ST.count > ST.min)
4600 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4601 {
4602 /* A{m,n} must come at the end of the string, there's
4603 * no point in backing off ... */
4604 ST.min = ST.count;
4605 /* ...except that $ and \Z can match before *and* after
4606 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4607 We may back off by one in this case. */
4608 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4609 ST.min--;
4610 }
4611 REGCP_SET(ST.cp);
4612 goto curly_try_B_max;
4613 }
4614 /* NOTREACHED */
4615
4616
4617 case CURLY_B_min_known_fail:
4618 /* failed to find B in a non-greedy match where c1,c2 valid */
4619 if (ST.paren && ST.count)
4620 PL_regendp[ST.paren] = -1;
4621
4622 PL_reginput = locinput; /* Could be reset... */
4623 REGCP_UNWIND(ST.cp);
4624 /* Couldn't or didn't -- move forward. */
4625 ST.oldloc = locinput;
4626 if (do_utf8)
4627 locinput += UTF8SKIP(locinput);
4628 else
4629 locinput++;
4630 ST.count++;
4631 curly_try_B_min_known:
4632 /* find the next place where 'B' could work, then call B */
4633 {
4634 int n;
4635 if (do_utf8) {
4636 n = (ST.oldloc == locinput) ? 0 : 1;
4637 if (ST.c1 == ST.c2) {
4638 STRLEN len;
4639 /* set n to utf8_distance(oldloc, locinput) */
4640 while (locinput <= ST.maxpos &&
4641 utf8n_to_uvchr((U8*)locinput,
4642 UTF8_MAXBYTES, &len,
4643 uniflags) != (UV)ST.c1) {
4644 locinput += len;
4645 n++;
4646 }
1aa99e6b
IH
4647 }
4648 else {
c255a977
DM
4649 /* set n to utf8_distance(oldloc, locinput) */
4650 while (locinput <= ST.maxpos) {
4651 STRLEN len;
4652 const UV c = utf8n_to_uvchr((U8*)locinput,
4653 UTF8_MAXBYTES, &len,
4654 uniflags);
4655 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4656 break;
4657 locinput += len;
4658 n++;
1aa99e6b 4659 }
0fe9bf95
IZ
4660 }
4661 }
c255a977
DM
4662 else {
4663 if (ST.c1 == ST.c2) {
4664 while (locinput <= ST.maxpos &&
4665 UCHARAT(locinput) != ST.c1)
4666 locinput++;
bbce6d69 4667 }
c255a977
DM
4668 else {
4669 while (locinput <= ST.maxpos
4670 && UCHARAT(locinput) != ST.c1
4671 && UCHARAT(locinput) != ST.c2)
4672 locinput++;
a0ed51b3 4673 }
c255a977
DM
4674 n = locinput - ST.oldloc;
4675 }
4676 if (locinput > ST.maxpos)
4677 sayNO;
4678 /* PL_reginput == oldloc now */
4679 if (n) {
4680 ST.count += n;
e2e6a0f1 4681 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 4682 sayNO;
a0d0e21e 4683 }
c255a977
DM
4684 PL_reginput = locinput;
4685 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4686 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4687 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4688 goto fake_end;
4689 }
c255a977 4690 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4691 }
c255a977
DM
4692 /* NOTREACHED */
4693
4694
4695 case CURLY_B_min_fail:
4696 /* failed to find B in a non-greedy match where c1,c2 invalid */
4697 if (ST.paren && ST.count)
4698 PL_regendp[ST.paren] = -1;
4699
4700 REGCP_UNWIND(ST.cp);
4701 /* failed -- move forward one */
4702 PL_reginput = locinput;
e2e6a0f1 4703 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 4704 ST.count++;
a0ed51b3 4705 locinput = PL_reginput;
c255a977
DM
4706 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4707 ST.count > 0)) /* count overflow ? */
15272685 4708 {
c255a977
DM
4709 curly_try_B_min:
4710 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 4711 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4712 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4713 goto fake_end;
4714 }
c255a977 4715 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4716 }
4717 }
4633a7c4 4718 sayNO;
c255a977
DM
4719 /* NOTREACHED */
4720
4721
4722 curly_try_B_max:
4723 /* a successful greedy match: now try to match B */
40d049e4 4724 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4725 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
4726 goto fake_end;
4727 }
c255a977
DM
4728 {
4729 UV c = 0;
4730 if (ST.c1 != CHRTEST_VOID)
4731 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4732 UTF8_MAXBYTES, 0, uniflags)
466787eb 4733 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4734 /* If it could work, try it. */
4735 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4736 CURLY_SETPAREN(ST.paren, ST.count);
4737 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4738 /* NOTREACHED */
4739 }
4740 }
4741 /* FALL THROUGH */
4742 case CURLY_B_max_fail:
4743 /* failed to find B in a greedy match */
4744 if (ST.paren && ST.count)
4745 PL_regendp[ST.paren] = -1;
4746
4747 REGCP_UNWIND(ST.cp);
4748 /* back up. */
4749 if (--ST.count < ST.min)
4750 sayNO;
4751 PL_reginput = locinput = HOPc(locinput, -1);
4752 goto curly_try_B_max;
4753
4754#undef ST
4755
a0d0e21e 4756 case END:
6bda09f9 4757 fake_end:
faec1544
DM
4758 if (cur_eval) {
4759 /* we've just finished A in /(??{A})B/; now continue with B */
4760 I32 tmpix;
faec1544
DM
4761 st->u.eval.toggle_reg_flags
4762 = cur_eval->u.eval.toggle_reg_flags;
4763 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4764
4765 st->u.eval.prev_rex = rex; /* inner */
28d8d7f4 4766 SETREX(rex,cur_eval->u.eval.prev_rex);
f8fc2ecf 4767 rexi = RXi_GET(rex);
faec1544
DM
4768 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4769 ReREFCNT_inc(rex);
4770 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4771 REGCP_SET(st->u.eval.lastcp);
4772 PL_reginput = locinput;
4773
4774 /* Restore parens of the outer rex without popping the
4775 * savestack */
4776 tmpix = PL_savestack_ix;
4777 PL_savestack_ix = cur_eval->u.eval.lastcp;
4778 regcppop(rex);
4779 PL_savestack_ix = tmpix;
4780
4781 st->u.eval.prev_eval = cur_eval;
4782 cur_eval = cur_eval->u.eval.prev_eval;
4783 DEBUG_EXECUTE_r(
2a49f0f5
JH
4784 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4785 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
4786 if ( nochange_depth )
4787 nochange_depth--;
4788
4789 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
4790 st->u.eval.prev_eval->u.eval.B); /* match B */
4791 }
4792
3b0527fe 4793 if (locinput < reginfo->till) {
a3621e74 4794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4795 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4796 PL_colors[4],
4797 (long)(locinput - PL_reg_starttry),
3b0527fe 4798 (long)(reginfo->till - PL_reg_starttry),
7821416a 4799 PL_colors[5]));
58e23c8d 4800
262b90c4 4801 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4802 }
4803 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4804 sayYES; /* Success! */
dad79028
DM
4805
4806 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4807 DEBUG_EXECUTE_r(
4808 PerlIO_printf(Perl_debug_log,
4809 "%*s %ssubpattern success...%s\n",
5bc10b2c 4810 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4811 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4812 sayYES; /* Success! */
dad79028 4813
40a82448
DM
4814#undef ST
4815#define ST st->u.ifmatch
4816
4817 case SUSPEND: /* (?>A) */
4818 ST.wanted = 1;
9fe1d20c 4819 PL_reginput = locinput;
9041c2e3 4820 goto do_ifmatch;
dad79028 4821
40a82448
DM
4822 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4823 ST.wanted = 0;
dad79028
DM
4824 goto ifmatch_trivial_fail_test;
4825
40a82448
DM
4826 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4827 ST.wanted = 1;
dad79028 4828 ifmatch_trivial_fail_test:
a0ed51b3 4829 if (scan->flags) {
52657f30 4830 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4831 if (!s) {
4832 /* trivial fail */
24d3c4a9
DM
4833 if (logical) {
4834 logical = 0;
4835 sw = 1 - (bool)ST.wanted;
dad79028 4836 }
40a82448 4837 else if (ST.wanted)
dad79028
DM
4838 sayNO;
4839 next = scan + ARG(scan);
4840 if (next == scan)
4841 next = NULL;
4842 break;
4843 }
efb30f32 4844 PL_reginput = s;
a0ed51b3
LW
4845 }
4846 else
4847 PL_reginput = locinput;
4848
c277df42 4849 do_ifmatch:
40a82448 4850 ST.me = scan;
24d3c4a9 4851 ST.logical = logical;
40a82448
DM
4852 /* execute body of (?...A) */
4853 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4854 /* NOTREACHED */
4855
4856 case IFMATCH_A_fail: /* body of (?...A) failed */
4857 ST.wanted = !ST.wanted;
4858 /* FALL THROUGH */
4859
4860 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4861 if (ST.logical) {
4862 sw = (bool)ST.wanted;
40a82448
DM
4863 }
4864 else if (!ST.wanted)
4865 sayNO;
4866
4867 if (OP(ST.me) == SUSPEND)
4868 locinput = PL_reginput;
4869 else {
4870 locinput = PL_reginput = st->locinput;
4871 nextchr = UCHARAT(locinput);
4872 }
4873 scan = ST.me + ARG(ST.me);
4874 if (scan == ST.me)
4875 scan = NULL;
4876 continue; /* execute B */
4877
4878#undef ST
dad79028 4879
c277df42 4880 case LONGJMP:
c277df42
IZ
4881 next = scan + ARG(scan);
4882 if (next == scan)
4883 next = NULL;
a0d0e21e 4884 break;
54612592 4885 case COMMIT:
e2e6a0f1
YO
4886 reginfo->cutpoint = PL_regeol;
4887 /* FALLTHROUGH */
5d458dd8 4888 case PRUNE:
24b23f37 4889 PL_reginput = locinput;
e2e6a0f1 4890 if (!scan->flags)
f8fc2ecf 4891 sv_yes_mark = sv_commit = (SV*)rexi->data->data[ ARG( scan ) ];
54612592
YO
4892 PUSH_STATE_GOTO(COMMIT_next,next);
4893 /* NOTREACHED */
4894 case COMMIT_next_fail:
4895 no_final = 1;
4896 /* FALLTHROUGH */
7f69552c
YO
4897 case OPFAIL:
4898 sayNO;
e2e6a0f1
YO
4899 /* NOTREACHED */
4900
4901#define ST st->u.mark
4902 case MARKPOINT:
4903 ST.prev_mark = mark_state;
5d458dd8 4904 ST.mark_name = sv_commit = sv_yes_mark
f8fc2ecf 4905 = (SV*)rexi->data->data[ ARG( scan ) ];
e2e6a0f1
YO
4906 mark_state = st;
4907 ST.mark_loc = PL_reginput = locinput;
4908 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
4909 /* NOTREACHED */
4910 case MARKPOINT_next:
4911 mark_state = ST.prev_mark;
4912 sayYES;
4913 /* NOTREACHED */
4914 case MARKPOINT_next_fail:
5d458dd8 4915 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
4916 {
4917 if (ST.mark_loc > startpoint)
4918 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
4919 popmark = NULL; /* we found our mark */
4920 sv_commit = ST.mark_name;
4921
4922 DEBUG_EXECUTE_r({
5d458dd8 4923 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
4924 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
4925 REPORT_CODE_OFF+depth*2, "",
be2597df 4926 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
4927 });
4928 }
4929 mark_state = ST.prev_mark;
5d458dd8
YO
4930 sv_yes_mark = mark_state ?
4931 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
4932 sayNO;
4933 /* NOTREACHED */
5d458dd8
YO
4934 case SKIP:
4935 PL_reginput = locinput;
4936 if (scan->flags) {
2bf803e2 4937 /* (*SKIP) : if we fail we cut here*/
5d458dd8 4938 ST.mark_name = NULL;
e2e6a0f1 4939 ST.mark_loc = locinput;
5d458dd8
YO
4940 PUSH_STATE_GOTO(SKIP_next,next);
4941 } else {
2bf803e2 4942 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
4943 otherwise do nothing. Meaning we need to scan
4944 */
4945 regmatch_state *cur = mark_state;
f8fc2ecf 4946 SV *find = (SV*)rexi->data->data[ ARG( scan ) ];
5d458dd8
YO
4947
4948 while (cur) {
4949 if ( sv_eq( cur->u.mark.mark_name,
4950 find ) )
4951 {
4952 ST.mark_name = find;
4953 PUSH_STATE_GOTO( SKIP_next, next );
4954 }
4955 cur = cur->u.mark.prev_mark;
4956 }
e2e6a0f1 4957 }
2bf803e2 4958 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
4959 break;
4960 case SKIP_next_fail:
4961 if (ST.mark_name) {
4962 /* (*CUT:NAME) - Set up to search for the name as we
4963 collapse the stack*/
4964 popmark = ST.mark_name;
4965 } else {
4966 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
4967 if (ST.mark_loc > startpoint)
4968 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
4969 /* but we set sv_commit to latest mark_name if there
4970 is one so they can test to see how things lead to this
4971 cut */
4972 if (mark_state)
4973 sv_commit=mark_state->u.mark.mark_name;
4974 }
e2e6a0f1
YO
4975 no_final = 1;
4976 sayNO;
4977 /* NOTREACHED */
4978#undef ST
4979
a0d0e21e 4980 default:
b900a521 4981 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4982 PTR2UV(scan), OP(scan));
cea2e8a9 4983 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
4984
4985 } /* end switch */
95b24440 4986
5d458dd8
YO
4987 /* switch break jumps here */
4988 scan = next; /* prepare to execute the next op and ... */
4989 continue; /* ... jump back to the top, reusing st */
95b24440
DM
4990 /* NOTREACHED */
4991
40a82448
DM
4992 push_yes_state:
4993 /* push a state that backtracks on success */
4994 st->u.yes.prev_yes_state = yes_state;
4995 yes_state = st;
4996 /* FALL THROUGH */
4997 push_state:
4998 /* push a new regex state, then continue at scan */
4999 {
5000 regmatch_state *newst;
5001
24b23f37
YO
5002 DEBUG_STACK_r({
5003 regmatch_state *cur = st;
5004 regmatch_state *curyes = yes_state;
5005 int curd = depth;
5006 regmatch_slab *slab = PL_regmatch_slab;
5007 for (;curd > -1;cur--,curd--) {
5008 if (cur < SLAB_FIRST(slab)) {
5009 slab = slab->prev;
5010 cur = SLAB_LAST(slab);
5011 }
5012 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5013 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5014 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5015 (curyes == cur) ? "yes" : ""
5016 );
5017 if (curyes == cur)
5018 curyes = cur->u.yes.prev_yes_state;
5019 }
5020 } else
5021 DEBUG_STATE_pp("push")
5022 );
40a82448 5023 depth++;
40a82448
DM
5024 st->locinput = locinput;
5025 newst = st+1;
5026 if (newst > SLAB_LAST(PL_regmatch_slab))
5027 newst = S_push_slab(aTHX);
5028 PL_regmatch_state = newst;
786e8c11 5029
40a82448
DM
5030 locinput = PL_reginput;
5031 nextchr = UCHARAT(locinput);
5032 st = newst;
5033 continue;
5034 /* NOTREACHED */
5035 }
a0d0e21e 5036 }
a687059c 5037
a0d0e21e
LW
5038 /*
5039 * We get here only if there's trouble -- normally "case END" is
5040 * the terminating point.
5041 */
cea2e8a9 5042 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5043 /*NOTREACHED*/
4633a7c4
LW
5044 sayNO;
5045
262b90c4 5046yes:
77cb431f
DM
5047 if (yes_state) {
5048 /* we have successfully completed a subexpression, but we must now
5049 * pop to the state marked by yes_state and continue from there */
77cb431f 5050 assert(st != yes_state);
5bc10b2c
DM
5051#ifdef DEBUGGING
5052 while (st != yes_state) {
5053 st--;
5054 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5055 PL_regmatch_slab = PL_regmatch_slab->prev;
5056 st = SLAB_LAST(PL_regmatch_slab);
5057 }
e2e6a0f1 5058 DEBUG_STATE_r({
54612592
YO
5059 if (no_final) {
5060 DEBUG_STATE_pp("pop (no final)");
5061 } else {
5062 DEBUG_STATE_pp("pop (yes)");
5063 }
e2e6a0f1 5064 });
5bc10b2c
DM
5065 depth--;
5066 }
5067#else
77cb431f
DM
5068 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5069 || yes_state > SLAB_LAST(PL_regmatch_slab))
5070 {
5071 /* not in this slab, pop slab */
5072 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5073 PL_regmatch_slab = PL_regmatch_slab->prev;
5074 st = SLAB_LAST(PL_regmatch_slab);
5075 }
5076 depth -= (st - yes_state);
5bc10b2c 5077#endif
77cb431f
DM
5078 st = yes_state;
5079 yes_state = st->u.yes.prev_yes_state;
5080 PL_regmatch_state = st;
24b23f37 5081
5d458dd8
YO
5082 if (no_final) {
5083 locinput= st->locinput;
5084 nextchr = UCHARAT(locinput);
5085 }
54612592 5086 state_num = st->resume_state + no_final;
24d3c4a9 5087 goto reenter_switch;
77cb431f
DM
5088 }
5089
a3621e74 5090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5091 PL_colors[4], PL_colors[5]));
02db2b7b 5092
95b24440 5093 result = 1;
aa283a38 5094 goto final_exit;
4633a7c4
LW
5095
5096no:
a3621e74 5097 DEBUG_EXECUTE_r(
7821416a 5098 PerlIO_printf(Perl_debug_log,
786e8c11 5099 "%*s %sfailed...%s\n",
5bc10b2c 5100 REPORT_CODE_OFF+depth*2, "",
786e8c11 5101 PL_colors[4], PL_colors[5])
7821416a 5102 );
aa283a38 5103
262b90c4 5104no_silent:
54612592
YO
5105 if (no_final) {
5106 if (yes_state) {
5107 goto yes;
5108 } else {
5109 goto final_exit;
5110 }
5111 }
aa283a38
DM
5112 if (depth) {
5113 /* there's a previous state to backtrack to */
40a82448
DM
5114 st--;
5115 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5116 PL_regmatch_slab = PL_regmatch_slab->prev;
5117 st = SLAB_LAST(PL_regmatch_slab);
5118 }
5119 PL_regmatch_state = st;
40a82448
DM
5120 locinput= st->locinput;
5121 nextchr = UCHARAT(locinput);
5122
5bc10b2c
DM
5123 DEBUG_STATE_pp("pop");
5124 depth--;
262b90c4
DM
5125 if (yes_state == st)
5126 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5127
24d3c4a9
DM
5128 state_num = st->resume_state + 1; /* failure = success + 1 */
5129 goto reenter_switch;
95b24440 5130 }
24d3c4a9 5131 result = 0;
aa283a38 5132
262b90c4 5133 final_exit:
bbe252da 5134 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5135 SV *sv_err = get_sv("REGERROR", 1);
5136 SV *sv_mrk = get_sv("REGMARK", 1);
5137 if (result) {
e2e6a0f1 5138 sv_commit = &PL_sv_no;
5d458dd8
YO
5139 if (!sv_yes_mark)
5140 sv_yes_mark = &PL_sv_yes;
5141 } else {
5142 if (!sv_commit)
5143 sv_commit = &PL_sv_yes;
5144 sv_yes_mark = &PL_sv_no;
5145 }
5146 sv_setsv(sv_err, sv_commit);
5147 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5148 }
5d9a96ca
DM
5149 /* restore original high-water mark */
5150 PL_regmatch_slab = orig_slab;
5151 PL_regmatch_state = orig_state;
5152
5153 /* free all slabs above current one */
5154 if (orig_slab->next) {
c4fd8992 5155 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
5156 orig_slab->next = NULL;
5157 while (sl) {
c4fd8992 5158 regmatch_slab * const osl = sl;
5d9a96ca 5159 sl = sl->next;
ad65c075 5160 Safefree(osl);
5d9a96ca
DM
5161 }
5162 }
5163
95b24440 5164 return result;
a687059c
LW
5165}
5166
5167/*
5168 - regrepeat - repeatedly match something simple, report how many
5169 */
5170/*
5171 * [This routine now assumes that it will only match on things of length 1.
5172 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5173 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5174 */
76e3520e 5175STATIC I32
e2e6a0f1 5176S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5177{
27da23d5 5178 dVAR;
a0d0e21e 5179 register char *scan;
a0d0e21e 5180 register I32 c;
3280af22 5181 register char *loceol = PL_regeol;
a0ed51b3 5182 register I32 hardcount = 0;
53c4c00c 5183 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5184#ifndef DEBUGGING
5185 PERL_UNUSED_ARG(depth);
5186#endif
a0d0e21e 5187
3280af22 5188 scan = PL_reginput;
faf11cac
HS
5189 if (max == REG_INFTY)
5190 max = I32_MAX;
5191 else if (max < loceol - scan)
7f596f4c 5192 loceol = scan + max;
a0d0e21e 5193 switch (OP(p)) {
22c35a8c 5194 case REG_ANY:
1aa99e6b 5195 if (do_utf8) {
ffc61ed2 5196 loceol = PL_regeol;
1aa99e6b 5197 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5198 scan += UTF8SKIP(scan);
5199 hardcount++;
5200 }
5201 } else {
5202 while (scan < loceol && *scan != '\n')
5203 scan++;
a0ed51b3
LW
5204 }
5205 break;
ffc61ed2 5206 case SANY:
def8e4ea
JH
5207 if (do_utf8) {
5208 loceol = PL_regeol;
a0804c9e 5209 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5210 scan += UTF8SKIP(scan);
5211 hardcount++;
5212 }
5213 }
5214 else
5215 scan = loceol;
a0ed51b3 5216 break;
f33976b4
DB
5217 case CANY:
5218 scan = loceol;
5219 break;
090f7165
JH
5220 case EXACT: /* length of string is 1 */
5221 c = (U8)*STRING(p);
5222 while (scan < loceol && UCHARAT(scan) == c)
5223 scan++;
bbce6d69 5224 break;
5225 case EXACTF: /* length of string is 1 */
cd439c50 5226 c = (U8)*STRING(p);
bbce6d69 5227 while (scan < loceol &&
22c35a8c 5228 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5229 scan++;
5230 break;
5231 case EXACTFL: /* length of string is 1 */
3280af22 5232 PL_reg_flags |= RF_tainted;
cd439c50 5233 c = (U8)*STRING(p);
bbce6d69 5234 while (scan < loceol &&
22c35a8c 5235 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5236 scan++;
5237 break;
5238 case ANYOF:
ffc61ed2
JH
5239 if (do_utf8) {
5240 loceol = PL_regeol;
cfc92286 5241 while (hardcount < max && scan < loceol &&
32fc9b6a 5242 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5243 scan += UTF8SKIP(scan);
5244 hardcount++;
5245 }
5246 } else {
32fc9b6a 5247 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5248 scan++;
5249 }
a0d0e21e
LW
5250 break;
5251 case ALNUM:
1aa99e6b 5252 if (do_utf8) {
ffc61ed2 5253 loceol = PL_regeol;
1a4fad37 5254 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5255 while (hardcount < max && scan < loceol &&
3568d838 5256 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5257 scan += UTF8SKIP(scan);
5258 hardcount++;
5259 }
5260 } else {
5261 while (scan < loceol && isALNUM(*scan))
5262 scan++;
a0ed51b3
LW
5263 }
5264 break;
bbce6d69 5265 case ALNUML:
3280af22 5266 PL_reg_flags |= RF_tainted;
1aa99e6b 5267 if (do_utf8) {
ffc61ed2 5268 loceol = PL_regeol;
1aa99e6b
IH
5269 while (hardcount < max && scan < loceol &&
5270 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5271 scan += UTF8SKIP(scan);
5272 hardcount++;
5273 }
5274 } else {
5275 while (scan < loceol && isALNUM_LC(*scan))
5276 scan++;
a0ed51b3
LW
5277 }
5278 break;
a0d0e21e 5279 case NALNUM:
1aa99e6b 5280 if (do_utf8) {
ffc61ed2 5281 loceol = PL_regeol;
1a4fad37 5282 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5283 while (hardcount < max && scan < loceol &&
3568d838 5284 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5285 scan += UTF8SKIP(scan);
5286 hardcount++;
5287 }
5288 } else {
5289 while (scan < loceol && !isALNUM(*scan))
5290 scan++;
a0ed51b3
LW
5291 }
5292 break;
bbce6d69 5293 case NALNUML:
3280af22 5294 PL_reg_flags |= RF_tainted;
1aa99e6b 5295 if (do_utf8) {
ffc61ed2 5296 loceol = PL_regeol;
1aa99e6b
IH
5297 while (hardcount < max && scan < loceol &&
5298 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5299 scan += UTF8SKIP(scan);
5300 hardcount++;
5301 }
5302 } else {
5303 while (scan < loceol && !isALNUM_LC(*scan))
5304 scan++;
a0ed51b3
LW
5305 }
5306 break;
a0d0e21e 5307 case SPACE:
1aa99e6b 5308 if (do_utf8) {
ffc61ed2 5309 loceol = PL_regeol;
1a4fad37 5310 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5311 while (hardcount < max && scan < loceol &&
3568d838
JH
5312 (*scan == ' ' ||
5313 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5314 scan += UTF8SKIP(scan);
5315 hardcount++;
5316 }
5317 } else {
5318 while (scan < loceol && isSPACE(*scan))
5319 scan++;
a0ed51b3
LW
5320 }
5321 break;
bbce6d69 5322 case SPACEL:
3280af22 5323 PL_reg_flags |= RF_tainted;
1aa99e6b 5324 if (do_utf8) {
ffc61ed2 5325 loceol = PL_regeol;
1aa99e6b 5326 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5327 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5328 scan += UTF8SKIP(scan);
5329 hardcount++;
5330 }
5331 } else {
5332 while (scan < loceol && isSPACE_LC(*scan))
5333 scan++;
a0ed51b3
LW
5334 }
5335 break;
a0d0e21e 5336 case NSPACE:
1aa99e6b 5337 if (do_utf8) {
ffc61ed2 5338 loceol = PL_regeol;
1a4fad37 5339 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5340 while (hardcount < max && scan < loceol &&
3568d838
JH
5341 !(*scan == ' ' ||
5342 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5343 scan += UTF8SKIP(scan);
5344 hardcount++;
5345 }
5346 } else {
5347 while (scan < loceol && !isSPACE(*scan))
5348 scan++;
5349 break;
a0ed51b3 5350 }
bbce6d69 5351 case NSPACEL:
3280af22 5352 PL_reg_flags |= RF_tainted;
1aa99e6b 5353 if (do_utf8) {
ffc61ed2 5354 loceol = PL_regeol;
1aa99e6b 5355 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5356 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5357 scan += UTF8SKIP(scan);
5358 hardcount++;
5359 }
5360 } else {
5361 while (scan < loceol && !isSPACE_LC(*scan))
5362 scan++;
a0ed51b3
LW
5363 }
5364 break;
a0d0e21e 5365 case DIGIT:
1aa99e6b 5366 if (do_utf8) {
ffc61ed2 5367 loceol = PL_regeol;
1a4fad37 5368 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5369 while (hardcount < max && scan < loceol &&
3568d838 5370 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5371 scan += UTF8SKIP(scan);
5372 hardcount++;
5373 }
5374 } else {
5375 while (scan < loceol && isDIGIT(*scan))
5376 scan++;
a0ed51b3
LW
5377 }
5378 break;
a0d0e21e 5379 case NDIGIT:
1aa99e6b 5380 if (do_utf8) {
ffc61ed2 5381 loceol = PL_regeol;
1a4fad37 5382 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5383 while (hardcount < max && scan < loceol &&
3568d838 5384 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5385 scan += UTF8SKIP(scan);
5386 hardcount++;
5387 }
5388 } else {
5389 while (scan < loceol && !isDIGIT(*scan))
5390 scan++;
a0ed51b3
LW
5391 }
5392 break;
a0d0e21e
LW
5393 default: /* Called on something of 0 width. */
5394 break; /* So match right here or not at all. */
5395 }
a687059c 5396
a0ed51b3
LW
5397 if (hardcount)
5398 c = hardcount;
5399 else
5400 c = scan - PL_reginput;
3280af22 5401 PL_reginput = scan;
a687059c 5402
a3621e74 5403 DEBUG_r({
e68ec53f 5404 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5405 DEBUG_EXECUTE_r({
e68ec53f
YO
5406 SV * const prop = sv_newmortal();
5407 regprop(prog, prop, p);
5408 PerlIO_printf(Perl_debug_log,
be8e71aa 5409 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5410 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5411 });
be8e71aa 5412 });
9041c2e3 5413
a0d0e21e 5414 return(c);
a687059c
LW
5415}
5416
c277df42 5417
be8e71aa 5418#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5419/*
ffc61ed2
JH
5420- regclass_swash - prepare the utf8 swash
5421*/
5422
5423SV *
32fc9b6a 5424Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5425{
97aff369 5426 dVAR;
9e55ce06
JH
5427 SV *sw = NULL;
5428 SV *si = NULL;
5429 SV *alt = NULL;
f8fc2ecf
YO
5430 RXi_GET_DECL(prog,progi);
5431 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5432
4f639d21 5433 if (data && data->count) {
a3b680e6 5434 const U32 n = ARG(node);
ffc61ed2 5435
4f639d21
DM
5436 if (data->what[n] == 's') {
5437 SV * const rv = (SV*)data->data[n];
890ce7af 5438 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5439 SV **const ary = AvARRAY(av);
9e55ce06 5440 SV **a, **b;
9041c2e3 5441
711a919c 5442 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5443 * documentation of these array elements. */
5444
b11f357e 5445 si = *ary;
8f7f7219 5446 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5447 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5448
ffc61ed2
JH
5449 if (a)
5450 sw = *a;
5451 else if (si && doinit) {
5452 sw = swash_init("utf8", "", si, 1, 0);
5453 (void)av_store(av, 1, sw);
5454 }
9e55ce06
JH
5455 if (b)
5456 alt = *b;
ffc61ed2
JH
5457 }
5458 }
5459
9e55ce06
JH
5460 if (listsvp)
5461 *listsvp = si;
5462 if (altsvp)
5463 *altsvp = alt;
ffc61ed2
JH
5464
5465 return sw;
5466}
76234dfb 5467#endif
ffc61ed2
JH
5468
5469/*
ba7b4546 5470 - reginclass - determine if a character falls into a character class
832705d4
JH
5471
5472 The n is the ANYOF regnode, the p is the target string, lenp
5473 is pointer to the maximum length of how far to go in the p
5474 (if the lenp is zero, UTF8SKIP(p) is used),
5475 do_utf8 tells whether the target string is in UTF-8.
5476
bbce6d69 5477 */
5478
76e3520e 5479STATIC bool
32fc9b6a 5480S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5481{
27da23d5 5482 dVAR;
a3b680e6 5483 const char flags = ANYOF_FLAGS(n);
bbce6d69 5484 bool match = FALSE;
cc07378b 5485 UV c = *p;
ae9ddab8 5486 STRLEN len = 0;
9e55ce06 5487 STRLEN plen;
1aa99e6b 5488
19f67299
TS
5489 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5490 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5491 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5492 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5493 if (len == (STRLEN)-1)
5494 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5495 }
bbce6d69 5496
0f0076b4 5497 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5498 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5499 if (lenp)
5500 *lenp = 0;
ffc61ed2 5501 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5502 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5503 match = TRUE;
bbce6d69 5504 }
3568d838 5505 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5506 match = TRUE;
ffc61ed2 5507 if (!match) {
9e55ce06 5508 AV *av;
32fc9b6a 5509 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5510
5511 if (sw) {
3568d838 5512 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5513 match = TRUE;
5514 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5515 if (!match && lenp && av) {
5516 I32 i;
9e55ce06 5517 for (i = 0; i <= av_len(av); i++) {
890ce7af 5518 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5519 STRLEN len;
890ce7af 5520 const char * const s = SvPV_const(sv, len);
9e55ce06 5521
061b10df 5522 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5523 *lenp = len;
5524 match = TRUE;
5525 break;
5526 }
5527 }
5528 }
5529 if (!match) {
89ebb4a3 5530 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5531 STRLEN tmplen;
5532
9e55ce06
JH
5533 to_utf8_fold(p, tmpbuf, &tmplen);
5534 if (swash_fetch(sw, tmpbuf, do_utf8))
5535 match = TRUE;
5536 }
ffc61ed2
JH
5537 }
5538 }
bbce6d69 5539 }
9e55ce06 5540 if (match && lenp && *lenp == 0)
0f0076b4 5541 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5542 }
1aa99e6b 5543 if (!match && c < 256) {
ffc61ed2
JH
5544 if (ANYOF_BITMAP_TEST(n, c))
5545 match = TRUE;
5546 else if (flags & ANYOF_FOLD) {
eb160463 5547 U8 f;
a0ed51b3 5548
ffc61ed2
JH
5549 if (flags & ANYOF_LOCALE) {
5550 PL_reg_flags |= RF_tainted;
5551 f = PL_fold_locale[c];
5552 }
5553 else
5554 f = PL_fold[c];
5555 if (f != c && ANYOF_BITMAP_TEST(n, f))
5556 match = TRUE;
5557 }
5558
5559 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5560 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5561 if (
5562 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5563 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5564 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5565 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5566 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5567 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5568 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5569 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5570 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5571 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5572 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5573 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5574 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5575 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5576 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5577 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5578 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5579 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5580 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5581 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5582 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5583 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5584 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5585 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5586 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5587 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5588 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5589 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5590 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5591 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5592 ) /* How's that for a conditional? */
5593 {
5594 match = TRUE;
5595 }
a0ed51b3 5596 }
a0ed51b3
LW
5597 }
5598
a0ed51b3
LW
5599 return (flags & ANYOF_INVERT) ? !match : match;
5600}
161b471a 5601
dfe13c55 5602STATIC U8 *
0ce71af7 5603S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5604{
97aff369 5605 dVAR;
a0ed51b3 5606 if (off >= 0) {
1aa99e6b 5607 while (off-- && s < lim) {
ffc61ed2 5608 /* XXX could check well-formedness here */
a0ed51b3 5609 s += UTF8SKIP(s);
ffc61ed2 5610 }
a0ed51b3
LW
5611 }
5612 else {
1de06328
YO
5613 while (off++ && s > lim) {
5614 s--;
5615 if (UTF8_IS_CONTINUED(*s)) {
5616 while (s > lim && UTF8_IS_CONTINUATION(*s))
5617 s--;
a0ed51b3 5618 }
1de06328 5619 /* XXX could check well-formedness here */
a0ed51b3
LW
5620 }
5621 }
5622 return s;
5623}
161b471a 5624
f9f4320a
YO
5625#ifdef XXX_dmq
5626/* there are a bunch of places where we use two reghop3's that should
5627 be replaced with this routine. but since thats not done yet
5628 we ifdef it out - dmq
5629*/
dfe13c55 5630STATIC U8 *
1de06328
YO
5631S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5632{
5633 dVAR;
5634 if (off >= 0) {
5635 while (off-- && s < rlim) {
5636 /* XXX could check well-formedness here */
5637 s += UTF8SKIP(s);
5638 }
5639 }
5640 else {
5641 while (off++ && s > llim) {
5642 s--;
5643 if (UTF8_IS_CONTINUED(*s)) {
5644 while (s > llim && UTF8_IS_CONTINUATION(*s))
5645 s--;
5646 }
5647 /* XXX could check well-formedness here */
5648 }
5649 }
5650 return s;
5651}
f9f4320a 5652#endif
1de06328
YO
5653
5654STATIC U8 *
0ce71af7 5655S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5656{
97aff369 5657 dVAR;
a0ed51b3 5658 if (off >= 0) {
1aa99e6b 5659 while (off-- && s < lim) {
ffc61ed2 5660 /* XXX could check well-formedness here */
a0ed51b3 5661 s += UTF8SKIP(s);
ffc61ed2 5662 }
a0ed51b3 5663 if (off >= 0)
3dab1dad 5664 return NULL;
a0ed51b3
LW
5665 }
5666 else {
1de06328
YO
5667 while (off++ && s > lim) {
5668 s--;
5669 if (UTF8_IS_CONTINUED(*s)) {
5670 while (s > lim && UTF8_IS_CONTINUATION(*s))
5671 s--;
a0ed51b3 5672 }
1de06328 5673 /* XXX could check well-formedness here */
a0ed51b3
LW
5674 }
5675 if (off <= 0)
3dab1dad 5676 return NULL;
a0ed51b3
LW
5677 }
5678 return s;
5679}
51371543 5680
51371543 5681static void
acfe0abc 5682restore_pos(pTHX_ void *arg)
51371543 5683{
97aff369 5684 dVAR;
097eb12c 5685 regexp * const rex = (regexp *)arg;
51371543
GS
5686 if (PL_reg_eval_set) {
5687 if (PL_reg_oldsaved) {
4f639d21
DM
5688 rex->subbeg = PL_reg_oldsaved;
5689 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5690#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5691 rex->saved_copy = PL_nrs;
ed252734 5692#endif
4f639d21 5693 RX_MATCH_COPIED_on(rex);
51371543
GS
5694 }
5695 PL_reg_magic->mg_len = PL_reg_oldpos;
5696 PL_reg_eval_set = 0;
5697 PL_curpm = PL_reg_oldcurpm;
5698 }
5699}
33b8afdf
JH
5700
5701STATIC void
5702S_to_utf8_substr(pTHX_ register regexp *prog)
5703{
a1cac82e
NC
5704 int i = 1;
5705 do {
5706 if (prog->substrs->data[i].substr
5707 && !prog->substrs->data[i].utf8_substr) {
5708 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5709 prog->substrs->data[i].utf8_substr = sv;
5710 sv_utf8_upgrade(sv);
610460f9
NC
5711 if (SvVALID(prog->substrs->data[i].substr)) {
5712 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5713 if (flags & FBMcf_TAIL) {
5714 /* Trim the trailing \n that fbm_compile added last
5715 time. */
5716 SvCUR_set(sv, SvCUR(sv) - 1);
5717 /* Whilst this makes the SV technically "invalid" (as its
5718 buffer is no longer followed by "\0") when fbm_compile()
5719 adds the "\n" back, a "\0" is restored. */
5720 }
5721 fbm_compile(sv, flags);
5722 }
a1cac82e
NC
5723 if (prog->substrs->data[i].substr == prog->check_substr)
5724 prog->check_utf8 = sv;
5725 }
5726 } while (i--);
33b8afdf
JH
5727}
5728
5729STATIC void
5730S_to_byte_substr(pTHX_ register regexp *prog)
5731{
97aff369 5732 dVAR;
a1cac82e
NC
5733 int i = 1;
5734 do {
5735 if (prog->substrs->data[i].utf8_substr
5736 && !prog->substrs->data[i].substr) {
5737 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
5738 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
5739 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
5740 const U8 flags
5741 = BmFLAGS(prog->substrs->data[i].utf8_substr);
5742 if (flags & FBMcf_TAIL) {
5743 /* Trim the trailing \n that fbm_compile added last
5744 time. */
5745 SvCUR_set(sv, SvCUR(sv) - 1);
5746 }
5747 fbm_compile(sv, flags);
5748 }
a1cac82e
NC
5749 } else {
5750 SvREFCNT_dec(sv);
5751 sv = &PL_sv_undef;
5752 }
5753 prog->substrs->data[i].substr = sv;
5754 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
5755 prog->check_substr = sv;
33b8afdf 5756 }
a1cac82e 5757 } while (i--);
33b8afdf 5758}
66610fdd
RGS
5759
5760/*
5761 * Local variables:
5762 * c-indentation-style: bsd
5763 * c-basic-offset: 4
5764 * indent-tabs-mode: t
5765 * End:
5766 *
37442d52
RGS
5767 * ex: set ts=8 sts=4 sw=4 noet:
5768 */