This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
fix regression introduced in #27778: must backtrack into inner regex
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
ab3bbdeb 200 PerlIO_printf(Perl_debug_log, \
e4f74956 201 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
202 (IV)PL_savestack_ix)); \
203 cp = PL_savestack_ix
c3464db5 204
ab3bbdeb 205#define REGCP_UNWIND(cp) \
e4f74956 206 DEBUG_STATE_r( \
ab3bbdeb 207 if (cp != PL_savestack_ix) \
e4f74956
YO
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
210 (IV)(cp), (IV)PL_savestack_ix)); \
211 regcpblow(cp)
c277df42 212
76e3520e 213STATIC char *
097eb12c 214S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 215{
97aff369 216 dVAR;
b1ce53c5 217 I32 i;
a0d0e21e 218 char *input;
b1ce53c5 219
a3621e74
YO
220 GET_RE_DEBUG_FLAGS_DECL;
221
b1ce53c5 222 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 223 i = SSPOPINT;
b1ce53c5
JH
224 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
225 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 226 input = (char *) SSPOPPTR;
a01268b5 227 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
228 *PL_reglastparen = SSPOPINT;
229 PL_regsize = SSPOPINT;
b1ce53c5
JH
230
231 /* Now restore the parentheses context. */
41123dfd
JH
232 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
233 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 234 I32 tmps;
097eb12c 235 U32 paren = (U32)SSPOPINT;
3280af22 236 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
237 PL_regstartp[paren] = SSPOPINT;
238 tmps = SSPOPINT;
3280af22
NIS
239 if (paren <= *PL_reglastparen)
240 PL_regendp[paren] = tmps;
a3621e74 241 DEBUG_EXECUTE_r(
c3464db5 242 PerlIO_printf(Perl_debug_log,
b900a521 243 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 244 (UV)paren, (IV)PL_regstartp[paren],
b900a521 245 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 246 (IV)PL_regendp[paren],
3280af22 247 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 248 );
a0d0e21e 249 }
a3621e74 250 DEBUG_EXECUTE_r(
bb7a0f54 251 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 252 PerlIO_printf(Perl_debug_log,
faccc32b 253 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 254 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
255 }
256 );
daf18116 257#if 1
dafc8851
JH
258 /* It would seem that the similar code in regtry()
259 * already takes care of this, and in fact it is in
260 * a better location to since this code can #if 0-ed out
261 * but the code in regtry() is needed or otherwise tests
262 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
263 * (as of patchlevel 7877) will fail. Then again,
264 * this code seems to be necessary or otherwise
265 * building DynaLoader will fail:
266 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
267 * --jhi */
bb7a0f54 268 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
269 if (i > PL_regsize)
270 PL_regstartp[i] = -1;
271 PL_regendp[i] = -1;
a0d0e21e 272 }
dafc8851 273#endif
a0d0e21e
LW
274 return input;
275}
276
02db2b7b 277#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 278
a687059c 279/*
e50aee73 280 * pregexec and friends
a687059c
LW
281 */
282
76234dfb 283#ifndef PERL_IN_XSUB_RE
a687059c 284/*
c277df42 285 - pregexec - match a regexp against a string
a687059c 286 */
c277df42 287I32
864dbfa3 288Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 289 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
290/* strend: pointer to null at end of string */
291/* strbeg: real beginning of string */
292/* minend: end of match must be >=minend after stringarg. */
293/* nosave: For optimizations. */
294{
295 return
9041c2e3 296 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
297 nosave ? 0 : REXEC_COPY_STR);
298}
76234dfb 299#endif
22e551b9 300
9041c2e3 301/*
cad2e5aa
JH
302 * Need to implement the following flags for reg_anch:
303 *
304 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
305 * USE_INTUIT_ML
306 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
307 * INTUIT_AUTORITATIVE_ML
308 * INTUIT_ONCE_NOML - Intuit can match in one location only.
309 * INTUIT_ONCE_ML
310 *
311 * Another flag for this function: SECOND_TIME (so that float substrs
312 * with giant delta may be not rechecked).
313 */
314
315/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
316
3f7c398e 317/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
318 Otherwise, only SvCUR(sv) is used to get strbeg. */
319
320/* XXXX We assume that strpos is strbeg unless sv. */
321
6eb5f6b9
JH
322/* XXXX Some places assume that there is a fixed substring.
323 An update may be needed if optimizer marks as "INTUITable"
324 RExen without fixed substrings. Similarly, it is assumed that
325 lengths of all the strings are no more than minlen, thus they
326 cannot come from lookahead.
327 (Or minlen should take into account lookahead.) */
328
2c2d71f5
JH
329/* A failure to find a constant substring means that there is no need to make
330 an expensive call to REx engine, thus we celebrate a failure. Similarly,
331 finding a substring too deep into the string means that less calls to
30944b6d
IZ
332 regtry() should be needed.
333
334 REx compiler's optimizer found 4 possible hints:
335 a) Anchored substring;
336 b) Fixed substring;
337 c) Whether we are anchored (beginning-of-line or \G);
338 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 339 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
340 string which does not contradict any of them.
341 */
2c2d71f5 342
6eb5f6b9
JH
343/* Most of decisions we do here should have been done at compile time.
344 The nodes of the REx which we used for the search should have been
345 deleted from the finite automaton. */
346
cad2e5aa
JH
347char *
348Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
349 char *strend, U32 flags, re_scream_pos_data *data)
350{
97aff369 351 dVAR;
b7953727 352 register I32 start_shift = 0;
cad2e5aa 353 /* Should be nonnegative! */
b7953727 354 register I32 end_shift = 0;
2c2d71f5
JH
355 register char *s;
356 register SV *check;
a1933d95 357 char *strbeg;
cad2e5aa 358 char *t;
1de06328 359 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 360 I32 ml_anch;
bd61b366
SS
361 register char *other_last = NULL; /* other substr checked before this */
362 char *check_at = NULL; /* check substr found at this pos */
1df70142 363 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 364#ifdef DEBUGGING
890ce7af 365 const char * const i_strpos = strpos;
30944b6d 366#endif
a3621e74
YO
367
368 GET_RE_DEBUG_FLAGS_DECL;
369
a30b2f1f 370 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 371
b8d68ded 372 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
373 PL_reg_flags |= RF_utf8;
374 }
ab3bbdeb
YO
375 DEBUG_EXECUTE_r(
376 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
377 sv ? "Guessing start of match in sv for"
378 : "Guessing start of match in string for");
2a782b5b 379 );
cad2e5aa 380
c344f387
JH
381 /* CHR_DIST() would be more correct here but it makes things slow. */
382 if (prog->minlen > strend - strpos) {
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 384 "String too short... [re_intuit_start]\n"));
cad2e5aa 385 goto fail;
2c2d71f5 386 }
1de06328 387
a1933d95 388 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 389 PL_regeol = strend;
33b8afdf
JH
390 if (do_utf8) {
391 if (!prog->check_utf8 && prog->check_substr)
392 to_utf8_substr(prog);
393 check = prog->check_utf8;
394 } else {
395 if (!prog->check_substr && prog->check_utf8)
396 to_byte_substr(prog);
397 check = prog->check_substr;
398 }
1de06328 399 if (check == &PL_sv_undef) {
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 401 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
402 goto fail;
403 }
2c2d71f5 404 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
405 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
406 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 407 && !multiline ) ); /* Check after \n? */
cad2e5aa 408
7e25d62c
JH
409 if (!ml_anch) {
410 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
411 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 412 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
413 && sv && !SvROK(sv)
414 && (strpos != strbeg)) {
a3621e74 415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
416 goto fail;
417 }
418 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 419 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 420 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
421 I32 slen;
422
1aa99e6b 423 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 424
653099ff
GS
425 if (SvTAIL(check)) {
426 slen = SvCUR(check); /* >= 1 */
cad2e5aa 427
9041c2e3 428 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 429 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 430 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 431 goto fail_finish;
cad2e5aa
JH
432 }
433 /* Now should match s[0..slen-2] */
434 slen--;
3f7c398e 435 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 436 || (slen > 1
3f7c398e 437 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 438 report_neq:
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
440 goto fail_finish;
441 }
cad2e5aa 442 }
3f7c398e 443 else if (*SvPVX_const(check) != *s
653099ff 444 || ((slen = SvCUR(check)) > 1
3f7c398e 445 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 446 goto report_neq;
c315bfe8 447 check_at = s;
2c2d71f5 448 goto success_at_start;
7e25d62c 449 }
cad2e5aa 450 }
2c2d71f5 451 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 452 s = strpos;
2c2d71f5 453 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
454 end_shift = prog->check_end_shift;
455
2c2d71f5 456 if (!ml_anch) {
a3b680e6 457 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 458 - (SvTAIL(check) != 0);
a3b680e6 459 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
460
461 if (end_shift < eshift)
462 end_shift = eshift;
463 }
cad2e5aa 464 }
2c2d71f5 465 else { /* Can match at random position */
cad2e5aa
JH
466 ml_anch = 0;
467 s = strpos;
1de06328
YO
468 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
469 end_shift = prog->check_end_shift;
470
471 /* end shift should be non negative here */
cad2e5aa
JH
472 }
473
2c2d71f5 474#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 475 if (end_shift < 0)
1de06328 476 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 477 (IV)end_shift, prog->precomp);
2c2d71f5
JH
478#endif
479
2c2d71f5
JH
480 restart:
481 /* Find a possible match in the region s..strend by looking for
482 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
483
484 {
485 I32 srch_start_shift = start_shift;
486 I32 srch_end_shift = end_shift;
487 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
488 srch_end_shift -= ((strbeg - s) - srch_start_shift);
489 srch_start_shift = strbeg - s;
490 }
491 DEBUG_OPTIMISE_r({
492 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
493 (IV)prog->check_offset_min,
494 (IV)srch_start_shift,
495 (IV)srch_end_shift,
496 (IV)prog->check_end_shift);
497 });
498
cad2e5aa 499 if (flags & REXEC_SCREAM) {
cad2e5aa 500 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 501 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 502
2c2d71f5
JH
503 if (PL_screamfirst[BmRARE(check)] >= 0
504 || ( BmRARE(check) == '\n'
505 && (BmPREVIOUS(check) == SvCUR(check) - 1)
506 && SvTAIL(check) ))
9041c2e3 507 s = screaminstr(sv, check,
1de06328 508 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 509 else
2c2d71f5 510 goto fail_finish;
4addbd3b
HS
511 /* we may be pointing at the wrong string */
512 if (s && RX_MATCH_COPIED(prog))
3f7c398e 513 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
514 if (data)
515 *data->scream_olds = s;
516 }
1de06328
YO
517 else {
518 U8* start_point;
519 U8* end_point;
520 if (prog->reganch & ROPT_CANY_SEEN) {
521 start_point= (U8*)(s + srch_start_shift);
522 end_point= (U8*)(strend - srch_end_shift);
523 } else {
524 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
525 end_point= HOP3(strend, -srch_end_shift, strbeg);
526 }
527 DEBUG_OPTIMISE_r({
56570a2c 528 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 529 (int)(end_point - start_point),
fc8cd66c 530 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
531 start_point);
532 });
533
534 s = fbm_instr( start_point, end_point,
7fba1cd6 535 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
536 }
537 }
cad2e5aa
JH
538 /* Update the count-of-usability, remove useless subpatterns,
539 unshift s. */
2c2d71f5 540
ab3bbdeb
YO
541 DEBUG_EXECUTE_r({
542 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
543 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
544 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 545 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
547 ? "anchored" : "floating"),
548 quoted,
549 RE_SV_TAIL(check),
550 (s ? " at offset " : "...\n") );
551 });
2c2d71f5
JH
552
553 if (!s)
554 goto fail_finish;
2c2d71f5 555 /* Finish the diagnostic message */
a3621e74 556 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 557
1de06328
YO
558 /* XXX dmq: first branch is for positive lookbehind...
559 Our check string is offset from the beginning of the pattern.
560 So we need to do any stclass tests offset forward from that
561 point. I think. :-(
562 */
563
564
565
566 check_at=s;
567
568
2c2d71f5
JH
569 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
570 Start with the other substr.
571 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 572 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
573 *always* match. Probably should be marked during compile...
574 Probably it is right to do no SCREAM here...
575 */
576
1de06328
YO
577 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
578 : (prog->float_substr && prog->anchored_substr))
579 {
30944b6d 580 /* Take into account the "other" substring. */
2c2d71f5
JH
581 /* XXXX May be hopelessly wrong for UTF... */
582 if (!other_last)
6eb5f6b9 583 other_last = strpos;
33b8afdf 584 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
585 do_other_anchored:
586 {
890ce7af
AL
587 char * const last = HOP3c(s, -start_shift, strbeg);
588 char *last1, *last2;
be8e71aa 589 char * const saved_s = s;
33b8afdf 590 SV* must;
2c2d71f5 591
2c2d71f5
JH
592 t = s - prog->check_offset_max;
593 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 594 && (!do_utf8
0ce71af7 595 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 596 && t > strpos)))
6f207bd3 597 NOOP;
2c2d71f5
JH
598 else
599 t = strpos;
1aa99e6b 600 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
601 if (t < other_last) /* These positions already checked */
602 t = other_last;
1aa99e6b 603 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
604 if (last < last1)
605 last1 = last;
1de06328
YO
606 /* XXXX It is not documented what units *_offsets are in.
607 We assume bytes, but this is clearly wrong.
608 Meaning this code needs to be carefully reviewed for errors.
609 dmq.
610 */
611
2c2d71f5 612 /* On end-of-str: see comment below. */
33b8afdf
JH
613 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
614 if (must == &PL_sv_undef) {
615 s = (char*)NULL;
1de06328 616 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
617 }
618 else
619 s = fbm_instr(
620 (unsigned char*)t,
621 HOP3(HOP3(last1, prog->anchored_offset, strend)
622 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
623 must,
7fba1cd6 624 multiline ? FBMrf_MULTILINE : 0
33b8afdf 625 );
ab3bbdeb
YO
626 DEBUG_EXECUTE_r({
627 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
628 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
629 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 630 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
631 quoted, RE_SV_TAIL(must));
632 });
633
634
2c2d71f5
JH
635 if (!s) {
636 if (last1 >= last2) {
a3621e74 637 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
638 ", giving up...\n"));
639 goto fail_finish;
640 }
a3621e74 641 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 642 ", trying floating at offset %ld...\n",
be8e71aa 643 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
644 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
645 s = HOP3c(last, 1, strend);
2c2d71f5
JH
646 goto restart;
647 }
648 else {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 650 (long)(s - i_strpos)));
1aa99e6b
IH
651 t = HOP3c(s, -prog->anchored_offset, strbeg);
652 other_last = HOP3c(s, 1, strend);
be8e71aa 653 s = saved_s;
2c2d71f5
JH
654 if (t == strpos)
655 goto try_at_start;
2c2d71f5
JH
656 goto try_at_offset;
657 }
30944b6d 658 }
2c2d71f5
JH
659 }
660 else { /* Take into account the floating substring. */
33b8afdf 661 char *last, *last1;
be8e71aa 662 char * const saved_s = s;
33b8afdf
JH
663 SV* must;
664
665 t = HOP3c(s, -start_shift, strbeg);
666 last1 = last =
667 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
668 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
669 last = HOP3c(t, prog->float_max_offset, strend);
670 s = HOP3c(t, prog->float_min_offset, strend);
671 if (s < other_last)
672 s = other_last;
2c2d71f5 673 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
674 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
675 /* fbm_instr() takes into account exact value of end-of-str
676 if the check is SvTAIL(ed). Since false positives are OK,
677 and end-of-str is not later than strend we are OK. */
678 if (must == &PL_sv_undef) {
679 s = (char*)NULL;
1de06328 680 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
681 }
682 else
2c2d71f5 683 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
684 (unsigned char*)last + SvCUR(must)
685 - (SvTAIL(must)!=0),
7fba1cd6 686 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
687 DEBUG_EXECUTE_r({
688 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
689 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
690 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 691 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
692 quoted, RE_SV_TAIL(must));
693 });
33b8afdf
JH
694 if (!s) {
695 if (last1 == last) {
a3621e74 696 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
697 ", giving up...\n"));
698 goto fail_finish;
2c2d71f5 699 }
a3621e74 700 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 701 ", trying anchored starting at offset %ld...\n",
be8e71aa 702 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
703 other_last = last;
704 s = HOP3c(t, 1, strend);
705 goto restart;
706 }
707 else {
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
709 (long)(s - i_strpos)));
710 other_last = s; /* Fix this later. --Hugo */
be8e71aa 711 s = saved_s;
33b8afdf
JH
712 if (t == strpos)
713 goto try_at_start;
714 goto try_at_offset;
715 }
2c2d71f5 716 }
cad2e5aa 717 }
2c2d71f5 718
1de06328 719
9ef43ace 720 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328
YO
721
722 DEBUG_OPTIMISE_r(
723 PerlIO_printf(Perl_debug_log,
724 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
725 (IV)prog->check_offset_min,
726 (IV)prog->check_offset_max,
727 (IV)(s-strpos),
728 (IV)(t-strpos),
729 (IV)(t-s),
730 (IV)(strend-strpos)
731 )
732 );
733
2c2d71f5 734 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 735 && (!do_utf8
9ef43ace 736 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
737 && t > strpos)))
738 {
2c2d71f5
JH
739 /* Fixed substring is found far enough so that the match
740 cannot start at strpos. */
741 try_at_offset:
cad2e5aa 742 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
743 /* Eventually fbm_*() should handle this, but often
744 anchored_offset is not 0, so this check will not be wasted. */
745 /* XXXX In the code below we prefer to look for "^" even in
746 presence of anchored substrings. And we search even
747 beyond the found float position. These pessimizations
748 are historical artefacts only. */
749 find_anchor:
2c2d71f5 750 while (t < strend - prog->minlen) {
cad2e5aa 751 if (*t == '\n') {
4ee3650e 752 if (t < check_at - prog->check_offset_min) {
33b8afdf 753 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
754 /* Since we moved from the found position,
755 we definitely contradict the found anchored
30944b6d
IZ
756 substr. Due to the above check we do not
757 contradict "check" substr.
758 Thus we can arrive here only if check substr
759 is float. Redo checking for "other"=="fixed".
760 */
9041c2e3 761 strpos = t + 1;
a3621e74 762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 763 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
764 goto do_other_anchored;
765 }
4ee3650e
GS
766 /* We don't contradict the found floating substring. */
767 /* XXXX Why not check for STCLASS? */
cad2e5aa 768 s = t + 1;
a3621e74 769 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 770 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
771 goto set_useful;
772 }
4ee3650e
GS
773 /* Position contradicts check-string */
774 /* XXXX probably better to look for check-string
775 than for "\n", so one should lower the limit for t? */
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 777 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 778 other_last = strpos = s = t + 1;
cad2e5aa
JH
779 goto restart;
780 }
781 t++;
782 }
a3621e74 783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 784 PL_colors[0], PL_colors[1]));
2c2d71f5 785 goto fail_finish;
cad2e5aa 786 }
f5952150 787 else {
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 789 PL_colors[0], PL_colors[1]));
f5952150 790 }
cad2e5aa
JH
791 s = t;
792 set_useful:
33b8afdf 793 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
794 }
795 else {
f5952150 796 /* The found string does not prohibit matching at strpos,
2c2d71f5 797 - no optimization of calling REx engine can be performed,
f5952150
GS
798 unless it was an MBOL and we are not after MBOL,
799 or a future STCLASS check will fail this. */
2c2d71f5
JH
800 try_at_start:
801 /* Even in this situation we may use MBOL flag if strpos is offset
802 wrt the start of the string. */
05b4157f 803 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 804 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
805 /* May be due to an implicit anchor of m{.*foo} */
806 && !(prog->reganch & ROPT_IMPLICIT))
807 {
cad2e5aa
JH
808 t = strpos;
809 goto find_anchor;
810 }
a3621e74 811 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 812 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 813 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 814 );
2c2d71f5 815 success_at_start:
30944b6d 816 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
817 && (do_utf8 ? (
818 prog->check_utf8 /* Could be deleted already */
819 && --BmUSEFUL(prog->check_utf8) < 0
820 && (prog->check_utf8 == prog->float_utf8)
821 ) : (
822 prog->check_substr /* Could be deleted already */
823 && --BmUSEFUL(prog->check_substr) < 0
824 && (prog->check_substr == prog->float_substr)
825 )))
66e933ab 826 {
cad2e5aa 827 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
829 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
830 if (do_utf8 ? prog->check_substr : prog->check_utf8)
831 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
832 prog->check_substr = prog->check_utf8 = NULL; /* disable */
833 prog->float_substr = prog->float_utf8 = NULL; /* clear */
834 check = NULL; /* abort */
cad2e5aa 835 s = strpos;
3cf5c195
IZ
836 /* XXXX This is a remnant of the old implementation. It
837 looks wasteful, since now INTUIT can use many
6eb5f6b9 838 other heuristics. */
cad2e5aa
JH
839 prog->reganch &= ~RE_USE_INTUIT;
840 }
841 else
842 s = strpos;
843 }
844
6eb5f6b9
JH
845 /* Last resort... */
846 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
847 /* trie stclasses are too expensive to use here, we are better off to
848 leave it to regmatch itself */
786e8c11 849 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
850 /* minlen == 0 is possible if regstclass is \b or \B,
851 and the fixed substr is ''$.
852 Since minlen is already taken into account, s+1 is before strend;
853 accidentally, minlen >= 1 guaranties no false positives at s + 1
854 even for \b or \B. But (minlen? 1 : 0) below assumes that
855 regstclass does not come from lookahead... */
856 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
857 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 858 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 859 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 860 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 861 : 1);
1de06328
YO
862 char * endpos;
863 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
864 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
865 else if (prog->float_substr || prog->float_utf8)
866 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
867 else
868 endpos= strend;
869
56570a2c 870 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 871 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 872
6eb5f6b9 873 t = s;
3b0527fe 874 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
875 if (!s) {
876#ifdef DEBUGGING
cbbf8932 877 const char *what = NULL;
6eb5f6b9
JH
878#endif
879 if (endpos == strend) {
a3621e74 880 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 885 "This position contradicts STCLASS...\n") );
653099ff
GS
886 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
887 goto fail;
6eb5f6b9 888 /* Contradict one of substrings */
33b8afdf
JH
889 if (prog->anchored_substr || prog->anchored_utf8) {
890 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 891 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 892 hop_and_restart:
1aa99e6b 893 s = HOP3c(t, 1, strend);
66e933ab
GS
894 if (s + start_shift + end_shift > strend) {
895 /* XXXX Should be taken into account earlier? */
a3621e74 896 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
897 "Could not match STCLASS...\n") );
898 goto fail;
899 }
5e39e1e5
HS
900 if (!check)
901 goto giveup;
a3621e74 902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 903 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
904 what, (long)(s + start_shift - i_strpos)) );
905 goto restart;
906 }
66e933ab 907 /* Have both, check_string is floating */
6eb5f6b9
JH
908 if (t + start_shift >= check_at) /* Contradicts floating=check */
909 goto retry_floating_check;
910 /* Recheck anchored substring, but not floating... */
9041c2e3 911 s = check_at;
5e39e1e5
HS
912 if (!check)
913 goto giveup;
a3621e74 914 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 915 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
916 (long)(other_last - i_strpos)) );
917 goto do_other_anchored;
918 }
60e71179
GS
919 /* Another way we could have checked stclass at the
920 current position only: */
921 if (ml_anch) {
922 s = t = t + 1;
5e39e1e5
HS
923 if (!check)
924 goto giveup;
a3621e74 925 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 926 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 927 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 928 goto try_at_offset;
66e933ab 929 }
33b8afdf 930 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 931 goto fail;
6eb5f6b9
JH
932 /* Check is floating subtring. */
933 retry_floating_check:
934 t = check_at - start_shift;
a3621e74 935 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
936 goto hop_and_restart;
937 }
b7953727 938 if (t != s) {
a3621e74 939 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 940 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
941 (long)(t - i_strpos), (long)(s - i_strpos))
942 );
943 }
944 else {
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
946 "Does not contradict STCLASS...\n");
947 );
948 }
6eb5f6b9 949 }
5e39e1e5 950 giveup:
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
952 PL_colors[4], (check ? "Guessed" : "Giving up"),
953 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 954 return s;
2c2d71f5
JH
955
956 fail_finish: /* Substring not found */
33b8afdf
JH
957 if (prog->check_substr || prog->check_utf8) /* could be removed already */
958 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 959 fail:
a3621e74 960 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 961 PL_colors[4], PL_colors[5]));
bd61b366 962 return NULL;
cad2e5aa 963}
9661b544 964
786e8c11 965
3b0527fe 966
4cadc6a9
YO
967#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
968foldlen, foldbuf, uniflags) STMT_START { \
969 switch (trie_type) { \
970 case trie_utf8_fold: \
971 if ( foldlen>0 ) { \
972 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
973 foldlen -= len; \
974 uscan += len; \
975 len=0; \
976 } else { \
977 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
978 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
979 foldlen -= UNISKIP( uvc ); \
980 uscan = foldbuf + UNISKIP( uvc ); \
981 } \
982 break; \
983 case trie_utf8: \
984 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
985 break; \
986 case trie_plain: \
987 uvc = (UV)*uc; \
988 len = 1; \
989 } \
990 \
991 if (uvc < 256) { \
992 charid = trie->charmap[ uvc ]; \
993 } \
994 else { \
995 charid = 0; \
996 if (trie->widecharmap) { \
997 SV** const svpp = hv_fetch(trie->widecharmap, \
998 (char*)&uvc, sizeof(UV), 0); \
999 if (svpp) \
1000 charid = (U16)SvIV(*svpp); \
1001 } \
1002 } \
1003} STMT_END
1004
1005#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1006 if ( (CoNd) \
1007 && (ln == len || \
1008 ibcmp_utf8(s, NULL, 0, do_utf8, \
1009 m, NULL, ln, (bool)UTF)) \
1010 && (!reginfo || regtry(reginfo, s)) ) \
1011 goto got_it; \
1012 else { \
1013 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1014 uvchr_to_utf8(tmpbuf, c); \
1015 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1016 if ( f != c \
1017 && (f == c1 || f == c2) \
1018 && (ln == foldlen || \
1019 !ibcmp_utf8((char *) foldbuf, \
1020 NULL, foldlen, do_utf8, \
1021 m, \
1022 NULL, ln, (bool)UTF)) \
1023 && (!reginfo || regtry(reginfo, s)) ) \
1024 goto got_it; \
1025 } \
1026 s += len
1027
1028#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1029STMT_START { \
1030 while (s <= e) { \
1031 if ( (CoNd) \
1032 && (ln == 1 || !(OP(c) == EXACTF \
1033 ? ibcmp(s, m, ln) \
1034 : ibcmp_locale(s, m, ln))) \
1035 && (!reginfo || regtry(reginfo, s)) ) \
1036 goto got_it; \
1037 s++; \
1038 } \
1039} STMT_END
1040
1041#define REXEC_FBC_UTF8_SCAN(CoDe) \
1042STMT_START { \
1043 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1044 CoDe \
1045 s += uskip; \
1046 } \
1047} STMT_END
1048
1049#define REXEC_FBC_SCAN(CoDe) \
1050STMT_START { \
1051 while (s < strend) { \
1052 CoDe \
1053 s++; \
1054 } \
1055} STMT_END
1056
1057#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1058REXEC_FBC_UTF8_SCAN( \
1059 if (CoNd) { \
1060 if (tmp && (!reginfo || regtry(reginfo, s))) \
1061 goto got_it; \
1062 else \
1063 tmp = doevery; \
1064 } \
1065 else \
1066 tmp = 1; \
1067)
1068
1069#define REXEC_FBC_CLASS_SCAN(CoNd) \
1070REXEC_FBC_SCAN( \
1071 if (CoNd) { \
1072 if (tmp && (!reginfo || regtry(reginfo, s))) \
1073 goto got_it; \
1074 else \
1075 tmp = doevery; \
1076 } \
1077 else \
1078 tmp = 1; \
1079)
1080
1081#define REXEC_FBC_TRYIT \
1082if ((!reginfo || regtry(reginfo, s))) \
1083 goto got_it
1084
1085#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1086 if (do_utf8) { \
1087 UtFpReLoAd; \
1088 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1089 } \
1090 else { \
1091 REXEC_FBC_CLASS_SCAN(CoNd); \
1092 } \
1093 break
1094
1095#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1096 PL_reg_flags |= RF_tainted; \
1097 if (do_utf8) { \
1098 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1099 } \
1100 else { \
1101 REXEC_FBC_CLASS_SCAN(CoNd); \
1102 } \
1103 break
1104
786e8c11
YO
1105#define DUMP_EXEC_POS(li,s,doutf8) \
1106 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1107
1108/* We know what class REx starts with. Try to find this position... */
1109/* if reginfo is NULL, its a dryrun */
1110/* annoyingly all the vars in this routine have different names from their counterparts
1111 in regmatch. /grrr */
1112
3c3eec57 1113STATIC char *
07be1b83
YO
1114S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1115 const char *strend, const regmatch_info *reginfo)
a687059c 1116{
27da23d5 1117 dVAR;
1df70142 1118 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1119 char *m;
d8093b23 1120 STRLEN ln;
5dab1207 1121 STRLEN lnc;
078c425b 1122 register STRLEN uskip;
d8093b23
G
1123 unsigned int c1;
1124 unsigned int c2;
6eb5f6b9
JH
1125 char *e;
1126 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1127 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1128
6eb5f6b9
JH
1129 /* We know what class it must start with. */
1130 switch (OP(c)) {
6eb5f6b9 1131 case ANYOF:
388cc4de 1132 if (do_utf8) {
4cadc6a9 1133 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1134 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1135 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1136 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1137 }
1138 else {
1139 while (s < strend) {
1140 STRLEN skip = 1;
1141
32fc9b6a 1142 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1143 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1144 /* The assignment of 2 is intentional:
1145 * for the folded sharp s, the skip is 2. */
1146 (skip = SHARP_S_SKIP))) {
3b0527fe 1147 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1148 goto got_it;
1149 else
1150 tmp = doevery;
1151 }
1152 else
1153 tmp = 1;
1154 s += skip;
1155 }
a0d0e21e 1156 }
6eb5f6b9 1157 break;
f33976b4 1158 case CANY:
4cadc6a9 1159 REXEC_FBC_SCAN(
3b0527fe 1160 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1161 goto got_it;
1162 else
1163 tmp = doevery;
4cadc6a9 1164 );
f33976b4 1165 break;
6eb5f6b9 1166 case EXACTF:
5dab1207
NIS
1167 m = STRING(c);
1168 ln = STR_LEN(c); /* length to match in octets/bytes */
1169 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1170 if (UTF) {
a2a2844f 1171 STRLEN ulen1, ulen2;
5dab1207 1172 U8 *sm = (U8 *) m;
89ebb4a3
JH
1173 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1174 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1175 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1176
1177 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1178 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1179
89ebb4a3 1180 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1181 0, uniflags);
89ebb4a3 1182 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1183 0, uniflags);
5dab1207
NIS
1184 lnc = 0;
1185 while (sm < ((U8 *) m + ln)) {
1186 lnc++;
1187 sm += UTF8SKIP(sm);
1188 }
1aa99e6b
IH
1189 }
1190 else {
1191 c1 = *(U8*)m;
1192 c2 = PL_fold[c1];
1193 }
6eb5f6b9
JH
1194 goto do_exactf;
1195 case EXACTFL:
5dab1207
NIS
1196 m = STRING(c);
1197 ln = STR_LEN(c);
1198 lnc = (I32) ln;
d8093b23 1199 c1 = *(U8*)m;
6eb5f6b9
JH
1200 c2 = PL_fold_locale[c1];
1201 do_exactf:
db12adc6 1202 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1203
3b0527fe 1204 if (!reginfo && e < s)
6eb5f6b9 1205 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1206
60a8b682
JH
1207 /* The idea in the EXACTF* cases is to first find the
1208 * first character of the EXACTF* node and then, if
1209 * necessary, case-insensitively compare the full
1210 * text of the node. The c1 and c2 are the first
1211 * characters (though in Unicode it gets a bit
1212 * more complicated because there are more cases
7f16dd3d
JH
1213 * than just upper and lower: one needs to use
1214 * the so-called folding case for case-insensitive
1215 * matching (called "loose matching" in Unicode).
1216 * ibcmp_utf8() will do just that. */
60a8b682 1217
1aa99e6b 1218 if (do_utf8) {
575cac57 1219 UV c, f;
89ebb4a3 1220 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1221 STRLEN len, foldlen;
4ad0818d 1222 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1223 if (c1 == c2) {
5dab1207
NIS
1224 /* Upper and lower of 1st char are equal -
1225 * probably not a "letter". */
1aa99e6b 1226 while (s <= e) {
89ebb4a3 1227 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1228 uniflags);
4cadc6a9 1229 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1230 }
09091399
JH
1231 }
1232 else {
1aa99e6b 1233 while (s <= e) {
89ebb4a3 1234 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1235 uniflags);
80aecb99 1236
60a8b682 1237 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1238 * Note that not all the possible combinations
1239 * are handled here: some of them are handled
1240 * by the standard folding rules, and some of
1241 * them (the character class or ANYOF cases)
1242 * are handled during compiletime in
1243 * regexec.c:S_regclass(). */
880bd946
JH
1244 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1245 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1246 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1247
4cadc6a9 1248 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1249 }
09091399 1250 }
1aa99e6b
IH
1251 }
1252 else {
1253 if (c1 == c2)
4cadc6a9 1254 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1255 else
4cadc6a9 1256 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1257 }
1258 break;
bbce6d69 1259 case BOUNDL:
3280af22 1260 PL_reg_flags |= RF_tainted;
bbce6d69 1261 /* FALL THROUGH */
a0d0e21e 1262 case BOUND:
ffc61ed2 1263 if (do_utf8) {
12d33761 1264 if (s == PL_bostr)
ffc61ed2
JH
1265 tmp = '\n';
1266 else {
6136c704 1267 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1268 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1269 }
1270 tmp = ((OP(c) == BOUND ?
9041c2e3 1271 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1272 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1273 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1274 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1275 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1276 isALNUM_LC_utf8((U8*)s)))
1277 {
1278 tmp = !tmp;
4cadc6a9 1279 REXEC_FBC_TRYIT;
a687059c 1280 }
4cadc6a9 1281 );
a0d0e21e 1282 }
667bb95a 1283 else {
12d33761 1284 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1285 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1286 REXEC_FBC_SCAN(
ffc61ed2
JH
1287 if (tmp ==
1288 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1289 tmp = !tmp;
4cadc6a9 1290 REXEC_FBC_TRYIT;
a0ed51b3 1291 }
4cadc6a9 1292 );
a0ed51b3 1293 }
3b0527fe 1294 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1295 goto got_it;
1296 break;
bbce6d69 1297 case NBOUNDL:
3280af22 1298 PL_reg_flags |= RF_tainted;
bbce6d69 1299 /* FALL THROUGH */
a0d0e21e 1300 case NBOUND:
ffc61ed2 1301 if (do_utf8) {
12d33761 1302 if (s == PL_bostr)
ffc61ed2
JH
1303 tmp = '\n';
1304 else {
6136c704 1305 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1306 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1307 }
1308 tmp = ((OP(c) == NBOUND ?
9041c2e3 1309 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1310 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1311 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1312 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1313 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1314 isALNUM_LC_utf8((U8*)s)))
1315 tmp = !tmp;
4cadc6a9
YO
1316 else REXEC_FBC_TRYIT;
1317 );
a0d0e21e 1318 }
667bb95a 1319 else {
12d33761 1320 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1321 tmp = ((OP(c) == NBOUND ?
1322 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1323 REXEC_FBC_SCAN(
ffc61ed2
JH
1324 if (tmp ==
1325 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1326 tmp = !tmp;
4cadc6a9
YO
1327 else REXEC_FBC_TRYIT;
1328 );
a0ed51b3 1329 }
3b0527fe 1330 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1331 goto got_it;
1332 break;
a0d0e21e 1333 case ALNUM:
4cadc6a9
YO
1334 REXEC_FBC_CSCAN_PRELOAD(
1335 LOAD_UTF8_CHARCLASS_ALNUM(),
1336 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1337 isALNUM(*s)
1338 );
bbce6d69 1339 case ALNUML:
4cadc6a9
YO
1340 REXEC_FBC_CSCAN_TAINT(
1341 isALNUM_LC_utf8((U8*)s),
1342 isALNUM_LC(*s)
1343 );
a0d0e21e 1344 case NALNUM:
4cadc6a9
YO
1345 REXEC_FBC_CSCAN_PRELOAD(
1346 LOAD_UTF8_CHARCLASS_ALNUM(),
1347 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1348 !isALNUM(*s)
1349 );
bbce6d69 1350 case NALNUML:
4cadc6a9
YO
1351 REXEC_FBC_CSCAN_TAINT(
1352 !isALNUM_LC_utf8((U8*)s),
1353 !isALNUM_LC(*s)
1354 );
a0d0e21e 1355 case SPACE:
4cadc6a9
YO
1356 REXEC_FBC_CSCAN_PRELOAD(
1357 LOAD_UTF8_CHARCLASS_SPACE(),
1358 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1359 isSPACE(*s)
1360 );
bbce6d69 1361 case SPACEL:
4cadc6a9
YO
1362 REXEC_FBC_CSCAN_TAINT(
1363 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1364 isSPACE_LC(*s)
1365 );
a0d0e21e 1366 case NSPACE:
4cadc6a9
YO
1367 REXEC_FBC_CSCAN_PRELOAD(
1368 LOAD_UTF8_CHARCLASS_SPACE(),
1369 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1370 !isSPACE(*s)
1371 );
bbce6d69 1372 case NSPACEL:
4cadc6a9
YO
1373 REXEC_FBC_CSCAN_TAINT(
1374 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1375 !isSPACE_LC(*s)
1376 );
a0d0e21e 1377 case DIGIT:
4cadc6a9
YO
1378 REXEC_FBC_CSCAN_PRELOAD(
1379 LOAD_UTF8_CHARCLASS_DIGIT(),
1380 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1381 isDIGIT(*s)
1382 );
b8c5462f 1383 case DIGITL:
4cadc6a9
YO
1384 REXEC_FBC_CSCAN_TAINT(
1385 isDIGIT_LC_utf8((U8*)s),
1386 isDIGIT_LC(*s)
1387 );
a0d0e21e 1388 case NDIGIT:
4cadc6a9
YO
1389 REXEC_FBC_CSCAN_PRELOAD(
1390 LOAD_UTF8_CHARCLASS_DIGIT(),
1391 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1392 !isDIGIT(*s)
1393 );
b8c5462f 1394 case NDIGITL:
4cadc6a9
YO
1395 REXEC_FBC_CSCAN_TAINT(
1396 !isDIGIT_LC_utf8((U8*)s),
1397 !isDIGIT_LC(*s)
1398 );
1de06328
YO
1399 case AHOCORASICKC:
1400 case AHOCORASICK:
07be1b83
YO
1401 {
1402 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1403 trie_type = do_utf8 ?
1404 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1405 : trie_plain;
1406 /* what trie are we using right now */
1407 reg_ac_data *aho
1408 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1409 reg_trie_data *trie=aho->trie;
1410
1411 const char *last_start = strend - trie->minlen;
6148ee25 1412#ifdef DEBUGGING
07be1b83 1413 const char *real_start = s;
6148ee25 1414#endif
07be1b83 1415 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1416 SV *sv_points;
1417 U8 **points; /* map of where we were in the input string
786e8c11 1418 when reading a given char. For ASCII this
be8e71aa
YO
1419 is unnecessary overhead as the relationship
1420 is always 1:1, but for unicode, especially
1421 case folded unicode this is not true. */
f9e705e8 1422 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1423 U8 *bitmap=NULL;
1424
07be1b83
YO
1425
1426 GET_RE_DEBUG_FLAGS_DECL;
1427
be8e71aa
YO
1428 /* We can't just allocate points here. We need to wrap it in
1429 * an SV so it gets freed properly if there is a croak while
1430 * running the match */
1431 ENTER;
1432 SAVETMPS;
1433 sv_points=newSV(maxlen * sizeof(U8 *));
1434 SvCUR_set(sv_points,
1435 maxlen * sizeof(U8 *));
1436 SvPOK_on(sv_points);
1437 sv_2mortal(sv_points);
1438 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1439 if ( trie_type != trie_utf8_fold
1440 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1441 {
786e8c11
YO
1442 if (trie->bitmap)
1443 bitmap=(U8*)trie->bitmap;
1444 else
1445 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1446 }
786e8c11
YO
1447 /* this is the Aho-Corasick algorithm modified a touch
1448 to include special handling for long "unknown char"
1449 sequences. The basic idea being that we use AC as long
1450 as we are dealing with a possible matching char, when
1451 we encounter an unknown char (and we have not encountered
1452 an accepting state) we scan forward until we find a legal
1453 starting char.
1454 AC matching is basically that of trie matching, except
1455 that when we encounter a failing transition, we fall back
1456 to the current states "fail state", and try the current char
1457 again, a process we repeat until we reach the root state,
1458 state 1, or a legal transition. If we fail on the root state
1459 then we can either terminate if we have reached an accepting
1460 state previously, or restart the entire process from the beginning
1461 if we have not.
1462
1463 */
07be1b83
YO
1464 while (s <= last_start) {
1465 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1466 U8 *uc = (U8*)s;
1467 U16 charid = 0;
1468 U32 base = 1;
1469 U32 state = 1;
1470 UV uvc = 0;
1471 STRLEN len = 0;
1472 STRLEN foldlen = 0;
1473 U8 *uscan = (U8*)NULL;
1474 U8 *leftmost = NULL;
786e8c11
YO
1475#ifdef DEBUGGING
1476 U32 accepted_word= 0;
1477#endif
07be1b83
YO
1478 U32 pointpos = 0;
1479
1480 while ( state && uc <= (U8*)strend ) {
1481 int failed=0;
786e8c11
YO
1482 U32 word = aho->states[ state ].wordnum;
1483
1de06328
YO
1484 if( state==1 ) {
1485 if ( bitmap ) {
1486 DEBUG_TRIE_EXECUTE_r(
1487 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1488 dump_exec_pos( (char *)uc, c, strend, real_start,
1489 (char *)uc, do_utf8 );
1490 PerlIO_printf( Perl_debug_log,
1491 " Scanning for legal start char...\n");
1492 }
1493 );
1494 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1495 uc++;
786e8c11 1496 }
1de06328 1497 s= (char *)uc;
786e8c11 1498 }
786e8c11
YO
1499 if (uc >(U8*)last_start) break;
1500 }
1501
1502 if ( word ) {
1503 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1504 if (!leftmost || lpos < leftmost) {
1505 DEBUG_r(accepted_word=word);
07be1b83 1506 leftmost= lpos;
786e8c11 1507 }
07be1b83 1508 if (base==0) break;
786e8c11 1509
07be1b83
YO
1510 }
1511 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1512 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1513 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1514 DEBUG_TRIE_EXECUTE_r({
1515 dump_exec_pos( (char *)uc, c, strend, real_start,
1516 s, do_utf8 );
07be1b83 1517 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1518 " Charid:%3u CP:%4"UVxf" ",
1519 charid, uvc);
1520 });
07be1b83
YO
1521
1522 do {
6148ee25 1523#ifdef DEBUGGING
786e8c11 1524 word = aho->states[ state ].wordnum;
6148ee25 1525#endif
07be1b83
YO
1526 base = aho->states[ state ].trans.base;
1527
786e8c11
YO
1528 DEBUG_TRIE_EXECUTE_r({
1529 if (failed)
1530 dump_exec_pos( (char *)uc, c, strend, real_start,
1531 s, do_utf8 );
07be1b83 1532 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1533 "%sState: %4"UVxf", word=%"UVxf,
1534 failed ? " Fail transition to " : "",
1535 (UV)state, (UV)word);
1536 });
07be1b83
YO
1537 if ( base ) {
1538 U32 tmp;
1539 if (charid &&
1540 (base + charid > trie->uniquecharcount )
1541 && (base + charid - 1 - trie->uniquecharcount
1542 < trie->lasttrans)
1543 && trie->trans[base + charid - 1 -
1544 trie->uniquecharcount].check == state
1545 && (tmp=trie->trans[base + charid - 1 -
1546 trie->uniquecharcount ].next))
1547 {
786e8c11
YO
1548 DEBUG_TRIE_EXECUTE_r(
1549 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1550 state = tmp;
1551 break;
1552 }
1553 else {
786e8c11
YO
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - fail\n"));
1556 failed = 1;
1557 state = aho->fail[state];
07be1b83
YO
1558 }
1559 }
1560 else {
1561 /* we must be accepting here */
786e8c11
YO
1562 DEBUG_TRIE_EXECUTE_r(
1563 PerlIO_printf( Perl_debug_log," - accepting\n"));
1564 failed = 1;
07be1b83
YO
1565 break;
1566 }
1567 } while(state);
786e8c11 1568 uc += len;
07be1b83
YO
1569 if (failed) {
1570 if (leftmost)
1571 break;
786e8c11 1572 if (!state) state = 1;
07be1b83
YO
1573 }
1574 }
1575 if ( aho->states[ state ].wordnum ) {
1576 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1577 if (!leftmost || lpos < leftmost) {
1578 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1579 leftmost = lpos;
786e8c11 1580 }
07be1b83 1581 }
07be1b83
YO
1582 if (leftmost) {
1583 s = (char*)leftmost;
786e8c11
YO
1584 DEBUG_TRIE_EXECUTE_r({
1585 PerlIO_printf(
1586 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1587 (UV)accepted_word, s - real_start
1588 );
1589 });
be8e71aa
YO
1590 if (!reginfo || regtry(reginfo, s)) {
1591 FREETMPS;
1592 LEAVE;
07be1b83 1593 goto got_it;
be8e71aa 1594 }
07be1b83 1595 s = HOPc(s,1);
786e8c11
YO
1596 DEBUG_TRIE_EXECUTE_r({
1597 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1598 });
07be1b83 1599 } else {
786e8c11
YO
1600 DEBUG_TRIE_EXECUTE_r(
1601 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1602 break;
1603 }
1604 }
be8e71aa
YO
1605 FREETMPS;
1606 LEAVE;
07be1b83
YO
1607 }
1608 break;
b3c9acc1 1609 default:
3c3eec57
GS
1610 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1611 break;
d6a28714 1612 }
6eb5f6b9
JH
1613 return 0;
1614 got_it:
1615 return s;
1616}
1617
1618/*
1619 - regexec_flags - match a regexp against a string
1620 */
1621I32
1622Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1623 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1624/* strend: pointer to null at end of string */
1625/* strbeg: real beginning of string */
1626/* minend: end of match must be >=minend after stringarg. */
1627/* data: May be used for some additional optimizations. */
1628/* nosave: For optimizations. */
1629{
97aff369 1630 dVAR;
6eb5f6b9
JH
1631 register char *s;
1632 register regnode *c;
1633 register char *startpos = stringarg;
6eb5f6b9
JH
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1638 char *scream_olds = NULL;
3dab1dad 1639 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1640 const bool do_utf8 = DO_UTF8(sv);
2757e526 1641 I32 multiline;
0df25f3d 1642
3b0527fe 1643 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1644
1645 GET_RE_DEBUG_FLAGS_DECL;
1646
9d4ba2ae 1647 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1648
1649 /* Be paranoid... */
1650 if (prog == NULL || startpos == NULL) {
1651 Perl_croak(aTHX_ "NULL regexp parameter");
1652 return 0;
1653 }
1654
2757e526 1655 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1656 reginfo.prog = prog;
2757e526 1657
bac06658 1658 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1659 DEBUG_EXECUTE_r(
1660 debug_start_match(prog, do_utf8, startpos, strend,
1661 "Matching");
1662 );
bac06658 1663
6eb5f6b9 1664 minlen = prog->minlen;
1de06328
YO
1665
1666 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1668 "String too short [regexec_flags]...\n"));
1669 goto phooey;
1aa99e6b 1670 }
6eb5f6b9 1671
1de06328 1672
6eb5f6b9
JH
1673 /* Check validity of program. */
1674 if (UCHARAT(prog->program) != REG_MAGIC) {
1675 Perl_croak(aTHX_ "corrupted regexp program");
1676 }
1677
1678 PL_reg_flags = 0;
1679 PL_reg_eval_set = 0;
1680 PL_reg_maxiter = 0;
1681
1682 if (prog->reganch & ROPT_UTF8)
1683 PL_reg_flags |= RF_utf8;
1684
1685 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1686 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1687 PL_bostr = strbeg;
3b0527fe 1688 reginfo.sv = sv;
6eb5f6b9
JH
1689
1690 /* Mark end of line for $ (and such) */
1691 PL_regeol = strend;
1692
1693 /* see how far we have to get to not match where we matched before */
3b0527fe 1694 reginfo.till = startpos+minend;
6eb5f6b9 1695
6eb5f6b9
JH
1696 /* If there is a "must appear" string, look for it. */
1697 s = startpos;
1698
3b0527fe 1699 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1700 MAGIC *mg;
1701
1702 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1703 reginfo.ganch = startpos;
6eb5f6b9
JH
1704 else if (sv && SvTYPE(sv) >= SVt_PVMG
1705 && SvMAGIC(sv)
14befaf4
DM
1706 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1707 && mg->mg_len >= 0) {
3b0527fe 1708 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1709 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1710 if (s > reginfo.ganch)
6eb5f6b9 1711 goto phooey;
3b0527fe 1712 s = reginfo.ganch;
6eb5f6b9
JH
1713 }
1714 }
1715 else /* pos() not defined */
3b0527fe 1716 reginfo.ganch = strbeg;
6eb5f6b9
JH
1717 }
1718
a0714e2c 1719 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1720 re_scream_pos_data d;
1721
1722 d.scream_olds = &scream_olds;
1723 d.scream_pos = &scream_pos;
1724 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1725 if (!s) {
a3621e74 1726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1727 goto phooey; /* not present */
3fa9c3d7 1728 }
6eb5f6b9
JH
1729 }
1730
1de06328 1731
6eb5f6b9
JH
1732
1733 /* Simplest case: anchored match need be tried only once. */
1734 /* [unless only anchor is BOL and multiline is set] */
1735 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1736 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1737 goto got_it;
7fba1cd6 1738 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1739 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1740 {
1741 char *end;
1742
1743 if (minlen)
1744 dontbother = minlen - 1;
1aa99e6b 1745 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1746 /* for multiline we only have to try after newlines */
33b8afdf 1747 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1748 if (s == startpos)
1749 goto after_try;
1750 while (1) {
3b0527fe 1751 if (regtry(&reginfo, s))
6eb5f6b9
JH
1752 goto got_it;
1753 after_try:
1754 if (s >= end)
1755 goto phooey;
1756 if (prog->reganch & RE_USE_INTUIT) {
1757 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1758 if (!s)
1759 goto phooey;
1760 }
1761 else
1762 s++;
1763 }
1764 } else {
1765 if (s > startpos)
1766 s--;
1767 while (s < end) {
1768 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1769 if (regtry(&reginfo, s))
6eb5f6b9
JH
1770 goto got_it;
1771 }
1772 }
1773 }
1774 }
1775 goto phooey;
1776 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1777 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1778 goto got_it;
1779 goto phooey;
1780 }
1781
1782 /* Messy cases: unanchored match. */
33b8afdf 1783 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1784 /* we have /x+whatever/ */
1785 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1786 char ch;
bf93d4cc
GS
1787#ifdef DEBUGGING
1788 int did_match = 0;
1789#endif
33b8afdf
JH
1790 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1791 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1792 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1793
1aa99e6b 1794 if (do_utf8) {
4cadc6a9 1795 REXEC_FBC_SCAN(
6eb5f6b9 1796 if (*s == ch) {
a3621e74 1797 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1798 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1799 s += UTF8SKIP(s);
1800 while (s < strend && *s == ch)
1801 s += UTF8SKIP(s);
1802 }
4cadc6a9 1803 );
6eb5f6b9
JH
1804 }
1805 else {
4cadc6a9 1806 REXEC_FBC_SCAN(
6eb5f6b9 1807 if (*s == ch) {
a3621e74 1808 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1809 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1810 s++;
1811 while (s < strend && *s == ch)
1812 s++;
1813 }
4cadc6a9 1814 );
6eb5f6b9 1815 }
a3621e74 1816 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1817 PerlIO_printf(Perl_debug_log,
b7953727
JH
1818 "Did not find anchored character...\n")
1819 );
6eb5f6b9 1820 }
a0714e2c
SS
1821 else if (prog->anchored_substr != NULL
1822 || prog->anchored_utf8 != NULL
1823 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1824 && prog->float_max_offset < strend - s)) {
1825 SV *must;
1826 I32 back_max;
1827 I32 back_min;
1828 char *last;
6eb5f6b9 1829 char *last1; /* Last position checked before */
bf93d4cc
GS
1830#ifdef DEBUGGING
1831 int did_match = 0;
1832#endif
33b8afdf
JH
1833 if (prog->anchored_substr || prog->anchored_utf8) {
1834 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1835 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1836 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1837 back_max = back_min = prog->anchored_offset;
1838 } else {
1839 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1840 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1841 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1842 back_max = prog->float_max_offset;
1843 back_min = prog->float_min_offset;
1844 }
1de06328
YO
1845
1846
33b8afdf
JH
1847 if (must == &PL_sv_undef)
1848 /* could not downgrade utf8 check substring, so must fail */
1849 goto phooey;
1850
1de06328
YO
1851 if (back_min<0) {
1852 last = strend;
1853 } else {
1854 last = HOP3c(strend, /* Cannot start after this */
1855 -(I32)(CHR_SVLEN(must)
1856 - (SvTAIL(must) != 0) + back_min), strbeg);
1857 }
6eb5f6b9
JH
1858 if (s > PL_bostr)
1859 last1 = HOPc(s, -1);
1860 else
1861 last1 = s - 1; /* bogus */
1862
a0288114 1863 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1864 check_substr==must. */
1865 scream_pos = -1;
1866 dontbother = end_shift;
1867 strend = HOPc(strend, -dontbother);
1868 while ( (s <= last) &&
9041c2e3 1869 ((flags & REXEC_SCREAM)
1de06328 1870 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1871 end_shift, &scream_pos, 0))
1de06328 1872 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1873 (unsigned char*)strend, must,
7fba1cd6 1874 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1875 /* we may be pointing at the wrong string */
1876 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1877 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1878 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1879 if (HOPc(s, -back_max) > last1) {
1880 last1 = HOPc(s, -back_min);
1881 s = HOPc(s, -back_max);
1882 }
1883 else {
52657f30 1884 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1885
1886 last1 = HOPc(s, -back_min);
52657f30 1887 s = t;
6eb5f6b9 1888 }
1aa99e6b 1889 if (do_utf8) {
6eb5f6b9 1890 while (s <= last1) {
3b0527fe 1891 if (regtry(&reginfo, s))
6eb5f6b9
JH
1892 goto got_it;
1893 s += UTF8SKIP(s);
1894 }
1895 }
1896 else {
1897 while (s <= last1) {
3b0527fe 1898 if (regtry(&reginfo, s))
6eb5f6b9
JH
1899 goto got_it;
1900 s++;
1901 }
1902 }
1903 }
ab3bbdeb
YO
1904 DEBUG_EXECUTE_r(if (!did_match) {
1905 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1906 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1907 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1908 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1909 ? "anchored" : "floating"),
ab3bbdeb
YO
1910 quoted, RE_SV_TAIL(must));
1911 });
6eb5f6b9
JH
1912 goto phooey;
1913 }
786e8c11 1914 else if ( (c = prog->regstclass) ) {
f14c76ed 1915 if (minlen) {
be8e71aa 1916 const OPCODE op = OP(prog->regstclass);
66e933ab 1917 /* don't bother with what can't match */
786e8c11 1918 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1919 strend = HOPc(strend, -(minlen - 1));
1920 }
a3621e74 1921 DEBUG_EXECUTE_r({
be8e71aa 1922 SV * const prop = sv_newmortal();
32fc9b6a 1923 regprop(prog, prop, c);
0df25f3d 1924 {
ab3bbdeb
YO
1925 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1926 s,strend-s,60);
0df25f3d 1927 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1928 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1929 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1930 quoted, (int)(strend - s));
0df25f3d 1931 }
ffc61ed2 1932 });
3b0527fe 1933 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1934 goto got_it;
07be1b83 1935 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1936 }
1937 else {
1938 dontbother = 0;
a0714e2c 1939 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1940 /* Trim the end. */
d6a28714 1941 char *last;
33b8afdf
JH
1942 SV* float_real;
1943
1944 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1945 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1946 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1947
1948 if (flags & REXEC_SCREAM) {
33b8afdf 1949 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1950 end_shift, &scream_pos, 1); /* last one */
1951 if (!last)
ffc61ed2 1952 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1953 /* we may be pointing at the wrong string */
1954 else if (RX_MATCH_COPIED(prog))
3f7c398e 1955 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1956 }
d6a28714
JH
1957 else {
1958 STRLEN len;
cfd0369c 1959 const char * const little = SvPV_const(float_real, len);
d6a28714 1960
33b8afdf 1961 if (SvTAIL(float_real)) {
d6a28714
JH
1962 if (memEQ(strend - len + 1, little, len - 1))
1963 last = strend - len + 1;
7fba1cd6 1964 else if (!multiline)
9041c2e3 1965 last = memEQ(strend - len, little, len)
bd61b366 1966 ? strend - len : NULL;
b8c5462f 1967 else
d6a28714
JH
1968 goto find_last;
1969 } else {
1970 find_last:
9041c2e3 1971 if (len)
d6a28714 1972 last = rninstr(s, strend, little, little + len);
b8c5462f 1973 else
a0288114 1974 last = strend; /* matching "$" */
b8c5462f 1975 }
b8c5462f 1976 }
bf93d4cc 1977 if (last == NULL) {
a3621e74 1978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1979 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1980 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1981 goto phooey; /* Should not happen! */
1982 }
d6a28714
JH
1983 dontbother = strend - last + prog->float_min_offset;
1984 }
1985 if (minlen && (dontbother < minlen))
1986 dontbother = minlen - 1;
1987 strend -= dontbother; /* this one's always in bytes! */
1988 /* We don't know much -- general case. */
1aa99e6b 1989 if (do_utf8) {
d6a28714 1990 for (;;) {
3b0527fe 1991 if (regtry(&reginfo, s))
d6a28714
JH
1992 goto got_it;
1993 if (s >= strend)
1994 break;
b8c5462f 1995 s += UTF8SKIP(s);
d6a28714
JH
1996 };
1997 }
1998 else {
1999 do {
3b0527fe 2000 if (regtry(&reginfo, s))
d6a28714
JH
2001 goto got_it;
2002 } while (s++ < strend);
2003 }
2004 }
2005
2006 /* Failure. */
2007 goto phooey;
2008
2009got_it:
2010 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2011
2012 if (PL_reg_eval_set) {
2013 /* Preserve the current value of $^R */
2014 if (oreplsv != GvSV(PL_replgv))
2015 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2016 restored, the value remains
2017 the same. */
4f639d21 2018 restore_pos(aTHX_ prog);
d6a28714
JH
2019 }
2020
2021 /* make sure $`, $&, $', and $digit will work later */
2022 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2023 RX_MATCH_COPY_FREE(prog);
d6a28714 2024 if (flags & REXEC_COPY_STR) {
be8e71aa 2025 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2026#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2027 if ((SvIsCOW(sv)
2028 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2029 if (DEBUG_C_TEST) {
2030 PerlIO_printf(Perl_debug_log,
2031 "Copy on write: regexp capture, type %d\n",
2032 (int) SvTYPE(sv));
2033 }
2034 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2035 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2036 assert (SvPOKp(prog->saved_copy));
2037 } else
2038#endif
2039 {
2040 RX_MATCH_COPIED_on(prog);
2041 s = savepvn(strbeg, i);
2042 prog->subbeg = s;
2043 }
d6a28714 2044 prog->sublen = i;
d6a28714
JH
2045 }
2046 else {
2047 prog->subbeg = strbeg;
2048 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2049 }
2050 }
9041c2e3 2051
d6a28714
JH
2052 return 1;
2053
2054phooey:
a3621e74 2055 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2056 PL_colors[4], PL_colors[5]));
d6a28714 2057 if (PL_reg_eval_set)
4f639d21 2058 restore_pos(aTHX_ prog);
d6a28714
JH
2059 return 0;
2060}
2061
2062/*
2063 - regtry - try match at specific point
2064 */
2065STATIC I32 /* 0 failure, 1 success */
3b0527fe 2066S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2067{
97aff369 2068 dVAR;
d6a28714
JH
2069 register I32 *sp;
2070 register I32 *ep;
2071 CHECKPOINT lastcp;
3b0527fe 2072 regexp *prog = reginfo->prog;
a3621e74 2073 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2074
02db2b7b
IZ
2075#ifdef DEBUGGING
2076 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2077#endif
d6a28714
JH
2078 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2079 MAGIC *mg;
2080
2081 PL_reg_eval_set = RS_init;
a3621e74 2082 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2083 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2084 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2085 ));
e8347627 2086 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2087 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2088 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2089 SAVETMPS;
2090 /* Apparently this is not needed, judging by wantarray. */
e8347627 2091 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2092 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2093
3b0527fe 2094 if (reginfo->sv) {
d6a28714 2095 /* Make $_ available to executed code. */
3b0527fe 2096 if (reginfo->sv != DEFSV) {
59f00321 2097 SAVE_DEFSV;
3b0527fe 2098 DEFSV = reginfo->sv;
b8c5462f 2099 }
d6a28714 2100
3b0527fe
DM
2101 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2102 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2103 /* prepare for quick setting of pos */
d300d9fa
NC
2104#ifdef PERL_OLD_COPY_ON_WRITE
2105 if (SvIsCOW(sv))
2106 sv_force_normal_flags(sv, 0);
2107#endif
3dab1dad 2108 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2109 &PL_vtbl_mglob, NULL, 0);
d6a28714 2110 mg->mg_len = -1;
b8c5462f 2111 }
d6a28714
JH
2112 PL_reg_magic = mg;
2113 PL_reg_oldpos = mg->mg_len;
4f639d21 2114 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2115 }
09687e5a 2116 if (!PL_reg_curpm) {
a02a5408 2117 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2118#ifdef USE_ITHREADS
2119 {
be8e71aa 2120 SV* const repointer = newSViv(0);
577e12cc 2121 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2122 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2123 av_push(PL_regex_padav,repointer);
2124 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2125 PL_regex_pad = AvARRAY(PL_regex_padav);
2126 }
2127#endif
2128 }
aaa362c4 2129 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2130 PL_reg_oldcurpm = PL_curpm;
2131 PL_curpm = PL_reg_curpm;
2132 if (RX_MATCH_COPIED(prog)) {
2133 /* Here is a serious problem: we cannot rewrite subbeg,
2134 since it may be needed if this match fails. Thus
2135 $` inside (?{}) could fail... */
2136 PL_reg_oldsaved = prog->subbeg;
2137 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2138#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2139 PL_nrs = prog->saved_copy;
2140#endif
d6a28714
JH
2141 RX_MATCH_COPIED_off(prog);
2142 }
2143 else
bd61b366 2144 PL_reg_oldsaved = NULL;
d6a28714
JH
2145 prog->subbeg = PL_bostr;
2146 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2147 }
973dddac 2148 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2149 PL_reginput = startpos;
2150 PL_regstartp = prog->startp;
2151 PL_regendp = prog->endp;
2152 PL_reglastparen = &prog->lastparen;
a01268b5 2153 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2154 prog->lastparen = 0;
03994de8 2155 prog->lastcloseparen = 0;
d6a28714 2156 PL_regsize = 0;
a3621e74 2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2158 if (PL_reg_start_tmpl <= prog->nparens) {
2159 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2160 if(PL_reg_start_tmp)
2161 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2162 else
a02a5408 2163 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2164 }
2165
2166 /* XXXX What this code is doing here?!!! There should be no need
2167 to do this again and again, PL_reglastparen should take care of
3dd2943c 2168 this! --ilya*/
dafc8851
JH
2169
2170 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2171 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2172 * PL_reglastparen), is not needed at all by the test suite
2173 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2174 * enough, for building DynaLoader, or otherwise this
2175 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2176 * will happen. Meanwhile, this code *is* needed for the
2177 * above-mentioned test suite tests to succeed. The common theme
2178 * on those tests seems to be returning null fields from matches.
2179 * --jhi */
dafc8851 2180#if 1
d6a28714
JH
2181 sp = prog->startp;
2182 ep = prog->endp;
2183 if (prog->nparens) {
097eb12c 2184 register I32 i;
eb160463 2185 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2186 *++sp = -1;
2187 *++ep = -1;
2188 }
2189 }
dafc8851 2190#endif
02db2b7b 2191 REGCP_SET(lastcp);
3b0527fe 2192 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2193 prog->endp[0] = PL_reginput - PL_bostr;
2194 return 1;
2195 }
02db2b7b 2196 REGCP_UNWIND(lastcp);
d6a28714
JH
2197 return 0;
2198}
2199
02db2b7b 2200
8ba1375e
MJD
2201#define sayYES goto yes
2202#define sayNO goto no
e0f9d4a8 2203#define sayNO_ANYOF goto no_anyof
8ba1375e 2204#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2205#define sayNO_FINAL goto no_final
2206#define sayNO_SILENT goto do_no
2207#define saySAME(x) if (x) goto yes; else goto no
2208
3ab3c9b4 2209#define CACHEsayNO STMT_START { \
3298f257
DM
2210 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2211 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2212 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2213 sayNO; \
2214} STMT_END
2215
3298f257 2216
a3621e74
YO
2217/* this is used to determine how far from the left messages like
2218 'failed...' are printed. Currently 29 makes these messages line
2219 up with the opcode they refer to. Earlier perls used 25 which
2220 left these messages outdented making reviewing a debug output
2221 quite difficult.
2222*/
2223#define REPORT_CODE_OFF 29
2224
2225
2226/* Make sure there is a test for this +1 options in re_tests */
2227#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2228
40a82448
DM
2229#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2230#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2231
86545054
DM
2232#define SLAB_FIRST(s) (&(s)->states[0])
2233#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2234
5d9a96ca
DM
2235/* grab a new slab and return the first slot in it */
2236
2237STATIC regmatch_state *
2238S_push_slab(pTHX)
2239{
54df2634
NC
2240#if PERL_VERSION < 9
2241 dMY_CXT;
2242#endif
5d9a96ca
DM
2243 regmatch_slab *s = PL_regmatch_slab->next;
2244 if (!s) {
2245 Newx(s, 1, regmatch_slab);
2246 s->prev = PL_regmatch_slab;
2247 s->next = NULL;
2248 PL_regmatch_slab->next = s;
2249 }
2250 PL_regmatch_slab = s;
86545054 2251 return SLAB_FIRST(s);
5d9a96ca 2252}
5b47454d 2253
95b24440
DM
2254/* simulate a recursive call to regmatch */
2255
2256#define REGMATCH(ns, where) \
5d9a96ca
DM
2257 st->scan = scan; \
2258 scan = (ns); \
2259 st->resume_state = resume_##where; \
95b24440
DM
2260 goto start_recurse; \
2261 resume_point_##where:
2262
40a82448
DM
2263/* push a new state then goto it */
2264
2265#define PUSH_STATE_GOTO(state, node) \
2266 scan = node; \
2267 st->resume_state = state; \
2268 goto push_state;
2269
2270/* push a new state with success backtracking, then goto it */
2271
2272#define PUSH_YES_STATE_GOTO(state, node) \
2273 scan = node; \
2274 st->resume_state = state; \
2275 goto push_yes_state;
2276
aa283a38 2277
aa283a38 2278
d6a28714
JH
2279/*
2280 - regmatch - main matching routine
2281 *
2282 * Conceptually the strategy is simple: check to see whether the current
2283 * node matches, call self recursively to see whether the rest matches,
2284 * and then act accordingly. In practice we make some effort to avoid
2285 * recursion, in particular by going through "ordinary" nodes (that don't
2286 * need to know whether the rest of the match failed) by a loop instead of
2287 * by recursion.
2288 */
2289/* [lwall] I've hoisted the register declarations to the outer block in order to
2290 * maybe save a little bit of pushing and popping on the stack. It also takes
2291 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2292 *
2293 * This function used to be heavily recursive, but since this had the
2294 * effect of blowing the CPU stack on complex regexes, it has been
2295 * restructured to be iterative, and to save state onto the heap rather
2296 * than the stack. Essentially whereever regmatch() used to be called, it
2297 * pushes the current state, notes where to return, then jumps back into
2298 * the main loop.
2299 *
2300 * Originally the structure of this function used to look something like
2301
2302 S_regmatch() {
2303 int a = 1, b = 2;
2304 ...
2305 while (scan != NULL) {
5d9a96ca 2306 a++; // do stuff with a and b
95b24440
DM
2307 ...
2308 switch (OP(scan)) {
2309 case FOO: {
2310 int local = 3;
2311 ...
2312 if (regmatch(...)) // recurse
2313 goto yes;
2314 }
2315 ...
2316 }
2317 }
2318 yes:
2319 return 1;
2320 }
2321
2322 * Now it looks something like this:
2323
5d9a96ca 2324 typedef struct {
95b24440
DM
2325 int a, b, local;
2326 int resume_state;
5d9a96ca 2327 } regmatch_state;
95b24440
DM
2328
2329 S_regmatch() {
5d9a96ca
DM
2330 regmatch_state *st = new();
2331 int depth=0;
2332 st->a++; // do stuff with a and b
95b24440
DM
2333 ...
2334 while (scan != NULL) {
2335 ...
2336 switch (OP(scan)) {
2337 case FOO: {
5d9a96ca 2338 st->local = 3;
95b24440 2339 ...
5d9a96ca
DM
2340 st->scan = scan;
2341 scan = ...;
2342 st->resume_state = resume_FOO;
2343 goto start_recurse; // recurse
95b24440 2344
5d9a96ca
DM
2345 resume_point_FOO:
2346 if (result)
95b24440
DM
2347 goto yes;
2348 }
2349 ...
2350 }
5d9a96ca
DM
2351 start_recurse:
2352 st = new(); push a new state
2353 st->a = 1; st->b = 2;
2354 depth++;
95b24440 2355 }
5d9a96ca 2356 yes:
95b24440 2357 result = 1;
5d9a96ca
DM
2358 if (depth--) {
2359 st = pop();
95b24440
DM
2360 switch (resume_state) {
2361 case resume_FOO:
2362 goto resume_point_FOO;
2363 ...
2364 }
2365 }
2366 return result
2367 }
2368
2369 * WARNING: this means that any line in this function that contains a
2370 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2371 * regmatch() using gotos instead. Thus the values of any local variables
2372 * not saved in the regmatch_state structure will have been lost when
2373 * execution resumes on the next line .
5d9a96ca
DM
2374 *
2375 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2376 * PL_regmatch_state always points to the currently active state, and
2377 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2378 * The first time regmatch is called, the first slab is allocated, and is
2379 * never freed until interpreter desctruction. When the slab is full,
2380 * a new one is allocated chained to the end. At exit from regmatch, slabs
2381 * allocated since entry are freed.
d6a28714 2382 */
95b24440 2383
40a82448 2384/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2385#define TRIE_next (REGNODE_MAX+1)
2386#define TRIE_next_fail (REGNODE_MAX+2)
faec1544
DM
2387#define EVAL_AB (REGNODE_MAX+3)
2388#define EVAL_AB_fail (REGNODE_MAX+4)
c255a977
DM
2389#define resume_CURLYX (REGNODE_MAX+5)
2390#define resume_WHILEM1 (REGNODE_MAX+6)
2391#define resume_WHILEM2 (REGNODE_MAX+7)
2392#define resume_WHILEM3 (REGNODE_MAX+8)
2393#define resume_WHILEM4 (REGNODE_MAX+9)
2394#define resume_WHILEM5 (REGNODE_MAX+10)
2395#define resume_WHILEM6 (REGNODE_MAX+11)
2396#define BRANCH_next (REGNODE_MAX+12)
2397#define BRANCH_next_fail (REGNODE_MAX+13)
2398#define CURLYM_A (REGNODE_MAX+14)
2399#define CURLYM_A_fail (REGNODE_MAX+15)
2400#define CURLYM_B (REGNODE_MAX+16)
2401#define CURLYM_B_fail (REGNODE_MAX+17)
2402#define IFMATCH_A (REGNODE_MAX+18)
2403#define IFMATCH_A_fail (REGNODE_MAX+19)
2404#define CURLY_B_min_known (REGNODE_MAX+20)
2405#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2406#define CURLY_B_min (REGNODE_MAX+22)
2407#define CURLY_B_min_fail (REGNODE_MAX+23)
2408#define CURLY_B_max (REGNODE_MAX+24)
2409#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2410
2411
3dab1dad 2412#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2413
3df15adc 2414#ifdef DEBUGGING
ab3bbdeb
YO
2415STATIC void
2416S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2417 const char *start, const char *end, const char *blurb)
2418{
2419 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2420 if (!PL_colorset)
2421 reginitcolors();
2422 {
2423 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2424 prog->precomp, prog->prelen, 60);
2425
2426 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2427 start, end - start, 60);
2428
2429 PerlIO_printf(Perl_debug_log,
2430 "%s%s REx%s %s against %s\n",
2431 PL_colors[4], blurb, PL_colors[5], s0, s1);
2432
2433 if (do_utf8||utf8_pat)
1de06328
YO
2434 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2435 utf8_pat ? "pattern" : "",
2436 utf8_pat && do_utf8 ? " and " : "",
2437 do_utf8 ? "string" : ""
ab3bbdeb
YO
2438 );
2439 }
2440}
3df15adc
YO
2441
2442STATIC void
786e8c11
YO
2443S_dump_exec_pos(pTHX_ const char *locinput,
2444 const regnode *scan,
2445 const char *loc_regeol,
2446 const char *loc_bostr,
2447 const char *loc_reg_starttry,
2448 const bool do_utf8)
07be1b83 2449{
786e8c11 2450 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2451 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2452 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2453 /* The part of the string before starttry has one color
2454 (pref0_len chars), between starttry and current
2455 position another one (pref_len - pref0_len chars),
2456 after the current position the third one.
2457 We assume that pref0_len <= pref_len, otherwise we
2458 decrease pref0_len. */
786e8c11
YO
2459 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2460 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2461 int pref0_len;
2462
2463 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2464 pref_len++;
786e8c11
YO
2465 pref0_len = pref_len - (locinput - loc_reg_starttry);
2466 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2467 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2468 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2469 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2470 l--;
2471 if (pref0_len < 0)
2472 pref0_len = 0;
2473 if (pref0_len > pref_len)
2474 pref0_len = pref_len;
2475 {
3df15adc 2476 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2477
ab3bbdeb 2478 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2479 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2480
ab3bbdeb 2481 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2482 (locinput - pref_len + pref0_len),
1de06328 2483 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2484
ab3bbdeb 2485 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2486 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2487
1de06328 2488 const STRLEN tlen=len0+len1+len2;
3df15adc 2489 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2490 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2491 (IV)(locinput - loc_bostr),
07be1b83 2492 len0, s0,
07be1b83 2493 len1, s1,
07be1b83 2494 (docolor ? "" : "> <"),
07be1b83 2495 len2, s2,
1de06328 2496 tlen > 19 ? 0 : 19 - tlen,
07be1b83
YO
2497 "");
2498 }
2499}
3df15adc 2500
07be1b83
YO
2501#endif
2502
d6a28714 2503STATIC I32 /* 0 failure, 1 success */
3b0527fe 2504S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2505{
54df2634
NC
2506#if PERL_VERSION < 9
2507 dMY_CXT;
2508#endif
27da23d5 2509 dVAR;
95b24440 2510 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2511 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2512
3b0527fe
DM
2513 regexp *rex = reginfo->prog;
2514
5d9a96ca
DM
2515 regmatch_slab *orig_slab;
2516 regmatch_state *orig_state;
a3621e74 2517
5d9a96ca
DM
2518 /* the current state. This is a cached copy of PL_regmatch_state */
2519 register regmatch_state *st;
95b24440 2520
5d9a96ca
DM
2521 /* cache heavy used fields of st in registers */
2522 register regnode *scan;
2523 register regnode *next;
2524 register I32 n = 0; /* initialize to shut up compiler warning */
2525 register char *locinput = PL_reginput;
95b24440 2526
5d9a96ca
DM
2527 /* these variables are NOT saved during a recusive RFEGMATCH: */
2528 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2529 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2530 int depth = 0; /* depth of recursion */
77cb431f
DM
2531 regmatch_state *yes_state = NULL; /* state to pop to on success of
2532 subpattern */
faec1544 2533 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2534 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2535 U32 state_num;
95b24440 2536
10edeb5d
JH
2537 I32 parenfloor = 0;
2538
95b24440 2539#ifdef DEBUGGING
e68ec53f 2540 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2541 PL_regindent++;
2542#endif
2543
5d9a96ca
DM
2544 /* on first ever call to regmatch, allocate first slab */
2545 if (!PL_regmatch_slab) {
2546 Newx(PL_regmatch_slab, 1, regmatch_slab);
2547 PL_regmatch_slab->prev = NULL;
2548 PL_regmatch_slab->next = NULL;
86545054 2549 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2550 }
2551
2552 /* remember current high-water mark for exit */
2553 /* XXX this should be done with SAVE* instead */
2554 orig_slab = PL_regmatch_slab;
2555 orig_state = PL_regmatch_state;
2556
2557 /* grab next free state slot */
2558 st = ++PL_regmatch_state;
86545054 2559 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2560 st = PL_regmatch_state = S_push_slab(aTHX);
2561
2562 st->minmod = 0;
2563 st->sw = 0;
2564 st->logical = 0;
b8591aee 2565 cur_curlyx = NULL;
786e8c11 2566
d6a28714
JH
2567 /* Note that nextchr is a byte even in UTF */
2568 nextchr = UCHARAT(locinput);
2569 scan = prog;
2570 while (scan != NULL) {
8ba1375e 2571
a3621e74 2572 DEBUG_EXECUTE_r( {
6136c704 2573 SV * const prop = sv_newmortal();
1de06328 2574 regnode *rnext=regnext(scan);
786e8c11 2575 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2576 regprop(rex, prop, scan);
07be1b83
YO
2577
2578 PerlIO_printf(Perl_debug_log,
2579 "%3"IVdf":%*s%s(%"IVdf")\n",
2580 (IV)(scan - rex->program), PL_regindent*2, "",
2581 SvPVX_const(prop),
1de06328
YO
2582 (PL_regkind[OP(scan)] == END || !rnext) ?
2583 0 : (IV)(rnext - rex->program));
2a782b5b 2584 });
d6a28714
JH
2585
2586 next = scan + NEXT_OFF(scan);
2587 if (next == scan)
2588 next = NULL;
40a82448 2589 state_num = OP(scan);
d6a28714 2590
40a82448
DM
2591 reenter_switch:
2592 switch (state_num) {
d6a28714 2593 case BOL:
7fba1cd6 2594 if (locinput == PL_bostr)
d6a28714 2595 {
3b0527fe 2596 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2597 break;
2598 }
d6a28714
JH
2599 sayNO;
2600 case MBOL:
12d33761
HS
2601 if (locinput == PL_bostr ||
2602 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2603 {
b8c5462f
JH
2604 break;
2605 }
d6a28714
JH
2606 sayNO;
2607 case SBOL:
c2a73568 2608 if (locinput == PL_bostr)
b8c5462f 2609 break;
d6a28714
JH
2610 sayNO;
2611 case GPOS:
3b0527fe 2612 if (locinput == reginfo->ganch)
d6a28714
JH
2613 break;
2614 sayNO;
2615 case EOL:
d6a28714
JH
2616 goto seol;
2617 case MEOL:
d6a28714 2618 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2619 sayNO;
b8c5462f 2620 break;
d6a28714
JH
2621 case SEOL:
2622 seol:
2623 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2624 sayNO;
d6a28714 2625 if (PL_regeol - locinput > 1)
b8c5462f 2626 sayNO;
b8c5462f 2627 break;
d6a28714
JH
2628 case EOS:
2629 if (PL_regeol != locinput)
b8c5462f 2630 sayNO;
d6a28714 2631 break;
ffc61ed2 2632 case SANY:
d6a28714 2633 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2634 sayNO;
f33976b4
DB
2635 if (do_utf8) {
2636 locinput += PL_utf8skip[nextchr];
2637 if (locinput > PL_regeol)
2638 sayNO;
2639 nextchr = UCHARAT(locinput);
2640 }
2641 else
2642 nextchr = UCHARAT(++locinput);
2643 break;
2644 case CANY:
2645 if (!nextchr && locinput >= PL_regeol)
2646 sayNO;
b8c5462f 2647 nextchr = UCHARAT(++locinput);
a0d0e21e 2648 break;
ffc61ed2 2649 case REG_ANY:
1aa99e6b
IH
2650 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2651 sayNO;
2652 if (do_utf8) {
b8c5462f 2653 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2654 if (locinput > PL_regeol)
2655 sayNO;
a0ed51b3 2656 nextchr = UCHARAT(locinput);
a0ed51b3 2657 }
1aa99e6b
IH
2658 else
2659 nextchr = UCHARAT(++locinput);
a0ed51b3 2660 break;
166ba7cd
DM
2661
2662#undef ST
2663#define ST st->u.trie
786e8c11
YO
2664 case TRIEC:
2665 /* In this case the charclass data is available inline so
2666 we can fail fast without a lot of extra overhead.
2667 */
2668 if (scan->flags == EXACT || !do_utf8) {
2669 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2670 DEBUG_EXECUTE_r(
2671 PerlIO_printf(Perl_debug_log,
2672 "%*s %sfailed to match trie start class...%s\n",
2673 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2674 );
2675 sayNO_SILENT;
2676 /* NOTREACHED */
2677 }
2678 }
2679 /* FALL THROUGH */
5b47454d 2680 case TRIE:
3dab1dad 2681 {
07be1b83 2682 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2683 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2684 trie_type = do_utf8 ?
2685 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2686 : trie_plain;
2687
2688 /* what trie are we using right now */
be8e71aa 2689 reg_trie_data * const trie
3dab1dad
YO
2690 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2691 U32 state = trie->startstate;
166ba7cd 2692
3dab1dad
YO
2693 if (trie->bitmap && trie_type != trie_utf8_fold &&
2694 !TRIE_BITMAP_TEST(trie,*locinput)
2695 ) {
2696 if (trie->states[ state ].wordnum) {
2697 DEBUG_EXECUTE_r(
2698 PerlIO_printf(Perl_debug_log,
2699 "%*s %smatched empty string...%s\n",
2700 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2701 );
2702 break;
2703 } else {
2704 DEBUG_EXECUTE_r(
2705 PerlIO_printf(Perl_debug_log,
786e8c11 2706 "%*s %sfailed to match trie start class...%s\n",
3dab1dad
YO
2707 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2708 );
2709 sayNO_SILENT;
2710 }
2711 }
166ba7cd 2712
786e8c11
YO
2713 {
2714 U8 *uc = ( U8* )locinput;
2715
2716 STRLEN len = 0;
2717 STRLEN foldlen = 0;
2718 U8 *uscan = (U8*)NULL;
2719 STRLEN bufflen=0;
2720 SV *sv_accept_buff = NULL;
2721 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2722
2723 ST.accepted = 0; /* how many accepting states we have seen */
2724 ST.B = next;
2725 ST.jump = trie->jump;
2726
2727#ifdef DEBUGGING
2728 ST.me = scan;
2729#endif
2730
2731
2732
07be1b83
YO
2733 /*
2734 traverse the TRIE keeping track of all accepting states
2735 we transition through until we get to a failing node.
2736 */
2737
a3621e74 2738 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11
YO
2739 U32 base = trie->states[ state ].trans.base;
2740 UV uvc;
2741 U16 charid;
2742 /* We use charid to hold the wordnum as we don't use it
2743 for charid until after we have done the wordnum logic.
2744 We define an alias just so that the wordnum logic reads
2745 more naturally. */
2746
2747#define got_wordnum charid
2748 got_wordnum = trie->states[ state ].wordnum;
2749
2750 if ( got_wordnum ) {
2751 if ( ! ST.accepted ) {
5b47454d
DM
2752 ENTER;
2753 SAVETMPS;
2754 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2755 sv_accept_buff=newSV(bufflen *
2756 sizeof(reg_trie_accepted) - 1);
786e8c11 2757 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2758 SvPOK_on(sv_accept_buff);
2759 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2760 SAVETMPS;
2761 ST.accept_buff =
5b47454d
DM
2762 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2763 }
786e8c11 2764 do {
166ba7cd 2765 if (ST.accepted >= bufflen) {
5b47454d 2766 bufflen *= 2;
166ba7cd 2767 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2768 SvGROW(sv_accept_buff,
2769 bufflen * sizeof(reg_trie_accepted));
2770 }
2771 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2772 + sizeof(reg_trie_accepted));
a3621e74 2773
786e8c11
YO
2774
2775 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2776 ST.accept_buff[ST.accepted].endpos = uc;
2777 ++ST.accepted;
2778 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2779 }
2780#undef got_wordnum
a3621e74 2781
07be1b83 2782 DEBUG_TRIE_EXECUTE_r({
786e8c11 2783 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2784 PerlIO_printf( Perl_debug_log,
786e8c11 2785 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
07be1b83 2786 2+PL_regindent * 2, "", PL_colors[4],
786e8c11 2787 (UV)state, (UV)ST.accepted );
07be1b83 2788 });
a3621e74
YO
2789
2790 if ( base ) {
4cadc6a9
YO
2791 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2792 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2793
5b47454d
DM
2794 if (charid &&
2795 (base + charid > trie->uniquecharcount )
2796 && (base + charid - 1 - trie->uniquecharcount
2797 < trie->lasttrans)
2798 && trie->trans[base + charid - 1 -
2799 trie->uniquecharcount].check == state)
2800 {
2801 state = trie->trans[base + charid - 1 -
2802 trie->uniquecharcount ].next;
2803 }
2804 else {
2805 state = 0;
2806 }
2807 uc += len;
2808
2809 }
2810 else {
a3621e74
YO
2811 state = 0;
2812 }
2813 DEBUG_TRIE_EXECUTE_r(
e4584336 2814 PerlIO_printf( Perl_debug_log,
786e8c11 2815 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2816 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2817 );
2818 }
166ba7cd 2819 if (!ST.accepted )
a3621e74 2820 sayNO;
a3621e74 2821
166ba7cd
DM
2822 DEBUG_EXECUTE_r(
2823 PerlIO_printf( Perl_debug_log,
2824 "%*s %sgot %"IVdf" possible matches%s\n",
2825 REPORT_CODE_OFF + PL_regindent * 2, "",
2826 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2827 );
786e8c11 2828 }}
166ba7cd
DM
2829
2830 /* FALL THROUGH */
2831
2832 case TRIE_next_fail: /* we failed - try next alterative */
2833
2834 if ( ST.accepted == 1 ) {
2835 /* only one choice left - just continue */
2836 DEBUG_EXECUTE_r({
2837 reg_trie_data * const trie
2838 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2839 SV ** const tmp = RX_DEBUG(reginfo->prog)
2840 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2841 : NULL;
2842 PerlIO_printf( Perl_debug_log,
2843 "%*s %sonly one match left: #%d <%s>%s\n",
2844 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2845 ST.accept_buff[ 0 ].wordnum,
2846 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2847 PL_colors[5] );
2848 });
2849 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2850 /* in this case we free tmps/leave before we call regmatch
2851 as we wont be using accept_buff again. */
2852 FREETMPS;
2853 LEAVE;
2854 locinput = PL_reginput;
2855 nextchr = UCHARAT(locinput);
786e8c11
YO
2856
2857 if ( !ST.jump )
2858 scan = ST.B;
2859 else
2860 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2861
166ba7cd
DM
2862 continue; /* execute rest of RE */
2863 }
2864
2865 if (!ST.accepted-- ) {
2866 FREETMPS;
2867 LEAVE;
2868 sayNO;
2869 }
2870
a3621e74 2871 /*
166ba7cd
DM
2872 There are at least two accepting states left. Presumably
2873 the number of accepting states is going to be low,
2874 typically two. So we simply scan through to find the one
2875 with lowest wordnum. Once we find it, we swap the last
2876 state into its place and decrement the size. We then try to
2877 match the rest of the pattern at the point where the word
2878 ends. If we succeed, control just continues along the
2879 regex; if we fail we return here to try the next accepting
2880 state
2881 */
a3621e74 2882
166ba7cd
DM
2883 {
2884 U32 best = 0;
2885 U32 cur;
2886 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2887 DEBUG_TRIE_EXECUTE_r(
f2278c82 2888 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2889 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2890 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2891 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2892 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2893 );
2894
2895 if (ST.accept_buff[cur].wordnum <
2896 ST.accept_buff[best].wordnum)
2897 best = cur;
a3621e74 2898 }
166ba7cd
DM
2899
2900 DEBUG_EXECUTE_r({
2901 reg_trie_data * const trie
2902 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2903 SV ** const tmp = RX_DEBUG(reginfo->prog)
2904 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2905 : NULL;
2906 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2907 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2908 ST.accept_buff[best].wordnum,
2909 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2910 PL_colors[5] );
2911 });
2912
2913 if ( best<ST.accepted ) {
2914 reg_trie_accepted tmp = ST.accept_buff[ best ];
2915 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2916 ST.accept_buff[ ST.accepted ] = tmp;
2917 best = ST.accepted;
a3621e74 2918 }
166ba7cd 2919 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2920 if ( !ST.jump ) {
2921 PUSH_STATE_GOTO(TRIE_next, ST.B);
2922 /* NOTREACHED */
2923 } else {
2924 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2925 /* NOTREACHED */
2926 }
2927 /* NOTREACHED */
166ba7cd 2928 }
166ba7cd
DM
2929 /* NOTREACHED */
2930
2931#undef ST
2932
95b24440
DM
2933 case EXACT: {
2934 char *s = STRING(scan);
5d9a96ca 2935 st->ln = STR_LEN(scan);
eb160463 2936 if (do_utf8 != UTF) {
bc517b45 2937 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2938 char *l = locinput;
be8e71aa 2939 const char * const e = s + st->ln;
a72c7584 2940
5ff6fc6d
JH
2941 if (do_utf8) {
2942 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2943 while (s < e) {
a3b680e6 2944 STRLEN ulen;
1aa99e6b 2945 if (l >= PL_regeol)
5ff6fc6d
JH
2946 sayNO;
2947 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2948 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2949 uniflags))
5ff6fc6d 2950 sayNO;
bc517b45 2951 l += ulen;
5ff6fc6d 2952 s ++;
1aa99e6b 2953 }
5ff6fc6d
JH
2954 }
2955 else {
2956 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2957 while (s < e) {
a3b680e6 2958 STRLEN ulen;
1aa99e6b
IH
2959 if (l >= PL_regeol)
2960 sayNO;
5ff6fc6d 2961 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2962 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2963 uniflags))
1aa99e6b 2964 sayNO;
bc517b45 2965 s += ulen;
a72c7584 2966 l ++;
1aa99e6b 2967 }
5ff6fc6d 2968 }
1aa99e6b
IH
2969 locinput = l;
2970 nextchr = UCHARAT(locinput);
2971 break;
2972 }
bc517b45 2973 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2974 /* Inline the first character, for speed. */
2975 if (UCHARAT(s) != nextchr)
2976 sayNO;
5d9a96ca 2977 if (PL_regeol - locinput < st->ln)
d6a28714 2978 sayNO;
5d9a96ca 2979 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2980 sayNO;
5d9a96ca 2981 locinput += st->ln;
d6a28714
JH
2982 nextchr = UCHARAT(locinput);
2983 break;
95b24440 2984 }
d6a28714 2985 case EXACTFL:
b8c5462f
JH
2986 PL_reg_flags |= RF_tainted;
2987 /* FALL THROUGH */
95b24440 2988 case EXACTF: {
be8e71aa 2989 char * const s = STRING(scan);
5d9a96ca 2990 st->ln = STR_LEN(scan);
d6a28714 2991
d07ddd77
JH
2992 if (do_utf8 || UTF) {
2993 /* Either target or the pattern are utf8. */
be8e71aa 2994 const char * const l = locinput;
d07ddd77 2995 char *e = PL_regeol;
bc517b45 2996
5d9a96ca 2997 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2998 l, &e, 0, do_utf8)) {
5486206c
JH
2999 /* One more case for the sharp s:
3000 * pack("U0U*", 0xDF) =~ /ss/i,
3001 * the 0xC3 0x9F are the UTF-8
3002 * byte sequence for the U+00DF. */
3003 if (!(do_utf8 &&
3004 toLOWER(s[0]) == 's' &&
5d9a96ca 3005 st->ln >= 2 &&
5486206c
JH
3006 toLOWER(s[1]) == 's' &&
3007 (U8)l[0] == 0xC3 &&
3008 e - l >= 2 &&
3009 (U8)l[1] == 0x9F))
3010 sayNO;
3011 }
d07ddd77
JH
3012 locinput = e;
3013 nextchr = UCHARAT(locinput);
3014 break;
a0ed51b3 3015 }
d6a28714 3016
bc517b45
JH
3017 /* Neither the target and the pattern are utf8. */
3018
d6a28714
JH
3019 /* Inline the first character, for speed. */
3020 if (UCHARAT(s) != nextchr &&
3021 UCHARAT(s) != ((OP(scan) == EXACTF)
3022 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3023 sayNO;
5d9a96ca 3024 if (PL_regeol - locinput < st->ln)
b8c5462f 3025 sayNO;
5d9a96ca
DM
3026 if (st->ln > 1 && (OP(scan) == EXACTF
3027 ? ibcmp(s, locinput, st->ln)
3028 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3029 sayNO;
5d9a96ca 3030 locinput += st->ln;
d6a28714 3031 nextchr = UCHARAT(locinput);
a0d0e21e 3032 break;
95b24440 3033 }
d6a28714 3034 case ANYOF:
ffc61ed2 3035 if (do_utf8) {
9e55ce06
JH
3036 STRLEN inclasslen = PL_regeol - locinput;
3037
32fc9b6a 3038 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3039 sayNO_ANYOF;
ffc61ed2
JH
3040 if (locinput >= PL_regeol)
3041 sayNO;
0f0076b4 3042 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3043 nextchr = UCHARAT(locinput);
e0f9d4a8 3044 break;
ffc61ed2
JH
3045 }
3046 else {
3047 if (nextchr < 0)
3048 nextchr = UCHARAT(locinput);
32fc9b6a 3049 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3050 sayNO_ANYOF;
ffc61ed2
JH
3051 if (!nextchr && locinput >= PL_regeol)
3052 sayNO;
3053 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3054 break;
3055 }
3056 no_anyof:
3057 /* If we might have the case of the German sharp s
3058 * in a casefolding Unicode character class. */
3059
ebc501f0
JH
3060 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3061 locinput += SHARP_S_SKIP;
e0f9d4a8 3062 nextchr = UCHARAT(locinput);
ffc61ed2 3063 }
e0f9d4a8
JH
3064 else
3065 sayNO;
b8c5462f 3066 break;
d6a28714 3067 case ALNUML:
b8c5462f
JH
3068 PL_reg_flags |= RF_tainted;
3069 /* FALL THROUGH */
d6a28714 3070 case ALNUM:
b8c5462f 3071 if (!nextchr)
4633a7c4 3072 sayNO;
ffc61ed2 3073 if (do_utf8) {
1a4fad37 3074 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3075 if (!(OP(scan) == ALNUM
bb7a0f54 3076 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3077 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3078 {
3079 sayNO;
a0ed51b3 3080 }
b8c5462f 3081 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3082 nextchr = UCHARAT(locinput);
3083 break;
3084 }
ffc61ed2 3085 if (!(OP(scan) == ALNUM
d6a28714 3086 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3087 sayNO;
b8c5462f 3088 nextchr = UCHARAT(++locinput);
a0d0e21e 3089 break;
d6a28714 3090 case NALNUML:
b8c5462f
JH
3091 PL_reg_flags |= RF_tainted;
3092 /* FALL THROUGH */
d6a28714
JH
3093 case NALNUM:
3094 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3095 sayNO;
ffc61ed2 3096 if (do_utf8) {
1a4fad37 3097 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3098 if (OP(scan) == NALNUM
bb7a0f54 3099 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3100 : isALNUM_LC_utf8((U8*)locinput))
3101 {
b8c5462f 3102 sayNO;
d6a28714 3103 }
b8c5462f
JH
3104 locinput += PL_utf8skip[nextchr];
3105 nextchr = UCHARAT(locinput);
3106 break;
3107 }
ffc61ed2 3108 if (OP(scan) == NALNUM
d6a28714 3109 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3110 sayNO;
76e3520e 3111 nextchr = UCHARAT(++locinput);
a0d0e21e 3112 break;
d6a28714
JH
3113 case BOUNDL:
3114 case NBOUNDL:
3280af22 3115 PL_reg_flags |= RF_tainted;
bbce6d69 3116 /* FALL THROUGH */
d6a28714
JH
3117 case BOUND:
3118 case NBOUND:
3119 /* was last char in word? */
ffc61ed2 3120 if (do_utf8) {
12d33761 3121 if (locinput == PL_bostr)
5d9a96ca 3122 st->ln = '\n';
ffc61ed2 3123 else {
a3b680e6 3124 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3125
4ad0818d 3126 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3127 }
3128 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3129 st->ln = isALNUM_uni(st->ln);
1a4fad37 3130 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3131 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3132 }
3133 else {
5d9a96ca 3134 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3135 n = isALNUM_LC_utf8((U8*)locinput);
3136 }
a0ed51b3 3137 }
d6a28714 3138 else {
5d9a96ca 3139 st->ln = (locinput != PL_bostr) ?
12d33761 3140 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3141 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3142 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3143 n = isALNUM(nextchr);
3144 }
3145 else {
5d9a96ca 3146 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3147 n = isALNUM_LC(nextchr);
3148 }
d6a28714 3149 }
5d9a96ca 3150 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3151 OP(scan) == BOUNDL))
3152 sayNO;
a0ed51b3 3153 break;
d6a28714 3154 case SPACEL:
3280af22 3155 PL_reg_flags |= RF_tainted;
bbce6d69 3156 /* FALL THROUGH */
d6a28714 3157 case SPACE:
9442cb0e 3158 if (!nextchr)
4633a7c4 3159 sayNO;
1aa99e6b 3160 if (do_utf8) {
fd400ab9 3161 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3162 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3163 if (!(OP(scan) == SPACE
bb7a0f54 3164 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3165 : isSPACE_LC_utf8((U8*)locinput)))
3166 {
3167 sayNO;
3168 }
3169 locinput += PL_utf8skip[nextchr];
3170 nextchr = UCHARAT(locinput);
3171 break;
d6a28714 3172 }
ffc61ed2
JH
3173 if (!(OP(scan) == SPACE
3174 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3175 sayNO;
3176 nextchr = UCHARAT(++locinput);
3177 }
3178 else {
3179 if (!(OP(scan) == SPACE
3180 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3181 sayNO;
3182 nextchr = UCHARAT(++locinput);
a0ed51b3 3183 }
a0ed51b3 3184 break;
d6a28714 3185 case NSPACEL:
3280af22 3186 PL_reg_flags |= RF_tainted;
bbce6d69 3187 /* FALL THROUGH */
d6a28714 3188 case NSPACE:
9442cb0e 3189 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3190 sayNO;
1aa99e6b 3191 if (do_utf8) {
1a4fad37 3192 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3193 if (OP(scan) == NSPACE
bb7a0f54 3194 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3195 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3196 {
3197 sayNO;
3198 }
3199 locinput += PL_utf8skip[nextchr];
3200 nextchr = UCHARAT(locinput);
3201 break;
a0ed51b3 3202 }
ffc61ed2 3203 if (OP(scan) == NSPACE
d6a28714 3204 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3205 sayNO;
76e3520e 3206 nextchr = UCHARAT(++locinput);
a0d0e21e 3207 break;
d6a28714 3208 case DIGITL:
a0ed51b3
LW
3209 PL_reg_flags |= RF_tainted;
3210 /* FALL THROUGH */
d6a28714 3211 case DIGIT:
9442cb0e 3212 if (!nextchr)
a0ed51b3 3213 sayNO;
1aa99e6b 3214 if (do_utf8) {
1a4fad37 3215 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3216 if (!(OP(scan) == DIGIT
bb7a0f54 3217 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3218 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3219 {
a0ed51b3 3220 sayNO;
dfe13c55 3221 }
6f06b55f 3222 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3223 nextchr = UCHARAT(locinput);
3224 break;
3225 }
ffc61ed2 3226 if (!(OP(scan) == DIGIT
9442cb0e 3227 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3228 sayNO;
3229 nextchr = UCHARAT(++locinput);
3230 break;
d6a28714 3231 case NDIGITL:
b8c5462f
JH
3232 PL_reg_flags |= RF_tainted;
3233 /* FALL THROUGH */
d6a28714 3234 case NDIGIT:
9442cb0e 3235 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3236 sayNO;
1aa99e6b 3237 if (do_utf8) {
1a4fad37 3238 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3239 if (OP(scan) == NDIGIT
bb7a0f54 3240 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3241 : isDIGIT_LC_utf8((U8*)locinput))
3242 {
a0ed51b3 3243 sayNO;
9442cb0e 3244 }
6f06b55f 3245 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3246 nextchr = UCHARAT(locinput);
3247 break;
3248 }
ffc61ed2 3249 if (OP(scan) == NDIGIT
9442cb0e 3250 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3251 sayNO;
3252 nextchr = UCHARAT(++locinput);
3253 break;
3254 case CLUMP:
b7c83a7e 3255 if (locinput >= PL_regeol)
a0ed51b3 3256 sayNO;
b7c83a7e 3257 if (do_utf8) {
1a4fad37 3258 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3259 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3260 sayNO;
3261 locinput += PL_utf8skip[nextchr];
3262 while (locinput < PL_regeol &&
3263 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3264 locinput += UTF8SKIP(locinput);
3265 if (locinput > PL_regeol)
3266 sayNO;
eb08e2da
JH
3267 }
3268 else
3269 locinput++;
a0ed51b3
LW
3270 nextchr = UCHARAT(locinput);
3271 break;
c8756f30 3272 case REFFL:
3280af22 3273 PL_reg_flags |= RF_tainted;
c8756f30 3274 /* FALL THROUGH */
c277df42 3275 case REF:
95b24440
DM
3276 case REFF: {
3277 char *s;
c277df42 3278 n = ARG(scan); /* which paren pair */
5d9a96ca 3279 st->ln = PL_regstartp[n];
2c2d71f5 3280 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3281 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3282 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3283 if (st->ln == PL_regendp[n])
a0d0e21e 3284 break;
a0ed51b3 3285
5d9a96ca 3286 s = PL_bostr + st->ln;
1aa99e6b 3287 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3288 char *l = locinput;
a3b680e6 3289 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3290 /*
3291 * Note that we can't do the "other character" lookup trick as
3292 * in the 8-bit case (no pun intended) because in Unicode we
3293 * have to map both upper and title case to lower case.
3294 */
3295 if (OP(scan) == REFF) {
3296 while (s < e) {
a3b680e6
AL
3297 STRLEN ulen1, ulen2;
3298 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3299 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3300
a0ed51b3
LW
3301 if (l >= PL_regeol)
3302 sayNO;
a2a2844f
JH
3303 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3304 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3305 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3306 sayNO;
a2a2844f
JH
3307 s += ulen1;
3308 l += ulen2;
a0ed51b3
LW
3309 }
3310 }
3311 locinput = l;
3312 nextchr = UCHARAT(locinput);
3313 break;
3314 }
3315
a0d0e21e 3316 /* Inline the first character, for speed. */
76e3520e 3317 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3318 (OP(scan) == REF ||
3319 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3320 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3321 sayNO;
5d9a96ca
DM
3322 st->ln = PL_regendp[n] - st->ln;
3323 if (locinput + st->ln > PL_regeol)
4633a7c4 3324 sayNO;
5d9a96ca
DM
3325 if (st->ln > 1 && (OP(scan) == REF
3326 ? memNE(s, locinput, st->ln)
c8756f30 3327 : (OP(scan) == REFF
5d9a96ca
DM
3328 ? ibcmp(s, locinput, st->ln)
3329 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3330 sayNO;
5d9a96ca 3331 locinput += st->ln;
76e3520e 3332 nextchr = UCHARAT(locinput);
a0d0e21e 3333 break;
95b24440 3334 }
a0d0e21e
LW
3335
3336 case NOTHING:
c277df42 3337 case TAIL:
a0d0e21e
LW
3338 break;
3339 case BACK:
3340 break;
40a82448
DM
3341
3342#undef ST
3343#define ST st->u.eval
3344
3345 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3346 {
c277df42 3347 SV *ret;
8e5e9ebe 3348 {
4aabdb9b
DM
3349 /* execute the code in the {...} */
3350 dSP;
6136c704 3351 SV ** const before = SP;
4aabdb9b
DM
3352 OP_4tree * const oop = PL_op;
3353 COP * const ocurcop = PL_curcop;
3354 PAD *old_comppad;
4aabdb9b
DM
3355
3356 n = ARG(scan);
32fc9b6a 3357 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3358 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3359 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3360 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3361
8e5e9ebe
RGS
3362 CALLRUNOPS(aTHX); /* Scalar context. */
3363 SPAGAIN;
3364 if (SP == before)
075aa684 3365 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3366 else {
3367 ret = POPs;
3368 PUTBACK;
3369 }
4aabdb9b
DM
3370
3371 PL_op = oop;
3372 PAD_RESTORE_LOCAL(old_comppad);
3373 PL_curcop = ocurcop;
3374 if (!st->logical) {
3375 /* /(?{...})/ */
3376 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3377 break;
3378 }
8e5e9ebe 3379 }
4aabdb9b
DM
3380 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3381 regexp *re;
4aabdb9b 3382 {
4f639d21
DM
3383 /* extract RE object from returned value; compiling if
3384 * necessary */
3385
6136c704 3386 MAGIC *mg = NULL;
be8e71aa 3387 const SV *sv;
faf82a0b
AE
3388 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3389 mg = mg_find(sv, PERL_MAGIC_qr);
3390 else if (SvSMAGICAL(ret)) {
3391 if (SvGMAGICAL(ret))
3392 sv_unmagic(ret, PERL_MAGIC_qr);
3393 else
3394 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3395 }
faf82a0b 3396
0f5d15d6
IZ
3397 if (mg) {
3398 re = (regexp *)mg->mg_obj;
df0003d4 3399 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3400 }
3401 else {
3402 STRLEN len;
6136c704 3403 const char * const t = SvPV_const(ret, len);
0f5d15d6 3404 PMOP pm;
a3b680e6 3405 const I32 osize = PL_regsize;
0f5d15d6 3406
5fcd1c1b 3407 Zero(&pm, 1, PMOP);
4aabdb9b 3408 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3409 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3410 if (!(SvFLAGS(ret)
faf82a0b
AE
3411 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3412 | SVs_GMG)))
14befaf4
DM
3413 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3414 PERL_MAGIC_qr,0,0);
0f5d15d6 3415 PL_regsize = osize;
0f5d15d6 3416 }
4aabdb9b 3417 }
aa283a38
DM
3418
3419 /* run the pattern returned from (??{...}) */
40a82448
DM
3420 ST.cp = regcppush(0); /* Save *all* the positions. */
3421 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3422 *PL_reglastparen = 0;
3423 *PL_reglastcloseparen = 0;
4aabdb9b 3424 PL_reginput = locinput;
4aabdb9b
DM
3425
3426 /* XXXX This is too dramatic a measure... */
3427 PL_reg_maxiter = 0;
3428
5d9a96ca 3429 st->logical = 0;
faec1544
DM
3430 ST.toggle_reg_flags = PL_reg_flags;
3431 if (re->reganch & ROPT_UTF8)
3432 PL_reg_flags |= RF_utf8;
3433 else
3434 PL_reg_flags &= ~RF_utf8;
3435 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3436
40a82448 3437 ST.prev_rex = rex;
faec1544 3438 ST.prev_curlyx = cur_curlyx;
aa283a38 3439 rex = re;
faec1544 3440 cur_curlyx = NULL;
40a82448 3441 ST.B = next;
faec1544
DM
3442 ST.prev_eval = cur_eval;
3443 cur_eval = st;
3444
1de06328
YO
3445 DEBUG_EXECUTE_r(
3446 debug_start_match(re, do_utf8, locinput, PL_regeol,
3447 "Matching embedded");
3448 );
faec1544
DM
3449 /* now continue from first node in postoned RE */
3450 PUSH_YES_STATE_GOTO(EVAL_AB, re->program + 1);
4aabdb9b 3451 /* NOTREACHED */
a0ed51b3 3452 }
4aabdb9b
DM
3453 /* /(?(?{...})X|Y)/ */
3454 st->sw = SvTRUE(ret);
3455 st->logical = 0;
c277df42
IZ
3456 break;
3457 }
40a82448 3458
faec1544
DM
3459 case EVAL_AB: /* cleanup after a successful (??{A})B */
3460 /* note: this is called twice; first after popping B, then A */
3461 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3462 ReREFCNT_dec(rex);
3463 rex = ST.prev_rex;
faec1544
DM
3464 regcpblow(ST.cp);
3465 cur_eval = ST.prev_eval;
3466 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3467 /* XXXX This is too dramatic a measure... */
3468 PL_reg_maxiter = 0;
faec1544 3469 sayYES_FINAL;
40a82448 3470
40a82448 3471
faec1544
DM
3472 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3473 /* note: this is called twice; first after popping B, then A */
3474 PL_reg_flags ^= ST.toggle_reg_flags;
3475 ReREFCNT_dec(rex);
3476 rex = ST.prev_rex;
40a82448
DM
3477 PL_reginput = locinput;
3478 REGCP_UNWIND(ST.lastcp);
3479 regcppop(rex);
faec1544
DM
3480 cur_eval = ST.prev_eval;
3481 cur_curlyx = ST.prev_curlyx;
3482 /* XXXX This is too dramatic a measure... */
3483 PL_reg_maxiter = 0;
40a82448
DM
3484 sayNO_SILENT;
3485
3486#undef ST
3487
a0d0e21e 3488 case OPEN:
c277df42 3489 n = ARG(scan); /* which paren pair */
3280af22
NIS
3490 PL_reg_start_tmp[n] = locinput;
3491 if (n > PL_regsize)
3492 PL_regsize = n;
a0d0e21e
LW
3493 break;
3494 case CLOSE:
c277df42 3495 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3496 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3497 PL_regendp[n] = locinput - PL_bostr;
eb160463 3498 if (n > (I32)*PL_reglastparen)
3280af22 3499 *PL_reglastparen = n;
a01268b5 3500 *PL_reglastcloseparen = n;
a0d0e21e 3501 break;
c277df42
IZ
3502 case GROUPP:
3503 n = ARG(scan); /* which paren pair */
5d9a96ca 3504 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3505 break;
3506 case IFTHEN:
2c2d71f5 3507 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3508 if (st->sw)
c277df42
IZ
3509 next = NEXTOPER(NEXTOPER(scan));
3510 else {
3511 next = scan + ARG(scan);
3512 if (OP(next) == IFTHEN) /* Fake one. */
3513 next = NEXTOPER(NEXTOPER(next));
3514 }
3515 break;
3516 case LOGICAL:
5d9a96ca 3517 st->logical = scan->flags;
c277df42 3518 break;
2ab05381 3519/*******************************************************************
b8591aee 3520 cur_curlyx points to the regmatch_state associated with the most recent CURLYX.
a0374537 3521 This struct contains info about the innermost (...)* loop (an
b8591aee 3522 "infoblock"), and a pointer to the next outer cur_curlyx.
2ab05381
IZ
3523
3524 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3525
95b24440 3526 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3527
b8591aee 3528 2) This regnode populates cur_curlyx, and calls regmatch() recursively
95b24440 3529 with the starting point at WHILEM node;
2ab05381
IZ
3530
3531 3) Each hit of WHILEM node tries to match A and Z (in the order
3532 depending on the current iteration, min/max of {min,max} and
3533 greediness). The information about where are nodes for "A"
b8591aee 3534 and "Z" is read from cur_curlyx, as is info on how many times "A"
2ab05381
IZ
3535 was already matched, and greediness.
3536
3537 4) After A matches, the same WHILEM node is hit again.
3538
b8591aee 3539 5) Each time WHILEM is hit, cur_curlyx is the infoblock created by CURLYX
2ab05381 3540 of the same pair. Thus when WHILEM tries to match Z, it temporarily
b8591aee 3541 resets cur_curlyx, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3542 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3543 of the external loop.
3544
b8591aee 3545 Currently present infoblocks form a tree with a stem formed by cur_curlyx
2ab05381
IZ
3546 and whatever it mentions via ->next, and additional attached trees
3547 corresponding to temporarily unset infoblocks as in "5" above.
3548
95b24440 3549 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3550 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3551 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3552 infoblocks are drawn below the "reset" infoblock.
3553
3554 In fact in the picture below we do not show failed matches for Z and T
3555 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3556 more obvious *why* one needs to *temporary* unset infoblocks.]
3557
3558 Matched REx position InfoBlocks Comment
3559 (Y(A)*?Z)*?T x
3560 Y(A)*?Z)*?T x <- O
3561 Y (A)*?Z)*?T x <- O
3562 Y A)*?Z)*?T x <- O <- I
3563 YA )*?Z)*?T x <- O <- I
3564 YA A)*?Z)*?T x <- O <- I
3565 YAA )*?Z)*?T x <- O <- I
3566 YAA Z)*?T x <- O # Temporary unset I
3567 I
3568
3569 YAAZ Y(A)*?Z)*?T x <- O
3570 I
3571
3572 YAAZY (A)*?Z)*?T x <- O
3573 I
3574
3575 YAAZY A)*?Z)*?T x <- O <- I
3576 I
3577
3578 YAAZYA )*?Z)*?T x <- O <- I
3579 I
3580
3581 YAAZYA Z)*?T x <- O # Temporary unset I
3582 I,I
3583
3584 YAAZYAZ )*?T x <- O
3585 I,I
3586
3587 YAAZYAZ T x # Temporary unset O
3588 O
3589 I,I
3590
3591 YAAZYAZT x
3592 O
3593 I,I
3594 *******************************************************************/
95b24440 3595
a0d0e21e 3596 case CURLYX: {
cb434fcc 3597 /* No need to save/restore up to this paren */
10edeb5d
JH
3598 parenfloor = scan->flags;
3599
c2b7afd3
NC
3600 /* Dave says:
3601
3602 CURLYX and WHILEM are always paired: they're the moral
3603 equivalent of pp_enteriter anbd pp_iter.
3604
3605 The only time next could be null is if the node tree is
3606 corrupt. This was mentioned on p5p a few days ago.
3607
3608 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3609 So we'll assert that this is true:
3610 */
3611 assert(next);
30b2893d 3612 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3613 next += ARG(next);
cb434fcc
IZ
3614 /* XXXX Probably it is better to teach regpush to support
3615 parenfloor > PL_regsize... */
eb160463 3616 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3617 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3618
d8319b27 3619 st->u.curlyx.cp = PL_savestack_ix;
b8591aee
DM
3620 st->u.curlyx.outercc = cur_curlyx;
3621 cur_curlyx = st;
a0374537
DM
3622 /* these fields contain the state of the current curly.
3623 * they are accessed by subsequent WHILEMs;
3624 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3625 st->u.curlyx.parenfloor = parenfloor;
3626 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3627 st->u.curlyx.min = ARG1(scan);
3628 st->u.curlyx.max = ARG2(scan);
3629 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3630 st->u.curlyx.lastloc = 0;
a0374537
DM
3631 /* st->next and st->minmod are also read by WHILEM */
3632
3280af22 3633 PL_reginput = locinput;
95b24440
DM
3634 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3635 /*** all unsaved local vars undefined at this point */
d8319b27 3636 regcpblow(st->u.curlyx.cp);
b8591aee 3637 cur_curlyx = st->u.curlyx.outercc;
95b24440 3638 saySAME(result);
a0d0e21e 3639 }
5f66b61c 3640 /* NOTREACHED */
a0d0e21e
LW
3641 case WHILEM: {
3642 /*
3643 * This is really hard to understand, because after we match
3644 * what we're trying to match, we must make sure the rest of
2c2d71f5 3645 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3646 * to go back UP the parse tree by recursing ever deeper. And
3647 * if it fails, we have to reset our parent's current state
3648 * that we can try again after backing off.
3649 */
3650
c2b7afd3
NC
3651 /* Dave says:
3652
b8591aee
DM
3653 cur_curlyx gets initialised by CURLYX ready for use by WHILEM.
3654 So again, unless somethings been corrupted, cur_curlyx cannot
c2b7afd3
NC
3655 be null at that point in WHILEM.
3656
3657 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3658 So we'll assert that this is true:
3659 */
b8591aee
DM
3660 assert(cur_curlyx);
3661 st->u.whilem.lastloc = cur_curlyx->u.curlyx.lastloc; /* Detection of 0-len. */
d8319b27
DM
3662 st->u.whilem.cache_offset = 0;
3663 st->u.whilem.cache_bit = 0;
c277df42 3664
b8591aee 3665 n = cur_curlyx->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3666 PL_reginput = locinput;
a0d0e21e 3667
a3621e74 3668 DEBUG_EXECUTE_r(
9041c2e3 3669 PerlIO_printf(Perl_debug_log,
91f3b821 3670 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3671 REPORT_CODE_OFF+PL_regindent*2, "",
b8591aee
DM
3672 (long)n, (long)cur_curlyx->u.curlyx.min,
3673 (long)cur_curlyx->u.curlyx.max,
3674 PTR2UV(cur_curlyx))
c277df42 3675 );
4633a7c4 3676
a0d0e21e
LW
3677 /* If degenerate scan matches "", assume scan done. */
3678
b8591aee
DM
3679 if (locinput == cur_curlyx->u.curlyx.lastloc && n >=
3680 cur_curlyx->u.curlyx.min)
3681 {
3682 st->u.whilem.savecc = cur_curlyx;
3683 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3684 if (cur_curlyx)
3685 st->ln = cur_curlyx->u.curlyx.cur;
a3621e74 3686 DEBUG_EXECUTE_r(
c3464db5
DD
3687 PerlIO_printf(Perl_debug_log,
3688 "%*s empty match detected, try continuation...\n",
3280af22 3689 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3690 );
d8319b27 3691 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3692 /*** all unsaved local vars undefined at this point */
b8591aee 3693 cur_curlyx = st->u.whilem.savecc;
95b24440 3694 if (result)
4633a7c4 3695 sayYES;
b8591aee
DM
3696 if (cur_curlyx->u.curlyx.outercc)
3697 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3698 sayNO;
a0d0e21e
LW
3699 }
3700
3701 /* First just match a string of min scans. */
3702
b8591aee
DM
3703 if (n < cur_curlyx->u.curlyx.min) {
3704 cur_curlyx->u.curlyx.cur = n;
3705 cur_curlyx->u.curlyx.lastloc = locinput;
3706 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM2);
95b24440
DM
3707 /*** all unsaved local vars undefined at this point */
3708 if (result)
4633a7c4 3709 sayYES;
b8591aee
DM
3710 cur_curlyx->u.curlyx.cur = n - 1;
3711 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3712 sayNO;
a0d0e21e
LW
3713 }
3714
2c2d71f5
JH
3715 if (scan->flags) {
3716 /* Check whether we already were at this position.
3717 Postpone detection until we know the match is not
3718 *that* much linear. */
3719 if (!PL_reg_maxiter) {
3720 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3721 /* possible overflow for long strings and many CURLYX's */
3722 if (PL_reg_maxiter < 0)
3723 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3724 PL_reg_leftiter = PL_reg_maxiter;
3725 }
3726 if (PL_reg_leftiter-- == 0) {
3298f257 3727 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3728 if (PL_reg_poscache) {
eb160463 3729 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3730 Renew(PL_reg_poscache, size, char);
3731 PL_reg_poscache_size = size;
3732 }
3733 Zero(PL_reg_poscache, size, char);
3734 }
3735 else {
3736 PL_reg_poscache_size = size;
a02a5408 3737 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3738 }
a3621e74 3739 DEBUG_EXECUTE_r(
2c2d71f5
JH
3740 PerlIO_printf(Perl_debug_log,
3741 "%sDetected a super-linear match, switching on caching%s...\n",
3742 PL_colors[4], PL_colors[5])
3743 );
3744 }
3745 if (PL_reg_leftiter < 0) {
d8319b27 3746 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3747
3298f257 3748 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1
d8319b27
DM
3749 + st->u.whilem.cache_offset * (scan->flags>>4);
3750 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3751 st->u.whilem.cache_offset /= 8;
3752 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3753 DEBUG_EXECUTE_r(
2c2d71f5
JH
3754 PerlIO_printf(Perl_debug_log,
3755 "%*s already tried at this position...\n",
3756 REPORT_CODE_OFF+PL_regindent*2, "")
3757 );
3298f257 3758 sayNO; /* cache records failure */
2c2d71f5 3759 }
2c2d71f5
JH
3760 }
3761 }
3762
a0d0e21e
LW
3763 /* Prefer next over scan for minimal matching. */
3764
b8591aee
DM
3765 if (cur_curlyx->minmod) {
3766 st->u.whilem.savecc = cur_curlyx;
3767 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3768 if (cur_curlyx)
3769 st->ln = cur_curlyx->u.curlyx.cur;
d8319b27
DM
3770 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3771 REGCP_SET(st->u.whilem.lastcp);
3772 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3773 /*** all unsaved local vars undefined at this point */
b8591aee 3774 cur_curlyx = st->u.whilem.savecc;
95b24440 3775 if (result) {
d8319b27 3776 regcpblow(st->u.whilem.cp);
3298f257 3777 sayYES; /* All done. */
5f05dabc 3778 }
d8319b27 3779 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3780 regcppop(rex);
b8591aee
DM
3781 if (cur_curlyx->u.curlyx.outercc)
3782 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3783
b8591aee 3784 if (n >= cur_curlyx->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3785 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3786 && !(PL_reg_flags & RF_warned)) {
3787 PL_reg_flags |= RF_warned;
9014280d 3788 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3789 "Complex regular subexpression recursion",
3790 REG_INFTY - 1);
c277df42 3791 }
3ab3c9b4 3792 CACHEsayNO;
c277df42 3793 }
a687059c 3794
a3621e74 3795 DEBUG_EXECUTE_r(
c3464db5
DD
3796 PerlIO_printf(Perl_debug_log,
3797 "%*s trying longer...\n",
3280af22 3798 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3799 );
a0d0e21e 3800 /* Try scanning more and see if it helps. */
3280af22 3801 PL_reginput = locinput;
b8591aee
DM
3802 cur_curlyx->u.curlyx.cur = n;
3803 cur_curlyx->u.curlyx.lastloc = locinput;
3804 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
d8319b27 3805 REGCP_SET(st->u.whilem.lastcp);
b8591aee 3806 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM4);
95b24440
DM
3807 /*** all unsaved local vars undefined at this point */
3808 if (result) {
d8319b27 3809 regcpblow(st->u.whilem.cp);
3298f257 3810 sayYES;
5f05dabc 3811 }
d8319b27 3812 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3813 regcppop(rex);
b8591aee
DM
3814 cur_curlyx->u.curlyx.cur = n - 1;
3815 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3816 CACHEsayNO;
a0d0e21e
LW
3817 }
3818
3819 /* Prefer scan over next for maximal matching. */
3820
b8591aee
DM
3821 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3822 st->u.whilem.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3823 cur_curlyx->u.curlyx.cur = n;
3824 cur_curlyx->u.curlyx.lastloc = locinput;
d8319b27 3825 REGCP_SET(st->u.whilem.lastcp);
b8591aee 3826 REGMATCH(cur_curlyx->u.curlyx.scan, WHILEM5);
95b24440
DM
3827 /*** all unsaved local vars undefined at this point */
3828 if (result) {
d8319b27 3829 regcpblow(st->u.whilem.cp);
3298f257 3830 sayYES;
5f05dabc 3831 }
d8319b27 3832 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3833 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3834 PL_reginput = locinput;
a3621e74 3835 DEBUG_EXECUTE_r(
c3464db5
DD
3836 PerlIO_printf(Perl_debug_log,
3837 "%*s failed, try continuation...\n",
3280af22 3838 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3839 );
3840 }
9041c2e3 3841 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3842 && !(PL_reg_flags & RF_warned)) {
3280af22 3843 PL_reg_flags |= RF_warned;
9014280d 3844 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3845 "Complex regular subexpression recursion",
3846 REG_INFTY - 1);
a0d0e21e
LW
3847 }
3848
3849 /* Failed deeper matches of scan, so see if this one works. */
b8591aee
DM
3850 st->u.whilem.savecc = cur_curlyx;
3851 cur_curlyx = cur_curlyx->u.curlyx.outercc;
3852 if (cur_curlyx)
3853 st->ln = cur_curlyx->u.curlyx.cur;
d8319b27 3854 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3855 /*** all unsaved local vars undefined at this point */
b8591aee 3856 cur_curlyx = st->u.whilem.savecc;
95b24440 3857 if (result)
3298f257 3858 sayYES;
b8591aee
DM
3859 if (cur_curlyx->u.curlyx.outercc)
3860 cur_curlyx->u.curlyx.outercc->u.curlyx.cur = st->ln;
3861 cur_curlyx->u.curlyx.cur = n - 1;
3862 cur_curlyx->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3863 CACHEsayNO;
a0d0e21e 3864 }
5f66b61c 3865 /* NOTREACHED */
40a82448
DM
3866
3867#undef ST
3868#define ST st->u.branch
3869
3870 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3871 next = scan + ARG(scan);
3872 if (next == scan)
3873 next = NULL;
40a82448
DM
3874 scan = NEXTOPER(scan);
3875 /* FALL THROUGH */
c277df42 3876
40a82448
DM
3877 case BRANCH: /* /(...|A|...)/ */
3878 scan = NEXTOPER(scan); /* scan now points to inner node */
3879 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3880 /* last branch; skip state push and jump direct to node */
3881 continue;
3882 ST.lastparen = *PL_reglastparen;
3883 ST.next_branch = next;
3884 REGCP_SET(ST.cp);
3885 PL_reginput = locinput;
02db2b7b 3886
40a82448
DM
3887 /* Now go into the branch */
3888 PUSH_STATE_GOTO(BRANCH_next, scan);
3889 /* NOTREACHED */
3890
3891 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3892 REGCP_UNWIND(ST.cp);
3893 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3894 PL_regendp[n] = -1;
3895 *PL_reglastparen = n;
3896 scan = ST.next_branch;
3897 /* no more branches? */
3898 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3899 sayNO;
3900 continue; /* execute next BRANCH[J] op */
3901 /* NOTREACHED */
3902
a0d0e21e 3903 case MINMOD:
5d9a96ca 3904 st->minmod = 1;
a0d0e21e 3905 break;
40a82448
DM
3906
3907#undef ST
3908#define ST st->u.curlym
3909
3910 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3911
3912 /* This is an optimisation of CURLYX that enables us to push
3913 * only a single backtracking state, no matter now many matches
3914 * there are in {m,n}. It relies on the pattern being constant
3915 * length, with no parens to influence future backrefs
3916 */
3917
3918 ST.me = scan;
dc45a647 3919 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3920
3921 /* if paren positive, emulate an OPEN/CLOSE around A */
3922 if (ST.me->flags) {
3923 I32 paren = ST.me->flags;
3924 if (paren > PL_regsize)
3925 PL_regsize = paren;
3926 if (paren > (I32)*PL_reglastparen)
3927 *PL_reglastparen = paren;
c277df42 3928 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3929 }
40a82448
DM
3930 ST.A = scan;
3931 ST.B = next;
3932 ST.alen = 0;
3933 ST.count = 0;
3934 ST.minmod = st->minmod;
3935 st->minmod = 0;
3936 ST.c1 = CHRTEST_UNINIT;
3937 REGCP_SET(ST.cp);
6407bf3b 3938
40a82448
DM
3939 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3940 goto curlym_do_B;
3941
3942 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3943 PL_reginput = locinput;
40a82448
DM
3944 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3945 /* NOTREACHED */
5f80c4cf 3946
40a82448
DM
3947 case CURLYM_A: /* we've just matched an A */
3948 locinput = st->locinput;
3949 nextchr = UCHARAT(locinput);
3950
3951 ST.count++;
3952 /* after first match, determine A's length: u.curlym.alen */
3953 if (ST.count == 1) {
3954 if (PL_reg_match_utf8) {
3955 char *s = locinput;
3956 while (s < PL_reginput) {
3957 ST.alen++;
3958 s += UTF8SKIP(s);
3959 }
3960 }
3961 else {
3962 ST.alen = PL_reginput - locinput;
3963 }
3964 if (ST.alen == 0)
3965 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3966 }
0cadcf80
DM
3967 DEBUG_EXECUTE_r(
3968 PerlIO_printf(Perl_debug_log,
40a82448 3969 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
ab3bbdeb 3970 (int)(REPORT_CODE_OFF+(PL_regindent*2)), "",
40a82448 3971 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3972 );
3973
40a82448
DM
3974 locinput = PL_reginput;
3975 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3976 goto curlym_do_A; /* try to match another A */
3977 goto curlym_do_B; /* try to match B */
5f80c4cf 3978
40a82448
DM
3979 case CURLYM_A_fail: /* just failed to match an A */
3980 REGCP_UNWIND(ST.cp);
3981 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3982 sayNO;
0cadcf80 3983
40a82448
DM
3984 curlym_do_B: /* execute the B in /A{m,n}B/ */
3985 PL_reginput = locinput;
3986 if (ST.c1 == CHRTEST_UNINIT) {
3987 /* calculate c1 and c2 for possible match of 1st char
3988 * following curly */
3989 ST.c1 = ST.c2 = CHRTEST_VOID;
3990 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3991 regnode *text_node = ST.B;
3992 if (! HAS_TEXT(text_node))
3993 FIND_NEXT_IMPT(text_node);
3994 if (HAS_TEXT(text_node)
3995 && PL_regkind[OP(text_node)] != REF)
3996 {
3997 ST.c1 = (U8)*STRING(text_node);
3998 ST.c2 =
3999 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4000 ? PL_fold[ST.c1]
4001 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4002 ? PL_fold_locale[ST.c1]
4003 : ST.c1;
c277df42 4004 }
c277df42 4005 }
40a82448
DM
4006 }
4007
4008 DEBUG_EXECUTE_r(
4009 PerlIO_printf(Perl_debug_log,
4010 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
ab3bbdeb 4011 (int)(REPORT_CODE_OFF+(PL_regindent*2)),
40a82448
DM
4012 "", (IV)ST.count)
4013 );
4014 if (ST.c1 != CHRTEST_VOID
4015 && UCHARAT(PL_reginput) != ST.c1
4016 && UCHARAT(PL_reginput) != ST.c2)
4017 {
4018 /* simulate B failing */
4019 state_num = CURLYM_B_fail;
4020 goto reenter_switch;
4021 }
4022
4023 if (ST.me->flags) {
4024 /* mark current A as captured */
4025 I32 paren = ST.me->flags;
4026 if (ST.count) {
4027 PL_regstartp[paren]
4028 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4029 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4030 }
40a82448
DM
4031 else
4032 PL_regendp[paren] = -1;
c277df42 4033 }
40a82448 4034 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4035 /* NOTREACHED */
40a82448
DM
4036
4037 case CURLYM_B_fail: /* just failed to match a B */
4038 REGCP_UNWIND(ST.cp);
4039 if (ST.minmod) {
4040 if (ST.count == ARG2(ST.me) /* max */)
4041 sayNO;
4042 goto curlym_do_A; /* try to match a further A */
4043 }
4044 /* backtrack one A */
4045 if (ST.count == ARG1(ST.me) /* min */)
4046 sayNO;
4047 ST.count--;
4048 locinput = HOPc(locinput, -ST.alen);
4049 goto curlym_do_B; /* try to match B */
4050
c255a977
DM
4051#undef ST
4052#define ST st->u.curly
40a82448 4053
c255a977
DM
4054#define CURLY_SETPAREN(paren, success) \
4055 if (paren) { \
4056 if (success) { \
4057 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4058 PL_regendp[paren] = locinput - PL_bostr; \
4059 } \
4060 else \
4061 PL_regendp[paren] = -1; \
4062 }
4063
4064 case STAR: /* /A*B/ where A is width 1 */
4065 ST.paren = 0;
4066 ST.min = 0;
4067 ST.max = REG_INFTY;
a0d0e21e
LW
4068 scan = NEXTOPER(scan);
4069 goto repeat;
c255a977
DM
4070 case PLUS: /* /A+B/ where A is width 1 */
4071 ST.paren = 0;
4072 ST.min = 1;
4073 ST.max = REG_INFTY;
c277df42 4074 scan = NEXTOPER(scan);
c255a977
DM
4075 goto repeat;
4076 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4077 ST.paren = scan->flags; /* Which paren to set */
4078 if (ST.paren > PL_regsize)
4079 PL_regsize = ST.paren;
4080 if (ST.paren > (I32)*PL_reglastparen)
4081 *PL_reglastparen = ST.paren;
4082 ST.min = ARG1(scan); /* min to match */
4083 ST.max = ARG2(scan); /* max to match */
4084 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4085 goto repeat;
4086 case CURLY: /* /A{m,n}B/ where A is width 1 */
4087 ST.paren = 0;
4088 ST.min = ARG1(scan); /* min to match */
4089 ST.max = ARG2(scan); /* max to match */
4090 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4091 repeat:
a0d0e21e
LW
4092 /*
4093 * Lookahead to avoid useless match attempts
4094 * when we know what character comes next.
c255a977 4095 *
5f80c4cf
JP
4096 * Used to only do .*x and .*?x, but now it allows
4097 * for )'s, ('s and (?{ ... })'s to be in the way
4098 * of the quantifier and the EXACT-like node. -- japhy
4099 */
4100
c255a977
DM
4101 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4102 sayNO;
cca55fe3 4103 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4104 U8 *s;
4105 regnode *text_node = next;
4106
3dab1dad
YO
4107 if (! HAS_TEXT(text_node))
4108 FIND_NEXT_IMPT(text_node);
5f80c4cf 4109
9e137952 4110 if (! HAS_TEXT(text_node))
c255a977 4111 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4112 else {
3dab1dad 4113 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4114 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4115 goto assume_ok_easy;
cca55fe3 4116 }
be8e71aa
YO
4117 else
4118 s = (U8*)STRING(text_node);
5f80c4cf
JP
4119
4120 if (!UTF) {
c255a977 4121 ST.c2 = ST.c1 = *s;
f65d3ee7 4122 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4123 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4124 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4125 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4126 }
5f80c4cf 4127 else { /* UTF */
f65d3ee7 4128 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4129 STRLEN ulen1, ulen2;
89ebb4a3
JH
4130 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4131 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4132
4133 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4134 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4135#ifdef EBCDIC
4136 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4137 ckWARN(WARN_UTF8) ?
4138 0 : UTF8_ALLOW_ANY);
4139 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4140 ckWARN(WARN_UTF8) ?
4141 0 : UTF8_ALLOW_ANY);
4142#else
c255a977 4143 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4144 uniflags);
c255a977 4145 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4146 uniflags);
4147#endif
5f80c4cf
JP
4148 }
4149 else {
c255a977 4150 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4151 uniflags);
5f80c4cf 4152 }
1aa99e6b
IH
4153 }
4154 }
bbce6d69 4155 }
a0d0e21e 4156 else
c255a977 4157 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4158 assume_ok_easy:
c255a977
DM
4159
4160 ST.A = scan;
4161 ST.B = next;
3280af22 4162 PL_reginput = locinput;
5d9a96ca
DM
4163 if (st->minmod) {
4164 st->minmod = 0;
c255a977 4165 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4166 sayNO;
c255a977 4167 ST.count = ST.min;
a0ed51b3 4168 locinput = PL_reginput;
c255a977
DM
4169 REGCP_SET(ST.cp);
4170 if (ST.c1 == CHRTEST_VOID)
4171 goto curly_try_B_min;
4172
4173 ST.oldloc = locinput;
4174
4175 /* set ST.maxpos to the furthest point along the
4176 * string that could possibly match */
4177 if (ST.max == REG_INFTY) {
4178 ST.maxpos = PL_regeol - 1;
4179 if (do_utf8)
4180 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4181 ST.maxpos--;
4182 }
4183 else if (do_utf8) {
4184 int m = ST.max - ST.min;
4185 for (ST.maxpos = locinput;
4186 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4187 ST.maxpos += UTF8SKIP(ST.maxpos);
4188 }
4189 else {
4190 ST.maxpos = locinput + ST.max - ST.min;
4191 if (ST.maxpos >= PL_regeol)
4192 ST.maxpos = PL_regeol - 1;
4193 }
4194 goto curly_try_B_min_known;
4195
4196 }
4197 else {
4198 ST.count = regrepeat(rex, ST.A, ST.max);
4199 locinput = PL_reginput;
4200 if (ST.count < ST.min)
4201 sayNO;
4202 if ((ST.count > ST.min)
4203 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4204 {
4205 /* A{m,n} must come at the end of the string, there's
4206 * no point in backing off ... */
4207 ST.min = ST.count;
4208 /* ...except that $ and \Z can match before *and* after
4209 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4210 We may back off by one in this case. */
4211 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4212 ST.min--;
4213 }
4214 REGCP_SET(ST.cp);
4215 goto curly_try_B_max;
4216 }
4217 /* NOTREACHED */
4218
4219
4220 case CURLY_B_min_known_fail:
4221 /* failed to find B in a non-greedy match where c1,c2 valid */
4222 if (ST.paren && ST.count)
4223 PL_regendp[ST.paren] = -1;
4224
4225 PL_reginput = locinput; /* Could be reset... */
4226 REGCP_UNWIND(ST.cp);
4227 /* Couldn't or didn't -- move forward. */
4228 ST.oldloc = locinput;
4229 if (do_utf8)
4230 locinput += UTF8SKIP(locinput);
4231 else
4232 locinput++;
4233 ST.count++;
4234 curly_try_B_min_known:
4235 /* find the next place where 'B' could work, then call B */
4236 {
4237 int n;
4238 if (do_utf8) {
4239 n = (ST.oldloc == locinput) ? 0 : 1;
4240 if (ST.c1 == ST.c2) {
4241 STRLEN len;
4242 /* set n to utf8_distance(oldloc, locinput) */
4243 while (locinput <= ST.maxpos &&
4244 utf8n_to_uvchr((U8*)locinput,
4245 UTF8_MAXBYTES, &len,
4246 uniflags) != (UV)ST.c1) {
4247 locinput += len;
4248 n++;
4249 }
1aa99e6b
IH
4250 }
4251 else {
c255a977
DM
4252 /* set n to utf8_distance(oldloc, locinput) */
4253 while (locinput <= ST.maxpos) {
4254 STRLEN len;
4255 const UV c = utf8n_to_uvchr((U8*)locinput,
4256 UTF8_MAXBYTES, &len,
4257 uniflags);
4258 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4259 break;
4260 locinput += len;
4261 n++;
1aa99e6b 4262 }
0fe9bf95
IZ
4263 }
4264 }
c255a977
DM
4265 else {
4266 if (ST.c1 == ST.c2) {
4267 while (locinput <= ST.maxpos &&
4268 UCHARAT(locinput) != ST.c1)
4269 locinput++;
bbce6d69 4270 }
c255a977
DM
4271 else {
4272 while (locinput <= ST.maxpos
4273 && UCHARAT(locinput) != ST.c1
4274 && UCHARAT(locinput) != ST.c2)
4275 locinput++;
a0ed51b3 4276 }
c255a977
DM
4277 n = locinput - ST.oldloc;
4278 }
4279 if (locinput > ST.maxpos)
4280 sayNO;
4281 /* PL_reginput == oldloc now */
4282 if (n) {
4283 ST.count += n;
4284 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4285 sayNO;
a0d0e21e 4286 }
c255a977
DM
4287 PL_reginput = locinput;
4288 CURLY_SETPAREN(ST.paren, ST.count);
4289 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4290 }
c255a977
DM
4291 /* NOTREACHED */
4292
4293
4294 case CURLY_B_min_fail:
4295 /* failed to find B in a non-greedy match where c1,c2 invalid */
4296 if (ST.paren && ST.count)
4297 PL_regendp[ST.paren] = -1;
4298
4299 REGCP_UNWIND(ST.cp);
4300 /* failed -- move forward one */
4301 PL_reginput = locinput;
4302 if (regrepeat(rex, ST.A, 1)) {
4303 ST.count++;
a0ed51b3 4304 locinput = PL_reginput;
c255a977
DM
4305 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4306 ST.count > 0)) /* count overflow ? */
15272685 4307 {
c255a977
DM
4308 curly_try_B_min:
4309 CURLY_SETPAREN(ST.paren, ST.count);
4310 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4311 }
4312 }
4633a7c4 4313 sayNO;
c255a977
DM
4314 /* NOTREACHED */
4315
4316
4317 curly_try_B_max:
4318 /* a successful greedy match: now try to match B */
4319 {
4320 UV c = 0;
4321 if (ST.c1 != CHRTEST_VOID)
4322 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4323 UTF8_MAXBYTES, 0, uniflags)
466787eb 4324 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4325 /* If it could work, try it. */
4326 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4327 CURLY_SETPAREN(ST.paren, ST.count);
4328 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4329 /* NOTREACHED */
4330 }
4331 }
4332 /* FALL THROUGH */
4333 case CURLY_B_max_fail:
4334 /* failed to find B in a greedy match */
4335 if (ST.paren && ST.count)
4336 PL_regendp[ST.paren] = -1;
4337
4338 REGCP_UNWIND(ST.cp);
4339 /* back up. */
4340 if (--ST.count < ST.min)
4341 sayNO;
4342 PL_reginput = locinput = HOPc(locinput, -1);
4343 goto curly_try_B_max;
4344
4345#undef ST
4346
4347
a0d0e21e 4348 case END:
faec1544
DM
4349 if (cur_eval) {
4350 /* we've just finished A in /(??{A})B/; now continue with B */
4351 I32 tmpix;
4352
4353
4354 st->u.eval.toggle_reg_flags
4355 = cur_eval->u.eval.toggle_reg_flags;
4356 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4357
4358 st->u.eval.prev_rex = rex; /* inner */
4359 rex = cur_eval->u.eval.prev_rex; /* outer */
4360 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4361 ReREFCNT_inc(rex);
4362 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4363 REGCP_SET(st->u.eval.lastcp);
4364 PL_reginput = locinput;
4365
4366 /* Restore parens of the outer rex without popping the
4367 * savestack */
4368 tmpix = PL_savestack_ix;
4369 PL_savestack_ix = cur_eval->u.eval.lastcp;
4370 regcppop(rex);
4371 PL_savestack_ix = tmpix;
4372
4373 st->u.eval.prev_eval = cur_eval;
4374 cur_eval = cur_eval->u.eval.prev_eval;
4375 DEBUG_EXECUTE_r(
4376 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ...\n",
4377 REPORT_CODE_OFF+PL_regindent*2, ""););
4378 PUSH_YES_STATE_GOTO(EVAL_AB,
4379 st->u.eval.prev_eval->u.eval.B); /* match B */
4380 }
4381
3b0527fe 4382 if (locinput < reginfo->till) {
a3621e74 4383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4384 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4385 PL_colors[4],
4386 (long)(locinput - PL_reg_starttry),
3b0527fe 4387 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4388 PL_colors[5]));
4389 sayNO_FINAL; /* Cannot match: too short. */
4390 }
4391 PL_reginput = locinput; /* put where regtry can find it */
4392 sayYES_FINAL; /* Success! */
dad79028
DM
4393
4394 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4395 DEBUG_EXECUTE_r(
4396 PerlIO_printf(Perl_debug_log,
4397 "%*s %ssubpattern success...%s\n",
4398 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4399 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4400 sayYES_FINAL; /* Success! */
4401
40a82448
DM
4402#undef ST
4403#define ST st->u.ifmatch
4404
4405 case SUSPEND: /* (?>A) */
4406 ST.wanted = 1;
9fe1d20c 4407 PL_reginput = locinput;
9041c2e3 4408 goto do_ifmatch;
dad79028 4409
40a82448
DM
4410 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4411 ST.wanted = 0;
dad79028
DM
4412 goto ifmatch_trivial_fail_test;
4413
40a82448
DM
4414 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4415 ST.wanted = 1;
dad79028 4416 ifmatch_trivial_fail_test:
a0ed51b3 4417 if (scan->flags) {
52657f30 4418 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4419 if (!s) {
4420 /* trivial fail */
4421 if (st->logical) {
4422 st->logical = 0;
40a82448 4423 st->sw = 1 - (bool)ST.wanted;
dad79028 4424 }
40a82448 4425 else if (ST.wanted)
dad79028
DM
4426 sayNO;
4427 next = scan + ARG(scan);
4428 if (next == scan)
4429 next = NULL;
4430 break;
4431 }
efb30f32 4432 PL_reginput = s;
a0ed51b3
LW
4433 }
4434 else
4435 PL_reginput = locinput;
4436
c277df42 4437 do_ifmatch:
40a82448
DM
4438 ST.me = scan;
4439 /* execute body of (?...A) */
4440 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4441 /* NOTREACHED */
4442
4443 case IFMATCH_A_fail: /* body of (?...A) failed */
4444 ST.wanted = !ST.wanted;
4445 /* FALL THROUGH */
4446
4447 case IFMATCH_A: /* body of (?...A) succeeded */
4448 if (st->logical) {
4449 st->logical = 0;
4450 st->sw = (bool)ST.wanted;
4451 }
4452 else if (!ST.wanted)
4453 sayNO;
4454
4455 if (OP(ST.me) == SUSPEND)
4456 locinput = PL_reginput;
4457 else {
4458 locinput = PL_reginput = st->locinput;
4459 nextchr = UCHARAT(locinput);
4460 }
4461 scan = ST.me + ARG(ST.me);
4462 if (scan == ST.me)
4463 scan = NULL;
4464 continue; /* execute B */
4465
4466#undef ST
dad79028 4467
c277df42 4468 case LONGJMP:
c277df42
IZ
4469 next = scan + ARG(scan);
4470 if (next == scan)
4471 next = NULL;
a0d0e21e
LW
4472 break;
4473 default:
b900a521 4474 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4475 PTR2UV(scan), OP(scan));
cea2e8a9 4476 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4477 }
95b24440 4478
a0d0e21e 4479 scan = next;
95b24440
DM
4480 continue;
4481 /* NOTREACHED */
4482
40a82448
DM
4483 push_yes_state:
4484 /* push a state that backtracks on success */
4485 st->u.yes.prev_yes_state = yes_state;
4486 yes_state = st;
4487 /* FALL THROUGH */
4488 push_state:
4489 /* push a new regex state, then continue at scan */
4490 {
4491 regmatch_state *newst;
4492
4493 depth++;
ab3bbdeb 4494 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log,
40a82448
DM
4495 "PUSH STATE(%d)\n", depth));
4496 st->locinput = locinput;
4497 newst = st+1;
4498 if (newst > SLAB_LAST(PL_regmatch_slab))
4499 newst = S_push_slab(aTHX);
4500 PL_regmatch_state = newst;
40a82448
DM
4501 /* XXX probably don't need to initialise these */
4502 newst->minmod = 0;
4503 newst->sw = 0;
4504 newst->logical = 0;
4505
786e8c11 4506
40a82448
DM
4507 locinput = PL_reginput;
4508 nextchr = UCHARAT(locinput);
4509 st = newst;
4510 continue;
4511 /* NOTREACHED */
4512 }
4513
95b24440
DM
4514 /* simulate recursively calling regmatch(), but without actually
4515 * recursing - ie save the current state on the heap rather than on
4516 * the stack, then re-enter the loop. This avoids complex regexes
4517 * blowing the processor stack */
4518
4519 start_recurse:
4520 {
5d9a96ca
DM
4521 /* push new state */
4522 regmatch_state *oldst = st;
4523
4524 depth++;
40a82448 4525 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4526
4527 /* grab the next free state slot */
4528 st++;
86545054 4529 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4530 st = S_push_slab(aTHX);
4531 PL_regmatch_state = st;
4532
4533 oldst->next = next;
4534 oldst->n = n;
4535 oldst->locinput = locinput;
5d9a96ca 4536
95b24440
DM
4537 locinput = PL_reginput;
4538 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4539 st->minmod = 0;
4540 st->sw = 0;
4541 st->logical = 0;
786e8c11 4542
95b24440
DM
4543#ifdef DEBUGGING
4544 PL_regindent++;
4545#endif
4546 }
a0d0e21e 4547 }
a687059c 4548
aa283a38
DM
4549
4550
a0d0e21e
LW
4551 /*
4552 * We get here only if there's trouble -- normally "case END" is
4553 * the terminating point.
4554 */
cea2e8a9 4555 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4556 /*NOTREACHED*/
4633a7c4
LW
4557 sayNO;
4558
7821416a 4559yes_final:
77cb431f
DM
4560
4561 if (yes_state) {
4562 /* we have successfully completed a subexpression, but we must now
4563 * pop to the state marked by yes_state and continue from there */
77cb431f
DM
4564 assert(st != yes_state);
4565 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4566 || yes_state > SLAB_LAST(PL_regmatch_slab))
4567 {
4568 /* not in this slab, pop slab */
4569 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4570 PL_regmatch_slab = PL_regmatch_slab->prev;
4571 st = SLAB_LAST(PL_regmatch_slab);
4572 }
4573 depth -= (st - yes_state);
e4f74956
YO
4574 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%"UVuf"..%"UVuf")\n",
4575 (UV)(depth+1), (UV)(depth+(st - yes_state))));
77cb431f
DM
4576 st = yes_state;
4577 yes_state = st->u.yes.prev_yes_state;
4578 PL_regmatch_state = st;
4579
4580 switch (st->resume_state) {
faec1544 4581 case EVAL_AB:
40a82448
DM
4582 case IFMATCH_A:
4583 case CURLYM_A:
40a82448
DM
4584 state_num = st->resume_state;
4585 goto reenter_switch;
4586
4587 case CURLYM_B:
166ba7cd
DM
4588 case BRANCH_next:
4589 case TRIE_next:
c255a977 4590 case CURLY_B_max:
77cb431f 4591 default:
40a82448 4592 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4593 }
4594 }
4595
a3621e74 4596 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4597 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4598yes:
4599#ifdef DEBUGGING
3280af22 4600 PL_regindent--;
4633a7c4 4601#endif
02db2b7b 4602
95b24440 4603 result = 1;
aa283a38 4604 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4605 * will disappear when REGFMATCH goes */
aa283a38
DM
4606 if (depth) {
4607 /* restore previous state and re-enter */
ab3bbdeb 4608 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4609 depth--;
4610 st--;
4611 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4612 PL_regmatch_slab = PL_regmatch_slab->prev;
4613 st = SLAB_LAST(PL_regmatch_slab);
4614 }
4615 PL_regmatch_state = st;
4616 scan = st->scan;
4617 next = st->next;
4618 n = st->n;
4619 locinput= st->locinput;
4620 nextchr = UCHARAT(locinput);
aa283a38
DM
4621
4622 switch (st->resume_state) {
aa283a38
DM
4623 case resume_CURLYX:
4624 goto resume_point_CURLYX;
4625 case resume_WHILEM1:
4626 goto resume_point_WHILEM1;
4627 case resume_WHILEM2:
4628 goto resume_point_WHILEM2;
4629 case resume_WHILEM3:
4630 goto resume_point_WHILEM3;
4631 case resume_WHILEM4:
4632 goto resume_point_WHILEM4;
4633 case resume_WHILEM5:
4634 goto resume_point_WHILEM5;
4635 case resume_WHILEM6:
4636 goto resume_point_WHILEM6;
77cb431f 4637
166ba7cd 4638 case TRIE_next:
40a82448
DM
4639 case CURLYM_A:
4640 case CURLYM_B:
faec1544 4641 case EVAL_AB:
40a82448
DM
4642 case IFMATCH_A:
4643 case BRANCH_next:
c255a977
DM
4644 case CURLY_B_max:
4645 case CURLY_B_min:
4646 case CURLY_B_min_known:
40a82448
DM
4647 break;
4648
aa283a38
DM
4649 default:
4650 Perl_croak(aTHX_ "regexp resume memory corruption");
4651 }
4652 }
4653 goto final_exit;
4633a7c4
LW
4654
4655no:
a3621e74 4656 DEBUG_EXECUTE_r(
7821416a 4657 PerlIO_printf(Perl_debug_log,
786e8c11
YO
4658 "%*s %sfailed...%s\n",
4659 REPORT_CODE_OFF+PL_regindent*2, "",
4660 PL_colors[4], PL_colors[5])
7821416a 4661 );
7821416a
IZ
4662no_final:
4663do_no:
aa283a38 4664
4633a7c4 4665#ifdef DEBUGGING
3280af22 4666 PL_regindent--;
4633a7c4 4667#endif
95b24440 4668 result = 0;
5d9a96ca 4669
aa283a38
DM
4670 if (depth) {
4671 /* there's a previous state to backtrack to */
ab3bbdeb 4672 DEBUG_STATE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
40a82448
DM
4673 depth--;
4674 st--;
4675 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4676 PL_regmatch_slab = PL_regmatch_slab->prev;
4677 st = SLAB_LAST(PL_regmatch_slab);
4678 }
4679 PL_regmatch_state = st;
4680 scan = st->scan;
4681 next = st->next;
4682 n = st->n;
4683 locinput= st->locinput;
4684 nextchr = UCHARAT(locinput);
4685
5d9a96ca 4686 switch (st->resume_state) {
95b24440
DM
4687 case resume_CURLYX:
4688 goto resume_point_CURLYX;
4689 case resume_WHILEM1:
4690 goto resume_point_WHILEM1;
4691 case resume_WHILEM2:
4692 goto resume_point_WHILEM2;
4693 case resume_WHILEM3:
4694 goto resume_point_WHILEM3;
4695 case resume_WHILEM4:
4696 goto resume_point_WHILEM4;
4697 case resume_WHILEM5:
4698 goto resume_point_WHILEM5;
4699 case resume_WHILEM6:
4700 goto resume_point_WHILEM6;
dad79028 4701
166ba7cd 4702 case TRIE_next:
faec1544 4703 case EVAL_AB:
40a82448
DM
4704 case BRANCH_next:
4705 case CURLYM_A:
4706 case CURLYM_B:
4707 case IFMATCH_A:
c255a977
DM
4708 case CURLY_B_max:
4709 case CURLY_B_min:
4710 case CURLY_B_min_known:
40a82448
DM
4711 if (yes_state == st)
4712 yes_state = st->u.yes.prev_yes_state;
4713 state_num = st->resume_state + 1; /* failure = success + 1 */
4714 goto reenter_switch;
dad79028 4715
95b24440
DM
4716 default:
4717 Perl_croak(aTHX_ "regexp resume memory corruption");
4718 }
95b24440 4719 }
aa283a38
DM
4720
4721final_exit:
4722
5d9a96ca
DM
4723 /* restore original high-water mark */
4724 PL_regmatch_slab = orig_slab;
4725 PL_regmatch_state = orig_state;
4726
4727 /* free all slabs above current one */
4728 if (orig_slab->next) {
c4fd8992 4729 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4730 orig_slab->next = NULL;
4731 while (sl) {
c4fd8992 4732 regmatch_slab * const osl = sl;
5d9a96ca 4733 sl = sl->next;
ad65c075 4734 Safefree(osl);
5d9a96ca
DM
4735 }
4736 }
4737
95b24440
DM
4738 return result;
4739
a687059c
LW
4740}
4741
4742/*
4743 - regrepeat - repeatedly match something simple, report how many
4744 */
4745/*
4746 * [This routine now assumes that it will only match on things of length 1.
4747 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4748 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4749 */
76e3520e 4750STATIC I32
32fc9b6a 4751S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4752{
27da23d5 4753 dVAR;
a0d0e21e 4754 register char *scan;
a0d0e21e 4755 register I32 c;
3280af22 4756 register char *loceol = PL_regeol;
a0ed51b3 4757 register I32 hardcount = 0;
53c4c00c 4758 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4759
3280af22 4760 scan = PL_reginput;
faf11cac
HS
4761 if (max == REG_INFTY)
4762 max = I32_MAX;
4763 else if (max < loceol - scan)
7f596f4c 4764 loceol = scan + max;
a0d0e21e 4765 switch (OP(p)) {
22c35a8c 4766 case REG_ANY:
1aa99e6b 4767 if (do_utf8) {
ffc61ed2 4768 loceol = PL_regeol;
1aa99e6b 4769 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4770 scan += UTF8SKIP(scan);
4771 hardcount++;
4772 }
4773 } else {
4774 while (scan < loceol && *scan != '\n')
4775 scan++;
a0ed51b3
LW
4776 }
4777 break;
ffc61ed2 4778 case SANY:
def8e4ea
JH
4779 if (do_utf8) {
4780 loceol = PL_regeol;
a0804c9e 4781 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4782 scan += UTF8SKIP(scan);
4783 hardcount++;
4784 }
4785 }
4786 else
4787 scan = loceol;
a0ed51b3 4788 break;
f33976b4
DB
4789 case CANY:
4790 scan = loceol;
4791 break;
090f7165
JH
4792 case EXACT: /* length of string is 1 */
4793 c = (U8)*STRING(p);
4794 while (scan < loceol && UCHARAT(scan) == c)
4795 scan++;
bbce6d69 4796 break;
4797 case EXACTF: /* length of string is 1 */
cd439c50 4798 c = (U8)*STRING(p);
bbce6d69 4799 while (scan < loceol &&
22c35a8c 4800 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4801 scan++;
4802 break;
4803 case EXACTFL: /* length of string is 1 */
3280af22 4804 PL_reg_flags |= RF_tainted;
cd439c50 4805 c = (U8)*STRING(p);
bbce6d69 4806 while (scan < loceol &&
22c35a8c 4807 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4808 scan++;
4809 break;
4810 case ANYOF:
ffc61ed2
JH
4811 if (do_utf8) {
4812 loceol = PL_regeol;
cfc92286 4813 while (hardcount < max && scan < loceol &&
32fc9b6a 4814 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4815 scan += UTF8SKIP(scan);
4816 hardcount++;
4817 }
4818 } else {
32fc9b6a 4819 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4820 scan++;
4821 }
a0d0e21e
LW
4822 break;
4823 case ALNUM:
1aa99e6b 4824 if (do_utf8) {
ffc61ed2 4825 loceol = PL_regeol;
1a4fad37 4826 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4827 while (hardcount < max && scan < loceol &&
3568d838 4828 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4829 scan += UTF8SKIP(scan);
4830 hardcount++;
4831 }
4832 } else {
4833 while (scan < loceol && isALNUM(*scan))
4834 scan++;
a0ed51b3
LW
4835 }
4836 break;
bbce6d69 4837 case ALNUML:
3280af22 4838 PL_reg_flags |= RF_tainted;
1aa99e6b 4839 if (do_utf8) {
ffc61ed2 4840 loceol = PL_regeol;
1aa99e6b
IH
4841 while (hardcount < max && scan < loceol &&
4842 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4843 scan += UTF8SKIP(scan);
4844 hardcount++;
4845 }
4846 } else {
4847 while (scan < loceol && isALNUM_LC(*scan))
4848 scan++;
a0ed51b3
LW
4849 }
4850 break;
a0d0e21e 4851 case NALNUM:
1aa99e6b 4852 if (do_utf8) {
ffc61ed2 4853 loceol = PL_regeol;
1a4fad37 4854 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4855 while (hardcount < max && scan < loceol &&
3568d838 4856 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4857 scan += UTF8SKIP(scan);
4858 hardcount++;
4859 }
4860 } else {
4861 while (scan < loceol && !isALNUM(*scan))
4862 scan++;
a0ed51b3
LW
4863 }
4864 break;
bbce6d69 4865 case NALNUML:
3280af22 4866 PL_reg_flags |= RF_tainted;
1aa99e6b 4867 if (do_utf8) {
ffc61ed2 4868 loceol = PL_regeol;
1aa99e6b
IH
4869 while (hardcount < max && scan < loceol &&
4870 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4871 scan += UTF8SKIP(scan);
4872 hardcount++;
4873 }
4874 } else {
4875 while (scan < loceol && !isALNUM_LC(*scan))
4876 scan++;
a0ed51b3
LW
4877 }
4878 break;
a0d0e21e 4879 case SPACE:
1aa99e6b 4880 if (do_utf8) {
ffc61ed2 4881 loceol = PL_regeol;
1a4fad37 4882 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4883 while (hardcount < max && scan < loceol &&
3568d838
JH
4884 (*scan == ' ' ||
4885 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4886 scan += UTF8SKIP(scan);
4887 hardcount++;
4888 }
4889 } else {
4890 while (scan < loceol && isSPACE(*scan))
4891 scan++;
a0ed51b3
LW
4892 }
4893 break;
bbce6d69 4894 case SPACEL:
3280af22 4895 PL_reg_flags |= RF_tainted;
1aa99e6b 4896 if (do_utf8) {
ffc61ed2 4897 loceol = PL_regeol;
1aa99e6b 4898 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4899 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4900 scan += UTF8SKIP(scan);
4901 hardcount++;
4902 }
4903 } else {
4904 while (scan < loceol && isSPACE_LC(*scan))
4905 scan++;
a0ed51b3
LW
4906 }
4907 break;
a0d0e21e 4908 case NSPACE:
1aa99e6b 4909 if (do_utf8) {
ffc61ed2 4910 loceol = PL_regeol;
1a4fad37 4911 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4912 while (hardcount < max && scan < loceol &&
3568d838
JH
4913 !(*scan == ' ' ||
4914 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4915 scan += UTF8SKIP(scan);
4916 hardcount++;
4917 }
4918 } else {
4919 while (scan < loceol && !isSPACE(*scan))
4920 scan++;
4921 break;
a0ed51b3 4922 }
bbce6d69 4923 case NSPACEL:
3280af22 4924 PL_reg_flags |= RF_tainted;
1aa99e6b 4925 if (do_utf8) {
ffc61ed2 4926 loceol = PL_regeol;
1aa99e6b 4927 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4928 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4929 scan += UTF8SKIP(scan);
4930 hardcount++;
4931 }
4932 } else {
4933 while (scan < loceol && !isSPACE_LC(*scan))
4934 scan++;
a0ed51b3
LW
4935 }
4936 break;
a0d0e21e 4937 case DIGIT:
1aa99e6b 4938 if (do_utf8) {
ffc61ed2 4939 loceol = PL_regeol;
1a4fad37 4940 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4941 while (hardcount < max && scan < loceol &&
3568d838 4942 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4943 scan += UTF8SKIP(scan);
4944 hardcount++;
4945 }
4946 } else {
4947 while (scan < loceol && isDIGIT(*scan))
4948 scan++;
a0ed51b3
LW
4949 }
4950 break;
a0d0e21e 4951 case NDIGIT:
1aa99e6b 4952 if (do_utf8) {
ffc61ed2 4953 loceol = PL_regeol;
1a4fad37 4954 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4955 while (hardcount < max && scan < loceol &&
3568d838 4956 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4957 scan += UTF8SKIP(scan);
4958 hardcount++;
4959 }
4960 } else {
4961 while (scan < loceol && !isDIGIT(*scan))
4962 scan++;
a0ed51b3
LW
4963 }
4964 break;
a0d0e21e
LW
4965 default: /* Called on something of 0 width. */
4966 break; /* So match right here or not at all. */
4967 }
a687059c 4968
a0ed51b3
LW
4969 if (hardcount)
4970 c = hardcount;
4971 else
4972 c = scan - PL_reginput;
3280af22 4973 PL_reginput = scan;
a687059c 4974
a3621e74 4975 DEBUG_r({
e68ec53f 4976 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4977 DEBUG_EXECUTE_r({
e68ec53f
YO
4978 SV * const prop = sv_newmortal();
4979 regprop(prog, prop, p);
4980 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4981 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4982 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4983 });
be8e71aa 4984 });
9041c2e3 4985
a0d0e21e 4986 return(c);
a687059c
LW
4987}
4988
c277df42 4989
be8e71aa 4990#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4991/*
ffc61ed2
JH
4992- regclass_swash - prepare the utf8 swash
4993*/
4994
4995SV *
32fc9b6a 4996Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4997{
97aff369 4998 dVAR;
9e55ce06
JH
4999 SV *sw = NULL;
5000 SV *si = NULL;
5001 SV *alt = NULL;
3dab1dad 5002 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5003
4f639d21 5004 if (data && data->count) {
a3b680e6 5005 const U32 n = ARG(node);
ffc61ed2 5006
4f639d21
DM
5007 if (data->what[n] == 's') {
5008 SV * const rv = (SV*)data->data[n];
890ce7af 5009 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5010 SV **const ary = AvARRAY(av);
9e55ce06 5011 SV **a, **b;
9041c2e3 5012
711a919c 5013 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5014 * documentation of these array elements. */
5015
b11f357e 5016 si = *ary;
8f7f7219 5017 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5018 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5019
ffc61ed2
JH
5020 if (a)
5021 sw = *a;
5022 else if (si && doinit) {
5023 sw = swash_init("utf8", "", si, 1, 0);
5024 (void)av_store(av, 1, sw);
5025 }
9e55ce06
JH
5026 if (b)
5027 alt = *b;
ffc61ed2
JH
5028 }
5029 }
5030
9e55ce06
JH
5031 if (listsvp)
5032 *listsvp = si;
5033 if (altsvp)
5034 *altsvp = alt;
ffc61ed2
JH
5035
5036 return sw;
5037}
76234dfb 5038#endif
ffc61ed2
JH
5039
5040/*
ba7b4546 5041 - reginclass - determine if a character falls into a character class
832705d4
JH
5042
5043 The n is the ANYOF regnode, the p is the target string, lenp
5044 is pointer to the maximum length of how far to go in the p
5045 (if the lenp is zero, UTF8SKIP(p) is used),
5046 do_utf8 tells whether the target string is in UTF-8.
5047
bbce6d69 5048 */
5049
76e3520e 5050STATIC bool
32fc9b6a 5051S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5052{
27da23d5 5053 dVAR;
a3b680e6 5054 const char flags = ANYOF_FLAGS(n);
bbce6d69 5055 bool match = FALSE;
cc07378b 5056 UV c = *p;
ae9ddab8 5057 STRLEN len = 0;
9e55ce06 5058 STRLEN plen;
1aa99e6b 5059
19f67299
TS
5060 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5061 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5062 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5063 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5064 if (len == (STRLEN)-1)
5065 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5066 }
bbce6d69 5067
0f0076b4 5068 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5069 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5070 if (lenp)
5071 *lenp = 0;
ffc61ed2 5072 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5073 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5074 match = TRUE;
bbce6d69 5075 }
3568d838 5076 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5077 match = TRUE;
ffc61ed2 5078 if (!match) {
9e55ce06 5079 AV *av;
32fc9b6a 5080 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5081
5082 if (sw) {
3568d838 5083 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5084 match = TRUE;
5085 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5086 if (!match && lenp && av) {
5087 I32 i;
9e55ce06 5088 for (i = 0; i <= av_len(av); i++) {
890ce7af 5089 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5090 STRLEN len;
890ce7af 5091 const char * const s = SvPV_const(sv, len);
9e55ce06 5092
061b10df 5093 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5094 *lenp = len;
5095 match = TRUE;
5096 break;
5097 }
5098 }
5099 }
5100 if (!match) {
89ebb4a3 5101 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5102 STRLEN tmplen;
5103
9e55ce06
JH
5104 to_utf8_fold(p, tmpbuf, &tmplen);
5105 if (swash_fetch(sw, tmpbuf, do_utf8))
5106 match = TRUE;
5107 }
ffc61ed2
JH
5108 }
5109 }
bbce6d69 5110 }
9e55ce06 5111 if (match && lenp && *lenp == 0)
0f0076b4 5112 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5113 }
1aa99e6b 5114 if (!match && c < 256) {
ffc61ed2
JH
5115 if (ANYOF_BITMAP_TEST(n, c))
5116 match = TRUE;
5117 else if (flags & ANYOF_FOLD) {
eb160463 5118 U8 f;
a0ed51b3 5119
ffc61ed2
JH
5120 if (flags & ANYOF_LOCALE) {
5121 PL_reg_flags |= RF_tainted;
5122 f = PL_fold_locale[c];
5123 }
5124 else
5125 f = PL_fold[c];
5126 if (f != c && ANYOF_BITMAP_TEST(n, f))
5127 match = TRUE;
5128 }
5129
5130 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5131 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5132 if (
5133 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5134 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5135 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5136 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5137 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5138 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5139 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5154 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5155 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5156 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5157 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5158 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5159 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5160 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5161 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5162 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5163 ) /* How's that for a conditional? */
5164 {
5165 match = TRUE;
5166 }
a0ed51b3 5167 }
a0ed51b3
LW
5168 }
5169
a0ed51b3
LW
5170 return (flags & ANYOF_INVERT) ? !match : match;
5171}
161b471a 5172
dfe13c55 5173STATIC U8 *
0ce71af7 5174S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5175{
97aff369 5176 dVAR;
a0ed51b3 5177 if (off >= 0) {
1aa99e6b 5178 while (off-- && s < lim) {
ffc61ed2 5179 /* XXX could check well-formedness here */
a0ed51b3 5180 s += UTF8SKIP(s);
ffc61ed2 5181 }
a0ed51b3
LW
5182 }
5183 else {
1de06328
YO
5184 while (off++ && s > lim) {
5185 s--;
5186 if (UTF8_IS_CONTINUED(*s)) {
5187 while (s > lim && UTF8_IS_CONTINUATION(*s))
5188 s--;
a0ed51b3 5189 }
1de06328 5190 /* XXX could check well-formedness here */
a0ed51b3
LW
5191 }
5192 }
5193 return s;
5194}
161b471a 5195
dfe13c55 5196STATIC U8 *
1de06328
YO
5197S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5198{
5199 dVAR;
5200 if (off >= 0) {
5201 while (off-- && s < rlim) {
5202 /* XXX could check well-formedness here */
5203 s += UTF8SKIP(s);
5204 }
5205 }
5206 else {
5207 while (off++ && s > llim) {
5208 s--;
5209 if (UTF8_IS_CONTINUED(*s)) {
5210 while (s > llim && UTF8_IS_CONTINUATION(*s))
5211 s--;
5212 }
5213 /* XXX could check well-formedness here */
5214 }
5215 }
5216 return s;
5217}
5218
5219
5220STATIC U8 *
0ce71af7 5221S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5222{
97aff369 5223 dVAR;
a0ed51b3 5224 if (off >= 0) {
1aa99e6b 5225 while (off-- && s < lim) {
ffc61ed2 5226 /* XXX could check well-formedness here */
a0ed51b3 5227 s += UTF8SKIP(s);
ffc61ed2 5228 }
a0ed51b3 5229 if (off >= 0)
3dab1dad 5230 return NULL;
a0ed51b3
LW
5231 }
5232 else {
1de06328
YO
5233 while (off++ && s > lim) {
5234 s--;
5235 if (UTF8_IS_CONTINUED(*s)) {
5236 while (s > lim && UTF8_IS_CONTINUATION(*s))
5237 s--;
a0ed51b3 5238 }
1de06328 5239 /* XXX could check well-formedness here */
a0ed51b3
LW
5240 }
5241 if (off <= 0)
3dab1dad 5242 return NULL;
a0ed51b3
LW
5243 }
5244 return s;
5245}
51371543 5246
51371543 5247static void
acfe0abc 5248restore_pos(pTHX_ void *arg)
51371543 5249{
97aff369 5250 dVAR;
097eb12c 5251 regexp * const rex = (regexp *)arg;
51371543
GS
5252 if (PL_reg_eval_set) {
5253 if (PL_reg_oldsaved) {
4f639d21
DM
5254 rex->subbeg = PL_reg_oldsaved;
5255 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5256#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5257 rex->saved_copy = PL_nrs;
ed252734 5258#endif
4f639d21 5259 RX_MATCH_COPIED_on(rex);
51371543
GS
5260 }
5261 PL_reg_magic->mg_len = PL_reg_oldpos;
5262 PL_reg_eval_set = 0;
5263 PL_curpm = PL_reg_oldcurpm;
5264 }
5265}
33b8afdf
JH
5266
5267STATIC void
5268S_to_utf8_substr(pTHX_ register regexp *prog)
5269{
33b8afdf 5270 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5271 SV* const sv = newSVsv(prog->float_substr);
5272 prog->float_utf8 = sv;
33b8afdf
JH
5273 sv_utf8_upgrade(sv);
5274 if (SvTAIL(prog->float_substr))
5275 SvTAIL_on(sv);
5276 if (prog->float_substr == prog->check_substr)
5277 prog->check_utf8 = sv;
5278 }
5279 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5280 SV* const sv = newSVsv(prog->anchored_substr);
5281 prog->anchored_utf8 = sv;
33b8afdf
JH
5282 sv_utf8_upgrade(sv);
5283 if (SvTAIL(prog->anchored_substr))
5284 SvTAIL_on(sv);
5285 if (prog->anchored_substr == prog->check_substr)
5286 prog->check_utf8 = sv;
5287 }
5288}
5289
5290STATIC void
5291S_to_byte_substr(pTHX_ register regexp *prog)
5292{
97aff369 5293 dVAR;
33b8afdf 5294 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5295 SV* sv = newSVsv(prog->float_utf8);
5296 prog->float_substr = sv;
33b8afdf
JH
5297 if (sv_utf8_downgrade(sv, TRUE)) {
5298 if (SvTAIL(prog->float_utf8))
5299 SvTAIL_on(sv);
5300 } else {
5301 SvREFCNT_dec(sv);
5302 prog->float_substr = sv = &PL_sv_undef;
5303 }
5304 if (prog->float_utf8 == prog->check_utf8)
5305 prog->check_substr = sv;
5306 }
5307 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5308 SV* sv = newSVsv(prog->anchored_utf8);
5309 prog->anchored_substr = sv;
33b8afdf
JH
5310 if (sv_utf8_downgrade(sv, TRUE)) {
5311 if (SvTAIL(prog->anchored_utf8))
5312 SvTAIL_on(sv);
5313 } else {
5314 SvREFCNT_dec(sv);
5315 prog->anchored_substr = sv = &PL_sv_undef;
5316 }
5317 if (prog->anchored_utf8 == prog->check_utf8)
5318 prog->check_substr = sv;
5319 }
5320}
66610fdd
RGS
5321
5322/*
5323 * Local variables:
5324 * c-indentation-style: bsd
5325 * c-basic-offset: 4
5326 * indent-tabs-mode: t
5327 * End:
5328 *
37442d52
RGS
5329 * ex: set ts=8 sts=4 sw=4 noet:
5330 */