This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
migrate CURLYX/WHILEM branch in regmatch() to new FSM-esque paradigm
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
c476f425 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
6bda09f9 169#define REGCP_OTHER_ELEMS 8
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
3280af22
NIS
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
a01268b5 189 SSPUSHINT(*PL_reglastcloseparen);
3280af22 190 SSPUSHPTR(PL_reginput);
41123dfd
JH
191#define REGCP_FRAME_ELEMS 2
192/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 196
a0d0e21e
LW
197 return retval;
198}
199
c277df42 200/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
201#define REGCP_SET(cp) \
202 DEBUG_STATE_r( \
ab3bbdeb 203 PerlIO_printf(Perl_debug_log, \
e4f74956 204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
205 (IV)PL_savestack_ix)); \
206 cp = PL_savestack_ix
c3464db5 207
ab3bbdeb 208#define REGCP_UNWIND(cp) \
e4f74956 209 DEBUG_STATE_r( \
ab3bbdeb 210 if (cp != PL_savestack_ix) \
e4f74956
YO
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
213 (IV)(cp), (IV)PL_savestack_ix)); \
214 regcpblow(cp)
c277df42 215
76e3520e 216STATIC char *
097eb12c 217S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 218{
97aff369 219 dVAR;
b1ce53c5 220 I32 i;
a0d0e21e 221 char *input;
b1ce53c5 222
a3621e74
YO
223 GET_RE_DEBUG_FLAGS_DECL;
224
b1ce53c5 225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 226 i = SSPOPINT;
b1ce53c5
JH
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 229 input = (char *) SSPOPPTR;
a01268b5 230 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
6bda09f9
YO
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 235
6bda09f9 236
b1ce53c5 237 /* Now restore the parentheses context. */
41123dfd
JH
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 240 I32 tmps;
097eb12c 241 U32 paren = (U32)SSPOPINT;
3280af22 242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
243 PL_regstartp[paren] = SSPOPINT;
244 tmps = SSPOPINT;
3280af22
NIS
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
a3621e74 247 DEBUG_EXECUTE_r(
c3464db5 248 PerlIO_printf(Perl_debug_log,
b900a521 249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 250 (UV)paren, (IV)PL_regstartp[paren],
b900a521 251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 252 (IV)PL_regendp[paren],
3280af22 253 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 254 );
a0d0e21e 255 }
a3621e74 256 DEBUG_EXECUTE_r(
bb7a0f54 257 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 258 PerlIO_printf(Perl_debug_log,
faccc32b 259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
261 }
262 );
daf18116 263#if 1
dafc8851
JH
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
273 * --jhi */
bb7a0f54 274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
275 if (i > PL_regsize)
276 PL_regstartp[i] = -1;
277 PL_regendp[i] = -1;
a0d0e21e 278 }
dafc8851 279#endif
a0d0e21e
LW
280 return input;
281}
282
02db2b7b 283#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 284
a687059c 285/*
e50aee73 286 * pregexec and friends
a687059c
LW
287 */
288
76234dfb 289#ifndef PERL_IN_XSUB_RE
a687059c 290/*
c277df42 291 - pregexec - match a regexp against a string
a687059c 292 */
c277df42 293I32
864dbfa3 294Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
296/* strend: pointer to null at end of string */
297/* strbeg: real beginning of string */
298/* minend: end of match must be >=minend after stringarg. */
299/* nosave: For optimizations. */
300{
301 return
9041c2e3 302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
303 nosave ? 0 : REXEC_COPY_STR);
304}
76234dfb 305#endif
22e551b9 306
9041c2e3 307/*
cad2e5aa
JH
308 * Need to implement the following flags for reg_anch:
309 *
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
311 * USE_INTUIT_ML
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
315 * INTUIT_ONCE_ML
316 *
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
319 */
320
321/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
322
3f7c398e 323/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
325
326/* XXXX We assume that strpos is strbeg unless sv. */
327
6eb5f6b9
JH
328/* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
334
2c2d71f5
JH
335/* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
30944b6d
IZ
338 regtry() should be needed.
339
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
342 b) Fixed substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 345 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
346 string which does not contradict any of them.
347 */
2c2d71f5 348
6eb5f6b9
JH
349/* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
352
cad2e5aa
JH
353char *
354Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
356{
97aff369 357 dVAR;
b7953727 358 register I32 start_shift = 0;
cad2e5aa 359 /* Should be nonnegative! */
b7953727 360 register I32 end_shift = 0;
2c2d71f5
JH
361 register char *s;
362 register SV *check;
a1933d95 363 char *strbeg;
cad2e5aa 364 char *t;
1de06328 365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 366 I32 ml_anch;
bd61b366
SS
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
1df70142 369 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 370#ifdef DEBUGGING
890ce7af 371 const char * const i_strpos = strpos;
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
379 PL_reg_flags |= RF_utf8;
380 }
ab3bbdeb
YO
381 DEBUG_EXECUTE_r(
382 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
2a782b5b 385 );
cad2e5aa 386
c344f387
JH
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
a3621e74 389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 390 "String too short... [re_intuit_start]\n"));
cad2e5aa 391 goto fail;
2c2d71f5 392 }
1de06328 393
a1933d95 394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 395 PL_regeol = strend;
33b8afdf
JH
396 if (do_utf8) {
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
400 } else {
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
404 }
1de06328 405 if (check == &PL_sv_undef) {
a3621e74 406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 407 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
408 goto fail;
409 }
2c2d71f5 410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 413 && !multiline ) ); /* Check after \n? */
cad2e5aa 414
7e25d62c
JH
415 if (!ml_anch) {
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
419 && sv && !SvROK(sv)
420 && (strpos != strbeg)) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
422 goto fail;
423 }
424 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 425 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 426 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
427 I32 slen;
428
1aa99e6b 429 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 430
653099ff
GS
431 if (SvTAIL(check)) {
432 slen = SvCUR(check); /* >= 1 */
cad2e5aa 433
9041c2e3 434 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 435 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 437 goto fail_finish;
cad2e5aa
JH
438 }
439 /* Now should match s[0..slen-2] */
440 slen--;
3f7c398e 441 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 442 || (slen > 1
3f7c398e 443 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 444 report_neq:
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
446 goto fail_finish;
447 }
cad2e5aa 448 }
3f7c398e 449 else if (*SvPVX_const(check) != *s
653099ff 450 || ((slen = SvCUR(check)) > 1
3f7c398e 451 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 452 goto report_neq;
c315bfe8 453 check_at = s;
2c2d71f5 454 goto success_at_start;
7e25d62c 455 }
cad2e5aa 456 }
2c2d71f5 457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 458 s = strpos;
2c2d71f5 459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
460 end_shift = prog->check_end_shift;
461
2c2d71f5 462 if (!ml_anch) {
a3b680e6 463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 464 - (SvTAIL(check) != 0);
a3b680e6 465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
466
467 if (end_shift < eshift)
468 end_shift = eshift;
469 }
cad2e5aa 470 }
2c2d71f5 471 else { /* Can match at random position */
cad2e5aa
JH
472 ml_anch = 0;
473 s = strpos;
1de06328
YO
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
476
477 /* end shift should be non negative here */
cad2e5aa
JH
478 }
479
2c2d71f5 480#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 481 if (end_shift < 0)
1de06328 482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 483 (IV)end_shift, prog->precomp);
2c2d71f5
JH
484#endif
485
2c2d71f5
JH
486 restart:
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
489
490 {
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
496 }
6bda09f9 497 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
501 (IV)srch_end_shift,
502 (IV)prog->check_end_shift);
503 });
504
cad2e5aa 505 if (flags & REXEC_SCREAM) {
cad2e5aa 506 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 507 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 508
2c2d71f5
JH
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
512 && SvTAIL(check) ))
9041c2e3 513 s = screaminstr(sv, check,
1de06328 514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 515 else
2c2d71f5 516 goto fail_finish;
4addbd3b
HS
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
3f7c398e 519 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
520 if (data)
521 *data->scream_olds = s;
522 }
1de06328
YO
523 else {
524 U8* start_point;
525 U8* end_point;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
529 } else {
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
532 }
6bda09f9 533 DEBUG_OPTIMISE_MORE_r({
56570a2c 534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 535 (int)(end_point - start_point),
fc8cd66c 536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
537 start_point);
538 });
539
540 s = fbm_instr( start_point, end_point,
7fba1cd6 541 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
542 }
543 }
cad2e5aa
JH
544 /* Update the count-of-usability, remove useless subpatterns,
545 unshift s. */
2c2d71f5 546
ab3bbdeb
YO
547 DEBUG_EXECUTE_r({
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 551 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
554 quoted,
555 RE_SV_TAIL(check),
556 (s ? " at offset " : "...\n") );
557 });
2c2d71f5
JH
558
559 if (!s)
560 goto fail_finish;
2c2d71f5 561 /* Finish the diagnostic message */
a3621e74 562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 563
1de06328
YO
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
567 point. I think. :-(
568 */
569
570
571
572 check_at=s;
573
574
2c2d71f5
JH
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
581 */
582
1de06328
YO
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
585 {
30944b6d 586 /* Take into account the "other" substring. */
2c2d71f5
JH
587 /* XXXX May be hopelessly wrong for UTF... */
588 if (!other_last)
6eb5f6b9 589 other_last = strpos;
33b8afdf 590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
591 do_other_anchored:
592 {
890ce7af
AL
593 char * const last = HOP3c(s, -start_shift, strbeg);
594 char *last1, *last2;
be8e71aa 595 char * const saved_s = s;
33b8afdf 596 SV* must;
2c2d71f5 597
2c2d71f5
JH
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 600 && (!do_utf8
0ce71af7 601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 602 && t > strpos)))
6f207bd3 603 NOOP;
2c2d71f5
JH
604 else
605 t = strpos;
1aa99e6b 606 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
607 if (t < other_last) /* These positions already checked */
608 t = other_last;
1aa99e6b 609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
610 if (last < last1)
611 last1 = last;
1de06328
YO
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
615 dmq.
616 */
617
2c2d71f5 618 /* On end-of-str: see comment below. */
33b8afdf
JH
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
621 s = (char*)NULL;
1de06328 622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
623 }
624 else
625 s = fbm_instr(
626 (unsigned char*)t,
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
629 must,
7fba1cd6 630 multiline ? FBMrf_MULTILINE : 0
33b8afdf 631 );
ab3bbdeb
YO
632 DEBUG_EXECUTE_r({
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 636 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
637 quoted, RE_SV_TAIL(must));
638 });
639
640
2c2d71f5
JH
641 if (!s) {
642 if (last1 >= last2) {
a3621e74 643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
644 ", giving up...\n"));
645 goto fail_finish;
646 }
a3621e74 647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 648 ", trying floating at offset %ld...\n",
be8e71aa 649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
2c2d71f5
JH
652 goto restart;
653 }
654 else {
a3621e74 655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 656 (long)(s - i_strpos)));
1aa99e6b
IH
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
be8e71aa 659 s = saved_s;
2c2d71f5
JH
660 if (t == strpos)
661 goto try_at_start;
2c2d71f5
JH
662 goto try_at_offset;
663 }
30944b6d 664 }
2c2d71f5
JH
665 }
666 else { /* Take into account the floating substring. */
33b8afdf 667 char *last, *last1;
be8e71aa 668 char * const saved_s = s;
33b8afdf
JH
669 SV* must;
670
671 t = HOP3c(s, -start_shift, strbeg);
672 last1 = last =
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
677 if (s < other_last)
678 s = other_last;
2c2d71f5 679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
685 s = (char*)NULL;
1de06328 686 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
687 }
688 else
2c2d71f5 689 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
690 (unsigned char*)last + SvCUR(must)
691 - (SvTAIL(must)!=0),
7fba1cd6 692 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
693 DEBUG_EXECUTE_r({
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 697 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
698 quoted, RE_SV_TAIL(must));
699 });
33b8afdf
JH
700 if (!s) {
701 if (last1 == last) {
a3621e74 702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
703 ", giving up...\n"));
704 goto fail_finish;
2c2d71f5 705 }
a3621e74 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 707 ", trying anchored starting at offset %ld...\n",
be8e71aa 708 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
709 other_last = last;
710 s = HOP3c(t, 1, strend);
711 goto restart;
712 }
713 else {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
be8e71aa 717 s = saved_s;
33b8afdf
JH
718 if (t == strpos)
719 goto try_at_start;
720 goto try_at_offset;
721 }
2c2d71f5 722 }
cad2e5aa 723 }
2c2d71f5 724
1de06328 725
9ef43ace 726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 727
6bda09f9 728 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
733 (IV)(s-strpos),
734 (IV)(t-strpos),
735 (IV)(t-s),
736 (IV)(strend-strpos)
737 )
738 );
739
2c2d71f5 740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 741 && (!do_utf8
9ef43ace 742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
743 && t > strpos)))
744 {
2c2d71f5
JH
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
747 try_at_offset:
cad2e5aa 748 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
755 find_anchor:
2c2d71f5 756 while (t < strend - prog->minlen) {
cad2e5aa 757 if (*t == '\n') {
4ee3650e 758 if (t < check_at - prog->check_offset_min) {
33b8afdf 759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
30944b6d
IZ
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
766 */
9041c2e3 767 strpos = t + 1;
a3621e74 768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
770 goto do_other_anchored;
771 }
4ee3650e
GS
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
cad2e5aa 774 s = t + 1;
a3621e74 775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
777 goto set_useful;
778 }
4ee3650e
GS
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
a3621e74 782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 784 other_last = strpos = s = t + 1;
cad2e5aa
JH
785 goto restart;
786 }
787 t++;
788 }
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 790 PL_colors[0], PL_colors[1]));
2c2d71f5 791 goto fail_finish;
cad2e5aa 792 }
f5952150 793 else {
a3621e74 794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 795 PL_colors[0], PL_colors[1]));
f5952150 796 }
cad2e5aa
JH
797 s = t;
798 set_useful:
33b8afdf 799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
800 }
801 else {
f5952150 802 /* The found string does not prohibit matching at strpos,
2c2d71f5 803 - no optimization of calling REx engine can be performed,
f5952150
GS
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
2c2d71f5
JH
806 try_at_start:
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
05b4157f 809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 810 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
813 {
cad2e5aa
JH
814 t = strpos;
815 goto find_anchor;
816 }
a3621e74 817 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 820 );
2c2d71f5 821 success_at_start:
30944b6d 822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
823 && (do_utf8 ? (
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
827 ) : (
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
831 )))
66e933ab 832 {
cad2e5aa 833 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
cad2e5aa 841 s = strpos;
3cf5c195
IZ
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
6eb5f6b9 844 other heuristics. */
cad2e5aa
JH
845 prog->reganch &= ~RE_USE_INTUIT;
846 }
847 else
848 s = strpos;
849 }
850
6eb5f6b9
JH
851 /* Last resort... */
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
786e8c11 855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 864 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 867 : 1);
1de06328
YO
868 char * endpos;
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
873 else
874 endpos= strend;
875
56570a2c 876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 878
6eb5f6b9 879 t = s;
3b0527fe 880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
881 if (!s) {
882#ifdef DEBUGGING
cbbf8932 883 const char *what = NULL;
6eb5f6b9
JH
884#endif
885 if (endpos == strend) {
a3621e74 886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
887 "Could not match STCLASS...\n") );
888 goto fail;
889 }
a3621e74 890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 891 "This position contradicts STCLASS...\n") );
653099ff
GS
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
893 goto fail;
6eb5f6b9 894 /* Contradict one of substrings */
33b8afdf
JH
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 897 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 898 hop_and_restart:
1aa99e6b 899 s = HOP3c(t, 1, strend);
66e933ab
GS
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
a3621e74 902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
903 "Could not match STCLASS...\n") );
904 goto fail;
905 }
5e39e1e5
HS
906 if (!check)
907 goto giveup;
a3621e74 908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 909 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
910 what, (long)(s + start_shift - i_strpos)) );
911 goto restart;
912 }
66e933ab 913 /* Have both, check_string is floating */
6eb5f6b9
JH
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
9041c2e3 917 s = check_at;
5e39e1e5
HS
918 if (!check)
919 goto giveup;
a3621e74 920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 921 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
924 }
60e71179
GS
925 /* Another way we could have checked stclass at the
926 current position only: */
927 if (ml_anch) {
928 s = t = t + 1;
5e39e1e5
HS
929 if (!check)
930 goto giveup;
a3621e74 931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 932 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 934 goto try_at_offset;
66e933ab 935 }
33b8afdf 936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 937 goto fail;
6eb5f6b9
JH
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
a3621e74 941 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
942 goto hop_and_restart;
943 }
b7953727 944 if (t != s) {
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 946 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
947 (long)(t - i_strpos), (long)(s - i_strpos))
948 );
949 }
950 else {
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
952 "Does not contradict STCLASS...\n");
953 );
954 }
6eb5f6b9 955 }
5e39e1e5 956 giveup:
a3621e74 957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 960 return s;
2c2d71f5
JH
961
962 fail_finish: /* Substring not found */
33b8afdf
JH
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 965 fail:
a3621e74 966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 967 PL_colors[4], PL_colors[5]));
bd61b366 968 return NULL;
cad2e5aa 969}
9661b544 970
786e8c11 971
3b0527fe 972
4cadc6a9
YO
973#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
977 if ( foldlen>0 ) { \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
979 foldlen -= len; \
980 uscan += len; \
981 len=0; \
982 } else { \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
987 } \
988 break; \
989 case trie_utf8: \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
991 break; \
992 case trie_plain: \
993 uvc = (UV)*uc; \
994 len = 1; \
995 } \
996 \
997 if (uvc < 256) { \
998 charid = trie->charmap[ uvc ]; \
999 } \
1000 else { \
1001 charid = 0; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1005 if (svpp) \
1006 charid = (U16)SvIV(*svpp); \
1007 } \
1008 } \
1009} STMT_END
1010
1011#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1012 if ( (CoNd) \
1013 && (ln == len || \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1017 goto got_it; \
1018 else { \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1022 if ( f != c \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1027 m, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1030 goto got_it; \
1031 } \
1032 s += len
1033
1034#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1035STMT_START { \
1036 while (s <= e) { \
1037 if ( (CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1039 ? ibcmp(s, m, ln) \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1042 goto got_it; \
1043 s++; \
1044 } \
1045} STMT_END
1046
1047#define REXEC_FBC_UTF8_SCAN(CoDe) \
1048STMT_START { \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1050 CoDe \
1051 s += uskip; \
1052 } \
1053} STMT_END
1054
1055#define REXEC_FBC_SCAN(CoDe) \
1056STMT_START { \
1057 while (s < strend) { \
1058 CoDe \
1059 s++; \
1060 } \
1061} STMT_END
1062
1063#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064REXEC_FBC_UTF8_SCAN( \
1065 if (CoNd) { \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1067 goto got_it; \
1068 else \
1069 tmp = doevery; \
1070 } \
1071 else \
1072 tmp = 1; \
1073)
1074
1075#define REXEC_FBC_CLASS_SCAN(CoNd) \
1076REXEC_FBC_SCAN( \
1077 if (CoNd) { \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1079 goto got_it; \
1080 else \
1081 tmp = doevery; \
1082 } \
1083 else \
1084 tmp = 1; \
1085)
1086
1087#define REXEC_FBC_TRYIT \
1088if ((!reginfo || regtry(reginfo, s))) \
1089 goto got_it
1090
1091#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1092 if (do_utf8) { \
1093 UtFpReLoAd; \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1095 } \
1096 else { \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1098 } \
1099 break
1100
1101#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1103 if (do_utf8) { \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1105 } \
1106 else { \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1108 } \
1109 break
1110
786e8c11
YO
1111#define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1113
1114/* We know what class REx starts with. Try to find this position... */
1115/* if reginfo is NULL, its a dryrun */
1116/* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1118
3c3eec57 1119STATIC char *
07be1b83
YO
1120S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
a687059c 1122{
27da23d5 1123 dVAR;
1df70142 1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1125 char *m;
d8093b23 1126 STRLEN ln;
5dab1207 1127 STRLEN lnc;
078c425b 1128 register STRLEN uskip;
d8093b23
G
1129 unsigned int c1;
1130 unsigned int c2;
6eb5f6b9
JH
1131 char *e;
1132 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1133 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1134
6eb5f6b9
JH
1135 /* We know what class it must start with. */
1136 switch (OP(c)) {
6eb5f6b9 1137 case ANYOF:
388cc4de 1138 if (do_utf8) {
4cadc6a9 1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1142 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1143 }
1144 else {
1145 while (s < strend) {
1146 STRLEN skip = 1;
1147
32fc9b6a 1148 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
3b0527fe 1153 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1154 goto got_it;
1155 else
1156 tmp = doevery;
1157 }
1158 else
1159 tmp = 1;
1160 s += skip;
1161 }
a0d0e21e 1162 }
6eb5f6b9 1163 break;
f33976b4 1164 case CANY:
4cadc6a9 1165 REXEC_FBC_SCAN(
3b0527fe 1166 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1167 goto got_it;
1168 else
1169 tmp = doevery;
4cadc6a9 1170 );
f33976b4 1171 break;
6eb5f6b9 1172 case EXACTF:
5dab1207
NIS
1173 m = STRING(c);
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1176 if (UTF) {
a2a2844f 1177 STRLEN ulen1, ulen2;
5dab1207 1178 U8 *sm = (U8 *) m;
89ebb4a3
JH
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1182
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1185
89ebb4a3 1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1187 0, uniflags);
89ebb4a3 1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1189 0, uniflags);
5dab1207
NIS
1190 lnc = 0;
1191 while (sm < ((U8 *) m + ln)) {
1192 lnc++;
1193 sm += UTF8SKIP(sm);
1194 }
1aa99e6b
IH
1195 }
1196 else {
1197 c1 = *(U8*)m;
1198 c2 = PL_fold[c1];
1199 }
6eb5f6b9
JH
1200 goto do_exactf;
1201 case EXACTFL:
5dab1207
NIS
1202 m = STRING(c);
1203 ln = STR_LEN(c);
1204 lnc = (I32) ln;
d8093b23 1205 c1 = *(U8*)m;
6eb5f6b9
JH
1206 c2 = PL_fold_locale[c1];
1207 do_exactf:
db12adc6 1208 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1209
3b0527fe 1210 if (!reginfo && e < s)
6eb5f6b9 1211 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1212
60a8b682
JH
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
7f16dd3d
JH
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
60a8b682 1223
1aa99e6b 1224 if (do_utf8) {
575cac57 1225 UV c, f;
89ebb4a3 1226 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1227 STRLEN len, foldlen;
4ad0818d 1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1229 if (c1 == c2) {
5dab1207
NIS
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1aa99e6b 1232 while (s <= e) {
89ebb4a3 1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1234 uniflags);
4cadc6a9 1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1236 }
09091399
JH
1237 }
1238 else {
1aa99e6b 1239 while (s <= e) {
89ebb4a3 1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1241 uniflags);
80aecb99 1242
60a8b682 1243 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
880bd946
JH
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1253
4cadc6a9 1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1255 }
09091399 1256 }
1aa99e6b
IH
1257 }
1258 else {
1259 if (c1 == c2)
4cadc6a9 1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1261 else
4cadc6a9 1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1263 }
1264 break;
bbce6d69 1265 case BOUNDL:
3280af22 1266 PL_reg_flags |= RF_tainted;
bbce6d69 1267 /* FALL THROUGH */
a0d0e21e 1268 case BOUND:
ffc61ed2 1269 if (do_utf8) {
12d33761 1270 if (s == PL_bostr)
ffc61ed2
JH
1271 tmp = '\n';
1272 else {
6136c704 1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1275 }
1276 tmp = ((OP(c) == BOUND ?
9041c2e3 1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1278 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1279 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1280 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1282 isALNUM_LC_utf8((U8*)s)))
1283 {
1284 tmp = !tmp;
4cadc6a9 1285 REXEC_FBC_TRYIT;
a687059c 1286 }
4cadc6a9 1287 );
a0d0e21e 1288 }
667bb95a 1289 else {
12d33761 1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1292 REXEC_FBC_SCAN(
ffc61ed2
JH
1293 if (tmp ==
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1295 tmp = !tmp;
4cadc6a9 1296 REXEC_FBC_TRYIT;
a0ed51b3 1297 }
4cadc6a9 1298 );
a0ed51b3 1299 }
3b0527fe 1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1301 goto got_it;
1302 break;
bbce6d69 1303 case NBOUNDL:
3280af22 1304 PL_reg_flags |= RF_tainted;
bbce6d69 1305 /* FALL THROUGH */
a0d0e21e 1306 case NBOUND:
ffc61ed2 1307 if (do_utf8) {
12d33761 1308 if (s == PL_bostr)
ffc61ed2
JH
1309 tmp = '\n';
1310 else {
6136c704 1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1313 }
1314 tmp = ((OP(c) == NBOUND ?
9041c2e3 1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1316 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1317 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1318 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1320 isALNUM_LC_utf8((U8*)s)))
1321 tmp = !tmp;
4cadc6a9
YO
1322 else REXEC_FBC_TRYIT;
1323 );
a0d0e21e 1324 }
667bb95a 1325 else {
12d33761 1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1329 REXEC_FBC_SCAN(
ffc61ed2
JH
1330 if (tmp ==
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1332 tmp = !tmp;
4cadc6a9
YO
1333 else REXEC_FBC_TRYIT;
1334 );
a0ed51b3 1335 }
3b0527fe 1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1337 goto got_it;
1338 break;
a0d0e21e 1339 case ALNUM:
4cadc6a9
YO
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1343 isALNUM(*s)
1344 );
bbce6d69 1345 case ALNUML:
4cadc6a9
YO
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1348 isALNUM_LC(*s)
1349 );
a0d0e21e 1350 case NALNUM:
4cadc6a9
YO
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1354 !isALNUM(*s)
1355 );
bbce6d69 1356 case NALNUML:
4cadc6a9
YO
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1359 !isALNUM_LC(*s)
1360 );
a0d0e21e 1361 case SPACE:
4cadc6a9
YO
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1365 isSPACE(*s)
1366 );
bbce6d69 1367 case SPACEL:
4cadc6a9
YO
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1370 isSPACE_LC(*s)
1371 );
a0d0e21e 1372 case NSPACE:
4cadc6a9
YO
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1376 !isSPACE(*s)
1377 );
bbce6d69 1378 case NSPACEL:
4cadc6a9
YO
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1381 !isSPACE_LC(*s)
1382 );
a0d0e21e 1383 case DIGIT:
4cadc6a9
YO
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1387 isDIGIT(*s)
1388 );
b8c5462f 1389 case DIGITL:
4cadc6a9
YO
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1392 isDIGIT_LC(*s)
1393 );
a0d0e21e 1394 case NDIGIT:
4cadc6a9
YO
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1398 !isDIGIT(*s)
1399 );
b8c5462f 1400 case NDIGITL:
4cadc6a9
YO
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1403 !isDIGIT_LC(*s)
1404 );
1de06328
YO
1405 case AHOCORASICKC:
1406 case AHOCORASICK:
07be1b83
YO
1407 {
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1411 : trie_plain;
1412 /* what trie are we using right now */
1413 reg_ac_data *aho
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1416
1417 const char *last_start = strend - trie->minlen;
6148ee25 1418#ifdef DEBUGGING
07be1b83 1419 const char *real_start = s;
6148ee25 1420#endif
07be1b83 1421 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1422 SV *sv_points;
1423 U8 **points; /* map of where we were in the input string
786e8c11 1424 when reading a given char. For ASCII this
be8e71aa
YO
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
f9e705e8 1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1429 U8 *bitmap=NULL;
1430
07be1b83
YO
1431
1432 GET_RE_DEBUG_FLAGS_DECL;
1433
be8e71aa
YO
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1437 ENTER;
1438 SAVETMPS;
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1447 {
786e8c11
YO
1448 if (trie->bitmap)
1449 bitmap=(U8*)trie->bitmap;
1450 else
1451 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1452 }
786e8c11
YO
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1459 starting char.
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1467 if we have not.
1468
1469 */
07be1b83
YO
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1472 U8 *uc = (U8*)s;
1473 U16 charid = 0;
1474 U32 base = 1;
1475 U32 state = 1;
1476 UV uvc = 0;
1477 STRLEN len = 0;
1478 STRLEN foldlen = 0;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
786e8c11
YO
1481#ifdef DEBUGGING
1482 U32 accepted_word= 0;
1483#endif
07be1b83
YO
1484 U32 pointpos = 0;
1485
1486 while ( state && uc <= (U8*)strend ) {
1487 int failed=0;
786e8c11
YO
1488 U32 word = aho->states[ state ].wordnum;
1489
1de06328
YO
1490 if( state==1 ) {
1491 if ( bitmap ) {
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1498 }
1499 );
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1501 uc++;
786e8c11 1502 }
1de06328 1503 s= (char *)uc;
786e8c11 1504 }
786e8c11
YO
1505 if (uc >(U8*)last_start) break;
1506 }
1507
1508 if ( word ) {
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
07be1b83 1512 leftmost= lpos;
786e8c11 1513 }
07be1b83 1514 if (base==0) break;
786e8c11 1515
07be1b83
YO
1516 }
1517 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 s, do_utf8 );
07be1b83 1523 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1524 " Charid:%3u CP:%4"UVxf" ",
1525 charid, uvc);
1526 });
07be1b83
YO
1527
1528 do {
6148ee25 1529#ifdef DEBUGGING
786e8c11 1530 word = aho->states[ state ].wordnum;
6148ee25 1531#endif
07be1b83
YO
1532 base = aho->states[ state ].trans.base;
1533
786e8c11
YO
1534 DEBUG_TRIE_EXECUTE_r({
1535 if (failed)
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1537 s, do_utf8 );
07be1b83 1538 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1542 });
07be1b83
YO
1543 if ( base ) {
1544 U32 tmp;
1545 if (charid &&
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1548 < trie->lasttrans)
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1553 {
786e8c11
YO
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1556 state = tmp;
1557 break;
1558 }
1559 else {
786e8c11
YO
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1562 failed = 1;
1563 state = aho->fail[state];
07be1b83
YO
1564 }
1565 }
1566 else {
1567 /* we must be accepting here */
786e8c11
YO
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1570 failed = 1;
07be1b83
YO
1571 break;
1572 }
1573 } while(state);
786e8c11 1574 uc += len;
07be1b83
YO
1575 if (failed) {
1576 if (leftmost)
1577 break;
786e8c11 1578 if (!state) state = 1;
07be1b83
YO
1579 }
1580 }
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1585 leftmost = lpos;
786e8c11 1586 }
07be1b83 1587 }
07be1b83
YO
1588 if (leftmost) {
1589 s = (char*)leftmost;
786e8c11
YO
1590 DEBUG_TRIE_EXECUTE_r({
1591 PerlIO_printf(
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1594 );
1595 });
be8e71aa
YO
1596 if (!reginfo || regtry(reginfo, s)) {
1597 FREETMPS;
1598 LEAVE;
07be1b83 1599 goto got_it;
be8e71aa 1600 }
07be1b83 1601 s = HOPc(s,1);
786e8c11
YO
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1604 });
07be1b83 1605 } else {
786e8c11
YO
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1608 break;
1609 }
1610 }
be8e71aa
YO
1611 FREETMPS;
1612 LEAVE;
07be1b83
YO
1613 }
1614 break;
b3c9acc1 1615 default:
3c3eec57
GS
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1617 break;
d6a28714 1618 }
6eb5f6b9
JH
1619 return 0;
1620 got_it:
1621 return s;
1622}
1623
1624/*
1625 - regexec_flags - match a regexp against a string
1626 */
1627I32
1628Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630/* strend: pointer to null at end of string */
1631/* strbeg: real beginning of string */
1632/* minend: end of match must be >=minend after stringarg. */
1633/* data: May be used for some additional optimizations. */
1634/* nosave: For optimizations. */
1635{
97aff369 1636 dVAR;
6eb5f6b9
JH
1637 register char *s;
1638 register regnode *c;
1639 register char *startpos = stringarg;
6eb5f6b9
JH
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1644 char *scream_olds = NULL;
3dab1dad 1645 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1646 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1647 I32 multiline;
0df25f3d 1648
3b0527fe 1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1650
1651 GET_RE_DEBUG_FLAGS_DECL;
1652
9d4ba2ae 1653 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1654
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1658 return 0;
1659 }
1660
2757e526 1661 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1662 reginfo.prog = prog;
2757e526 1663
bac06658 1664 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1665 DEBUG_EXECUTE_r(
1666 debug_start_match(prog, do_utf8, startpos, strend,
1667 "Matching");
1668 );
bac06658 1669
6eb5f6b9 1670 minlen = prog->minlen;
1de06328
YO
1671
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1674 "String too short [regexec_flags]...\n"));
1675 goto phooey;
1aa99e6b 1676 }
6eb5f6b9 1677
1de06328 1678
6eb5f6b9
JH
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1682 }
1683
1684 PL_reg_flags = 0;
1685 PL_reg_eval_set = 0;
1686 PL_reg_maxiter = 0;
1687
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1690
1691 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1692 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1693 PL_bostr = strbeg;
3b0527fe 1694 reginfo.sv = sv;
6eb5f6b9
JH
1695
1696 /* Mark end of line for $ (and such) */
1697 PL_regeol = strend;
1698
1699 /* see how far we have to get to not match where we matched before */
3b0527fe 1700 reginfo.till = startpos+minend;
6eb5f6b9 1701
6eb5f6b9
JH
1702 /* If there is a "must appear" string, look for it. */
1703 s = startpos;
1704
3b0527fe 1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1706 MAGIC *mg;
1707
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1709 reginfo.ganch = startpos;
6eb5f6b9
JH
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1711 && SvMAGIC(sv)
14befaf4
DM
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
3b0527fe 1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1715 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1716 if (s > reginfo.ganch)
6eb5f6b9 1717 goto phooey;
3b0527fe 1718 s = reginfo.ganch;
6eb5f6b9
JH
1719 }
1720 }
1721 else /* pos() not defined */
3b0527fe 1722 reginfo.ganch = strbeg;
6eb5f6b9
JH
1723 }
1724
a0714e2c 1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1726 re_scream_pos_data d;
1727
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1731 if (!s) {
a3621e74 1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1733 goto phooey; /* not present */
3fa9c3d7 1734 }
6eb5f6b9
JH
1735 }
1736
1de06328 1737
6eb5f6b9
JH
1738
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1742 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1743 goto got_it;
7fba1cd6 1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1746 {
1747 char *end;
1748
1749 if (minlen)
1750 dontbother = minlen - 1;
1aa99e6b 1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1752 /* for multiline we only have to try after newlines */
33b8afdf 1753 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1754 if (s == startpos)
1755 goto after_try;
1756 while (1) {
3b0527fe 1757 if (regtry(&reginfo, s))
6eb5f6b9
JH
1758 goto got_it;
1759 after_try:
1760 if (s >= end)
1761 goto phooey;
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1764 if (!s)
1765 goto phooey;
1766 }
1767 else
1768 s++;
1769 }
1770 } else {
1771 if (s > startpos)
1772 s--;
1773 while (s < end) {
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1775 if (regtry(&reginfo, s))
6eb5f6b9
JH
1776 goto got_it;
1777 }
1778 }
1779 }
1780 }
1781 goto phooey;
f9f4320a
YO
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1783 {
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
3b0527fe 1787 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1788 goto got_it;
1789 goto phooey;
1790 }
1791
1792 /* Messy cases: unanchored match. */
33b8afdf 1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1796 char ch;
bf93d4cc
GS
1797#ifdef DEBUGGING
1798 int did_match = 0;
1799#endif
33b8afdf
JH
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1803
1aa99e6b 1804 if (do_utf8) {
4cadc6a9 1805 REXEC_FBC_SCAN(
6eb5f6b9 1806 if (*s == ch) {
a3621e74 1807 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1808 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1809 s += UTF8SKIP(s);
1810 while (s < strend && *s == ch)
1811 s += UTF8SKIP(s);
1812 }
4cadc6a9 1813 );
6eb5f6b9
JH
1814 }
1815 else {
4cadc6a9 1816 REXEC_FBC_SCAN(
6eb5f6b9 1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1819 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1820 s++;
1821 while (s < strend && *s == ch)
1822 s++;
1823 }
4cadc6a9 1824 );
6eb5f6b9 1825 }
a3621e74 1826 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1827 PerlIO_printf(Perl_debug_log,
b7953727
JH
1828 "Did not find anchored character...\n")
1829 );
6eb5f6b9 1830 }
a0714e2c
SS
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1834 && prog->float_max_offset < strend - s)) {
1835 SV *must;
1836 I32 back_max;
1837 I32 back_min;
1838 char *last;
6eb5f6b9 1839 char *last1; /* Last position checked before */
bf93d4cc
GS
1840#ifdef DEBUGGING
1841 int did_match = 0;
1842#endif
33b8afdf
JH
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1848 } else {
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1854 }
1de06328
YO
1855
1856
33b8afdf
JH
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1859 goto phooey;
1860
1de06328
YO
1861 if (back_min<0) {
1862 last = strend;
1863 } else {
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1867 }
6eb5f6b9
JH
1868 if (s > PL_bostr)
1869 last1 = HOPc(s, -1);
1870 else
1871 last1 = s - 1; /* bogus */
1872
a0288114 1873 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1874 check_substr==must. */
1875 scream_pos = -1;
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
9041c2e3 1879 ((flags & REXEC_SCREAM)
1de06328 1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1881 end_shift, &scream_pos, 0))
1de06328 1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1883 (unsigned char*)strend, must,
7fba1cd6 1884 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1887 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1888 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1892 }
1893 else {
52657f30 1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1895
1896 last1 = HOPc(s, -back_min);
52657f30 1897 s = t;
6eb5f6b9 1898 }
1aa99e6b 1899 if (do_utf8) {
6eb5f6b9 1900 while (s <= last1) {
3b0527fe 1901 if (regtry(&reginfo, s))
6eb5f6b9
JH
1902 goto got_it;
1903 s += UTF8SKIP(s);
1904 }
1905 }
1906 else {
1907 while (s <= last1) {
3b0527fe 1908 if (regtry(&reginfo, s))
6eb5f6b9
JH
1909 goto got_it;
1910 s++;
1911 }
1912 }
1913 }
ab3bbdeb
YO
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1919 ? "anchored" : "floating"),
ab3bbdeb
YO
1920 quoted, RE_SV_TAIL(must));
1921 });
6eb5f6b9
JH
1922 goto phooey;
1923 }
786e8c11 1924 else if ( (c = prog->regstclass) ) {
f14c76ed 1925 if (minlen) {
be8e71aa 1926 const OPCODE op = OP(prog->regstclass);
66e933ab 1927 /* don't bother with what can't match */
786e8c11 1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1929 strend = HOPc(strend, -(minlen - 1));
1930 }
a3621e74 1931 DEBUG_EXECUTE_r({
be8e71aa 1932 SV * const prop = sv_newmortal();
32fc9b6a 1933 regprop(prog, prop, c);
0df25f3d 1934 {
ab3bbdeb
YO
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1936 s,strend-s,60);
0df25f3d 1937 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1938 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1939 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1940 quoted, (int)(strend - s));
0df25f3d 1941 }
ffc61ed2 1942 });
3b0527fe 1943 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1944 goto got_it;
07be1b83 1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1946 }
1947 else {
1948 dontbother = 0;
a0714e2c 1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1950 /* Trim the end. */
d6a28714 1951 char *last;
33b8afdf
JH
1952 SV* float_real;
1953
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1957
1958 if (flags & REXEC_SCREAM) {
33b8afdf 1959 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1960 end_shift, &scream_pos, 1); /* last one */
1961 if (!last)
ffc61ed2 1962 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
3f7c398e 1965 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1966 }
d6a28714
JH
1967 else {
1968 STRLEN len;
cfd0369c 1969 const char * const little = SvPV_const(float_real, len);
d6a28714 1970
33b8afdf 1971 if (SvTAIL(float_real)) {
d6a28714
JH
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
7fba1cd6 1974 else if (!multiline)
9041c2e3 1975 last = memEQ(strend - len, little, len)
bd61b366 1976 ? strend - len : NULL;
b8c5462f 1977 else
d6a28714
JH
1978 goto find_last;
1979 } else {
1980 find_last:
9041c2e3 1981 if (len)
d6a28714 1982 last = rninstr(s, strend, little, little + len);
b8c5462f 1983 else
a0288114 1984 last = strend; /* matching "$" */
b8c5462f 1985 }
b8c5462f 1986 }
bf93d4cc 1987 if (last == NULL) {
6bda09f9
YO
1988 DEBUG_EXECUTE_r(
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1992 goto phooey; /* Should not happen! */
1993 }
d6a28714
JH
1994 dontbother = strend - last + prog->float_min_offset;
1995 }
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
1aa99e6b 2000 if (do_utf8) {
d6a28714 2001 for (;;) {
3b0527fe 2002 if (regtry(&reginfo, s))
d6a28714
JH
2003 goto got_it;
2004 if (s >= strend)
2005 break;
b8c5462f 2006 s += UTF8SKIP(s);
d6a28714
JH
2007 };
2008 }
2009 else {
2010 do {
3b0527fe 2011 if (regtry(&reginfo, s))
d6a28714
JH
2012 goto got_it;
2013 } while (s++ < strend);
2014 }
2015 }
2016
2017 /* Failure. */
2018 goto phooey;
2019
2020got_it:
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2022
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2028 the same. */
4f639d21 2029 restore_pos(aTHX_ prog);
d6a28714
JH
2030 }
2031
2032 /* make sure $`, $&, $', and $digit will work later */
2033 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2034 RX_MATCH_COPY_FREE(prog);
d6a28714 2035 if (flags & REXEC_COPY_STR) {
be8e71aa 2036 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2037#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2038 if ((SvIsCOW(sv)
2039 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2040 if (DEBUG_C_TEST) {
2041 PerlIO_printf(Perl_debug_log,
2042 "Copy on write: regexp capture, type %d\n",
2043 (int) SvTYPE(sv));
2044 }
2045 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2046 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2047 assert (SvPOKp(prog->saved_copy));
2048 } else
2049#endif
2050 {
2051 RX_MATCH_COPIED_on(prog);
2052 s = savepvn(strbeg, i);
2053 prog->subbeg = s;
2054 }
d6a28714 2055 prog->sublen = i;
d6a28714
JH
2056 }
2057 else {
2058 prog->subbeg = strbeg;
2059 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2060 }
2061 }
9041c2e3 2062
d6a28714
JH
2063 return 1;
2064
2065phooey:
a3621e74 2066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2067 PL_colors[4], PL_colors[5]));
d6a28714 2068 if (PL_reg_eval_set)
4f639d21 2069 restore_pos(aTHX_ prog);
d6a28714
JH
2070 return 0;
2071}
2072
6bda09f9 2073
d6a28714
JH
2074/*
2075 - regtry - try match at specific point
2076 */
2077STATIC I32 /* 0 failure, 1 success */
3b0527fe 2078S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2079{
97aff369 2080 dVAR;
d6a28714
JH
2081 register I32 *sp;
2082 register I32 *ep;
2083 CHECKPOINT lastcp;
3b0527fe 2084 regexp *prog = reginfo->prog;
a3621e74 2085 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2086
2087 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2088 MAGIC *mg;
2089
2090 PL_reg_eval_set = RS_init;
a3621e74 2091 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2092 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2093 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2094 ));
e8347627 2095 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2096 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2097 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2098 SAVETMPS;
2099 /* Apparently this is not needed, judging by wantarray. */
e8347627 2100 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2101 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2102
3b0527fe 2103 if (reginfo->sv) {
d6a28714 2104 /* Make $_ available to executed code. */
3b0527fe 2105 if (reginfo->sv != DEFSV) {
59f00321 2106 SAVE_DEFSV;
3b0527fe 2107 DEFSV = reginfo->sv;
b8c5462f 2108 }
d6a28714 2109
3b0527fe
DM
2110 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2111 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2112 /* prepare for quick setting of pos */
d300d9fa
NC
2113#ifdef PERL_OLD_COPY_ON_WRITE
2114 if (SvIsCOW(sv))
2115 sv_force_normal_flags(sv, 0);
2116#endif
3dab1dad 2117 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2118 &PL_vtbl_mglob, NULL, 0);
d6a28714 2119 mg->mg_len = -1;
b8c5462f 2120 }
d6a28714
JH
2121 PL_reg_magic = mg;
2122 PL_reg_oldpos = mg->mg_len;
4f639d21 2123 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2124 }
09687e5a 2125 if (!PL_reg_curpm) {
a02a5408 2126 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2127#ifdef USE_ITHREADS
2128 {
be8e71aa 2129 SV* const repointer = newSViv(0);
577e12cc 2130 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2131 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2132 av_push(PL_regex_padav,repointer);
2133 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2134 PL_regex_pad = AvARRAY(PL_regex_padav);
2135 }
2136#endif
2137 }
aaa362c4 2138 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2139 PL_reg_oldcurpm = PL_curpm;
2140 PL_curpm = PL_reg_curpm;
2141 if (RX_MATCH_COPIED(prog)) {
2142 /* Here is a serious problem: we cannot rewrite subbeg,
2143 since it may be needed if this match fails. Thus
2144 $` inside (?{}) could fail... */
2145 PL_reg_oldsaved = prog->subbeg;
2146 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2147#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2148 PL_nrs = prog->saved_copy;
2149#endif
d6a28714
JH
2150 RX_MATCH_COPIED_off(prog);
2151 }
2152 else
bd61b366 2153 PL_reg_oldsaved = NULL;
d6a28714
JH
2154 prog->subbeg = PL_bostr;
2155 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2156 }
6bda09f9 2157 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
973dddac 2158 prog->startp[0] = startpos - PL_bostr;
d6a28714 2159 PL_reginput = startpos;
d6a28714 2160 PL_reglastparen = &prog->lastparen;
a01268b5 2161 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2162 prog->lastparen = 0;
03994de8 2163 prog->lastcloseparen = 0;
d6a28714 2164 PL_regsize = 0;
6bda09f9
YO
2165 PL_regstartp = prog->startp;
2166 PL_regendp = prog->endp;
d6a28714
JH
2167 if (PL_reg_start_tmpl <= prog->nparens) {
2168 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2169 if(PL_reg_start_tmp)
2170 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2171 else
a02a5408 2172 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2173 }
2174
2175 /* XXXX What this code is doing here?!!! There should be no need
2176 to do this again and again, PL_reglastparen should take care of
3dd2943c 2177 this! --ilya*/
dafc8851
JH
2178
2179 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2180 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2181 * PL_reglastparen), is not needed at all by the test suite
2182 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2183 * enough, for building DynaLoader, or otherwise this
2184 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2185 * will happen. Meanwhile, this code *is* needed for the
2186 * above-mentioned test suite tests to succeed. The common theme
2187 * on those tests seems to be returning null fields from matches.
2188 * --jhi */
dafc8851 2189#if 1
d6a28714
JH
2190 sp = prog->startp;
2191 ep = prog->endp;
2192 if (prog->nparens) {
097eb12c 2193 register I32 i;
eb160463 2194 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2195 *++sp = -1;
2196 *++ep = -1;
2197 }
2198 }
dafc8851 2199#endif
02db2b7b 2200 REGCP_SET(lastcp);
3b0527fe 2201 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2202 prog->endp[0] = PL_reginput - PL_bostr;
2203 return 1;
2204 }
02db2b7b 2205 REGCP_UNWIND(lastcp);
d6a28714
JH
2206 return 0;
2207}
2208
02db2b7b 2209
8ba1375e
MJD
2210#define sayYES goto yes
2211#define sayNO goto no
262b90c4 2212#define sayNO_SILENT goto no_silent
8ba1375e 2213
f9f4320a
YO
2214/* we dont use STMT_START/END here because it leads to
2215 "unreachable code" warnings, which are bogus, but distracting. */
2216#define CACHEsayNO \
c476f425
DM
2217 if (ST.cache_mask) \
2218 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2219 sayNO
3298f257 2220
a3621e74 2221/* this is used to determine how far from the left messages like
265c4333
YO
2222 'failed...' are printed. It should be set such that messages
2223 are inline with the regop output that created them.
a3621e74 2224*/
265c4333 2225#define REPORT_CODE_OFF 32
a3621e74
YO
2226
2227
2228/* Make sure there is a test for this +1 options in re_tests */
2229#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2230
40a82448
DM
2231#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2232#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2233
86545054
DM
2234#define SLAB_FIRST(s) (&(s)->states[0])
2235#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2236
5d9a96ca
DM
2237/* grab a new slab and return the first slot in it */
2238
2239STATIC regmatch_state *
2240S_push_slab(pTHX)
2241{
54df2634
NC
2242#if PERL_VERSION < 9
2243 dMY_CXT;
2244#endif
5d9a96ca
DM
2245 regmatch_slab *s = PL_regmatch_slab->next;
2246 if (!s) {
2247 Newx(s, 1, regmatch_slab);
2248 s->prev = PL_regmatch_slab;
2249 s->next = NULL;
2250 PL_regmatch_slab->next = s;
2251 }
2252 PL_regmatch_slab = s;
86545054 2253 return SLAB_FIRST(s);
5d9a96ca 2254}
5b47454d 2255
95b24440 2256
40a82448
DM
2257/* push a new state then goto it */
2258
2259#define PUSH_STATE_GOTO(state, node) \
2260 scan = node; \
2261 st->resume_state = state; \
2262 goto push_state;
2263
2264/* push a new state with success backtracking, then goto it */
2265
2266#define PUSH_YES_STATE_GOTO(state, node) \
2267 scan = node; \
2268 st->resume_state = state; \
2269 goto push_yes_state;
2270
aa283a38 2271
aa283a38 2272
d6a28714
JH
2273/*
2274 - regmatch - main matching routine
2275 *
2276 * Conceptually the strategy is simple: check to see whether the current
2277 * node matches, call self recursively to see whether the rest matches,
2278 * and then act accordingly. In practice we make some effort to avoid
2279 * recursion, in particular by going through "ordinary" nodes (that don't
2280 * need to know whether the rest of the match failed) by a loop instead of
2281 * by recursion.
2282 */
2283/* [lwall] I've hoisted the register declarations to the outer block in order to
2284 * maybe save a little bit of pushing and popping on the stack. It also takes
2285 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2286 *
2287 * This function used to be heavily recursive, but since this had the
2288 * effect of blowing the CPU stack on complex regexes, it has been
2289 * restructured to be iterative, and to save state onto the heap rather
2290 * than the stack. Essentially whereever regmatch() used to be called, it
2291 * pushes the current state, notes where to return, then jumps back into
2292 * the main loop.
2293 *
2294 * Originally the structure of this function used to look something like
2295
2296 S_regmatch() {
2297 int a = 1, b = 2;
2298 ...
2299 while (scan != NULL) {
5d9a96ca 2300 a++; // do stuff with a and b
95b24440
DM
2301 ...
2302 switch (OP(scan)) {
2303 case FOO: {
2304 int local = 3;
2305 ...
2306 if (regmatch(...)) // recurse
2307 goto yes;
2308 }
2309 ...
2310 }
2311 }
2312 yes:
2313 return 1;
2314 }
2315
2316 * Now it looks something like this:
2317
5d9a96ca 2318 typedef struct {
95b24440
DM
2319 int a, b, local;
2320 int resume_state;
5d9a96ca 2321 } regmatch_state;
95b24440
DM
2322
2323 S_regmatch() {
5d9a96ca
DM
2324 regmatch_state *st = new();
2325 int depth=0;
2326 st->a++; // do stuff with a and b
95b24440
DM
2327 ...
2328 while (scan != NULL) {
2329 ...
2330 switch (OP(scan)) {
2331 case FOO: {
5d9a96ca 2332 st->local = 3;
95b24440 2333 ...
5d9a96ca
DM
2334 st->scan = scan;
2335 scan = ...;
2336 st->resume_state = resume_FOO;
2337 goto start_recurse; // recurse
95b24440 2338
5d9a96ca
DM
2339 resume_point_FOO:
2340 if (result)
95b24440
DM
2341 goto yes;
2342 }
2343 ...
2344 }
5d9a96ca
DM
2345 start_recurse:
2346 st = new(); push a new state
2347 st->a = 1; st->b = 2;
2348 depth++;
95b24440 2349 }
5d9a96ca 2350 yes:
95b24440 2351 result = 1;
5d9a96ca
DM
2352 if (depth--) {
2353 st = pop();
95b24440
DM
2354 switch (resume_state) {
2355 case resume_FOO:
2356 goto resume_point_FOO;
2357 ...
2358 }
2359 }
2360 return result
2361 }
2362
2363 * WARNING: this means that any line in this function that contains a
2364 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2365 * regmatch() using gotos instead. Thus the values of any local variables
2366 * not saved in the regmatch_state structure will have been lost when
2367 * execution resumes on the next line .
5d9a96ca
DM
2368 *
2369 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2370 * PL_regmatch_state always points to the currently active state, and
2371 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2372 * The first time regmatch is called, the first slab is allocated, and is
2373 * never freed until interpreter desctruction. When the slab is full,
2374 * a new one is allocated chained to the end. At exit from regmatch, slabs
2375 * allocated since entry are freed.
d6a28714 2376 */
95b24440 2377
40a82448 2378
5bc10b2c 2379#define DEBUG_STATE_pp(pp) \
265c4333 2380 DEBUG_STATE_r({ \
5bc10b2c
DM
2381 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2382 PerlIO_printf(Perl_debug_log, \
2383 " %*s"pp" %s\n", \
2384 depth*2, "", \
03363afd 2385 reg_name[st->resume_state] ); \
265c4333 2386 });
5bc10b2c 2387
40a82448 2388
3dab1dad 2389#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2390
3df15adc 2391#ifdef DEBUGGING
5bc10b2c 2392
ab3bbdeb
YO
2393STATIC void
2394S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2395 const char *start, const char *end, const char *blurb)
2396{
2397 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2398 if (!PL_colorset)
2399 reginitcolors();
2400 {
2401 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2402 prog->precomp, prog->prelen, 60);
2403
2404 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2405 start, end - start, 60);
2406
2407 PerlIO_printf(Perl_debug_log,
2408 "%s%s REx%s %s against %s\n",
2409 PL_colors[4], blurb, PL_colors[5], s0, s1);
2410
2411 if (do_utf8||utf8_pat)
1de06328
YO
2412 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2413 utf8_pat ? "pattern" : "",
2414 utf8_pat && do_utf8 ? " and " : "",
2415 do_utf8 ? "string" : ""
ab3bbdeb
YO
2416 );
2417 }
2418}
3df15adc
YO
2419
2420STATIC void
786e8c11
YO
2421S_dump_exec_pos(pTHX_ const char *locinput,
2422 const regnode *scan,
2423 const char *loc_regeol,
2424 const char *loc_bostr,
2425 const char *loc_reg_starttry,
2426 const bool do_utf8)
07be1b83 2427{
786e8c11 2428 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2429 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2430 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2431 /* The part of the string before starttry has one color
2432 (pref0_len chars), between starttry and current
2433 position another one (pref_len - pref0_len chars),
2434 after the current position the third one.
2435 We assume that pref0_len <= pref_len, otherwise we
2436 decrease pref0_len. */
786e8c11
YO
2437 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2438 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2439 int pref0_len;
2440
2441 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2442 pref_len++;
786e8c11
YO
2443 pref0_len = pref_len - (locinput - loc_reg_starttry);
2444 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2445 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2446 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2448 l--;
2449 if (pref0_len < 0)
2450 pref0_len = 0;
2451 if (pref0_len > pref_len)
2452 pref0_len = pref_len;
2453 {
3df15adc 2454 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2455
ab3bbdeb 2456 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2457 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2458
ab3bbdeb 2459 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2460 (locinput - pref_len + pref0_len),
1de06328 2461 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2462
ab3bbdeb 2463 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2464 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2465
1de06328 2466 const STRLEN tlen=len0+len1+len2;
3df15adc 2467 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2468 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2469 (IV)(locinput - loc_bostr),
07be1b83 2470 len0, s0,
07be1b83 2471 len1, s1,
07be1b83 2472 (docolor ? "" : "> <"),
07be1b83 2473 len2, s2,
f9f4320a 2474 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2475 "");
2476 }
2477}
3df15adc 2478
07be1b83
YO
2479#endif
2480
d6a28714 2481STATIC I32 /* 0 failure, 1 success */
3b0527fe 2482S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2483{
54df2634
NC
2484#if PERL_VERSION < 9
2485 dMY_CXT;
2486#endif
27da23d5 2487 dVAR;
95b24440 2488 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2489 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2490
3b0527fe
DM
2491 regexp *rex = reginfo->prog;
2492
5d9a96ca
DM
2493 regmatch_slab *orig_slab;
2494 regmatch_state *orig_state;
a3621e74 2495
5d9a96ca
DM
2496 /* the current state. This is a cached copy of PL_regmatch_state */
2497 register regmatch_state *st;
95b24440 2498
5d9a96ca
DM
2499 /* cache heavy used fields of st in registers */
2500 register regnode *scan;
2501 register regnode *next;
2502 register I32 n = 0; /* initialize to shut up compiler warning */
2503 register char *locinput = PL_reginput;
95b24440 2504
5d9a96ca
DM
2505 /* these variables are NOT saved during a recusive RFEGMATCH: */
2506 register I32 nextchr; /* is always set to UCHARAT(locinput) */
b69b0499 2507 bool result = 0; /* return value of S_regmatch */
5d9a96ca 2508 int depth = 0; /* depth of recursion */
6bda09f9 2509 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
77cb431f
DM
2510 regmatch_state *yes_state = NULL; /* state to pop to on success of
2511 subpattern */
faec1544 2512 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2513 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2514 U32 state_num;
10edeb5d 2515
95b24440 2516#ifdef DEBUGGING
e68ec53f 2517 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2518#endif
2519
5d9a96ca
DM
2520 /* on first ever call to regmatch, allocate first slab */
2521 if (!PL_regmatch_slab) {
2522 Newx(PL_regmatch_slab, 1, regmatch_slab);
2523 PL_regmatch_slab->prev = NULL;
2524 PL_regmatch_slab->next = NULL;
86545054 2525 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2526 }
2527
2528 /* remember current high-water mark for exit */
2529 /* XXX this should be done with SAVE* instead */
2530 orig_slab = PL_regmatch_slab;
2531 orig_state = PL_regmatch_state;
2532
2533 /* grab next free state slot */
2534 st = ++PL_regmatch_state;
86545054 2535 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2536 st = PL_regmatch_state = S_push_slab(aTHX);
2537
2538 st->minmod = 0;
2539 st->sw = 0;
2540 st->logical = 0;
b8591aee 2541 cur_curlyx = NULL;
786e8c11 2542
d6a28714
JH
2543 /* Note that nextchr is a byte even in UTF */
2544 nextchr = UCHARAT(locinput);
2545 scan = prog;
2546 while (scan != NULL) {
8ba1375e 2547
a3621e74 2548 DEBUG_EXECUTE_r( {
6136c704 2549 SV * const prop = sv_newmortal();
1de06328 2550 regnode *rnext=regnext(scan);
786e8c11 2551 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2552 regprop(rex, prop, scan);
07be1b83
YO
2553
2554 PerlIO_printf(Perl_debug_log,
2555 "%3"IVdf":%*s%s(%"IVdf")\n",
5bc10b2c 2556 (IV)(scan - rex->program), depth*2, "",
07be1b83 2557 SvPVX_const(prop),
1de06328
YO
2558 (PL_regkind[OP(scan)] == END || !rnext) ?
2559 0 : (IV)(rnext - rex->program));
2a782b5b 2560 });
d6a28714
JH
2561
2562 next = scan + NEXT_OFF(scan);
2563 if (next == scan)
2564 next = NULL;
40a82448 2565 state_num = OP(scan);
d6a28714 2566
40a82448
DM
2567 reenter_switch:
2568 switch (state_num) {
d6a28714 2569 case BOL:
7fba1cd6 2570 if (locinput == PL_bostr)
d6a28714 2571 {
3b0527fe 2572 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2573 break;
2574 }
d6a28714
JH
2575 sayNO;
2576 case MBOL:
12d33761
HS
2577 if (locinput == PL_bostr ||
2578 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2579 {
b8c5462f
JH
2580 break;
2581 }
d6a28714
JH
2582 sayNO;
2583 case SBOL:
c2a73568 2584 if (locinput == PL_bostr)
b8c5462f 2585 break;
d6a28714
JH
2586 sayNO;
2587 case GPOS:
3b0527fe 2588 if (locinput == reginfo->ganch)
d6a28714
JH
2589 break;
2590 sayNO;
2591 case EOL:
d6a28714
JH
2592 goto seol;
2593 case MEOL:
d6a28714 2594 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2595 sayNO;
b8c5462f 2596 break;
d6a28714
JH
2597 case SEOL:
2598 seol:
2599 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2600 sayNO;
d6a28714 2601 if (PL_regeol - locinput > 1)
b8c5462f 2602 sayNO;
b8c5462f 2603 break;
d6a28714
JH
2604 case EOS:
2605 if (PL_regeol != locinput)
b8c5462f 2606 sayNO;
d6a28714 2607 break;
ffc61ed2 2608 case SANY:
d6a28714 2609 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2610 sayNO;
f33976b4
DB
2611 if (do_utf8) {
2612 locinput += PL_utf8skip[nextchr];
2613 if (locinput > PL_regeol)
2614 sayNO;
2615 nextchr = UCHARAT(locinput);
2616 }
2617 else
2618 nextchr = UCHARAT(++locinput);
2619 break;
2620 case CANY:
2621 if (!nextchr && locinput >= PL_regeol)
2622 sayNO;
b8c5462f 2623 nextchr = UCHARAT(++locinput);
a0d0e21e 2624 break;
ffc61ed2 2625 case REG_ANY:
1aa99e6b
IH
2626 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2627 sayNO;
2628 if (do_utf8) {
b8c5462f 2629 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2630 if (locinput > PL_regeol)
2631 sayNO;
a0ed51b3 2632 nextchr = UCHARAT(locinput);
a0ed51b3 2633 }
1aa99e6b
IH
2634 else
2635 nextchr = UCHARAT(++locinput);
a0ed51b3 2636 break;
166ba7cd
DM
2637
2638#undef ST
2639#define ST st->u.trie
786e8c11
YO
2640 case TRIEC:
2641 /* In this case the charclass data is available inline so
2642 we can fail fast without a lot of extra overhead.
2643 */
2644 if (scan->flags == EXACT || !do_utf8) {
2645 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2646 DEBUG_EXECUTE_r(
2647 PerlIO_printf(Perl_debug_log,
2648 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2649 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2650 );
2651 sayNO_SILENT;
2652 /* NOTREACHED */
2653 }
2654 }
2655 /* FALL THROUGH */
5b47454d 2656 case TRIE:
3dab1dad 2657 {
07be1b83 2658 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2659 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2660 trie_type = do_utf8 ?
2661 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2662 : trie_plain;
2663
2664 /* what trie are we using right now */
be8e71aa 2665 reg_trie_data * const trie
3dab1dad
YO
2666 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2667 U32 state = trie->startstate;
166ba7cd 2668
3dab1dad
YO
2669 if (trie->bitmap && trie_type != trie_utf8_fold &&
2670 !TRIE_BITMAP_TEST(trie,*locinput)
2671 ) {
2672 if (trie->states[ state ].wordnum) {
2673 DEBUG_EXECUTE_r(
2674 PerlIO_printf(Perl_debug_log,
2675 "%*s %smatched empty string...%s\n",
5bc10b2c 2676 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2677 );
2678 break;
2679 } else {
2680 DEBUG_EXECUTE_r(
2681 PerlIO_printf(Perl_debug_log,
786e8c11 2682 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2683 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2684 );
2685 sayNO_SILENT;
2686 }
2687 }
166ba7cd 2688
786e8c11
YO
2689 {
2690 U8 *uc = ( U8* )locinput;
2691
2692 STRLEN len = 0;
2693 STRLEN foldlen = 0;
2694 U8 *uscan = (U8*)NULL;
2695 STRLEN bufflen=0;
2696 SV *sv_accept_buff = NULL;
2697 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2698
2699 ST.accepted = 0; /* how many accepting states we have seen */
2700 ST.B = next;
2701 ST.jump = trie->jump;
2702
2703#ifdef DEBUGGING
2704 ST.me = scan;
2705#endif
2706
2707
2708
07be1b83
YO
2709 /*
2710 traverse the TRIE keeping track of all accepting states
2711 we transition through until we get to a failing node.
2712 */
2713
a3621e74 2714 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2715 U32 base = trie->states[ state ].trans.base;
f9f4320a 2716 UV uvc = 0;
786e8c11
YO
2717 U16 charid;
2718 /* We use charid to hold the wordnum as we don't use it
2719 for charid until after we have done the wordnum logic.
2720 We define an alias just so that the wordnum logic reads
2721 more naturally. */
2722
2723#define got_wordnum charid
2724 got_wordnum = trie->states[ state ].wordnum;
2725
2726 if ( got_wordnum ) {
2727 if ( ! ST.accepted ) {
5b47454d
DM
2728 ENTER;
2729 SAVETMPS;
2730 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2731 sv_accept_buff=newSV(bufflen *
2732 sizeof(reg_trie_accepted) - 1);
786e8c11 2733 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2734 SvPOK_on(sv_accept_buff);
2735 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2736 SAVETMPS;
2737 ST.accept_buff =
5b47454d
DM
2738 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2739 }
786e8c11 2740 do {
166ba7cd 2741 if (ST.accepted >= bufflen) {
5b47454d 2742 bufflen *= 2;
166ba7cd 2743 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2744 SvGROW(sv_accept_buff,
2745 bufflen * sizeof(reg_trie_accepted));
2746 }
2747 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2748 + sizeof(reg_trie_accepted));
a3621e74 2749
786e8c11
YO
2750
2751 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2752 ST.accept_buff[ST.accepted].endpos = uc;
2753 ++ST.accepted;
2754 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2755 }
2756#undef got_wordnum
a3621e74 2757
07be1b83 2758 DEBUG_TRIE_EXECUTE_r({
786e8c11 2759 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2760 PerlIO_printf( Perl_debug_log,
786e8c11 2761 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2762 2+depth * 2, "", PL_colors[4],
786e8c11 2763 (UV)state, (UV)ST.accepted );
07be1b83 2764 });
a3621e74
YO
2765
2766 if ( base ) {
4cadc6a9
YO
2767 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2768 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2769
5b47454d
DM
2770 if (charid &&
2771 (base + charid > trie->uniquecharcount )
2772 && (base + charid - 1 - trie->uniquecharcount
2773 < trie->lasttrans)
2774 && trie->trans[base + charid - 1 -
2775 trie->uniquecharcount].check == state)
2776 {
2777 state = trie->trans[base + charid - 1 -
2778 trie->uniquecharcount ].next;
2779 }
2780 else {
2781 state = 0;
2782 }
2783 uc += len;
2784
2785 }
2786 else {
a3621e74
YO
2787 state = 0;
2788 }
2789 DEBUG_TRIE_EXECUTE_r(
e4584336 2790 PerlIO_printf( Perl_debug_log,
786e8c11 2791 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2792 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2793 );
2794 }
166ba7cd 2795 if (!ST.accepted )
a3621e74 2796 sayNO;
a3621e74 2797
166ba7cd
DM
2798 DEBUG_EXECUTE_r(
2799 PerlIO_printf( Perl_debug_log,
2800 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2801 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2802 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2803 );
786e8c11 2804 }}
166ba7cd
DM
2805
2806 /* FALL THROUGH */
2807
2808 case TRIE_next_fail: /* we failed - try next alterative */
2809
2810 if ( ST.accepted == 1 ) {
2811 /* only one choice left - just continue */
2812 DEBUG_EXECUTE_r({
2813 reg_trie_data * const trie
2814 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2815 SV ** const tmp = RX_DEBUG(reginfo->prog)
2816 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2817 : NULL;
2818 PerlIO_printf( Perl_debug_log,
2819 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2820 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd
DM
2821 ST.accept_buff[ 0 ].wordnum,
2822 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2823 PL_colors[5] );
2824 });
2825 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2826 /* in this case we free tmps/leave before we call regmatch
2827 as we wont be using accept_buff again. */
2828 FREETMPS;
2829 LEAVE;
2830 locinput = PL_reginput;
2831 nextchr = UCHARAT(locinput);
786e8c11
YO
2832
2833 if ( !ST.jump )
2834 scan = ST.B;
2835 else
2836 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2837
166ba7cd
DM
2838 continue; /* execute rest of RE */
2839 }
2840
2841 if (!ST.accepted-- ) {
2842 FREETMPS;
2843 LEAVE;
2844 sayNO;
2845 }
2846
a3621e74 2847 /*
166ba7cd
DM
2848 There are at least two accepting states left. Presumably
2849 the number of accepting states is going to be low,
2850 typically two. So we simply scan through to find the one
2851 with lowest wordnum. Once we find it, we swap the last
2852 state into its place and decrement the size. We then try to
2853 match the rest of the pattern at the point where the word
2854 ends. If we succeed, control just continues along the
2855 regex; if we fail we return here to try the next accepting
2856 state
2857 */
a3621e74 2858
166ba7cd
DM
2859 {
2860 U32 best = 0;
2861 U32 cur;
2862 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2863 DEBUG_TRIE_EXECUTE_r(
f2278c82 2864 PerlIO_printf( Perl_debug_log,
166ba7cd 2865 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 2866 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
2867 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2868 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2869 );
2870
2871 if (ST.accept_buff[cur].wordnum <
2872 ST.accept_buff[best].wordnum)
2873 best = cur;
a3621e74 2874 }
166ba7cd
DM
2875
2876 DEBUG_EXECUTE_r({
2877 reg_trie_data * const trie
2878 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2879 SV ** const tmp = RX_DEBUG(reginfo->prog)
2880 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2881 : NULL;
265c4333
YO
2882 regnode *nextop=!ST.jump ?
2883 ST.B :
2884 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2885 PerlIO_printf( Perl_debug_log,
2886 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 2887 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2888 ST.accept_buff[best].wordnum,
265c4333
YO
2889 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2890 REG_NODE_NUM(nextop),
166ba7cd
DM
2891 PL_colors[5] );
2892 });
2893
2894 if ( best<ST.accepted ) {
2895 reg_trie_accepted tmp = ST.accept_buff[ best ];
2896 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2897 ST.accept_buff[ ST.accepted ] = tmp;
2898 best = ST.accepted;
a3621e74 2899 }
166ba7cd 2900 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2901 if ( !ST.jump ) {
2902 PUSH_STATE_GOTO(TRIE_next, ST.B);
2903 /* NOTREACHED */
2904 } else {
2905 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2906 /* NOTREACHED */
2907 }
2908 /* NOTREACHED */
166ba7cd 2909 }
166ba7cd
DM
2910 /* NOTREACHED */
2911
2912#undef ST
2913
95b24440
DM
2914 case EXACT: {
2915 char *s = STRING(scan);
5d9a96ca 2916 st->ln = STR_LEN(scan);
eb160463 2917 if (do_utf8 != UTF) {
bc517b45 2918 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2919 char *l = locinput;
be8e71aa 2920 const char * const e = s + st->ln;
a72c7584 2921
5ff6fc6d
JH
2922 if (do_utf8) {
2923 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2924 while (s < e) {
a3b680e6 2925 STRLEN ulen;
1aa99e6b 2926 if (l >= PL_regeol)
5ff6fc6d
JH
2927 sayNO;
2928 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2929 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2930 uniflags))
5ff6fc6d 2931 sayNO;
bc517b45 2932 l += ulen;
5ff6fc6d 2933 s ++;
1aa99e6b 2934 }
5ff6fc6d
JH
2935 }
2936 else {
2937 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2938 while (s < e) {
a3b680e6 2939 STRLEN ulen;
1aa99e6b
IH
2940 if (l >= PL_regeol)
2941 sayNO;
5ff6fc6d 2942 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2943 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2944 uniflags))
1aa99e6b 2945 sayNO;
bc517b45 2946 s += ulen;
a72c7584 2947 l ++;
1aa99e6b 2948 }
5ff6fc6d 2949 }
1aa99e6b
IH
2950 locinput = l;
2951 nextchr = UCHARAT(locinput);
2952 break;
2953 }
bc517b45 2954 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2955 /* Inline the first character, for speed. */
2956 if (UCHARAT(s) != nextchr)
2957 sayNO;
5d9a96ca 2958 if (PL_regeol - locinput < st->ln)
d6a28714 2959 sayNO;
5d9a96ca 2960 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2961 sayNO;
5d9a96ca 2962 locinput += st->ln;
d6a28714
JH
2963 nextchr = UCHARAT(locinput);
2964 break;
95b24440 2965 }
d6a28714 2966 case EXACTFL:
b8c5462f
JH
2967 PL_reg_flags |= RF_tainted;
2968 /* FALL THROUGH */
95b24440 2969 case EXACTF: {
be8e71aa 2970 char * const s = STRING(scan);
5d9a96ca 2971 st->ln = STR_LEN(scan);
d6a28714 2972
d07ddd77
JH
2973 if (do_utf8 || UTF) {
2974 /* Either target or the pattern are utf8. */
be8e71aa 2975 const char * const l = locinput;
d07ddd77 2976 char *e = PL_regeol;
bc517b45 2977
5d9a96ca 2978 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2979 l, &e, 0, do_utf8)) {
5486206c
JH
2980 /* One more case for the sharp s:
2981 * pack("U0U*", 0xDF) =~ /ss/i,
2982 * the 0xC3 0x9F are the UTF-8
2983 * byte sequence for the U+00DF. */
2984 if (!(do_utf8 &&
2985 toLOWER(s[0]) == 's' &&
5d9a96ca 2986 st->ln >= 2 &&
5486206c
JH
2987 toLOWER(s[1]) == 's' &&
2988 (U8)l[0] == 0xC3 &&
2989 e - l >= 2 &&
2990 (U8)l[1] == 0x9F))
2991 sayNO;
2992 }
d07ddd77
JH
2993 locinput = e;
2994 nextchr = UCHARAT(locinput);
2995 break;
a0ed51b3 2996 }
d6a28714 2997
bc517b45
JH
2998 /* Neither the target and the pattern are utf8. */
2999
d6a28714
JH
3000 /* Inline the first character, for speed. */
3001 if (UCHARAT(s) != nextchr &&
3002 UCHARAT(s) != ((OP(scan) == EXACTF)
3003 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3004 sayNO;
5d9a96ca 3005 if (PL_regeol - locinput < st->ln)
b8c5462f 3006 sayNO;
5d9a96ca
DM
3007 if (st->ln > 1 && (OP(scan) == EXACTF
3008 ? ibcmp(s, locinput, st->ln)
3009 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3010 sayNO;
5d9a96ca 3011 locinput += st->ln;
d6a28714 3012 nextchr = UCHARAT(locinput);
a0d0e21e 3013 break;
95b24440 3014 }
d6a28714 3015 case ANYOF:
ffc61ed2 3016 if (do_utf8) {
9e55ce06
JH
3017 STRLEN inclasslen = PL_regeol - locinput;
3018
32fc9b6a 3019 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3020 goto anyof_fail;
ffc61ed2
JH
3021 if (locinput >= PL_regeol)
3022 sayNO;
0f0076b4 3023 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3024 nextchr = UCHARAT(locinput);
e0f9d4a8 3025 break;
ffc61ed2
JH
3026 }
3027 else {
3028 if (nextchr < 0)
3029 nextchr = UCHARAT(locinput);
32fc9b6a 3030 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3031 goto anyof_fail;
ffc61ed2
JH
3032 if (!nextchr && locinput >= PL_regeol)
3033 sayNO;
3034 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3035 break;
3036 }
262b90c4 3037 anyof_fail:
e0f9d4a8
JH
3038 /* If we might have the case of the German sharp s
3039 * in a casefolding Unicode character class. */
3040
ebc501f0
JH
3041 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3042 locinput += SHARP_S_SKIP;
e0f9d4a8 3043 nextchr = UCHARAT(locinput);
ffc61ed2 3044 }
e0f9d4a8
JH
3045 else
3046 sayNO;
b8c5462f 3047 break;
d6a28714 3048 case ALNUML:
b8c5462f
JH
3049 PL_reg_flags |= RF_tainted;
3050 /* FALL THROUGH */
d6a28714 3051 case ALNUM:
b8c5462f 3052 if (!nextchr)
4633a7c4 3053 sayNO;
ffc61ed2 3054 if (do_utf8) {
1a4fad37 3055 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3056 if (!(OP(scan) == ALNUM
bb7a0f54 3057 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3058 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3059 {
3060 sayNO;
a0ed51b3 3061 }
b8c5462f 3062 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3063 nextchr = UCHARAT(locinput);
3064 break;
3065 }
ffc61ed2 3066 if (!(OP(scan) == ALNUM
d6a28714 3067 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3068 sayNO;
b8c5462f 3069 nextchr = UCHARAT(++locinput);
a0d0e21e 3070 break;
d6a28714 3071 case NALNUML:
b8c5462f
JH
3072 PL_reg_flags |= RF_tainted;
3073 /* FALL THROUGH */
d6a28714
JH
3074 case NALNUM:
3075 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3076 sayNO;
ffc61ed2 3077 if (do_utf8) {
1a4fad37 3078 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3079 if (OP(scan) == NALNUM
bb7a0f54 3080 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3081 : isALNUM_LC_utf8((U8*)locinput))
3082 {
b8c5462f 3083 sayNO;
d6a28714 3084 }
b8c5462f
JH
3085 locinput += PL_utf8skip[nextchr];
3086 nextchr = UCHARAT(locinput);
3087 break;
3088 }
ffc61ed2 3089 if (OP(scan) == NALNUM
d6a28714 3090 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3091 sayNO;
76e3520e 3092 nextchr = UCHARAT(++locinput);
a0d0e21e 3093 break;
d6a28714
JH
3094 case BOUNDL:
3095 case NBOUNDL:
3280af22 3096 PL_reg_flags |= RF_tainted;
bbce6d69 3097 /* FALL THROUGH */
d6a28714
JH
3098 case BOUND:
3099 case NBOUND:
3100 /* was last char in word? */
ffc61ed2 3101 if (do_utf8) {
12d33761 3102 if (locinput == PL_bostr)
5d9a96ca 3103 st->ln = '\n';
ffc61ed2 3104 else {
a3b680e6 3105 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3106
4ad0818d 3107 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3108 }
3109 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3110 st->ln = isALNUM_uni(st->ln);
1a4fad37 3111 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3112 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3113 }
3114 else {
5d9a96ca 3115 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3116 n = isALNUM_LC_utf8((U8*)locinput);
3117 }
a0ed51b3 3118 }
d6a28714 3119 else {
5d9a96ca 3120 st->ln = (locinput != PL_bostr) ?
12d33761 3121 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3122 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3123 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3124 n = isALNUM(nextchr);
3125 }
3126 else {
5d9a96ca 3127 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3128 n = isALNUM_LC(nextchr);
3129 }
d6a28714 3130 }
5d9a96ca 3131 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3132 OP(scan) == BOUNDL))
3133 sayNO;
a0ed51b3 3134 break;
d6a28714 3135 case SPACEL:
3280af22 3136 PL_reg_flags |= RF_tainted;
bbce6d69 3137 /* FALL THROUGH */
d6a28714 3138 case SPACE:
9442cb0e 3139 if (!nextchr)
4633a7c4 3140 sayNO;
1aa99e6b 3141 if (do_utf8) {
fd400ab9 3142 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3143 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3144 if (!(OP(scan) == SPACE
bb7a0f54 3145 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3146 : isSPACE_LC_utf8((U8*)locinput)))
3147 {
3148 sayNO;
3149 }
3150 locinput += PL_utf8skip[nextchr];
3151 nextchr = UCHARAT(locinput);
3152 break;
d6a28714 3153 }
ffc61ed2
JH
3154 if (!(OP(scan) == SPACE
3155 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3156 sayNO;
3157 nextchr = UCHARAT(++locinput);
3158 }
3159 else {
3160 if (!(OP(scan) == SPACE
3161 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3162 sayNO;
3163 nextchr = UCHARAT(++locinput);
a0ed51b3 3164 }
a0ed51b3 3165 break;
d6a28714 3166 case NSPACEL:
3280af22 3167 PL_reg_flags |= RF_tainted;
bbce6d69 3168 /* FALL THROUGH */
d6a28714 3169 case NSPACE:
9442cb0e 3170 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3171 sayNO;
1aa99e6b 3172 if (do_utf8) {
1a4fad37 3173 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3174 if (OP(scan) == NSPACE
bb7a0f54 3175 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3176 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3177 {
3178 sayNO;
3179 }
3180 locinput += PL_utf8skip[nextchr];
3181 nextchr = UCHARAT(locinput);
3182 break;
a0ed51b3 3183 }
ffc61ed2 3184 if (OP(scan) == NSPACE
d6a28714 3185 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3186 sayNO;
76e3520e 3187 nextchr = UCHARAT(++locinput);
a0d0e21e 3188 break;
d6a28714 3189 case DIGITL:
a0ed51b3
LW
3190 PL_reg_flags |= RF_tainted;
3191 /* FALL THROUGH */
d6a28714 3192 case DIGIT:
9442cb0e 3193 if (!nextchr)
a0ed51b3 3194 sayNO;
1aa99e6b 3195 if (do_utf8) {
1a4fad37 3196 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3197 if (!(OP(scan) == DIGIT
bb7a0f54 3198 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3199 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3200 {
a0ed51b3 3201 sayNO;
dfe13c55 3202 }
6f06b55f 3203 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3204 nextchr = UCHARAT(locinput);
3205 break;
3206 }
ffc61ed2 3207 if (!(OP(scan) == DIGIT
9442cb0e 3208 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3209 sayNO;
3210 nextchr = UCHARAT(++locinput);
3211 break;
d6a28714 3212 case NDIGITL:
b8c5462f
JH
3213 PL_reg_flags |= RF_tainted;
3214 /* FALL THROUGH */
d6a28714 3215 case NDIGIT:
9442cb0e 3216 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3217 sayNO;
1aa99e6b 3218 if (do_utf8) {
1a4fad37 3219 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3220 if (OP(scan) == NDIGIT
bb7a0f54 3221 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3222 : isDIGIT_LC_utf8((U8*)locinput))
3223 {
a0ed51b3 3224 sayNO;
9442cb0e 3225 }
6f06b55f 3226 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3227 nextchr = UCHARAT(locinput);
3228 break;
3229 }
ffc61ed2 3230 if (OP(scan) == NDIGIT
9442cb0e 3231 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3232 sayNO;
3233 nextchr = UCHARAT(++locinput);
3234 break;
3235 case CLUMP:
b7c83a7e 3236 if (locinput >= PL_regeol)
a0ed51b3 3237 sayNO;
b7c83a7e 3238 if (do_utf8) {
1a4fad37 3239 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3240 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3241 sayNO;
3242 locinput += PL_utf8skip[nextchr];
3243 while (locinput < PL_regeol &&
3244 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3245 locinput += UTF8SKIP(locinput);
3246 if (locinput > PL_regeol)
3247 sayNO;
eb08e2da
JH
3248 }
3249 else
3250 locinput++;
a0ed51b3
LW
3251 nextchr = UCHARAT(locinput);
3252 break;
c8756f30 3253 case REFFL:
3280af22 3254 PL_reg_flags |= RF_tainted;
c8756f30 3255 /* FALL THROUGH */
c277df42 3256 case REF:
95b24440
DM
3257 case REFF: {
3258 char *s;
c277df42 3259 n = ARG(scan); /* which paren pair */
5d9a96ca 3260 st->ln = PL_regstartp[n];
2c2d71f5 3261 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3262 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3263 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3264 if (st->ln == PL_regendp[n])
a0d0e21e 3265 break;
a0ed51b3 3266
5d9a96ca 3267 s = PL_bostr + st->ln;
1aa99e6b 3268 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3269 char *l = locinput;
a3b680e6 3270 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3271 /*
3272 * Note that we can't do the "other character" lookup trick as
3273 * in the 8-bit case (no pun intended) because in Unicode we
3274 * have to map both upper and title case to lower case.
3275 */
3276 if (OP(scan) == REFF) {
3277 while (s < e) {
a3b680e6
AL
3278 STRLEN ulen1, ulen2;
3279 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3280 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3281
a0ed51b3
LW
3282 if (l >= PL_regeol)
3283 sayNO;
a2a2844f
JH
3284 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3285 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3286 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3287 sayNO;
a2a2844f
JH
3288 s += ulen1;
3289 l += ulen2;
a0ed51b3
LW
3290 }
3291 }
3292 locinput = l;
3293 nextchr = UCHARAT(locinput);
3294 break;
3295 }
3296
a0d0e21e 3297 /* Inline the first character, for speed. */
76e3520e 3298 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3299 (OP(scan) == REF ||
3300 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3301 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3302 sayNO;
5d9a96ca
DM
3303 st->ln = PL_regendp[n] - st->ln;
3304 if (locinput + st->ln > PL_regeol)
4633a7c4 3305 sayNO;
5d9a96ca
DM
3306 if (st->ln > 1 && (OP(scan) == REF
3307 ? memNE(s, locinput, st->ln)
c8756f30 3308 : (OP(scan) == REFF
5d9a96ca
DM
3309 ? ibcmp(s, locinput, st->ln)
3310 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3311 sayNO;
5d9a96ca 3312 locinput += st->ln;
76e3520e 3313 nextchr = UCHARAT(locinput);
a0d0e21e 3314 break;
95b24440 3315 }
a0d0e21e
LW
3316
3317 case NOTHING:
c277df42 3318 case TAIL:
a0d0e21e
LW
3319 break;
3320 case BACK:
3321 break;
40a82448
DM
3322
3323#undef ST
3324#define ST st->u.eval
c277df42 3325 {
c277df42 3326 SV *ret;
6bda09f9
YO
3327 regexp *re;
3328 regnode *startpoint;
3329
3330 case SRECURSE:
3331 case RECURSE: /* /(...(?1))/ */
3332 if (cur_eval && cur_eval->locinput==locinput) {
3333 if (cur_eval->u.eval.close_paren == ARG(scan))
3334 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3335 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3336 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3337 } else {
3338 nochange_depth = 0;
3339 }
3340 re = rex;
3341 (void)ReREFCNT_inc(rex);
3342 if (OP(scan)==RECURSE) {
3343 startpoint = scan + ARG2L(scan);
3344 ST.close_paren = ARG(scan);
3345 } else {
3346 startpoint = re->program+1;
3347 ST.close_paren = 0;
3348 }
3349 goto eval_recurse_doit;
3350 /* NOTREACHED */
3351 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3352 if (cur_eval && cur_eval->locinput==locinput) {
3353 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3354 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3355 } else {
3356 nochange_depth = 0;
3357 }
8e5e9ebe 3358 {
4aabdb9b
DM
3359 /* execute the code in the {...} */
3360 dSP;
6136c704 3361 SV ** const before = SP;
4aabdb9b
DM
3362 OP_4tree * const oop = PL_op;
3363 COP * const ocurcop = PL_curcop;
3364 PAD *old_comppad;
4aabdb9b
DM
3365
3366 n = ARG(scan);
32fc9b6a 3367 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3368 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3369 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3370 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3371
8e5e9ebe
RGS
3372 CALLRUNOPS(aTHX); /* Scalar context. */
3373 SPAGAIN;
3374 if (SP == before)
075aa684 3375 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3376 else {
3377 ret = POPs;
3378 PUTBACK;
3379 }
4aabdb9b
DM
3380
3381 PL_op = oop;
3382 PAD_RESTORE_LOCAL(old_comppad);
3383 PL_curcop = ocurcop;
3384 if (!st->logical) {
3385 /* /(?{...})/ */
3386 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3387 break;
3388 }
8e5e9ebe 3389 }
4aabdb9b 3390 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
6bda09f9 3391
4aabdb9b 3392 {
4f639d21
DM
3393 /* extract RE object from returned value; compiling if
3394 * necessary */
3395
6136c704 3396 MAGIC *mg = NULL;
be8e71aa 3397 const SV *sv;
faf82a0b
AE
3398 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3399 mg = mg_find(sv, PERL_MAGIC_qr);
3400 else if (SvSMAGICAL(ret)) {
3401 if (SvGMAGICAL(ret))
3402 sv_unmagic(ret, PERL_MAGIC_qr);
3403 else
3404 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3405 }
faf82a0b 3406
0f5d15d6
IZ
3407 if (mg) {
3408 re = (regexp *)mg->mg_obj;
df0003d4 3409 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3410 }
3411 else {
3412 STRLEN len;
6136c704 3413 const char * const t = SvPV_const(ret, len);
0f5d15d6 3414 PMOP pm;
a3b680e6 3415 const I32 osize = PL_regsize;
0f5d15d6 3416
5fcd1c1b 3417 Zero(&pm, 1, PMOP);
4aabdb9b 3418 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3419 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3420 if (!(SvFLAGS(ret)
faf82a0b
AE
3421 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3422 | SVs_GMG)))
14befaf4
DM
3423 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3424 PERL_MAGIC_qr,0,0);
0f5d15d6 3425 PL_regsize = osize;
0f5d15d6 3426 }
4aabdb9b 3427 }
6bda09f9
YO
3428 DEBUG_EXECUTE_r(
3429 debug_start_match(re, do_utf8, locinput, PL_regeol,
3430 "Matching embedded");
3431 );
3432 startpoint = re->program + 1;
3433 ST.close_paren = 0; /* only used for RECURSE */
3434 /* borrowed from regtry */
3435 if (PL_reg_start_tmpl <= re->nparens) {
3436 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3437 if(PL_reg_start_tmp)
3438 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3439 else
3440 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3441 }
aa283a38 3442
6bda09f9 3443 eval_recurse_doit: /* Share code with RECURSE below this line */
aa283a38 3444 /* run the pattern returned from (??{...}) */
40a82448
DM
3445 ST.cp = regcppush(0); /* Save *all* the positions. */
3446 REGCP_SET(ST.lastcp);
6bda09f9
YO
3447
3448 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3449 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3450
4aabdb9b
DM
3451 *PL_reglastparen = 0;
3452 *PL_reglastcloseparen = 0;
4aabdb9b 3453 PL_reginput = locinput;
4aabdb9b
DM
3454
3455 /* XXXX This is too dramatic a measure... */
3456 PL_reg_maxiter = 0;
3457
5d9a96ca 3458 st->logical = 0;
faec1544
DM
3459 ST.toggle_reg_flags = PL_reg_flags;
3460 if (re->reganch & ROPT_UTF8)
3461 PL_reg_flags |= RF_utf8;
3462 else
3463 PL_reg_flags &= ~RF_utf8;
3464 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3465
40a82448 3466 ST.prev_rex = rex;
faec1544 3467 ST.prev_curlyx = cur_curlyx;
aa283a38 3468 rex = re;
faec1544 3469 cur_curlyx = NULL;
40a82448 3470 ST.B = next;
faec1544
DM
3471 ST.prev_eval = cur_eval;
3472 cur_eval = st;
faec1544 3473 /* now continue from first node in postoned RE */
6bda09f9 3474 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3475 /* NOTREACHED */
a0ed51b3 3476 }
4aabdb9b 3477 /* /(?(?{...})X|Y)/ */
f9f4320a 3478 st->sw = (bool)SvTRUE(ret);
4aabdb9b 3479 st->logical = 0;
c277df42
IZ
3480 break;
3481 }
40a82448 3482
faec1544
DM
3483 case EVAL_AB: /* cleanup after a successful (??{A})B */
3484 /* note: this is called twice; first after popping B, then A */
3485 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3486 ReREFCNT_dec(rex);
3487 rex = ST.prev_rex;
faec1544
DM
3488 regcpblow(ST.cp);
3489 cur_eval = ST.prev_eval;
3490 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3491 /* XXXX This is too dramatic a measure... */
3492 PL_reg_maxiter = 0;
262b90c4 3493 sayYES;
40a82448 3494
40a82448 3495
faec1544
DM
3496 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3497 /* note: this is called twice; first after popping B, then A */
3498 PL_reg_flags ^= ST.toggle_reg_flags;
3499 ReREFCNT_dec(rex);
3500 rex = ST.prev_rex;
40a82448
DM
3501 PL_reginput = locinput;
3502 REGCP_UNWIND(ST.lastcp);
3503 regcppop(rex);
faec1544
DM
3504 cur_eval = ST.prev_eval;
3505 cur_curlyx = ST.prev_curlyx;
3506 /* XXXX This is too dramatic a measure... */
3507 PL_reg_maxiter = 0;
40a82448 3508 sayNO_SILENT;
40a82448
DM
3509#undef ST
3510
a0d0e21e 3511 case OPEN:
c277df42 3512 n = ARG(scan); /* which paren pair */
3280af22
NIS
3513 PL_reg_start_tmp[n] = locinput;
3514 if (n > PL_regsize)
3515 PL_regsize = n;
a0d0e21e
LW
3516 break;
3517 case CLOSE:
c277df42 3518 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3519 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3520 PL_regendp[n] = locinput - PL_bostr;
eb160463 3521 if (n > (I32)*PL_reglastparen)
3280af22 3522 *PL_reglastparen = n;
a01268b5 3523 *PL_reglastcloseparen = n;
6bda09f9
YO
3524 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3525 goto fake_end;
3526 }
a0d0e21e 3527 break;
c277df42
IZ
3528 case GROUPP:
3529 n = ARG(scan); /* which paren pair */
f9f4320a 3530 st->sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3531 break;
3532 case IFTHEN:
2c2d71f5 3533 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3534 if (st->sw)
c277df42
IZ
3535 next = NEXTOPER(NEXTOPER(scan));
3536 else {
3537 next = scan + ARG(scan);
3538 if (OP(next) == IFTHEN) /* Fake one. */
3539 next = NEXTOPER(NEXTOPER(next));
3540 }
3541 break;
3542 case LOGICAL:
5d9a96ca 3543 st->logical = scan->flags;
c277df42 3544 break;
c476f425 3545
2ab05381 3546/*******************************************************************
2ab05381 3547
c476f425
DM
3548The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3549pattern, where A and B are subpatterns. (For simple A, CURLYM or
3550STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3551
c476f425 3552A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3553
c476f425
DM
3554On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3555state, which contains the current count, initialised to -1. It also sets
3556cur_curlyx to point to this state, with any previous value saved in the
3557state block.
2ab05381 3558
c476f425
DM
3559CURLYX then jumps straight to the WHILEM op, rather than executing A,
3560since the pattern may possibly match zero times (i.e. it's a while {} loop
3561rather than a do {} while loop).
2ab05381 3562
c476f425
DM
3563Each entry to WHILEM represents a successful match of A. The count in the
3564CURLYX block is incremented, another WHILEM state is pushed, and execution
3565passes to A or B depending on greediness and the current count.
2ab05381 3566
c476f425
DM
3567For example, if matching against the string a1a2a3b (where the aN are
3568substrings that match /A/), then the match progresses as follows: (the
3569pushed states are interspersed with the bits of strings matched so far):
2ab05381 3570
c476f425
DM
3571 <CURLYX cnt=-1>
3572 <CURLYX cnt=0><WHILEM>
3573 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3574 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3575 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3576 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3577
c476f425
DM
3578(Contrast this with something like CURLYM, which maintains only a single
3579backtrack state:
2ab05381 3580
c476f425
DM
3581 <CURLYM cnt=0> a1
3582 a1 <CURLYM cnt=1> a2
3583 a1 a2 <CURLYM cnt=2> a3
3584 a1 a2 a3 <CURLYM cnt=3> b
3585)
2ab05381 3586
c476f425
DM
3587Each WHILEM state block marks a point to backtrack to upon partial failure
3588of A or B, and also contains some minor state data related to that
3589iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3590overall state, such as the count, and pointers to the A and B ops.
2ab05381 3591
c476f425
DM
3592This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3593must always point to the *current* CURLYX block, the rules are:
2ab05381 3594
c476f425
DM
3595When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3596and set cur_curlyx to point the new block.
2ab05381 3597
c476f425
DM
3598When popping the CURLYX block after a successful or unsuccessful match,
3599restore the previous cur_curlyx.
2ab05381 3600
c476f425
DM
3601When WHILEM is about to execute B, save the current cur_curlyx, and set it
3602to the outer one saved in the CURLYX block.
2ab05381 3603
c476f425
DM
3604When popping the WHILEM block after a successful or unsuccessful B match,
3605restore the previous cur_curlyx.
2ab05381 3606
c476f425
DM
3607Here's an example for the pattern (AI* BI)*BO
3608I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3609
c476f425
DM
3610cur_
3611curlyx backtrack stack
3612------ ---------------
3613NULL
3614CO <CO prev=NULL> <WO>
3615CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3616CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3617NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3618
c476f425
DM
3619At this point the pattern succeeds, and we work back down the stack to
3620clean up, restoring as we go:
95b24440 3621
c476f425
DM
3622CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3623CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3624CO <CO prev=NULL> <WO>
3625NULL
a0374537 3626
c476f425
DM
3627*******************************************************************/
3628
3629#define ST st->u.curlyx
3630
3631 case CURLYX: /* start of /A*B/ (for complex A) */
3632 {
3633 /* No need to save/restore up to this paren */
3634 I32 parenfloor = scan->flags;
3635
3636 assert(next); /* keep Coverity happy */
3637 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3638 next += ARG(next);
3639
3640 /* XXXX Probably it is better to teach regpush to support
3641 parenfloor > PL_regsize... */
3642 if (parenfloor > (I32)*PL_reglastparen)
3643 parenfloor = *PL_reglastparen; /* Pessimization... */
3644
3645 ST.prev_curlyx= cur_curlyx;
3646 cur_curlyx = st;
3647 ST.cp = PL_savestack_ix;
3648
3649 /* these fields contain the state of the current curly.
3650 * they are accessed by subsequent WHILEMs */
3651 ST.parenfloor = parenfloor;
3652 ST.min = ARG1(scan);
3653 ST.max = ARG2(scan);
3654 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3655 ST.B = next;
3656 ST.minmod = st->minmod;
3657 ST.count = -1; /* this will be updated by WHILEM */
3658 ST.lastloc = NULL; /* this will be updated by WHILEM */
3659
3660 PL_reginput = locinput;
3661 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 3662 /* NOTREACHED */
c476f425 3663 }
a0d0e21e 3664
c476f425
DM
3665 case CURLYX_end: /* just finished matching all of A*B */
3666 regcpblow(ST.cp);
3667 cur_curlyx = ST.prev_curlyx;
3668 sayYES;
3669 /* NOTREACHED */
a0d0e21e 3670
c476f425
DM
3671 case CURLYX_end_fail: /* just failed to match all of A*B */
3672 regcpblow(ST.cp);
3673 cur_curlyx = ST.prev_curlyx;
3674 sayNO;
3675 /* NOTREACHED */
4633a7c4 3676
a0d0e21e 3677
c476f425
DM
3678#undef ST
3679#define ST st->u.whilem
3680
3681 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3682 {
3683 /* see the discussion above about CURLYX/WHILEM */
3684
3685 I32 n;
3686 assert(cur_curlyx); /* keep Coverity happy */
3687 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3688 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3689 ST.cache_offset = 0;
3690 ST.cache_mask = 0;
3691
3692 PL_reginput = locinput;
3693
3694 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3695 "%*s whilem: matched %ld out of %ld..%ld\n",
3696 REPORT_CODE_OFF+depth*2, "", (long)n,
3697 (long)cur_curlyx->u.curlyx.min,
3698 (long)cur_curlyx->u.curlyx.max)
3699 );
a0d0e21e 3700
c476f425 3701 /* First just match a string of min A's. */
a0d0e21e 3702
c476f425
DM
3703 if (n < cur_curlyx->u.curlyx.min) {
3704 cur_curlyx->u.curlyx.lastloc = locinput;
3705 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3706 /* NOTREACHED */
3707 }
3708
3709 /* If degenerate A matches "", assume A done. */
3710
3711 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3712 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3713 "%*s whilem: empty match detected, trying continuation...\n",
3714 REPORT_CODE_OFF+depth*2, "")
3715 );
3716 goto do_whilem_B_max;
3717 }
3718
3719 /* super-linear cache processing */
3720
3721 if (scan->flags) {
a0d0e21e 3722
2c2d71f5 3723 if (!PL_reg_maxiter) {
c476f425
DM
3724 /* start the countdown: Postpone detection until we
3725 * know the match is not *that* much linear. */
2c2d71f5 3726 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3727 /* possible overflow for long strings and many CURLYX's */
3728 if (PL_reg_maxiter < 0)
3729 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3730 PL_reg_leftiter = PL_reg_maxiter;
3731 }
c476f425 3732
2c2d71f5 3733 if (PL_reg_leftiter-- == 0) {
c476f425 3734 /* initialise cache */
3298f257 3735 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3736 if (PL_reg_poscache) {
eb160463 3737 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3738 Renew(PL_reg_poscache, size, char);
3739 PL_reg_poscache_size = size;
3740 }
3741 Zero(PL_reg_poscache, size, char);
3742 }
3743 else {
3744 PL_reg_poscache_size = size;
a02a5408 3745 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3746 }
c476f425
DM
3747 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3748 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3749 PL_colors[4], PL_colors[5])
3750 );
2c2d71f5 3751 }
c476f425 3752
2c2d71f5 3753 if (PL_reg_leftiter < 0) {
c476f425
DM
3754 /* have we already failed at this position? */
3755 I32 offset, mask;
3756 offset = (scan->flags & 0xf) - 1
3757 + (locinput - PL_bostr) * (scan->flags>>4);
3758 mask = 1 << (offset % 8);
3759 offset /= 8;
3760 if (PL_reg_poscache[offset] & mask) {
3761 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3762 "%*s whilem: (cache) already tried at this position...\n",
3763 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 3764 );
3298f257 3765 sayNO; /* cache records failure */
2c2d71f5 3766 }
c476f425
DM
3767 ST.cache_offset = offset;
3768 ST.cache_mask = mask;
2c2d71f5 3769 }
c476f425 3770 }
2c2d71f5 3771
c476f425 3772 /* Prefer B over A for minimal matching. */
a687059c 3773
c476f425
DM
3774 if (cur_curlyx->u.curlyx.minmod) {
3775 ST.save_curlyx = cur_curlyx;
3776 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3777 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3778 REGCP_SET(ST.lastcp);
3779 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3780 /* NOTREACHED */
3781 }
a0d0e21e 3782
c476f425
DM
3783 /* Prefer A over B for maximal matching. */
3784
3785 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3786 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3787 cur_curlyx->u.curlyx.lastloc = locinput;
3788 REGCP_SET(ST.lastcp);
3789 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3790 /* NOTREACHED */
3791 }
3792 goto do_whilem_B_max;
3793 }
3794 /* NOTREACHED */
3795
3796 case WHILEM_B_min: /* just matched B in a minimal match */
3797 case WHILEM_B_max: /* just matched B in a maximal match */
3798 cur_curlyx = ST.save_curlyx;
3799 sayYES;
3800 /* NOTREACHED */
3801
3802 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3803 cur_curlyx = ST.save_curlyx;
3804 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3805 cur_curlyx->u.curlyx.count--;
3806 CACHEsayNO;
3807 /* NOTREACHED */
3808
3809 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3810 REGCP_UNWIND(ST.lastcp);
3811 regcppop(rex);
3812 /* FALL THROUGH */
3813 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3814 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3815 cur_curlyx->u.curlyx.count--;
3816 CACHEsayNO;
3817 /* NOTREACHED */
3818
3819 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3820 REGCP_UNWIND(ST.lastcp);
3821 regcppop(rex); /* Restore some previous $<digit>s? */
3822 PL_reginput = locinput;
3823 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3824 "%*s whilem: failed, trying continuation...\n",
3825 REPORT_CODE_OFF+depth*2, "")
3826 );
3827 do_whilem_B_max:
3828 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3829 && ckWARN(WARN_REGEXP)
3830 && !(PL_reg_flags & RF_warned))
3831 {
3832 PL_reg_flags |= RF_warned;
3833 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3834 "Complex regular subexpression recursion",
3835 REG_INFTY - 1);
3836 }
3837
3838 /* now try B */
3839 ST.save_curlyx = cur_curlyx;
3840 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3841 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3842 /* NOTREACHED */
3843
3844 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3845 cur_curlyx = ST.save_curlyx;
3846 REGCP_UNWIND(ST.lastcp);
3847 regcppop(rex);
3848
3849 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3850 /* Maximum greed exceeded */
3851 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3852 && ckWARN(WARN_REGEXP)
3853 && !(PL_reg_flags & RF_warned))
3854 {
3280af22 3855 PL_reg_flags |= RF_warned;
c476f425
DM
3856 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3857 "%s limit (%d) exceeded",
3858 "Complex regular subexpression recursion",
3859 REG_INFTY - 1);
a0d0e21e 3860 }
c476f425 3861 cur_curlyx->u.curlyx.count--;
3ab3c9b4 3862 CACHEsayNO;
a0d0e21e 3863 }
c476f425
DM
3864
3865 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3866 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3867 );
3868 /* Try grabbing another A and see if it helps. */
3869 PL_reginput = locinput;
3870 cur_curlyx->u.curlyx.lastloc = locinput;
3871 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3872 REGCP_SET(ST.lastcp);
3873 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 3874 /* NOTREACHED */
40a82448
DM
3875
3876#undef ST
3877#define ST st->u.branch
3878
3879 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3880 next = scan + ARG(scan);
3881 if (next == scan)
3882 next = NULL;
40a82448
DM
3883 scan = NEXTOPER(scan);
3884 /* FALL THROUGH */
c277df42 3885
40a82448
DM
3886 case BRANCH: /* /(...|A|...)/ */
3887 scan = NEXTOPER(scan); /* scan now points to inner node */
3888 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3889 /* last branch; skip state push and jump direct to node */
3890 continue;
3891 ST.lastparen = *PL_reglastparen;
3892 ST.next_branch = next;
3893 REGCP_SET(ST.cp);
3894 PL_reginput = locinput;
02db2b7b 3895
40a82448
DM
3896 /* Now go into the branch */
3897 PUSH_STATE_GOTO(BRANCH_next, scan);
3898 /* NOTREACHED */
3899
3900 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3901 REGCP_UNWIND(ST.cp);
3902 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3903 PL_regendp[n] = -1;
3904 *PL_reglastparen = n;
3905 scan = ST.next_branch;
3906 /* no more branches? */
3907 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3908 sayNO;
3909 continue; /* execute next BRANCH[J] op */
3910 /* NOTREACHED */
3911
a0d0e21e 3912 case MINMOD:
5d9a96ca 3913 st->minmod = 1;
a0d0e21e 3914 break;
40a82448
DM
3915
3916#undef ST
3917#define ST st->u.curlym
3918
3919 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3920
3921 /* This is an optimisation of CURLYX that enables us to push
3922 * only a single backtracking state, no matter now many matches
3923 * there are in {m,n}. It relies on the pattern being constant
3924 * length, with no parens to influence future backrefs
3925 */
3926
3927 ST.me = scan;
dc45a647 3928 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3929
3930 /* if paren positive, emulate an OPEN/CLOSE around A */
3931 if (ST.me->flags) {
3932 I32 paren = ST.me->flags;
3933 if (paren > PL_regsize)
3934 PL_regsize = paren;
3935 if (paren > (I32)*PL_reglastparen)
3936 *PL_reglastparen = paren;
c277df42 3937 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3938 }
40a82448
DM
3939 ST.A = scan;
3940 ST.B = next;
3941 ST.alen = 0;
3942 ST.count = 0;
3943 ST.minmod = st->minmod;
3944 st->minmod = 0;
3945 ST.c1 = CHRTEST_UNINIT;
3946 REGCP_SET(ST.cp);
6407bf3b 3947
40a82448
DM
3948 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3949 goto curlym_do_B;
3950
3951 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3952 PL_reginput = locinput;
40a82448
DM
3953 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3954 /* NOTREACHED */
5f80c4cf 3955
40a82448
DM
3956 case CURLYM_A: /* we've just matched an A */
3957 locinput = st->locinput;
3958 nextchr = UCHARAT(locinput);
3959
3960 ST.count++;
3961 /* after first match, determine A's length: u.curlym.alen */
3962 if (ST.count == 1) {
3963 if (PL_reg_match_utf8) {
3964 char *s = locinput;
3965 while (s < PL_reginput) {
3966 ST.alen++;
3967 s += UTF8SKIP(s);
3968 }
3969 }
3970 else {
3971 ST.alen = PL_reginput - locinput;
3972 }
3973 if (ST.alen == 0)
3974 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3975 }
0cadcf80
DM
3976 DEBUG_EXECUTE_r(
3977 PerlIO_printf(Perl_debug_log,
40a82448 3978 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 3979 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 3980 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3981 );
3982
40a82448
DM
3983 locinput = PL_reginput;
3984 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3985 goto curlym_do_A; /* try to match another A */
3986 goto curlym_do_B; /* try to match B */
5f80c4cf 3987
40a82448
DM
3988 case CURLYM_A_fail: /* just failed to match an A */
3989 REGCP_UNWIND(ST.cp);
3990 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3991 sayNO;
0cadcf80 3992
40a82448
DM
3993 curlym_do_B: /* execute the B in /A{m,n}B/ */
3994 PL_reginput = locinput;
3995 if (ST.c1 == CHRTEST_UNINIT) {
3996 /* calculate c1 and c2 for possible match of 1st char
3997 * following curly */
3998 ST.c1 = ST.c2 = CHRTEST_VOID;
3999 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4000 regnode *text_node = ST.B;
4001 if (! HAS_TEXT(text_node))
4002 FIND_NEXT_IMPT(text_node);
4003 if (HAS_TEXT(text_node)
4004 && PL_regkind[OP(text_node)] != REF)
4005 {
4006 ST.c1 = (U8)*STRING(text_node);
4007 ST.c2 =
4008 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4009 ? PL_fold[ST.c1]
4010 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4011 ? PL_fold_locale[ST.c1]
4012 : ST.c1;
c277df42 4013 }
c277df42 4014 }
40a82448
DM
4015 }
4016
4017 DEBUG_EXECUTE_r(
4018 PerlIO_printf(Perl_debug_log,
4019 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4020 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4021 "", (IV)ST.count)
4022 );
4023 if (ST.c1 != CHRTEST_VOID
4024 && UCHARAT(PL_reginput) != ST.c1
4025 && UCHARAT(PL_reginput) != ST.c2)
4026 {
4027 /* simulate B failing */
4028 state_num = CURLYM_B_fail;
4029 goto reenter_switch;
4030 }
4031
4032 if (ST.me->flags) {
4033 /* mark current A as captured */
4034 I32 paren = ST.me->flags;
4035 if (ST.count) {
4036 PL_regstartp[paren]
4037 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4038 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4039 }
40a82448
DM
4040 else
4041 PL_regendp[paren] = -1;
c277df42 4042 }
40a82448 4043 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4044 /* NOTREACHED */
40a82448
DM
4045
4046 case CURLYM_B_fail: /* just failed to match a B */
4047 REGCP_UNWIND(ST.cp);
4048 if (ST.minmod) {
4049 if (ST.count == ARG2(ST.me) /* max */)
4050 sayNO;
4051 goto curlym_do_A; /* try to match a further A */
4052 }
4053 /* backtrack one A */
4054 if (ST.count == ARG1(ST.me) /* min */)
4055 sayNO;
4056 ST.count--;
4057 locinput = HOPc(locinput, -ST.alen);
4058 goto curlym_do_B; /* try to match B */
4059
c255a977
DM
4060#undef ST
4061#define ST st->u.curly
40a82448 4062
c255a977
DM
4063#define CURLY_SETPAREN(paren, success) \
4064 if (paren) { \
4065 if (success) { \
4066 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4067 PL_regendp[paren] = locinput - PL_bostr; \
4068 } \
4069 else \
4070 PL_regendp[paren] = -1; \
4071 }
4072
4073 case STAR: /* /A*B/ where A is width 1 */
4074 ST.paren = 0;
4075 ST.min = 0;
4076 ST.max = REG_INFTY;
a0d0e21e
LW
4077 scan = NEXTOPER(scan);
4078 goto repeat;
c255a977
DM
4079 case PLUS: /* /A+B/ where A is width 1 */
4080 ST.paren = 0;
4081 ST.min = 1;
4082 ST.max = REG_INFTY;
c277df42 4083 scan = NEXTOPER(scan);
c255a977
DM
4084 goto repeat;
4085 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4086 ST.paren = scan->flags; /* Which paren to set */
4087 if (ST.paren > PL_regsize)
4088 PL_regsize = ST.paren;
4089 if (ST.paren > (I32)*PL_reglastparen)
4090 *PL_reglastparen = ST.paren;
4091 ST.min = ARG1(scan); /* min to match */
4092 ST.max = ARG2(scan); /* max to match */
4093 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4094 goto repeat;
4095 case CURLY: /* /A{m,n}B/ where A is width 1 */
4096 ST.paren = 0;
4097 ST.min = ARG1(scan); /* min to match */
4098 ST.max = ARG2(scan); /* max to match */
4099 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4100 repeat:
a0d0e21e
LW
4101 /*
4102 * Lookahead to avoid useless match attempts
4103 * when we know what character comes next.
c255a977 4104 *
5f80c4cf
JP
4105 * Used to only do .*x and .*?x, but now it allows
4106 * for )'s, ('s and (?{ ... })'s to be in the way
4107 * of the quantifier and the EXACT-like node. -- japhy
4108 */
4109
c255a977
DM
4110 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4111 sayNO;
cca55fe3 4112 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4113 U8 *s;
4114 regnode *text_node = next;
4115
3dab1dad
YO
4116 if (! HAS_TEXT(text_node))
4117 FIND_NEXT_IMPT(text_node);
5f80c4cf 4118
9e137952 4119 if (! HAS_TEXT(text_node))
c255a977 4120 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4121 else {
3dab1dad 4122 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4123 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4124 goto assume_ok_easy;
cca55fe3 4125 }
be8e71aa
YO
4126 else
4127 s = (U8*)STRING(text_node);
5f80c4cf
JP
4128
4129 if (!UTF) {
c255a977 4130 ST.c2 = ST.c1 = *s;
f65d3ee7 4131 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4132 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4133 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4134 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4135 }
5f80c4cf 4136 else { /* UTF */
f65d3ee7 4137 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4138 STRLEN ulen1, ulen2;
89ebb4a3
JH
4139 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4140 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4141
4142 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4143 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4144#ifdef EBCDIC
4145 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4146 ckWARN(WARN_UTF8) ?
4147 0 : UTF8_ALLOW_ANY);
4148 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4149 ckWARN(WARN_UTF8) ?
4150 0 : UTF8_ALLOW_ANY);
4151#else
c255a977 4152 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4153 uniflags);
c255a977 4154 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4155 uniflags);
4156#endif
5f80c4cf
JP
4157 }
4158 else {
c255a977 4159 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4160 uniflags);
5f80c4cf 4161 }
1aa99e6b
IH
4162 }
4163 }
bbce6d69 4164 }
a0d0e21e 4165 else
c255a977 4166 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4167 assume_ok_easy:
c255a977
DM
4168
4169 ST.A = scan;
4170 ST.B = next;
3280af22 4171 PL_reginput = locinput;
5d9a96ca
DM
4172 if (st->minmod) {
4173 st->minmod = 0;
c255a977 4174 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4175 sayNO;
c255a977 4176 ST.count = ST.min;
a0ed51b3 4177 locinput = PL_reginput;
c255a977
DM
4178 REGCP_SET(ST.cp);
4179 if (ST.c1 == CHRTEST_VOID)
4180 goto curly_try_B_min;
4181
4182 ST.oldloc = locinput;
4183
4184 /* set ST.maxpos to the furthest point along the
4185 * string that could possibly match */
4186 if (ST.max == REG_INFTY) {
4187 ST.maxpos = PL_regeol - 1;
4188 if (do_utf8)
4189 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4190 ST.maxpos--;
4191 }
4192 else if (do_utf8) {
4193 int m = ST.max - ST.min;
4194 for (ST.maxpos = locinput;
4195 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4196 ST.maxpos += UTF8SKIP(ST.maxpos);
4197 }
4198 else {
4199 ST.maxpos = locinput + ST.max - ST.min;
4200 if (ST.maxpos >= PL_regeol)
4201 ST.maxpos = PL_regeol - 1;
4202 }
4203 goto curly_try_B_min_known;
4204
4205 }
4206 else {
4207 ST.count = regrepeat(rex, ST.A, ST.max);
4208 locinput = PL_reginput;
4209 if (ST.count < ST.min)
4210 sayNO;
4211 if ((ST.count > ST.min)
4212 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4213 {
4214 /* A{m,n} must come at the end of the string, there's
4215 * no point in backing off ... */
4216 ST.min = ST.count;
4217 /* ...except that $ and \Z can match before *and* after
4218 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4219 We may back off by one in this case. */
4220 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4221 ST.min--;
4222 }
4223 REGCP_SET(ST.cp);
4224 goto curly_try_B_max;
4225 }
4226 /* NOTREACHED */
4227
4228
4229 case CURLY_B_min_known_fail:
4230 /* failed to find B in a non-greedy match where c1,c2 valid */
4231 if (ST.paren && ST.count)
4232 PL_regendp[ST.paren] = -1;
4233
4234 PL_reginput = locinput; /* Could be reset... */
4235 REGCP_UNWIND(ST.cp);
4236 /* Couldn't or didn't -- move forward. */
4237 ST.oldloc = locinput;
4238 if (do_utf8)
4239 locinput += UTF8SKIP(locinput);
4240 else
4241 locinput++;
4242 ST.count++;
4243 curly_try_B_min_known:
4244 /* find the next place where 'B' could work, then call B */
4245 {
4246 int n;
4247 if (do_utf8) {
4248 n = (ST.oldloc == locinput) ? 0 : 1;
4249 if (ST.c1 == ST.c2) {
4250 STRLEN len;
4251 /* set n to utf8_distance(oldloc, locinput) */
4252 while (locinput <= ST.maxpos &&
4253 utf8n_to_uvchr((U8*)locinput,
4254 UTF8_MAXBYTES, &len,
4255 uniflags) != (UV)ST.c1) {
4256 locinput += len;
4257 n++;
4258 }
1aa99e6b
IH
4259 }
4260 else {
c255a977
DM
4261 /* set n to utf8_distance(oldloc, locinput) */
4262 while (locinput <= ST.maxpos) {
4263 STRLEN len;
4264 const UV c = utf8n_to_uvchr((U8*)locinput,
4265 UTF8_MAXBYTES, &len,
4266 uniflags);
4267 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4268 break;
4269 locinput += len;
4270 n++;
1aa99e6b 4271 }
0fe9bf95
IZ
4272 }
4273 }
c255a977
DM
4274 else {
4275 if (ST.c1 == ST.c2) {
4276 while (locinput <= ST.maxpos &&
4277 UCHARAT(locinput) != ST.c1)
4278 locinput++;
bbce6d69 4279 }
c255a977
DM
4280 else {
4281 while (locinput <= ST.maxpos
4282 && UCHARAT(locinput) != ST.c1
4283 && UCHARAT(locinput) != ST.c2)
4284 locinput++;
a0ed51b3 4285 }
c255a977
DM
4286 n = locinput - ST.oldloc;
4287 }
4288 if (locinput > ST.maxpos)
4289 sayNO;
4290 /* PL_reginput == oldloc now */
4291 if (n) {
4292 ST.count += n;
4293 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4294 sayNO;
a0d0e21e 4295 }
c255a977
DM
4296 PL_reginput = locinput;
4297 CURLY_SETPAREN(ST.paren, ST.count);
4298 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4299 }
c255a977
DM
4300 /* NOTREACHED */
4301
4302
4303 case CURLY_B_min_fail:
4304 /* failed to find B in a non-greedy match where c1,c2 invalid */
4305 if (ST.paren && ST.count)
4306 PL_regendp[ST.paren] = -1;
4307
4308 REGCP_UNWIND(ST.cp);
4309 /* failed -- move forward one */
4310 PL_reginput = locinput;
4311 if (regrepeat(rex, ST.A, 1)) {
4312 ST.count++;
a0ed51b3 4313 locinput = PL_reginput;
c255a977
DM
4314 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4315 ST.count > 0)) /* count overflow ? */
15272685 4316 {
c255a977
DM
4317 curly_try_B_min:
4318 CURLY_SETPAREN(ST.paren, ST.count);
4319 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4320 }
4321 }
4633a7c4 4322 sayNO;
c255a977
DM
4323 /* NOTREACHED */
4324
4325
4326 curly_try_B_max:
4327 /* a successful greedy match: now try to match B */
4328 {
4329 UV c = 0;
4330 if (ST.c1 != CHRTEST_VOID)
4331 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4332 UTF8_MAXBYTES, 0, uniflags)
466787eb 4333 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4334 /* If it could work, try it. */
4335 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4336 CURLY_SETPAREN(ST.paren, ST.count);
4337 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4338 /* NOTREACHED */
4339 }
4340 }
4341 /* FALL THROUGH */
4342 case CURLY_B_max_fail:
4343 /* failed to find B in a greedy match */
4344 if (ST.paren && ST.count)
4345 PL_regendp[ST.paren] = -1;
4346
4347 REGCP_UNWIND(ST.cp);
4348 /* back up. */
4349 if (--ST.count < ST.min)
4350 sayNO;
4351 PL_reginput = locinput = HOPc(locinput, -1);
4352 goto curly_try_B_max;
4353
4354#undef ST
4355
4356
a0d0e21e 4357 case END:
6bda09f9 4358 fake_end:
faec1544
DM
4359 if (cur_eval) {
4360 /* we've just finished A in /(??{A})B/; now continue with B */
4361 I32 tmpix;
4362
4363
4364 st->u.eval.toggle_reg_flags
4365 = cur_eval->u.eval.toggle_reg_flags;
4366 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4367
4368 st->u.eval.prev_rex = rex; /* inner */
4369 rex = cur_eval->u.eval.prev_rex; /* outer */
4370 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4371 ReREFCNT_inc(rex);
4372 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4373 REGCP_SET(st->u.eval.lastcp);
4374 PL_reginput = locinput;
4375
4376 /* Restore parens of the outer rex without popping the
4377 * savestack */
4378 tmpix = PL_savestack_ix;
4379 PL_savestack_ix = cur_eval->u.eval.lastcp;
4380 regcppop(rex);
4381 PL_savestack_ix = tmpix;
4382
4383 st->u.eval.prev_eval = cur_eval;
4384 cur_eval = cur_eval->u.eval.prev_eval;
4385 DEBUG_EXECUTE_r(
6bda09f9
YO
4386 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4387 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
faec1544
DM
4388 PUSH_YES_STATE_GOTO(EVAL_AB,
4389 st->u.eval.prev_eval->u.eval.B); /* match B */
4390 }
4391
3b0527fe 4392 if (locinput < reginfo->till) {
a3621e74 4393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4394 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4395 PL_colors[4],
4396 (long)(locinput - PL_reg_starttry),
3b0527fe 4397 (long)(reginfo->till - PL_reg_starttry),
7821416a 4398 PL_colors[5]));
262b90c4 4399 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4400 }
4401 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4402 sayYES; /* Success! */
dad79028
DM
4403
4404 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4405 DEBUG_EXECUTE_r(
4406 PerlIO_printf(Perl_debug_log,
4407 "%*s %ssubpattern success...%s\n",
5bc10b2c 4408 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4409 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4410 sayYES; /* Success! */
dad79028 4411
40a82448
DM
4412#undef ST
4413#define ST st->u.ifmatch
4414
4415 case SUSPEND: /* (?>A) */
4416 ST.wanted = 1;
9fe1d20c 4417 PL_reginput = locinput;
9041c2e3 4418 goto do_ifmatch;
dad79028 4419
40a82448
DM
4420 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4421 ST.wanted = 0;
dad79028
DM
4422 goto ifmatch_trivial_fail_test;
4423
40a82448
DM
4424 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4425 ST.wanted = 1;
dad79028 4426 ifmatch_trivial_fail_test:
a0ed51b3 4427 if (scan->flags) {
52657f30 4428 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4429 if (!s) {
4430 /* trivial fail */
4431 if (st->logical) {
4432 st->logical = 0;
40a82448 4433 st->sw = 1 - (bool)ST.wanted;
dad79028 4434 }
40a82448 4435 else if (ST.wanted)
dad79028
DM
4436 sayNO;
4437 next = scan + ARG(scan);
4438 if (next == scan)
4439 next = NULL;
4440 break;
4441 }
efb30f32 4442 PL_reginput = s;
a0ed51b3
LW
4443 }
4444 else
4445 PL_reginput = locinput;
4446
c277df42 4447 do_ifmatch:
40a82448
DM
4448 ST.me = scan;
4449 /* execute body of (?...A) */
4450 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4451 /* NOTREACHED */
4452
4453 case IFMATCH_A_fail: /* body of (?...A) failed */
4454 ST.wanted = !ST.wanted;
4455 /* FALL THROUGH */
4456
4457 case IFMATCH_A: /* body of (?...A) succeeded */
4458 if (st->logical) {
4459 st->logical = 0;
4460 st->sw = (bool)ST.wanted;
4461 }
4462 else if (!ST.wanted)
4463 sayNO;
4464
4465 if (OP(ST.me) == SUSPEND)
4466 locinput = PL_reginput;
4467 else {
4468 locinput = PL_reginput = st->locinput;
4469 nextchr = UCHARAT(locinput);
4470 }
4471 scan = ST.me + ARG(ST.me);
4472 if (scan == ST.me)
4473 scan = NULL;
4474 continue; /* execute B */
4475
4476#undef ST
dad79028 4477
c277df42 4478 case LONGJMP:
c277df42
IZ
4479 next = scan + ARG(scan);
4480 if (next == scan)
4481 next = NULL;
a0d0e21e
LW
4482 break;
4483 default:
b900a521 4484 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4485 PTR2UV(scan), OP(scan));
cea2e8a9 4486 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4487 }
95b24440 4488
a0d0e21e 4489 scan = next;
95b24440
DM
4490 continue;
4491 /* NOTREACHED */
4492
40a82448
DM
4493 push_yes_state:
4494 /* push a state that backtracks on success */
4495 st->u.yes.prev_yes_state = yes_state;
4496 yes_state = st;
4497 /* FALL THROUGH */
4498 push_state:
4499 /* push a new regex state, then continue at scan */
4500 {
4501 regmatch_state *newst;
4502
5bc10b2c 4503 DEBUG_STATE_pp("push");
40a82448 4504 depth++;
40a82448
DM
4505 st->locinput = locinput;
4506 newst = st+1;
4507 if (newst > SLAB_LAST(PL_regmatch_slab))
4508 newst = S_push_slab(aTHX);
4509 PL_regmatch_state = newst;
40a82448
DM
4510 /* XXX probably don't need to initialise these */
4511 newst->minmod = 0;
4512 newst->sw = 0;
4513 newst->logical = 0;
4514
786e8c11 4515
40a82448
DM
4516 locinput = PL_reginput;
4517 nextchr = UCHARAT(locinput);
4518 st = newst;
4519 continue;
4520 /* NOTREACHED */
4521 }
a0d0e21e 4522 }
a687059c 4523
a0d0e21e
LW
4524 /*
4525 * We get here only if there's trouble -- normally "case END" is
4526 * the terminating point.
4527 */
cea2e8a9 4528 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4529 /*NOTREACHED*/
4633a7c4
LW
4530 sayNO;
4531
262b90c4 4532yes:
77cb431f
DM
4533 if (yes_state) {
4534 /* we have successfully completed a subexpression, but we must now
4535 * pop to the state marked by yes_state and continue from there */
77cb431f 4536 assert(st != yes_state);
5bc10b2c
DM
4537#ifdef DEBUGGING
4538 while (st != yes_state) {
4539 st--;
4540 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4541 PL_regmatch_slab = PL_regmatch_slab->prev;
4542 st = SLAB_LAST(PL_regmatch_slab);
4543 }
4544 DEBUG_STATE_pp("pop (yes)");
4545 depth--;
4546 }
4547#else
77cb431f
DM
4548 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4549 || yes_state > SLAB_LAST(PL_regmatch_slab))
4550 {
4551 /* not in this slab, pop slab */
4552 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4553 PL_regmatch_slab = PL_regmatch_slab->prev;
4554 st = SLAB_LAST(PL_regmatch_slab);
4555 }
4556 depth -= (st - yes_state);
5bc10b2c 4557#endif
77cb431f
DM
4558 st = yes_state;
4559 yes_state = st->u.yes.prev_yes_state;
4560 PL_regmatch_state = st;
4561
4562 switch (st->resume_state) {
262b90c4 4563
faec1544 4564 case EVAL_AB:
40a82448
DM
4565 case IFMATCH_A:
4566 case CURLYM_A:
c476f425
DM
4567 case CURLYX_end:
4568 case WHILEM_B_min:
4569 case WHILEM_B_max:
40a82448
DM
4570 state_num = st->resume_state;
4571 goto reenter_switch;
4572
4573 case CURLYM_B:
166ba7cd
DM
4574 case BRANCH_next:
4575 case TRIE_next:
c255a977 4576 case CURLY_B_max:
c476f425
DM
4577 case WHILEM_A_pre:
4578 case WHILEM_A_min:
4579 case WHILEM_A_max:
77cb431f 4580 default:
40a82448 4581 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4582 }
4583 }
4584
a3621e74 4585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4586 PL_colors[4], PL_colors[5]));
02db2b7b 4587
95b24440 4588 result = 1;
aa283a38 4589 goto final_exit;
4633a7c4
LW
4590
4591no:
a3621e74 4592 DEBUG_EXECUTE_r(
7821416a 4593 PerlIO_printf(Perl_debug_log,
786e8c11 4594 "%*s %sfailed...%s\n",
5bc10b2c 4595 REPORT_CODE_OFF+depth*2, "",
786e8c11 4596 PL_colors[4], PL_colors[5])
7821416a 4597 );
aa283a38 4598
262b90c4 4599no_silent:
95b24440 4600 result = 0;
5d9a96ca 4601
aa283a38
DM
4602 if (depth) {
4603 /* there's a previous state to backtrack to */
40a82448
DM
4604 st--;
4605 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4606 PL_regmatch_slab = PL_regmatch_slab->prev;
4607 st = SLAB_LAST(PL_regmatch_slab);
4608 }
4609 PL_regmatch_state = st;
4610 scan = st->scan;
4611 next = st->next;
4612 n = st->n;
4613 locinput= st->locinput;
4614 nextchr = UCHARAT(locinput);
4615
5bc10b2c
DM
4616 DEBUG_STATE_pp("pop");
4617 depth--;
262b90c4
DM
4618 if (yes_state == st)
4619 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 4620
5d9a96ca 4621 switch (st->resume_state) {
166ba7cd 4622 case TRIE_next:
faec1544 4623 case EVAL_AB:
40a82448
DM
4624 case BRANCH_next:
4625 case CURLYM_A:
4626 case CURLYM_B:
4627 case IFMATCH_A:
c255a977
DM
4628 case CURLY_B_max:
4629 case CURLY_B_min:
4630 case CURLY_B_min_known:
c476f425
DM
4631 case CURLYX_end:
4632 case WHILEM_A_pre:
4633 case WHILEM_A_min:
4634 case WHILEM_A_max:
4635 case WHILEM_B_min:
4636 case WHILEM_B_max:
40a82448
DM
4637 state_num = st->resume_state + 1; /* failure = success + 1 */
4638 goto reenter_switch;
dad79028 4639
95b24440
DM
4640 default:
4641 Perl_croak(aTHX_ "regexp resume memory corruption");
4642 }
95b24440 4643 }
aa283a38 4644
262b90c4 4645 final_exit:
aa283a38 4646
5d9a96ca
DM
4647 /* restore original high-water mark */
4648 PL_regmatch_slab = orig_slab;
4649 PL_regmatch_state = orig_state;
4650
4651 /* free all slabs above current one */
4652 if (orig_slab->next) {
c4fd8992 4653 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4654 orig_slab->next = NULL;
4655 while (sl) {
c4fd8992 4656 regmatch_slab * const osl = sl;
5d9a96ca 4657 sl = sl->next;
ad65c075 4658 Safefree(osl);
5d9a96ca
DM
4659 }
4660 }
4661
95b24440 4662 return result;
a687059c
LW
4663}
4664
4665/*
4666 - regrepeat - repeatedly match something simple, report how many
4667 */
4668/*
4669 * [This routine now assumes that it will only match on things of length 1.
4670 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4671 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4672 */
76e3520e 4673STATIC I32
32fc9b6a 4674S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4675{
27da23d5 4676 dVAR;
a0d0e21e 4677 register char *scan;
a0d0e21e 4678 register I32 c;
3280af22 4679 register char *loceol = PL_regeol;
a0ed51b3 4680 register I32 hardcount = 0;
53c4c00c 4681 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4682
3280af22 4683 scan = PL_reginput;
faf11cac
HS
4684 if (max == REG_INFTY)
4685 max = I32_MAX;
4686 else if (max < loceol - scan)
7f596f4c 4687 loceol = scan + max;
a0d0e21e 4688 switch (OP(p)) {
22c35a8c 4689 case REG_ANY:
1aa99e6b 4690 if (do_utf8) {
ffc61ed2 4691 loceol = PL_regeol;
1aa99e6b 4692 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4693 scan += UTF8SKIP(scan);
4694 hardcount++;
4695 }
4696 } else {
4697 while (scan < loceol && *scan != '\n')
4698 scan++;
a0ed51b3
LW
4699 }
4700 break;
ffc61ed2 4701 case SANY:
def8e4ea
JH
4702 if (do_utf8) {
4703 loceol = PL_regeol;
a0804c9e 4704 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4705 scan += UTF8SKIP(scan);
4706 hardcount++;
4707 }
4708 }
4709 else
4710 scan = loceol;
a0ed51b3 4711 break;
f33976b4
DB
4712 case CANY:
4713 scan = loceol;
4714 break;
090f7165
JH
4715 case EXACT: /* length of string is 1 */
4716 c = (U8)*STRING(p);
4717 while (scan < loceol && UCHARAT(scan) == c)
4718 scan++;
bbce6d69 4719 break;
4720 case EXACTF: /* length of string is 1 */
cd439c50 4721 c = (U8)*STRING(p);
bbce6d69 4722 while (scan < loceol &&
22c35a8c 4723 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4724 scan++;
4725 break;
4726 case EXACTFL: /* length of string is 1 */
3280af22 4727 PL_reg_flags |= RF_tainted;
cd439c50 4728 c = (U8)*STRING(p);
bbce6d69 4729 while (scan < loceol &&
22c35a8c 4730 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4731 scan++;
4732 break;
4733 case ANYOF:
ffc61ed2
JH
4734 if (do_utf8) {
4735 loceol = PL_regeol;
cfc92286 4736 while (hardcount < max && scan < loceol &&
32fc9b6a 4737 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4738 scan += UTF8SKIP(scan);
4739 hardcount++;
4740 }
4741 } else {
32fc9b6a 4742 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4743 scan++;
4744 }
a0d0e21e
LW
4745 break;
4746 case ALNUM:
1aa99e6b 4747 if (do_utf8) {
ffc61ed2 4748 loceol = PL_regeol;
1a4fad37 4749 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4750 while (hardcount < max && scan < loceol &&
3568d838 4751 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4752 scan += UTF8SKIP(scan);
4753 hardcount++;
4754 }
4755 } else {
4756 while (scan < loceol && isALNUM(*scan))
4757 scan++;
a0ed51b3
LW
4758 }
4759 break;
bbce6d69 4760 case ALNUML:
3280af22 4761 PL_reg_flags |= RF_tainted;
1aa99e6b 4762 if (do_utf8) {
ffc61ed2 4763 loceol = PL_regeol;
1aa99e6b
IH
4764 while (hardcount < max && scan < loceol &&
4765 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4766 scan += UTF8SKIP(scan);
4767 hardcount++;
4768 }
4769 } else {
4770 while (scan < loceol && isALNUM_LC(*scan))
4771 scan++;
a0ed51b3
LW
4772 }
4773 break;
a0d0e21e 4774 case NALNUM:
1aa99e6b 4775 if (do_utf8) {
ffc61ed2 4776 loceol = PL_regeol;
1a4fad37 4777 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4778 while (hardcount < max && scan < loceol &&
3568d838 4779 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4780 scan += UTF8SKIP(scan);
4781 hardcount++;
4782 }
4783 } else {
4784 while (scan < loceol && !isALNUM(*scan))
4785 scan++;
a0ed51b3
LW
4786 }
4787 break;
bbce6d69 4788 case NALNUML:
3280af22 4789 PL_reg_flags |= RF_tainted;
1aa99e6b 4790 if (do_utf8) {
ffc61ed2 4791 loceol = PL_regeol;
1aa99e6b
IH
4792 while (hardcount < max && scan < loceol &&
4793 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4794 scan += UTF8SKIP(scan);
4795 hardcount++;
4796 }
4797 } else {
4798 while (scan < loceol && !isALNUM_LC(*scan))
4799 scan++;
a0ed51b3
LW
4800 }
4801 break;
a0d0e21e 4802 case SPACE:
1aa99e6b 4803 if (do_utf8) {
ffc61ed2 4804 loceol = PL_regeol;
1a4fad37 4805 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4806 while (hardcount < max && scan < loceol &&
3568d838
JH
4807 (*scan == ' ' ||
4808 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4809 scan += UTF8SKIP(scan);
4810 hardcount++;
4811 }
4812 } else {
4813 while (scan < loceol && isSPACE(*scan))
4814 scan++;
a0ed51b3
LW
4815 }
4816 break;
bbce6d69 4817 case SPACEL:
3280af22 4818 PL_reg_flags |= RF_tainted;
1aa99e6b 4819 if (do_utf8) {
ffc61ed2 4820 loceol = PL_regeol;
1aa99e6b 4821 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4822 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4823 scan += UTF8SKIP(scan);
4824 hardcount++;
4825 }
4826 } else {
4827 while (scan < loceol && isSPACE_LC(*scan))
4828 scan++;
a0ed51b3
LW
4829 }
4830 break;
a0d0e21e 4831 case NSPACE:
1aa99e6b 4832 if (do_utf8) {
ffc61ed2 4833 loceol = PL_regeol;
1a4fad37 4834 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4835 while (hardcount < max && scan < loceol &&
3568d838
JH
4836 !(*scan == ' ' ||
4837 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4838 scan += UTF8SKIP(scan);
4839 hardcount++;
4840 }
4841 } else {
4842 while (scan < loceol && !isSPACE(*scan))
4843 scan++;
4844 break;
a0ed51b3 4845 }
bbce6d69 4846 case NSPACEL:
3280af22 4847 PL_reg_flags |= RF_tainted;
1aa99e6b 4848 if (do_utf8) {
ffc61ed2 4849 loceol = PL_regeol;
1aa99e6b 4850 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4851 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4852 scan += UTF8SKIP(scan);
4853 hardcount++;
4854 }
4855 } else {
4856 while (scan < loceol && !isSPACE_LC(*scan))
4857 scan++;
a0ed51b3
LW
4858 }
4859 break;
a0d0e21e 4860 case DIGIT:
1aa99e6b 4861 if (do_utf8) {
ffc61ed2 4862 loceol = PL_regeol;
1a4fad37 4863 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4864 while (hardcount < max && scan < loceol &&
3568d838 4865 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4866 scan += UTF8SKIP(scan);
4867 hardcount++;
4868 }
4869 } else {
4870 while (scan < loceol && isDIGIT(*scan))
4871 scan++;
a0ed51b3
LW
4872 }
4873 break;
a0d0e21e 4874 case NDIGIT:
1aa99e6b 4875 if (do_utf8) {
ffc61ed2 4876 loceol = PL_regeol;
1a4fad37 4877 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4878 while (hardcount < max && scan < loceol &&
3568d838 4879 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4880 scan += UTF8SKIP(scan);
4881 hardcount++;
4882 }
4883 } else {
4884 while (scan < loceol && !isDIGIT(*scan))
4885 scan++;
a0ed51b3
LW
4886 }
4887 break;
a0d0e21e
LW
4888 default: /* Called on something of 0 width. */
4889 break; /* So match right here or not at all. */
4890 }
a687059c 4891
a0ed51b3
LW
4892 if (hardcount)
4893 c = hardcount;
4894 else
4895 c = scan - PL_reginput;
3280af22 4896 PL_reginput = scan;
a687059c 4897
a3621e74 4898 DEBUG_r({
e68ec53f 4899 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4900 DEBUG_EXECUTE_r({
e68ec53f
YO
4901 SV * const prop = sv_newmortal();
4902 regprop(prog, prop, p);
4903 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4904 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4905 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4906 });
be8e71aa 4907 });
9041c2e3 4908
a0d0e21e 4909 return(c);
a687059c
LW
4910}
4911
c277df42 4912
be8e71aa 4913#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4914/*
ffc61ed2
JH
4915- regclass_swash - prepare the utf8 swash
4916*/
4917
4918SV *
32fc9b6a 4919Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4920{
97aff369 4921 dVAR;
9e55ce06
JH
4922 SV *sw = NULL;
4923 SV *si = NULL;
4924 SV *alt = NULL;
3dab1dad 4925 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4926
4f639d21 4927 if (data && data->count) {
a3b680e6 4928 const U32 n = ARG(node);
ffc61ed2 4929
4f639d21
DM
4930 if (data->what[n] == 's') {
4931 SV * const rv = (SV*)data->data[n];
890ce7af 4932 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4933 SV **const ary = AvARRAY(av);
9e55ce06 4934 SV **a, **b;
9041c2e3 4935
711a919c 4936 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4937 * documentation of these array elements. */
4938
b11f357e 4939 si = *ary;
8f7f7219 4940 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4941 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4942
ffc61ed2
JH
4943 if (a)
4944 sw = *a;
4945 else if (si && doinit) {
4946 sw = swash_init("utf8", "", si, 1, 0);
4947 (void)av_store(av, 1, sw);
4948 }
9e55ce06
JH
4949 if (b)
4950 alt = *b;
ffc61ed2
JH
4951 }
4952 }
4953
9e55ce06
JH
4954 if (listsvp)
4955 *listsvp = si;
4956 if (altsvp)
4957 *altsvp = alt;
ffc61ed2
JH
4958
4959 return sw;
4960}
76234dfb 4961#endif
ffc61ed2
JH
4962
4963/*
ba7b4546 4964 - reginclass - determine if a character falls into a character class
832705d4
JH
4965
4966 The n is the ANYOF regnode, the p is the target string, lenp
4967 is pointer to the maximum length of how far to go in the p
4968 (if the lenp is zero, UTF8SKIP(p) is used),
4969 do_utf8 tells whether the target string is in UTF-8.
4970
bbce6d69 4971 */
4972
76e3520e 4973STATIC bool
32fc9b6a 4974S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4975{
27da23d5 4976 dVAR;
a3b680e6 4977 const char flags = ANYOF_FLAGS(n);
bbce6d69 4978 bool match = FALSE;
cc07378b 4979 UV c = *p;
ae9ddab8 4980 STRLEN len = 0;
9e55ce06 4981 STRLEN plen;
1aa99e6b 4982
19f67299
TS
4983 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4984 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4985 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4986 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
4987 if (len == (STRLEN)-1)
4988 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 4989 }
bbce6d69 4990
0f0076b4 4991 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4992 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4993 if (lenp)
4994 *lenp = 0;
ffc61ed2 4995 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4996 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4997 match = TRUE;
bbce6d69 4998 }
3568d838 4999 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5000 match = TRUE;
ffc61ed2 5001 if (!match) {
9e55ce06 5002 AV *av;
32fc9b6a 5003 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5004
5005 if (sw) {
3568d838 5006 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5007 match = TRUE;
5008 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5009 if (!match && lenp && av) {
5010 I32 i;
9e55ce06 5011 for (i = 0; i <= av_len(av); i++) {
890ce7af 5012 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5013 STRLEN len;
890ce7af 5014 const char * const s = SvPV_const(sv, len);
9e55ce06 5015
061b10df 5016 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5017 *lenp = len;
5018 match = TRUE;
5019 break;
5020 }
5021 }
5022 }
5023 if (!match) {
89ebb4a3 5024 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5025 STRLEN tmplen;
5026
9e55ce06
JH
5027 to_utf8_fold(p, tmpbuf, &tmplen);
5028 if (swash_fetch(sw, tmpbuf, do_utf8))
5029 match = TRUE;
5030 }
ffc61ed2
JH
5031 }
5032 }
bbce6d69 5033 }
9e55ce06 5034 if (match && lenp && *lenp == 0)
0f0076b4 5035 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5036 }
1aa99e6b 5037 if (!match && c < 256) {
ffc61ed2
JH
5038 if (ANYOF_BITMAP_TEST(n, c))
5039 match = TRUE;
5040 else if (flags & ANYOF_FOLD) {
eb160463 5041 U8 f;
a0ed51b3 5042
ffc61ed2
JH
5043 if (flags & ANYOF_LOCALE) {
5044 PL_reg_flags |= RF_tainted;
5045 f = PL_fold_locale[c];
5046 }
5047 else
5048 f = PL_fold[c];
5049 if (f != c && ANYOF_BITMAP_TEST(n, f))
5050 match = TRUE;
5051 }
5052
5053 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5054 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5055 if (
5056 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5057 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5058 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5059 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5060 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5061 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5062 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5063 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5064 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5065 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5066 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5067 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5068 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5069 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5070 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5071 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5072 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5073 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5074 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5075 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5076 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5077 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5078 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5079 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5080 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5081 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5082 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5083 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5084 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5085 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5086 ) /* How's that for a conditional? */
5087 {
5088 match = TRUE;
5089 }
a0ed51b3 5090 }
a0ed51b3
LW
5091 }
5092
a0ed51b3
LW
5093 return (flags & ANYOF_INVERT) ? !match : match;
5094}
161b471a 5095
dfe13c55 5096STATIC U8 *
0ce71af7 5097S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5098{
97aff369 5099 dVAR;
a0ed51b3 5100 if (off >= 0) {
1aa99e6b 5101 while (off-- && s < lim) {
ffc61ed2 5102 /* XXX could check well-formedness here */
a0ed51b3 5103 s += UTF8SKIP(s);
ffc61ed2 5104 }
a0ed51b3
LW
5105 }
5106 else {
1de06328
YO
5107 while (off++ && s > lim) {
5108 s--;
5109 if (UTF8_IS_CONTINUED(*s)) {
5110 while (s > lim && UTF8_IS_CONTINUATION(*s))
5111 s--;
a0ed51b3 5112 }
1de06328 5113 /* XXX could check well-formedness here */
a0ed51b3
LW
5114 }
5115 }
5116 return s;
5117}
161b471a 5118
f9f4320a
YO
5119#ifdef XXX_dmq
5120/* there are a bunch of places where we use two reghop3's that should
5121 be replaced with this routine. but since thats not done yet
5122 we ifdef it out - dmq
5123*/
dfe13c55 5124STATIC U8 *
1de06328
YO
5125S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5126{
5127 dVAR;
5128 if (off >= 0) {
5129 while (off-- && s < rlim) {
5130 /* XXX could check well-formedness here */
5131 s += UTF8SKIP(s);
5132 }
5133 }
5134 else {
5135 while (off++ && s > llim) {
5136 s--;
5137 if (UTF8_IS_CONTINUED(*s)) {
5138 while (s > llim && UTF8_IS_CONTINUATION(*s))
5139 s--;
5140 }
5141 /* XXX could check well-formedness here */
5142 }
5143 }
5144 return s;
5145}
f9f4320a 5146#endif
1de06328
YO
5147
5148STATIC U8 *
0ce71af7 5149S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5150{
97aff369 5151 dVAR;
a0ed51b3 5152 if (off >= 0) {
1aa99e6b 5153 while (off-- && s < lim) {
ffc61ed2 5154 /* XXX could check well-formedness here */
a0ed51b3 5155 s += UTF8SKIP(s);
ffc61ed2 5156 }
a0ed51b3 5157 if (off >= 0)
3dab1dad 5158 return NULL;
a0ed51b3
LW
5159 }
5160 else {
1de06328
YO
5161 while (off++ && s > lim) {
5162 s--;
5163 if (UTF8_IS_CONTINUED(*s)) {
5164 while (s > lim && UTF8_IS_CONTINUATION(*s))
5165 s--;
a0ed51b3 5166 }
1de06328 5167 /* XXX could check well-formedness here */
a0ed51b3
LW
5168 }
5169 if (off <= 0)
3dab1dad 5170 return NULL;
a0ed51b3
LW
5171 }
5172 return s;
5173}
51371543 5174
51371543 5175static void
acfe0abc 5176restore_pos(pTHX_ void *arg)
51371543 5177{
97aff369 5178 dVAR;
097eb12c 5179 regexp * const rex = (regexp *)arg;
51371543
GS
5180 if (PL_reg_eval_set) {
5181 if (PL_reg_oldsaved) {
4f639d21
DM
5182 rex->subbeg = PL_reg_oldsaved;
5183 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5184#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5185 rex->saved_copy = PL_nrs;
ed252734 5186#endif
4f639d21 5187 RX_MATCH_COPIED_on(rex);
51371543
GS
5188 }
5189 PL_reg_magic->mg_len = PL_reg_oldpos;
5190 PL_reg_eval_set = 0;
5191 PL_curpm = PL_reg_oldcurpm;
5192 }
5193}
33b8afdf
JH
5194
5195STATIC void
5196S_to_utf8_substr(pTHX_ register regexp *prog)
5197{
33b8afdf 5198 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5199 SV* const sv = newSVsv(prog->float_substr);
5200 prog->float_utf8 = sv;
33b8afdf
JH
5201 sv_utf8_upgrade(sv);
5202 if (SvTAIL(prog->float_substr))
5203 SvTAIL_on(sv);
5204 if (prog->float_substr == prog->check_substr)
5205 prog->check_utf8 = sv;
5206 }
5207 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5208 SV* const sv = newSVsv(prog->anchored_substr);
5209 prog->anchored_utf8 = sv;
33b8afdf
JH
5210 sv_utf8_upgrade(sv);
5211 if (SvTAIL(prog->anchored_substr))
5212 SvTAIL_on(sv);
5213 if (prog->anchored_substr == prog->check_substr)
5214 prog->check_utf8 = sv;
5215 }
5216}
5217
5218STATIC void
5219S_to_byte_substr(pTHX_ register regexp *prog)
5220{
97aff369 5221 dVAR;
33b8afdf 5222 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5223 SV* sv = newSVsv(prog->float_utf8);
5224 prog->float_substr = sv;
33b8afdf
JH
5225 if (sv_utf8_downgrade(sv, TRUE)) {
5226 if (SvTAIL(prog->float_utf8))
5227 SvTAIL_on(sv);
5228 } else {
5229 SvREFCNT_dec(sv);
5230 prog->float_substr = sv = &PL_sv_undef;
5231 }
5232 if (prog->float_utf8 == prog->check_utf8)
5233 prog->check_substr = sv;
5234 }
5235 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5236 SV* sv = newSVsv(prog->anchored_utf8);
5237 prog->anchored_substr = sv;
33b8afdf
JH
5238 if (sv_utf8_downgrade(sv, TRUE)) {
5239 if (SvTAIL(prog->anchored_utf8))
5240 SvTAIL_on(sv);
5241 } else {
5242 SvREFCNT_dec(sv);
5243 prog->anchored_substr = sv = &PL_sv_undef;
5244 }
5245 if (prog->anchored_utf8 == prog->check_utf8)
5246 prog->check_substr = sv;
5247 }
5248}
66610fdd
RGS
5249
5250/*
5251 * Local variables:
5252 * c-indentation-style: bsd
5253 * c-basic-offset: 4
5254 * indent-tabs-mode: t
5255 * End:
5256 *
37442d52
RGS
5257 * ex: set ts=8 sts=4 sw=4 noet:
5258 */