This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
IO::Compress modules
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
c476f425 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
6bda09f9 169#define REGCP_OTHER_ELEMS 8
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
185 SSPUSHPTR(PL_regstartp);
186 SSPUSHPTR(PL_regendp);
3280af22
NIS
187 SSPUSHINT(PL_regsize);
188 SSPUSHINT(*PL_reglastparen);
a01268b5 189 SSPUSHINT(*PL_reglastcloseparen);
3280af22 190 SSPUSHPTR(PL_reginput);
41123dfd
JH
191#define REGCP_FRAME_ELEMS 2
192/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
193 * are needed for the regexp context stack bookkeeping. */
194 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 195 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 196
a0d0e21e
LW
197 return retval;
198}
199
c277df42 200/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
201#define REGCP_SET(cp) \
202 DEBUG_STATE_r( \
ab3bbdeb 203 PerlIO_printf(Perl_debug_log, \
e4f74956 204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
205 (IV)PL_savestack_ix)); \
206 cp = PL_savestack_ix
c3464db5 207
ab3bbdeb 208#define REGCP_UNWIND(cp) \
e4f74956 209 DEBUG_STATE_r( \
ab3bbdeb 210 if (cp != PL_savestack_ix) \
e4f74956
YO
211 PerlIO_printf(Perl_debug_log, \
212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
213 (IV)(cp), (IV)PL_savestack_ix)); \
214 regcpblow(cp)
c277df42 215
76e3520e 216STATIC char *
097eb12c 217S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 218{
97aff369 219 dVAR;
b1ce53c5 220 I32 i;
a0d0e21e 221 char *input;
b1ce53c5 222
a3621e74
YO
223 GET_RE_DEBUG_FLAGS_DECL;
224
b1ce53c5 225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 226 i = SSPOPINT;
b1ce53c5
JH
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 229 input = (char *) SSPOPPTR;
a01268b5 230 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
6bda09f9
YO
233 PL_regendp=(I32 *) SSPOPPTR;
234 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 235
6bda09f9 236
b1ce53c5 237 /* Now restore the parentheses context. */
41123dfd
JH
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 240 I32 tmps;
097eb12c 241 U32 paren = (U32)SSPOPINT;
3280af22 242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
243 PL_regstartp[paren] = SSPOPINT;
244 tmps = SSPOPINT;
3280af22
NIS
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
a3621e74 247 DEBUG_EXECUTE_r(
c3464db5 248 PerlIO_printf(Perl_debug_log,
b900a521 249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 250 (UV)paren, (IV)PL_regstartp[paren],
b900a521 251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 252 (IV)PL_regendp[paren],
3280af22 253 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 254 );
a0d0e21e 255 }
a3621e74 256 DEBUG_EXECUTE_r(
bb7a0f54 257 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 258 PerlIO_printf(Perl_debug_log,
faccc32b 259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 260 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
261 }
262 );
daf18116 263#if 1
dafc8851
JH
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
273 * --jhi */
bb7a0f54 274 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
275 if (i > PL_regsize)
276 PL_regstartp[i] = -1;
277 PL_regendp[i] = -1;
a0d0e21e 278 }
dafc8851 279#endif
a0d0e21e
LW
280 return input;
281}
282
02db2b7b 283#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 284
a687059c 285/*
e50aee73 286 * pregexec and friends
a687059c
LW
287 */
288
76234dfb 289#ifndef PERL_IN_XSUB_RE
a687059c 290/*
c277df42 291 - pregexec - match a regexp against a string
a687059c 292 */
c277df42 293I32
864dbfa3 294Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 295 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
296/* strend: pointer to null at end of string */
297/* strbeg: real beginning of string */
298/* minend: end of match must be >=minend after stringarg. */
299/* nosave: For optimizations. */
300{
301 return
9041c2e3 302 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
303 nosave ? 0 : REXEC_COPY_STR);
304}
76234dfb 305#endif
22e551b9 306
9041c2e3 307/*
cad2e5aa
JH
308 * Need to implement the following flags for reg_anch:
309 *
310 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
311 * USE_INTUIT_ML
312 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
313 * INTUIT_AUTORITATIVE_ML
314 * INTUIT_ONCE_NOML - Intuit can match in one location only.
315 * INTUIT_ONCE_ML
316 *
317 * Another flag for this function: SECOND_TIME (so that float substrs
318 * with giant delta may be not rechecked).
319 */
320
321/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
322
3f7c398e 323/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
324 Otherwise, only SvCUR(sv) is used to get strbeg. */
325
326/* XXXX We assume that strpos is strbeg unless sv. */
327
6eb5f6b9
JH
328/* XXXX Some places assume that there is a fixed substring.
329 An update may be needed if optimizer marks as "INTUITable"
330 RExen without fixed substrings. Similarly, it is assumed that
331 lengths of all the strings are no more than minlen, thus they
332 cannot come from lookahead.
333 (Or minlen should take into account lookahead.) */
334
2c2d71f5
JH
335/* A failure to find a constant substring means that there is no need to make
336 an expensive call to REx engine, thus we celebrate a failure. Similarly,
337 finding a substring too deep into the string means that less calls to
30944b6d
IZ
338 regtry() should be needed.
339
340 REx compiler's optimizer found 4 possible hints:
341 a) Anchored substring;
342 b) Fixed substring;
343 c) Whether we are anchored (beginning-of-line or \G);
344 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 345 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
346 string which does not contradict any of them.
347 */
2c2d71f5 348
6eb5f6b9
JH
349/* Most of decisions we do here should have been done at compile time.
350 The nodes of the REx which we used for the search should have been
351 deleted from the finite automaton. */
352
cad2e5aa
JH
353char *
354Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
355 char *strend, U32 flags, re_scream_pos_data *data)
356{
97aff369 357 dVAR;
b7953727 358 register I32 start_shift = 0;
cad2e5aa 359 /* Should be nonnegative! */
b7953727 360 register I32 end_shift = 0;
2c2d71f5
JH
361 register char *s;
362 register SV *check;
a1933d95 363 char *strbeg;
cad2e5aa 364 char *t;
1de06328 365 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 366 I32 ml_anch;
bd61b366
SS
367 register char *other_last = NULL; /* other substr checked before this */
368 char *check_at = NULL; /* check substr found at this pos */
1df70142 369 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 370#ifdef DEBUGGING
890ce7af 371 const char * const i_strpos = strpos;
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
379 PL_reg_flags |= RF_utf8;
380 }
ab3bbdeb
YO
381 DEBUG_EXECUTE_r(
382 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
383 sv ? "Guessing start of match in sv for"
384 : "Guessing start of match in string for");
2a782b5b 385 );
cad2e5aa 386
c344f387
JH
387 /* CHR_DIST() would be more correct here but it makes things slow. */
388 if (prog->minlen > strend - strpos) {
a3621e74 389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 390 "String too short... [re_intuit_start]\n"));
cad2e5aa 391 goto fail;
2c2d71f5 392 }
1de06328 393
a1933d95 394 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 395 PL_regeol = strend;
33b8afdf
JH
396 if (do_utf8) {
397 if (!prog->check_utf8 && prog->check_substr)
398 to_utf8_substr(prog);
399 check = prog->check_utf8;
400 } else {
401 if (!prog->check_substr && prog->check_utf8)
402 to_byte_substr(prog);
403 check = prog->check_substr;
404 }
1de06328 405 if (check == &PL_sv_undef) {
a3621e74 406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 407 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
408 goto fail;
409 }
2c2d71f5 410 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
411 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
412 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 413 && !multiline ) ); /* Check after \n? */
cad2e5aa 414
7e25d62c
JH
415 if (!ml_anch) {
416 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
417 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 418 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
419 && sv && !SvROK(sv)
420 && (strpos != strbeg)) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
422 goto fail;
423 }
424 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 425 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 426 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
427 I32 slen;
428
1aa99e6b 429 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 430
653099ff
GS
431 if (SvTAIL(check)) {
432 slen = SvCUR(check); /* >= 1 */
cad2e5aa 433
9041c2e3 434 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 435 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 437 goto fail_finish;
cad2e5aa
JH
438 }
439 /* Now should match s[0..slen-2] */
440 slen--;
3f7c398e 441 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 442 || (slen > 1
3f7c398e 443 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 444 report_neq:
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
446 goto fail_finish;
447 }
cad2e5aa 448 }
3f7c398e 449 else if (*SvPVX_const(check) != *s
653099ff 450 || ((slen = SvCUR(check)) > 1
3f7c398e 451 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 452 goto report_neq;
c315bfe8 453 check_at = s;
2c2d71f5 454 goto success_at_start;
7e25d62c 455 }
cad2e5aa 456 }
2c2d71f5 457 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 458 s = strpos;
2c2d71f5 459 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
460 end_shift = prog->check_end_shift;
461
2c2d71f5 462 if (!ml_anch) {
a3b680e6 463 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 464 - (SvTAIL(check) != 0);
a3b680e6 465 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
466
467 if (end_shift < eshift)
468 end_shift = eshift;
469 }
cad2e5aa 470 }
2c2d71f5 471 else { /* Can match at random position */
cad2e5aa
JH
472 ml_anch = 0;
473 s = strpos;
1de06328
YO
474 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
475 end_shift = prog->check_end_shift;
476
477 /* end shift should be non negative here */
cad2e5aa
JH
478 }
479
2c2d71f5 480#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 481 if (end_shift < 0)
1de06328 482 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 483 (IV)end_shift, prog->precomp);
2c2d71f5
JH
484#endif
485
2c2d71f5
JH
486 restart:
487 /* Find a possible match in the region s..strend by looking for
488 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
489
490 {
491 I32 srch_start_shift = start_shift;
492 I32 srch_end_shift = end_shift;
493 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
494 srch_end_shift -= ((strbeg - s) - srch_start_shift);
495 srch_start_shift = strbeg - s;
496 }
6bda09f9 497 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
498 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
499 (IV)prog->check_offset_min,
500 (IV)srch_start_shift,
501 (IV)srch_end_shift,
502 (IV)prog->check_end_shift);
503 });
504
cad2e5aa 505 if (flags & REXEC_SCREAM) {
cad2e5aa 506 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 507 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 508
2c2d71f5
JH
509 if (PL_screamfirst[BmRARE(check)] >= 0
510 || ( BmRARE(check) == '\n'
511 && (BmPREVIOUS(check) == SvCUR(check) - 1)
512 && SvTAIL(check) ))
9041c2e3 513 s = screaminstr(sv, check,
1de06328 514 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 515 else
2c2d71f5 516 goto fail_finish;
4addbd3b
HS
517 /* we may be pointing at the wrong string */
518 if (s && RX_MATCH_COPIED(prog))
3f7c398e 519 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
520 if (data)
521 *data->scream_olds = s;
522 }
1de06328
YO
523 else {
524 U8* start_point;
525 U8* end_point;
526 if (prog->reganch & ROPT_CANY_SEEN) {
527 start_point= (U8*)(s + srch_start_shift);
528 end_point= (U8*)(strend - srch_end_shift);
529 } else {
530 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
531 end_point= HOP3(strend, -srch_end_shift, strbeg);
532 }
6bda09f9 533 DEBUG_OPTIMISE_MORE_r({
56570a2c 534 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 535 (int)(end_point - start_point),
fc8cd66c 536 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
537 start_point);
538 });
539
540 s = fbm_instr( start_point, end_point,
7fba1cd6 541 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
542 }
543 }
cad2e5aa
JH
544 /* Update the count-of-usability, remove useless subpatterns,
545 unshift s. */
2c2d71f5 546
ab3bbdeb
YO
547 DEBUG_EXECUTE_r({
548 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
549 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
550 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 551 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
552 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
553 ? "anchored" : "floating"),
554 quoted,
555 RE_SV_TAIL(check),
556 (s ? " at offset " : "...\n") );
557 });
2c2d71f5
JH
558
559 if (!s)
560 goto fail_finish;
2c2d71f5 561 /* Finish the diagnostic message */
a3621e74 562 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 563
1de06328
YO
564 /* XXX dmq: first branch is for positive lookbehind...
565 Our check string is offset from the beginning of the pattern.
566 So we need to do any stclass tests offset forward from that
567 point. I think. :-(
568 */
569
570
571
572 check_at=s;
573
574
2c2d71f5
JH
575 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
576 Start with the other substr.
577 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 578 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
579 *always* match. Probably should be marked during compile...
580 Probably it is right to do no SCREAM here...
581 */
582
1de06328
YO
583 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
584 : (prog->float_substr && prog->anchored_substr))
585 {
30944b6d 586 /* Take into account the "other" substring. */
2c2d71f5
JH
587 /* XXXX May be hopelessly wrong for UTF... */
588 if (!other_last)
6eb5f6b9 589 other_last = strpos;
33b8afdf 590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
591 do_other_anchored:
592 {
890ce7af
AL
593 char * const last = HOP3c(s, -start_shift, strbeg);
594 char *last1, *last2;
be8e71aa 595 char * const saved_s = s;
33b8afdf 596 SV* must;
2c2d71f5 597
2c2d71f5
JH
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 600 && (!do_utf8
0ce71af7 601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 602 && t > strpos)))
6f207bd3 603 NOOP;
2c2d71f5
JH
604 else
605 t = strpos;
1aa99e6b 606 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
607 if (t < other_last) /* These positions already checked */
608 t = other_last;
1aa99e6b 609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
610 if (last < last1)
611 last1 = last;
1de06328
YO
612 /* XXXX It is not documented what units *_offsets are in.
613 We assume bytes, but this is clearly wrong.
614 Meaning this code needs to be carefully reviewed for errors.
615 dmq.
616 */
617
2c2d71f5 618 /* On end-of-str: see comment below. */
33b8afdf
JH
619 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
620 if (must == &PL_sv_undef) {
621 s = (char*)NULL;
1de06328 622 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
623 }
624 else
625 s = fbm_instr(
626 (unsigned char*)t,
627 HOP3(HOP3(last1, prog->anchored_offset, strend)
628 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
629 must,
7fba1cd6 630 multiline ? FBMrf_MULTILINE : 0
33b8afdf 631 );
ab3bbdeb
YO
632 DEBUG_EXECUTE_r({
633 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
634 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
635 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 636 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
637 quoted, RE_SV_TAIL(must));
638 });
639
640
2c2d71f5
JH
641 if (!s) {
642 if (last1 >= last2) {
a3621e74 643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
644 ", giving up...\n"));
645 goto fail_finish;
646 }
a3621e74 647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 648 ", trying floating at offset %ld...\n",
be8e71aa 649 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
650 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
651 s = HOP3c(last, 1, strend);
2c2d71f5
JH
652 goto restart;
653 }
654 else {
a3621e74 655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 656 (long)(s - i_strpos)));
1aa99e6b
IH
657 t = HOP3c(s, -prog->anchored_offset, strbeg);
658 other_last = HOP3c(s, 1, strend);
be8e71aa 659 s = saved_s;
2c2d71f5
JH
660 if (t == strpos)
661 goto try_at_start;
2c2d71f5
JH
662 goto try_at_offset;
663 }
30944b6d 664 }
2c2d71f5
JH
665 }
666 else { /* Take into account the floating substring. */
33b8afdf 667 char *last, *last1;
be8e71aa 668 char * const saved_s = s;
33b8afdf
JH
669 SV* must;
670
671 t = HOP3c(s, -start_shift, strbeg);
672 last1 = last =
673 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
674 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
675 last = HOP3c(t, prog->float_max_offset, strend);
676 s = HOP3c(t, prog->float_min_offset, strend);
677 if (s < other_last)
678 s = other_last;
2c2d71f5 679 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
680 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
681 /* fbm_instr() takes into account exact value of end-of-str
682 if the check is SvTAIL(ed). Since false positives are OK,
683 and end-of-str is not later than strend we are OK. */
684 if (must == &PL_sv_undef) {
685 s = (char*)NULL;
1de06328 686 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
687 }
688 else
2c2d71f5 689 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
690 (unsigned char*)last + SvCUR(must)
691 - (SvTAIL(must)!=0),
7fba1cd6 692 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
693 DEBUG_EXECUTE_r({
694 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
695 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
696 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 697 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
698 quoted, RE_SV_TAIL(must));
699 });
33b8afdf
JH
700 if (!s) {
701 if (last1 == last) {
a3621e74 702 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
703 ", giving up...\n"));
704 goto fail_finish;
2c2d71f5 705 }
a3621e74 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 707 ", trying anchored starting at offset %ld...\n",
be8e71aa 708 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
709 other_last = last;
710 s = HOP3c(t, 1, strend);
711 goto restart;
712 }
713 else {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
715 (long)(s - i_strpos)));
716 other_last = s; /* Fix this later. --Hugo */
be8e71aa 717 s = saved_s;
33b8afdf
JH
718 if (t == strpos)
719 goto try_at_start;
720 goto try_at_offset;
721 }
2c2d71f5 722 }
cad2e5aa 723 }
2c2d71f5 724
1de06328 725
9ef43ace 726 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 727
6bda09f9 728 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
729 PerlIO_printf(Perl_debug_log,
730 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
731 (IV)prog->check_offset_min,
732 (IV)prog->check_offset_max,
733 (IV)(s-strpos),
734 (IV)(t-strpos),
735 (IV)(t-s),
736 (IV)(strend-strpos)
737 )
738 );
739
2c2d71f5 740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 741 && (!do_utf8
9ef43ace 742 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
743 && t > strpos)))
744 {
2c2d71f5
JH
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
747 try_at_offset:
cad2e5aa 748 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
755 find_anchor:
2c2d71f5 756 while (t < strend - prog->minlen) {
cad2e5aa 757 if (*t == '\n') {
4ee3650e 758 if (t < check_at - prog->check_offset_min) {
33b8afdf 759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
30944b6d
IZ
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
766 */
9041c2e3 767 strpos = t + 1;
a3621e74 768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 769 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
770 goto do_other_anchored;
771 }
4ee3650e
GS
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
cad2e5aa 774 s = t + 1;
a3621e74 775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 776 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
777 goto set_useful;
778 }
4ee3650e
GS
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
a3621e74 782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 783 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 784 other_last = strpos = s = t + 1;
cad2e5aa
JH
785 goto restart;
786 }
787 t++;
788 }
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 790 PL_colors[0], PL_colors[1]));
2c2d71f5 791 goto fail_finish;
cad2e5aa 792 }
f5952150 793 else {
a3621e74 794 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 795 PL_colors[0], PL_colors[1]));
f5952150 796 }
cad2e5aa
JH
797 s = t;
798 set_useful:
33b8afdf 799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
800 }
801 else {
f5952150 802 /* The found string does not prohibit matching at strpos,
2c2d71f5 803 - no optimization of calling REx engine can be performed,
f5952150
GS
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
2c2d71f5
JH
806 try_at_start:
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
05b4157f 809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 810 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
813 {
cad2e5aa
JH
814 t = strpos;
815 goto find_anchor;
816 }
a3621e74 817 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 819 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 820 );
2c2d71f5 821 success_at_start:
30944b6d 822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
823 && (do_utf8 ? (
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
827 ) : (
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
831 )))
66e933ab 832 {
cad2e5aa 833 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
838 prog->check_substr = prog->check_utf8 = NULL; /* disable */
839 prog->float_substr = prog->float_utf8 = NULL; /* clear */
840 check = NULL; /* abort */
cad2e5aa 841 s = strpos;
3cf5c195
IZ
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
6eb5f6b9 844 other heuristics. */
cad2e5aa
JH
845 prog->reganch &= ~RE_USE_INTUIT;
846 }
847 else
848 s = strpos;
849 }
850
6eb5f6b9
JH
851 /* Last resort... */
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
853 /* trie stclasses are too expensive to use here, we are better off to
854 leave it to regmatch itself */
786e8c11 855 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
856 /* minlen == 0 is possible if regstclass is \b or \B,
857 and the fixed substr is ''$.
858 Since minlen is already taken into account, s+1 is before strend;
859 accidentally, minlen >= 1 guaranties no false positives at s + 1
860 even for \b or \B. But (minlen? 1 : 0) below assumes that
861 regstclass does not come from lookahead... */
862 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
863 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 864 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 865 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 866 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 867 : 1);
1de06328
YO
868 char * endpos;
869 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
870 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
871 else if (prog->float_substr || prog->float_utf8)
872 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
873 else
874 endpos= strend;
875
56570a2c 876 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 877 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 878
6eb5f6b9 879 t = s;
3b0527fe 880 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
881 if (!s) {
882#ifdef DEBUGGING
cbbf8932 883 const char *what = NULL;
6eb5f6b9
JH
884#endif
885 if (endpos == strend) {
a3621e74 886 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
887 "Could not match STCLASS...\n") );
888 goto fail;
889 }
a3621e74 890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 891 "This position contradicts STCLASS...\n") );
653099ff
GS
892 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
893 goto fail;
6eb5f6b9 894 /* Contradict one of substrings */
33b8afdf
JH
895 if (prog->anchored_substr || prog->anchored_utf8) {
896 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 897 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 898 hop_and_restart:
1aa99e6b 899 s = HOP3c(t, 1, strend);
66e933ab
GS
900 if (s + start_shift + end_shift > strend) {
901 /* XXXX Should be taken into account earlier? */
a3621e74 902 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
903 "Could not match STCLASS...\n") );
904 goto fail;
905 }
5e39e1e5
HS
906 if (!check)
907 goto giveup;
a3621e74 908 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 909 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
910 what, (long)(s + start_shift - i_strpos)) );
911 goto restart;
912 }
66e933ab 913 /* Have both, check_string is floating */
6eb5f6b9
JH
914 if (t + start_shift >= check_at) /* Contradicts floating=check */
915 goto retry_floating_check;
916 /* Recheck anchored substring, but not floating... */
9041c2e3 917 s = check_at;
5e39e1e5
HS
918 if (!check)
919 goto giveup;
a3621e74 920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 921 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
922 (long)(other_last - i_strpos)) );
923 goto do_other_anchored;
924 }
60e71179
GS
925 /* Another way we could have checked stclass at the
926 current position only: */
927 if (ml_anch) {
928 s = t = t + 1;
5e39e1e5
HS
929 if (!check)
930 goto giveup;
a3621e74 931 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 932 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 933 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 934 goto try_at_offset;
66e933ab 935 }
33b8afdf 936 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 937 goto fail;
6eb5f6b9
JH
938 /* Check is floating subtring. */
939 retry_floating_check:
940 t = check_at - start_shift;
a3621e74 941 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
942 goto hop_and_restart;
943 }
b7953727 944 if (t != s) {
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 946 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
947 (long)(t - i_strpos), (long)(s - i_strpos))
948 );
949 }
950 else {
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
952 "Does not contradict STCLASS...\n");
953 );
954 }
6eb5f6b9 955 }
5e39e1e5 956 giveup:
a3621e74 957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
958 PL_colors[4], (check ? "Guessed" : "Giving up"),
959 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 960 return s;
2c2d71f5
JH
961
962 fail_finish: /* Substring not found */
33b8afdf
JH
963 if (prog->check_substr || prog->check_utf8) /* could be removed already */
964 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 965 fail:
a3621e74 966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 967 PL_colors[4], PL_colors[5]));
bd61b366 968 return NULL;
cad2e5aa 969}
9661b544 970
786e8c11 971
3b0527fe 972
4cadc6a9
YO
973#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
974foldlen, foldbuf, uniflags) STMT_START { \
975 switch (trie_type) { \
976 case trie_utf8_fold: \
977 if ( foldlen>0 ) { \
978 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
979 foldlen -= len; \
980 uscan += len; \
981 len=0; \
982 } else { \
983 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
984 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
985 foldlen -= UNISKIP( uvc ); \
986 uscan = foldbuf + UNISKIP( uvc ); \
987 } \
988 break; \
989 case trie_utf8: \
990 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
991 break; \
992 case trie_plain: \
993 uvc = (UV)*uc; \
994 len = 1; \
995 } \
996 \
997 if (uvc < 256) { \
998 charid = trie->charmap[ uvc ]; \
999 } \
1000 else { \
1001 charid = 0; \
1002 if (trie->widecharmap) { \
1003 SV** const svpp = hv_fetch(trie->widecharmap, \
1004 (char*)&uvc, sizeof(UV), 0); \
1005 if (svpp) \
1006 charid = (U16)SvIV(*svpp); \
1007 } \
1008 } \
1009} STMT_END
1010
1011#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1012 if ( (CoNd) \
1013 && (ln == len || \
1014 ibcmp_utf8(s, NULL, 0, do_utf8, \
1015 m, NULL, ln, (bool)UTF)) \
1016 && (!reginfo || regtry(reginfo, s)) ) \
1017 goto got_it; \
1018 else { \
1019 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1020 uvchr_to_utf8(tmpbuf, c); \
1021 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1022 if ( f != c \
1023 && (f == c1 || f == c2) \
1024 && (ln == foldlen || \
1025 !ibcmp_utf8((char *) foldbuf, \
1026 NULL, foldlen, do_utf8, \
1027 m, \
1028 NULL, ln, (bool)UTF)) \
1029 && (!reginfo || regtry(reginfo, s)) ) \
1030 goto got_it; \
1031 } \
1032 s += len
1033
1034#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1035STMT_START { \
1036 while (s <= e) { \
1037 if ( (CoNd) \
1038 && (ln == 1 || !(OP(c) == EXACTF \
1039 ? ibcmp(s, m, ln) \
1040 : ibcmp_locale(s, m, ln))) \
1041 && (!reginfo || regtry(reginfo, s)) ) \
1042 goto got_it; \
1043 s++; \
1044 } \
1045} STMT_END
1046
1047#define REXEC_FBC_UTF8_SCAN(CoDe) \
1048STMT_START { \
1049 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1050 CoDe \
1051 s += uskip; \
1052 } \
1053} STMT_END
1054
1055#define REXEC_FBC_SCAN(CoDe) \
1056STMT_START { \
1057 while (s < strend) { \
1058 CoDe \
1059 s++; \
1060 } \
1061} STMT_END
1062
1063#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1064REXEC_FBC_UTF8_SCAN( \
1065 if (CoNd) { \
1066 if (tmp && (!reginfo || regtry(reginfo, s))) \
1067 goto got_it; \
1068 else \
1069 tmp = doevery; \
1070 } \
1071 else \
1072 tmp = 1; \
1073)
1074
1075#define REXEC_FBC_CLASS_SCAN(CoNd) \
1076REXEC_FBC_SCAN( \
1077 if (CoNd) { \
1078 if (tmp && (!reginfo || regtry(reginfo, s))) \
1079 goto got_it; \
1080 else \
1081 tmp = doevery; \
1082 } \
1083 else \
1084 tmp = 1; \
1085)
1086
1087#define REXEC_FBC_TRYIT \
1088if ((!reginfo || regtry(reginfo, s))) \
1089 goto got_it
1090
1091#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1092 if (do_utf8) { \
1093 UtFpReLoAd; \
1094 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1095 } \
1096 else { \
1097 REXEC_FBC_CLASS_SCAN(CoNd); \
1098 } \
1099 break
1100
1101#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1102 PL_reg_flags |= RF_tainted; \
1103 if (do_utf8) { \
1104 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1105 } \
1106 else { \
1107 REXEC_FBC_CLASS_SCAN(CoNd); \
1108 } \
1109 break
1110
786e8c11
YO
1111#define DUMP_EXEC_POS(li,s,doutf8) \
1112 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1113
1114/* We know what class REx starts with. Try to find this position... */
1115/* if reginfo is NULL, its a dryrun */
1116/* annoyingly all the vars in this routine have different names from their counterparts
1117 in regmatch. /grrr */
1118
3c3eec57 1119STATIC char *
07be1b83
YO
1120S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1121 const char *strend, const regmatch_info *reginfo)
a687059c 1122{
27da23d5 1123 dVAR;
1df70142 1124 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1125 char *m;
d8093b23 1126 STRLEN ln;
5dab1207 1127 STRLEN lnc;
078c425b 1128 register STRLEN uskip;
d8093b23
G
1129 unsigned int c1;
1130 unsigned int c2;
6eb5f6b9
JH
1131 char *e;
1132 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1133 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1134
6eb5f6b9
JH
1135 /* We know what class it must start with. */
1136 switch (OP(c)) {
6eb5f6b9 1137 case ANYOF:
388cc4de 1138 if (do_utf8) {
4cadc6a9 1139 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1140 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1141 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1142 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1143 }
1144 else {
1145 while (s < strend) {
1146 STRLEN skip = 1;
1147
32fc9b6a 1148 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1149 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1150 /* The assignment of 2 is intentional:
1151 * for the folded sharp s, the skip is 2. */
1152 (skip = SHARP_S_SKIP))) {
3b0527fe 1153 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1154 goto got_it;
1155 else
1156 tmp = doevery;
1157 }
1158 else
1159 tmp = 1;
1160 s += skip;
1161 }
a0d0e21e 1162 }
6eb5f6b9 1163 break;
f33976b4 1164 case CANY:
4cadc6a9 1165 REXEC_FBC_SCAN(
3b0527fe 1166 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1167 goto got_it;
1168 else
1169 tmp = doevery;
4cadc6a9 1170 );
f33976b4 1171 break;
6eb5f6b9 1172 case EXACTF:
5dab1207
NIS
1173 m = STRING(c);
1174 ln = STR_LEN(c); /* length to match in octets/bytes */
1175 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1176 if (UTF) {
a2a2844f 1177 STRLEN ulen1, ulen2;
5dab1207 1178 U8 *sm = (U8 *) m;
89ebb4a3
JH
1179 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1180 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1181 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1182
1183 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1184 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1185
89ebb4a3 1186 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1187 0, uniflags);
89ebb4a3 1188 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1189 0, uniflags);
5dab1207
NIS
1190 lnc = 0;
1191 while (sm < ((U8 *) m + ln)) {
1192 lnc++;
1193 sm += UTF8SKIP(sm);
1194 }
1aa99e6b
IH
1195 }
1196 else {
1197 c1 = *(U8*)m;
1198 c2 = PL_fold[c1];
1199 }
6eb5f6b9
JH
1200 goto do_exactf;
1201 case EXACTFL:
5dab1207
NIS
1202 m = STRING(c);
1203 ln = STR_LEN(c);
1204 lnc = (I32) ln;
d8093b23 1205 c1 = *(U8*)m;
6eb5f6b9
JH
1206 c2 = PL_fold_locale[c1];
1207 do_exactf:
db12adc6 1208 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1209
3b0527fe 1210 if (!reginfo && e < s)
6eb5f6b9 1211 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1212
60a8b682
JH
1213 /* The idea in the EXACTF* cases is to first find the
1214 * first character of the EXACTF* node and then, if
1215 * necessary, case-insensitively compare the full
1216 * text of the node. The c1 and c2 are the first
1217 * characters (though in Unicode it gets a bit
1218 * more complicated because there are more cases
7f16dd3d
JH
1219 * than just upper and lower: one needs to use
1220 * the so-called folding case for case-insensitive
1221 * matching (called "loose matching" in Unicode).
1222 * ibcmp_utf8() will do just that. */
60a8b682 1223
1aa99e6b 1224 if (do_utf8) {
575cac57 1225 UV c, f;
89ebb4a3 1226 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1227 STRLEN len, foldlen;
4ad0818d 1228 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1229 if (c1 == c2) {
5dab1207
NIS
1230 /* Upper and lower of 1st char are equal -
1231 * probably not a "letter". */
1aa99e6b 1232 while (s <= e) {
89ebb4a3 1233 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1234 uniflags);
4cadc6a9 1235 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1236 }
09091399
JH
1237 }
1238 else {
1aa99e6b 1239 while (s <= e) {
89ebb4a3 1240 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1241 uniflags);
80aecb99 1242
60a8b682 1243 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1244 * Note that not all the possible combinations
1245 * are handled here: some of them are handled
1246 * by the standard folding rules, and some of
1247 * them (the character class or ANYOF cases)
1248 * are handled during compiletime in
1249 * regexec.c:S_regclass(). */
880bd946
JH
1250 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1251 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1252 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1253
4cadc6a9 1254 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1255 }
09091399 1256 }
1aa99e6b
IH
1257 }
1258 else {
1259 if (c1 == c2)
4cadc6a9 1260 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1261 else
4cadc6a9 1262 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1263 }
1264 break;
bbce6d69 1265 case BOUNDL:
3280af22 1266 PL_reg_flags |= RF_tainted;
bbce6d69 1267 /* FALL THROUGH */
a0d0e21e 1268 case BOUND:
ffc61ed2 1269 if (do_utf8) {
12d33761 1270 if (s == PL_bostr)
ffc61ed2
JH
1271 tmp = '\n';
1272 else {
6136c704 1273 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1274 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1275 }
1276 tmp = ((OP(c) == BOUND ?
9041c2e3 1277 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1278 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1279 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1280 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1281 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1282 isALNUM_LC_utf8((U8*)s)))
1283 {
1284 tmp = !tmp;
4cadc6a9 1285 REXEC_FBC_TRYIT;
a687059c 1286 }
4cadc6a9 1287 );
a0d0e21e 1288 }
667bb95a 1289 else {
12d33761 1290 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1291 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1292 REXEC_FBC_SCAN(
ffc61ed2
JH
1293 if (tmp ==
1294 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1295 tmp = !tmp;
4cadc6a9 1296 REXEC_FBC_TRYIT;
a0ed51b3 1297 }
4cadc6a9 1298 );
a0ed51b3 1299 }
3b0527fe 1300 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1301 goto got_it;
1302 break;
bbce6d69 1303 case NBOUNDL:
3280af22 1304 PL_reg_flags |= RF_tainted;
bbce6d69 1305 /* FALL THROUGH */
a0d0e21e 1306 case NBOUND:
ffc61ed2 1307 if (do_utf8) {
12d33761 1308 if (s == PL_bostr)
ffc61ed2
JH
1309 tmp = '\n';
1310 else {
6136c704 1311 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1312 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1313 }
1314 tmp = ((OP(c) == NBOUND ?
9041c2e3 1315 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1316 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1317 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1318 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1319 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1320 isALNUM_LC_utf8((U8*)s)))
1321 tmp = !tmp;
4cadc6a9
YO
1322 else REXEC_FBC_TRYIT;
1323 );
a0d0e21e 1324 }
667bb95a 1325 else {
12d33761 1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1327 tmp = ((OP(c) == NBOUND ?
1328 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1329 REXEC_FBC_SCAN(
ffc61ed2
JH
1330 if (tmp ==
1331 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1332 tmp = !tmp;
4cadc6a9
YO
1333 else REXEC_FBC_TRYIT;
1334 );
a0ed51b3 1335 }
3b0527fe 1336 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1337 goto got_it;
1338 break;
a0d0e21e 1339 case ALNUM:
4cadc6a9
YO
1340 REXEC_FBC_CSCAN_PRELOAD(
1341 LOAD_UTF8_CHARCLASS_ALNUM(),
1342 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1343 isALNUM(*s)
1344 );
bbce6d69 1345 case ALNUML:
4cadc6a9
YO
1346 REXEC_FBC_CSCAN_TAINT(
1347 isALNUM_LC_utf8((U8*)s),
1348 isALNUM_LC(*s)
1349 );
a0d0e21e 1350 case NALNUM:
4cadc6a9
YO
1351 REXEC_FBC_CSCAN_PRELOAD(
1352 LOAD_UTF8_CHARCLASS_ALNUM(),
1353 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1354 !isALNUM(*s)
1355 );
bbce6d69 1356 case NALNUML:
4cadc6a9
YO
1357 REXEC_FBC_CSCAN_TAINT(
1358 !isALNUM_LC_utf8((U8*)s),
1359 !isALNUM_LC(*s)
1360 );
a0d0e21e 1361 case SPACE:
4cadc6a9
YO
1362 REXEC_FBC_CSCAN_PRELOAD(
1363 LOAD_UTF8_CHARCLASS_SPACE(),
1364 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1365 isSPACE(*s)
1366 );
bbce6d69 1367 case SPACEL:
4cadc6a9
YO
1368 REXEC_FBC_CSCAN_TAINT(
1369 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1370 isSPACE_LC(*s)
1371 );
a0d0e21e 1372 case NSPACE:
4cadc6a9
YO
1373 REXEC_FBC_CSCAN_PRELOAD(
1374 LOAD_UTF8_CHARCLASS_SPACE(),
1375 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1376 !isSPACE(*s)
1377 );
bbce6d69 1378 case NSPACEL:
4cadc6a9
YO
1379 REXEC_FBC_CSCAN_TAINT(
1380 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1381 !isSPACE_LC(*s)
1382 );
a0d0e21e 1383 case DIGIT:
4cadc6a9
YO
1384 REXEC_FBC_CSCAN_PRELOAD(
1385 LOAD_UTF8_CHARCLASS_DIGIT(),
1386 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1387 isDIGIT(*s)
1388 );
b8c5462f 1389 case DIGITL:
4cadc6a9
YO
1390 REXEC_FBC_CSCAN_TAINT(
1391 isDIGIT_LC_utf8((U8*)s),
1392 isDIGIT_LC(*s)
1393 );
a0d0e21e 1394 case NDIGIT:
4cadc6a9
YO
1395 REXEC_FBC_CSCAN_PRELOAD(
1396 LOAD_UTF8_CHARCLASS_DIGIT(),
1397 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1398 !isDIGIT(*s)
1399 );
b8c5462f 1400 case NDIGITL:
4cadc6a9
YO
1401 REXEC_FBC_CSCAN_TAINT(
1402 !isDIGIT_LC_utf8((U8*)s),
1403 !isDIGIT_LC(*s)
1404 );
1de06328
YO
1405 case AHOCORASICKC:
1406 case AHOCORASICK:
07be1b83
YO
1407 {
1408 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1409 trie_type = do_utf8 ?
1410 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1411 : trie_plain;
1412 /* what trie are we using right now */
1413 reg_ac_data *aho
1414 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1415 reg_trie_data *trie=aho->trie;
1416
1417 const char *last_start = strend - trie->minlen;
6148ee25 1418#ifdef DEBUGGING
07be1b83 1419 const char *real_start = s;
6148ee25 1420#endif
07be1b83 1421 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1422 SV *sv_points;
1423 U8 **points; /* map of where we were in the input string
786e8c11 1424 when reading a given char. For ASCII this
be8e71aa
YO
1425 is unnecessary overhead as the relationship
1426 is always 1:1, but for unicode, especially
1427 case folded unicode this is not true. */
f9e705e8 1428 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1429 U8 *bitmap=NULL;
1430
07be1b83
YO
1431
1432 GET_RE_DEBUG_FLAGS_DECL;
1433
be8e71aa
YO
1434 /* We can't just allocate points here. We need to wrap it in
1435 * an SV so it gets freed properly if there is a croak while
1436 * running the match */
1437 ENTER;
1438 SAVETMPS;
1439 sv_points=newSV(maxlen * sizeof(U8 *));
1440 SvCUR_set(sv_points,
1441 maxlen * sizeof(U8 *));
1442 SvPOK_on(sv_points);
1443 sv_2mortal(sv_points);
1444 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1445 if ( trie_type != trie_utf8_fold
1446 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1447 {
786e8c11
YO
1448 if (trie->bitmap)
1449 bitmap=(U8*)trie->bitmap;
1450 else
1451 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1452 }
786e8c11
YO
1453 /* this is the Aho-Corasick algorithm modified a touch
1454 to include special handling for long "unknown char"
1455 sequences. The basic idea being that we use AC as long
1456 as we are dealing with a possible matching char, when
1457 we encounter an unknown char (and we have not encountered
1458 an accepting state) we scan forward until we find a legal
1459 starting char.
1460 AC matching is basically that of trie matching, except
1461 that when we encounter a failing transition, we fall back
1462 to the current states "fail state", and try the current char
1463 again, a process we repeat until we reach the root state,
1464 state 1, or a legal transition. If we fail on the root state
1465 then we can either terminate if we have reached an accepting
1466 state previously, or restart the entire process from the beginning
1467 if we have not.
1468
1469 */
07be1b83
YO
1470 while (s <= last_start) {
1471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1472 U8 *uc = (U8*)s;
1473 U16 charid = 0;
1474 U32 base = 1;
1475 U32 state = 1;
1476 UV uvc = 0;
1477 STRLEN len = 0;
1478 STRLEN foldlen = 0;
1479 U8 *uscan = (U8*)NULL;
1480 U8 *leftmost = NULL;
786e8c11
YO
1481#ifdef DEBUGGING
1482 U32 accepted_word= 0;
1483#endif
07be1b83
YO
1484 U32 pointpos = 0;
1485
1486 while ( state && uc <= (U8*)strend ) {
1487 int failed=0;
786e8c11
YO
1488 U32 word = aho->states[ state ].wordnum;
1489
1de06328
YO
1490 if( state==1 ) {
1491 if ( bitmap ) {
1492 DEBUG_TRIE_EXECUTE_r(
1493 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1494 dump_exec_pos( (char *)uc, c, strend, real_start,
1495 (char *)uc, do_utf8 );
1496 PerlIO_printf( Perl_debug_log,
1497 " Scanning for legal start char...\n");
1498 }
1499 );
1500 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1501 uc++;
786e8c11 1502 }
1de06328 1503 s= (char *)uc;
786e8c11 1504 }
786e8c11
YO
1505 if (uc >(U8*)last_start) break;
1506 }
1507
1508 if ( word ) {
1509 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1510 if (!leftmost || lpos < leftmost) {
1511 DEBUG_r(accepted_word=word);
07be1b83 1512 leftmost= lpos;
786e8c11 1513 }
07be1b83 1514 if (base==0) break;
786e8c11 1515
07be1b83
YO
1516 }
1517 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1518 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1519 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1520 DEBUG_TRIE_EXECUTE_r({
1521 dump_exec_pos( (char *)uc, c, strend, real_start,
1522 s, do_utf8 );
07be1b83 1523 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1524 " Charid:%3u CP:%4"UVxf" ",
1525 charid, uvc);
1526 });
07be1b83
YO
1527
1528 do {
6148ee25 1529#ifdef DEBUGGING
786e8c11 1530 word = aho->states[ state ].wordnum;
6148ee25 1531#endif
07be1b83
YO
1532 base = aho->states[ state ].trans.base;
1533
786e8c11
YO
1534 DEBUG_TRIE_EXECUTE_r({
1535 if (failed)
1536 dump_exec_pos( (char *)uc, c, strend, real_start,
1537 s, do_utf8 );
07be1b83 1538 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1539 "%sState: %4"UVxf", word=%"UVxf,
1540 failed ? " Fail transition to " : "",
1541 (UV)state, (UV)word);
1542 });
07be1b83
YO
1543 if ( base ) {
1544 U32 tmp;
1545 if (charid &&
1546 (base + charid > trie->uniquecharcount )
1547 && (base + charid - 1 - trie->uniquecharcount
1548 < trie->lasttrans)
1549 && trie->trans[base + charid - 1 -
1550 trie->uniquecharcount].check == state
1551 && (tmp=trie->trans[base + charid - 1 -
1552 trie->uniquecharcount ].next))
1553 {
786e8c11
YO
1554 DEBUG_TRIE_EXECUTE_r(
1555 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1556 state = tmp;
1557 break;
1558 }
1559 else {
786e8c11
YO
1560 DEBUG_TRIE_EXECUTE_r(
1561 PerlIO_printf( Perl_debug_log," - fail\n"));
1562 failed = 1;
1563 state = aho->fail[state];
07be1b83
YO
1564 }
1565 }
1566 else {
1567 /* we must be accepting here */
786e8c11
YO
1568 DEBUG_TRIE_EXECUTE_r(
1569 PerlIO_printf( Perl_debug_log," - accepting\n"));
1570 failed = 1;
07be1b83
YO
1571 break;
1572 }
1573 } while(state);
786e8c11 1574 uc += len;
07be1b83
YO
1575 if (failed) {
1576 if (leftmost)
1577 break;
786e8c11 1578 if (!state) state = 1;
07be1b83
YO
1579 }
1580 }
1581 if ( aho->states[ state ].wordnum ) {
1582 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1583 if (!leftmost || lpos < leftmost) {
1584 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1585 leftmost = lpos;
786e8c11 1586 }
07be1b83 1587 }
07be1b83
YO
1588 if (leftmost) {
1589 s = (char*)leftmost;
786e8c11
YO
1590 DEBUG_TRIE_EXECUTE_r({
1591 PerlIO_printf(
1592 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1593 (UV)accepted_word, s - real_start
1594 );
1595 });
be8e71aa
YO
1596 if (!reginfo || regtry(reginfo, s)) {
1597 FREETMPS;
1598 LEAVE;
07be1b83 1599 goto got_it;
be8e71aa 1600 }
07be1b83 1601 s = HOPc(s,1);
786e8c11
YO
1602 DEBUG_TRIE_EXECUTE_r({
1603 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1604 });
07be1b83 1605 } else {
786e8c11
YO
1606 DEBUG_TRIE_EXECUTE_r(
1607 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1608 break;
1609 }
1610 }
be8e71aa
YO
1611 FREETMPS;
1612 LEAVE;
07be1b83
YO
1613 }
1614 break;
b3c9acc1 1615 default:
3c3eec57
GS
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1617 break;
d6a28714 1618 }
6eb5f6b9
JH
1619 return 0;
1620 got_it:
1621 return s;
1622}
1623
1624/*
1625 - regexec_flags - match a regexp against a string
1626 */
1627I32
1628Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630/* strend: pointer to null at end of string */
1631/* strbeg: real beginning of string */
1632/* minend: end of match must be >=minend after stringarg. */
1633/* data: May be used for some additional optimizations. */
1634/* nosave: For optimizations. */
1635{
97aff369 1636 dVAR;
6eb5f6b9
JH
1637 register char *s;
1638 register regnode *c;
1639 register char *startpos = stringarg;
6eb5f6b9
JH
1640 I32 minlen; /* must match at least this many chars */
1641 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1642 I32 end_shift = 0; /* Same for the end. */ /* CC */
1643 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1644 char *scream_olds = NULL;
3dab1dad 1645 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1646 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1647 I32 multiline;
0df25f3d 1648
3b0527fe 1649 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1650
1651 GET_RE_DEBUG_FLAGS_DECL;
1652
9d4ba2ae 1653 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1654
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1658 return 0;
1659 }
1660
2757e526 1661 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1662 reginfo.prog = prog;
2757e526 1663
bac06658 1664 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1665 DEBUG_EXECUTE_r(
1666 debug_start_match(prog, do_utf8, startpos, strend,
1667 "Matching");
1668 );
bac06658 1669
6eb5f6b9 1670 minlen = prog->minlen;
1de06328
YO
1671
1672 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1674 "String too short [regexec_flags]...\n"));
1675 goto phooey;
1aa99e6b 1676 }
6eb5f6b9 1677
1de06328 1678
6eb5f6b9
JH
1679 /* Check validity of program. */
1680 if (UCHARAT(prog->program) != REG_MAGIC) {
1681 Perl_croak(aTHX_ "corrupted regexp program");
1682 }
1683
1684 PL_reg_flags = 0;
1685 PL_reg_eval_set = 0;
1686 PL_reg_maxiter = 0;
1687
1688 if (prog->reganch & ROPT_UTF8)
1689 PL_reg_flags |= RF_utf8;
1690
1691 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1692 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1693 PL_bostr = strbeg;
3b0527fe 1694 reginfo.sv = sv;
6eb5f6b9
JH
1695
1696 /* Mark end of line for $ (and such) */
1697 PL_regeol = strend;
1698
1699 /* see how far we have to get to not match where we matched before */
3b0527fe 1700 reginfo.till = startpos+minend;
6eb5f6b9 1701
6eb5f6b9
JH
1702 /* If there is a "must appear" string, look for it. */
1703 s = startpos;
1704
3b0527fe 1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1706 MAGIC *mg;
1707
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1709 reginfo.ganch = startpos;
6eb5f6b9
JH
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1711 && SvMAGIC(sv)
14befaf4
DM
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
3b0527fe 1714 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1715 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1716 if (s > reginfo.ganch)
6eb5f6b9 1717 goto phooey;
3b0527fe 1718 s = reginfo.ganch;
6eb5f6b9
JH
1719 }
1720 }
1721 else /* pos() not defined */
3b0527fe 1722 reginfo.ganch = strbeg;
6eb5f6b9
JH
1723 }
1724
a0714e2c 1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1726 re_scream_pos_data d;
1727
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1731 if (!s) {
a3621e74 1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1733 goto phooey; /* not present */
3fa9c3d7 1734 }
6eb5f6b9
JH
1735 }
1736
1de06328 1737
6eb5f6b9
JH
1738
1739 /* Simplest case: anchored match need be tried only once. */
1740 /* [unless only anchor is BOL and multiline is set] */
1741 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1742 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1743 goto got_it;
7fba1cd6 1744 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1745 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1746 {
1747 char *end;
1748
1749 if (minlen)
1750 dontbother = minlen - 1;
1aa99e6b 1751 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1752 /* for multiline we only have to try after newlines */
33b8afdf 1753 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1754 if (s == startpos)
1755 goto after_try;
1756 while (1) {
3b0527fe 1757 if (regtry(&reginfo, s))
6eb5f6b9
JH
1758 goto got_it;
1759 after_try:
1760 if (s >= end)
1761 goto phooey;
1762 if (prog->reganch & RE_USE_INTUIT) {
1763 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1764 if (!s)
1765 goto phooey;
1766 }
1767 else
1768 s++;
1769 }
1770 } else {
1771 if (s > startpos)
1772 s--;
1773 while (s < end) {
1774 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1775 if (regtry(&reginfo, s))
6eb5f6b9
JH
1776 goto got_it;
1777 }
1778 }
1779 }
1780 }
1781 goto phooey;
f9f4320a
YO
1782 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1783 {
1784 /* the warning about reginfo.ganch being used without intialization
1785 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1786 and we only enter this block when the same bit is set. */
3b0527fe 1787 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1788 goto got_it;
1789 goto phooey;
1790 }
1791
1792 /* Messy cases: unanchored match. */
33b8afdf 1793 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1794 /* we have /x+whatever/ */
1795 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1796 char ch;
bf93d4cc
GS
1797#ifdef DEBUGGING
1798 int did_match = 0;
1799#endif
33b8afdf
JH
1800 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1801 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1802 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1803
1aa99e6b 1804 if (do_utf8) {
4cadc6a9 1805 REXEC_FBC_SCAN(
6eb5f6b9 1806 if (*s == ch) {
a3621e74 1807 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1808 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1809 s += UTF8SKIP(s);
1810 while (s < strend && *s == ch)
1811 s += UTF8SKIP(s);
1812 }
4cadc6a9 1813 );
6eb5f6b9
JH
1814 }
1815 else {
4cadc6a9 1816 REXEC_FBC_SCAN(
6eb5f6b9 1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1819 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1820 s++;
1821 while (s < strend && *s == ch)
1822 s++;
1823 }
4cadc6a9 1824 );
6eb5f6b9 1825 }
a3621e74 1826 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1827 PerlIO_printf(Perl_debug_log,
b7953727
JH
1828 "Did not find anchored character...\n")
1829 );
6eb5f6b9 1830 }
a0714e2c
SS
1831 else if (prog->anchored_substr != NULL
1832 || prog->anchored_utf8 != NULL
1833 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1834 && prog->float_max_offset < strend - s)) {
1835 SV *must;
1836 I32 back_max;
1837 I32 back_min;
1838 char *last;
6eb5f6b9 1839 char *last1; /* Last position checked before */
bf93d4cc
GS
1840#ifdef DEBUGGING
1841 int did_match = 0;
1842#endif
33b8afdf
JH
1843 if (prog->anchored_substr || prog->anchored_utf8) {
1844 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1845 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1846 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1847 back_max = back_min = prog->anchored_offset;
1848 } else {
1849 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1850 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1851 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1852 back_max = prog->float_max_offset;
1853 back_min = prog->float_min_offset;
1854 }
1de06328
YO
1855
1856
33b8afdf
JH
1857 if (must == &PL_sv_undef)
1858 /* could not downgrade utf8 check substring, so must fail */
1859 goto phooey;
1860
1de06328
YO
1861 if (back_min<0) {
1862 last = strend;
1863 } else {
1864 last = HOP3c(strend, /* Cannot start after this */
1865 -(I32)(CHR_SVLEN(must)
1866 - (SvTAIL(must) != 0) + back_min), strbeg);
1867 }
6eb5f6b9
JH
1868 if (s > PL_bostr)
1869 last1 = HOPc(s, -1);
1870 else
1871 last1 = s - 1; /* bogus */
1872
a0288114 1873 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1874 check_substr==must. */
1875 scream_pos = -1;
1876 dontbother = end_shift;
1877 strend = HOPc(strend, -dontbother);
1878 while ( (s <= last) &&
9041c2e3 1879 ((flags & REXEC_SCREAM)
1de06328 1880 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1881 end_shift, &scream_pos, 0))
1de06328 1882 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1883 (unsigned char*)strend, must,
7fba1cd6 1884 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1885 /* we may be pointing at the wrong string */
1886 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1887 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1888 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1889 if (HOPc(s, -back_max) > last1) {
1890 last1 = HOPc(s, -back_min);
1891 s = HOPc(s, -back_max);
1892 }
1893 else {
52657f30 1894 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1895
1896 last1 = HOPc(s, -back_min);
52657f30 1897 s = t;
6eb5f6b9 1898 }
1aa99e6b 1899 if (do_utf8) {
6eb5f6b9 1900 while (s <= last1) {
3b0527fe 1901 if (regtry(&reginfo, s))
6eb5f6b9
JH
1902 goto got_it;
1903 s += UTF8SKIP(s);
1904 }
1905 }
1906 else {
1907 while (s <= last1) {
3b0527fe 1908 if (regtry(&reginfo, s))
6eb5f6b9
JH
1909 goto got_it;
1910 s++;
1911 }
1912 }
1913 }
ab3bbdeb
YO
1914 DEBUG_EXECUTE_r(if (!did_match) {
1915 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1916 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1917 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1918 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1919 ? "anchored" : "floating"),
ab3bbdeb
YO
1920 quoted, RE_SV_TAIL(must));
1921 });
6eb5f6b9
JH
1922 goto phooey;
1923 }
786e8c11 1924 else if ( (c = prog->regstclass) ) {
f14c76ed 1925 if (minlen) {
be8e71aa 1926 const OPCODE op = OP(prog->regstclass);
66e933ab 1927 /* don't bother with what can't match */
786e8c11 1928 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1929 strend = HOPc(strend, -(minlen - 1));
1930 }
a3621e74 1931 DEBUG_EXECUTE_r({
be8e71aa 1932 SV * const prop = sv_newmortal();
32fc9b6a 1933 regprop(prog, prop, c);
0df25f3d 1934 {
ab3bbdeb
YO
1935 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1936 s,strend-s,60);
0df25f3d 1937 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1938 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1939 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1940 quoted, (int)(strend - s));
0df25f3d 1941 }
ffc61ed2 1942 });
3b0527fe 1943 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1944 goto got_it;
07be1b83 1945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1946 }
1947 else {
1948 dontbother = 0;
a0714e2c 1949 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1950 /* Trim the end. */
d6a28714 1951 char *last;
33b8afdf
JH
1952 SV* float_real;
1953
1954 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1955 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1956 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1957
1958 if (flags & REXEC_SCREAM) {
33b8afdf 1959 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1960 end_shift, &scream_pos, 1); /* last one */
1961 if (!last)
ffc61ed2 1962 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1963 /* we may be pointing at the wrong string */
1964 else if (RX_MATCH_COPIED(prog))
3f7c398e 1965 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1966 }
d6a28714
JH
1967 else {
1968 STRLEN len;
cfd0369c 1969 const char * const little = SvPV_const(float_real, len);
d6a28714 1970
33b8afdf 1971 if (SvTAIL(float_real)) {
d6a28714
JH
1972 if (memEQ(strend - len + 1, little, len - 1))
1973 last = strend - len + 1;
7fba1cd6 1974 else if (!multiline)
9041c2e3 1975 last = memEQ(strend - len, little, len)
bd61b366 1976 ? strend - len : NULL;
b8c5462f 1977 else
d6a28714
JH
1978 goto find_last;
1979 } else {
1980 find_last:
9041c2e3 1981 if (len)
d6a28714 1982 last = rninstr(s, strend, little, little + len);
b8c5462f 1983 else
a0288114 1984 last = strend; /* matching "$" */
b8c5462f 1985 }
b8c5462f 1986 }
bf93d4cc 1987 if (last == NULL) {
6bda09f9
YO
1988 DEBUG_EXECUTE_r(
1989 PerlIO_printf(Perl_debug_log,
1990 "%sCan't trim the tail, match fails (should not happen)%s\n",
1991 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1992 goto phooey; /* Should not happen! */
1993 }
d6a28714
JH
1994 dontbother = strend - last + prog->float_min_offset;
1995 }
1996 if (minlen && (dontbother < minlen))
1997 dontbother = minlen - 1;
1998 strend -= dontbother; /* this one's always in bytes! */
1999 /* We don't know much -- general case. */
1aa99e6b 2000 if (do_utf8) {
d6a28714 2001 for (;;) {
3b0527fe 2002 if (regtry(&reginfo, s))
d6a28714
JH
2003 goto got_it;
2004 if (s >= strend)
2005 break;
b8c5462f 2006 s += UTF8SKIP(s);
d6a28714
JH
2007 };
2008 }
2009 else {
2010 do {
3b0527fe 2011 if (regtry(&reginfo, s))
d6a28714
JH
2012 goto got_it;
2013 } while (s++ < strend);
2014 }
2015 }
2016
2017 /* Failure. */
2018 goto phooey;
2019
2020got_it:
2021 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2022
2023 if (PL_reg_eval_set) {
2024 /* Preserve the current value of $^R */
2025 if (oreplsv != GvSV(PL_replgv))
2026 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2027 restored, the value remains
2028 the same. */
4f639d21 2029 restore_pos(aTHX_ prog);
d6a28714 2030 }
81714fb9
YO
2031 if (prog->paren_names)
2032 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2033
2034 /* make sure $`, $&, $', and $digit will work later */
2035 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2036 RX_MATCH_COPY_FREE(prog);
d6a28714 2037 if (flags & REXEC_COPY_STR) {
be8e71aa 2038 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2039#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2040 if ((SvIsCOW(sv)
2041 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2042 if (DEBUG_C_TEST) {
2043 PerlIO_printf(Perl_debug_log,
2044 "Copy on write: regexp capture, type %d\n",
2045 (int) SvTYPE(sv));
2046 }
2047 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2048 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2049 assert (SvPOKp(prog->saved_copy));
2050 } else
2051#endif
2052 {
2053 RX_MATCH_COPIED_on(prog);
2054 s = savepvn(strbeg, i);
2055 prog->subbeg = s;
2056 }
d6a28714 2057 prog->sublen = i;
d6a28714
JH
2058 }
2059 else {
2060 prog->subbeg = strbeg;
2061 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2062 }
2063 }
9041c2e3 2064
d6a28714
JH
2065 return 1;
2066
2067phooey:
a3621e74 2068 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2069 PL_colors[4], PL_colors[5]));
d6a28714 2070 if (PL_reg_eval_set)
4f639d21 2071 restore_pos(aTHX_ prog);
d6a28714
JH
2072 return 0;
2073}
2074
6bda09f9 2075
d6a28714
JH
2076/*
2077 - regtry - try match at specific point
2078 */
2079STATIC I32 /* 0 failure, 1 success */
3b0527fe 2080S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2081{
97aff369 2082 dVAR;
d6a28714
JH
2083 register I32 *sp;
2084 register I32 *ep;
2085 CHECKPOINT lastcp;
3b0527fe 2086 regexp *prog = reginfo->prog;
a3621e74 2087 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2088
2089 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2090 MAGIC *mg;
2091
2092 PL_reg_eval_set = RS_init;
a3621e74 2093 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2094 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2095 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2096 ));
ea8d6ae1 2097 SAVESTACK_CXPOS();
d6a28714
JH
2098 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2099 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2100 SAVETMPS;
2101 /* Apparently this is not needed, judging by wantarray. */
e8347627 2102 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2103 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2104
3b0527fe 2105 if (reginfo->sv) {
d6a28714 2106 /* Make $_ available to executed code. */
3b0527fe 2107 if (reginfo->sv != DEFSV) {
59f00321 2108 SAVE_DEFSV;
3b0527fe 2109 DEFSV = reginfo->sv;
b8c5462f 2110 }
d6a28714 2111
3b0527fe
DM
2112 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2113 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2114 /* prepare for quick setting of pos */
d300d9fa
NC
2115#ifdef PERL_OLD_COPY_ON_WRITE
2116 if (SvIsCOW(sv))
2117 sv_force_normal_flags(sv, 0);
2118#endif
3dab1dad 2119 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2120 &PL_vtbl_mglob, NULL, 0);
d6a28714 2121 mg->mg_len = -1;
b8c5462f 2122 }
d6a28714
JH
2123 PL_reg_magic = mg;
2124 PL_reg_oldpos = mg->mg_len;
4f639d21 2125 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2126 }
09687e5a 2127 if (!PL_reg_curpm) {
a02a5408 2128 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2129#ifdef USE_ITHREADS
2130 {
be8e71aa 2131 SV* const repointer = newSViv(0);
577e12cc 2132 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2133 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2134 av_push(PL_regex_padav,repointer);
2135 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2136 PL_regex_pad = AvARRAY(PL_regex_padav);
2137 }
2138#endif
2139 }
aaa362c4 2140 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2141 PL_reg_oldcurpm = PL_curpm;
2142 PL_curpm = PL_reg_curpm;
2143 if (RX_MATCH_COPIED(prog)) {
2144 /* Here is a serious problem: we cannot rewrite subbeg,
2145 since it may be needed if this match fails. Thus
2146 $` inside (?{}) could fail... */
2147 PL_reg_oldsaved = prog->subbeg;
2148 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2149#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2150 PL_nrs = prog->saved_copy;
2151#endif
d6a28714
JH
2152 RX_MATCH_COPIED_off(prog);
2153 }
2154 else
bd61b366 2155 PL_reg_oldsaved = NULL;
d6a28714
JH
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2158 }
6bda09f9 2159 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
973dddac 2160 prog->startp[0] = startpos - PL_bostr;
d6a28714 2161 PL_reginput = startpos;
d6a28714 2162 PL_reglastparen = &prog->lastparen;
a01268b5 2163 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2164 prog->lastparen = 0;
03994de8 2165 prog->lastcloseparen = 0;
d6a28714 2166 PL_regsize = 0;
6bda09f9
YO
2167 PL_regstartp = prog->startp;
2168 PL_regendp = prog->endp;
d6a28714
JH
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2173 else
a02a5408 2174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2175 }
2176
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
3dd2943c 2179 this! --ilya*/
dafc8851
JH
2180
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2190 * --jhi */
dafc8851 2191#if 1
d6a28714
JH
2192 sp = prog->startp;
2193 ep = prog->endp;
2194 if (prog->nparens) {
097eb12c 2195 register I32 i;
eb160463 2196 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2197 *++sp = -1;
2198 *++ep = -1;
2199 }
2200 }
dafc8851 2201#endif
02db2b7b 2202 REGCP_SET(lastcp);
3b0527fe 2203 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2204 prog->endp[0] = PL_reginput - PL_bostr;
2205 return 1;
2206 }
02db2b7b 2207 REGCP_UNWIND(lastcp);
d6a28714
JH
2208 return 0;
2209}
2210
02db2b7b 2211
8ba1375e
MJD
2212#define sayYES goto yes
2213#define sayNO goto no
262b90c4 2214#define sayNO_SILENT goto no_silent
8ba1375e 2215
f9f4320a
YO
2216/* we dont use STMT_START/END here because it leads to
2217 "unreachable code" warnings, which are bogus, but distracting. */
2218#define CACHEsayNO \
c476f425
DM
2219 if (ST.cache_mask) \
2220 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2221 sayNO
3298f257 2222
a3621e74 2223/* this is used to determine how far from the left messages like
265c4333
YO
2224 'failed...' are printed. It should be set such that messages
2225 are inline with the regop output that created them.
a3621e74 2226*/
265c4333 2227#define REPORT_CODE_OFF 32
a3621e74
YO
2228
2229
2230/* Make sure there is a test for this +1 options in re_tests */
2231#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2232
40a82448
DM
2233#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2234#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2235
86545054
DM
2236#define SLAB_FIRST(s) (&(s)->states[0])
2237#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2238
5d9a96ca
DM
2239/* grab a new slab and return the first slot in it */
2240
2241STATIC regmatch_state *
2242S_push_slab(pTHX)
2243{
54df2634
NC
2244#if PERL_VERSION < 9
2245 dMY_CXT;
2246#endif
5d9a96ca
DM
2247 regmatch_slab *s = PL_regmatch_slab->next;
2248 if (!s) {
2249 Newx(s, 1, regmatch_slab);
2250 s->prev = PL_regmatch_slab;
2251 s->next = NULL;
2252 PL_regmatch_slab->next = s;
2253 }
2254 PL_regmatch_slab = s;
86545054 2255 return SLAB_FIRST(s);
5d9a96ca 2256}
5b47454d 2257
95b24440 2258
40a82448
DM
2259/* push a new state then goto it */
2260
2261#define PUSH_STATE_GOTO(state, node) \
2262 scan = node; \
2263 st->resume_state = state; \
2264 goto push_state;
2265
2266/* push a new state with success backtracking, then goto it */
2267
2268#define PUSH_YES_STATE_GOTO(state, node) \
2269 scan = node; \
2270 st->resume_state = state; \
2271 goto push_yes_state;
2272
aa283a38 2273
aa283a38 2274
d6a28714 2275/*
95b24440 2276
bf1f174e
DM
2277regmatch() - main matching routine
2278
2279This is basically one big switch statement in a loop. We execute an op,
2280set 'next' to point the next op, and continue. If we come to a point which
2281we may need to backtrack to on failure such as (A|B|C), we push a
2282backtrack state onto the backtrack stack. On failure, we pop the top
2283state, and re-enter the loop at the state indicated. If there are no more
2284states to pop, we return failure.
2285
2286Sometimes we also need to backtrack on success; for example /A+/, where
2287after successfully matching one A, we need to go back and try to
2288match another one; similarly for lookahead assertions: if the assertion
2289completes successfully, we backtrack to the state just before the assertion
2290and then carry on. In these cases, the pushed state is marked as
2291'backtrack on success too'. This marking is in fact done by a chain of
2292pointers, each pointing to the previous 'yes' state. On success, we pop to
2293the nearest yes state, discarding any intermediate failure-only states.
2294Sometimes a yes state is pushed just to force some cleanup code to be
2295called at the end of a successful match or submatch; e.g. (??{$re}) uses
2296it to free the inner regex.
2297
2298Note that failure backtracking rewinds the cursor position, while
2299success backtracking leaves it alone.
2300
2301A pattern is complete when the END op is executed, while a subpattern
2302such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2303ops trigger the "pop to last yes state if any, otherwise return true"
2304behaviour.
2305
2306A common convention in this function is to use A and B to refer to the two
2307subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2308the subpattern to be matched possibly multiple times, while B is the entire
2309rest of the pattern. Variable and state names reflect this convention.
2310
2311The states in the main switch are the union of ops and failure/success of
2312substates associated with with that op. For example, IFMATCH is the op
2313that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2314'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2315successfully matched A and IFMATCH_A_fail is a state saying that we have
2316just failed to match A. Resume states always come in pairs. The backtrack
2317state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2318at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2319on success or failure.
2320
2321The struct that holds a backtracking state is actually a big union, with
2322one variant for each major type of op. The variable st points to the
2323top-most backtrack struct. To make the code clearer, within each
2324block of code we #define ST to alias the relevant union.
2325
2326Here's a concrete example of a (vastly oversimplified) IFMATCH
2327implementation:
2328
2329 switch (state) {
2330 ....
2331
2332#define ST st->u.ifmatch
2333
2334 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2335 ST.foo = ...; // some state we wish to save
95b24440 2336 ...
bf1f174e
DM
2337 // push a yes backtrack state with a resume value of
2338 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2339 // first node of A:
2340 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2341 // NOTREACHED
2342
2343 case IFMATCH_A: // we have successfully executed A; now continue with B
2344 next = B;
2345 bar = ST.foo; // do something with the preserved value
2346 break;
2347
2348 case IFMATCH_A_fail: // A failed, so the assertion failed
2349 ...; // do some housekeeping, then ...
2350 sayNO; // propagate the failure
2351
2352#undef ST
95b24440 2353
bf1f174e
DM
2354 ...
2355 }
95b24440 2356
bf1f174e
DM
2357For any old-timers reading this who are familiar with the old recursive
2358approach, the code above is equivalent to:
95b24440 2359
bf1f174e
DM
2360 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2361 {
2362 int foo = ...
95b24440 2363 ...
bf1f174e
DM
2364 if (regmatch(A)) {
2365 next = B;
2366 bar = foo;
2367 break;
95b24440 2368 }
bf1f174e
DM
2369 ...; // do some housekeeping, then ...
2370 sayNO; // propagate the failure
95b24440 2371 }
bf1f174e
DM
2372
2373The topmost backtrack state, pointed to by st, is usually free. If you
2374want to claim it, populate any ST.foo fields in it with values you wish to
2375save, then do one of
2376
2377 PUSH_STATE_GOTO(resume_state, node);
2378 PUSH_YES_STATE_GOTO(resume_state, node);
2379
2380which sets that backtrack state's resume value to 'resume_state', pushes a
2381new free entry to the top of the backtrack stack, then goes to 'node'.
2382On backtracking, the free slot is popped, and the saved state becomes the
2383new free state. An ST.foo field in this new top state can be temporarily
2384accessed to retrieve values, but once the main loop is re-entered, it
2385becomes available for reuse.
2386
2387Note that the depth of the backtrack stack constantly increases during the
2388left-to-right execution of the pattern, rather than going up and down with
2389the pattern nesting. For example the stack is at its maximum at Z at the
2390end of the pattern, rather than at X in the following:
2391
2392 /(((X)+)+)+....(Y)+....Z/
2393
2394The only exceptions to this are lookahead/behind assertions and the cut,
2395(?>A), which pop all the backtrack states associated with A before
2396continuing.
2397
2398Bascktrack state structs are allocated in slabs of about 4K in size.
2399PL_regmatch_state and st always point to the currently active state,
2400and PL_regmatch_slab points to the slab currently containing
2401PL_regmatch_state. The first time regmatch() is called, the first slab is
2402allocated, and is never freed until interpreter destruction. When the slab
2403is full, a new one is allocated and chained to the end. At exit from
2404regmatch(), slabs allocated since entry are freed.
2405
2406*/
95b24440 2407
40a82448 2408
5bc10b2c 2409#define DEBUG_STATE_pp(pp) \
265c4333 2410 DEBUG_STATE_r({ \
5bc10b2c
DM
2411 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2412 PerlIO_printf(Perl_debug_log, \
2413 " %*s"pp" %s\n", \
2414 depth*2, "", \
03363afd 2415 reg_name[st->resume_state] ); \
265c4333 2416 });
5bc10b2c 2417
40a82448 2418
3dab1dad 2419#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2420
3df15adc 2421#ifdef DEBUGGING
5bc10b2c 2422
ab3bbdeb
YO
2423STATIC void
2424S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2425 const char *start, const char *end, const char *blurb)
2426{
2427 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2428 if (!PL_colorset)
2429 reginitcolors();
2430 {
2431 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2432 prog->precomp, prog->prelen, 60);
2433
2434 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2435 start, end - start, 60);
2436
2437 PerlIO_printf(Perl_debug_log,
2438 "%s%s REx%s %s against %s\n",
2439 PL_colors[4], blurb, PL_colors[5], s0, s1);
2440
2441 if (do_utf8||utf8_pat)
1de06328
YO
2442 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2443 utf8_pat ? "pattern" : "",
2444 utf8_pat && do_utf8 ? " and " : "",
2445 do_utf8 ? "string" : ""
ab3bbdeb
YO
2446 );
2447 }
2448}
3df15adc
YO
2449
2450STATIC void
786e8c11
YO
2451S_dump_exec_pos(pTHX_ const char *locinput,
2452 const regnode *scan,
2453 const char *loc_regeol,
2454 const char *loc_bostr,
2455 const char *loc_reg_starttry,
2456 const bool do_utf8)
07be1b83 2457{
786e8c11 2458 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2459 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2460 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2461 /* The part of the string before starttry has one color
2462 (pref0_len chars), between starttry and current
2463 position another one (pref_len - pref0_len chars),
2464 after the current position the third one.
2465 We assume that pref0_len <= pref_len, otherwise we
2466 decrease pref0_len. */
786e8c11
YO
2467 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2468 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2469 int pref0_len;
2470
2471 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2472 pref_len++;
786e8c11
YO
2473 pref0_len = pref_len - (locinput - loc_reg_starttry);
2474 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2475 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2476 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2477 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2478 l--;
2479 if (pref0_len < 0)
2480 pref0_len = 0;
2481 if (pref0_len > pref_len)
2482 pref0_len = pref_len;
2483 {
3df15adc 2484 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2485
ab3bbdeb 2486 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2487 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2488
ab3bbdeb 2489 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2490 (locinput - pref_len + pref0_len),
1de06328 2491 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2492
ab3bbdeb 2493 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2494 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2495
1de06328 2496 const STRLEN tlen=len0+len1+len2;
3df15adc 2497 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2498 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2499 (IV)(locinput - loc_bostr),
07be1b83 2500 len0, s0,
07be1b83 2501 len1, s1,
07be1b83 2502 (docolor ? "" : "> <"),
07be1b83 2503 len2, s2,
f9f4320a 2504 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2505 "");
2506 }
2507}
3df15adc 2508
07be1b83
YO
2509#endif
2510
0a4db386
YO
2511/* reg_check_named_buff_matched()
2512 * Checks to see if a named buffer has matched. The data array of
2513 * buffer numbers corresponding to the buffer is expected to reside
2514 * in the regexp->data->data array in the slot stored in the ARG() of
2515 * node involved. Note that this routine doesn't actually care about the
2516 * name, that information is not preserved from compilation to execution.
2517 * Returns the index of the leftmost defined buffer with the given name
2518 * or 0 if non of the buffers matched.
2519 */
2520STATIC I32
2521S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2522 I32 n;
2523 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2524 I32 *nums=(I32*)SvPVX(sv_dat);
2525 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2526 if ((I32)*PL_reglastparen >= nums[n] &&
2527 PL_regendp[nums[n]] != -1)
2528 {
2529 return nums[n];
2530 }
2531 }
2532 return 0;
2533}
2534
d6a28714 2535STATIC I32 /* 0 failure, 1 success */
3b0527fe 2536S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2537{
54df2634
NC
2538#if PERL_VERSION < 9
2539 dMY_CXT;
2540#endif
27da23d5 2541 dVAR;
95b24440 2542 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2543 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2544
3b0527fe
DM
2545 regexp *rex = reginfo->prog;
2546
5d9a96ca
DM
2547 regmatch_slab *orig_slab;
2548 regmatch_state *orig_state;
a3621e74 2549
5d9a96ca
DM
2550 /* the current state. This is a cached copy of PL_regmatch_state */
2551 register regmatch_state *st;
95b24440 2552
5d9a96ca
DM
2553 /* cache heavy used fields of st in registers */
2554 register regnode *scan;
2555 register regnode *next;
24d3c4a9
DM
2556 register I32 n = 0; /* general value; init to avoid compiler warning */
2557 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2558 register char *locinput = PL_reginput;
5d9a96ca 2559 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2560
b69b0499 2561 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2562 int depth = 0; /* depth of backtrack stack */
6bda09f9 2563 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
77cb431f
DM
2564 regmatch_state *yes_state = NULL; /* state to pop to on success of
2565 subpattern */
faec1544 2566 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2567 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2568 U32 state_num;
10edeb5d 2569
24d3c4a9
DM
2570 /* these three flags are set by various ops to signal information to
2571 * the very next op. They have a useful lifetime of exactly one loop
2572 * iteration, and are not preserved or restored by state pushes/pops
2573 */
2574 bool sw = 0; /* the condition value in (?(cond)a|b) */
2575 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2576 int logical = 0; /* the following EVAL is:
2577 0: (?{...})
2578 1: (?(?{...})X|Y)
2579 2: (??{...})
2580 or the following IFMATCH/UNLESSM is:
2581 false: plain (?=foo)
2582 true: used as a condition: (?(?=foo))
2583 */
2584
95b24440 2585#ifdef DEBUGGING
e68ec53f 2586 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2587#endif
2588
5d9a96ca
DM
2589 /* on first ever call to regmatch, allocate first slab */
2590 if (!PL_regmatch_slab) {
2591 Newx(PL_regmatch_slab, 1, regmatch_slab);
2592 PL_regmatch_slab->prev = NULL;
2593 PL_regmatch_slab->next = NULL;
86545054 2594 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2595 }
2596
2597 /* remember current high-water mark for exit */
2598 /* XXX this should be done with SAVE* instead */
2599 orig_slab = PL_regmatch_slab;
2600 orig_state = PL_regmatch_state;
2601
2602 /* grab next free state slot */
2603 st = ++PL_regmatch_state;
86545054 2604 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2605 st = PL_regmatch_state = S_push_slab(aTHX);
2606
d6a28714
JH
2607 /* Note that nextchr is a byte even in UTF */
2608 nextchr = UCHARAT(locinput);
2609 scan = prog;
2610 while (scan != NULL) {
8ba1375e 2611
a3621e74 2612 DEBUG_EXECUTE_r( {
6136c704 2613 SV * const prop = sv_newmortal();
1de06328 2614 regnode *rnext=regnext(scan);
786e8c11 2615 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2616 regprop(rex, prop, scan);
07be1b83
YO
2617
2618 PerlIO_printf(Perl_debug_log,
2619 "%3"IVdf":%*s%s(%"IVdf")\n",
5bc10b2c 2620 (IV)(scan - rex->program), depth*2, "",
07be1b83 2621 SvPVX_const(prop),
1de06328
YO
2622 (PL_regkind[OP(scan)] == END || !rnext) ?
2623 0 : (IV)(rnext - rex->program));
2a782b5b 2624 });
d6a28714
JH
2625
2626 next = scan + NEXT_OFF(scan);
2627 if (next == scan)
2628 next = NULL;
40a82448 2629 state_num = OP(scan);
d6a28714 2630
40a82448
DM
2631 reenter_switch:
2632 switch (state_num) {
d6a28714 2633 case BOL:
7fba1cd6 2634 if (locinput == PL_bostr)
d6a28714 2635 {
3b0527fe 2636 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2637 break;
2638 }
d6a28714
JH
2639 sayNO;
2640 case MBOL:
12d33761
HS
2641 if (locinput == PL_bostr ||
2642 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2643 {
b8c5462f
JH
2644 break;
2645 }
d6a28714
JH
2646 sayNO;
2647 case SBOL:
c2a73568 2648 if (locinput == PL_bostr)
b8c5462f 2649 break;
d6a28714
JH
2650 sayNO;
2651 case GPOS:
3b0527fe 2652 if (locinput == reginfo->ganch)
d6a28714
JH
2653 break;
2654 sayNO;
2655 case EOL:
d6a28714
JH
2656 goto seol;
2657 case MEOL:
d6a28714 2658 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2659 sayNO;
b8c5462f 2660 break;
d6a28714
JH
2661 case SEOL:
2662 seol:
2663 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2664 sayNO;
d6a28714 2665 if (PL_regeol - locinput > 1)
b8c5462f 2666 sayNO;
b8c5462f 2667 break;
d6a28714
JH
2668 case EOS:
2669 if (PL_regeol != locinput)
b8c5462f 2670 sayNO;
d6a28714 2671 break;
ffc61ed2 2672 case SANY:
d6a28714 2673 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2674 sayNO;
f33976b4
DB
2675 if (do_utf8) {
2676 locinput += PL_utf8skip[nextchr];
2677 if (locinput > PL_regeol)
2678 sayNO;
2679 nextchr = UCHARAT(locinput);
2680 }
2681 else
2682 nextchr = UCHARAT(++locinput);
2683 break;
2684 case CANY:
2685 if (!nextchr && locinput >= PL_regeol)
2686 sayNO;
b8c5462f 2687 nextchr = UCHARAT(++locinput);
a0d0e21e 2688 break;
ffc61ed2 2689 case REG_ANY:
1aa99e6b
IH
2690 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2691 sayNO;
2692 if (do_utf8) {
b8c5462f 2693 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2694 if (locinput > PL_regeol)
2695 sayNO;
a0ed51b3 2696 nextchr = UCHARAT(locinput);
a0ed51b3 2697 }
1aa99e6b
IH
2698 else
2699 nextchr = UCHARAT(++locinput);
a0ed51b3 2700 break;
166ba7cd
DM
2701
2702#undef ST
2703#define ST st->u.trie
786e8c11
YO
2704 case TRIEC:
2705 /* In this case the charclass data is available inline so
2706 we can fail fast without a lot of extra overhead.
2707 */
2708 if (scan->flags == EXACT || !do_utf8) {
2709 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2710 DEBUG_EXECUTE_r(
2711 PerlIO_printf(Perl_debug_log,
2712 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2713 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2714 );
2715 sayNO_SILENT;
2716 /* NOTREACHED */
2717 }
2718 }
2719 /* FALL THROUGH */
5b47454d 2720 case TRIE:
3dab1dad 2721 {
07be1b83 2722 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2723 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2724 trie_type = do_utf8 ?
2725 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2726 : trie_plain;
2727
2728 /* what trie are we using right now */
be8e71aa 2729 reg_trie_data * const trie
3dab1dad
YO
2730 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2731 U32 state = trie->startstate;
166ba7cd 2732
3dab1dad
YO
2733 if (trie->bitmap && trie_type != trie_utf8_fold &&
2734 !TRIE_BITMAP_TEST(trie,*locinput)
2735 ) {
2736 if (trie->states[ state ].wordnum) {
2737 DEBUG_EXECUTE_r(
2738 PerlIO_printf(Perl_debug_log,
2739 "%*s %smatched empty string...%s\n",
5bc10b2c 2740 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2741 );
2742 break;
2743 } else {
2744 DEBUG_EXECUTE_r(
2745 PerlIO_printf(Perl_debug_log,
786e8c11 2746 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2747 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2748 );
2749 sayNO_SILENT;
2750 }
2751 }
166ba7cd 2752
786e8c11
YO
2753 {
2754 U8 *uc = ( U8* )locinput;
2755
2756 STRLEN len = 0;
2757 STRLEN foldlen = 0;
2758 U8 *uscan = (U8*)NULL;
2759 STRLEN bufflen=0;
2760 SV *sv_accept_buff = NULL;
2761 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2762
2763 ST.accepted = 0; /* how many accepting states we have seen */
2764 ST.B = next;
2765 ST.jump = trie->jump;
2766
2767#ifdef DEBUGGING
2768 ST.me = scan;
2769#endif
2770
2771
2772
07be1b83
YO
2773 /*
2774 traverse the TRIE keeping track of all accepting states
2775 we transition through until we get to a failing node.
2776 */
2777
a3621e74 2778 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2779 U32 base = trie->states[ state ].trans.base;
f9f4320a 2780 UV uvc = 0;
786e8c11
YO
2781 U16 charid;
2782 /* We use charid to hold the wordnum as we don't use it
2783 for charid until after we have done the wordnum logic.
2784 We define an alias just so that the wordnum logic reads
2785 more naturally. */
2786
2787#define got_wordnum charid
2788 got_wordnum = trie->states[ state ].wordnum;
2789
2790 if ( got_wordnum ) {
2791 if ( ! ST.accepted ) {
5b47454d
DM
2792 ENTER;
2793 SAVETMPS;
2794 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2795 sv_accept_buff=newSV(bufflen *
2796 sizeof(reg_trie_accepted) - 1);
786e8c11 2797 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2798 SvPOK_on(sv_accept_buff);
2799 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2800 SAVETMPS;
2801 ST.accept_buff =
5b47454d
DM
2802 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2803 }
786e8c11 2804 do {
166ba7cd 2805 if (ST.accepted >= bufflen) {
5b47454d 2806 bufflen *= 2;
166ba7cd 2807 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2808 SvGROW(sv_accept_buff,
2809 bufflen * sizeof(reg_trie_accepted));
2810 }
2811 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2812 + sizeof(reg_trie_accepted));
a3621e74 2813
786e8c11
YO
2814
2815 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2816 ST.accept_buff[ST.accepted].endpos = uc;
2817 ++ST.accepted;
2818 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2819 }
2820#undef got_wordnum
a3621e74 2821
07be1b83 2822 DEBUG_TRIE_EXECUTE_r({
786e8c11 2823 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2824 PerlIO_printf( Perl_debug_log,
786e8c11 2825 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2826 2+depth * 2, "", PL_colors[4],
786e8c11 2827 (UV)state, (UV)ST.accepted );
07be1b83 2828 });
a3621e74
YO
2829
2830 if ( base ) {
4cadc6a9
YO
2831 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2832 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2833
5b47454d
DM
2834 if (charid &&
2835 (base + charid > trie->uniquecharcount )
2836 && (base + charid - 1 - trie->uniquecharcount
2837 < trie->lasttrans)
2838 && trie->trans[base + charid - 1 -
2839 trie->uniquecharcount].check == state)
2840 {
2841 state = trie->trans[base + charid - 1 -
2842 trie->uniquecharcount ].next;
2843 }
2844 else {
2845 state = 0;
2846 }
2847 uc += len;
2848
2849 }
2850 else {
a3621e74
YO
2851 state = 0;
2852 }
2853 DEBUG_TRIE_EXECUTE_r(
e4584336 2854 PerlIO_printf( Perl_debug_log,
786e8c11 2855 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2856 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2857 );
2858 }
166ba7cd 2859 if (!ST.accepted )
a3621e74 2860 sayNO;
a3621e74 2861
166ba7cd
DM
2862 DEBUG_EXECUTE_r(
2863 PerlIO_printf( Perl_debug_log,
2864 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2865 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2866 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2867 );
786e8c11 2868 }}
166ba7cd
DM
2869
2870 /* FALL THROUGH */
2871
2872 case TRIE_next_fail: /* we failed - try next alterative */
2873
2874 if ( ST.accepted == 1 ) {
2875 /* only one choice left - just continue */
2876 DEBUG_EXECUTE_r({
2877 reg_trie_data * const trie
2878 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2879 SV ** const tmp = RX_DEBUG(reginfo->prog)
2880 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2881 : NULL;
2882 PerlIO_printf( Perl_debug_log,
2883 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2884 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd
DM
2885 ST.accept_buff[ 0 ].wordnum,
2886 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2887 PL_colors[5] );
2888 });
2889 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2890 /* in this case we free tmps/leave before we call regmatch
2891 as we wont be using accept_buff again. */
2892 FREETMPS;
2893 LEAVE;
2894 locinput = PL_reginput;
2895 nextchr = UCHARAT(locinput);
786e8c11
YO
2896
2897 if ( !ST.jump )
2898 scan = ST.B;
2899 else
2900 scan = ST.B - ST.jump[ST.accept_buff[0].wordnum];
2901
166ba7cd
DM
2902 continue; /* execute rest of RE */
2903 }
2904
2905 if (!ST.accepted-- ) {
2906 FREETMPS;
2907 LEAVE;
2908 sayNO;
2909 }
2910
a3621e74 2911 /*
166ba7cd
DM
2912 There are at least two accepting states left. Presumably
2913 the number of accepting states is going to be low,
2914 typically two. So we simply scan through to find the one
2915 with lowest wordnum. Once we find it, we swap the last
2916 state into its place and decrement the size. We then try to
2917 match the rest of the pattern at the point where the word
2918 ends. If we succeed, control just continues along the
2919 regex; if we fail we return here to try the next accepting
2920 state
2921 */
a3621e74 2922
166ba7cd
DM
2923 {
2924 U32 best = 0;
2925 U32 cur;
2926 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2927 DEBUG_TRIE_EXECUTE_r(
f2278c82 2928 PerlIO_printf( Perl_debug_log,
166ba7cd 2929 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 2930 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
2931 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2932 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2933 );
2934
2935 if (ST.accept_buff[cur].wordnum <
2936 ST.accept_buff[best].wordnum)
2937 best = cur;
a3621e74 2938 }
166ba7cd
DM
2939
2940 DEBUG_EXECUTE_r({
2941 reg_trie_data * const trie
2942 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2943 SV ** const tmp = RX_DEBUG(reginfo->prog)
2944 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2945 : NULL;
265c4333
YO
2946 regnode *nextop=!ST.jump ?
2947 ST.B :
2948 ST.B - ST.jump[ST.accept_buff[best].wordnum];
2949 PerlIO_printf( Perl_debug_log,
2950 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 2951 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2952 ST.accept_buff[best].wordnum,
265c4333
YO
2953 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2954 REG_NODE_NUM(nextop),
166ba7cd
DM
2955 PL_colors[5] );
2956 });
2957
2958 if ( best<ST.accepted ) {
2959 reg_trie_accepted tmp = ST.accept_buff[ best ];
2960 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2961 ST.accept_buff[ ST.accepted ] = tmp;
2962 best = ST.accepted;
a3621e74 2963 }
166ba7cd 2964 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
786e8c11
YO
2965 if ( !ST.jump ) {
2966 PUSH_STATE_GOTO(TRIE_next, ST.B);
2967 /* NOTREACHED */
2968 } else {
2969 PUSH_STATE_GOTO(TRIE_next, ST.B - ST.jump[ST.accept_buff[best].wordnum]);
2970 /* NOTREACHED */
2971 }
2972 /* NOTREACHED */
166ba7cd 2973 }
166ba7cd
DM
2974 /* NOTREACHED */
2975
2976#undef ST
2977
95b24440
DM
2978 case EXACT: {
2979 char *s = STRING(scan);
24d3c4a9 2980 ln = STR_LEN(scan);
eb160463 2981 if (do_utf8 != UTF) {
bc517b45 2982 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2983 char *l = locinput;
24d3c4a9 2984 const char * const e = s + ln;
a72c7584 2985
5ff6fc6d
JH
2986 if (do_utf8) {
2987 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2988 while (s < e) {
a3b680e6 2989 STRLEN ulen;
1aa99e6b 2990 if (l >= PL_regeol)
5ff6fc6d
JH
2991 sayNO;
2992 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2993 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2994 uniflags))
5ff6fc6d 2995 sayNO;
bc517b45 2996 l += ulen;
5ff6fc6d 2997 s ++;
1aa99e6b 2998 }
5ff6fc6d
JH
2999 }
3000 else {
3001 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3002 while (s < e) {
a3b680e6 3003 STRLEN ulen;
1aa99e6b
IH
3004 if (l >= PL_regeol)
3005 sayNO;
5ff6fc6d 3006 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3007 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3008 uniflags))
1aa99e6b 3009 sayNO;
bc517b45 3010 s += ulen;
a72c7584 3011 l ++;
1aa99e6b 3012 }
5ff6fc6d 3013 }
1aa99e6b
IH
3014 locinput = l;
3015 nextchr = UCHARAT(locinput);
3016 break;
3017 }
bc517b45 3018 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3019 /* Inline the first character, for speed. */
3020 if (UCHARAT(s) != nextchr)
3021 sayNO;
24d3c4a9 3022 if (PL_regeol - locinput < ln)
d6a28714 3023 sayNO;
24d3c4a9 3024 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3025 sayNO;
24d3c4a9 3026 locinput += ln;
d6a28714
JH
3027 nextchr = UCHARAT(locinput);
3028 break;
95b24440 3029 }
d6a28714 3030 case EXACTFL:
b8c5462f
JH
3031 PL_reg_flags |= RF_tainted;
3032 /* FALL THROUGH */
95b24440 3033 case EXACTF: {
be8e71aa 3034 char * const s = STRING(scan);
24d3c4a9 3035 ln = STR_LEN(scan);
d6a28714 3036
d07ddd77
JH
3037 if (do_utf8 || UTF) {
3038 /* Either target or the pattern are utf8. */
be8e71aa 3039 const char * const l = locinput;
d07ddd77 3040 char *e = PL_regeol;
bc517b45 3041
24d3c4a9 3042 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3043 l, &e, 0, do_utf8)) {
5486206c
JH
3044 /* One more case for the sharp s:
3045 * pack("U0U*", 0xDF) =~ /ss/i,
3046 * the 0xC3 0x9F are the UTF-8
3047 * byte sequence for the U+00DF. */
3048 if (!(do_utf8 &&
3049 toLOWER(s[0]) == 's' &&
24d3c4a9 3050 ln >= 2 &&
5486206c
JH
3051 toLOWER(s[1]) == 's' &&
3052 (U8)l[0] == 0xC3 &&
3053 e - l >= 2 &&
3054 (U8)l[1] == 0x9F))
3055 sayNO;
3056 }
d07ddd77
JH
3057 locinput = e;
3058 nextchr = UCHARAT(locinput);
3059 break;
a0ed51b3 3060 }
d6a28714 3061
bc517b45
JH
3062 /* Neither the target and the pattern are utf8. */
3063
d6a28714
JH
3064 /* Inline the first character, for speed. */
3065 if (UCHARAT(s) != nextchr &&
3066 UCHARAT(s) != ((OP(scan) == EXACTF)
3067 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3068 sayNO;
24d3c4a9 3069 if (PL_regeol - locinput < ln)
b8c5462f 3070 sayNO;
24d3c4a9
DM
3071 if (ln > 1 && (OP(scan) == EXACTF
3072 ? ibcmp(s, locinput, ln)
3073 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3074 sayNO;
24d3c4a9 3075 locinput += ln;
d6a28714 3076 nextchr = UCHARAT(locinput);
a0d0e21e 3077 break;
95b24440 3078 }
d6a28714 3079 case ANYOF:
ffc61ed2 3080 if (do_utf8) {
9e55ce06
JH
3081 STRLEN inclasslen = PL_regeol - locinput;
3082
32fc9b6a 3083 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3084 goto anyof_fail;
ffc61ed2
JH
3085 if (locinput >= PL_regeol)
3086 sayNO;
0f0076b4 3087 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3088 nextchr = UCHARAT(locinput);
e0f9d4a8 3089 break;
ffc61ed2
JH
3090 }
3091 else {
3092 if (nextchr < 0)
3093 nextchr = UCHARAT(locinput);
32fc9b6a 3094 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3095 goto anyof_fail;
ffc61ed2
JH
3096 if (!nextchr && locinput >= PL_regeol)
3097 sayNO;
3098 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3099 break;
3100 }
262b90c4 3101 anyof_fail:
e0f9d4a8
JH
3102 /* If we might have the case of the German sharp s
3103 * in a casefolding Unicode character class. */
3104
ebc501f0
JH
3105 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3106 locinput += SHARP_S_SKIP;
e0f9d4a8 3107 nextchr = UCHARAT(locinput);
ffc61ed2 3108 }
e0f9d4a8
JH
3109 else
3110 sayNO;
b8c5462f 3111 break;
d6a28714 3112 case ALNUML:
b8c5462f
JH
3113 PL_reg_flags |= RF_tainted;
3114 /* FALL THROUGH */
d6a28714 3115 case ALNUM:
b8c5462f 3116 if (!nextchr)
4633a7c4 3117 sayNO;
ffc61ed2 3118 if (do_utf8) {
1a4fad37 3119 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3120 if (!(OP(scan) == ALNUM
bb7a0f54 3121 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3122 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3123 {
3124 sayNO;
a0ed51b3 3125 }
b8c5462f 3126 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3127 nextchr = UCHARAT(locinput);
3128 break;
3129 }
ffc61ed2 3130 if (!(OP(scan) == ALNUM
d6a28714 3131 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3132 sayNO;
b8c5462f 3133 nextchr = UCHARAT(++locinput);
a0d0e21e 3134 break;
d6a28714 3135 case NALNUML:
b8c5462f
JH
3136 PL_reg_flags |= RF_tainted;
3137 /* FALL THROUGH */
d6a28714
JH
3138 case NALNUM:
3139 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3140 sayNO;
ffc61ed2 3141 if (do_utf8) {
1a4fad37 3142 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3143 if (OP(scan) == NALNUM
bb7a0f54 3144 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3145 : isALNUM_LC_utf8((U8*)locinput))
3146 {
b8c5462f 3147 sayNO;
d6a28714 3148 }
b8c5462f
JH
3149 locinput += PL_utf8skip[nextchr];
3150 nextchr = UCHARAT(locinput);
3151 break;
3152 }
ffc61ed2 3153 if (OP(scan) == NALNUM
d6a28714 3154 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3155 sayNO;
76e3520e 3156 nextchr = UCHARAT(++locinput);
a0d0e21e 3157 break;
d6a28714
JH
3158 case BOUNDL:
3159 case NBOUNDL:
3280af22 3160 PL_reg_flags |= RF_tainted;
bbce6d69 3161 /* FALL THROUGH */
d6a28714
JH
3162 case BOUND:
3163 case NBOUND:
3164 /* was last char in word? */
ffc61ed2 3165 if (do_utf8) {
12d33761 3166 if (locinput == PL_bostr)
24d3c4a9 3167 ln = '\n';
ffc61ed2 3168 else {
a3b680e6 3169 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3170
24d3c4a9 3171 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3172 }
3173 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3174 ln = isALNUM_uni(ln);
1a4fad37 3175 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3176 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3177 }
3178 else {
24d3c4a9 3179 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3180 n = isALNUM_LC_utf8((U8*)locinput);
3181 }
a0ed51b3 3182 }
d6a28714 3183 else {
24d3c4a9 3184 ln = (locinput != PL_bostr) ?
12d33761 3185 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3186 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3187 ln = isALNUM(ln);
ffc61ed2
JH
3188 n = isALNUM(nextchr);
3189 }
3190 else {
24d3c4a9 3191 ln = isALNUM_LC(ln);
ffc61ed2
JH
3192 n = isALNUM_LC(nextchr);
3193 }
d6a28714 3194 }
24d3c4a9 3195 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3196 OP(scan) == BOUNDL))
3197 sayNO;
a0ed51b3 3198 break;
d6a28714 3199 case SPACEL:
3280af22 3200 PL_reg_flags |= RF_tainted;
bbce6d69 3201 /* FALL THROUGH */
d6a28714 3202 case SPACE:
9442cb0e 3203 if (!nextchr)
4633a7c4 3204 sayNO;
1aa99e6b 3205 if (do_utf8) {
fd400ab9 3206 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3207 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3208 if (!(OP(scan) == SPACE
bb7a0f54 3209 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3210 : isSPACE_LC_utf8((U8*)locinput)))
3211 {
3212 sayNO;
3213 }
3214 locinput += PL_utf8skip[nextchr];
3215 nextchr = UCHARAT(locinput);
3216 break;
d6a28714 3217 }
ffc61ed2
JH
3218 if (!(OP(scan) == SPACE
3219 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3220 sayNO;
3221 nextchr = UCHARAT(++locinput);
3222 }
3223 else {
3224 if (!(OP(scan) == SPACE
3225 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3226 sayNO;
3227 nextchr = UCHARAT(++locinput);
a0ed51b3 3228 }
a0ed51b3 3229 break;
d6a28714 3230 case NSPACEL:
3280af22 3231 PL_reg_flags |= RF_tainted;
bbce6d69 3232 /* FALL THROUGH */
d6a28714 3233 case NSPACE:
9442cb0e 3234 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3235 sayNO;
1aa99e6b 3236 if (do_utf8) {
1a4fad37 3237 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3238 if (OP(scan) == NSPACE
bb7a0f54 3239 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3240 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3241 {
3242 sayNO;
3243 }
3244 locinput += PL_utf8skip[nextchr];
3245 nextchr = UCHARAT(locinput);
3246 break;
a0ed51b3 3247 }
ffc61ed2 3248 if (OP(scan) == NSPACE
d6a28714 3249 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3250 sayNO;
76e3520e 3251 nextchr = UCHARAT(++locinput);
a0d0e21e 3252 break;
d6a28714 3253 case DIGITL:
a0ed51b3
LW
3254 PL_reg_flags |= RF_tainted;
3255 /* FALL THROUGH */
d6a28714 3256 case DIGIT:
9442cb0e 3257 if (!nextchr)
a0ed51b3 3258 sayNO;
1aa99e6b 3259 if (do_utf8) {
1a4fad37 3260 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3261 if (!(OP(scan) == DIGIT
bb7a0f54 3262 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3263 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3264 {
a0ed51b3 3265 sayNO;
dfe13c55 3266 }
6f06b55f 3267 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3268 nextchr = UCHARAT(locinput);
3269 break;
3270 }
ffc61ed2 3271 if (!(OP(scan) == DIGIT
9442cb0e 3272 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3273 sayNO;
3274 nextchr = UCHARAT(++locinput);
3275 break;
d6a28714 3276 case NDIGITL:
b8c5462f
JH
3277 PL_reg_flags |= RF_tainted;
3278 /* FALL THROUGH */
d6a28714 3279 case NDIGIT:
9442cb0e 3280 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3281 sayNO;
1aa99e6b 3282 if (do_utf8) {
1a4fad37 3283 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3284 if (OP(scan) == NDIGIT
bb7a0f54 3285 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3286 : isDIGIT_LC_utf8((U8*)locinput))
3287 {
a0ed51b3 3288 sayNO;
9442cb0e 3289 }
6f06b55f 3290 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3291 nextchr = UCHARAT(locinput);
3292 break;
3293 }
ffc61ed2 3294 if (OP(scan) == NDIGIT
9442cb0e 3295 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3296 sayNO;
3297 nextchr = UCHARAT(++locinput);
3298 break;
3299 case CLUMP:
b7c83a7e 3300 if (locinput >= PL_regeol)
a0ed51b3 3301 sayNO;
b7c83a7e 3302 if (do_utf8) {
1a4fad37 3303 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3304 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3305 sayNO;
3306 locinput += PL_utf8skip[nextchr];
3307 while (locinput < PL_regeol &&
3308 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3309 locinput += UTF8SKIP(locinput);
3310 if (locinput > PL_regeol)
3311 sayNO;
eb08e2da
JH
3312 }
3313 else
3314 locinput++;
a0ed51b3
LW
3315 nextchr = UCHARAT(locinput);
3316 break;
81714fb9
YO
3317
3318 case NREFFL:
3319 {
3320 char *s;
ff1157ca 3321 char type;
81714fb9
YO
3322 PL_reg_flags |= RF_tainted;
3323 /* FALL THROUGH */
3324 case NREF:
3325 case NREFF:
ff1157ca 3326 type = OP(scan);
0a4db386
YO
3327 n = reg_check_named_buff_matched(rex,scan);
3328
3329 if ( n ) {
3330 type = REF + ( type - NREF );
3331 goto do_ref;
3332 } else {
81714fb9 3333 sayNO;
0a4db386
YO
3334 }
3335 /* unreached */
c8756f30 3336 case REFFL:
3280af22 3337 PL_reg_flags |= RF_tainted;
c8756f30 3338 /* FALL THROUGH */
c277df42 3339 case REF:
81714fb9 3340 case REFF:
c277df42 3341 n = ARG(scan); /* which paren pair */
81714fb9
YO
3342 type = OP(scan);
3343 do_ref:
24d3c4a9 3344 ln = PL_regstartp[n];
2c2d71f5 3345 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3346 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3347 sayNO; /* Do not match unless seen CLOSEn. */
24d3c4a9 3348 if (ln == PL_regendp[n])
a0d0e21e 3349 break;
a0ed51b3 3350
24d3c4a9 3351 s = PL_bostr + ln;
81714fb9 3352 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3353 char *l = locinput;
a3b680e6 3354 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3355 /*
3356 * Note that we can't do the "other character" lookup trick as
3357 * in the 8-bit case (no pun intended) because in Unicode we
3358 * have to map both upper and title case to lower case.
3359 */
81714fb9 3360 if (type == REFF) {
a0ed51b3 3361 while (s < e) {
a3b680e6
AL
3362 STRLEN ulen1, ulen2;
3363 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3364 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3365
a0ed51b3
LW
3366 if (l >= PL_regeol)
3367 sayNO;
a2a2844f
JH
3368 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3369 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3370 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3371 sayNO;
a2a2844f
JH
3372 s += ulen1;
3373 l += ulen2;
a0ed51b3
LW
3374 }
3375 }
3376 locinput = l;
3377 nextchr = UCHARAT(locinput);
3378 break;
3379 }
3380
a0d0e21e 3381 /* Inline the first character, for speed. */
76e3520e 3382 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3383 (type == REF ||
3384 (UCHARAT(s) != (type == REFF
3385 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3386 sayNO;
24d3c4a9
DM
3387 ln = PL_regendp[n] - ln;
3388 if (locinput + ln > PL_regeol)
4633a7c4 3389 sayNO;
81714fb9 3390 if (ln > 1 && (type == REF
24d3c4a9 3391 ? memNE(s, locinput, ln)
81714fb9 3392 : (type == REFF
24d3c4a9
DM
3393 ? ibcmp(s, locinput, ln)
3394 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3395 sayNO;
24d3c4a9 3396 locinput += ln;
76e3520e 3397 nextchr = UCHARAT(locinput);
a0d0e21e 3398 break;
81714fb9 3399 }
a0d0e21e 3400 case NOTHING:
c277df42 3401 case TAIL:
a0d0e21e
LW
3402 break;
3403 case BACK:
3404 break;
40a82448
DM
3405
3406#undef ST
3407#define ST st->u.eval
c277df42 3408 {
c277df42 3409 SV *ret;
6bda09f9
YO
3410 regexp *re;
3411 regnode *startpoint;
3412
3413 case SRECURSE:
3414 case RECURSE: /* /(...(?1))/ */
3415 if (cur_eval && cur_eval->locinput==locinput) {
3416 if (cur_eval->u.eval.close_paren == ARG(scan))
3417 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3418 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3419 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3420 } else {
3421 nochange_depth = 0;
3422 }
3423 re = rex;
3424 (void)ReREFCNT_inc(rex);
3425 if (OP(scan)==RECURSE) {
3426 startpoint = scan + ARG2L(scan);
3427 ST.close_paren = ARG(scan);
3428 } else {
3429 startpoint = re->program+1;
3430 ST.close_paren = 0;
3431 }
3432 goto eval_recurse_doit;
3433 /* NOTREACHED */
3434 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3435 if (cur_eval && cur_eval->locinput==locinput) {
3436 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3437 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3438 } else {
3439 nochange_depth = 0;
3440 }
8e5e9ebe 3441 {
4aabdb9b
DM
3442 /* execute the code in the {...} */
3443 dSP;
6136c704 3444 SV ** const before = SP;
4aabdb9b
DM
3445 OP_4tree * const oop = PL_op;
3446 COP * const ocurcop = PL_curcop;
3447 PAD *old_comppad;
4aabdb9b
DM
3448
3449 n = ARG(scan);
32fc9b6a 3450 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3451 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3452 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3453 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3454
8e5e9ebe
RGS
3455 CALLRUNOPS(aTHX); /* Scalar context. */
3456 SPAGAIN;
3457 if (SP == before)
075aa684 3458 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3459 else {
3460 ret = POPs;
3461 PUTBACK;
3462 }
4aabdb9b
DM
3463
3464 PL_op = oop;
3465 PAD_RESTORE_LOCAL(old_comppad);
3466 PL_curcop = ocurcop;
24d3c4a9 3467 if (!logical) {
4aabdb9b
DM
3468 /* /(?{...})/ */
3469 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3470 break;
3471 }
8e5e9ebe 3472 }
24d3c4a9
DM
3473 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3474 logical = 0;
4aabdb9b 3475 {
4f639d21
DM
3476 /* extract RE object from returned value; compiling if
3477 * necessary */
3478
6136c704 3479 MAGIC *mg = NULL;
be8e71aa 3480 const SV *sv;
faf82a0b
AE
3481 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3482 mg = mg_find(sv, PERL_MAGIC_qr);
3483 else if (SvSMAGICAL(ret)) {
3484 if (SvGMAGICAL(ret))
3485 sv_unmagic(ret, PERL_MAGIC_qr);
3486 else
3487 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3488 }
faf82a0b 3489
0f5d15d6
IZ
3490 if (mg) {
3491 re = (regexp *)mg->mg_obj;
df0003d4 3492 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3493 }
3494 else {
3495 STRLEN len;
6136c704 3496 const char * const t = SvPV_const(ret, len);
0f5d15d6 3497 PMOP pm;
a3b680e6 3498 const I32 osize = PL_regsize;
0f5d15d6 3499
5fcd1c1b 3500 Zero(&pm, 1, PMOP);
4aabdb9b 3501 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3502 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3503 if (!(SvFLAGS(ret)
faf82a0b
AE
3504 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3505 | SVs_GMG)))
14befaf4
DM
3506 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3507 PERL_MAGIC_qr,0,0);
0f5d15d6 3508 PL_regsize = osize;
0f5d15d6 3509 }
4aabdb9b 3510 }
6bda09f9
YO
3511 DEBUG_EXECUTE_r(
3512 debug_start_match(re, do_utf8, locinput, PL_regeol,
3513 "Matching embedded");
3514 );
3515 startpoint = re->program + 1;
3516 ST.close_paren = 0; /* only used for RECURSE */
3517 /* borrowed from regtry */
3518 if (PL_reg_start_tmpl <= re->nparens) {
3519 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3520 if(PL_reg_start_tmp)
3521 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3522 else
3523 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3524 }
aa283a38 3525
ae0beba1 3526 eval_recurse_doit: /* Share code with RECURSE below this line */
aa283a38 3527 /* run the pattern returned from (??{...}) */
40a82448
DM
3528 ST.cp = regcppush(0); /* Save *all* the positions. */
3529 REGCP_SET(ST.lastcp);
6bda09f9
YO
3530
3531 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3532 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3533
4aabdb9b
DM
3534 *PL_reglastparen = 0;
3535 *PL_reglastcloseparen = 0;
4aabdb9b 3536 PL_reginput = locinput;
ae0beba1 3537 PL_regsize = 0;
4aabdb9b
DM
3538
3539 /* XXXX This is too dramatic a measure... */
3540 PL_reg_maxiter = 0;
3541
faec1544
DM
3542 ST.toggle_reg_flags = PL_reg_flags;
3543 if (re->reganch & ROPT_UTF8)
3544 PL_reg_flags |= RF_utf8;
3545 else
3546 PL_reg_flags &= ~RF_utf8;
3547 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3548
40a82448 3549 ST.prev_rex = rex;
faec1544 3550 ST.prev_curlyx = cur_curlyx;
aa283a38 3551 rex = re;
faec1544 3552 cur_curlyx = NULL;
40a82448 3553 ST.B = next;
faec1544
DM
3554 ST.prev_eval = cur_eval;
3555 cur_eval = st;
faec1544 3556 /* now continue from first node in postoned RE */
6bda09f9 3557 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3558 /* NOTREACHED */
a0ed51b3 3559 }
24d3c4a9
DM
3560 /* logical is 1, /(?(?{...})X|Y)/ */
3561 sw = (bool)SvTRUE(ret);
3562 logical = 0;
c277df42
IZ
3563 break;
3564 }
40a82448 3565
faec1544
DM
3566 case EVAL_AB: /* cleanup after a successful (??{A})B */
3567 /* note: this is called twice; first after popping B, then A */
3568 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3569 ReREFCNT_dec(rex);
3570 rex = ST.prev_rex;
faec1544
DM
3571 regcpblow(ST.cp);
3572 cur_eval = ST.prev_eval;
3573 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3574 /* XXXX This is too dramatic a measure... */
3575 PL_reg_maxiter = 0;
262b90c4 3576 sayYES;
40a82448 3577
40a82448 3578
faec1544
DM
3579 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3580 /* note: this is called twice; first after popping B, then A */
3581 PL_reg_flags ^= ST.toggle_reg_flags;
3582 ReREFCNT_dec(rex);
3583 rex = ST.prev_rex;
40a82448
DM
3584 PL_reginput = locinput;
3585 REGCP_UNWIND(ST.lastcp);
3586 regcppop(rex);
faec1544
DM
3587 cur_eval = ST.prev_eval;
3588 cur_curlyx = ST.prev_curlyx;
3589 /* XXXX This is too dramatic a measure... */
3590 PL_reg_maxiter = 0;
40a82448 3591 sayNO_SILENT;
40a82448
DM
3592#undef ST
3593
a0d0e21e 3594 case OPEN:
c277df42 3595 n = ARG(scan); /* which paren pair */
3280af22
NIS
3596 PL_reg_start_tmp[n] = locinput;
3597 if (n > PL_regsize)
3598 PL_regsize = n;
a0d0e21e
LW
3599 break;
3600 case CLOSE:
c277df42 3601 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3602 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3603 PL_regendp[n] = locinput - PL_bostr;
eb160463 3604 if (n > (I32)*PL_reglastparen)
3280af22 3605 *PL_reglastparen = n;
a01268b5 3606 *PL_reglastcloseparen = n;
6bda09f9
YO
3607 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3608 goto fake_end;
3609 }
a0d0e21e 3610 break;
c277df42
IZ
3611 case GROUPP:
3612 n = ARG(scan); /* which paren pair */
24d3c4a9 3613 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42 3614 break;
0a4db386
YO
3615 case NGROUPP:
3616 /* reg_check_named_buff_matched returns 0 for no match */
3617 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3618 break;
3619 case RECURSEP:
3620 n = ARG(scan);
3621 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3622 break;
3623 case DEFINEP:
3624 sw = 0;
3625 break;
c277df42 3626 case IFTHEN:
2c2d71f5 3627 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3628 if (sw)
c277df42
IZ
3629 next = NEXTOPER(NEXTOPER(scan));
3630 else {
3631 next = scan + ARG(scan);
3632 if (OP(next) == IFTHEN) /* Fake one. */
3633 next = NEXTOPER(NEXTOPER(next));
3634 }
3635 break;
3636 case LOGICAL:
24d3c4a9 3637 logical = scan->flags;
c277df42 3638 break;
c476f425 3639
2ab05381 3640/*******************************************************************
2ab05381 3641
c476f425
DM
3642The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3643pattern, where A and B are subpatterns. (For simple A, CURLYM or
3644STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3645
c476f425 3646A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3647
c476f425
DM
3648On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3649state, which contains the current count, initialised to -1. It also sets
3650cur_curlyx to point to this state, with any previous value saved in the
3651state block.
2ab05381 3652
c476f425
DM
3653CURLYX then jumps straight to the WHILEM op, rather than executing A,
3654since the pattern may possibly match zero times (i.e. it's a while {} loop
3655rather than a do {} while loop).
2ab05381 3656
c476f425
DM
3657Each entry to WHILEM represents a successful match of A. The count in the
3658CURLYX block is incremented, another WHILEM state is pushed, and execution
3659passes to A or B depending on greediness and the current count.
2ab05381 3660
c476f425
DM
3661For example, if matching against the string a1a2a3b (where the aN are
3662substrings that match /A/), then the match progresses as follows: (the
3663pushed states are interspersed with the bits of strings matched so far):
2ab05381 3664
c476f425
DM
3665 <CURLYX cnt=-1>
3666 <CURLYX cnt=0><WHILEM>
3667 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3668 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3669 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3670 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3671
c476f425
DM
3672(Contrast this with something like CURLYM, which maintains only a single
3673backtrack state:
2ab05381 3674
c476f425
DM
3675 <CURLYM cnt=0> a1
3676 a1 <CURLYM cnt=1> a2
3677 a1 a2 <CURLYM cnt=2> a3
3678 a1 a2 a3 <CURLYM cnt=3> b
3679)
2ab05381 3680
c476f425
DM
3681Each WHILEM state block marks a point to backtrack to upon partial failure
3682of A or B, and also contains some minor state data related to that
3683iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3684overall state, such as the count, and pointers to the A and B ops.
2ab05381 3685
c476f425
DM
3686This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3687must always point to the *current* CURLYX block, the rules are:
2ab05381 3688
c476f425
DM
3689When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3690and set cur_curlyx to point the new block.
2ab05381 3691
c476f425
DM
3692When popping the CURLYX block after a successful or unsuccessful match,
3693restore the previous cur_curlyx.
2ab05381 3694
c476f425
DM
3695When WHILEM is about to execute B, save the current cur_curlyx, and set it
3696to the outer one saved in the CURLYX block.
2ab05381 3697
c476f425
DM
3698When popping the WHILEM block after a successful or unsuccessful B match,
3699restore the previous cur_curlyx.
2ab05381 3700
c476f425
DM
3701Here's an example for the pattern (AI* BI)*BO
3702I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3703
c476f425
DM
3704cur_
3705curlyx backtrack stack
3706------ ---------------
3707NULL
3708CO <CO prev=NULL> <WO>
3709CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3710CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3711NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3712
c476f425
DM
3713At this point the pattern succeeds, and we work back down the stack to
3714clean up, restoring as we go:
95b24440 3715
c476f425
DM
3716CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3717CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3718CO <CO prev=NULL> <WO>
3719NULL
a0374537 3720
c476f425
DM
3721*******************************************************************/
3722
3723#define ST st->u.curlyx
3724
3725 case CURLYX: /* start of /A*B/ (for complex A) */
3726 {
3727 /* No need to save/restore up to this paren */
3728 I32 parenfloor = scan->flags;
3729
3730 assert(next); /* keep Coverity happy */
3731 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3732 next += ARG(next);
3733
3734 /* XXXX Probably it is better to teach regpush to support
3735 parenfloor > PL_regsize... */
3736 if (parenfloor > (I32)*PL_reglastparen)
3737 parenfloor = *PL_reglastparen; /* Pessimization... */
3738
3739 ST.prev_curlyx= cur_curlyx;
3740 cur_curlyx = st;
3741 ST.cp = PL_savestack_ix;
3742
3743 /* these fields contain the state of the current curly.
3744 * they are accessed by subsequent WHILEMs */
3745 ST.parenfloor = parenfloor;
3746 ST.min = ARG1(scan);
3747 ST.max = ARG2(scan);
3748 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3749 ST.B = next;
24d3c4a9
DM
3750 ST.minmod = minmod;
3751 minmod = 0;
c476f425
DM
3752 ST.count = -1; /* this will be updated by WHILEM */
3753 ST.lastloc = NULL; /* this will be updated by WHILEM */
3754
3755 PL_reginput = locinput;
3756 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 3757 /* NOTREACHED */
c476f425 3758 }
a0d0e21e 3759
c476f425
DM
3760 case CURLYX_end: /* just finished matching all of A*B */
3761 regcpblow(ST.cp);
3762 cur_curlyx = ST.prev_curlyx;
3763 sayYES;
3764 /* NOTREACHED */
a0d0e21e 3765
c476f425
DM
3766 case CURLYX_end_fail: /* just failed to match all of A*B */
3767 regcpblow(ST.cp);
3768 cur_curlyx = ST.prev_curlyx;
3769 sayNO;
3770 /* NOTREACHED */
4633a7c4 3771
a0d0e21e 3772
c476f425
DM
3773#undef ST
3774#define ST st->u.whilem
3775
3776 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3777 {
3778 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
3779 I32 n;
3780 assert(cur_curlyx); /* keep Coverity happy */
3781 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3782 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3783 ST.cache_offset = 0;
3784 ST.cache_mask = 0;
3785
3786 PL_reginput = locinput;
3787
3788 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3789 "%*s whilem: matched %ld out of %ld..%ld\n",
3790 REPORT_CODE_OFF+depth*2, "", (long)n,
3791 (long)cur_curlyx->u.curlyx.min,
3792 (long)cur_curlyx->u.curlyx.max)
3793 );
a0d0e21e 3794
c476f425 3795 /* First just match a string of min A's. */
a0d0e21e 3796
c476f425
DM
3797 if (n < cur_curlyx->u.curlyx.min) {
3798 cur_curlyx->u.curlyx.lastloc = locinput;
3799 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3800 /* NOTREACHED */
3801 }
3802
3803 /* If degenerate A matches "", assume A done. */
3804
3805 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3806 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3807 "%*s whilem: empty match detected, trying continuation...\n",
3808 REPORT_CODE_OFF+depth*2, "")
3809 );
3810 goto do_whilem_B_max;
3811 }
3812
3813 /* super-linear cache processing */
3814
3815 if (scan->flags) {
a0d0e21e 3816
2c2d71f5 3817 if (!PL_reg_maxiter) {
c476f425
DM
3818 /* start the countdown: Postpone detection until we
3819 * know the match is not *that* much linear. */
2c2d71f5 3820 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3821 /* possible overflow for long strings and many CURLYX's */
3822 if (PL_reg_maxiter < 0)
3823 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3824 PL_reg_leftiter = PL_reg_maxiter;
3825 }
c476f425 3826
2c2d71f5 3827 if (PL_reg_leftiter-- == 0) {
c476f425 3828 /* initialise cache */
3298f257 3829 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3830 if (PL_reg_poscache) {
eb160463 3831 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3832 Renew(PL_reg_poscache, size, char);
3833 PL_reg_poscache_size = size;
3834 }
3835 Zero(PL_reg_poscache, size, char);
3836 }
3837 else {
3838 PL_reg_poscache_size = size;
a02a5408 3839 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3840 }
c476f425
DM
3841 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3842 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3843 PL_colors[4], PL_colors[5])
3844 );
2c2d71f5 3845 }
c476f425 3846
2c2d71f5 3847 if (PL_reg_leftiter < 0) {
c476f425
DM
3848 /* have we already failed at this position? */
3849 I32 offset, mask;
3850 offset = (scan->flags & 0xf) - 1
3851 + (locinput - PL_bostr) * (scan->flags>>4);
3852 mask = 1 << (offset % 8);
3853 offset /= 8;
3854 if (PL_reg_poscache[offset] & mask) {
3855 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3856 "%*s whilem: (cache) already tried at this position...\n",
3857 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 3858 );
3298f257 3859 sayNO; /* cache records failure */
2c2d71f5 3860 }
c476f425
DM
3861 ST.cache_offset = offset;
3862 ST.cache_mask = mask;
2c2d71f5 3863 }
c476f425 3864 }
2c2d71f5 3865
c476f425 3866 /* Prefer B over A for minimal matching. */
a687059c 3867
c476f425
DM
3868 if (cur_curlyx->u.curlyx.minmod) {
3869 ST.save_curlyx = cur_curlyx;
3870 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3871 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3872 REGCP_SET(ST.lastcp);
3873 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3874 /* NOTREACHED */
3875 }
a0d0e21e 3876
c476f425
DM
3877 /* Prefer A over B for maximal matching. */
3878
3879 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3880 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3881 cur_curlyx->u.curlyx.lastloc = locinput;
3882 REGCP_SET(ST.lastcp);
3883 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3884 /* NOTREACHED */
3885 }
3886 goto do_whilem_B_max;
3887 }
3888 /* NOTREACHED */
3889
3890 case WHILEM_B_min: /* just matched B in a minimal match */
3891 case WHILEM_B_max: /* just matched B in a maximal match */
3892 cur_curlyx = ST.save_curlyx;
3893 sayYES;
3894 /* NOTREACHED */
3895
3896 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3897 cur_curlyx = ST.save_curlyx;
3898 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3899 cur_curlyx->u.curlyx.count--;
3900 CACHEsayNO;
3901 /* NOTREACHED */
3902
3903 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3904 REGCP_UNWIND(ST.lastcp);
3905 regcppop(rex);
3906 /* FALL THROUGH */
3907 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3908 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3909 cur_curlyx->u.curlyx.count--;
3910 CACHEsayNO;
3911 /* NOTREACHED */
3912
3913 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3914 REGCP_UNWIND(ST.lastcp);
3915 regcppop(rex); /* Restore some previous $<digit>s? */
3916 PL_reginput = locinput;
3917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3918 "%*s whilem: failed, trying continuation...\n",
3919 REPORT_CODE_OFF+depth*2, "")
3920 );
3921 do_whilem_B_max:
3922 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3923 && ckWARN(WARN_REGEXP)
3924 && !(PL_reg_flags & RF_warned))
3925 {
3926 PL_reg_flags |= RF_warned;
3927 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3928 "Complex regular subexpression recursion",
3929 REG_INFTY - 1);
3930 }
3931
3932 /* now try B */
3933 ST.save_curlyx = cur_curlyx;
3934 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3935 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3936 /* NOTREACHED */
3937
3938 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3939 cur_curlyx = ST.save_curlyx;
3940 REGCP_UNWIND(ST.lastcp);
3941 regcppop(rex);
3942
3943 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3944 /* Maximum greed exceeded */
3945 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3946 && ckWARN(WARN_REGEXP)
3947 && !(PL_reg_flags & RF_warned))
3948 {
3280af22 3949 PL_reg_flags |= RF_warned;
c476f425
DM
3950 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3951 "%s limit (%d) exceeded",
3952 "Complex regular subexpression recursion",
3953 REG_INFTY - 1);
a0d0e21e 3954 }
c476f425 3955 cur_curlyx->u.curlyx.count--;
3ab3c9b4 3956 CACHEsayNO;
a0d0e21e 3957 }
c476f425
DM
3958
3959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3960 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3961 );
3962 /* Try grabbing another A and see if it helps. */
3963 PL_reginput = locinput;
3964 cur_curlyx->u.curlyx.lastloc = locinput;
3965 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3966 REGCP_SET(ST.lastcp);
3967 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 3968 /* NOTREACHED */
40a82448
DM
3969
3970#undef ST
3971#define ST st->u.branch
3972
3973 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3974 next = scan + ARG(scan);
3975 if (next == scan)
3976 next = NULL;
40a82448
DM
3977 scan = NEXTOPER(scan);
3978 /* FALL THROUGH */
c277df42 3979
40a82448
DM
3980 case BRANCH: /* /(...|A|...)/ */
3981 scan = NEXTOPER(scan); /* scan now points to inner node */
3982 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3983 /* last branch; skip state push and jump direct to node */
3984 continue;
3985 ST.lastparen = *PL_reglastparen;
3986 ST.next_branch = next;
3987 REGCP_SET(ST.cp);
3988 PL_reginput = locinput;
02db2b7b 3989
40a82448
DM
3990 /* Now go into the branch */
3991 PUSH_STATE_GOTO(BRANCH_next, scan);
3992 /* NOTREACHED */
3993
3994 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3995 REGCP_UNWIND(ST.cp);
3996 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3997 PL_regendp[n] = -1;
3998 *PL_reglastparen = n;
0a4db386 3999 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4000 scan = ST.next_branch;
4001 /* no more branches? */
4002 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4003 sayNO;
4004 continue; /* execute next BRANCH[J] op */
4005 /* NOTREACHED */
4006
a0d0e21e 4007 case MINMOD:
24d3c4a9 4008 minmod = 1;
a0d0e21e 4009 break;
40a82448
DM
4010
4011#undef ST
4012#define ST st->u.curlym
4013
4014 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4015
4016 /* This is an optimisation of CURLYX that enables us to push
4017 * only a single backtracking state, no matter now many matches
4018 * there are in {m,n}. It relies on the pattern being constant
4019 * length, with no parens to influence future backrefs
4020 */
4021
4022 ST.me = scan;
dc45a647 4023 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4024
4025 /* if paren positive, emulate an OPEN/CLOSE around A */
4026 if (ST.me->flags) {
4027 I32 paren = ST.me->flags;
4028 if (paren > PL_regsize)
4029 PL_regsize = paren;
4030 if (paren > (I32)*PL_reglastparen)
4031 *PL_reglastparen = paren;
c277df42 4032 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4033 }
40a82448
DM
4034 ST.A = scan;
4035 ST.B = next;
4036 ST.alen = 0;
4037 ST.count = 0;
24d3c4a9
DM
4038 ST.minmod = minmod;
4039 minmod = 0;
40a82448
DM
4040 ST.c1 = CHRTEST_UNINIT;
4041 REGCP_SET(ST.cp);
6407bf3b 4042
40a82448
DM
4043 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4044 goto curlym_do_B;
4045
4046 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4047 PL_reginput = locinput;
40a82448
DM
4048 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4049 /* NOTREACHED */
5f80c4cf 4050
40a82448
DM
4051 case CURLYM_A: /* we've just matched an A */
4052 locinput = st->locinput;
4053 nextchr = UCHARAT(locinput);
4054
4055 ST.count++;
4056 /* after first match, determine A's length: u.curlym.alen */
4057 if (ST.count == 1) {
4058 if (PL_reg_match_utf8) {
4059 char *s = locinput;
4060 while (s < PL_reginput) {
4061 ST.alen++;
4062 s += UTF8SKIP(s);
4063 }
4064 }
4065 else {
4066 ST.alen = PL_reginput - locinput;
4067 }
4068 if (ST.alen == 0)
4069 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4070 }
0cadcf80
DM
4071 DEBUG_EXECUTE_r(
4072 PerlIO_printf(Perl_debug_log,
40a82448 4073 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4074 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4075 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4076 );
4077
40a82448 4078 locinput = PL_reginput;
0a4db386
YO
4079
4080 if (cur_eval && cur_eval->u.eval.close_paren &&
4081 cur_eval->u.eval.close_paren == ST.me->flags)
4082 goto fake_end;
4083
4084 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4085 goto curlym_do_A; /* try to match another A */
4086 goto curlym_do_B; /* try to match B */
5f80c4cf 4087
40a82448
DM
4088 case CURLYM_A_fail: /* just failed to match an A */
4089 REGCP_UNWIND(ST.cp);
0a4db386
YO
4090
4091 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4092 || (cur_eval && cur_eval->u.eval.close_paren &&
4093 cur_eval->u.eval.close_paren == ST.me->flags))
40a82448 4094 sayNO;
0cadcf80 4095
40a82448
DM
4096 curlym_do_B: /* execute the B in /A{m,n}B/ */
4097 PL_reginput = locinput;
4098 if (ST.c1 == CHRTEST_UNINIT) {
4099 /* calculate c1 and c2 for possible match of 1st char
4100 * following curly */
4101 ST.c1 = ST.c2 = CHRTEST_VOID;
4102 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4103 regnode *text_node = ST.B;
4104 if (! HAS_TEXT(text_node))
4105 FIND_NEXT_IMPT(text_node);
4106 if (HAS_TEXT(text_node)
4107 && PL_regkind[OP(text_node)] != REF)
4108 {
4109 ST.c1 = (U8)*STRING(text_node);
4110 ST.c2 =
4111 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4112 ? PL_fold[ST.c1]
4113 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4114 ? PL_fold_locale[ST.c1]
4115 : ST.c1;
c277df42 4116 }
c277df42 4117 }
40a82448
DM
4118 }
4119
4120 DEBUG_EXECUTE_r(
4121 PerlIO_printf(Perl_debug_log,
4122 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4123 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4124 "", (IV)ST.count)
4125 );
4126 if (ST.c1 != CHRTEST_VOID
4127 && UCHARAT(PL_reginput) != ST.c1
4128 && UCHARAT(PL_reginput) != ST.c2)
4129 {
4130 /* simulate B failing */
4131 state_num = CURLYM_B_fail;
4132 goto reenter_switch;
4133 }
4134
4135 if (ST.me->flags) {
4136 /* mark current A as captured */
4137 I32 paren = ST.me->flags;
4138 if (ST.count) {
4139 PL_regstartp[paren]
4140 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4141 PL_regendp[paren] = PL_reginput - PL_bostr;
0a4db386 4142 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4143 }
40a82448
DM
4144 else
4145 PL_regendp[paren] = -1;
0a4db386
YO
4146 if (cur_eval && cur_eval->u.eval.close_paren &&
4147 cur_eval->u.eval.close_paren == ST.me->flags)
4148 {
4149 if (ST.count)
4150 goto fake_end;
4151 else
4152 sayNO;
4153 }
c277df42 4154 }
0a4db386 4155
40a82448 4156 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4157 /* NOTREACHED */
40a82448
DM
4158
4159 case CURLYM_B_fail: /* just failed to match a B */
4160 REGCP_UNWIND(ST.cp);
4161 if (ST.minmod) {
4162 if (ST.count == ARG2(ST.me) /* max */)
4163 sayNO;
4164 goto curlym_do_A; /* try to match a further A */
4165 }
4166 /* backtrack one A */
4167 if (ST.count == ARG1(ST.me) /* min */)
4168 sayNO;
4169 ST.count--;
4170 locinput = HOPc(locinput, -ST.alen);
4171 goto curlym_do_B; /* try to match B */
4172
c255a977
DM
4173#undef ST
4174#define ST st->u.curly
40a82448 4175
c255a977
DM
4176#define CURLY_SETPAREN(paren, success) \
4177 if (paren) { \
4178 if (success) { \
4179 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4180 PL_regendp[paren] = locinput - PL_bostr; \
0a4db386 4181 *PL_reglastcloseparen = paren; \
c255a977
DM
4182 } \
4183 else \
4184 PL_regendp[paren] = -1; \
4185 }
4186
4187 case STAR: /* /A*B/ where A is width 1 */
4188 ST.paren = 0;
4189 ST.min = 0;
4190 ST.max = REG_INFTY;
a0d0e21e
LW
4191 scan = NEXTOPER(scan);
4192 goto repeat;
c255a977
DM
4193 case PLUS: /* /A+B/ where A is width 1 */
4194 ST.paren = 0;
4195 ST.min = 1;
4196 ST.max = REG_INFTY;
c277df42 4197 scan = NEXTOPER(scan);
c255a977
DM
4198 goto repeat;
4199 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4200 ST.paren = scan->flags; /* Which paren to set */
4201 if (ST.paren > PL_regsize)
4202 PL_regsize = ST.paren;
4203 if (ST.paren > (I32)*PL_reglastparen)
4204 *PL_reglastparen = ST.paren;
4205 ST.min = ARG1(scan); /* min to match */
4206 ST.max = ARG2(scan); /* max to match */
0a4db386
YO
4207 if (cur_eval && cur_eval->u.eval.close_paren &&
4208 cur_eval->u.eval.close_paren == ST.paren) {
4209 ST.min=1;
4210 ST.max=1;
4211 }
c255a977
DM
4212 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4213 goto repeat;
4214 case CURLY: /* /A{m,n}B/ where A is width 1 */
4215 ST.paren = 0;
4216 ST.min = ARG1(scan); /* min to match */
4217 ST.max = ARG2(scan); /* max to match */
4218 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4219 repeat:
a0d0e21e
LW
4220 /*
4221 * Lookahead to avoid useless match attempts
4222 * when we know what character comes next.
c255a977 4223 *
5f80c4cf
JP
4224 * Used to only do .*x and .*?x, but now it allows
4225 * for )'s, ('s and (?{ ... })'s to be in the way
4226 * of the quantifier and the EXACT-like node. -- japhy
4227 */
4228
c255a977
DM
4229 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4230 sayNO;
cca55fe3 4231 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4232 U8 *s;
4233 regnode *text_node = next;
4234
3dab1dad
YO
4235 if (! HAS_TEXT(text_node))
4236 FIND_NEXT_IMPT(text_node);
5f80c4cf 4237
9e137952 4238 if (! HAS_TEXT(text_node))
c255a977 4239 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4240 else {
3dab1dad 4241 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4242 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4243 goto assume_ok_easy;
cca55fe3 4244 }
be8e71aa
YO
4245 else
4246 s = (U8*)STRING(text_node);
5f80c4cf
JP
4247
4248 if (!UTF) {
c255a977 4249 ST.c2 = ST.c1 = *s;
f65d3ee7 4250 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4251 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4252 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4253 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4254 }
5f80c4cf 4255 else { /* UTF */
f65d3ee7 4256 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4257 STRLEN ulen1, ulen2;
89ebb4a3
JH
4258 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4259 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4260
4261 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4262 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4263#ifdef EBCDIC
4264 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4265 ckWARN(WARN_UTF8) ?
4266 0 : UTF8_ALLOW_ANY);
4267 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4268 ckWARN(WARN_UTF8) ?
4269 0 : UTF8_ALLOW_ANY);
4270#else
c255a977 4271 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4272 uniflags);
c255a977 4273 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4274 uniflags);
4275#endif
5f80c4cf
JP
4276 }
4277 else {
c255a977 4278 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4279 uniflags);
5f80c4cf 4280 }
1aa99e6b
IH
4281 }
4282 }
bbce6d69 4283 }
a0d0e21e 4284 else
c255a977 4285 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4286 assume_ok_easy:
c255a977
DM
4287
4288 ST.A = scan;
4289 ST.B = next;
3280af22 4290 PL_reginput = locinput;
24d3c4a9
DM
4291 if (minmod) {
4292 minmod = 0;
c255a977 4293 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4294 sayNO;
c255a977 4295 ST.count = ST.min;
a0ed51b3 4296 locinput = PL_reginput;
c255a977
DM
4297 REGCP_SET(ST.cp);
4298 if (ST.c1 == CHRTEST_VOID)
4299 goto curly_try_B_min;
4300
4301 ST.oldloc = locinput;
4302
4303 /* set ST.maxpos to the furthest point along the
4304 * string that could possibly match */
4305 if (ST.max == REG_INFTY) {
4306 ST.maxpos = PL_regeol - 1;
4307 if (do_utf8)
4308 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4309 ST.maxpos--;
4310 }
4311 else if (do_utf8) {
4312 int m = ST.max - ST.min;
4313 for (ST.maxpos = locinput;
4314 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4315 ST.maxpos += UTF8SKIP(ST.maxpos);
4316 }
4317 else {
4318 ST.maxpos = locinput + ST.max - ST.min;
4319 if (ST.maxpos >= PL_regeol)
4320 ST.maxpos = PL_regeol - 1;
4321 }
4322 goto curly_try_B_min_known;
4323
4324 }
4325 else {
4326 ST.count = regrepeat(rex, ST.A, ST.max);
4327 locinput = PL_reginput;
4328 if (ST.count < ST.min)
4329 sayNO;
4330 if ((ST.count > ST.min)
4331 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4332 {
4333 /* A{m,n} must come at the end of the string, there's
4334 * no point in backing off ... */
4335 ST.min = ST.count;
4336 /* ...except that $ and \Z can match before *and* after
4337 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4338 We may back off by one in this case. */
4339 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4340 ST.min--;
4341 }
4342 REGCP_SET(ST.cp);
4343 goto curly_try_B_max;
4344 }
4345 /* NOTREACHED */
4346
4347
4348 case CURLY_B_min_known_fail:
4349 /* failed to find B in a non-greedy match where c1,c2 valid */
4350 if (ST.paren && ST.count)
4351 PL_regendp[ST.paren] = -1;
4352
4353 PL_reginput = locinput; /* Could be reset... */
4354 REGCP_UNWIND(ST.cp);
4355 /* Couldn't or didn't -- move forward. */
4356 ST.oldloc = locinput;
4357 if (do_utf8)
4358 locinput += UTF8SKIP(locinput);
4359 else
4360 locinput++;
4361 ST.count++;
4362 curly_try_B_min_known:
4363 /* find the next place where 'B' could work, then call B */
4364 {
4365 int n;
4366 if (do_utf8) {
4367 n = (ST.oldloc == locinput) ? 0 : 1;
4368 if (ST.c1 == ST.c2) {
4369 STRLEN len;
4370 /* set n to utf8_distance(oldloc, locinput) */
4371 while (locinput <= ST.maxpos &&
4372 utf8n_to_uvchr((U8*)locinput,
4373 UTF8_MAXBYTES, &len,
4374 uniflags) != (UV)ST.c1) {
4375 locinput += len;
4376 n++;
4377 }
1aa99e6b
IH
4378 }
4379 else {
c255a977
DM
4380 /* set n to utf8_distance(oldloc, locinput) */
4381 while (locinput <= ST.maxpos) {
4382 STRLEN len;
4383 const UV c = utf8n_to_uvchr((U8*)locinput,
4384 UTF8_MAXBYTES, &len,
4385 uniflags);
4386 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4387 break;
4388 locinput += len;
4389 n++;
1aa99e6b 4390 }
0fe9bf95
IZ
4391 }
4392 }
c255a977
DM
4393 else {
4394 if (ST.c1 == ST.c2) {
4395 while (locinput <= ST.maxpos &&
4396 UCHARAT(locinput) != ST.c1)
4397 locinput++;
bbce6d69 4398 }
c255a977
DM
4399 else {
4400 while (locinput <= ST.maxpos
4401 && UCHARAT(locinput) != ST.c1
4402 && UCHARAT(locinput) != ST.c2)
4403 locinput++;
a0ed51b3 4404 }
c255a977
DM
4405 n = locinput - ST.oldloc;
4406 }
4407 if (locinput > ST.maxpos)
4408 sayNO;
4409 /* PL_reginput == oldloc now */
4410 if (n) {
4411 ST.count += n;
4412 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4413 sayNO;
a0d0e21e 4414 }
c255a977
DM
4415 PL_reginput = locinput;
4416 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4417 if (cur_eval && cur_eval->u.eval.close_paren &&
4418 cur_eval->u.eval.close_paren == ST.paren) {
4419 goto fake_end;
4420 }
c255a977 4421 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4422 }
c255a977
DM
4423 /* NOTREACHED */
4424
4425
4426 case CURLY_B_min_fail:
4427 /* failed to find B in a non-greedy match where c1,c2 invalid */
4428 if (ST.paren && ST.count)
4429 PL_regendp[ST.paren] = -1;
4430
4431 REGCP_UNWIND(ST.cp);
4432 /* failed -- move forward one */
4433 PL_reginput = locinput;
4434 if (regrepeat(rex, ST.A, 1)) {
4435 ST.count++;
a0ed51b3 4436 locinput = PL_reginput;
c255a977
DM
4437 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4438 ST.count > 0)) /* count overflow ? */
15272685 4439 {
c255a977
DM
4440 curly_try_B_min:
4441 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4442 if (cur_eval && cur_eval->u.eval.close_paren &&
4443 cur_eval->u.eval.close_paren == ST.paren) {
4444 goto fake_end;
4445 }
c255a977 4446 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4447 }
4448 }
4633a7c4 4449 sayNO;
c255a977
DM
4450 /* NOTREACHED */
4451
4452
4453 curly_try_B_max:
4454 /* a successful greedy match: now try to match B */
4455 {
4456 UV c = 0;
4457 if (ST.c1 != CHRTEST_VOID)
4458 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4459 UTF8_MAXBYTES, 0, uniflags)
466787eb 4460 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4461 /* If it could work, try it. */
4462 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4463 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4464 if (cur_eval && cur_eval->u.eval.close_paren &&
4465 cur_eval->u.eval.close_paren == ST.paren) {
4466 goto fake_end;
4467 }
c255a977
DM
4468 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4469 /* NOTREACHED */
4470 }
4471 }
4472 /* FALL THROUGH */
4473 case CURLY_B_max_fail:
4474 /* failed to find B in a greedy match */
4475 if (ST.paren && ST.count)
4476 PL_regendp[ST.paren] = -1;
4477
4478 REGCP_UNWIND(ST.cp);
4479 /* back up. */
4480 if (--ST.count < ST.min)
4481 sayNO;
4482 PL_reginput = locinput = HOPc(locinput, -1);
4483 goto curly_try_B_max;
4484
4485#undef ST
4486
4487
a0d0e21e 4488 case END:
6bda09f9 4489 fake_end:
faec1544
DM
4490 if (cur_eval) {
4491 /* we've just finished A in /(??{A})B/; now continue with B */
4492 I32 tmpix;
4493
4494
4495 st->u.eval.toggle_reg_flags
4496 = cur_eval->u.eval.toggle_reg_flags;
4497 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4498
4499 st->u.eval.prev_rex = rex; /* inner */
4500 rex = cur_eval->u.eval.prev_rex; /* outer */
4501 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4502 ReREFCNT_inc(rex);
4503 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4504 REGCP_SET(st->u.eval.lastcp);
4505 PL_reginput = locinput;
4506
4507 /* Restore parens of the outer rex without popping the
4508 * savestack */
4509 tmpix = PL_savestack_ix;
4510 PL_savestack_ix = cur_eval->u.eval.lastcp;
4511 regcppop(rex);
4512 PL_savestack_ix = tmpix;
4513
4514 st->u.eval.prev_eval = cur_eval;
4515 cur_eval = cur_eval->u.eval.prev_eval;
4516 DEBUG_EXECUTE_r(
6bda09f9
YO
4517 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4518 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
faec1544
DM
4519 PUSH_YES_STATE_GOTO(EVAL_AB,
4520 st->u.eval.prev_eval->u.eval.B); /* match B */
4521 }
4522
3b0527fe 4523 if (locinput < reginfo->till) {
a3621e74 4524 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4525 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4526 PL_colors[4],
4527 (long)(locinput - PL_reg_starttry),
3b0527fe 4528 (long)(reginfo->till - PL_reg_starttry),
7821416a 4529 PL_colors[5]));
262b90c4 4530 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4531 }
4532 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4533 sayYES; /* Success! */
dad79028
DM
4534
4535 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4536 DEBUG_EXECUTE_r(
4537 PerlIO_printf(Perl_debug_log,
4538 "%*s %ssubpattern success...%s\n",
5bc10b2c 4539 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4540 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4541 sayYES; /* Success! */
dad79028 4542
40a82448
DM
4543#undef ST
4544#define ST st->u.ifmatch
4545
4546 case SUSPEND: /* (?>A) */
4547 ST.wanted = 1;
9fe1d20c 4548 PL_reginput = locinput;
9041c2e3 4549 goto do_ifmatch;
dad79028 4550
40a82448
DM
4551 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4552 ST.wanted = 0;
dad79028
DM
4553 goto ifmatch_trivial_fail_test;
4554
40a82448
DM
4555 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4556 ST.wanted = 1;
dad79028 4557 ifmatch_trivial_fail_test:
a0ed51b3 4558 if (scan->flags) {
52657f30 4559 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4560 if (!s) {
4561 /* trivial fail */
24d3c4a9
DM
4562 if (logical) {
4563 logical = 0;
4564 sw = 1 - (bool)ST.wanted;
dad79028 4565 }
40a82448 4566 else if (ST.wanted)
dad79028
DM
4567 sayNO;
4568 next = scan + ARG(scan);
4569 if (next == scan)
4570 next = NULL;
4571 break;
4572 }
efb30f32 4573 PL_reginput = s;
a0ed51b3
LW
4574 }
4575 else
4576 PL_reginput = locinput;
4577
c277df42 4578 do_ifmatch:
40a82448 4579 ST.me = scan;
24d3c4a9 4580 ST.logical = logical;
40a82448
DM
4581 /* execute body of (?...A) */
4582 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4583 /* NOTREACHED */
4584
4585 case IFMATCH_A_fail: /* body of (?...A) failed */
4586 ST.wanted = !ST.wanted;
4587 /* FALL THROUGH */
4588
4589 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4590 if (ST.logical) {
4591 sw = (bool)ST.wanted;
40a82448
DM
4592 }
4593 else if (!ST.wanted)
4594 sayNO;
4595
4596 if (OP(ST.me) == SUSPEND)
4597 locinput = PL_reginput;
4598 else {
4599 locinput = PL_reginput = st->locinput;
4600 nextchr = UCHARAT(locinput);
4601 }
4602 scan = ST.me + ARG(ST.me);
4603 if (scan == ST.me)
4604 scan = NULL;
4605 continue; /* execute B */
4606
4607#undef ST
dad79028 4608
c277df42 4609 case LONGJMP:
c277df42
IZ
4610 next = scan + ARG(scan);
4611 if (next == scan)
4612 next = NULL;
a0d0e21e
LW
4613 break;
4614 default:
b900a521 4615 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4616 PTR2UV(scan), OP(scan));
cea2e8a9 4617 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4618 }
95b24440 4619
a0d0e21e 4620 scan = next;
95b24440
DM
4621 continue;
4622 /* NOTREACHED */
4623
40a82448
DM
4624 push_yes_state:
4625 /* push a state that backtracks on success */
4626 st->u.yes.prev_yes_state = yes_state;
4627 yes_state = st;
4628 /* FALL THROUGH */
4629 push_state:
4630 /* push a new regex state, then continue at scan */
4631 {
4632 regmatch_state *newst;
4633
5bc10b2c 4634 DEBUG_STATE_pp("push");
40a82448 4635 depth++;
40a82448
DM
4636 st->locinput = locinput;
4637 newst = st+1;
4638 if (newst > SLAB_LAST(PL_regmatch_slab))
4639 newst = S_push_slab(aTHX);
4640 PL_regmatch_state = newst;
786e8c11 4641
40a82448
DM
4642 locinput = PL_reginput;
4643 nextchr = UCHARAT(locinput);
4644 st = newst;
4645 continue;
4646 /* NOTREACHED */
4647 }
a0d0e21e 4648 }
a687059c 4649
a0d0e21e
LW
4650 /*
4651 * We get here only if there's trouble -- normally "case END" is
4652 * the terminating point.
4653 */
cea2e8a9 4654 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4655 /*NOTREACHED*/
4633a7c4
LW
4656 sayNO;
4657
262b90c4 4658yes:
77cb431f
DM
4659 if (yes_state) {
4660 /* we have successfully completed a subexpression, but we must now
4661 * pop to the state marked by yes_state and continue from there */
77cb431f 4662 assert(st != yes_state);
5bc10b2c
DM
4663#ifdef DEBUGGING
4664 while (st != yes_state) {
4665 st--;
4666 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4667 PL_regmatch_slab = PL_regmatch_slab->prev;
4668 st = SLAB_LAST(PL_regmatch_slab);
4669 }
4670 DEBUG_STATE_pp("pop (yes)");
4671 depth--;
4672 }
4673#else
77cb431f
DM
4674 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4675 || yes_state > SLAB_LAST(PL_regmatch_slab))
4676 {
4677 /* not in this slab, pop slab */
4678 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4679 PL_regmatch_slab = PL_regmatch_slab->prev;
4680 st = SLAB_LAST(PL_regmatch_slab);
4681 }
4682 depth -= (st - yes_state);
5bc10b2c 4683#endif
77cb431f
DM
4684 st = yes_state;
4685 yes_state = st->u.yes.prev_yes_state;
4686 PL_regmatch_state = st;
4687
24d3c4a9
DM
4688 state_num = st->resume_state;
4689 goto reenter_switch;
77cb431f
DM
4690 }
4691
a3621e74 4692 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4693 PL_colors[4], PL_colors[5]));
02db2b7b 4694
95b24440 4695 result = 1;
aa283a38 4696 goto final_exit;
4633a7c4
LW
4697
4698no:
a3621e74 4699 DEBUG_EXECUTE_r(
7821416a 4700 PerlIO_printf(Perl_debug_log,
786e8c11 4701 "%*s %sfailed...%s\n",
5bc10b2c 4702 REPORT_CODE_OFF+depth*2, "",
786e8c11 4703 PL_colors[4], PL_colors[5])
7821416a 4704 );
aa283a38 4705
262b90c4 4706no_silent:
aa283a38
DM
4707 if (depth) {
4708 /* there's a previous state to backtrack to */
40a82448
DM
4709 st--;
4710 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4711 PL_regmatch_slab = PL_regmatch_slab->prev;
4712 st = SLAB_LAST(PL_regmatch_slab);
4713 }
4714 PL_regmatch_state = st;
40a82448
DM
4715 locinput= st->locinput;
4716 nextchr = UCHARAT(locinput);
4717
5bc10b2c
DM
4718 DEBUG_STATE_pp("pop");
4719 depth--;
262b90c4
DM
4720 if (yes_state == st)
4721 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 4722
24d3c4a9
DM
4723 state_num = st->resume_state + 1; /* failure = success + 1 */
4724 goto reenter_switch;
95b24440 4725 }
24d3c4a9 4726 result = 0;
aa283a38 4727
262b90c4 4728 final_exit:
aa283a38 4729
5d9a96ca
DM
4730 /* restore original high-water mark */
4731 PL_regmatch_slab = orig_slab;
4732 PL_regmatch_state = orig_state;
4733
4734 /* free all slabs above current one */
4735 if (orig_slab->next) {
c4fd8992 4736 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4737 orig_slab->next = NULL;
4738 while (sl) {
c4fd8992 4739 regmatch_slab * const osl = sl;
5d9a96ca 4740 sl = sl->next;
ad65c075 4741 Safefree(osl);
5d9a96ca
DM
4742 }
4743 }
4744
95b24440 4745 return result;
a687059c
LW
4746}
4747
4748/*
4749 - regrepeat - repeatedly match something simple, report how many
4750 */
4751/*
4752 * [This routine now assumes that it will only match on things of length 1.
4753 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4754 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4755 */
76e3520e 4756STATIC I32
32fc9b6a 4757S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4758{
27da23d5 4759 dVAR;
a0d0e21e 4760 register char *scan;
a0d0e21e 4761 register I32 c;
3280af22 4762 register char *loceol = PL_regeol;
a0ed51b3 4763 register I32 hardcount = 0;
53c4c00c 4764 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4765
3280af22 4766 scan = PL_reginput;
faf11cac
HS
4767 if (max == REG_INFTY)
4768 max = I32_MAX;
4769 else if (max < loceol - scan)
7f596f4c 4770 loceol = scan + max;
a0d0e21e 4771 switch (OP(p)) {
22c35a8c 4772 case REG_ANY:
1aa99e6b 4773 if (do_utf8) {
ffc61ed2 4774 loceol = PL_regeol;
1aa99e6b 4775 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4776 scan += UTF8SKIP(scan);
4777 hardcount++;
4778 }
4779 } else {
4780 while (scan < loceol && *scan != '\n')
4781 scan++;
a0ed51b3
LW
4782 }
4783 break;
ffc61ed2 4784 case SANY:
def8e4ea
JH
4785 if (do_utf8) {
4786 loceol = PL_regeol;
a0804c9e 4787 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4788 scan += UTF8SKIP(scan);
4789 hardcount++;
4790 }
4791 }
4792 else
4793 scan = loceol;
a0ed51b3 4794 break;
f33976b4
DB
4795 case CANY:
4796 scan = loceol;
4797 break;
090f7165
JH
4798 case EXACT: /* length of string is 1 */
4799 c = (U8)*STRING(p);
4800 while (scan < loceol && UCHARAT(scan) == c)
4801 scan++;
bbce6d69 4802 break;
4803 case EXACTF: /* length of string is 1 */
cd439c50 4804 c = (U8)*STRING(p);
bbce6d69 4805 while (scan < loceol &&
22c35a8c 4806 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4807 scan++;
4808 break;
4809 case EXACTFL: /* length of string is 1 */
3280af22 4810 PL_reg_flags |= RF_tainted;
cd439c50 4811 c = (U8)*STRING(p);
bbce6d69 4812 while (scan < loceol &&
22c35a8c 4813 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4814 scan++;
4815 break;
4816 case ANYOF:
ffc61ed2
JH
4817 if (do_utf8) {
4818 loceol = PL_regeol;
cfc92286 4819 while (hardcount < max && scan < loceol &&
32fc9b6a 4820 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4821 scan += UTF8SKIP(scan);
4822 hardcount++;
4823 }
4824 } else {
32fc9b6a 4825 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4826 scan++;
4827 }
a0d0e21e
LW
4828 break;
4829 case ALNUM:
1aa99e6b 4830 if (do_utf8) {
ffc61ed2 4831 loceol = PL_regeol;
1a4fad37 4832 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4833 while (hardcount < max && scan < loceol &&
3568d838 4834 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4835 scan += UTF8SKIP(scan);
4836 hardcount++;
4837 }
4838 } else {
4839 while (scan < loceol && isALNUM(*scan))
4840 scan++;
a0ed51b3
LW
4841 }
4842 break;
bbce6d69 4843 case ALNUML:
3280af22 4844 PL_reg_flags |= RF_tainted;
1aa99e6b 4845 if (do_utf8) {
ffc61ed2 4846 loceol = PL_regeol;
1aa99e6b
IH
4847 while (hardcount < max && scan < loceol &&
4848 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4849 scan += UTF8SKIP(scan);
4850 hardcount++;
4851 }
4852 } else {
4853 while (scan < loceol && isALNUM_LC(*scan))
4854 scan++;
a0ed51b3
LW
4855 }
4856 break;
a0d0e21e 4857 case NALNUM:
1aa99e6b 4858 if (do_utf8) {
ffc61ed2 4859 loceol = PL_regeol;
1a4fad37 4860 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4861 while (hardcount < max && scan < loceol &&
3568d838 4862 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4863 scan += UTF8SKIP(scan);
4864 hardcount++;
4865 }
4866 } else {
4867 while (scan < loceol && !isALNUM(*scan))
4868 scan++;
a0ed51b3
LW
4869 }
4870 break;
bbce6d69 4871 case NALNUML:
3280af22 4872 PL_reg_flags |= RF_tainted;
1aa99e6b 4873 if (do_utf8) {
ffc61ed2 4874 loceol = PL_regeol;
1aa99e6b
IH
4875 while (hardcount < max && scan < loceol &&
4876 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4877 scan += UTF8SKIP(scan);
4878 hardcount++;
4879 }
4880 } else {
4881 while (scan < loceol && !isALNUM_LC(*scan))
4882 scan++;
a0ed51b3
LW
4883 }
4884 break;
a0d0e21e 4885 case SPACE:
1aa99e6b 4886 if (do_utf8) {
ffc61ed2 4887 loceol = PL_regeol;
1a4fad37 4888 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4889 while (hardcount < max && scan < loceol &&
3568d838
JH
4890 (*scan == ' ' ||
4891 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4892 scan += UTF8SKIP(scan);
4893 hardcount++;
4894 }
4895 } else {
4896 while (scan < loceol && isSPACE(*scan))
4897 scan++;
a0ed51b3
LW
4898 }
4899 break;
bbce6d69 4900 case SPACEL:
3280af22 4901 PL_reg_flags |= RF_tainted;
1aa99e6b 4902 if (do_utf8) {
ffc61ed2 4903 loceol = PL_regeol;
1aa99e6b 4904 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4905 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4906 scan += UTF8SKIP(scan);
4907 hardcount++;
4908 }
4909 } else {
4910 while (scan < loceol && isSPACE_LC(*scan))
4911 scan++;
a0ed51b3
LW
4912 }
4913 break;
a0d0e21e 4914 case NSPACE:
1aa99e6b 4915 if (do_utf8) {
ffc61ed2 4916 loceol = PL_regeol;
1a4fad37 4917 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4918 while (hardcount < max && scan < loceol &&
3568d838
JH
4919 !(*scan == ' ' ||
4920 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4921 scan += UTF8SKIP(scan);
4922 hardcount++;
4923 }
4924 } else {
4925 while (scan < loceol && !isSPACE(*scan))
4926 scan++;
4927 break;
a0ed51b3 4928 }
bbce6d69 4929 case NSPACEL:
3280af22 4930 PL_reg_flags |= RF_tainted;
1aa99e6b 4931 if (do_utf8) {
ffc61ed2 4932 loceol = PL_regeol;
1aa99e6b 4933 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4934 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4935 scan += UTF8SKIP(scan);
4936 hardcount++;
4937 }
4938 } else {
4939 while (scan < loceol && !isSPACE_LC(*scan))
4940 scan++;
a0ed51b3
LW
4941 }
4942 break;
a0d0e21e 4943 case DIGIT:
1aa99e6b 4944 if (do_utf8) {
ffc61ed2 4945 loceol = PL_regeol;
1a4fad37 4946 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4947 while (hardcount < max && scan < loceol &&
3568d838 4948 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4949 scan += UTF8SKIP(scan);
4950 hardcount++;
4951 }
4952 } else {
4953 while (scan < loceol && isDIGIT(*scan))
4954 scan++;
a0ed51b3
LW
4955 }
4956 break;
a0d0e21e 4957 case NDIGIT:
1aa99e6b 4958 if (do_utf8) {
ffc61ed2 4959 loceol = PL_regeol;
1a4fad37 4960 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4961 while (hardcount < max && scan < loceol &&
3568d838 4962 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4963 scan += UTF8SKIP(scan);
4964 hardcount++;
4965 }
4966 } else {
4967 while (scan < loceol && !isDIGIT(*scan))
4968 scan++;
a0ed51b3
LW
4969 }
4970 break;
a0d0e21e
LW
4971 default: /* Called on something of 0 width. */
4972 break; /* So match right here or not at all. */
4973 }
a687059c 4974
a0ed51b3
LW
4975 if (hardcount)
4976 c = hardcount;
4977 else
4978 c = scan - PL_reginput;
3280af22 4979 PL_reginput = scan;
a687059c 4980
a3621e74 4981 DEBUG_r({
e68ec53f 4982 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4983 DEBUG_EXECUTE_r({
e68ec53f
YO
4984 SV * const prop = sv_newmortal();
4985 regprop(prog, prop, p);
4986 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4987 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4988 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4989 });
be8e71aa 4990 });
9041c2e3 4991
a0d0e21e 4992 return(c);
a687059c
LW
4993}
4994
c277df42 4995
be8e71aa 4996#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4997/*
ffc61ed2
JH
4998- regclass_swash - prepare the utf8 swash
4999*/
5000
5001SV *
32fc9b6a 5002Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5003{
97aff369 5004 dVAR;
9e55ce06
JH
5005 SV *sw = NULL;
5006 SV *si = NULL;
5007 SV *alt = NULL;
3dab1dad 5008 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5009
4f639d21 5010 if (data && data->count) {
a3b680e6 5011 const U32 n = ARG(node);
ffc61ed2 5012
4f639d21
DM
5013 if (data->what[n] == 's') {
5014 SV * const rv = (SV*)data->data[n];
890ce7af 5015 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5016 SV **const ary = AvARRAY(av);
9e55ce06 5017 SV **a, **b;
9041c2e3 5018
711a919c 5019 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5020 * documentation of these array elements. */
5021
b11f357e 5022 si = *ary;
8f7f7219 5023 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5024 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5025
ffc61ed2
JH
5026 if (a)
5027 sw = *a;
5028 else if (si && doinit) {
5029 sw = swash_init("utf8", "", si, 1, 0);
5030 (void)av_store(av, 1, sw);
5031 }
9e55ce06
JH
5032 if (b)
5033 alt = *b;
ffc61ed2
JH
5034 }
5035 }
5036
9e55ce06
JH
5037 if (listsvp)
5038 *listsvp = si;
5039 if (altsvp)
5040 *altsvp = alt;
ffc61ed2
JH
5041
5042 return sw;
5043}
76234dfb 5044#endif
ffc61ed2
JH
5045
5046/*
ba7b4546 5047 - reginclass - determine if a character falls into a character class
832705d4
JH
5048
5049 The n is the ANYOF regnode, the p is the target string, lenp
5050 is pointer to the maximum length of how far to go in the p
5051 (if the lenp is zero, UTF8SKIP(p) is used),
5052 do_utf8 tells whether the target string is in UTF-8.
5053
bbce6d69 5054 */
5055
76e3520e 5056STATIC bool
32fc9b6a 5057S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5058{
27da23d5 5059 dVAR;
a3b680e6 5060 const char flags = ANYOF_FLAGS(n);
bbce6d69 5061 bool match = FALSE;
cc07378b 5062 UV c = *p;
ae9ddab8 5063 STRLEN len = 0;
9e55ce06 5064 STRLEN plen;
1aa99e6b 5065
19f67299
TS
5066 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5067 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5068 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5069 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5070 if (len == (STRLEN)-1)
5071 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5072 }
bbce6d69 5073
0f0076b4 5074 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5075 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5076 if (lenp)
5077 *lenp = 0;
ffc61ed2 5078 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5079 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5080 match = TRUE;
bbce6d69 5081 }
3568d838 5082 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5083 match = TRUE;
ffc61ed2 5084 if (!match) {
9e55ce06 5085 AV *av;
32fc9b6a 5086 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5087
5088 if (sw) {
3568d838 5089 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5090 match = TRUE;
5091 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5092 if (!match && lenp && av) {
5093 I32 i;
9e55ce06 5094 for (i = 0; i <= av_len(av); i++) {
890ce7af 5095 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5096 STRLEN len;
890ce7af 5097 const char * const s = SvPV_const(sv, len);
9e55ce06 5098
061b10df 5099 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5100 *lenp = len;
5101 match = TRUE;
5102 break;
5103 }
5104 }
5105 }
5106 if (!match) {
89ebb4a3 5107 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5108 STRLEN tmplen;
5109
9e55ce06
JH
5110 to_utf8_fold(p, tmpbuf, &tmplen);
5111 if (swash_fetch(sw, tmpbuf, do_utf8))
5112 match = TRUE;
5113 }
ffc61ed2
JH
5114 }
5115 }
bbce6d69 5116 }
9e55ce06 5117 if (match && lenp && *lenp == 0)
0f0076b4 5118 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5119 }
1aa99e6b 5120 if (!match && c < 256) {
ffc61ed2
JH
5121 if (ANYOF_BITMAP_TEST(n, c))
5122 match = TRUE;
5123 else if (flags & ANYOF_FOLD) {
eb160463 5124 U8 f;
a0ed51b3 5125
ffc61ed2
JH
5126 if (flags & ANYOF_LOCALE) {
5127 PL_reg_flags |= RF_tainted;
5128 f = PL_fold_locale[c];
5129 }
5130 else
5131 f = PL_fold[c];
5132 if (f != c && ANYOF_BITMAP_TEST(n, f))
5133 match = TRUE;
5134 }
5135
5136 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5137 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5138 if (
5139 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5154 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5155 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5156 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5157 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5158 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5159 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5160 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5161 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5162 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5163 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5164 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5165 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5166 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5167 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5168 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5169 ) /* How's that for a conditional? */
5170 {
5171 match = TRUE;
5172 }
a0ed51b3 5173 }
a0ed51b3
LW
5174 }
5175
a0ed51b3
LW
5176 return (flags & ANYOF_INVERT) ? !match : match;
5177}
161b471a 5178
dfe13c55 5179STATIC U8 *
0ce71af7 5180S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5181{
97aff369 5182 dVAR;
a0ed51b3 5183 if (off >= 0) {
1aa99e6b 5184 while (off-- && s < lim) {
ffc61ed2 5185 /* XXX could check well-formedness here */
a0ed51b3 5186 s += UTF8SKIP(s);
ffc61ed2 5187 }
a0ed51b3
LW
5188 }
5189 else {
1de06328
YO
5190 while (off++ && s > lim) {
5191 s--;
5192 if (UTF8_IS_CONTINUED(*s)) {
5193 while (s > lim && UTF8_IS_CONTINUATION(*s))
5194 s--;
a0ed51b3 5195 }
1de06328 5196 /* XXX could check well-formedness here */
a0ed51b3
LW
5197 }
5198 }
5199 return s;
5200}
161b471a 5201
f9f4320a
YO
5202#ifdef XXX_dmq
5203/* there are a bunch of places where we use two reghop3's that should
5204 be replaced with this routine. but since thats not done yet
5205 we ifdef it out - dmq
5206*/
dfe13c55 5207STATIC U8 *
1de06328
YO
5208S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5209{
5210 dVAR;
5211 if (off >= 0) {
5212 while (off-- && s < rlim) {
5213 /* XXX could check well-formedness here */
5214 s += UTF8SKIP(s);
5215 }
5216 }
5217 else {
5218 while (off++ && s > llim) {
5219 s--;
5220 if (UTF8_IS_CONTINUED(*s)) {
5221 while (s > llim && UTF8_IS_CONTINUATION(*s))
5222 s--;
5223 }
5224 /* XXX could check well-formedness here */
5225 }
5226 }
5227 return s;
5228}
f9f4320a 5229#endif
1de06328
YO
5230
5231STATIC U8 *
0ce71af7 5232S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5233{
97aff369 5234 dVAR;
a0ed51b3 5235 if (off >= 0) {
1aa99e6b 5236 while (off-- && s < lim) {
ffc61ed2 5237 /* XXX could check well-formedness here */
a0ed51b3 5238 s += UTF8SKIP(s);
ffc61ed2 5239 }
a0ed51b3 5240 if (off >= 0)
3dab1dad 5241 return NULL;
a0ed51b3
LW
5242 }
5243 else {
1de06328
YO
5244 while (off++ && s > lim) {
5245 s--;
5246 if (UTF8_IS_CONTINUED(*s)) {
5247 while (s > lim && UTF8_IS_CONTINUATION(*s))
5248 s--;
a0ed51b3 5249 }
1de06328 5250 /* XXX could check well-formedness here */
a0ed51b3
LW
5251 }
5252 if (off <= 0)
3dab1dad 5253 return NULL;
a0ed51b3
LW
5254 }
5255 return s;
5256}
51371543 5257
51371543 5258static void
acfe0abc 5259restore_pos(pTHX_ void *arg)
51371543 5260{
97aff369 5261 dVAR;
097eb12c 5262 regexp * const rex = (regexp *)arg;
51371543
GS
5263 if (PL_reg_eval_set) {
5264 if (PL_reg_oldsaved) {
4f639d21
DM
5265 rex->subbeg = PL_reg_oldsaved;
5266 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5267#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5268 rex->saved_copy = PL_nrs;
ed252734 5269#endif
4f639d21 5270 RX_MATCH_COPIED_on(rex);
51371543
GS
5271 }
5272 PL_reg_magic->mg_len = PL_reg_oldpos;
5273 PL_reg_eval_set = 0;
5274 PL_curpm = PL_reg_oldcurpm;
5275 }
5276}
33b8afdf
JH
5277
5278STATIC void
5279S_to_utf8_substr(pTHX_ register regexp *prog)
5280{
33b8afdf 5281 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5282 SV* const sv = newSVsv(prog->float_substr);
5283 prog->float_utf8 = sv;
33b8afdf
JH
5284 sv_utf8_upgrade(sv);
5285 if (SvTAIL(prog->float_substr))
5286 SvTAIL_on(sv);
5287 if (prog->float_substr == prog->check_substr)
5288 prog->check_utf8 = sv;
5289 }
5290 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5291 SV* const sv = newSVsv(prog->anchored_substr);
5292 prog->anchored_utf8 = sv;
33b8afdf
JH
5293 sv_utf8_upgrade(sv);
5294 if (SvTAIL(prog->anchored_substr))
5295 SvTAIL_on(sv);
5296 if (prog->anchored_substr == prog->check_substr)
5297 prog->check_utf8 = sv;
5298 }
5299}
5300
5301STATIC void
5302S_to_byte_substr(pTHX_ register regexp *prog)
5303{
97aff369 5304 dVAR;
33b8afdf 5305 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5306 SV* sv = newSVsv(prog->float_utf8);
5307 prog->float_substr = sv;
33b8afdf
JH
5308 if (sv_utf8_downgrade(sv, TRUE)) {
5309 if (SvTAIL(prog->float_utf8))
5310 SvTAIL_on(sv);
5311 } else {
5312 SvREFCNT_dec(sv);
5313 prog->float_substr = sv = &PL_sv_undef;
5314 }
5315 if (prog->float_utf8 == prog->check_utf8)
5316 prog->check_substr = sv;
5317 }
5318 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5319 SV* sv = newSVsv(prog->anchored_utf8);
5320 prog->anchored_substr = sv;
33b8afdf
JH
5321 if (sv_utf8_downgrade(sv, TRUE)) {
5322 if (SvTAIL(prog->anchored_utf8))
5323 SvTAIL_on(sv);
5324 } else {
5325 SvREFCNT_dec(sv);
5326 prog->anchored_substr = sv = &PL_sv_undef;
5327 }
5328 if (prog->anchored_utf8 == prog->check_utf8)
5329 prog->check_substr = sv;
5330 }
5331}
66610fdd
RGS
5332
5333/*
5334 * Local variables:
5335 * c-indentation-style: bsd
5336 * c-basic-offset: 4
5337 * indent-tabs-mode: t
5338 * End:
5339 *
37442d52
RGS
5340 * ex: set ts=8 sts=4 sw=4 noet:
5341 */