This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Mention that field hashes are for inside-out objects
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
faec1544 80
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
e7409c1b 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
c476f425 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
6bda09f9 169#define REGCP_OTHER_ELEMS 8
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 171
3280af22 172 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 173/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
174 SSPUSHINT(PL_regendp[p]);
175 SSPUSHINT(PL_regstartp[p]);
3280af22 176 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 177 SSPUSHINT(p);
40a82448
DM
178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
179 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
180 (UV)p, (IV)PL_regstartp[p],
181 (IV)(PL_reg_start_tmp[p] - PL_bostr),
182 (IV)PL_regendp[p]
183 ));
a0d0e21e 184 }
b1ce53c5 185/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
6bda09f9
YO
186 SSPUSHPTR(PL_regstartp);
187 SSPUSHPTR(PL_regendp);
3280af22
NIS
188 SSPUSHINT(PL_regsize);
189 SSPUSHINT(*PL_reglastparen);
a01268b5 190 SSPUSHINT(*PL_reglastcloseparen);
3280af22 191 SSPUSHPTR(PL_reginput);
41123dfd
JH
192#define REGCP_FRAME_ELEMS 2
193/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
194 * are needed for the regexp context stack bookkeeping. */
195 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 196 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 197
a0d0e21e
LW
198 return retval;
199}
200
c277df42 201/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
202#define REGCP_SET(cp) \
203 DEBUG_STATE_r( \
ab3bbdeb 204 PerlIO_printf(Perl_debug_log, \
e4f74956 205 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
206 (IV)PL_savestack_ix)); \
207 cp = PL_savestack_ix
c3464db5 208
ab3bbdeb 209#define REGCP_UNWIND(cp) \
e4f74956 210 DEBUG_STATE_r( \
ab3bbdeb 211 if (cp != PL_savestack_ix) \
e4f74956
YO
212 PerlIO_printf(Perl_debug_log, \
213 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
214 (IV)(cp), (IV)PL_savestack_ix)); \
215 regcpblow(cp)
c277df42 216
76e3520e 217STATIC char *
097eb12c 218S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 219{
97aff369 220 dVAR;
b1ce53c5 221 I32 i;
a0d0e21e 222 char *input;
b1ce53c5 223
a3621e74
YO
224 GET_RE_DEBUG_FLAGS_DECL;
225
b1ce53c5 226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 227 i = SSPOPINT;
b1ce53c5
JH
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 230 input = (char *) SSPOPPTR;
a01268b5 231 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
6bda09f9
YO
234 PL_regendp=(I32 *) SSPOPPTR;
235 PL_regstartp=(I32 *) SSPOPPTR;
b1ce53c5 236
6bda09f9 237
b1ce53c5 238 /* Now restore the parentheses context. */
41123dfd
JH
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 241 I32 tmps;
097eb12c 242 U32 paren = (U32)SSPOPINT;
3280af22 243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
244 PL_regstartp[paren] = SSPOPINT;
245 tmps = SSPOPINT;
3280af22
NIS
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
a3621e74 248 DEBUG_EXECUTE_r(
c3464db5 249 PerlIO_printf(Perl_debug_log,
b900a521 250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 251 (UV)paren, (IV)PL_regstartp[paren],
b900a521 252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 253 (IV)PL_regendp[paren],
3280af22 254 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 255 );
a0d0e21e 256 }
a3621e74 257 DEBUG_EXECUTE_r(
bb7a0f54 258 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 259 PerlIO_printf(Perl_debug_log,
faccc32b 260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 261 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
262 }
263 );
daf18116 264#if 1
dafc8851
JH
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
274 * --jhi */
bb7a0f54 275 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
276 if (i > PL_regsize)
277 PL_regstartp[i] = -1;
278 PL_regendp[i] = -1;
a0d0e21e 279 }
dafc8851 280#endif
a0d0e21e
LW
281 return input;
282}
283
02db2b7b 284#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 285
a687059c 286/*
e50aee73 287 * pregexec and friends
a687059c
LW
288 */
289
76234dfb 290#ifndef PERL_IN_XSUB_RE
a687059c 291/*
c277df42 292 - pregexec - match a regexp against a string
a687059c 293 */
c277df42 294I32
864dbfa3 295Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 296 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
297/* strend: pointer to null at end of string */
298/* strbeg: real beginning of string */
299/* minend: end of match must be >=minend after stringarg. */
300/* nosave: For optimizations. */
301{
302 return
9041c2e3 303 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
304 nosave ? 0 : REXEC_COPY_STR);
305}
76234dfb 306#endif
22e551b9 307
9041c2e3 308/*
cad2e5aa
JH
309 * Need to implement the following flags for reg_anch:
310 *
311 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
312 * USE_INTUIT_ML
313 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
314 * INTUIT_AUTORITATIVE_ML
315 * INTUIT_ONCE_NOML - Intuit can match in one location only.
316 * INTUIT_ONCE_ML
317 *
318 * Another flag for this function: SECOND_TIME (so that float substrs
319 * with giant delta may be not rechecked).
320 */
321
322/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
323
3f7c398e 324/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
325 Otherwise, only SvCUR(sv) is used to get strbeg. */
326
327/* XXXX We assume that strpos is strbeg unless sv. */
328
6eb5f6b9
JH
329/* XXXX Some places assume that there is a fixed substring.
330 An update may be needed if optimizer marks as "INTUITable"
331 RExen without fixed substrings. Similarly, it is assumed that
332 lengths of all the strings are no more than minlen, thus they
333 cannot come from lookahead.
334 (Or minlen should take into account lookahead.) */
335
2c2d71f5
JH
336/* A failure to find a constant substring means that there is no need to make
337 an expensive call to REx engine, thus we celebrate a failure. Similarly,
338 finding a substring too deep into the string means that less calls to
30944b6d
IZ
339 regtry() should be needed.
340
341 REx compiler's optimizer found 4 possible hints:
342 a) Anchored substring;
343 b) Fixed substring;
344 c) Whether we are anchored (beginning-of-line or \G);
345 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 346 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
347 string which does not contradict any of them.
348 */
2c2d71f5 349
6eb5f6b9
JH
350/* Most of decisions we do here should have been done at compile time.
351 The nodes of the REx which we used for the search should have been
352 deleted from the finite automaton. */
353
cad2e5aa
JH
354char *
355Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
356 char *strend, U32 flags, re_scream_pos_data *data)
357{
97aff369 358 dVAR;
b7953727 359 register I32 start_shift = 0;
cad2e5aa 360 /* Should be nonnegative! */
b7953727 361 register I32 end_shift = 0;
2c2d71f5
JH
362 register char *s;
363 register SV *check;
a1933d95 364 char *strbeg;
cad2e5aa 365 char *t;
1de06328 366 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 367 I32 ml_anch;
bd61b366
SS
368 register char *other_last = NULL; /* other substr checked before this */
369 char *check_at = NULL; /* check substr found at this pos */
1df70142 370 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 371#ifdef DEBUGGING
890ce7af 372 const char * const i_strpos = strpos;
30944b6d 373#endif
a3621e74
YO
374
375 GET_RE_DEBUG_FLAGS_DECL;
376
a30b2f1f 377 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 378
b8d68ded 379 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
380 PL_reg_flags |= RF_utf8;
381 }
ab3bbdeb
YO
382 DEBUG_EXECUTE_r(
383 debug_start_match(prog, do_utf8, strpos, strend,
1de06328
YO
384 sv ? "Guessing start of match in sv for"
385 : "Guessing start of match in string for");
2a782b5b 386 );
cad2e5aa 387
c344f387
JH
388 /* CHR_DIST() would be more correct here but it makes things slow. */
389 if (prog->minlen > strend - strpos) {
a3621e74 390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 391 "String too short... [re_intuit_start]\n"));
cad2e5aa 392 goto fail;
2c2d71f5 393 }
1de06328 394
a1933d95 395 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 396 PL_regeol = strend;
33b8afdf
JH
397 if (do_utf8) {
398 if (!prog->check_utf8 && prog->check_substr)
399 to_utf8_substr(prog);
400 check = prog->check_utf8;
401 } else {
402 if (!prog->check_substr && prog->check_utf8)
403 to_byte_substr(prog);
404 check = prog->check_substr;
405 }
1de06328 406 if (check == &PL_sv_undef) {
a3621e74 407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 408 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
409 goto fail;
410 }
2c2d71f5 411 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
412 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
413 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 414 && !multiline ) ); /* Check after \n? */
cad2e5aa 415
7e25d62c
JH
416 if (!ml_anch) {
417 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
418 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 419 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
420 && sv && !SvROK(sv)
421 && (strpos != strbeg)) {
a3621e74 422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
423 goto fail;
424 }
425 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 426 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 427 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
428 I32 slen;
429
1aa99e6b 430 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 431
653099ff
GS
432 if (SvTAIL(check)) {
433 slen = SvCUR(check); /* >= 1 */
cad2e5aa 434
9041c2e3 435 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 436 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 438 goto fail_finish;
cad2e5aa
JH
439 }
440 /* Now should match s[0..slen-2] */
441 slen--;
3f7c398e 442 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 443 || (slen > 1
3f7c398e 444 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 445 report_neq:
a3621e74 446 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
447 goto fail_finish;
448 }
cad2e5aa 449 }
3f7c398e 450 else if (*SvPVX_const(check) != *s
653099ff 451 || ((slen = SvCUR(check)) > 1
3f7c398e 452 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 453 goto report_neq;
c315bfe8 454 check_at = s;
2c2d71f5 455 goto success_at_start;
7e25d62c 456 }
cad2e5aa 457 }
2c2d71f5 458 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 459 s = strpos;
2c2d71f5 460 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
461 end_shift = prog->check_end_shift;
462
2c2d71f5 463 if (!ml_anch) {
a3b680e6 464 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 465 - (SvTAIL(check) != 0);
a3b680e6 466 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
467
468 if (end_shift < eshift)
469 end_shift = eshift;
470 }
cad2e5aa 471 }
2c2d71f5 472 else { /* Can match at random position */
cad2e5aa
JH
473 ml_anch = 0;
474 s = strpos;
1de06328
YO
475 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
476 end_shift = prog->check_end_shift;
477
478 /* end shift should be non negative here */
cad2e5aa
JH
479 }
480
2c2d71f5 481#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 482 if (end_shift < 0)
1de06328 483 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
b57a0404 484 (IV)end_shift, prog->precomp);
2c2d71f5
JH
485#endif
486
2c2d71f5
JH
487 restart:
488 /* Find a possible match in the region s..strend by looking for
489 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
490
491 {
492 I32 srch_start_shift = start_shift;
493 I32 srch_end_shift = end_shift;
494 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
495 srch_end_shift -= ((strbeg - s) - srch_start_shift);
496 srch_start_shift = strbeg - s;
497 }
6bda09f9 498 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
499 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
500 (IV)prog->check_offset_min,
501 (IV)srch_start_shift,
502 (IV)srch_end_shift,
503 (IV)prog->check_end_shift);
504 });
505
cad2e5aa 506 if (flags & REXEC_SCREAM) {
cad2e5aa 507 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 508 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 509
2c2d71f5
JH
510 if (PL_screamfirst[BmRARE(check)] >= 0
511 || ( BmRARE(check) == '\n'
512 && (BmPREVIOUS(check) == SvCUR(check) - 1)
513 && SvTAIL(check) ))
9041c2e3 514 s = screaminstr(sv, check,
1de06328 515 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 516 else
2c2d71f5 517 goto fail_finish;
4addbd3b
HS
518 /* we may be pointing at the wrong string */
519 if (s && RX_MATCH_COPIED(prog))
3f7c398e 520 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
521 if (data)
522 *data->scream_olds = s;
523 }
1de06328
YO
524 else {
525 U8* start_point;
526 U8* end_point;
527 if (prog->reganch & ROPT_CANY_SEEN) {
528 start_point= (U8*)(s + srch_start_shift);
529 end_point= (U8*)(strend - srch_end_shift);
530 } else {
531 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
532 end_point= HOP3(strend, -srch_end_shift, strbeg);
533 }
6bda09f9 534 DEBUG_OPTIMISE_MORE_r({
56570a2c 535 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 536 (int)(end_point - start_point),
fc8cd66c 537 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
538 start_point);
539 });
540
541 s = fbm_instr( start_point, end_point,
7fba1cd6 542 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
543 }
544 }
cad2e5aa
JH
545 /* Update the count-of-usability, remove useless subpatterns,
546 unshift s. */
2c2d71f5 547
ab3bbdeb
YO
548 DEBUG_EXECUTE_r({
549 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
550 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
551 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 552 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
553 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
554 ? "anchored" : "floating"),
555 quoted,
556 RE_SV_TAIL(check),
557 (s ? " at offset " : "...\n") );
558 });
2c2d71f5
JH
559
560 if (!s)
561 goto fail_finish;
2c2d71f5 562 /* Finish the diagnostic message */
a3621e74 563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 564
1de06328
YO
565 /* XXX dmq: first branch is for positive lookbehind...
566 Our check string is offset from the beginning of the pattern.
567 So we need to do any stclass tests offset forward from that
568 point. I think. :-(
569 */
570
571
572
573 check_at=s;
574
575
2c2d71f5
JH
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 579 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
1de06328
YO
584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
585 : (prog->float_substr && prog->anchored_substr))
586 {
30944b6d 587 /* Take into account the "other" substring. */
2c2d71f5
JH
588 /* XXXX May be hopelessly wrong for UTF... */
589 if (!other_last)
6eb5f6b9 590 other_last = strpos;
33b8afdf 591 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
592 do_other_anchored:
593 {
890ce7af
AL
594 char * const last = HOP3c(s, -start_shift, strbeg);
595 char *last1, *last2;
be8e71aa 596 char * const saved_s = s;
33b8afdf 597 SV* must;
2c2d71f5 598
2c2d71f5
JH
599 t = s - prog->check_offset_max;
600 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 601 && (!do_utf8
0ce71af7 602 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 603 && t > strpos)))
6f207bd3 604 NOOP;
2c2d71f5
JH
605 else
606 t = strpos;
1aa99e6b 607 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
608 if (t < other_last) /* These positions already checked */
609 t = other_last;
1aa99e6b 610 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
611 if (last < last1)
612 last1 = last;
1de06328
YO
613 /* XXXX It is not documented what units *_offsets are in.
614 We assume bytes, but this is clearly wrong.
615 Meaning this code needs to be carefully reviewed for errors.
616 dmq.
617 */
618
2c2d71f5 619 /* On end-of-str: see comment below. */
33b8afdf
JH
620 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
621 if (must == &PL_sv_undef) {
622 s = (char*)NULL;
1de06328 623 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
624 }
625 else
626 s = fbm_instr(
627 (unsigned char*)t,
628 HOP3(HOP3(last1, prog->anchored_offset, strend)
629 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
630 must,
7fba1cd6 631 multiline ? FBMrf_MULTILINE : 0
33b8afdf 632 );
ab3bbdeb
YO
633 DEBUG_EXECUTE_r({
634 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
635 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
636 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 637 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
638 quoted, RE_SV_TAIL(must));
639 });
640
641
2c2d71f5
JH
642 if (!s) {
643 if (last1 >= last2) {
a3621e74 644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
645 ", giving up...\n"));
646 goto fail_finish;
647 }
a3621e74 648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 649 ", trying floating at offset %ld...\n",
be8e71aa 650 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
651 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
652 s = HOP3c(last, 1, strend);
2c2d71f5
JH
653 goto restart;
654 }
655 else {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 657 (long)(s - i_strpos)));
1aa99e6b
IH
658 t = HOP3c(s, -prog->anchored_offset, strbeg);
659 other_last = HOP3c(s, 1, strend);
be8e71aa 660 s = saved_s;
2c2d71f5
JH
661 if (t == strpos)
662 goto try_at_start;
2c2d71f5
JH
663 goto try_at_offset;
664 }
30944b6d 665 }
2c2d71f5
JH
666 }
667 else { /* Take into account the floating substring. */
33b8afdf 668 char *last, *last1;
be8e71aa 669 char * const saved_s = s;
33b8afdf
JH
670 SV* must;
671
672 t = HOP3c(s, -start_shift, strbeg);
673 last1 = last =
674 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
675 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
676 last = HOP3c(t, prog->float_max_offset, strend);
677 s = HOP3c(t, prog->float_min_offset, strend);
678 if (s < other_last)
679 s = other_last;
2c2d71f5 680 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
681 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
682 /* fbm_instr() takes into account exact value of end-of-str
683 if the check is SvTAIL(ed). Since false positives are OK,
684 and end-of-str is not later than strend we are OK. */
685 if (must == &PL_sv_undef) {
686 s = (char*)NULL;
1de06328 687 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
688 }
689 else
2c2d71f5 690 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
691 (unsigned char*)last + SvCUR(must)
692 - (SvTAIL(must)!=0),
7fba1cd6 693 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
694 DEBUG_EXECUTE_r({
695 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
696 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
697 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 698 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
699 quoted, RE_SV_TAIL(must));
700 });
33b8afdf
JH
701 if (!s) {
702 if (last1 == last) {
a3621e74 703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
704 ", giving up...\n"));
705 goto fail_finish;
2c2d71f5 706 }
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 708 ", trying anchored starting at offset %ld...\n",
be8e71aa 709 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
710 other_last = last;
711 s = HOP3c(t, 1, strend);
712 goto restart;
713 }
714 else {
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
716 (long)(s - i_strpos)));
717 other_last = s; /* Fix this later. --Hugo */
be8e71aa 718 s = saved_s;
33b8afdf
JH
719 if (t == strpos)
720 goto try_at_start;
721 goto try_at_offset;
722 }
2c2d71f5 723 }
cad2e5aa 724 }
2c2d71f5 725
1de06328 726
9ef43ace 727 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 728
6bda09f9 729 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
730 PerlIO_printf(Perl_debug_log,
731 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
732 (IV)prog->check_offset_min,
733 (IV)prog->check_offset_max,
734 (IV)(s-strpos),
735 (IV)(t-strpos),
736 (IV)(t-s),
737 (IV)(strend-strpos)
738 )
739 );
740
2c2d71f5 741 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 742 && (!do_utf8
9ef43ace 743 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
744 && t > strpos)))
745 {
2c2d71f5
JH
746 /* Fixed substring is found far enough so that the match
747 cannot start at strpos. */
748 try_at_offset:
cad2e5aa 749 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
750 /* Eventually fbm_*() should handle this, but often
751 anchored_offset is not 0, so this check will not be wasted. */
752 /* XXXX In the code below we prefer to look for "^" even in
753 presence of anchored substrings. And we search even
754 beyond the found float position. These pessimizations
755 are historical artefacts only. */
756 find_anchor:
2c2d71f5 757 while (t < strend - prog->minlen) {
cad2e5aa 758 if (*t == '\n') {
4ee3650e 759 if (t < check_at - prog->check_offset_min) {
33b8afdf 760 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
761 /* Since we moved from the found position,
762 we definitely contradict the found anchored
30944b6d
IZ
763 substr. Due to the above check we do not
764 contradict "check" substr.
765 Thus we can arrive here only if check substr
766 is float. Redo checking for "other"=="fixed".
767 */
9041c2e3 768 strpos = t + 1;
a3621e74 769 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 770 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
771 goto do_other_anchored;
772 }
4ee3650e
GS
773 /* We don't contradict the found floating substring. */
774 /* XXXX Why not check for STCLASS? */
cad2e5aa 775 s = t + 1;
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 777 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
778 goto set_useful;
779 }
4ee3650e
GS
780 /* Position contradicts check-string */
781 /* XXXX probably better to look for check-string
782 than for "\n", so one should lower the limit for t? */
a3621e74 783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 784 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 785 other_last = strpos = s = t + 1;
cad2e5aa
JH
786 goto restart;
787 }
788 t++;
789 }
a3621e74 790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 791 PL_colors[0], PL_colors[1]));
2c2d71f5 792 goto fail_finish;
cad2e5aa 793 }
f5952150 794 else {
a3621e74 795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 796 PL_colors[0], PL_colors[1]));
f5952150 797 }
cad2e5aa
JH
798 s = t;
799 set_useful:
33b8afdf 800 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
801 }
802 else {
f5952150 803 /* The found string does not prohibit matching at strpos,
2c2d71f5 804 - no optimization of calling REx engine can be performed,
f5952150
GS
805 unless it was an MBOL and we are not after MBOL,
806 or a future STCLASS check will fail this. */
2c2d71f5
JH
807 try_at_start:
808 /* Even in this situation we may use MBOL flag if strpos is offset
809 wrt the start of the string. */
05b4157f 810 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 811 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
812 /* May be due to an implicit anchor of m{.*foo} */
813 && !(prog->reganch & ROPT_IMPLICIT))
814 {
cad2e5aa
JH
815 t = strpos;
816 goto find_anchor;
817 }
a3621e74 818 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 819 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 820 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 821 );
2c2d71f5 822 success_at_start:
30944b6d 823 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
824 && (do_utf8 ? (
825 prog->check_utf8 /* Could be deleted already */
826 && --BmUSEFUL(prog->check_utf8) < 0
827 && (prog->check_utf8 == prog->float_utf8)
828 ) : (
829 prog->check_substr /* Could be deleted already */
830 && --BmUSEFUL(prog->check_substr) < 0
831 && (prog->check_substr == prog->float_substr)
832 )))
66e933ab 833 {
cad2e5aa 834 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
836 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
837 if (do_utf8 ? prog->check_substr : prog->check_utf8)
838 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
839 prog->check_substr = prog->check_utf8 = NULL; /* disable */
840 prog->float_substr = prog->float_utf8 = NULL; /* clear */
841 check = NULL; /* abort */
cad2e5aa 842 s = strpos;
3cf5c195
IZ
843 /* XXXX This is a remnant of the old implementation. It
844 looks wasteful, since now INTUIT can use many
6eb5f6b9 845 other heuristics. */
cad2e5aa
JH
846 prog->reganch &= ~RE_USE_INTUIT;
847 }
848 else
849 s = strpos;
850 }
851
6eb5f6b9
JH
852 /* Last resort... */
853 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
854 /* trie stclasses are too expensive to use here, we are better off to
855 leave it to regmatch itself */
786e8c11 856 if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
6eb5f6b9
JH
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 865 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 866 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 868 : 1);
1de06328
YO
869 char * endpos;
870 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
871 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
872 else if (prog->float_substr || prog->float_utf8)
873 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
874 else
875 endpos= strend;
876
56570a2c 877 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %d s: %d endpos: %d\n",
b57a0404 878 (IV)start_shift, check_at - strbeg, s - strbeg, endpos - strbeg));
1de06328 879
6eb5f6b9 880 t = s;
3b0527fe 881 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
882 if (!s) {
883#ifdef DEBUGGING
cbbf8932 884 const char *what = NULL;
6eb5f6b9
JH
885#endif
886 if (endpos == strend) {
a3621e74 887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
888 "Could not match STCLASS...\n") );
889 goto fail;
890 }
a3621e74 891 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 892 "This position contradicts STCLASS...\n") );
653099ff
GS
893 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
894 goto fail;
6eb5f6b9 895 /* Contradict one of substrings */
33b8afdf
JH
896 if (prog->anchored_substr || prog->anchored_utf8) {
897 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 898 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 899 hop_and_restart:
1aa99e6b 900 s = HOP3c(t, 1, strend);
66e933ab
GS
901 if (s + start_shift + end_shift > strend) {
902 /* XXXX Should be taken into account earlier? */
a3621e74 903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
904 "Could not match STCLASS...\n") );
905 goto fail;
906 }
5e39e1e5
HS
907 if (!check)
908 goto giveup;
a3621e74 909 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 910 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
911 what, (long)(s + start_shift - i_strpos)) );
912 goto restart;
913 }
66e933ab 914 /* Have both, check_string is floating */
6eb5f6b9
JH
915 if (t + start_shift >= check_at) /* Contradicts floating=check */
916 goto retry_floating_check;
917 /* Recheck anchored substring, but not floating... */
9041c2e3 918 s = check_at;
5e39e1e5
HS
919 if (!check)
920 goto giveup;
a3621e74 921 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 922 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
923 (long)(other_last - i_strpos)) );
924 goto do_other_anchored;
925 }
60e71179
GS
926 /* Another way we could have checked stclass at the
927 current position only: */
928 if (ml_anch) {
929 s = t = t + 1;
5e39e1e5
HS
930 if (!check)
931 goto giveup;
a3621e74 932 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 933 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 934 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 935 goto try_at_offset;
66e933ab 936 }
33b8afdf 937 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 938 goto fail;
6eb5f6b9
JH
939 /* Check is floating subtring. */
940 retry_floating_check:
941 t = check_at - start_shift;
a3621e74 942 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
943 goto hop_and_restart;
944 }
b7953727 945 if (t != s) {
a3621e74 946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 947 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
948 (long)(t - i_strpos), (long)(s - i_strpos))
949 );
950 }
951 else {
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
953 "Does not contradict STCLASS...\n");
954 );
955 }
6eb5f6b9 956 }
5e39e1e5 957 giveup:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
959 PL_colors[4], (check ? "Guessed" : "Giving up"),
960 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 961 return s;
2c2d71f5
JH
962
963 fail_finish: /* Substring not found */
33b8afdf
JH
964 if (prog->check_substr || prog->check_utf8) /* could be removed already */
965 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 966 fail:
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 968 PL_colors[4], PL_colors[5]));
bd61b366 969 return NULL;
cad2e5aa 970}
9661b544 971
786e8c11 972
3b0527fe 973
4cadc6a9
YO
974#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
975foldlen, foldbuf, uniflags) STMT_START { \
976 switch (trie_type) { \
977 case trie_utf8_fold: \
978 if ( foldlen>0 ) { \
979 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
980 foldlen -= len; \
981 uscan += len; \
982 len=0; \
983 } else { \
984 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
985 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
986 foldlen -= UNISKIP( uvc ); \
987 uscan = foldbuf + UNISKIP( uvc ); \
988 } \
989 break; \
990 case trie_utf8: \
991 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
992 break; \
993 case trie_plain: \
994 uvc = (UV)*uc; \
995 len = 1; \
996 } \
997 \
998 if (uvc < 256) { \
999 charid = trie->charmap[ uvc ]; \
1000 } \
1001 else { \
1002 charid = 0; \
1003 if (trie->widecharmap) { \
1004 SV** const svpp = hv_fetch(trie->widecharmap, \
1005 (char*)&uvc, sizeof(UV), 0); \
1006 if (svpp) \
1007 charid = (U16)SvIV(*svpp); \
1008 } \
1009 } \
1010} STMT_END
1011
1012#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1013 if ( (CoNd) \
1014 && (ln == len || \
1015 ibcmp_utf8(s, NULL, 0, do_utf8, \
1016 m, NULL, ln, (bool)UTF)) \
1017 && (!reginfo || regtry(reginfo, s)) ) \
1018 goto got_it; \
1019 else { \
1020 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1021 uvchr_to_utf8(tmpbuf, c); \
1022 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1023 if ( f != c \
1024 && (f == c1 || f == c2) \
1025 && (ln == foldlen || \
1026 !ibcmp_utf8((char *) foldbuf, \
1027 NULL, foldlen, do_utf8, \
1028 m, \
1029 NULL, ln, (bool)UTF)) \
1030 && (!reginfo || regtry(reginfo, s)) ) \
1031 goto got_it; \
1032 } \
1033 s += len
1034
1035#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1036STMT_START { \
1037 while (s <= e) { \
1038 if ( (CoNd) \
1039 && (ln == 1 || !(OP(c) == EXACTF \
1040 ? ibcmp(s, m, ln) \
1041 : ibcmp_locale(s, m, ln))) \
1042 && (!reginfo || regtry(reginfo, s)) ) \
1043 goto got_it; \
1044 s++; \
1045 } \
1046} STMT_END
1047
1048#define REXEC_FBC_UTF8_SCAN(CoDe) \
1049STMT_START { \
1050 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1051 CoDe \
1052 s += uskip; \
1053 } \
1054} STMT_END
1055
1056#define REXEC_FBC_SCAN(CoDe) \
1057STMT_START { \
1058 while (s < strend) { \
1059 CoDe \
1060 s++; \
1061 } \
1062} STMT_END
1063
1064#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1065REXEC_FBC_UTF8_SCAN( \
1066 if (CoNd) { \
1067 if (tmp && (!reginfo || regtry(reginfo, s))) \
1068 goto got_it; \
1069 else \
1070 tmp = doevery; \
1071 } \
1072 else \
1073 tmp = 1; \
1074)
1075
1076#define REXEC_FBC_CLASS_SCAN(CoNd) \
1077REXEC_FBC_SCAN( \
1078 if (CoNd) { \
1079 if (tmp && (!reginfo || regtry(reginfo, s))) \
1080 goto got_it; \
1081 else \
1082 tmp = doevery; \
1083 } \
1084 else \
1085 tmp = 1; \
1086)
1087
1088#define REXEC_FBC_TRYIT \
1089if ((!reginfo || regtry(reginfo, s))) \
1090 goto got_it
1091
1092#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1093 if (do_utf8) { \
1094 UtFpReLoAd; \
1095 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1096 } \
1097 else { \
1098 REXEC_FBC_CLASS_SCAN(CoNd); \
1099 } \
1100 break
1101
1102#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1103 PL_reg_flags |= RF_tainted; \
1104 if (do_utf8) { \
1105 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1106 } \
1107 else { \
1108 REXEC_FBC_CLASS_SCAN(CoNd); \
1109 } \
1110 break
1111
786e8c11
YO
1112#define DUMP_EXEC_POS(li,s,doutf8) \
1113 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1114
1115/* We know what class REx starts with. Try to find this position... */
1116/* if reginfo is NULL, its a dryrun */
1117/* annoyingly all the vars in this routine have different names from their counterparts
1118 in regmatch. /grrr */
1119
3c3eec57 1120STATIC char *
07be1b83
YO
1121S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1122 const char *strend, const regmatch_info *reginfo)
a687059c 1123{
27da23d5 1124 dVAR;
1df70142 1125 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1126 char *m;
d8093b23 1127 STRLEN ln;
5dab1207 1128 STRLEN lnc;
078c425b 1129 register STRLEN uskip;
d8093b23
G
1130 unsigned int c1;
1131 unsigned int c2;
6eb5f6b9
JH
1132 char *e;
1133 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1134 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1135
6eb5f6b9
JH
1136 /* We know what class it must start with. */
1137 switch (OP(c)) {
6eb5f6b9 1138 case ANYOF:
388cc4de 1139 if (do_utf8) {
4cadc6a9 1140 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1141 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1142 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1143 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1144 }
1145 else {
1146 while (s < strend) {
1147 STRLEN skip = 1;
1148
32fc9b6a 1149 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1150 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1151 /* The assignment of 2 is intentional:
1152 * for the folded sharp s, the skip is 2. */
1153 (skip = SHARP_S_SKIP))) {
3b0527fe 1154 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1155 goto got_it;
1156 else
1157 tmp = doevery;
1158 }
1159 else
1160 tmp = 1;
1161 s += skip;
1162 }
a0d0e21e 1163 }
6eb5f6b9 1164 break;
f33976b4 1165 case CANY:
4cadc6a9 1166 REXEC_FBC_SCAN(
3b0527fe 1167 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1168 goto got_it;
1169 else
1170 tmp = doevery;
4cadc6a9 1171 );
f33976b4 1172 break;
6eb5f6b9 1173 case EXACTF:
5dab1207
NIS
1174 m = STRING(c);
1175 ln = STR_LEN(c); /* length to match in octets/bytes */
1176 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1177 if (UTF) {
a2a2844f 1178 STRLEN ulen1, ulen2;
5dab1207 1179 U8 *sm = (U8 *) m;
89ebb4a3
JH
1180 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1181 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1182 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1183
1184 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1185 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1186
89ebb4a3 1187 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1188 0, uniflags);
89ebb4a3 1189 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1190 0, uniflags);
5dab1207
NIS
1191 lnc = 0;
1192 while (sm < ((U8 *) m + ln)) {
1193 lnc++;
1194 sm += UTF8SKIP(sm);
1195 }
1aa99e6b
IH
1196 }
1197 else {
1198 c1 = *(U8*)m;
1199 c2 = PL_fold[c1];
1200 }
6eb5f6b9
JH
1201 goto do_exactf;
1202 case EXACTFL:
5dab1207
NIS
1203 m = STRING(c);
1204 ln = STR_LEN(c);
1205 lnc = (I32) ln;
d8093b23 1206 c1 = *(U8*)m;
6eb5f6b9
JH
1207 c2 = PL_fold_locale[c1];
1208 do_exactf:
db12adc6 1209 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1210
3b0527fe 1211 if (!reginfo && e < s)
6eb5f6b9 1212 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1213
60a8b682
JH
1214 /* The idea in the EXACTF* cases is to first find the
1215 * first character of the EXACTF* node and then, if
1216 * necessary, case-insensitively compare the full
1217 * text of the node. The c1 and c2 are the first
1218 * characters (though in Unicode it gets a bit
1219 * more complicated because there are more cases
7f16dd3d
JH
1220 * than just upper and lower: one needs to use
1221 * the so-called folding case for case-insensitive
1222 * matching (called "loose matching" in Unicode).
1223 * ibcmp_utf8() will do just that. */
60a8b682 1224
1aa99e6b 1225 if (do_utf8) {
575cac57 1226 UV c, f;
89ebb4a3 1227 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1228 STRLEN len, foldlen;
4ad0818d 1229 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1230 if (c1 == c2) {
5dab1207
NIS
1231 /* Upper and lower of 1st char are equal -
1232 * probably not a "letter". */
1aa99e6b 1233 while (s <= e) {
89ebb4a3 1234 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1235 uniflags);
4cadc6a9 1236 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1237 }
09091399
JH
1238 }
1239 else {
1aa99e6b 1240 while (s <= e) {
89ebb4a3 1241 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1242 uniflags);
80aecb99 1243
60a8b682 1244 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1245 * Note that not all the possible combinations
1246 * are handled here: some of them are handled
1247 * by the standard folding rules, and some of
1248 * them (the character class or ANYOF cases)
1249 * are handled during compiletime in
1250 * regexec.c:S_regclass(). */
880bd946
JH
1251 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1252 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1253 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1254
4cadc6a9 1255 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1256 }
09091399 1257 }
1aa99e6b
IH
1258 }
1259 else {
1260 if (c1 == c2)
4cadc6a9 1261 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1262 else
4cadc6a9 1263 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1264 }
1265 break;
bbce6d69 1266 case BOUNDL:
3280af22 1267 PL_reg_flags |= RF_tainted;
bbce6d69 1268 /* FALL THROUGH */
a0d0e21e 1269 case BOUND:
ffc61ed2 1270 if (do_utf8) {
12d33761 1271 if (s == PL_bostr)
ffc61ed2
JH
1272 tmp = '\n';
1273 else {
6136c704 1274 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1275 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1276 }
1277 tmp = ((OP(c) == BOUND ?
9041c2e3 1278 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1279 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1280 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1281 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1282 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1283 isALNUM_LC_utf8((U8*)s)))
1284 {
1285 tmp = !tmp;
4cadc6a9 1286 REXEC_FBC_TRYIT;
a687059c 1287 }
4cadc6a9 1288 );
a0d0e21e 1289 }
667bb95a 1290 else {
12d33761 1291 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1292 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1293 REXEC_FBC_SCAN(
ffc61ed2
JH
1294 if (tmp ==
1295 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1296 tmp = !tmp;
4cadc6a9 1297 REXEC_FBC_TRYIT;
a0ed51b3 1298 }
4cadc6a9 1299 );
a0ed51b3 1300 }
3b0527fe 1301 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1302 goto got_it;
1303 break;
bbce6d69 1304 case NBOUNDL:
3280af22 1305 PL_reg_flags |= RF_tainted;
bbce6d69 1306 /* FALL THROUGH */
a0d0e21e 1307 case NBOUND:
ffc61ed2 1308 if (do_utf8) {
12d33761 1309 if (s == PL_bostr)
ffc61ed2
JH
1310 tmp = '\n';
1311 else {
6136c704 1312 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1313 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1314 }
1315 tmp = ((OP(c) == NBOUND ?
9041c2e3 1316 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1317 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1318 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1319 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1320 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1321 isALNUM_LC_utf8((U8*)s)))
1322 tmp = !tmp;
4cadc6a9
YO
1323 else REXEC_FBC_TRYIT;
1324 );
a0d0e21e 1325 }
667bb95a 1326 else {
12d33761 1327 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1328 tmp = ((OP(c) == NBOUND ?
1329 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1330 REXEC_FBC_SCAN(
ffc61ed2
JH
1331 if (tmp ==
1332 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1333 tmp = !tmp;
4cadc6a9
YO
1334 else REXEC_FBC_TRYIT;
1335 );
a0ed51b3 1336 }
3b0527fe 1337 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1338 goto got_it;
1339 break;
a0d0e21e 1340 case ALNUM:
4cadc6a9
YO
1341 REXEC_FBC_CSCAN_PRELOAD(
1342 LOAD_UTF8_CHARCLASS_ALNUM(),
1343 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1344 isALNUM(*s)
1345 );
bbce6d69 1346 case ALNUML:
4cadc6a9
YO
1347 REXEC_FBC_CSCAN_TAINT(
1348 isALNUM_LC_utf8((U8*)s),
1349 isALNUM_LC(*s)
1350 );
a0d0e21e 1351 case NALNUM:
4cadc6a9
YO
1352 REXEC_FBC_CSCAN_PRELOAD(
1353 LOAD_UTF8_CHARCLASS_ALNUM(),
1354 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1355 !isALNUM(*s)
1356 );
bbce6d69 1357 case NALNUML:
4cadc6a9
YO
1358 REXEC_FBC_CSCAN_TAINT(
1359 !isALNUM_LC_utf8((U8*)s),
1360 !isALNUM_LC(*s)
1361 );
a0d0e21e 1362 case SPACE:
4cadc6a9
YO
1363 REXEC_FBC_CSCAN_PRELOAD(
1364 LOAD_UTF8_CHARCLASS_SPACE(),
1365 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1366 isSPACE(*s)
1367 );
bbce6d69 1368 case SPACEL:
4cadc6a9
YO
1369 REXEC_FBC_CSCAN_TAINT(
1370 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1371 isSPACE_LC(*s)
1372 );
a0d0e21e 1373 case NSPACE:
4cadc6a9
YO
1374 REXEC_FBC_CSCAN_PRELOAD(
1375 LOAD_UTF8_CHARCLASS_SPACE(),
1376 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1377 !isSPACE(*s)
1378 );
bbce6d69 1379 case NSPACEL:
4cadc6a9
YO
1380 REXEC_FBC_CSCAN_TAINT(
1381 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1382 !isSPACE_LC(*s)
1383 );
a0d0e21e 1384 case DIGIT:
4cadc6a9
YO
1385 REXEC_FBC_CSCAN_PRELOAD(
1386 LOAD_UTF8_CHARCLASS_DIGIT(),
1387 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1388 isDIGIT(*s)
1389 );
b8c5462f 1390 case DIGITL:
4cadc6a9
YO
1391 REXEC_FBC_CSCAN_TAINT(
1392 isDIGIT_LC_utf8((U8*)s),
1393 isDIGIT_LC(*s)
1394 );
a0d0e21e 1395 case NDIGIT:
4cadc6a9
YO
1396 REXEC_FBC_CSCAN_PRELOAD(
1397 LOAD_UTF8_CHARCLASS_DIGIT(),
1398 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1399 !isDIGIT(*s)
1400 );
b8c5462f 1401 case NDIGITL:
4cadc6a9
YO
1402 REXEC_FBC_CSCAN_TAINT(
1403 !isDIGIT_LC_utf8((U8*)s),
1404 !isDIGIT_LC(*s)
1405 );
1de06328
YO
1406 case AHOCORASICKC:
1407 case AHOCORASICK:
07be1b83
YO
1408 {
1409 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1410 trie_type = do_utf8 ?
1411 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1412 : trie_plain;
1413 /* what trie are we using right now */
1414 reg_ac_data *aho
1415 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1416 reg_trie_data *trie=aho->trie;
1417
1418 const char *last_start = strend - trie->minlen;
6148ee25 1419#ifdef DEBUGGING
07be1b83 1420 const char *real_start = s;
6148ee25 1421#endif
07be1b83 1422 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1423 SV *sv_points;
1424 U8 **points; /* map of where we were in the input string
786e8c11 1425 when reading a given char. For ASCII this
be8e71aa
YO
1426 is unnecessary overhead as the relationship
1427 is always 1:1, but for unicode, especially
1428 case folded unicode this is not true. */
f9e705e8 1429 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1430 U8 *bitmap=NULL;
1431
07be1b83
YO
1432
1433 GET_RE_DEBUG_FLAGS_DECL;
1434
be8e71aa
YO
1435 /* We can't just allocate points here. We need to wrap it in
1436 * an SV so it gets freed properly if there is a croak while
1437 * running the match */
1438 ENTER;
1439 SAVETMPS;
1440 sv_points=newSV(maxlen * sizeof(U8 *));
1441 SvCUR_set(sv_points,
1442 maxlen * sizeof(U8 *));
1443 SvPOK_on(sv_points);
1444 sv_2mortal(sv_points);
1445 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1446 if ( trie_type != trie_utf8_fold
1447 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1448 {
786e8c11
YO
1449 if (trie->bitmap)
1450 bitmap=(U8*)trie->bitmap;
1451 else
1452 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1453 }
786e8c11
YO
1454 /* this is the Aho-Corasick algorithm modified a touch
1455 to include special handling for long "unknown char"
1456 sequences. The basic idea being that we use AC as long
1457 as we are dealing with a possible matching char, when
1458 we encounter an unknown char (and we have not encountered
1459 an accepting state) we scan forward until we find a legal
1460 starting char.
1461 AC matching is basically that of trie matching, except
1462 that when we encounter a failing transition, we fall back
1463 to the current states "fail state", and try the current char
1464 again, a process we repeat until we reach the root state,
1465 state 1, or a legal transition. If we fail on the root state
1466 then we can either terminate if we have reached an accepting
1467 state previously, or restart the entire process from the beginning
1468 if we have not.
1469
1470 */
07be1b83
YO
1471 while (s <= last_start) {
1472 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1473 U8 *uc = (U8*)s;
1474 U16 charid = 0;
1475 U32 base = 1;
1476 U32 state = 1;
1477 UV uvc = 0;
1478 STRLEN len = 0;
1479 STRLEN foldlen = 0;
1480 U8 *uscan = (U8*)NULL;
1481 U8 *leftmost = NULL;
786e8c11
YO
1482#ifdef DEBUGGING
1483 U32 accepted_word= 0;
1484#endif
07be1b83
YO
1485 U32 pointpos = 0;
1486
1487 while ( state && uc <= (U8*)strend ) {
1488 int failed=0;
786e8c11
YO
1489 U32 word = aho->states[ state ].wordnum;
1490
1de06328
YO
1491 if( state==1 ) {
1492 if ( bitmap ) {
1493 DEBUG_TRIE_EXECUTE_r(
1494 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1495 dump_exec_pos( (char *)uc, c, strend, real_start,
1496 (char *)uc, do_utf8 );
1497 PerlIO_printf( Perl_debug_log,
1498 " Scanning for legal start char...\n");
1499 }
1500 );
1501 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1502 uc++;
786e8c11 1503 }
1de06328 1504 s= (char *)uc;
786e8c11 1505 }
786e8c11
YO
1506 if (uc >(U8*)last_start) break;
1507 }
1508
1509 if ( word ) {
1510 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1511 if (!leftmost || lpos < leftmost) {
1512 DEBUG_r(accepted_word=word);
07be1b83 1513 leftmost= lpos;
786e8c11 1514 }
07be1b83 1515 if (base==0) break;
786e8c11 1516
07be1b83
YO
1517 }
1518 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1519 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1520 uvc, charid, foldlen, foldbuf, uniflags);
786e8c11
YO
1521 DEBUG_TRIE_EXECUTE_r({
1522 dump_exec_pos( (char *)uc, c, strend, real_start,
1523 s, do_utf8 );
07be1b83 1524 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1525 " Charid:%3u CP:%4"UVxf" ",
1526 charid, uvc);
1527 });
07be1b83
YO
1528
1529 do {
6148ee25 1530#ifdef DEBUGGING
786e8c11 1531 word = aho->states[ state ].wordnum;
6148ee25 1532#endif
07be1b83
YO
1533 base = aho->states[ state ].trans.base;
1534
786e8c11
YO
1535 DEBUG_TRIE_EXECUTE_r({
1536 if (failed)
1537 dump_exec_pos( (char *)uc, c, strend, real_start,
1538 s, do_utf8 );
07be1b83 1539 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1540 "%sState: %4"UVxf", word=%"UVxf,
1541 failed ? " Fail transition to " : "",
1542 (UV)state, (UV)word);
1543 });
07be1b83
YO
1544 if ( base ) {
1545 U32 tmp;
1546 if (charid &&
1547 (base + charid > trie->uniquecharcount )
1548 && (base + charid - 1 - trie->uniquecharcount
1549 < trie->lasttrans)
1550 && trie->trans[base + charid - 1 -
1551 trie->uniquecharcount].check == state
1552 && (tmp=trie->trans[base + charid - 1 -
1553 trie->uniquecharcount ].next))
1554 {
786e8c11
YO
1555 DEBUG_TRIE_EXECUTE_r(
1556 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1557 state = tmp;
1558 break;
1559 }
1560 else {
786e8c11
YO
1561 DEBUG_TRIE_EXECUTE_r(
1562 PerlIO_printf( Perl_debug_log," - fail\n"));
1563 failed = 1;
1564 state = aho->fail[state];
07be1b83
YO
1565 }
1566 }
1567 else {
1568 /* we must be accepting here */
786e8c11
YO
1569 DEBUG_TRIE_EXECUTE_r(
1570 PerlIO_printf( Perl_debug_log," - accepting\n"));
1571 failed = 1;
07be1b83
YO
1572 break;
1573 }
1574 } while(state);
786e8c11 1575 uc += len;
07be1b83
YO
1576 if (failed) {
1577 if (leftmost)
1578 break;
786e8c11 1579 if (!state) state = 1;
07be1b83
YO
1580 }
1581 }
1582 if ( aho->states[ state ].wordnum ) {
1583 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1584 if (!leftmost || lpos < leftmost) {
1585 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1586 leftmost = lpos;
786e8c11 1587 }
07be1b83 1588 }
07be1b83
YO
1589 if (leftmost) {
1590 s = (char*)leftmost;
786e8c11
YO
1591 DEBUG_TRIE_EXECUTE_r({
1592 PerlIO_printf(
1593 Perl_debug_log,"Matches word #%"UVxf" at position %d. Trying full pattern...\n",
1594 (UV)accepted_word, s - real_start
1595 );
1596 });
be8e71aa
YO
1597 if (!reginfo || regtry(reginfo, s)) {
1598 FREETMPS;
1599 LEAVE;
07be1b83 1600 goto got_it;
be8e71aa 1601 }
07be1b83 1602 s = HOPc(s,1);
786e8c11
YO
1603 DEBUG_TRIE_EXECUTE_r({
1604 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1605 });
07be1b83 1606 } else {
786e8c11
YO
1607 DEBUG_TRIE_EXECUTE_r(
1608 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1609 break;
1610 }
1611 }
be8e71aa
YO
1612 FREETMPS;
1613 LEAVE;
07be1b83
YO
1614 }
1615 break;
b3c9acc1 1616 default:
3c3eec57
GS
1617 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1618 break;
d6a28714 1619 }
6eb5f6b9
JH
1620 return 0;
1621 got_it:
1622 return s;
1623}
1624
1625/*
1626 - regexec_flags - match a regexp against a string
1627 */
1628I32
1629Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1630 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1631/* strend: pointer to null at end of string */
1632/* strbeg: real beginning of string */
1633/* minend: end of match must be >=minend after stringarg. */
1634/* data: May be used for some additional optimizations. */
1635/* nosave: For optimizations. */
1636{
97aff369 1637 dVAR;
6eb5f6b9
JH
1638 register char *s;
1639 register regnode *c;
1640 register char *startpos = stringarg;
6eb5f6b9
JH
1641 I32 minlen; /* must match at least this many chars */
1642 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1643 I32 end_shift = 0; /* Same for the end. */ /* CC */
1644 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1645 char *scream_olds = NULL;
3dab1dad 1646 SV* const oreplsv = GvSV(PL_replgv);
f9f4320a 1647 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1648 I32 multiline;
0df25f3d 1649
3b0527fe 1650 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1651
1652 GET_RE_DEBUG_FLAGS_DECL;
1653
9d4ba2ae 1654 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
2757e526 1662 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1663 reginfo.prog = prog;
2757e526 1664
bac06658 1665 RX_MATCH_UTF8_set(prog, do_utf8);
1de06328
YO
1666 DEBUG_EXECUTE_r(
1667 debug_start_match(prog, do_utf8, startpos, strend,
1668 "Matching");
1669 );
bac06658 1670
6eb5f6b9 1671 minlen = prog->minlen;
1de06328
YO
1672
1673 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1675 "String too short [regexec_flags]...\n"));
1676 goto phooey;
1aa99e6b 1677 }
6eb5f6b9 1678
1de06328 1679
6eb5f6b9
JH
1680 /* Check validity of program. */
1681 if (UCHARAT(prog->program) != REG_MAGIC) {
1682 Perl_croak(aTHX_ "corrupted regexp program");
1683 }
1684
1685 PL_reg_flags = 0;
1686 PL_reg_eval_set = 0;
1687 PL_reg_maxiter = 0;
1688
1689 if (prog->reganch & ROPT_UTF8)
1690 PL_reg_flags |= RF_utf8;
1691
1692 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1693 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1694 PL_bostr = strbeg;
3b0527fe 1695 reginfo.sv = sv;
6eb5f6b9
JH
1696
1697 /* Mark end of line for $ (and such) */
1698 PL_regeol = strend;
1699
1700 /* see how far we have to get to not match where we matched before */
3b0527fe 1701 reginfo.till = startpos+minend;
6eb5f6b9 1702
6eb5f6b9
JH
1703 /* If there is a "must appear" string, look for it. */
1704 s = startpos;
1705
3b0527fe 1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1707 MAGIC *mg;
1708
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1710 reginfo.ganch = startpos;
6eb5f6b9
JH
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && SvMAGIC(sv)
14befaf4
DM
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
3b0527fe 1715 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1716 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1717 if (s > reginfo.ganch)
6eb5f6b9 1718 goto phooey;
3b0527fe 1719 s = reginfo.ganch;
6eb5f6b9
JH
1720 }
1721 }
1722 else /* pos() not defined */
3b0527fe 1723 reginfo.ganch = strbeg;
6eb5f6b9
JH
1724 }
1725
a0714e2c 1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1727 re_scream_pos_data d;
1728
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1732 if (!s) {
a3621e74 1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1734 goto phooey; /* not present */
3fa9c3d7 1735 }
6eb5f6b9
JH
1736 }
1737
1de06328 1738
6eb5f6b9
JH
1739
1740 /* Simplest case: anchored match need be tried only once. */
1741 /* [unless only anchor is BOL and multiline is set] */
1742 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1743 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1744 goto got_it;
7fba1cd6 1745 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1746 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1747 {
1748 char *end;
1749
1750 if (minlen)
1751 dontbother = minlen - 1;
1aa99e6b 1752 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1753 /* for multiline we only have to try after newlines */
33b8afdf 1754 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1755 if (s == startpos)
1756 goto after_try;
1757 while (1) {
3b0527fe 1758 if (regtry(&reginfo, s))
6eb5f6b9
JH
1759 goto got_it;
1760 after_try:
1761 if (s >= end)
1762 goto phooey;
1763 if (prog->reganch & RE_USE_INTUIT) {
1764 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1765 if (!s)
1766 goto phooey;
1767 }
1768 else
1769 s++;
1770 }
1771 } else {
1772 if (s > startpos)
1773 s--;
1774 while (s < end) {
1775 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1776 if (regtry(&reginfo, s))
6eb5f6b9
JH
1777 goto got_it;
1778 }
1779 }
1780 }
1781 }
1782 goto phooey;
f9f4320a
YO
1783 } else if (ROPT_GPOS_CHECK == (prog->reganch & ROPT_GPOS_CHECK))
1784 {
1785 /* the warning about reginfo.ganch being used without intialization
1786 is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN
1787 and we only enter this block when the same bit is set. */
3b0527fe 1788 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1789 goto got_it;
1790 goto phooey;
1791 }
1792
1793 /* Messy cases: unanchored match. */
33b8afdf 1794 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1795 /* we have /x+whatever/ */
1796 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1797 char ch;
bf93d4cc
GS
1798#ifdef DEBUGGING
1799 int did_match = 0;
1800#endif
33b8afdf
JH
1801 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1802 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1803 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1804
1aa99e6b 1805 if (do_utf8) {
4cadc6a9 1806 REXEC_FBC_SCAN(
6eb5f6b9 1807 if (*s == ch) {
a3621e74 1808 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1809 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1810 s += UTF8SKIP(s);
1811 while (s < strend && *s == ch)
1812 s += UTF8SKIP(s);
1813 }
4cadc6a9 1814 );
6eb5f6b9
JH
1815 }
1816 else {
4cadc6a9 1817 REXEC_FBC_SCAN(
6eb5f6b9 1818 if (*s == ch) {
a3621e74 1819 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1820 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1821 s++;
1822 while (s < strend && *s == ch)
1823 s++;
1824 }
4cadc6a9 1825 );
6eb5f6b9 1826 }
a3621e74 1827 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1828 PerlIO_printf(Perl_debug_log,
b7953727
JH
1829 "Did not find anchored character...\n")
1830 );
6eb5f6b9 1831 }
a0714e2c
SS
1832 else if (prog->anchored_substr != NULL
1833 || prog->anchored_utf8 != NULL
1834 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1835 && prog->float_max_offset < strend - s)) {
1836 SV *must;
1837 I32 back_max;
1838 I32 back_min;
1839 char *last;
6eb5f6b9 1840 char *last1; /* Last position checked before */
bf93d4cc
GS
1841#ifdef DEBUGGING
1842 int did_match = 0;
1843#endif
33b8afdf
JH
1844 if (prog->anchored_substr || prog->anchored_utf8) {
1845 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1846 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1847 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1848 back_max = back_min = prog->anchored_offset;
1849 } else {
1850 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1851 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1852 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1853 back_max = prog->float_max_offset;
1854 back_min = prog->float_min_offset;
1855 }
1de06328
YO
1856
1857
33b8afdf
JH
1858 if (must == &PL_sv_undef)
1859 /* could not downgrade utf8 check substring, so must fail */
1860 goto phooey;
1861
1de06328
YO
1862 if (back_min<0) {
1863 last = strend;
1864 } else {
1865 last = HOP3c(strend, /* Cannot start after this */
1866 -(I32)(CHR_SVLEN(must)
1867 - (SvTAIL(must) != 0) + back_min), strbeg);
1868 }
6eb5f6b9
JH
1869 if (s > PL_bostr)
1870 last1 = HOPc(s, -1);
1871 else
1872 last1 = s - 1; /* bogus */
1873
a0288114 1874 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1875 check_substr==must. */
1876 scream_pos = -1;
1877 dontbother = end_shift;
1878 strend = HOPc(strend, -dontbother);
1879 while ( (s <= last) &&
9041c2e3 1880 ((flags & REXEC_SCREAM)
1de06328 1881 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 1882 end_shift, &scream_pos, 0))
1de06328 1883 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 1884 (unsigned char*)strend, must,
7fba1cd6 1885 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1886 /* we may be pointing at the wrong string */
1887 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1888 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1889 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1890 if (HOPc(s, -back_max) > last1) {
1891 last1 = HOPc(s, -back_min);
1892 s = HOPc(s, -back_max);
1893 }
1894 else {
52657f30 1895 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1896
1897 last1 = HOPc(s, -back_min);
52657f30 1898 s = t;
6eb5f6b9 1899 }
1aa99e6b 1900 if (do_utf8) {
6eb5f6b9 1901 while (s <= last1) {
3b0527fe 1902 if (regtry(&reginfo, s))
6eb5f6b9
JH
1903 goto got_it;
1904 s += UTF8SKIP(s);
1905 }
1906 }
1907 else {
1908 while (s <= last1) {
3b0527fe 1909 if (regtry(&reginfo, s))
6eb5f6b9
JH
1910 goto got_it;
1911 s++;
1912 }
1913 }
1914 }
ab3bbdeb
YO
1915 DEBUG_EXECUTE_r(if (!did_match) {
1916 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1917 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1918 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1919 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1920 ? "anchored" : "floating"),
ab3bbdeb
YO
1921 quoted, RE_SV_TAIL(must));
1922 });
6eb5f6b9
JH
1923 goto phooey;
1924 }
786e8c11 1925 else if ( (c = prog->regstclass) ) {
f14c76ed 1926 if (minlen) {
be8e71aa 1927 const OPCODE op = OP(prog->regstclass);
66e933ab 1928 /* don't bother with what can't match */
786e8c11 1929 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
1930 strend = HOPc(strend, -(minlen - 1));
1931 }
a3621e74 1932 DEBUG_EXECUTE_r({
be8e71aa 1933 SV * const prop = sv_newmortal();
32fc9b6a 1934 regprop(prog, prop, c);
0df25f3d 1935 {
ab3bbdeb
YO
1936 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1937 s,strend-s,60);
0df25f3d 1938 PerlIO_printf(Perl_debug_log,
ab3bbdeb 1939 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 1940 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 1941 quoted, (int)(strend - s));
0df25f3d 1942 }
ffc61ed2 1943 });
3b0527fe 1944 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1945 goto got_it;
07be1b83 1946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1947 }
1948 else {
1949 dontbother = 0;
a0714e2c 1950 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1951 /* Trim the end. */
d6a28714 1952 char *last;
33b8afdf
JH
1953 SV* float_real;
1954
1955 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1956 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1957 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1958
1959 if (flags & REXEC_SCREAM) {
33b8afdf 1960 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1961 end_shift, &scream_pos, 1); /* last one */
1962 if (!last)
ffc61ed2 1963 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1964 /* we may be pointing at the wrong string */
1965 else if (RX_MATCH_COPIED(prog))
3f7c398e 1966 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1967 }
d6a28714
JH
1968 else {
1969 STRLEN len;
cfd0369c 1970 const char * const little = SvPV_const(float_real, len);
d6a28714 1971
33b8afdf 1972 if (SvTAIL(float_real)) {
d6a28714
JH
1973 if (memEQ(strend - len + 1, little, len - 1))
1974 last = strend - len + 1;
7fba1cd6 1975 else if (!multiline)
9041c2e3 1976 last = memEQ(strend - len, little, len)
bd61b366 1977 ? strend - len : NULL;
b8c5462f 1978 else
d6a28714
JH
1979 goto find_last;
1980 } else {
1981 find_last:
9041c2e3 1982 if (len)
d6a28714 1983 last = rninstr(s, strend, little, little + len);
b8c5462f 1984 else
a0288114 1985 last = strend; /* matching "$" */
b8c5462f 1986 }
b8c5462f 1987 }
bf93d4cc 1988 if (last == NULL) {
6bda09f9
YO
1989 DEBUG_EXECUTE_r(
1990 PerlIO_printf(Perl_debug_log,
1991 "%sCan't trim the tail, match fails (should not happen)%s\n",
1992 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1993 goto phooey; /* Should not happen! */
1994 }
d6a28714
JH
1995 dontbother = strend - last + prog->float_min_offset;
1996 }
1997 if (minlen && (dontbother < minlen))
1998 dontbother = minlen - 1;
1999 strend -= dontbother; /* this one's always in bytes! */
2000 /* We don't know much -- general case. */
1aa99e6b 2001 if (do_utf8) {
d6a28714 2002 for (;;) {
3b0527fe 2003 if (regtry(&reginfo, s))
d6a28714
JH
2004 goto got_it;
2005 if (s >= strend)
2006 break;
b8c5462f 2007 s += UTF8SKIP(s);
d6a28714
JH
2008 };
2009 }
2010 else {
2011 do {
3b0527fe 2012 if (regtry(&reginfo, s))
d6a28714
JH
2013 goto got_it;
2014 } while (s++ < strend);
2015 }
2016 }
2017
2018 /* Failure. */
2019 goto phooey;
2020
2021got_it:
2022 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023
2024 if (PL_reg_eval_set) {
2025 /* Preserve the current value of $^R */
2026 if (oreplsv != GvSV(PL_replgv))
2027 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2028 restored, the value remains
2029 the same. */
4f639d21 2030 restore_pos(aTHX_ prog);
d6a28714 2031 }
81714fb9
YO
2032 if (prog->paren_names)
2033 (void)hv_iterinit(prog->paren_names);
d6a28714
JH
2034
2035 /* make sure $`, $&, $', and $digit will work later */
2036 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2037 RX_MATCH_COPY_FREE(prog);
d6a28714 2038 if (flags & REXEC_COPY_STR) {
be8e71aa 2039 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2040#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2041 if ((SvIsCOW(sv)
2042 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2043 if (DEBUG_C_TEST) {
2044 PerlIO_printf(Perl_debug_log,
2045 "Copy on write: regexp capture, type %d\n",
2046 (int) SvTYPE(sv));
2047 }
2048 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2049 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2050 assert (SvPOKp(prog->saved_copy));
2051 } else
2052#endif
2053 {
2054 RX_MATCH_COPIED_on(prog);
2055 s = savepvn(strbeg, i);
2056 prog->subbeg = s;
2057 }
d6a28714 2058 prog->sublen = i;
d6a28714
JH
2059 }
2060 else {
2061 prog->subbeg = strbeg;
2062 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2063 }
2064 }
9041c2e3 2065
d6a28714
JH
2066 return 1;
2067
2068phooey:
a3621e74 2069 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2070 PL_colors[4], PL_colors[5]));
d6a28714 2071 if (PL_reg_eval_set)
4f639d21 2072 restore_pos(aTHX_ prog);
d6a28714
JH
2073 return 0;
2074}
2075
6bda09f9 2076
d6a28714
JH
2077/*
2078 - regtry - try match at specific point
2079 */
2080STATIC I32 /* 0 failure, 1 success */
3b0527fe 2081S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2082{
97aff369 2083 dVAR;
d6a28714
JH
2084 register I32 *sp;
2085 register I32 *ep;
2086 CHECKPOINT lastcp;
3b0527fe 2087 regexp *prog = reginfo->prog;
a3621e74 2088 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2089
2090 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2091 MAGIC *mg;
2092
2093 PL_reg_eval_set = RS_init;
a3621e74 2094 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2095 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2096 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2097 ));
ea8d6ae1 2098 SAVESTACK_CXPOS();
d6a28714
JH
2099 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2100 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2101 SAVETMPS;
2102 /* Apparently this is not needed, judging by wantarray. */
e8347627 2103 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2104 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2105
3b0527fe 2106 if (reginfo->sv) {
d6a28714 2107 /* Make $_ available to executed code. */
3b0527fe 2108 if (reginfo->sv != DEFSV) {
59f00321 2109 SAVE_DEFSV;
3b0527fe 2110 DEFSV = reginfo->sv;
b8c5462f 2111 }
d6a28714 2112
3b0527fe
DM
2113 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2114 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2115 /* prepare for quick setting of pos */
d300d9fa
NC
2116#ifdef PERL_OLD_COPY_ON_WRITE
2117 if (SvIsCOW(sv))
2118 sv_force_normal_flags(sv, 0);
2119#endif
3dab1dad 2120 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2121 &PL_vtbl_mglob, NULL, 0);
d6a28714 2122 mg->mg_len = -1;
b8c5462f 2123 }
d6a28714
JH
2124 PL_reg_magic = mg;
2125 PL_reg_oldpos = mg->mg_len;
4f639d21 2126 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2127 }
09687e5a 2128 if (!PL_reg_curpm) {
a02a5408 2129 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2130#ifdef USE_ITHREADS
2131 {
be8e71aa 2132 SV* const repointer = newSViv(0);
577e12cc 2133 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2134 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2135 av_push(PL_regex_padav,repointer);
2136 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2137 PL_regex_pad = AvARRAY(PL_regex_padav);
2138 }
2139#endif
2140 }
aaa362c4 2141 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2142 PL_reg_oldcurpm = PL_curpm;
2143 PL_curpm = PL_reg_curpm;
2144 if (RX_MATCH_COPIED(prog)) {
2145 /* Here is a serious problem: we cannot rewrite subbeg,
2146 since it may be needed if this match fails. Thus
2147 $` inside (?{}) could fail... */
2148 PL_reg_oldsaved = prog->subbeg;
2149 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2150#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2151 PL_nrs = prog->saved_copy;
2152#endif
d6a28714
JH
2153 RX_MATCH_COPIED_off(prog);
2154 }
2155 else
bd61b366 2156 PL_reg_oldsaved = NULL;
d6a28714
JH
2157 prog->subbeg = PL_bostr;
2158 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 }
6bda09f9 2160 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
973dddac 2161 prog->startp[0] = startpos - PL_bostr;
d6a28714 2162 PL_reginput = startpos;
d6a28714 2163 PL_reglastparen = &prog->lastparen;
a01268b5 2164 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2165 prog->lastparen = 0;
03994de8 2166 prog->lastcloseparen = 0;
d6a28714 2167 PL_regsize = 0;
6bda09f9
YO
2168 PL_regstartp = prog->startp;
2169 PL_regendp = prog->endp;
d6a28714
JH
2170 if (PL_reg_start_tmpl <= prog->nparens) {
2171 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2172 if(PL_reg_start_tmp)
2173 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 else
a02a5408 2175 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2176 }
2177
2178 /* XXXX What this code is doing here?!!! There should be no need
2179 to do this again and again, PL_reglastparen should take care of
3dd2943c 2180 this! --ilya*/
dafc8851
JH
2181
2182 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2183 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2184 * PL_reglastparen), is not needed at all by the test suite
2185 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2186 * enough, for building DynaLoader, or otherwise this
2187 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2188 * will happen. Meanwhile, this code *is* needed for the
2189 * above-mentioned test suite tests to succeed. The common theme
2190 * on those tests seems to be returning null fields from matches.
2191 * --jhi */
dafc8851 2192#if 1
d6a28714
JH
2193 sp = prog->startp;
2194 ep = prog->endp;
2195 if (prog->nparens) {
097eb12c 2196 register I32 i;
eb160463 2197 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2198 *++sp = -1;
2199 *++ep = -1;
2200 }
2201 }
dafc8851 2202#endif
02db2b7b 2203 REGCP_SET(lastcp);
3b0527fe 2204 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2205 prog->endp[0] = PL_reginput - PL_bostr;
2206 return 1;
2207 }
02db2b7b 2208 REGCP_UNWIND(lastcp);
d6a28714
JH
2209 return 0;
2210}
2211
02db2b7b 2212
8ba1375e
MJD
2213#define sayYES goto yes
2214#define sayNO goto no
262b90c4 2215#define sayNO_SILENT goto no_silent
8ba1375e 2216
f9f4320a
YO
2217/* we dont use STMT_START/END here because it leads to
2218 "unreachable code" warnings, which are bogus, but distracting. */
2219#define CACHEsayNO \
c476f425
DM
2220 if (ST.cache_mask) \
2221 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2222 sayNO
3298f257 2223
a3621e74 2224/* this is used to determine how far from the left messages like
265c4333
YO
2225 'failed...' are printed. It should be set such that messages
2226 are inline with the regop output that created them.
a3621e74 2227*/
265c4333 2228#define REPORT_CODE_OFF 32
a3621e74
YO
2229
2230
2231/* Make sure there is a test for this +1 options in re_tests */
2232#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2233
40a82448
DM
2234#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2235#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2236
86545054
DM
2237#define SLAB_FIRST(s) (&(s)->states[0])
2238#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2239
5d9a96ca
DM
2240/* grab a new slab and return the first slot in it */
2241
2242STATIC regmatch_state *
2243S_push_slab(pTHX)
2244{
54df2634
NC
2245#if PERL_VERSION < 9
2246 dMY_CXT;
2247#endif
5d9a96ca
DM
2248 regmatch_slab *s = PL_regmatch_slab->next;
2249 if (!s) {
2250 Newx(s, 1, regmatch_slab);
2251 s->prev = PL_regmatch_slab;
2252 s->next = NULL;
2253 PL_regmatch_slab->next = s;
2254 }
2255 PL_regmatch_slab = s;
86545054 2256 return SLAB_FIRST(s);
5d9a96ca 2257}
5b47454d 2258
95b24440 2259
40a82448
DM
2260/* push a new state then goto it */
2261
2262#define PUSH_STATE_GOTO(state, node) \
2263 scan = node; \
2264 st->resume_state = state; \
2265 goto push_state;
2266
2267/* push a new state with success backtracking, then goto it */
2268
2269#define PUSH_YES_STATE_GOTO(state, node) \
2270 scan = node; \
2271 st->resume_state = state; \
2272 goto push_yes_state;
2273
aa283a38 2274
aa283a38 2275
d6a28714 2276/*
95b24440 2277
bf1f174e
DM
2278regmatch() - main matching routine
2279
2280This is basically one big switch statement in a loop. We execute an op,
2281set 'next' to point the next op, and continue. If we come to a point which
2282we may need to backtrack to on failure such as (A|B|C), we push a
2283backtrack state onto the backtrack stack. On failure, we pop the top
2284state, and re-enter the loop at the state indicated. If there are no more
2285states to pop, we return failure.
2286
2287Sometimes we also need to backtrack on success; for example /A+/, where
2288after successfully matching one A, we need to go back and try to
2289match another one; similarly for lookahead assertions: if the assertion
2290completes successfully, we backtrack to the state just before the assertion
2291and then carry on. In these cases, the pushed state is marked as
2292'backtrack on success too'. This marking is in fact done by a chain of
2293pointers, each pointing to the previous 'yes' state. On success, we pop to
2294the nearest yes state, discarding any intermediate failure-only states.
2295Sometimes a yes state is pushed just to force some cleanup code to be
2296called at the end of a successful match or submatch; e.g. (??{$re}) uses
2297it to free the inner regex.
2298
2299Note that failure backtracking rewinds the cursor position, while
2300success backtracking leaves it alone.
2301
2302A pattern is complete when the END op is executed, while a subpattern
2303such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2304ops trigger the "pop to last yes state if any, otherwise return true"
2305behaviour.
2306
2307A common convention in this function is to use A and B to refer to the two
2308subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2309the subpattern to be matched possibly multiple times, while B is the entire
2310rest of the pattern. Variable and state names reflect this convention.
2311
2312The states in the main switch are the union of ops and failure/success of
2313substates associated with with that op. For example, IFMATCH is the op
2314that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2315'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2316successfully matched A and IFMATCH_A_fail is a state saying that we have
2317just failed to match A. Resume states always come in pairs. The backtrack
2318state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2319at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2320on success or failure.
2321
2322The struct that holds a backtracking state is actually a big union, with
2323one variant for each major type of op. The variable st points to the
2324top-most backtrack struct. To make the code clearer, within each
2325block of code we #define ST to alias the relevant union.
2326
2327Here's a concrete example of a (vastly oversimplified) IFMATCH
2328implementation:
2329
2330 switch (state) {
2331 ....
2332
2333#define ST st->u.ifmatch
2334
2335 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2336 ST.foo = ...; // some state we wish to save
95b24440 2337 ...
bf1f174e
DM
2338 // push a yes backtrack state with a resume value of
2339 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2340 // first node of A:
2341 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2342 // NOTREACHED
2343
2344 case IFMATCH_A: // we have successfully executed A; now continue with B
2345 next = B;
2346 bar = ST.foo; // do something with the preserved value
2347 break;
2348
2349 case IFMATCH_A_fail: // A failed, so the assertion failed
2350 ...; // do some housekeeping, then ...
2351 sayNO; // propagate the failure
2352
2353#undef ST
95b24440 2354
bf1f174e
DM
2355 ...
2356 }
95b24440 2357
bf1f174e
DM
2358For any old-timers reading this who are familiar with the old recursive
2359approach, the code above is equivalent to:
95b24440 2360
bf1f174e
DM
2361 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2362 {
2363 int foo = ...
95b24440 2364 ...
bf1f174e
DM
2365 if (regmatch(A)) {
2366 next = B;
2367 bar = foo;
2368 break;
95b24440 2369 }
bf1f174e
DM
2370 ...; // do some housekeeping, then ...
2371 sayNO; // propagate the failure
95b24440 2372 }
bf1f174e
DM
2373
2374The topmost backtrack state, pointed to by st, is usually free. If you
2375want to claim it, populate any ST.foo fields in it with values you wish to
2376save, then do one of
2377
2378 PUSH_STATE_GOTO(resume_state, node);
2379 PUSH_YES_STATE_GOTO(resume_state, node);
2380
2381which sets that backtrack state's resume value to 'resume_state', pushes a
2382new free entry to the top of the backtrack stack, then goes to 'node'.
2383On backtracking, the free slot is popped, and the saved state becomes the
2384new free state. An ST.foo field in this new top state can be temporarily
2385accessed to retrieve values, but once the main loop is re-entered, it
2386becomes available for reuse.
2387
2388Note that the depth of the backtrack stack constantly increases during the
2389left-to-right execution of the pattern, rather than going up and down with
2390the pattern nesting. For example the stack is at its maximum at Z at the
2391end of the pattern, rather than at X in the following:
2392
2393 /(((X)+)+)+....(Y)+....Z/
2394
2395The only exceptions to this are lookahead/behind assertions and the cut,
2396(?>A), which pop all the backtrack states associated with A before
2397continuing.
2398
2399Bascktrack state structs are allocated in slabs of about 4K in size.
2400PL_regmatch_state and st always point to the currently active state,
2401and PL_regmatch_slab points to the slab currently containing
2402PL_regmatch_state. The first time regmatch() is called, the first slab is
2403allocated, and is never freed until interpreter destruction. When the slab
2404is full, a new one is allocated and chained to the end. At exit from
2405regmatch(), slabs allocated since entry are freed.
2406
2407*/
95b24440 2408
40a82448 2409
5bc10b2c 2410#define DEBUG_STATE_pp(pp) \
265c4333 2411 DEBUG_STATE_r({ \
5bc10b2c
DM
2412 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2413 PerlIO_printf(Perl_debug_log, \
2414 " %*s"pp" %s\n", \
2415 depth*2, "", \
03363afd 2416 reg_name[st->resume_state] ); \
265c4333 2417 });
5bc10b2c 2418
40a82448 2419
3dab1dad 2420#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2421
3df15adc 2422#ifdef DEBUGGING
5bc10b2c 2423
ab3bbdeb
YO
2424STATIC void
2425S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2426 const char *start, const char *end, const char *blurb)
2427{
2428 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2429 if (!PL_colorset)
2430 reginitcolors();
2431 {
2432 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2433 prog->precomp, prog->prelen, 60);
2434
2435 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2436 start, end - start, 60);
2437
2438 PerlIO_printf(Perl_debug_log,
2439 "%s%s REx%s %s against %s\n",
2440 PL_colors[4], blurb, PL_colors[5], s0, s1);
2441
2442 if (do_utf8||utf8_pat)
1de06328
YO
2443 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2444 utf8_pat ? "pattern" : "",
2445 utf8_pat && do_utf8 ? " and " : "",
2446 do_utf8 ? "string" : ""
ab3bbdeb
YO
2447 );
2448 }
2449}
3df15adc
YO
2450
2451STATIC void
786e8c11
YO
2452S_dump_exec_pos(pTHX_ const char *locinput,
2453 const regnode *scan,
2454 const char *loc_regeol,
2455 const char *loc_bostr,
2456 const char *loc_reg_starttry,
2457 const bool do_utf8)
07be1b83 2458{
786e8c11 2459 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2460 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2461 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2462 /* The part of the string before starttry has one color
2463 (pref0_len chars), between starttry and current
2464 position another one (pref_len - pref0_len chars),
2465 after the current position the third one.
2466 We assume that pref0_len <= pref_len, otherwise we
2467 decrease pref0_len. */
786e8c11
YO
2468 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2469 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2470 int pref0_len;
2471
2472 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2473 pref_len++;
786e8c11
YO
2474 pref0_len = pref_len - (locinput - loc_reg_starttry);
2475 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2476 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2477 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2478 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2479 l--;
2480 if (pref0_len < 0)
2481 pref0_len = 0;
2482 if (pref0_len > pref_len)
2483 pref0_len = pref_len;
2484 {
3df15adc 2485 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2486
ab3bbdeb 2487 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2488 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2489
ab3bbdeb 2490 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2491 (locinput - pref_len + pref0_len),
1de06328 2492 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2493
ab3bbdeb 2494 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2495 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2496
1de06328 2497 const STRLEN tlen=len0+len1+len2;
3df15adc 2498 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2499 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2500 (IV)(locinput - loc_bostr),
07be1b83 2501 len0, s0,
07be1b83 2502 len1, s1,
07be1b83 2503 (docolor ? "" : "> <"),
07be1b83 2504 len2, s2,
f9f4320a 2505 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2506 "");
2507 }
2508}
3df15adc 2509
07be1b83
YO
2510#endif
2511
0a4db386
YO
2512/* reg_check_named_buff_matched()
2513 * Checks to see if a named buffer has matched. The data array of
2514 * buffer numbers corresponding to the buffer is expected to reside
2515 * in the regexp->data->data array in the slot stored in the ARG() of
2516 * node involved. Note that this routine doesn't actually care about the
2517 * name, that information is not preserved from compilation to execution.
2518 * Returns the index of the leftmost defined buffer with the given name
2519 * or 0 if non of the buffers matched.
2520 */
2521STATIC I32
2522S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) {
2523 I32 n;
2524 SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];
2525 I32 *nums=(I32*)SvPVX(sv_dat);
2526 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2527 if ((I32)*PL_reglastparen >= nums[n] &&
2528 PL_regendp[nums[n]] != -1)
2529 {
2530 return nums[n];
2531 }
2532 }
2533 return 0;
2534}
2535
d6a28714 2536STATIC I32 /* 0 failure, 1 success */
3b0527fe 2537S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2538{
54df2634
NC
2539#if PERL_VERSION < 9
2540 dMY_CXT;
2541#endif
27da23d5 2542 dVAR;
95b24440 2543 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2544 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2545
3b0527fe
DM
2546 regexp *rex = reginfo->prog;
2547
5d9a96ca
DM
2548 regmatch_slab *orig_slab;
2549 regmatch_state *orig_state;
a3621e74 2550
5d9a96ca
DM
2551 /* the current state. This is a cached copy of PL_regmatch_state */
2552 register regmatch_state *st;
95b24440 2553
5d9a96ca
DM
2554 /* cache heavy used fields of st in registers */
2555 register regnode *scan;
2556 register regnode *next;
24d3c4a9
DM
2557 register I32 n = 0; /* general value; init to avoid compiler warning */
2558 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2559 register char *locinput = PL_reginput;
5d9a96ca 2560 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2561
b69b0499 2562 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2563 int depth = 0; /* depth of backtrack stack */
6bda09f9 2564 int nochange_depth = 0; /* depth of RECURSE recursion with nochange*/
77cb431f
DM
2565 regmatch_state *yes_state = NULL; /* state to pop to on success of
2566 subpattern */
faec1544 2567 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2568 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2569 U32 state_num;
10edeb5d 2570
24d3c4a9
DM
2571 /* these three flags are set by various ops to signal information to
2572 * the very next op. They have a useful lifetime of exactly one loop
2573 * iteration, and are not preserved or restored by state pushes/pops
2574 */
2575 bool sw = 0; /* the condition value in (?(cond)a|b) */
2576 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2577 int logical = 0; /* the following EVAL is:
2578 0: (?{...})
2579 1: (?(?{...})X|Y)
2580 2: (??{...})
2581 or the following IFMATCH/UNLESSM is:
2582 false: plain (?=foo)
2583 true: used as a condition: (?(?=foo))
2584 */
2585
95b24440 2586#ifdef DEBUGGING
e68ec53f 2587 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2588#endif
2589
5d9a96ca
DM
2590 /* on first ever call to regmatch, allocate first slab */
2591 if (!PL_regmatch_slab) {
2592 Newx(PL_regmatch_slab, 1, regmatch_slab);
2593 PL_regmatch_slab->prev = NULL;
2594 PL_regmatch_slab->next = NULL;
86545054 2595 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2596 }
2597
2598 /* remember current high-water mark for exit */
2599 /* XXX this should be done with SAVE* instead */
2600 orig_slab = PL_regmatch_slab;
2601 orig_state = PL_regmatch_state;
2602
2603 /* grab next free state slot */
2604 st = ++PL_regmatch_state;
86545054 2605 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2606 st = PL_regmatch_state = S_push_slab(aTHX);
2607
d6a28714
JH
2608 /* Note that nextchr is a byte even in UTF */
2609 nextchr = UCHARAT(locinput);
2610 scan = prog;
2611 while (scan != NULL) {
8ba1375e 2612
a3621e74 2613 DEBUG_EXECUTE_r( {
6136c704 2614 SV * const prop = sv_newmortal();
1de06328 2615 regnode *rnext=regnext(scan);
786e8c11 2616 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2617 regprop(rex, prop, scan);
07be1b83
YO
2618
2619 PerlIO_printf(Perl_debug_log,
2620 "%3"IVdf":%*s%s(%"IVdf")\n",
5bc10b2c 2621 (IV)(scan - rex->program), depth*2, "",
07be1b83 2622 SvPVX_const(prop),
1de06328
YO
2623 (PL_regkind[OP(scan)] == END || !rnext) ?
2624 0 : (IV)(rnext - rex->program));
2a782b5b 2625 });
d6a28714
JH
2626
2627 next = scan + NEXT_OFF(scan);
2628 if (next == scan)
2629 next = NULL;
40a82448 2630 state_num = OP(scan);
d6a28714 2631
40a82448
DM
2632 reenter_switch:
2633 switch (state_num) {
d6a28714 2634 case BOL:
7fba1cd6 2635 if (locinput == PL_bostr)
d6a28714 2636 {
3b0527fe 2637 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2638 break;
2639 }
d6a28714
JH
2640 sayNO;
2641 case MBOL:
12d33761
HS
2642 if (locinput == PL_bostr ||
2643 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2644 {
b8c5462f
JH
2645 break;
2646 }
d6a28714
JH
2647 sayNO;
2648 case SBOL:
c2a73568 2649 if (locinput == PL_bostr)
b8c5462f 2650 break;
d6a28714
JH
2651 sayNO;
2652 case GPOS:
3b0527fe 2653 if (locinput == reginfo->ganch)
d6a28714
JH
2654 break;
2655 sayNO;
2656 case EOL:
d6a28714
JH
2657 goto seol;
2658 case MEOL:
d6a28714 2659 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2660 sayNO;
b8c5462f 2661 break;
d6a28714
JH
2662 case SEOL:
2663 seol:
2664 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2665 sayNO;
d6a28714 2666 if (PL_regeol - locinput > 1)
b8c5462f 2667 sayNO;
b8c5462f 2668 break;
d6a28714
JH
2669 case EOS:
2670 if (PL_regeol != locinput)
b8c5462f 2671 sayNO;
d6a28714 2672 break;
ffc61ed2 2673 case SANY:
d6a28714 2674 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2675 sayNO;
f33976b4
DB
2676 if (do_utf8) {
2677 locinput += PL_utf8skip[nextchr];
2678 if (locinput > PL_regeol)
2679 sayNO;
2680 nextchr = UCHARAT(locinput);
2681 }
2682 else
2683 nextchr = UCHARAT(++locinput);
2684 break;
2685 case CANY:
2686 if (!nextchr && locinput >= PL_regeol)
2687 sayNO;
b8c5462f 2688 nextchr = UCHARAT(++locinput);
a0d0e21e 2689 break;
ffc61ed2 2690 case REG_ANY:
1aa99e6b
IH
2691 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2692 sayNO;
2693 if (do_utf8) {
b8c5462f 2694 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2695 if (locinput > PL_regeol)
2696 sayNO;
a0ed51b3 2697 nextchr = UCHARAT(locinput);
a0ed51b3 2698 }
1aa99e6b
IH
2699 else
2700 nextchr = UCHARAT(++locinput);
a0ed51b3 2701 break;
166ba7cd
DM
2702
2703#undef ST
2704#define ST st->u.trie
786e8c11
YO
2705 case TRIEC:
2706 /* In this case the charclass data is available inline so
2707 we can fail fast without a lot of extra overhead.
2708 */
2709 if (scan->flags == EXACT || !do_utf8) {
2710 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
2711 DEBUG_EXECUTE_r(
2712 PerlIO_printf(Perl_debug_log,
2713 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2714 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
2715 );
2716 sayNO_SILENT;
2717 /* NOTREACHED */
2718 }
2719 }
2720 /* FALL THROUGH */
5b47454d 2721 case TRIE:
3dab1dad 2722 {
07be1b83 2723 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2724 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2725 trie_type = do_utf8 ?
2726 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2727 : trie_plain;
2728
2729 /* what trie are we using right now */
be8e71aa 2730 reg_trie_data * const trie
3dab1dad
YO
2731 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2732 U32 state = trie->startstate;
166ba7cd 2733
3dab1dad
YO
2734 if (trie->bitmap && trie_type != trie_utf8_fold &&
2735 !TRIE_BITMAP_TEST(trie,*locinput)
2736 ) {
2737 if (trie->states[ state ].wordnum) {
2738 DEBUG_EXECUTE_r(
2739 PerlIO_printf(Perl_debug_log,
2740 "%*s %smatched empty string...%s\n",
5bc10b2c 2741 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2742 );
2743 break;
2744 } else {
2745 DEBUG_EXECUTE_r(
2746 PerlIO_printf(Perl_debug_log,
786e8c11 2747 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 2748 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
2749 );
2750 sayNO_SILENT;
2751 }
2752 }
166ba7cd 2753
786e8c11
YO
2754 {
2755 U8 *uc = ( U8* )locinput;
2756
2757 STRLEN len = 0;
2758 STRLEN foldlen = 0;
2759 U8 *uscan = (U8*)NULL;
2760 STRLEN bufflen=0;
2761 SV *sv_accept_buff = NULL;
2762 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2763
2764 ST.accepted = 0; /* how many accepting states we have seen */
2765 ST.B = next;
2766 ST.jump = trie->jump;
786e8c11 2767 ST.me = scan;
786e8c11 2768
07be1b83
YO
2769 /*
2770 traverse the TRIE keeping track of all accepting states
2771 we transition through until we get to a failing node.
2772 */
2773
a3621e74 2774 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 2775 U32 base = trie->states[ state ].trans.base;
f9f4320a 2776 UV uvc = 0;
786e8c11
YO
2777 U16 charid;
2778 /* We use charid to hold the wordnum as we don't use it
2779 for charid until after we have done the wordnum logic.
2780 We define an alias just so that the wordnum logic reads
2781 more naturally. */
2782
2783#define got_wordnum charid
2784 got_wordnum = trie->states[ state ].wordnum;
2785
2786 if ( got_wordnum ) {
2787 if ( ! ST.accepted ) {
5b47454d
DM
2788 ENTER;
2789 SAVETMPS;
2790 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2791 sv_accept_buff=newSV(bufflen *
2792 sizeof(reg_trie_accepted) - 1);
786e8c11 2793 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
2794 SvPOK_on(sv_accept_buff);
2795 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2796 SAVETMPS;
2797 ST.accept_buff =
5b47454d
DM
2798 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2799 }
786e8c11 2800 do {
166ba7cd 2801 if (ST.accepted >= bufflen) {
5b47454d 2802 bufflen *= 2;
166ba7cd 2803 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2804 SvGROW(sv_accept_buff,
2805 bufflen * sizeof(reg_trie_accepted));
2806 }
2807 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2808 + sizeof(reg_trie_accepted));
a3621e74 2809
786e8c11
YO
2810
2811 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
2812 ST.accept_buff[ST.accepted].endpos = uc;
2813 ++ST.accepted;
2814 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
2815 }
2816#undef got_wordnum
a3621e74 2817
07be1b83 2818 DEBUG_TRIE_EXECUTE_r({
786e8c11 2819 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 2820 PerlIO_printf( Perl_debug_log,
786e8c11 2821 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 2822 2+depth * 2, "", PL_colors[4],
786e8c11 2823 (UV)state, (UV)ST.accepted );
07be1b83 2824 });
a3621e74
YO
2825
2826 if ( base ) {
4cadc6a9
YO
2827 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2828 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2829
5b47454d
DM
2830 if (charid &&
2831 (base + charid > trie->uniquecharcount )
2832 && (base + charid - 1 - trie->uniquecharcount
2833 < trie->lasttrans)
2834 && trie->trans[base + charid - 1 -
2835 trie->uniquecharcount].check == state)
2836 {
2837 state = trie->trans[base + charid - 1 -
2838 trie->uniquecharcount ].next;
2839 }
2840 else {
2841 state = 0;
2842 }
2843 uc += len;
2844
2845 }
2846 else {
a3621e74
YO
2847 state = 0;
2848 }
2849 DEBUG_TRIE_EXECUTE_r(
e4584336 2850 PerlIO_printf( Perl_debug_log,
786e8c11 2851 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 2852 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2853 );
2854 }
166ba7cd 2855 if (!ST.accepted )
a3621e74 2856 sayNO;
a3621e74 2857
166ba7cd
DM
2858 DEBUG_EXECUTE_r(
2859 PerlIO_printf( Perl_debug_log,
2860 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 2861 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
2862 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2863 );
786e8c11 2864 }}
166ba7cd
DM
2865
2866 /* FALL THROUGH */
2867
2868 case TRIE_next_fail: /* we failed - try next alterative */
2869
2870 if ( ST.accepted == 1 ) {
2871 /* only one choice left - just continue */
2872 DEBUG_EXECUTE_r({
2873 reg_trie_data * const trie
2874 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2875 SV ** const tmp = RX_DEBUG(reginfo->prog)
2876 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2877 : NULL;
2878 PerlIO_printf( Perl_debug_log,
2879 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 2880 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd
DM
2881 ST.accept_buff[ 0 ].wordnum,
2882 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2883 PL_colors[5] );
2884 });
2885 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2886 /* in this case we free tmps/leave before we call regmatch
2887 as we wont be using accept_buff again. */
2888 FREETMPS;
2889 LEAVE;
2890 locinput = PL_reginput;
2891 nextchr = UCHARAT(locinput);
786e8c11 2892
7f69552c 2893 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
786e8c11
YO
2894 scan = ST.B;
2895 else
7f69552c 2896 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
786e8c11 2897
166ba7cd
DM
2898 continue; /* execute rest of RE */
2899 }
2900
2901 if (!ST.accepted-- ) {
2902 FREETMPS;
2903 LEAVE;
2904 sayNO;
2905 }
2906
a3621e74 2907 /*
166ba7cd
DM
2908 There are at least two accepting states left. Presumably
2909 the number of accepting states is going to be low,
2910 typically two. So we simply scan through to find the one
2911 with lowest wordnum. Once we find it, we swap the last
2912 state into its place and decrement the size. We then try to
2913 match the rest of the pattern at the point where the word
2914 ends. If we succeed, control just continues along the
2915 regex; if we fail we return here to try the next accepting
2916 state
2917 */
a3621e74 2918
166ba7cd
DM
2919 {
2920 U32 best = 0;
2921 U32 cur;
2922 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2923 DEBUG_TRIE_EXECUTE_r(
f2278c82 2924 PerlIO_printf( Perl_debug_log,
166ba7cd 2925 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 2926 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
2927 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2928 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2929 );
2930
2931 if (ST.accept_buff[cur].wordnum <
2932 ST.accept_buff[best].wordnum)
2933 best = cur;
a3621e74 2934 }
166ba7cd
DM
2935
2936 DEBUG_EXECUTE_r({
2937 reg_trie_data * const trie
2938 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2939 SV ** const tmp = RX_DEBUG(reginfo->prog)
2940 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2941 : NULL;
7f69552c 2942 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 2943 ST.B :
7f69552c 2944 ST.me + ST.jump[ST.accept_buff[best].wordnum];
265c4333
YO
2945 PerlIO_printf( Perl_debug_log,
2946 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 2947 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 2948 ST.accept_buff[best].wordnum,
265c4333
YO
2949 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2950 REG_NODE_NUM(nextop),
166ba7cd
DM
2951 PL_colors[5] );
2952 });
2953
2954 if ( best<ST.accepted ) {
2955 reg_trie_accepted tmp = ST.accept_buff[ best ];
2956 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2957 ST.accept_buff[ ST.accepted ] = tmp;
2958 best = ST.accepted;
a3621e74 2959 }
166ba7cd 2960 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 2961 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
786e8c11
YO
2962 PUSH_STATE_GOTO(TRIE_next, ST.B);
2963 /* NOTREACHED */
2964 } else {
7f69552c 2965 PUSH_STATE_GOTO(TRIE_next, ST.me + ST.jump[ST.accept_buff[best].wordnum]);
786e8c11
YO
2966 /* NOTREACHED */
2967 }
2968 /* NOTREACHED */
166ba7cd 2969 }
166ba7cd
DM
2970 /* NOTREACHED */
2971
2972#undef ST
2973
95b24440
DM
2974 case EXACT: {
2975 char *s = STRING(scan);
24d3c4a9 2976 ln = STR_LEN(scan);
eb160463 2977 if (do_utf8 != UTF) {
bc517b45 2978 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2979 char *l = locinput;
24d3c4a9 2980 const char * const e = s + ln;
a72c7584 2981
5ff6fc6d
JH
2982 if (do_utf8) {
2983 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2984 while (s < e) {
a3b680e6 2985 STRLEN ulen;
1aa99e6b 2986 if (l >= PL_regeol)
5ff6fc6d
JH
2987 sayNO;
2988 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2989 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2990 uniflags))
5ff6fc6d 2991 sayNO;
bc517b45 2992 l += ulen;
5ff6fc6d 2993 s ++;
1aa99e6b 2994 }
5ff6fc6d
JH
2995 }
2996 else {
2997 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2998 while (s < e) {
a3b680e6 2999 STRLEN ulen;
1aa99e6b
IH
3000 if (l >= PL_regeol)
3001 sayNO;
5ff6fc6d 3002 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3003 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3004 uniflags))
1aa99e6b 3005 sayNO;
bc517b45 3006 s += ulen;
a72c7584 3007 l ++;
1aa99e6b 3008 }
5ff6fc6d 3009 }
1aa99e6b
IH
3010 locinput = l;
3011 nextchr = UCHARAT(locinput);
3012 break;
3013 }
bc517b45 3014 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3015 /* Inline the first character, for speed. */
3016 if (UCHARAT(s) != nextchr)
3017 sayNO;
24d3c4a9 3018 if (PL_regeol - locinput < ln)
d6a28714 3019 sayNO;
24d3c4a9 3020 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3021 sayNO;
24d3c4a9 3022 locinput += ln;
d6a28714
JH
3023 nextchr = UCHARAT(locinput);
3024 break;
95b24440 3025 }
d6a28714 3026 case EXACTFL:
b8c5462f
JH
3027 PL_reg_flags |= RF_tainted;
3028 /* FALL THROUGH */
95b24440 3029 case EXACTF: {
be8e71aa 3030 char * const s = STRING(scan);
24d3c4a9 3031 ln = STR_LEN(scan);
d6a28714 3032
d07ddd77
JH
3033 if (do_utf8 || UTF) {
3034 /* Either target or the pattern are utf8. */
be8e71aa 3035 const char * const l = locinput;
d07ddd77 3036 char *e = PL_regeol;
bc517b45 3037
24d3c4a9 3038 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3039 l, &e, 0, do_utf8)) {
5486206c
JH
3040 /* One more case for the sharp s:
3041 * pack("U0U*", 0xDF) =~ /ss/i,
3042 * the 0xC3 0x9F are the UTF-8
3043 * byte sequence for the U+00DF. */
3044 if (!(do_utf8 &&
3045 toLOWER(s[0]) == 's' &&
24d3c4a9 3046 ln >= 2 &&
5486206c
JH
3047 toLOWER(s[1]) == 's' &&
3048 (U8)l[0] == 0xC3 &&
3049 e - l >= 2 &&
3050 (U8)l[1] == 0x9F))
3051 sayNO;
3052 }
d07ddd77
JH
3053 locinput = e;
3054 nextchr = UCHARAT(locinput);
3055 break;
a0ed51b3 3056 }
d6a28714 3057
bc517b45
JH
3058 /* Neither the target and the pattern are utf8. */
3059
d6a28714
JH
3060 /* Inline the first character, for speed. */
3061 if (UCHARAT(s) != nextchr &&
3062 UCHARAT(s) != ((OP(scan) == EXACTF)
3063 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3064 sayNO;
24d3c4a9 3065 if (PL_regeol - locinput < ln)
b8c5462f 3066 sayNO;
24d3c4a9
DM
3067 if (ln > 1 && (OP(scan) == EXACTF
3068 ? ibcmp(s, locinput, ln)
3069 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3070 sayNO;
24d3c4a9 3071 locinput += ln;
d6a28714 3072 nextchr = UCHARAT(locinput);
a0d0e21e 3073 break;
95b24440 3074 }
d6a28714 3075 case ANYOF:
ffc61ed2 3076 if (do_utf8) {
9e55ce06
JH
3077 STRLEN inclasslen = PL_regeol - locinput;
3078
32fc9b6a 3079 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3080 goto anyof_fail;
ffc61ed2
JH
3081 if (locinput >= PL_regeol)
3082 sayNO;
0f0076b4 3083 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3084 nextchr = UCHARAT(locinput);
e0f9d4a8 3085 break;
ffc61ed2
JH
3086 }
3087 else {
3088 if (nextchr < 0)
3089 nextchr = UCHARAT(locinput);
32fc9b6a 3090 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3091 goto anyof_fail;
ffc61ed2
JH
3092 if (!nextchr && locinput >= PL_regeol)
3093 sayNO;
3094 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3095 break;
3096 }
262b90c4 3097 anyof_fail:
e0f9d4a8
JH
3098 /* If we might have the case of the German sharp s
3099 * in a casefolding Unicode character class. */
3100
ebc501f0
JH
3101 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3102 locinput += SHARP_S_SKIP;
e0f9d4a8 3103 nextchr = UCHARAT(locinput);
ffc61ed2 3104 }
e0f9d4a8
JH
3105 else
3106 sayNO;
b8c5462f 3107 break;
d6a28714 3108 case ALNUML:
b8c5462f
JH
3109 PL_reg_flags |= RF_tainted;
3110 /* FALL THROUGH */
d6a28714 3111 case ALNUM:
b8c5462f 3112 if (!nextchr)
4633a7c4 3113 sayNO;
ffc61ed2 3114 if (do_utf8) {
1a4fad37 3115 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3116 if (!(OP(scan) == ALNUM
bb7a0f54 3117 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3118 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3119 {
3120 sayNO;
a0ed51b3 3121 }
b8c5462f 3122 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3123 nextchr = UCHARAT(locinput);
3124 break;
3125 }
ffc61ed2 3126 if (!(OP(scan) == ALNUM
d6a28714 3127 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3128 sayNO;
b8c5462f 3129 nextchr = UCHARAT(++locinput);
a0d0e21e 3130 break;
d6a28714 3131 case NALNUML:
b8c5462f
JH
3132 PL_reg_flags |= RF_tainted;
3133 /* FALL THROUGH */
d6a28714
JH
3134 case NALNUM:
3135 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3136 sayNO;
ffc61ed2 3137 if (do_utf8) {
1a4fad37 3138 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3139 if (OP(scan) == NALNUM
bb7a0f54 3140 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3141 : isALNUM_LC_utf8((U8*)locinput))
3142 {
b8c5462f 3143 sayNO;
d6a28714 3144 }
b8c5462f
JH
3145 locinput += PL_utf8skip[nextchr];
3146 nextchr = UCHARAT(locinput);
3147 break;
3148 }
ffc61ed2 3149 if (OP(scan) == NALNUM
d6a28714 3150 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3151 sayNO;
76e3520e 3152 nextchr = UCHARAT(++locinput);
a0d0e21e 3153 break;
d6a28714
JH
3154 case BOUNDL:
3155 case NBOUNDL:
3280af22 3156 PL_reg_flags |= RF_tainted;
bbce6d69 3157 /* FALL THROUGH */
d6a28714
JH
3158 case BOUND:
3159 case NBOUND:
3160 /* was last char in word? */
ffc61ed2 3161 if (do_utf8) {
12d33761 3162 if (locinput == PL_bostr)
24d3c4a9 3163 ln = '\n';
ffc61ed2 3164 else {
a3b680e6 3165 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3166
24d3c4a9 3167 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3168 }
3169 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3170 ln = isALNUM_uni(ln);
1a4fad37 3171 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3172 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3173 }
3174 else {
24d3c4a9 3175 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3176 n = isALNUM_LC_utf8((U8*)locinput);
3177 }
a0ed51b3 3178 }
d6a28714 3179 else {
24d3c4a9 3180 ln = (locinput != PL_bostr) ?
12d33761 3181 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3182 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
24d3c4a9 3183 ln = isALNUM(ln);
ffc61ed2
JH
3184 n = isALNUM(nextchr);
3185 }
3186 else {
24d3c4a9 3187 ln = isALNUM_LC(ln);
ffc61ed2
JH
3188 n = isALNUM_LC(nextchr);
3189 }
d6a28714 3190 }
24d3c4a9 3191 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3192 OP(scan) == BOUNDL))
3193 sayNO;
a0ed51b3 3194 break;
d6a28714 3195 case SPACEL:
3280af22 3196 PL_reg_flags |= RF_tainted;
bbce6d69 3197 /* FALL THROUGH */
d6a28714 3198 case SPACE:
9442cb0e 3199 if (!nextchr)
4633a7c4 3200 sayNO;
1aa99e6b 3201 if (do_utf8) {
fd400ab9 3202 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3203 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3204 if (!(OP(scan) == SPACE
bb7a0f54 3205 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3206 : isSPACE_LC_utf8((U8*)locinput)))
3207 {
3208 sayNO;
3209 }
3210 locinput += PL_utf8skip[nextchr];
3211 nextchr = UCHARAT(locinput);
3212 break;
d6a28714 3213 }
ffc61ed2
JH
3214 if (!(OP(scan) == SPACE
3215 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3216 sayNO;
3217 nextchr = UCHARAT(++locinput);
3218 }
3219 else {
3220 if (!(OP(scan) == SPACE
3221 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3222 sayNO;
3223 nextchr = UCHARAT(++locinput);
a0ed51b3 3224 }
a0ed51b3 3225 break;
d6a28714 3226 case NSPACEL:
3280af22 3227 PL_reg_flags |= RF_tainted;
bbce6d69 3228 /* FALL THROUGH */
d6a28714 3229 case NSPACE:
9442cb0e 3230 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3231 sayNO;
1aa99e6b 3232 if (do_utf8) {
1a4fad37 3233 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3234 if (OP(scan) == NSPACE
bb7a0f54 3235 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3236 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3237 {
3238 sayNO;
3239 }
3240 locinput += PL_utf8skip[nextchr];
3241 nextchr = UCHARAT(locinput);
3242 break;
a0ed51b3 3243 }
ffc61ed2 3244 if (OP(scan) == NSPACE
d6a28714 3245 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3246 sayNO;
76e3520e 3247 nextchr = UCHARAT(++locinput);
a0d0e21e 3248 break;
d6a28714 3249 case DIGITL:
a0ed51b3
LW
3250 PL_reg_flags |= RF_tainted;
3251 /* FALL THROUGH */
d6a28714 3252 case DIGIT:
9442cb0e 3253 if (!nextchr)
a0ed51b3 3254 sayNO;
1aa99e6b 3255 if (do_utf8) {
1a4fad37 3256 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3257 if (!(OP(scan) == DIGIT
bb7a0f54 3258 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3259 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3260 {
a0ed51b3 3261 sayNO;
dfe13c55 3262 }
6f06b55f 3263 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3264 nextchr = UCHARAT(locinput);
3265 break;
3266 }
ffc61ed2 3267 if (!(OP(scan) == DIGIT
9442cb0e 3268 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3269 sayNO;
3270 nextchr = UCHARAT(++locinput);
3271 break;
d6a28714 3272 case NDIGITL:
b8c5462f
JH
3273 PL_reg_flags |= RF_tainted;
3274 /* FALL THROUGH */
d6a28714 3275 case NDIGIT:
9442cb0e 3276 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3277 sayNO;
1aa99e6b 3278 if (do_utf8) {
1a4fad37 3279 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3280 if (OP(scan) == NDIGIT
bb7a0f54 3281 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3282 : isDIGIT_LC_utf8((U8*)locinput))
3283 {
a0ed51b3 3284 sayNO;
9442cb0e 3285 }
6f06b55f 3286 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3287 nextchr = UCHARAT(locinput);
3288 break;
3289 }
ffc61ed2 3290 if (OP(scan) == NDIGIT
9442cb0e 3291 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3292 sayNO;
3293 nextchr = UCHARAT(++locinput);
3294 break;
3295 case CLUMP:
b7c83a7e 3296 if (locinput >= PL_regeol)
a0ed51b3 3297 sayNO;
b7c83a7e 3298 if (do_utf8) {
1a4fad37 3299 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3300 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3301 sayNO;
3302 locinput += PL_utf8skip[nextchr];
3303 while (locinput < PL_regeol &&
3304 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3305 locinput += UTF8SKIP(locinput);
3306 if (locinput > PL_regeol)
3307 sayNO;
eb08e2da
JH
3308 }
3309 else
3310 locinput++;
a0ed51b3
LW
3311 nextchr = UCHARAT(locinput);
3312 break;
81714fb9
YO
3313
3314 case NREFFL:
3315 {
3316 char *s;
ff1157ca 3317 char type;
81714fb9
YO
3318 PL_reg_flags |= RF_tainted;
3319 /* FALL THROUGH */
3320 case NREF:
3321 case NREFF:
ff1157ca 3322 type = OP(scan);
0a4db386
YO
3323 n = reg_check_named_buff_matched(rex,scan);
3324
3325 if ( n ) {
3326 type = REF + ( type - NREF );
3327 goto do_ref;
3328 } else {
81714fb9 3329 sayNO;
0a4db386
YO
3330 }
3331 /* unreached */
c8756f30 3332 case REFFL:
3280af22 3333 PL_reg_flags |= RF_tainted;
c8756f30 3334 /* FALL THROUGH */
c277df42 3335 case REF:
81714fb9 3336 case REFF:
c277df42 3337 n = ARG(scan); /* which paren pair */
81714fb9
YO
3338 type = OP(scan);
3339 do_ref:
24d3c4a9 3340 ln = PL_regstartp[n];
2c2d71f5 3341 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3342 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3343 sayNO; /* Do not match unless seen CLOSEn. */
24d3c4a9 3344 if (ln == PL_regendp[n])
a0d0e21e 3345 break;
a0ed51b3 3346
24d3c4a9 3347 s = PL_bostr + ln;
81714fb9 3348 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3349 char *l = locinput;
a3b680e6 3350 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3351 /*
3352 * Note that we can't do the "other character" lookup trick as
3353 * in the 8-bit case (no pun intended) because in Unicode we
3354 * have to map both upper and title case to lower case.
3355 */
81714fb9 3356 if (type == REFF) {
a0ed51b3 3357 while (s < e) {
a3b680e6
AL
3358 STRLEN ulen1, ulen2;
3359 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3360 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3361
a0ed51b3
LW
3362 if (l >= PL_regeol)
3363 sayNO;
a2a2844f
JH
3364 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3365 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3366 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3367 sayNO;
a2a2844f
JH
3368 s += ulen1;
3369 l += ulen2;
a0ed51b3
LW
3370 }
3371 }
3372 locinput = l;
3373 nextchr = UCHARAT(locinput);
3374 break;
3375 }
3376
a0d0e21e 3377 /* Inline the first character, for speed. */
76e3520e 3378 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3379 (type == REF ||
3380 (UCHARAT(s) != (type == REFF
3381 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3382 sayNO;
24d3c4a9
DM
3383 ln = PL_regendp[n] - ln;
3384 if (locinput + ln > PL_regeol)
4633a7c4 3385 sayNO;
81714fb9 3386 if (ln > 1 && (type == REF
24d3c4a9 3387 ? memNE(s, locinput, ln)
81714fb9 3388 : (type == REFF
24d3c4a9
DM
3389 ? ibcmp(s, locinput, ln)
3390 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3391 sayNO;
24d3c4a9 3392 locinput += ln;
76e3520e 3393 nextchr = UCHARAT(locinput);
a0d0e21e 3394 break;
81714fb9 3395 }
a0d0e21e 3396 case NOTHING:
c277df42 3397 case TAIL:
a0d0e21e
LW
3398 break;
3399 case BACK:
3400 break;
40a82448
DM
3401
3402#undef ST
3403#define ST st->u.eval
c277df42 3404 {
c277df42 3405 SV *ret;
6bda09f9
YO
3406 regexp *re;
3407 regnode *startpoint;
3408
3409 case SRECURSE:
3410 case RECURSE: /* /(...(?1))/ */
3411 if (cur_eval && cur_eval->locinput==locinput) {
3412 if (cur_eval->u.eval.close_paren == ARG(scan))
3413 Perl_croak(aTHX_ "Infinite recursion in RECURSE in regexp");
3414 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3415 Perl_croak(aTHX_ "RECURSE without pos change exceeded limit in regexp");
3416 } else {
3417 nochange_depth = 0;
3418 }
3419 re = rex;
3420 (void)ReREFCNT_inc(rex);
3421 if (OP(scan)==RECURSE) {
3422 startpoint = scan + ARG2L(scan);
3423 ST.close_paren = ARG(scan);
3424 } else {
3425 startpoint = re->program+1;
3426 ST.close_paren = 0;
3427 }
3428 goto eval_recurse_doit;
3429 /* NOTREACHED */
3430 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3431 if (cur_eval && cur_eval->locinput==locinput) {
3432 if ( ++nochange_depth > MAX_RECURSE_EVAL_NOCHANGE_DEPTH )
3433 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regexp");
3434 } else {
3435 nochange_depth = 0;
3436 }
8e5e9ebe 3437 {
4aabdb9b
DM
3438 /* execute the code in the {...} */
3439 dSP;
6136c704 3440 SV ** const before = SP;
4aabdb9b
DM
3441 OP_4tree * const oop = PL_op;
3442 COP * const ocurcop = PL_curcop;
3443 PAD *old_comppad;
4aabdb9b
DM
3444
3445 n = ARG(scan);
32fc9b6a 3446 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3447 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3448 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3449 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3450
8e5e9ebe
RGS
3451 CALLRUNOPS(aTHX); /* Scalar context. */
3452 SPAGAIN;
3453 if (SP == before)
075aa684 3454 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3455 else {
3456 ret = POPs;
3457 PUTBACK;
3458 }
4aabdb9b
DM
3459
3460 PL_op = oop;
3461 PAD_RESTORE_LOCAL(old_comppad);
3462 PL_curcop = ocurcop;
24d3c4a9 3463 if (!logical) {
4aabdb9b
DM
3464 /* /(?{...})/ */
3465 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3466 break;
3467 }
8e5e9ebe 3468 }
24d3c4a9
DM
3469 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3470 logical = 0;
4aabdb9b 3471 {
4f639d21
DM
3472 /* extract RE object from returned value; compiling if
3473 * necessary */
3474
6136c704 3475 MAGIC *mg = NULL;
be8e71aa 3476 const SV *sv;
faf82a0b
AE
3477 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3478 mg = mg_find(sv, PERL_MAGIC_qr);
3479 else if (SvSMAGICAL(ret)) {
3480 if (SvGMAGICAL(ret))
3481 sv_unmagic(ret, PERL_MAGIC_qr);
3482 else
3483 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3484 }
faf82a0b 3485
0f5d15d6
IZ
3486 if (mg) {
3487 re = (regexp *)mg->mg_obj;
df0003d4 3488 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3489 }
3490 else {
3491 STRLEN len;
6136c704 3492 const char * const t = SvPV_const(ret, len);
0f5d15d6 3493 PMOP pm;
a3b680e6 3494 const I32 osize = PL_regsize;
0f5d15d6 3495
5fcd1c1b 3496 Zero(&pm, 1, PMOP);
4aabdb9b 3497 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
f9f4320a 3498 re = CALLREGCOMP((char*)t, (char*)t + len, &pm);
9041c2e3 3499 if (!(SvFLAGS(ret)
faf82a0b
AE
3500 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3501 | SVs_GMG)))
14befaf4
DM
3502 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3503 PERL_MAGIC_qr,0,0);
0f5d15d6 3504 PL_regsize = osize;
0f5d15d6 3505 }
4aabdb9b 3506 }
6bda09f9
YO
3507 DEBUG_EXECUTE_r(
3508 debug_start_match(re, do_utf8, locinput, PL_regeol,
3509 "Matching embedded");
3510 );
3511 startpoint = re->program + 1;
3512 ST.close_paren = 0; /* only used for RECURSE */
3513 /* borrowed from regtry */
3514 if (PL_reg_start_tmpl <= re->nparens) {
3515 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3516 if(PL_reg_start_tmp)
3517 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3518 else
3519 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3520 }
aa283a38 3521
ae0beba1 3522 eval_recurse_doit: /* Share code with RECURSE below this line */
aa283a38 3523 /* run the pattern returned from (??{...}) */
40a82448
DM
3524 ST.cp = regcppush(0); /* Save *all* the positions. */
3525 REGCP_SET(ST.lastcp);
6bda09f9
YO
3526
3527 PL_regstartp = re->startp; /* essentially NOOP on RECURSE */
3528 PL_regendp = re->endp; /* essentially NOOP on RECURSE */
3529
4aabdb9b
DM
3530 *PL_reglastparen = 0;
3531 *PL_reglastcloseparen = 0;
4aabdb9b 3532 PL_reginput = locinput;
ae0beba1 3533 PL_regsize = 0;
4aabdb9b
DM
3534
3535 /* XXXX This is too dramatic a measure... */
3536 PL_reg_maxiter = 0;
3537
faec1544
DM
3538 ST.toggle_reg_flags = PL_reg_flags;
3539 if (re->reganch & ROPT_UTF8)
3540 PL_reg_flags |= RF_utf8;
3541 else
3542 PL_reg_flags &= ~RF_utf8;
3543 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3544
40a82448 3545 ST.prev_rex = rex;
faec1544 3546 ST.prev_curlyx = cur_curlyx;
aa283a38 3547 rex = re;
faec1544 3548 cur_curlyx = NULL;
40a82448 3549 ST.B = next;
faec1544
DM
3550 ST.prev_eval = cur_eval;
3551 cur_eval = st;
faec1544 3552 /* now continue from first node in postoned RE */
6bda09f9 3553 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 3554 /* NOTREACHED */
a0ed51b3 3555 }
24d3c4a9
DM
3556 /* logical is 1, /(?(?{...})X|Y)/ */
3557 sw = (bool)SvTRUE(ret);
3558 logical = 0;
c277df42
IZ
3559 break;
3560 }
40a82448 3561
faec1544
DM
3562 case EVAL_AB: /* cleanup after a successful (??{A})B */
3563 /* note: this is called twice; first after popping B, then A */
3564 PL_reg_flags ^= ST.toggle_reg_flags;
40a82448
DM
3565 ReREFCNT_dec(rex);
3566 rex = ST.prev_rex;
faec1544
DM
3567 regcpblow(ST.cp);
3568 cur_eval = ST.prev_eval;
3569 cur_curlyx = ST.prev_curlyx;
40a82448
DM
3570 /* XXXX This is too dramatic a measure... */
3571 PL_reg_maxiter = 0;
262b90c4 3572 sayYES;
40a82448 3573
40a82448 3574
faec1544
DM
3575 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3576 /* note: this is called twice; first after popping B, then A */
3577 PL_reg_flags ^= ST.toggle_reg_flags;
3578 ReREFCNT_dec(rex);
3579 rex = ST.prev_rex;
40a82448
DM
3580 PL_reginput = locinput;
3581 REGCP_UNWIND(ST.lastcp);
3582 regcppop(rex);
faec1544
DM
3583 cur_eval = ST.prev_eval;
3584 cur_curlyx = ST.prev_curlyx;
3585 /* XXXX This is too dramatic a measure... */
3586 PL_reg_maxiter = 0;
40a82448 3587 sayNO_SILENT;
40a82448
DM
3588#undef ST
3589
a0d0e21e 3590 case OPEN:
c277df42 3591 n = ARG(scan); /* which paren pair */
3280af22
NIS
3592 PL_reg_start_tmp[n] = locinput;
3593 if (n > PL_regsize)
3594 PL_regsize = n;
a0d0e21e
LW
3595 break;
3596 case CLOSE:
c277df42 3597 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3598 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3599 PL_regendp[n] = locinput - PL_bostr;
7f69552c
YO
3600 /*if (n > PL_regsize)
3601 PL_regsize = n;*/
eb160463 3602 if (n > (I32)*PL_reglastparen)
3280af22 3603 *PL_reglastparen = n;
a01268b5 3604 *PL_reglastcloseparen = n;
6bda09f9
YO
3605 if (cur_eval && cur_eval->u.eval.close_paren == (U32)n) {
3606 goto fake_end;
3607 }
a0d0e21e 3608 break;
c277df42
IZ
3609 case GROUPP:
3610 n = ARG(scan); /* which paren pair */
24d3c4a9 3611 sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42 3612 break;
0a4db386
YO
3613 case NGROUPP:
3614 /* reg_check_named_buff_matched returns 0 for no match */
3615 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3616 break;
3617 case RECURSEP:
3618 n = ARG(scan);
3619 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3620 break;
3621 case DEFINEP:
3622 sw = 0;
3623 break;
c277df42 3624 case IFTHEN:
2c2d71f5 3625 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 3626 if (sw)
c277df42
IZ
3627 next = NEXTOPER(NEXTOPER(scan));
3628 else {
3629 next = scan + ARG(scan);
3630 if (OP(next) == IFTHEN) /* Fake one. */
3631 next = NEXTOPER(NEXTOPER(next));
3632 }
3633 break;
3634 case LOGICAL:
24d3c4a9 3635 logical = scan->flags;
c277df42 3636 break;
c476f425 3637
2ab05381 3638/*******************************************************************
2ab05381 3639
c476f425
DM
3640The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3641pattern, where A and B are subpatterns. (For simple A, CURLYM or
3642STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 3643
c476f425 3644A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 3645
c476f425
DM
3646On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3647state, which contains the current count, initialised to -1. It also sets
3648cur_curlyx to point to this state, with any previous value saved in the
3649state block.
2ab05381 3650
c476f425
DM
3651CURLYX then jumps straight to the WHILEM op, rather than executing A,
3652since the pattern may possibly match zero times (i.e. it's a while {} loop
3653rather than a do {} while loop).
2ab05381 3654
c476f425
DM
3655Each entry to WHILEM represents a successful match of A. The count in the
3656CURLYX block is incremented, another WHILEM state is pushed, and execution
3657passes to A or B depending on greediness and the current count.
2ab05381 3658
c476f425
DM
3659For example, if matching against the string a1a2a3b (where the aN are
3660substrings that match /A/), then the match progresses as follows: (the
3661pushed states are interspersed with the bits of strings matched so far):
2ab05381 3662
c476f425
DM
3663 <CURLYX cnt=-1>
3664 <CURLYX cnt=0><WHILEM>
3665 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
3666 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
3667 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
3668 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 3669
c476f425
DM
3670(Contrast this with something like CURLYM, which maintains only a single
3671backtrack state:
2ab05381 3672
c476f425
DM
3673 <CURLYM cnt=0> a1
3674 a1 <CURLYM cnt=1> a2
3675 a1 a2 <CURLYM cnt=2> a3
3676 a1 a2 a3 <CURLYM cnt=3> b
3677)
2ab05381 3678
c476f425
DM
3679Each WHILEM state block marks a point to backtrack to upon partial failure
3680of A or B, and also contains some minor state data related to that
3681iteration. The CURLYX block, pointed to by cur_curlyx, contains the
3682overall state, such as the count, and pointers to the A and B ops.
2ab05381 3683
c476f425
DM
3684This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
3685must always point to the *current* CURLYX block, the rules are:
2ab05381 3686
c476f425
DM
3687When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
3688and set cur_curlyx to point the new block.
2ab05381 3689
c476f425
DM
3690When popping the CURLYX block after a successful or unsuccessful match,
3691restore the previous cur_curlyx.
2ab05381 3692
c476f425
DM
3693When WHILEM is about to execute B, save the current cur_curlyx, and set it
3694to the outer one saved in the CURLYX block.
2ab05381 3695
c476f425
DM
3696When popping the WHILEM block after a successful or unsuccessful B match,
3697restore the previous cur_curlyx.
2ab05381 3698
c476f425
DM
3699Here's an example for the pattern (AI* BI)*BO
3700I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 3701
c476f425
DM
3702cur_
3703curlyx backtrack stack
3704------ ---------------
3705NULL
3706CO <CO prev=NULL> <WO>
3707CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3708CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3709NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 3710
c476f425
DM
3711At this point the pattern succeeds, and we work back down the stack to
3712clean up, restoring as we go:
95b24440 3713
c476f425
DM
3714CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
3715CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
3716CO <CO prev=NULL> <WO>
3717NULL
a0374537 3718
c476f425
DM
3719*******************************************************************/
3720
3721#define ST st->u.curlyx
3722
3723 case CURLYX: /* start of /A*B/ (for complex A) */
3724 {
3725 /* No need to save/restore up to this paren */
3726 I32 parenfloor = scan->flags;
3727
3728 assert(next); /* keep Coverity happy */
3729 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3730 next += ARG(next);
3731
3732 /* XXXX Probably it is better to teach regpush to support
3733 parenfloor > PL_regsize... */
3734 if (parenfloor > (I32)*PL_reglastparen)
3735 parenfloor = *PL_reglastparen; /* Pessimization... */
3736
3737 ST.prev_curlyx= cur_curlyx;
3738 cur_curlyx = st;
3739 ST.cp = PL_savestack_ix;
3740
3741 /* these fields contain the state of the current curly.
3742 * they are accessed by subsequent WHILEMs */
3743 ST.parenfloor = parenfloor;
3744 ST.min = ARG1(scan);
3745 ST.max = ARG2(scan);
3746 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3747 ST.B = next;
24d3c4a9
DM
3748 ST.minmod = minmod;
3749 minmod = 0;
c476f425
DM
3750 ST.count = -1; /* this will be updated by WHILEM */
3751 ST.lastloc = NULL; /* this will be updated by WHILEM */
3752
3753 PL_reginput = locinput;
3754 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 3755 /* NOTREACHED */
c476f425 3756 }
a0d0e21e 3757
c476f425
DM
3758 case CURLYX_end: /* just finished matching all of A*B */
3759 regcpblow(ST.cp);
3760 cur_curlyx = ST.prev_curlyx;
3761 sayYES;
3762 /* NOTREACHED */
a0d0e21e 3763
c476f425
DM
3764 case CURLYX_end_fail: /* just failed to match all of A*B */
3765 regcpblow(ST.cp);
3766 cur_curlyx = ST.prev_curlyx;
3767 sayNO;
3768 /* NOTREACHED */
4633a7c4 3769
a0d0e21e 3770
c476f425
DM
3771#undef ST
3772#define ST st->u.whilem
3773
3774 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
3775 {
3776 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
3777 I32 n;
3778 assert(cur_curlyx); /* keep Coverity happy */
3779 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
3780 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
3781 ST.cache_offset = 0;
3782 ST.cache_mask = 0;
3783
3784 PL_reginput = locinput;
3785
3786 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3787 "%*s whilem: matched %ld out of %ld..%ld\n",
3788 REPORT_CODE_OFF+depth*2, "", (long)n,
3789 (long)cur_curlyx->u.curlyx.min,
3790 (long)cur_curlyx->u.curlyx.max)
3791 );
a0d0e21e 3792
c476f425 3793 /* First just match a string of min A's. */
a0d0e21e 3794
c476f425
DM
3795 if (n < cur_curlyx->u.curlyx.min) {
3796 cur_curlyx->u.curlyx.lastloc = locinput;
3797 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
3798 /* NOTREACHED */
3799 }
3800
3801 /* If degenerate A matches "", assume A done. */
3802
3803 if (locinput == cur_curlyx->u.curlyx.lastloc) {
3804 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3805 "%*s whilem: empty match detected, trying continuation...\n",
3806 REPORT_CODE_OFF+depth*2, "")
3807 );
3808 goto do_whilem_B_max;
3809 }
3810
3811 /* super-linear cache processing */
3812
3813 if (scan->flags) {
a0d0e21e 3814
2c2d71f5 3815 if (!PL_reg_maxiter) {
c476f425
DM
3816 /* start the countdown: Postpone detection until we
3817 * know the match is not *that* much linear. */
2c2d71f5 3818 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3819 /* possible overflow for long strings and many CURLYX's */
3820 if (PL_reg_maxiter < 0)
3821 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3822 PL_reg_leftiter = PL_reg_maxiter;
3823 }
c476f425 3824
2c2d71f5 3825 if (PL_reg_leftiter-- == 0) {
c476f425 3826 /* initialise cache */
3298f257 3827 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3828 if (PL_reg_poscache) {
eb160463 3829 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3830 Renew(PL_reg_poscache, size, char);
3831 PL_reg_poscache_size = size;
3832 }
3833 Zero(PL_reg_poscache, size, char);
3834 }
3835 else {
3836 PL_reg_poscache_size = size;
a02a5408 3837 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3838 }
c476f425
DM
3839 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3840 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
3841 PL_colors[4], PL_colors[5])
3842 );
2c2d71f5 3843 }
c476f425 3844
2c2d71f5 3845 if (PL_reg_leftiter < 0) {
c476f425
DM
3846 /* have we already failed at this position? */
3847 I32 offset, mask;
3848 offset = (scan->flags & 0xf) - 1
3849 + (locinput - PL_bostr) * (scan->flags>>4);
3850 mask = 1 << (offset % 8);
3851 offset /= 8;
3852 if (PL_reg_poscache[offset] & mask) {
3853 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
3854 "%*s whilem: (cache) already tried at this position...\n",
3855 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 3856 );
3298f257 3857 sayNO; /* cache records failure */
2c2d71f5 3858 }
c476f425
DM
3859 ST.cache_offset = offset;
3860 ST.cache_mask = mask;
2c2d71f5 3861 }
c476f425 3862 }
2c2d71f5 3863
c476f425 3864 /* Prefer B over A for minimal matching. */
a687059c 3865
c476f425
DM
3866 if (cur_curlyx->u.curlyx.minmod) {
3867 ST.save_curlyx = cur_curlyx;
3868 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3869 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
3870 REGCP_SET(ST.lastcp);
3871 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
3872 /* NOTREACHED */
3873 }
a0d0e21e 3874
c476f425
DM
3875 /* Prefer A over B for maximal matching. */
3876
3877 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
3878 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3879 cur_curlyx->u.curlyx.lastloc = locinput;
3880 REGCP_SET(ST.lastcp);
3881 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
3882 /* NOTREACHED */
3883 }
3884 goto do_whilem_B_max;
3885 }
3886 /* NOTREACHED */
3887
3888 case WHILEM_B_min: /* just matched B in a minimal match */
3889 case WHILEM_B_max: /* just matched B in a maximal match */
3890 cur_curlyx = ST.save_curlyx;
3891 sayYES;
3892 /* NOTREACHED */
3893
3894 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
3895 cur_curlyx = ST.save_curlyx;
3896 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3897 cur_curlyx->u.curlyx.count--;
3898 CACHEsayNO;
3899 /* NOTREACHED */
3900
3901 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
3902 REGCP_UNWIND(ST.lastcp);
3903 regcppop(rex);
3904 /* FALL THROUGH */
3905 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
3906 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
3907 cur_curlyx->u.curlyx.count--;
3908 CACHEsayNO;
3909 /* NOTREACHED */
3910
3911 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
3912 REGCP_UNWIND(ST.lastcp);
3913 regcppop(rex); /* Restore some previous $<digit>s? */
3914 PL_reginput = locinput;
3915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3916 "%*s whilem: failed, trying continuation...\n",
3917 REPORT_CODE_OFF+depth*2, "")
3918 );
3919 do_whilem_B_max:
3920 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3921 && ckWARN(WARN_REGEXP)
3922 && !(PL_reg_flags & RF_warned))
3923 {
3924 PL_reg_flags |= RF_warned;
3925 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3926 "Complex regular subexpression recursion",
3927 REG_INFTY - 1);
3928 }
3929
3930 /* now try B */
3931 ST.save_curlyx = cur_curlyx;
3932 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
3933 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
3934 /* NOTREACHED */
3935
3936 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
3937 cur_curlyx = ST.save_curlyx;
3938 REGCP_UNWIND(ST.lastcp);
3939 regcppop(rex);
3940
3941 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
3942 /* Maximum greed exceeded */
3943 if (cur_curlyx->u.curlyx.count >= REG_INFTY
3944 && ckWARN(WARN_REGEXP)
3945 && !(PL_reg_flags & RF_warned))
3946 {
3280af22 3947 PL_reg_flags |= RF_warned;
c476f425
DM
3948 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
3949 "%s limit (%d) exceeded",
3950 "Complex regular subexpression recursion",
3951 REG_INFTY - 1);
a0d0e21e 3952 }
c476f425 3953 cur_curlyx->u.curlyx.count--;
3ab3c9b4 3954 CACHEsayNO;
a0d0e21e 3955 }
c476f425
DM
3956
3957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3958 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
3959 );
3960 /* Try grabbing another A and see if it helps. */
3961 PL_reginput = locinput;
3962 cur_curlyx->u.curlyx.lastloc = locinput;
3963 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
3964 REGCP_SET(ST.lastcp);
3965 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 3966 /* NOTREACHED */
40a82448
DM
3967
3968#undef ST
3969#define ST st->u.branch
3970
3971 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3972 next = scan + ARG(scan);
3973 if (next == scan)
3974 next = NULL;
40a82448
DM
3975 scan = NEXTOPER(scan);
3976 /* FALL THROUGH */
c277df42 3977
40a82448
DM
3978 case BRANCH: /* /(...|A|...)/ */
3979 scan = NEXTOPER(scan); /* scan now points to inner node */
3980 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3981 /* last branch; skip state push and jump direct to node */
3982 continue;
3983 ST.lastparen = *PL_reglastparen;
3984 ST.next_branch = next;
3985 REGCP_SET(ST.cp);
3986 PL_reginput = locinput;
02db2b7b 3987
40a82448
DM
3988 /* Now go into the branch */
3989 PUSH_STATE_GOTO(BRANCH_next, scan);
3990 /* NOTREACHED */
3991
3992 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3993 REGCP_UNWIND(ST.cp);
3994 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3995 PL_regendp[n] = -1;
3996 *PL_reglastparen = n;
0a4db386 3997 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
3998 scan = ST.next_branch;
3999 /* no more branches? */
4000 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4001 sayNO;
4002 continue; /* execute next BRANCH[J] op */
4003 /* NOTREACHED */
4004
a0d0e21e 4005 case MINMOD:
24d3c4a9 4006 minmod = 1;
a0d0e21e 4007 break;
40a82448
DM
4008
4009#undef ST
4010#define ST st->u.curlym
4011
4012 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4013
4014 /* This is an optimisation of CURLYX that enables us to push
4015 * only a single backtracking state, no matter now many matches
4016 * there are in {m,n}. It relies on the pattern being constant
4017 * length, with no parens to influence future backrefs
4018 */
4019
4020 ST.me = scan;
dc45a647 4021 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4022
4023 /* if paren positive, emulate an OPEN/CLOSE around A */
4024 if (ST.me->flags) {
4025 I32 paren = ST.me->flags;
4026 if (paren > PL_regsize)
4027 PL_regsize = paren;
4028 if (paren > (I32)*PL_reglastparen)
4029 *PL_reglastparen = paren;
c277df42 4030 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4031 }
40a82448
DM
4032 ST.A = scan;
4033 ST.B = next;
4034 ST.alen = 0;
4035 ST.count = 0;
24d3c4a9
DM
4036 ST.minmod = minmod;
4037 minmod = 0;
40a82448
DM
4038 ST.c1 = CHRTEST_UNINIT;
4039 REGCP_SET(ST.cp);
6407bf3b 4040
40a82448
DM
4041 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4042 goto curlym_do_B;
4043
4044 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4045 PL_reginput = locinput;
40a82448
DM
4046 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4047 /* NOTREACHED */
5f80c4cf 4048
40a82448
DM
4049 case CURLYM_A: /* we've just matched an A */
4050 locinput = st->locinput;
4051 nextchr = UCHARAT(locinput);
4052
4053 ST.count++;
4054 /* after first match, determine A's length: u.curlym.alen */
4055 if (ST.count == 1) {
4056 if (PL_reg_match_utf8) {
4057 char *s = locinput;
4058 while (s < PL_reginput) {
4059 ST.alen++;
4060 s += UTF8SKIP(s);
4061 }
4062 }
4063 else {
4064 ST.alen = PL_reginput - locinput;
4065 }
4066 if (ST.alen == 0)
4067 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4068 }
0cadcf80
DM
4069 DEBUG_EXECUTE_r(
4070 PerlIO_printf(Perl_debug_log,
40a82448 4071 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4072 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4073 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4074 );
4075
40a82448 4076 locinput = PL_reginput;
0a4db386
YO
4077
4078 if (cur_eval && cur_eval->u.eval.close_paren &&
4079 cur_eval->u.eval.close_paren == ST.me->flags)
4080 goto fake_end;
4081
4082 if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) )
40a82448
DM
4083 goto curlym_do_A; /* try to match another A */
4084 goto curlym_do_B; /* try to match B */
5f80c4cf 4085
40a82448
DM
4086 case CURLYM_A_fail: /* just failed to match an A */
4087 REGCP_UNWIND(ST.cp);
0a4db386
YO
4088
4089 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4090 || (cur_eval && cur_eval->u.eval.close_paren &&
4091 cur_eval->u.eval.close_paren == ST.me->flags))
40a82448 4092 sayNO;
0cadcf80 4093
40a82448
DM
4094 curlym_do_B: /* execute the B in /A{m,n}B/ */
4095 PL_reginput = locinput;
4096 if (ST.c1 == CHRTEST_UNINIT) {
4097 /* calculate c1 and c2 for possible match of 1st char
4098 * following curly */
4099 ST.c1 = ST.c2 = CHRTEST_VOID;
4100 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4101 regnode *text_node = ST.B;
4102 if (! HAS_TEXT(text_node))
4103 FIND_NEXT_IMPT(text_node);
4104 if (HAS_TEXT(text_node)
4105 && PL_regkind[OP(text_node)] != REF)
4106 {
4107 ST.c1 = (U8)*STRING(text_node);
4108 ST.c2 =
4109 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4110 ? PL_fold[ST.c1]
4111 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4112 ? PL_fold_locale[ST.c1]
4113 : ST.c1;
c277df42 4114 }
c277df42 4115 }
40a82448
DM
4116 }
4117
4118 DEBUG_EXECUTE_r(
4119 PerlIO_printf(Perl_debug_log,
4120 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4121 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4122 "", (IV)ST.count)
4123 );
4124 if (ST.c1 != CHRTEST_VOID
4125 && UCHARAT(PL_reginput) != ST.c1
4126 && UCHARAT(PL_reginput) != ST.c2)
4127 {
4128 /* simulate B failing */
4129 state_num = CURLYM_B_fail;
4130 goto reenter_switch;
4131 }
4132
4133 if (ST.me->flags) {
4134 /* mark current A as captured */
4135 I32 paren = ST.me->flags;
4136 if (ST.count) {
4137 PL_regstartp[paren]
4138 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4139 PL_regendp[paren] = PL_reginput - PL_bostr;
0a4db386 4140 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4141 }
40a82448
DM
4142 else
4143 PL_regendp[paren] = -1;
0a4db386
YO
4144 if (cur_eval && cur_eval->u.eval.close_paren &&
4145 cur_eval->u.eval.close_paren == ST.me->flags)
4146 {
4147 if (ST.count)
4148 goto fake_end;
4149 else
4150 sayNO;
4151 }
c277df42 4152 }
0a4db386 4153
40a82448 4154 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4155 /* NOTREACHED */
40a82448
DM
4156
4157 case CURLYM_B_fail: /* just failed to match a B */
4158 REGCP_UNWIND(ST.cp);
4159 if (ST.minmod) {
4160 if (ST.count == ARG2(ST.me) /* max */)
4161 sayNO;
4162 goto curlym_do_A; /* try to match a further A */
4163 }
4164 /* backtrack one A */
4165 if (ST.count == ARG1(ST.me) /* min */)
4166 sayNO;
4167 ST.count--;
4168 locinput = HOPc(locinput, -ST.alen);
4169 goto curlym_do_B; /* try to match B */
4170
c255a977
DM
4171#undef ST
4172#define ST st->u.curly
40a82448 4173
c255a977
DM
4174#define CURLY_SETPAREN(paren, success) \
4175 if (paren) { \
4176 if (success) { \
4177 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4178 PL_regendp[paren] = locinput - PL_bostr; \
0a4db386 4179 *PL_reglastcloseparen = paren; \
c255a977
DM
4180 } \
4181 else \
4182 PL_regendp[paren] = -1; \
4183 }
4184
4185 case STAR: /* /A*B/ where A is width 1 */
4186 ST.paren = 0;
4187 ST.min = 0;
4188 ST.max = REG_INFTY;
a0d0e21e
LW
4189 scan = NEXTOPER(scan);
4190 goto repeat;
c255a977
DM
4191 case PLUS: /* /A+B/ where A is width 1 */
4192 ST.paren = 0;
4193 ST.min = 1;
4194 ST.max = REG_INFTY;
c277df42 4195 scan = NEXTOPER(scan);
c255a977
DM
4196 goto repeat;
4197 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4198 ST.paren = scan->flags; /* Which paren to set */
4199 if (ST.paren > PL_regsize)
4200 PL_regsize = ST.paren;
4201 if (ST.paren > (I32)*PL_reglastparen)
4202 *PL_reglastparen = ST.paren;
4203 ST.min = ARG1(scan); /* min to match */
4204 ST.max = ARG2(scan); /* max to match */
0a4db386
YO
4205 if (cur_eval && cur_eval->u.eval.close_paren &&
4206 cur_eval->u.eval.close_paren == ST.paren) {
4207 ST.min=1;
4208 ST.max=1;
4209 }
c255a977
DM
4210 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4211 goto repeat;
4212 case CURLY: /* /A{m,n}B/ where A is width 1 */
4213 ST.paren = 0;
4214 ST.min = ARG1(scan); /* min to match */
4215 ST.max = ARG2(scan); /* max to match */
4216 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4217 repeat:
a0d0e21e
LW
4218 /*
4219 * Lookahead to avoid useless match attempts
4220 * when we know what character comes next.
c255a977 4221 *
5f80c4cf
JP
4222 * Used to only do .*x and .*?x, but now it allows
4223 * for )'s, ('s and (?{ ... })'s to be in the way
4224 * of the quantifier and the EXACT-like node. -- japhy
4225 */
4226
c255a977
DM
4227 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4228 sayNO;
cca55fe3 4229 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4230 U8 *s;
4231 regnode *text_node = next;
4232
3dab1dad
YO
4233 if (! HAS_TEXT(text_node))
4234 FIND_NEXT_IMPT(text_node);
5f80c4cf 4235
9e137952 4236 if (! HAS_TEXT(text_node))
c255a977 4237 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4238 else {
3dab1dad 4239 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4240 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4241 goto assume_ok_easy;
cca55fe3 4242 }
be8e71aa
YO
4243 else
4244 s = (U8*)STRING(text_node);
5f80c4cf
JP
4245
4246 if (!UTF) {
c255a977 4247 ST.c2 = ST.c1 = *s;
f65d3ee7 4248 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4249 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4250 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4251 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4252 }
5f80c4cf 4253 else { /* UTF */
f65d3ee7 4254 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4255 STRLEN ulen1, ulen2;
89ebb4a3
JH
4256 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4257 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4258
4259 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4260 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4261#ifdef EBCDIC
4262 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4263 ckWARN(WARN_UTF8) ?
4264 0 : UTF8_ALLOW_ANY);
4265 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4266 ckWARN(WARN_UTF8) ?
4267 0 : UTF8_ALLOW_ANY);
4268#else
c255a977 4269 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4270 uniflags);
c255a977 4271 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4272 uniflags);
4273#endif
5f80c4cf
JP
4274 }
4275 else {
c255a977 4276 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4277 uniflags);
5f80c4cf 4278 }
1aa99e6b
IH
4279 }
4280 }
bbce6d69 4281 }
a0d0e21e 4282 else
c255a977 4283 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4284 assume_ok_easy:
c255a977
DM
4285
4286 ST.A = scan;
4287 ST.B = next;
3280af22 4288 PL_reginput = locinput;
24d3c4a9
DM
4289 if (minmod) {
4290 minmod = 0;
c255a977 4291 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4292 sayNO;
c255a977 4293 ST.count = ST.min;
a0ed51b3 4294 locinput = PL_reginput;
c255a977
DM
4295 REGCP_SET(ST.cp);
4296 if (ST.c1 == CHRTEST_VOID)
4297 goto curly_try_B_min;
4298
4299 ST.oldloc = locinput;
4300
4301 /* set ST.maxpos to the furthest point along the
4302 * string that could possibly match */
4303 if (ST.max == REG_INFTY) {
4304 ST.maxpos = PL_regeol - 1;
4305 if (do_utf8)
4306 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4307 ST.maxpos--;
4308 }
4309 else if (do_utf8) {
4310 int m = ST.max - ST.min;
4311 for (ST.maxpos = locinput;
4312 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4313 ST.maxpos += UTF8SKIP(ST.maxpos);
4314 }
4315 else {
4316 ST.maxpos = locinput + ST.max - ST.min;
4317 if (ST.maxpos >= PL_regeol)
4318 ST.maxpos = PL_regeol - 1;
4319 }
4320 goto curly_try_B_min_known;
4321
4322 }
4323 else {
4324 ST.count = regrepeat(rex, ST.A, ST.max);
4325 locinput = PL_reginput;
4326 if (ST.count < ST.min)
4327 sayNO;
4328 if ((ST.count > ST.min)
4329 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4330 {
4331 /* A{m,n} must come at the end of the string, there's
4332 * no point in backing off ... */
4333 ST.min = ST.count;
4334 /* ...except that $ and \Z can match before *and* after
4335 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4336 We may back off by one in this case. */
4337 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4338 ST.min--;
4339 }
4340 REGCP_SET(ST.cp);
4341 goto curly_try_B_max;
4342 }
4343 /* NOTREACHED */
4344
4345
4346 case CURLY_B_min_known_fail:
4347 /* failed to find B in a non-greedy match where c1,c2 valid */
4348 if (ST.paren && ST.count)
4349 PL_regendp[ST.paren] = -1;
4350
4351 PL_reginput = locinput; /* Could be reset... */
4352 REGCP_UNWIND(ST.cp);
4353 /* Couldn't or didn't -- move forward. */
4354 ST.oldloc = locinput;
4355 if (do_utf8)
4356 locinput += UTF8SKIP(locinput);
4357 else
4358 locinput++;
4359 ST.count++;
4360 curly_try_B_min_known:
4361 /* find the next place where 'B' could work, then call B */
4362 {
4363 int n;
4364 if (do_utf8) {
4365 n = (ST.oldloc == locinput) ? 0 : 1;
4366 if (ST.c1 == ST.c2) {
4367 STRLEN len;
4368 /* set n to utf8_distance(oldloc, locinput) */
4369 while (locinput <= ST.maxpos &&
4370 utf8n_to_uvchr((U8*)locinput,
4371 UTF8_MAXBYTES, &len,
4372 uniflags) != (UV)ST.c1) {
4373 locinput += len;
4374 n++;
4375 }
1aa99e6b
IH
4376 }
4377 else {
c255a977
DM
4378 /* set n to utf8_distance(oldloc, locinput) */
4379 while (locinput <= ST.maxpos) {
4380 STRLEN len;
4381 const UV c = utf8n_to_uvchr((U8*)locinput,
4382 UTF8_MAXBYTES, &len,
4383 uniflags);
4384 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4385 break;
4386 locinput += len;
4387 n++;
1aa99e6b 4388 }
0fe9bf95
IZ
4389 }
4390 }
c255a977
DM
4391 else {
4392 if (ST.c1 == ST.c2) {
4393 while (locinput <= ST.maxpos &&
4394 UCHARAT(locinput) != ST.c1)
4395 locinput++;
bbce6d69 4396 }
c255a977
DM
4397 else {
4398 while (locinput <= ST.maxpos
4399 && UCHARAT(locinput) != ST.c1
4400 && UCHARAT(locinput) != ST.c2)
4401 locinput++;
a0ed51b3 4402 }
c255a977
DM
4403 n = locinput - ST.oldloc;
4404 }
4405 if (locinput > ST.maxpos)
4406 sayNO;
4407 /* PL_reginput == oldloc now */
4408 if (n) {
4409 ST.count += n;
4410 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4411 sayNO;
a0d0e21e 4412 }
c255a977
DM
4413 PL_reginput = locinput;
4414 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4415 if (cur_eval && cur_eval->u.eval.close_paren &&
4416 cur_eval->u.eval.close_paren == ST.paren) {
4417 goto fake_end;
4418 }
c255a977 4419 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4420 }
c255a977
DM
4421 /* NOTREACHED */
4422
4423
4424 case CURLY_B_min_fail:
4425 /* failed to find B in a non-greedy match where c1,c2 invalid */
4426 if (ST.paren && ST.count)
4427 PL_regendp[ST.paren] = -1;
4428
4429 REGCP_UNWIND(ST.cp);
4430 /* failed -- move forward one */
4431 PL_reginput = locinput;
4432 if (regrepeat(rex, ST.A, 1)) {
4433 ST.count++;
a0ed51b3 4434 locinput = PL_reginput;
c255a977
DM
4435 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4436 ST.count > 0)) /* count overflow ? */
15272685 4437 {
c255a977
DM
4438 curly_try_B_min:
4439 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4440 if (cur_eval && cur_eval->u.eval.close_paren &&
4441 cur_eval->u.eval.close_paren == ST.paren) {
4442 goto fake_end;
4443 }
c255a977 4444 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4445 }
4446 }
4633a7c4 4447 sayNO;
c255a977
DM
4448 /* NOTREACHED */
4449
4450
4451 curly_try_B_max:
4452 /* a successful greedy match: now try to match B */
4453 {
4454 UV c = 0;
4455 if (ST.c1 != CHRTEST_VOID)
4456 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4457 UTF8_MAXBYTES, 0, uniflags)
466787eb 4458 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4459 /* If it could work, try it. */
4460 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4461 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386
YO
4462 if (cur_eval && cur_eval->u.eval.close_paren &&
4463 cur_eval->u.eval.close_paren == ST.paren) {
4464 goto fake_end;
4465 }
c255a977
DM
4466 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4467 /* NOTREACHED */
4468 }
4469 }
4470 /* FALL THROUGH */
4471 case CURLY_B_max_fail:
4472 /* failed to find B in a greedy match */
4473 if (ST.paren && ST.count)
4474 PL_regendp[ST.paren] = -1;
4475
4476 REGCP_UNWIND(ST.cp);
4477 /* back up. */
4478 if (--ST.count < ST.min)
4479 sayNO;
4480 PL_reginput = locinput = HOPc(locinput, -1);
4481 goto curly_try_B_max;
4482
4483#undef ST
4484
a0d0e21e 4485 case END:
6bda09f9 4486 fake_end:
faec1544
DM
4487 if (cur_eval) {
4488 /* we've just finished A in /(??{A})B/; now continue with B */
4489 I32 tmpix;
4490
4491
4492 st->u.eval.toggle_reg_flags
4493 = cur_eval->u.eval.toggle_reg_flags;
4494 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4495
4496 st->u.eval.prev_rex = rex; /* inner */
4497 rex = cur_eval->u.eval.prev_rex; /* outer */
4498 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4499 ReREFCNT_inc(rex);
4500 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4501 REGCP_SET(st->u.eval.lastcp);
4502 PL_reginput = locinput;
4503
4504 /* Restore parens of the outer rex without popping the
4505 * savestack */
4506 tmpix = PL_savestack_ix;
4507 PL_savestack_ix = cur_eval->u.eval.lastcp;
4508 regcppop(rex);
4509 PL_savestack_ix = tmpix;
4510
4511 st->u.eval.prev_eval = cur_eval;
4512 cur_eval = cur_eval->u.eval.prev_eval;
4513 DEBUG_EXECUTE_r(
6bda09f9
YO
4514 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %x\n",
4515 REPORT_CODE_OFF+depth*2, "",(int)cur_eval););
faec1544
DM
4516 PUSH_YES_STATE_GOTO(EVAL_AB,
4517 st->u.eval.prev_eval->u.eval.B); /* match B */
4518 }
4519
3b0527fe 4520 if (locinput < reginfo->till) {
a3621e74 4521 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4522 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4523 PL_colors[4],
4524 (long)(locinput - PL_reg_starttry),
3b0527fe 4525 (long)(reginfo->till - PL_reg_starttry),
7821416a 4526 PL_colors[5]));
262b90c4 4527 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
4528 }
4529 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4530 sayYES; /* Success! */
dad79028
DM
4531
4532 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4533 DEBUG_EXECUTE_r(
4534 PerlIO_printf(Perl_debug_log,
4535 "%*s %ssubpattern success...%s\n",
5bc10b2c 4536 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 4537 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 4538 sayYES; /* Success! */
dad79028 4539
40a82448
DM
4540#undef ST
4541#define ST st->u.ifmatch
4542
4543 case SUSPEND: /* (?>A) */
4544 ST.wanted = 1;
9fe1d20c 4545 PL_reginput = locinput;
9041c2e3 4546 goto do_ifmatch;
dad79028 4547
40a82448
DM
4548 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4549 ST.wanted = 0;
dad79028
DM
4550 goto ifmatch_trivial_fail_test;
4551
40a82448
DM
4552 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4553 ST.wanted = 1;
dad79028 4554 ifmatch_trivial_fail_test:
a0ed51b3 4555 if (scan->flags) {
52657f30 4556 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4557 if (!s) {
4558 /* trivial fail */
24d3c4a9
DM
4559 if (logical) {
4560 logical = 0;
4561 sw = 1 - (bool)ST.wanted;
dad79028 4562 }
40a82448 4563 else if (ST.wanted)
dad79028
DM
4564 sayNO;
4565 next = scan + ARG(scan);
4566 if (next == scan)
4567 next = NULL;
4568 break;
4569 }
efb30f32 4570 PL_reginput = s;
a0ed51b3
LW
4571 }
4572 else
4573 PL_reginput = locinput;
4574
c277df42 4575 do_ifmatch:
40a82448 4576 ST.me = scan;
24d3c4a9 4577 ST.logical = logical;
40a82448
DM
4578 /* execute body of (?...A) */
4579 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4580 /* NOTREACHED */
4581
4582 case IFMATCH_A_fail: /* body of (?...A) failed */
4583 ST.wanted = !ST.wanted;
4584 /* FALL THROUGH */
4585
4586 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
4587 if (ST.logical) {
4588 sw = (bool)ST.wanted;
40a82448
DM
4589 }
4590 else if (!ST.wanted)
4591 sayNO;
4592
4593 if (OP(ST.me) == SUSPEND)
4594 locinput = PL_reginput;
4595 else {
4596 locinput = PL_reginput = st->locinput;
4597 nextchr = UCHARAT(locinput);
4598 }
4599 scan = ST.me + ARG(ST.me);
4600 if (scan == ST.me)
4601 scan = NULL;
4602 continue; /* execute B */
4603
4604#undef ST
dad79028 4605
c277df42 4606 case LONGJMP:
c277df42
IZ
4607 next = scan + ARG(scan);
4608 if (next == scan)
4609 next = NULL;
a0d0e21e 4610 break;
7f69552c
YO
4611 case OPFAIL:
4612 sayNO;
a0d0e21e 4613 default:
b900a521 4614 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4615 PTR2UV(scan), OP(scan));
cea2e8a9 4616 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4617 }
95b24440 4618
a0d0e21e 4619 scan = next;
95b24440
DM
4620 continue;
4621 /* NOTREACHED */
4622
40a82448
DM
4623 push_yes_state:
4624 /* push a state that backtracks on success */
4625 st->u.yes.prev_yes_state = yes_state;
4626 yes_state = st;
4627 /* FALL THROUGH */
4628 push_state:
4629 /* push a new regex state, then continue at scan */
4630 {
4631 regmatch_state *newst;
4632
5bc10b2c 4633 DEBUG_STATE_pp("push");
40a82448 4634 depth++;
40a82448
DM
4635 st->locinput = locinput;
4636 newst = st+1;
4637 if (newst > SLAB_LAST(PL_regmatch_slab))
4638 newst = S_push_slab(aTHX);
4639 PL_regmatch_state = newst;
786e8c11 4640
40a82448
DM
4641 locinput = PL_reginput;
4642 nextchr = UCHARAT(locinput);
4643 st = newst;
4644 continue;
4645 /* NOTREACHED */
4646 }
a0d0e21e 4647 }
a687059c 4648
a0d0e21e
LW
4649 /*
4650 * We get here only if there's trouble -- normally "case END" is
4651 * the terminating point.
4652 */
cea2e8a9 4653 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4654 /*NOTREACHED*/
4633a7c4
LW
4655 sayNO;
4656
262b90c4 4657yes:
77cb431f
DM
4658 if (yes_state) {
4659 /* we have successfully completed a subexpression, but we must now
4660 * pop to the state marked by yes_state and continue from there */
77cb431f 4661 assert(st != yes_state);
5bc10b2c
DM
4662#ifdef DEBUGGING
4663 while (st != yes_state) {
4664 st--;
4665 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4666 PL_regmatch_slab = PL_regmatch_slab->prev;
4667 st = SLAB_LAST(PL_regmatch_slab);
4668 }
4669 DEBUG_STATE_pp("pop (yes)");
4670 depth--;
4671 }
4672#else
77cb431f
DM
4673 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4674 || yes_state > SLAB_LAST(PL_regmatch_slab))
4675 {
4676 /* not in this slab, pop slab */
4677 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4678 PL_regmatch_slab = PL_regmatch_slab->prev;
4679 st = SLAB_LAST(PL_regmatch_slab);
4680 }
4681 depth -= (st - yes_state);
5bc10b2c 4682#endif
77cb431f
DM
4683 st = yes_state;
4684 yes_state = st->u.yes.prev_yes_state;
4685 PL_regmatch_state = st;
4686
24d3c4a9
DM
4687 state_num = st->resume_state;
4688 goto reenter_switch;
77cb431f
DM
4689 }
4690
a3621e74 4691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4692 PL_colors[4], PL_colors[5]));
02db2b7b 4693
95b24440 4694 result = 1;
aa283a38 4695 goto final_exit;
4633a7c4
LW
4696
4697no:
a3621e74 4698 DEBUG_EXECUTE_r(
7821416a 4699 PerlIO_printf(Perl_debug_log,
786e8c11 4700 "%*s %sfailed...%s\n",
5bc10b2c 4701 REPORT_CODE_OFF+depth*2, "",
786e8c11 4702 PL_colors[4], PL_colors[5])
7821416a 4703 );
aa283a38 4704
262b90c4 4705no_silent:
aa283a38
DM
4706 if (depth) {
4707 /* there's a previous state to backtrack to */
40a82448
DM
4708 st--;
4709 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4710 PL_regmatch_slab = PL_regmatch_slab->prev;
4711 st = SLAB_LAST(PL_regmatch_slab);
4712 }
4713 PL_regmatch_state = st;
40a82448
DM
4714 locinput= st->locinput;
4715 nextchr = UCHARAT(locinput);
4716
5bc10b2c
DM
4717 DEBUG_STATE_pp("pop");
4718 depth--;
262b90c4
DM
4719 if (yes_state == st)
4720 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 4721
24d3c4a9
DM
4722 state_num = st->resume_state + 1; /* failure = success + 1 */
4723 goto reenter_switch;
95b24440 4724 }
24d3c4a9 4725 result = 0;
aa283a38 4726
262b90c4 4727 final_exit:
aa283a38 4728
5d9a96ca
DM
4729 /* restore original high-water mark */
4730 PL_regmatch_slab = orig_slab;
4731 PL_regmatch_state = orig_state;
4732
4733 /* free all slabs above current one */
4734 if (orig_slab->next) {
c4fd8992 4735 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4736 orig_slab->next = NULL;
4737 while (sl) {
c4fd8992 4738 regmatch_slab * const osl = sl;
5d9a96ca 4739 sl = sl->next;
ad65c075 4740 Safefree(osl);
5d9a96ca
DM
4741 }
4742 }
4743
95b24440 4744 return result;
a687059c
LW
4745}
4746
4747/*
4748 - regrepeat - repeatedly match something simple, report how many
4749 */
4750/*
4751 * [This routine now assumes that it will only match on things of length 1.
4752 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4753 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4754 */
76e3520e 4755STATIC I32
32fc9b6a 4756S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4757{
27da23d5 4758 dVAR;
a0d0e21e 4759 register char *scan;
a0d0e21e 4760 register I32 c;
3280af22 4761 register char *loceol = PL_regeol;
a0ed51b3 4762 register I32 hardcount = 0;
53c4c00c 4763 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4764
3280af22 4765 scan = PL_reginput;
faf11cac
HS
4766 if (max == REG_INFTY)
4767 max = I32_MAX;
4768 else if (max < loceol - scan)
7f596f4c 4769 loceol = scan + max;
a0d0e21e 4770 switch (OP(p)) {
22c35a8c 4771 case REG_ANY:
1aa99e6b 4772 if (do_utf8) {
ffc61ed2 4773 loceol = PL_regeol;
1aa99e6b 4774 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4775 scan += UTF8SKIP(scan);
4776 hardcount++;
4777 }
4778 } else {
4779 while (scan < loceol && *scan != '\n')
4780 scan++;
a0ed51b3
LW
4781 }
4782 break;
ffc61ed2 4783 case SANY:
def8e4ea
JH
4784 if (do_utf8) {
4785 loceol = PL_regeol;
a0804c9e 4786 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4787 scan += UTF8SKIP(scan);
4788 hardcount++;
4789 }
4790 }
4791 else
4792 scan = loceol;
a0ed51b3 4793 break;
f33976b4
DB
4794 case CANY:
4795 scan = loceol;
4796 break;
090f7165
JH
4797 case EXACT: /* length of string is 1 */
4798 c = (U8)*STRING(p);
4799 while (scan < loceol && UCHARAT(scan) == c)
4800 scan++;
bbce6d69 4801 break;
4802 case EXACTF: /* length of string is 1 */
cd439c50 4803 c = (U8)*STRING(p);
bbce6d69 4804 while (scan < loceol &&
22c35a8c 4805 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4806 scan++;
4807 break;
4808 case EXACTFL: /* length of string is 1 */
3280af22 4809 PL_reg_flags |= RF_tainted;
cd439c50 4810 c = (U8)*STRING(p);
bbce6d69 4811 while (scan < loceol &&
22c35a8c 4812 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4813 scan++;
4814 break;
4815 case ANYOF:
ffc61ed2
JH
4816 if (do_utf8) {
4817 loceol = PL_regeol;
cfc92286 4818 while (hardcount < max && scan < loceol &&
32fc9b6a 4819 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4820 scan += UTF8SKIP(scan);
4821 hardcount++;
4822 }
4823 } else {
32fc9b6a 4824 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4825 scan++;
4826 }
a0d0e21e
LW
4827 break;
4828 case ALNUM:
1aa99e6b 4829 if (do_utf8) {
ffc61ed2 4830 loceol = PL_regeol;
1a4fad37 4831 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4832 while (hardcount < max && scan < loceol &&
3568d838 4833 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4834 scan += UTF8SKIP(scan);
4835 hardcount++;
4836 }
4837 } else {
4838 while (scan < loceol && isALNUM(*scan))
4839 scan++;
a0ed51b3
LW
4840 }
4841 break;
bbce6d69 4842 case ALNUML:
3280af22 4843 PL_reg_flags |= RF_tainted;
1aa99e6b 4844 if (do_utf8) {
ffc61ed2 4845 loceol = PL_regeol;
1aa99e6b
IH
4846 while (hardcount < max && scan < loceol &&
4847 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4848 scan += UTF8SKIP(scan);
4849 hardcount++;
4850 }
4851 } else {
4852 while (scan < loceol && isALNUM_LC(*scan))
4853 scan++;
a0ed51b3
LW
4854 }
4855 break;
a0d0e21e 4856 case NALNUM:
1aa99e6b 4857 if (do_utf8) {
ffc61ed2 4858 loceol = PL_regeol;
1a4fad37 4859 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4860 while (hardcount < max && scan < loceol &&
3568d838 4861 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4862 scan += UTF8SKIP(scan);
4863 hardcount++;
4864 }
4865 } else {
4866 while (scan < loceol && !isALNUM(*scan))
4867 scan++;
a0ed51b3
LW
4868 }
4869 break;
bbce6d69 4870 case NALNUML:
3280af22 4871 PL_reg_flags |= RF_tainted;
1aa99e6b 4872 if (do_utf8) {
ffc61ed2 4873 loceol = PL_regeol;
1aa99e6b
IH
4874 while (hardcount < max && scan < loceol &&
4875 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4876 scan += UTF8SKIP(scan);
4877 hardcount++;
4878 }
4879 } else {
4880 while (scan < loceol && !isALNUM_LC(*scan))
4881 scan++;
a0ed51b3
LW
4882 }
4883 break;
a0d0e21e 4884 case SPACE:
1aa99e6b 4885 if (do_utf8) {
ffc61ed2 4886 loceol = PL_regeol;
1a4fad37 4887 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4888 while (hardcount < max && scan < loceol &&
3568d838
JH
4889 (*scan == ' ' ||
4890 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4891 scan += UTF8SKIP(scan);
4892 hardcount++;
4893 }
4894 } else {
4895 while (scan < loceol && isSPACE(*scan))
4896 scan++;
a0ed51b3
LW
4897 }
4898 break;
bbce6d69 4899 case SPACEL:
3280af22 4900 PL_reg_flags |= RF_tainted;
1aa99e6b 4901 if (do_utf8) {
ffc61ed2 4902 loceol = PL_regeol;
1aa99e6b 4903 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4904 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4905 scan += UTF8SKIP(scan);
4906 hardcount++;
4907 }
4908 } else {
4909 while (scan < loceol && isSPACE_LC(*scan))
4910 scan++;
a0ed51b3
LW
4911 }
4912 break;
a0d0e21e 4913 case NSPACE:
1aa99e6b 4914 if (do_utf8) {
ffc61ed2 4915 loceol = PL_regeol;
1a4fad37 4916 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4917 while (hardcount < max && scan < loceol &&
3568d838
JH
4918 !(*scan == ' ' ||
4919 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4920 scan += UTF8SKIP(scan);
4921 hardcount++;
4922 }
4923 } else {
4924 while (scan < loceol && !isSPACE(*scan))
4925 scan++;
4926 break;
a0ed51b3 4927 }
bbce6d69 4928 case NSPACEL:
3280af22 4929 PL_reg_flags |= RF_tainted;
1aa99e6b 4930 if (do_utf8) {
ffc61ed2 4931 loceol = PL_regeol;
1aa99e6b 4932 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4933 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4934 scan += UTF8SKIP(scan);
4935 hardcount++;
4936 }
4937 } else {
4938 while (scan < loceol && !isSPACE_LC(*scan))
4939 scan++;
a0ed51b3
LW
4940 }
4941 break;
a0d0e21e 4942 case DIGIT:
1aa99e6b 4943 if (do_utf8) {
ffc61ed2 4944 loceol = PL_regeol;
1a4fad37 4945 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4946 while (hardcount < max && scan < loceol &&
3568d838 4947 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4948 scan += UTF8SKIP(scan);
4949 hardcount++;
4950 }
4951 } else {
4952 while (scan < loceol && isDIGIT(*scan))
4953 scan++;
a0ed51b3
LW
4954 }
4955 break;
a0d0e21e 4956 case NDIGIT:
1aa99e6b 4957 if (do_utf8) {
ffc61ed2 4958 loceol = PL_regeol;
1a4fad37 4959 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4960 while (hardcount < max && scan < loceol &&
3568d838 4961 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4962 scan += UTF8SKIP(scan);
4963 hardcount++;
4964 }
4965 } else {
4966 while (scan < loceol && !isDIGIT(*scan))
4967 scan++;
a0ed51b3
LW
4968 }
4969 break;
a0d0e21e
LW
4970 default: /* Called on something of 0 width. */
4971 break; /* So match right here or not at all. */
4972 }
a687059c 4973
a0ed51b3
LW
4974 if (hardcount)
4975 c = hardcount;
4976 else
4977 c = scan - PL_reginput;
3280af22 4978 PL_reginput = scan;
a687059c 4979
a3621e74 4980 DEBUG_r({
e68ec53f 4981 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4982 DEBUG_EXECUTE_r({
e68ec53f
YO
4983 SV * const prop = sv_newmortal();
4984 regprop(prog, prop, p);
4985 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4986 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4987 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4988 });
be8e71aa 4989 });
9041c2e3 4990
a0d0e21e 4991 return(c);
a687059c
LW
4992}
4993
c277df42 4994
be8e71aa 4995#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4996/*
ffc61ed2
JH
4997- regclass_swash - prepare the utf8 swash
4998*/
4999
5000SV *
32fc9b6a 5001Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5002{
97aff369 5003 dVAR;
9e55ce06
JH
5004 SV *sw = NULL;
5005 SV *si = NULL;
5006 SV *alt = NULL;
3dab1dad 5007 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5008
4f639d21 5009 if (data && data->count) {
a3b680e6 5010 const U32 n = ARG(node);
ffc61ed2 5011
4f639d21
DM
5012 if (data->what[n] == 's') {
5013 SV * const rv = (SV*)data->data[n];
890ce7af 5014 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5015 SV **const ary = AvARRAY(av);
9e55ce06 5016 SV **a, **b;
9041c2e3 5017
711a919c 5018 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5019 * documentation of these array elements. */
5020
b11f357e 5021 si = *ary;
8f7f7219 5022 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5023 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5024
ffc61ed2
JH
5025 if (a)
5026 sw = *a;
5027 else if (si && doinit) {
5028 sw = swash_init("utf8", "", si, 1, 0);
5029 (void)av_store(av, 1, sw);
5030 }
9e55ce06
JH
5031 if (b)
5032 alt = *b;
ffc61ed2
JH
5033 }
5034 }
5035
9e55ce06
JH
5036 if (listsvp)
5037 *listsvp = si;
5038 if (altsvp)
5039 *altsvp = alt;
ffc61ed2
JH
5040
5041 return sw;
5042}
76234dfb 5043#endif
ffc61ed2
JH
5044
5045/*
ba7b4546 5046 - reginclass - determine if a character falls into a character class
832705d4
JH
5047
5048 The n is the ANYOF regnode, the p is the target string, lenp
5049 is pointer to the maximum length of how far to go in the p
5050 (if the lenp is zero, UTF8SKIP(p) is used),
5051 do_utf8 tells whether the target string is in UTF-8.
5052
bbce6d69 5053 */
5054
76e3520e 5055STATIC bool
32fc9b6a 5056S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5057{
27da23d5 5058 dVAR;
a3b680e6 5059 const char flags = ANYOF_FLAGS(n);
bbce6d69 5060 bool match = FALSE;
cc07378b 5061 UV c = *p;
ae9ddab8 5062 STRLEN len = 0;
9e55ce06 5063 STRLEN plen;
1aa99e6b 5064
19f67299
TS
5065 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5066 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5067 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5068 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
e8a70c6f
SP
5069 if (len == (STRLEN)-1)
5070 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5071 }
bbce6d69 5072
0f0076b4 5073 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5074 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5075 if (lenp)
5076 *lenp = 0;
ffc61ed2 5077 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5078 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5079 match = TRUE;
bbce6d69 5080 }
3568d838 5081 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5082 match = TRUE;
ffc61ed2 5083 if (!match) {
9e55ce06 5084 AV *av;
32fc9b6a 5085 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5086
5087 if (sw) {
3568d838 5088 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5089 match = TRUE;
5090 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5091 if (!match && lenp && av) {
5092 I32 i;
9e55ce06 5093 for (i = 0; i <= av_len(av); i++) {
890ce7af 5094 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5095 STRLEN len;
890ce7af 5096 const char * const s = SvPV_const(sv, len);
9e55ce06 5097
061b10df 5098 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5099 *lenp = len;
5100 match = TRUE;
5101 break;
5102 }
5103 }
5104 }
5105 if (!match) {
89ebb4a3 5106 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5107 STRLEN tmplen;
5108
9e55ce06
JH
5109 to_utf8_fold(p, tmpbuf, &tmplen);
5110 if (swash_fetch(sw, tmpbuf, do_utf8))
5111 match = TRUE;
5112 }
ffc61ed2
JH
5113 }
5114 }
bbce6d69 5115 }
9e55ce06 5116 if (match && lenp && *lenp == 0)
0f0076b4 5117 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5118 }
1aa99e6b 5119 if (!match && c < 256) {
ffc61ed2
JH
5120 if (ANYOF_BITMAP_TEST(n, c))
5121 match = TRUE;
5122 else if (flags & ANYOF_FOLD) {
eb160463 5123 U8 f;
a0ed51b3 5124
ffc61ed2
JH
5125 if (flags & ANYOF_LOCALE) {
5126 PL_reg_flags |= RF_tainted;
5127 f = PL_fold_locale[c];
5128 }
5129 else
5130 f = PL_fold[c];
5131 if (f != c && ANYOF_BITMAP_TEST(n, f))
5132 match = TRUE;
5133 }
5134
5135 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5136 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5137 if (
5138 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5139 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5140 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5141 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5142 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5143 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5144 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5145 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5146 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5147 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5148 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5149 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5150 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5151 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5152 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5153 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5154 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5155 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5156 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5157 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5158 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5159 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5160 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5161 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5162 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5163 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5164 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5165 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5166 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5167 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5168 ) /* How's that for a conditional? */
5169 {
5170 match = TRUE;
5171 }
a0ed51b3 5172 }
a0ed51b3
LW
5173 }
5174
a0ed51b3
LW
5175 return (flags & ANYOF_INVERT) ? !match : match;
5176}
161b471a 5177
dfe13c55 5178STATIC U8 *
0ce71af7 5179S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5180{
97aff369 5181 dVAR;
a0ed51b3 5182 if (off >= 0) {
1aa99e6b 5183 while (off-- && s < lim) {
ffc61ed2 5184 /* XXX could check well-formedness here */
a0ed51b3 5185 s += UTF8SKIP(s);
ffc61ed2 5186 }
a0ed51b3
LW
5187 }
5188 else {
1de06328
YO
5189 while (off++ && s > lim) {
5190 s--;
5191 if (UTF8_IS_CONTINUED(*s)) {
5192 while (s > lim && UTF8_IS_CONTINUATION(*s))
5193 s--;
a0ed51b3 5194 }
1de06328 5195 /* XXX could check well-formedness here */
a0ed51b3
LW
5196 }
5197 }
5198 return s;
5199}
161b471a 5200
f9f4320a
YO
5201#ifdef XXX_dmq
5202/* there are a bunch of places where we use two reghop3's that should
5203 be replaced with this routine. but since thats not done yet
5204 we ifdef it out - dmq
5205*/
dfe13c55 5206STATIC U8 *
1de06328
YO
5207S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5208{
5209 dVAR;
5210 if (off >= 0) {
5211 while (off-- && s < rlim) {
5212 /* XXX could check well-formedness here */
5213 s += UTF8SKIP(s);
5214 }
5215 }
5216 else {
5217 while (off++ && s > llim) {
5218 s--;
5219 if (UTF8_IS_CONTINUED(*s)) {
5220 while (s > llim && UTF8_IS_CONTINUATION(*s))
5221 s--;
5222 }
5223 /* XXX could check well-formedness here */
5224 }
5225 }
5226 return s;
5227}
f9f4320a 5228#endif
1de06328
YO
5229
5230STATIC U8 *
0ce71af7 5231S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5232{
97aff369 5233 dVAR;
a0ed51b3 5234 if (off >= 0) {
1aa99e6b 5235 while (off-- && s < lim) {
ffc61ed2 5236 /* XXX could check well-formedness here */
a0ed51b3 5237 s += UTF8SKIP(s);
ffc61ed2 5238 }
a0ed51b3 5239 if (off >= 0)
3dab1dad 5240 return NULL;
a0ed51b3
LW
5241 }
5242 else {
1de06328
YO
5243 while (off++ && s > lim) {
5244 s--;
5245 if (UTF8_IS_CONTINUED(*s)) {
5246 while (s > lim && UTF8_IS_CONTINUATION(*s))
5247 s--;
a0ed51b3 5248 }
1de06328 5249 /* XXX could check well-formedness here */
a0ed51b3
LW
5250 }
5251 if (off <= 0)
3dab1dad 5252 return NULL;
a0ed51b3
LW
5253 }
5254 return s;
5255}
51371543 5256
51371543 5257static void
acfe0abc 5258restore_pos(pTHX_ void *arg)
51371543 5259{
97aff369 5260 dVAR;
097eb12c 5261 regexp * const rex = (regexp *)arg;
51371543
GS
5262 if (PL_reg_eval_set) {
5263 if (PL_reg_oldsaved) {
4f639d21
DM
5264 rex->subbeg = PL_reg_oldsaved;
5265 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5266#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5267 rex->saved_copy = PL_nrs;
ed252734 5268#endif
4f639d21 5269 RX_MATCH_COPIED_on(rex);
51371543
GS
5270 }
5271 PL_reg_magic->mg_len = PL_reg_oldpos;
5272 PL_reg_eval_set = 0;
5273 PL_curpm = PL_reg_oldcurpm;
5274 }
5275}
33b8afdf
JH
5276
5277STATIC void
5278S_to_utf8_substr(pTHX_ register regexp *prog)
5279{
33b8afdf 5280 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5281 SV* const sv = newSVsv(prog->float_substr);
5282 prog->float_utf8 = sv;
33b8afdf
JH
5283 sv_utf8_upgrade(sv);
5284 if (SvTAIL(prog->float_substr))
5285 SvTAIL_on(sv);
5286 if (prog->float_substr == prog->check_substr)
5287 prog->check_utf8 = sv;
5288 }
5289 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5290 SV* const sv = newSVsv(prog->anchored_substr);
5291 prog->anchored_utf8 = sv;
33b8afdf
JH
5292 sv_utf8_upgrade(sv);
5293 if (SvTAIL(prog->anchored_substr))
5294 SvTAIL_on(sv);
5295 if (prog->anchored_substr == prog->check_substr)
5296 prog->check_utf8 = sv;
5297 }
5298}
5299
5300STATIC void
5301S_to_byte_substr(pTHX_ register regexp *prog)
5302{
97aff369 5303 dVAR;
33b8afdf 5304 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5305 SV* sv = newSVsv(prog->float_utf8);
5306 prog->float_substr = sv;
33b8afdf
JH
5307 if (sv_utf8_downgrade(sv, TRUE)) {
5308 if (SvTAIL(prog->float_utf8))
5309 SvTAIL_on(sv);
5310 } else {
5311 SvREFCNT_dec(sv);
5312 prog->float_substr = sv = &PL_sv_undef;
5313 }
5314 if (prog->float_utf8 == prog->check_utf8)
5315 prog->check_substr = sv;
5316 }
5317 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5318 SV* sv = newSVsv(prog->anchored_utf8);
5319 prog->anchored_substr = sv;
33b8afdf
JH
5320 if (sv_utf8_downgrade(sv, TRUE)) {
5321 if (SvTAIL(prog->anchored_utf8))
5322 SvTAIL_on(sv);
5323 } else {
5324 SvREFCNT_dec(sv);
5325 prog->anchored_substr = sv = &PL_sv_undef;
5326 }
5327 if (prog->anchored_utf8 == prog->check_utf8)
5328 prog->check_substr = sv;
5329 }
5330}
66610fdd
RGS
5331
5332/*
5333 * Local variables:
5334 * c-indentation-style: bsd
5335 * c-basic-offset: 4
5336 * indent-tabs-mode: t
5337 * End:
5338 *
37442d52
RGS
5339 * ex: set ts=8 sts=4 sw=4 noet:
5340 */