This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Not everybody has env
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
81#define RF_utf8 8 /* String contains multibyte chars? */
82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
a3621e74 198# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 201
a3621e74 202# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 203 PerlIO_printf(Perl_debug_log, \
faccc32b 204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 206
76e3520e 207STATIC char *
097eb12c 208S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 209{
97aff369 210 dVAR;
b1ce53c5 211 I32 i;
a0d0e21e 212 char *input;
b1ce53c5 213
a3621e74
YO
214 GET_RE_DEBUG_FLAGS_DECL;
215
b1ce53c5 216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 217 i = SSPOPINT;
b1ce53c5
JH
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 220 input = (char *) SSPOPPTR;
a01268b5 221 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
b1ce53c5
JH
224
225 /* Now restore the parentheses context. */
41123dfd
JH
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 228 I32 tmps;
097eb12c 229 U32 paren = (U32)SSPOPINT;
3280af22 230 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
231 PL_regstartp[paren] = SSPOPINT;
232 tmps = SSPOPINT;
3280af22
NIS
233 if (paren <= *PL_reglastparen)
234 PL_regendp[paren] = tmps;
a3621e74 235 DEBUG_EXECUTE_r(
c3464db5 236 PerlIO_printf(Perl_debug_log,
b900a521 237 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 238 (UV)paren, (IV)PL_regstartp[paren],
b900a521 239 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 240 (IV)PL_regendp[paren],
3280af22 241 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 242 );
a0d0e21e 243 }
a3621e74 244 DEBUG_EXECUTE_r(
bb7a0f54 245 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 246 PerlIO_printf(Perl_debug_log,
faccc32b 247 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 248 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
249 }
250 );
daf18116 251#if 1
dafc8851
JH
252 /* It would seem that the similar code in regtry()
253 * already takes care of this, and in fact it is in
254 * a better location to since this code can #if 0-ed out
255 * but the code in regtry() is needed or otherwise tests
256 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
257 * (as of patchlevel 7877) will fail. Then again,
258 * this code seems to be necessary or otherwise
259 * building DynaLoader will fail:
260 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
261 * --jhi */
bb7a0f54 262 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
263 if (i > PL_regsize)
264 PL_regstartp[i] = -1;
265 PL_regendp[i] = -1;
a0d0e21e 266 }
dafc8851 267#endif
a0d0e21e
LW
268 return input;
269}
270
02db2b7b 271#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 272
a687059c 273/*
e50aee73 274 * pregexec and friends
a687059c
LW
275 */
276
76234dfb 277#ifndef PERL_IN_XSUB_RE
a687059c 278/*
c277df42 279 - pregexec - match a regexp against a string
a687059c 280 */
c277df42 281I32
864dbfa3 282Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 283 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
284/* strend: pointer to null at end of string */
285/* strbeg: real beginning of string */
286/* minend: end of match must be >=minend after stringarg. */
287/* nosave: For optimizations. */
288{
289 return
9041c2e3 290 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
291 nosave ? 0 : REXEC_COPY_STR);
292}
76234dfb 293#endif
22e551b9 294
9041c2e3 295/*
cad2e5aa
JH
296 * Need to implement the following flags for reg_anch:
297 *
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
299 * USE_INTUIT_ML
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
303 * INTUIT_ONCE_ML
304 *
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
307 */
308
309/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
310
3f7c398e 311/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
313
314/* XXXX We assume that strpos is strbeg unless sv. */
315
6eb5f6b9
JH
316/* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
322
2c2d71f5
JH
323/* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
30944b6d
IZ
326 regtry() should be needed.
327
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
330 b) Fixed substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 333 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
334 string which does not contradict any of them.
335 */
2c2d71f5 336
6eb5f6b9
JH
337/* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
340
cad2e5aa
JH
341char *
342Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
344{
97aff369 345 dVAR;
b7953727 346 register I32 start_shift = 0;
cad2e5aa 347 /* Should be nonnegative! */
b7953727 348 register I32 end_shift = 0;
2c2d71f5
JH
349 register char *s;
350 register SV *check;
a1933d95 351 char *strbeg;
cad2e5aa 352 char *t;
a3b680e6 353 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 354 I32 ml_anch;
bd61b366
SS
355 register char *other_last = NULL; /* other substr checked before this */
356 char *check_at = NULL; /* check substr found at this pos */
1df70142 357 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 358#ifdef DEBUGGING
890ce7af 359 const char * const i_strpos = strpos;
30944b6d 360#endif
a3621e74
YO
361
362 GET_RE_DEBUG_FLAGS_DECL;
363
a30b2f1f 364 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 365
b8d68ded 366 if (prog->reganch & ROPT_UTF8) {
a3621e74 367 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
368 "UTF-8 regex...\n"));
369 PL_reg_flags |= RF_utf8;
370 }
371
a3621e74 372 DEBUG_EXECUTE_r({
0df25f3d
YO
373 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
374 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
375
2a782b5b
JH
376 if (!PL_colorset)
377 reginitcolors();
b8d68ded 378 if (PL_reg_match_utf8)
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 380 "UTF-8 target...\n"));
2a782b5b 381 PerlIO_printf(Perl_debug_log,
a0288114 382 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 383 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
384 prog->precomp,
385 PL_colors[1],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
387 PL_colors[0],
388 (int)(len > 60 ? 60 : len),
389 s, PL_colors[1],
390 (len > 60 ? "..." : "")
391 );
392 });
cad2e5aa 393
c344f387
JH
394 /* CHR_DIST() would be more correct here but it makes things slow. */
395 if (prog->minlen > strend - strpos) {
a3621e74 396 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 397 "String too short... [re_intuit_start]\n"));
cad2e5aa 398 goto fail;
2c2d71f5 399 }
a1933d95 400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 401 PL_regeol = strend;
33b8afdf
JH
402 if (do_utf8) {
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
406 } else {
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
410 }
411 if (check == &PL_sv_undef) {
a3621e74 412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
413 "Non-utf string cannot match utf check string\n"));
414 goto fail;
415 }
2c2d71f5 416 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
417 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
418 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 419 && !multiline ) ); /* Check after \n? */
cad2e5aa 420
7e25d62c
JH
421 if (!ml_anch) {
422 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
423 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
425 && sv && !SvROK(sv)
426 && (strpos != strbeg)) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
428 goto fail;
429 }
430 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 431 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 432 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
433 I32 slen;
434
1aa99e6b 435 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
436 if (SvTAIL(check)) {
437 slen = SvCUR(check); /* >= 1 */
cad2e5aa 438
9041c2e3 439 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 440 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 442 goto fail_finish;
cad2e5aa
JH
443 }
444 /* Now should match s[0..slen-2] */
445 slen--;
3f7c398e 446 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 447 || (slen > 1
3f7c398e 448 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 449 report_neq:
a3621e74 450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
451 goto fail_finish;
452 }
cad2e5aa 453 }
3f7c398e 454 else if (*SvPVX_const(check) != *s
653099ff 455 || ((slen = SvCUR(check)) > 1
3f7c398e 456 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 457 goto report_neq;
c315bfe8 458 check_at = s;
2c2d71f5 459 goto success_at_start;
7e25d62c 460 }
cad2e5aa 461 }
2c2d71f5 462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 463 s = strpos;
2c2d71f5 464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 465 end_shift = prog->minlen - start_shift -
653099ff 466 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 467 if (!ml_anch) {
a3b680e6 468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 469 - (SvTAIL(check) != 0);
a3b680e6 470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
471
472 if (end_shift < eshift)
473 end_shift = eshift;
474 }
cad2e5aa 475 }
2c2d71f5 476 else { /* Can match at random position */
cad2e5aa
JH
477 ml_anch = 0;
478 s = strpos;
2c2d71f5
JH
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 /* Should be nonnegative! */
481 end_shift = prog->minlen - start_shift -
653099ff 482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
483 }
484
2c2d71f5 485#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 486 if (end_shift < 0)
6bbae5e6 487 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
488#endif
489
2c2d71f5
JH
490 restart:
491 /* Find a possible match in the region s..strend by looking for
492 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 493 if (flags & REXEC_SCREAM) {
cad2e5aa 494 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 495 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 496
2c2d71f5
JH
497 if (PL_screamfirst[BmRARE(check)] >= 0
498 || ( BmRARE(check) == '\n'
499 && (BmPREVIOUS(check) == SvCUR(check) - 1)
500 && SvTAIL(check) ))
9041c2e3 501 s = screaminstr(sv, check,
2c2d71f5 502 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 503 else
2c2d71f5 504 goto fail_finish;
4addbd3b
HS
505 /* we may be pointing at the wrong string */
506 if (s && RX_MATCH_COPIED(prog))
3f7c398e 507 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
508 if (data)
509 *data->scream_olds = s;
510 }
f33976b4 511 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
512 s = fbm_instr((U8*)(s + start_shift),
513 (U8*)(strend - end_shift),
7fba1cd6 514 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 515 else
1aa99e6b
IH
516 s = fbm_instr(HOP3(s, start_shift, strend),
517 HOP3(strend, -end_shift, strbeg),
7fba1cd6 518 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
519
520 /* Update the count-of-usability, remove useless subpatterns,
521 unshift s. */
2c2d71f5 522
a0288114 523 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 524 (s ? "Found" : "Did not find"),
33b8afdf 525 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 526 PL_colors[0],
7b0972df 527 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 528 SvPVX_const(check),
2c2d71f5
JH
529 PL_colors[1], (SvTAIL(check) ? "$" : ""),
530 (s ? " at offset " : "...\n") ) );
531
532 if (!s)
533 goto fail_finish;
534
6eb5f6b9
JH
535 check_at = s;
536
2c2d71f5 537 /* Finish the diagnostic message */
a3621e74 538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
539
540 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
541 Start with the other substr.
542 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 543 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
544 *always* match. Probably should be marked during compile...
545 Probably it is right to do no SCREAM here...
546 */
547
33b8afdf 548 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 549 /* Take into account the "other" substring. */
2c2d71f5
JH
550 /* XXXX May be hopelessly wrong for UTF... */
551 if (!other_last)
6eb5f6b9 552 other_last = strpos;
33b8afdf 553 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
554 do_other_anchored:
555 {
890ce7af
AL
556 char * const last = HOP3c(s, -start_shift, strbeg);
557 char *last1, *last2;
be8e71aa 558 char * const saved_s = s;
33b8afdf 559 SV* must;
2c2d71f5 560
2c2d71f5
JH
561 t = s - prog->check_offset_max;
562 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 563 && (!do_utf8
0ce71af7 564 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 565 && t > strpos)))
6f207bd3 566 NOOP;
2c2d71f5
JH
567 else
568 t = strpos;
1aa99e6b 569 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
570 if (t < other_last) /* These positions already checked */
571 t = other_last;
1aa99e6b 572 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
573 if (last < last1)
574 last1 = last;
575 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
576 /* On end-of-str: see comment below. */
33b8afdf
JH
577 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
578 if (must == &PL_sv_undef) {
579 s = (char*)NULL;
a3621e74 580 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
581 }
582 else
583 s = fbm_instr(
584 (unsigned char*)t,
585 HOP3(HOP3(last1, prog->anchored_offset, strend)
586 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
587 must,
7fba1cd6 588 multiline ? FBMrf_MULTILINE : 0
33b8afdf 589 );
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 591 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
592 (s ? "Found" : "Contradicts"),
593 PL_colors[0],
33b8afdf
JH
594 (int)(SvCUR(must)
595 - (SvTAIL(must)!=0)),
3f7c398e 596 SvPVX_const(must),
33b8afdf 597 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
598 if (!s) {
599 if (last1 >= last2) {
a3621e74 600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
601 ", giving up...\n"));
602 goto fail_finish;
603 }
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 605 ", trying floating at offset %ld...\n",
be8e71aa 606 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
607 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
608 s = HOP3c(last, 1, strend);
2c2d71f5
JH
609 goto restart;
610 }
611 else {
a3621e74 612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 613 (long)(s - i_strpos)));
1aa99e6b
IH
614 t = HOP3c(s, -prog->anchored_offset, strbeg);
615 other_last = HOP3c(s, 1, strend);
be8e71aa 616 s = saved_s;
2c2d71f5
JH
617 if (t == strpos)
618 goto try_at_start;
2c2d71f5
JH
619 goto try_at_offset;
620 }
30944b6d 621 }
2c2d71f5
JH
622 }
623 else { /* Take into account the floating substring. */
33b8afdf 624 char *last, *last1;
be8e71aa 625 char * const saved_s = s;
33b8afdf
JH
626 SV* must;
627
628 t = HOP3c(s, -start_shift, strbeg);
629 last1 = last =
630 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
631 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
632 last = HOP3c(t, prog->float_max_offset, strend);
633 s = HOP3c(t, prog->float_min_offset, strend);
634 if (s < other_last)
635 s = other_last;
2c2d71f5 636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
637 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
638 /* fbm_instr() takes into account exact value of end-of-str
639 if the check is SvTAIL(ed). Since false positives are OK,
640 and end-of-str is not later than strend we are OK. */
641 if (must == &PL_sv_undef) {
642 s = (char*)NULL;
a3621e74 643 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
644 }
645 else
2c2d71f5 646 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
647 (unsigned char*)last + SvCUR(must)
648 - (SvTAIL(must)!=0),
7fba1cd6 649 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
651 (s ? "Found" : "Contradicts"),
652 PL_colors[0],
653 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 654 SvPVX_const(must),
33b8afdf
JH
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
656 if (!s) {
657 if (last1 == last) {
a3621e74 658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
659 ", giving up...\n"));
660 goto fail_finish;
2c2d71f5 661 }
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 663 ", trying anchored starting at offset %ld...\n",
be8e71aa 664 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
665 other_last = last;
666 s = HOP3c(t, 1, strend);
667 goto restart;
668 }
669 else {
a3621e74 670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
671 (long)(s - i_strpos)));
672 other_last = s; /* Fix this later. --Hugo */
be8e71aa 673 s = saved_s;
33b8afdf
JH
674 if (t == strpos)
675 goto try_at_start;
676 goto try_at_offset;
677 }
2c2d71f5 678 }
cad2e5aa 679 }
2c2d71f5
JH
680
681 t = s - prog->check_offset_max;
2c2d71f5 682 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 683 && (!do_utf8
0ce71af7 684 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 685 && t > strpos))) {
2c2d71f5
JH
686 /* Fixed substring is found far enough so that the match
687 cannot start at strpos. */
688 try_at_offset:
cad2e5aa 689 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
690 /* Eventually fbm_*() should handle this, but often
691 anchored_offset is not 0, so this check will not be wasted. */
692 /* XXXX In the code below we prefer to look for "^" even in
693 presence of anchored substrings. And we search even
694 beyond the found float position. These pessimizations
695 are historical artefacts only. */
696 find_anchor:
2c2d71f5 697 while (t < strend - prog->minlen) {
cad2e5aa 698 if (*t == '\n') {
4ee3650e 699 if (t < check_at - prog->check_offset_min) {
33b8afdf 700 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
701 /* Since we moved from the found position,
702 we definitely contradict the found anchored
30944b6d
IZ
703 substr. Due to the above check we do not
704 contradict "check" substr.
705 Thus we can arrive here only if check substr
706 is float. Redo checking for "other"=="fixed".
707 */
9041c2e3 708 strpos = t + 1;
a3621e74 709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 710 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
711 goto do_other_anchored;
712 }
4ee3650e
GS
713 /* We don't contradict the found floating substring. */
714 /* XXXX Why not check for STCLASS? */
cad2e5aa 715 s = t + 1;
a3621e74 716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 717 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
718 goto set_useful;
719 }
4ee3650e
GS
720 /* Position contradicts check-string */
721 /* XXXX probably better to look for check-string
722 than for "\n", so one should lower the limit for t? */
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 725 other_last = strpos = s = t + 1;
cad2e5aa
JH
726 goto restart;
727 }
728 t++;
729 }
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 731 PL_colors[0], PL_colors[1]));
2c2d71f5 732 goto fail_finish;
cad2e5aa 733 }
f5952150 734 else {
a3621e74 735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 736 PL_colors[0], PL_colors[1]));
f5952150 737 }
cad2e5aa
JH
738 s = t;
739 set_useful:
33b8afdf 740 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
741 }
742 else {
f5952150 743 /* The found string does not prohibit matching at strpos,
2c2d71f5 744 - no optimization of calling REx engine can be performed,
f5952150
GS
745 unless it was an MBOL and we are not after MBOL,
746 or a future STCLASS check will fail this. */
2c2d71f5
JH
747 try_at_start:
748 /* Even in this situation we may use MBOL flag if strpos is offset
749 wrt the start of the string. */
05b4157f 750 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 751 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
752 /* May be due to an implicit anchor of m{.*foo} */
753 && !(prog->reganch & ROPT_IMPLICIT))
754 {
cad2e5aa
JH
755 t = strpos;
756 goto find_anchor;
757 }
a3621e74 758 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 759 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 760 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 761 );
2c2d71f5 762 success_at_start:
30944b6d 763 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
764 && (do_utf8 ? (
765 prog->check_utf8 /* Could be deleted already */
766 && --BmUSEFUL(prog->check_utf8) < 0
767 && (prog->check_utf8 == prog->float_utf8)
768 ) : (
769 prog->check_substr /* Could be deleted already */
770 && --BmUSEFUL(prog->check_substr) < 0
771 && (prog->check_substr == prog->float_substr)
772 )))
66e933ab 773 {
cad2e5aa 774 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
776 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
777 if (do_utf8 ? prog->check_substr : prog->check_utf8)
778 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
781 check = NULL; /* abort */
cad2e5aa 782 s = strpos;
3cf5c195
IZ
783 /* XXXX This is a remnant of the old implementation. It
784 looks wasteful, since now INTUIT can use many
6eb5f6b9 785 other heuristics. */
cad2e5aa
JH
786 prog->reganch &= ~RE_USE_INTUIT;
787 }
788 else
789 s = strpos;
790 }
791
6eb5f6b9
JH
792 /* Last resort... */
793 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 794 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
795 /* minlen == 0 is possible if regstclass is \b or \B,
796 and the fixed substr is ''$.
797 Since minlen is already taken into account, s+1 is before strend;
798 accidentally, minlen >= 1 guaranties no false positives at s + 1
799 even for \b or \B. But (minlen? 1 : 0) below assumes that
800 regstclass does not come from lookahead... */
801 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
802 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 803 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 804 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 805 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 806 : 1);
07be1b83 807 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 808 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 809 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
810 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
811 cl_l, strend)
812 : strend);
07be1b83
YO
813 /*if (OP(prog->regstclass) == TRIE)
814 endpos++;*/
6eb5f6b9 815 t = s;
3b0527fe 816 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
817 if (!s) {
818#ifdef DEBUGGING
cbbf8932 819 const char *what = NULL;
6eb5f6b9
JH
820#endif
821 if (endpos == strend) {
a3621e74 822 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
823 "Could not match STCLASS...\n") );
824 goto fail;
825 }
a3621e74 826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 827 "This position contradicts STCLASS...\n") );
653099ff
GS
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
829 goto fail;
6eb5f6b9 830 /* Contradict one of substrings */
33b8afdf
JH
831 if (prog->anchored_substr || prog->anchored_utf8) {
832 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 833 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 834 hop_and_restart:
1aa99e6b 835 s = HOP3c(t, 1, strend);
66e933ab
GS
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
a3621e74 838 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
839 "Could not match STCLASS...\n") );
840 goto fail;
841 }
5e39e1e5
HS
842 if (!check)
843 goto giveup;
a3621e74 844 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 845 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
846 what, (long)(s + start_shift - i_strpos)) );
847 goto restart;
848 }
66e933ab 849 /* Have both, check_string is floating */
6eb5f6b9
JH
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
9041c2e3 853 s = check_at;
5e39e1e5
HS
854 if (!check)
855 goto giveup;
a3621e74 856 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 857 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
860 }
60e71179
GS
861 /* Another way we could have checked stclass at the
862 current position only: */
863 if (ml_anch) {
864 s = t = t + 1;
5e39e1e5
HS
865 if (!check)
866 goto giveup;
a3621e74 867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 868 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 869 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 870 goto try_at_offset;
66e933ab 871 }
33b8afdf 872 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 873 goto fail;
6eb5f6b9
JH
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
a3621e74 877 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
878 goto hop_and_restart;
879 }
b7953727 880 if (t != s) {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 882 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
883 (long)(t - i_strpos), (long)(s - i_strpos))
884 );
885 }
886 else {
a3621e74 887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
888 "Does not contradict STCLASS...\n");
889 );
890 }
6eb5f6b9 891 }
5e39e1e5 892 giveup:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 896 return s;
2c2d71f5
JH
897
898 fail_finish: /* Substring not found */
33b8afdf
JH
899 if (prog->check_substr || prog->check_utf8) /* could be removed already */
900 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 901 fail:
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 903 PL_colors[4], PL_colors[5]));
bd61b366 904 return NULL;
cad2e5aa 905}
9661b544 906
6eb5f6b9 907/* We know what class REx starts with. Try to find this position... */
3b0527fe 908/* if reginfo is NULL, its a dryrun */
07be1b83
YO
909/* annoyingly all the vars in this routine have different names from their counterparts
910 in regmatch. /grrr */
3b0527fe 911
4cadc6a9
YO
912#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
913foldlen, foldbuf, uniflags) STMT_START { \
914 switch (trie_type) { \
915 case trie_utf8_fold: \
916 if ( foldlen>0 ) { \
917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
918 foldlen -= len; \
919 uscan += len; \
920 len=0; \
921 } else { \
922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
923 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
924 foldlen -= UNISKIP( uvc ); \
925 uscan = foldbuf + UNISKIP( uvc ); \
926 } \
927 break; \
928 case trie_utf8: \
929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
930 break; \
931 case trie_plain: \
932 uvc = (UV)*uc; \
933 len = 1; \
934 } \
935 \
936 if (uvc < 256) { \
937 charid = trie->charmap[ uvc ]; \
938 } \
939 else { \
940 charid = 0; \
941 if (trie->widecharmap) { \
942 SV** const svpp = hv_fetch(trie->widecharmap, \
943 (char*)&uvc, sizeof(UV), 0); \
944 if (svpp) \
945 charid = (U16)SvIV(*svpp); \
946 } \
947 } \
948} STMT_END
949
950#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
951 if ( (CoNd) \
952 && (ln == len || \
953 ibcmp_utf8(s, NULL, 0, do_utf8, \
954 m, NULL, ln, (bool)UTF)) \
955 && (!reginfo || regtry(reginfo, s)) ) \
956 goto got_it; \
957 else { \
958 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
959 uvchr_to_utf8(tmpbuf, c); \
960 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
961 if ( f != c \
962 && (f == c1 || f == c2) \
963 && (ln == foldlen || \
964 !ibcmp_utf8((char *) foldbuf, \
965 NULL, foldlen, do_utf8, \
966 m, \
967 NULL, ln, (bool)UTF)) \
968 && (!reginfo || regtry(reginfo, s)) ) \
969 goto got_it; \
970 } \
971 s += len
972
973#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
974STMT_START { \
975 while (s <= e) { \
976 if ( (CoNd) \
977 && (ln == 1 || !(OP(c) == EXACTF \
978 ? ibcmp(s, m, ln) \
979 : ibcmp_locale(s, m, ln))) \
980 && (!reginfo || regtry(reginfo, s)) ) \
981 goto got_it; \
982 s++; \
983 } \
984} STMT_END
985
986#define REXEC_FBC_UTF8_SCAN(CoDe) \
987STMT_START { \
988 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
989 CoDe \
990 s += uskip; \
991 } \
992} STMT_END
993
994#define REXEC_FBC_SCAN(CoDe) \
995STMT_START { \
996 while (s < strend) { \
997 CoDe \
998 s++; \
999 } \
1000} STMT_END
1001
1002#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1003REXEC_FBC_UTF8_SCAN( \
1004 if (CoNd) { \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1006 goto got_it; \
1007 else \
1008 tmp = doevery; \
1009 } \
1010 else \
1011 tmp = 1; \
1012)
1013
1014#define REXEC_FBC_CLASS_SCAN(CoNd) \
1015REXEC_FBC_SCAN( \
1016 if (CoNd) { \
1017 if (tmp && (!reginfo || regtry(reginfo, s))) \
1018 goto got_it; \
1019 else \
1020 tmp = doevery; \
1021 } \
1022 else \
1023 tmp = 1; \
1024)
1025
1026#define REXEC_FBC_TRYIT \
1027if ((!reginfo || regtry(reginfo, s))) \
1028 goto got_it
1029
1030#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1031 if (do_utf8) { \
1032 UtFpReLoAd; \
1033 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1034 } \
1035 else { \
1036 REXEC_FBC_CLASS_SCAN(CoNd); \
1037 } \
1038 break
1039
1040#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1041 PL_reg_flags |= RF_tainted; \
1042 if (do_utf8) { \
1043 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1044 } \
1045 else { \
1046 REXEC_FBC_CLASS_SCAN(CoNd); \
1047 } \
1048 break
1049
3c3eec57 1050STATIC char *
07be1b83
YO
1051S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1052 const char *strend, const regmatch_info *reginfo)
a687059c 1053{
27da23d5 1054 dVAR;
1df70142 1055 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1056 char *m;
d8093b23 1057 STRLEN ln;
5dab1207 1058 STRLEN lnc;
078c425b 1059 register STRLEN uskip;
d8093b23
G
1060 unsigned int c1;
1061 unsigned int c2;
6eb5f6b9
JH
1062 char *e;
1063 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1064 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1065
6eb5f6b9
JH
1066 /* We know what class it must start with. */
1067 switch (OP(c)) {
6eb5f6b9 1068 case ANYOF:
388cc4de 1069 if (do_utf8) {
4cadc6a9 1070 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1071 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1072 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1073 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1074 }
1075 else {
1076 while (s < strend) {
1077 STRLEN skip = 1;
1078
32fc9b6a 1079 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1080 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1081 /* The assignment of 2 is intentional:
1082 * for the folded sharp s, the skip is 2. */
1083 (skip = SHARP_S_SKIP))) {
3b0527fe 1084 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1085 goto got_it;
1086 else
1087 tmp = doevery;
1088 }
1089 else
1090 tmp = 1;
1091 s += skip;
1092 }
a0d0e21e 1093 }
6eb5f6b9 1094 break;
f33976b4 1095 case CANY:
4cadc6a9 1096 REXEC_FBC_SCAN(
3b0527fe 1097 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1098 goto got_it;
1099 else
1100 tmp = doevery;
4cadc6a9 1101 );
f33976b4 1102 break;
6eb5f6b9 1103 case EXACTF:
5dab1207
NIS
1104 m = STRING(c);
1105 ln = STR_LEN(c); /* length to match in octets/bytes */
1106 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1107 if (UTF) {
a2a2844f 1108 STRLEN ulen1, ulen2;
5dab1207 1109 U8 *sm = (U8 *) m;
89ebb4a3
JH
1110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1112 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1113
1114 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1115 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1116
89ebb4a3 1117 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1118 0, uniflags);
89ebb4a3 1119 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1120 0, uniflags);
5dab1207
NIS
1121 lnc = 0;
1122 while (sm < ((U8 *) m + ln)) {
1123 lnc++;
1124 sm += UTF8SKIP(sm);
1125 }
1aa99e6b
IH
1126 }
1127 else {
1128 c1 = *(U8*)m;
1129 c2 = PL_fold[c1];
1130 }
6eb5f6b9
JH
1131 goto do_exactf;
1132 case EXACTFL:
5dab1207
NIS
1133 m = STRING(c);
1134 ln = STR_LEN(c);
1135 lnc = (I32) ln;
d8093b23 1136 c1 = *(U8*)m;
6eb5f6b9
JH
1137 c2 = PL_fold_locale[c1];
1138 do_exactf:
db12adc6 1139 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1140
3b0527fe 1141 if (!reginfo && e < s)
6eb5f6b9 1142 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1143
60a8b682
JH
1144 /* The idea in the EXACTF* cases is to first find the
1145 * first character of the EXACTF* node and then, if
1146 * necessary, case-insensitively compare the full
1147 * text of the node. The c1 and c2 are the first
1148 * characters (though in Unicode it gets a bit
1149 * more complicated because there are more cases
7f16dd3d
JH
1150 * than just upper and lower: one needs to use
1151 * the so-called folding case for case-insensitive
1152 * matching (called "loose matching" in Unicode).
1153 * ibcmp_utf8() will do just that. */
60a8b682 1154
1aa99e6b 1155 if (do_utf8) {
575cac57 1156 UV c, f;
89ebb4a3 1157 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1158 STRLEN len, foldlen;
4ad0818d 1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1160 if (c1 == c2) {
5dab1207
NIS
1161 /* Upper and lower of 1st char are equal -
1162 * probably not a "letter". */
1aa99e6b 1163 while (s <= e) {
89ebb4a3 1164 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1165 uniflags);
4cadc6a9 1166 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1167 }
09091399
JH
1168 }
1169 else {
1aa99e6b 1170 while (s <= e) {
89ebb4a3 1171 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1172 uniflags);
80aecb99 1173
60a8b682 1174 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1175 * Note that not all the possible combinations
1176 * are handled here: some of them are handled
1177 * by the standard folding rules, and some of
1178 * them (the character class or ANYOF cases)
1179 * are handled during compiletime in
1180 * regexec.c:S_regclass(). */
880bd946
JH
1181 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1182 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1183 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1184
4cadc6a9 1185 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1186 }
09091399 1187 }
1aa99e6b
IH
1188 }
1189 else {
1190 if (c1 == c2)
4cadc6a9 1191 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1192 else
4cadc6a9 1193 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1194 }
1195 break;
bbce6d69 1196 case BOUNDL:
3280af22 1197 PL_reg_flags |= RF_tainted;
bbce6d69 1198 /* FALL THROUGH */
a0d0e21e 1199 case BOUND:
ffc61ed2 1200 if (do_utf8) {
12d33761 1201 if (s == PL_bostr)
ffc61ed2
JH
1202 tmp = '\n';
1203 else {
6136c704 1204 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1205 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1206 }
1207 tmp = ((OP(c) == BOUND ?
9041c2e3 1208 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1209 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1210 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1211 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1212 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1213 isALNUM_LC_utf8((U8*)s)))
1214 {
1215 tmp = !tmp;
4cadc6a9 1216 REXEC_FBC_TRYIT;
a687059c 1217 }
4cadc6a9 1218 );
a0d0e21e 1219 }
667bb95a 1220 else {
12d33761 1221 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1222 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1223 REXEC_FBC_SCAN(
ffc61ed2
JH
1224 if (tmp ==
1225 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1226 tmp = !tmp;
4cadc6a9 1227 REXEC_FBC_TRYIT;
a0ed51b3 1228 }
4cadc6a9 1229 );
a0ed51b3 1230 }
3b0527fe 1231 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1232 goto got_it;
1233 break;
bbce6d69 1234 case NBOUNDL:
3280af22 1235 PL_reg_flags |= RF_tainted;
bbce6d69 1236 /* FALL THROUGH */
a0d0e21e 1237 case NBOUND:
ffc61ed2 1238 if (do_utf8) {
12d33761 1239 if (s == PL_bostr)
ffc61ed2
JH
1240 tmp = '\n';
1241 else {
6136c704 1242 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1243 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1244 }
1245 tmp = ((OP(c) == NBOUND ?
9041c2e3 1246 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1247 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1248 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1249 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1250 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1251 isALNUM_LC_utf8((U8*)s)))
1252 tmp = !tmp;
4cadc6a9
YO
1253 else REXEC_FBC_TRYIT;
1254 );
a0d0e21e 1255 }
667bb95a 1256 else {
12d33761 1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1260 REXEC_FBC_SCAN(
ffc61ed2
JH
1261 if (tmp ==
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1263 tmp = !tmp;
4cadc6a9
YO
1264 else REXEC_FBC_TRYIT;
1265 );
a0ed51b3 1266 }
3b0527fe 1267 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1268 goto got_it;
1269 break;
a0d0e21e 1270 case ALNUM:
4cadc6a9
YO
1271 REXEC_FBC_CSCAN_PRELOAD(
1272 LOAD_UTF8_CHARCLASS_ALNUM(),
1273 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1274 isALNUM(*s)
1275 );
bbce6d69 1276 case ALNUML:
4cadc6a9
YO
1277 REXEC_FBC_CSCAN_TAINT(
1278 isALNUM_LC_utf8((U8*)s),
1279 isALNUM_LC(*s)
1280 );
a0d0e21e 1281 case NALNUM:
4cadc6a9
YO
1282 REXEC_FBC_CSCAN_PRELOAD(
1283 LOAD_UTF8_CHARCLASS_ALNUM(),
1284 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1285 !isALNUM(*s)
1286 );
bbce6d69 1287 case NALNUML:
4cadc6a9
YO
1288 REXEC_FBC_CSCAN_TAINT(
1289 !isALNUM_LC_utf8((U8*)s),
1290 !isALNUM_LC(*s)
1291 );
a0d0e21e 1292 case SPACE:
4cadc6a9
YO
1293 REXEC_FBC_CSCAN_PRELOAD(
1294 LOAD_UTF8_CHARCLASS_SPACE(),
1295 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1296 isSPACE(*s)
1297 );
bbce6d69 1298 case SPACEL:
4cadc6a9
YO
1299 REXEC_FBC_CSCAN_TAINT(
1300 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1301 isSPACE_LC(*s)
1302 );
a0d0e21e 1303 case NSPACE:
4cadc6a9
YO
1304 REXEC_FBC_CSCAN_PRELOAD(
1305 LOAD_UTF8_CHARCLASS_SPACE(),
1306 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1307 !isSPACE(*s)
1308 );
bbce6d69 1309 case NSPACEL:
4cadc6a9
YO
1310 REXEC_FBC_CSCAN_TAINT(
1311 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1312 !isSPACE_LC(*s)
1313 );
a0d0e21e 1314 case DIGIT:
4cadc6a9
YO
1315 REXEC_FBC_CSCAN_PRELOAD(
1316 LOAD_UTF8_CHARCLASS_DIGIT(),
1317 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1318 isDIGIT(*s)
1319 );
b8c5462f 1320 case DIGITL:
4cadc6a9
YO
1321 REXEC_FBC_CSCAN_TAINT(
1322 isDIGIT_LC_utf8((U8*)s),
1323 isDIGIT_LC(*s)
1324 );
a0d0e21e 1325 case NDIGIT:
4cadc6a9
YO
1326 REXEC_FBC_CSCAN_PRELOAD(
1327 LOAD_UTF8_CHARCLASS_DIGIT(),
1328 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1329 !isDIGIT(*s)
1330 );
b8c5462f 1331 case NDIGITL:
4cadc6a9
YO
1332 REXEC_FBC_CSCAN_TAINT(
1333 !isDIGIT_LC_utf8((U8*)s),
1334 !isDIGIT_LC(*s)
1335 );
07be1b83
YO
1336 case TRIE:
1337 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1338 {
1339 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1340 trie_type = do_utf8 ?
1341 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1342 : trie_plain;
1343 /* what trie are we using right now */
1344 reg_ac_data *aho
1345 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1346 reg_trie_data *trie=aho->trie;
1347
1348 const char *last_start = strend - trie->minlen;
1349 const char *real_start = s;
1350 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1351 SV *sv_points;
1352 U8 **points; /* map of where we were in the input string
1353 when reading a given string. For ASCII this
1354 is unnecessary overhead as the relationship
1355 is always 1:1, but for unicode, especially
1356 case folded unicode this is not true. */
f9e705e8 1357 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1358
1359 GET_RE_DEBUG_FLAGS_DECL;
1360
be8e71aa
YO
1361 /* We can't just allocate points here. We need to wrap it in
1362 * an SV so it gets freed properly if there is a croak while
1363 * running the match */
1364 ENTER;
1365 SAVETMPS;
1366 sv_points=newSV(maxlen * sizeof(U8 *));
1367 SvCUR_set(sv_points,
1368 maxlen * sizeof(U8 *));
1369 SvPOK_on(sv_points);
1370 sv_2mortal(sv_points);
1371 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1372
1373 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1374 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1375 s++;
1376 }
1377 }
1378
1379 while (s <= last_start) {
1380 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1381 U8 *uc = (U8*)s;
1382 U16 charid = 0;
1383 U32 base = 1;
1384 U32 state = 1;
1385 UV uvc = 0;
1386 STRLEN len = 0;
1387 STRLEN foldlen = 0;
1388 U8 *uscan = (U8*)NULL;
1389 U8 *leftmost = NULL;
1390
1391 U32 pointpos = 0;
1392
1393 while ( state && uc <= (U8*)strend ) {
1394 int failed=0;
1395 if (aho->states[ state ].wordnum) {
1396 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1397 if (!leftmost || lpos < leftmost)
1398 leftmost= lpos;
1399 if (base==0) break;
1400 }
1401 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1402 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1403 uvc, charid, foldlen, foldbuf, uniflags);
07be1b83
YO
1404 DEBUG_TRIE_EXECUTE_r(
1405 PerlIO_printf(Perl_debug_log,
1406 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1407 (int)((const char*)uc - real_start), charid, uvc)
1408 );
1409 uc += len;
1410
1411 do {
1412 U32 word = aho->states[ state ].wordnum;
1413 base = aho->states[ state ].trans.base;
1414
1415 DEBUG_TRIE_EXECUTE_r(
1416 PerlIO_printf( Perl_debug_log,
1417 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1418 failed ? "Fail transition to " : "",
1419 state, base, uvc, word)
1420 );
1421 if ( base ) {
1422 U32 tmp;
1423 if (charid &&
1424 (base + charid > trie->uniquecharcount )
1425 && (base + charid - 1 - trie->uniquecharcount
1426 < trie->lasttrans)
1427 && trie->trans[base + charid - 1 -
1428 trie->uniquecharcount].check == state
1429 && (tmp=trie->trans[base + charid - 1 -
1430 trie->uniquecharcount ].next))
1431 {
1432 state = tmp;
1433 break;
1434 }
1435 else {
1436 failed++;
1437 if ( state == 1 )
1438 break;
1439 else
1440 state = aho->fail[state];
1441 }
1442 }
1443 else {
1444 /* we must be accepting here */
1445 failed++;
1446 break;
1447 }
1448 } while(state);
1449 if (failed) {
1450 if (leftmost)
1451 break;
1452 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1453 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1454 uc++;
1455 }
1456 }
1457 }
1458 }
1459 if ( aho->states[ state ].wordnum ) {
1460 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1461 if (!leftmost || lpos < leftmost)
1462 leftmost = lpos;
1463 }
1464 DEBUG_TRIE_EXECUTE_r(
1465 PerlIO_printf( Perl_debug_log,
1466 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1467 "All done: ",
1468 state, base, uvc)
1469 );
1470 if (leftmost) {
1471 s = (char*)leftmost;
be8e71aa
YO
1472 if (!reginfo || regtry(reginfo, s)) {
1473 FREETMPS;
1474 LEAVE;
07be1b83 1475 goto got_it;
be8e71aa 1476 }
07be1b83
YO
1477 s = HOPc(s,1);
1478 } else {
1479 break;
1480 }
1481 }
be8e71aa
YO
1482 FREETMPS;
1483 LEAVE;
07be1b83
YO
1484 }
1485 break;
b3c9acc1 1486 default:
3c3eec57
GS
1487 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1488 break;
d6a28714 1489 }
6eb5f6b9
JH
1490 return 0;
1491 got_it:
1492 return s;
1493}
1494
1495/*
1496 - regexec_flags - match a regexp against a string
1497 */
1498I32
1499Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1500 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1501/* strend: pointer to null at end of string */
1502/* strbeg: real beginning of string */
1503/* minend: end of match must be >=minend after stringarg. */
1504/* data: May be used for some additional optimizations. */
1505/* nosave: For optimizations. */
1506{
97aff369 1507 dVAR;
6eb5f6b9
JH
1508 register char *s;
1509 register regnode *c;
1510 register char *startpos = stringarg;
6eb5f6b9
JH
1511 I32 minlen; /* must match at least this many chars */
1512 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1513 I32 end_shift = 0; /* Same for the end. */ /* CC */
1514 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1515 char *scream_olds = NULL;
3dab1dad 1516 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1517 const bool do_utf8 = DO_UTF8(sv);
2757e526 1518 I32 multiline;
0df25f3d 1519
3b0527fe 1520 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1521
1522 GET_RE_DEBUG_FLAGS_DECL;
1523
9d4ba2ae 1524 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1525
1526 /* Be paranoid... */
1527 if (prog == NULL || startpos == NULL) {
1528 Perl_croak(aTHX_ "NULL regexp parameter");
1529 return 0;
1530 }
1531
2757e526 1532 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1533 reginfo.prog = prog;
2757e526 1534
bac06658
JH
1535 RX_MATCH_UTF8_set(prog, do_utf8);
1536
6eb5f6b9 1537 minlen = prog->minlen;
61a36c01 1538 if (strend - startpos < minlen) {
a3621e74 1539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1540 "String too short [regexec_flags]...\n"));
1541 goto phooey;
1aa99e6b 1542 }
6eb5f6b9 1543
6eb5f6b9
JH
1544 /* Check validity of program. */
1545 if (UCHARAT(prog->program) != REG_MAGIC) {
1546 Perl_croak(aTHX_ "corrupted regexp program");
1547 }
1548
1549 PL_reg_flags = 0;
1550 PL_reg_eval_set = 0;
1551 PL_reg_maxiter = 0;
1552
1553 if (prog->reganch & ROPT_UTF8)
1554 PL_reg_flags |= RF_utf8;
1555
1556 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1557 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1558 PL_bostr = strbeg;
3b0527fe 1559 reginfo.sv = sv;
6eb5f6b9
JH
1560
1561 /* Mark end of line for $ (and such) */
1562 PL_regeol = strend;
1563
1564 /* see how far we have to get to not match where we matched before */
3b0527fe 1565 reginfo.till = startpos+minend;
6eb5f6b9 1566
6eb5f6b9
JH
1567 /* If there is a "must appear" string, look for it. */
1568 s = startpos;
1569
3b0527fe 1570 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1571 MAGIC *mg;
1572
1573 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1574 reginfo.ganch = startpos;
6eb5f6b9
JH
1575 else if (sv && SvTYPE(sv) >= SVt_PVMG
1576 && SvMAGIC(sv)
14befaf4
DM
1577 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1578 && mg->mg_len >= 0) {
3b0527fe 1579 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1580 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1581 if (s > reginfo.ganch)
6eb5f6b9 1582 goto phooey;
3b0527fe 1583 s = reginfo.ganch;
6eb5f6b9
JH
1584 }
1585 }
1586 else /* pos() not defined */
3b0527fe 1587 reginfo.ganch = strbeg;
6eb5f6b9
JH
1588 }
1589
a0714e2c 1590 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1591 re_scream_pos_data d;
1592
1593 d.scream_olds = &scream_olds;
1594 d.scream_pos = &scream_pos;
1595 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1596 if (!s) {
a3621e74 1597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1598 goto phooey; /* not present */
3fa9c3d7 1599 }
6eb5f6b9
JH
1600 }
1601
a3621e74 1602 DEBUG_EXECUTE_r({
0df25f3d
YO
1603 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1604 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1605 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1606 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1607
2a782b5b
JH
1608 if (!PL_colorset)
1609 reginitcolors();
1610 PerlIO_printf(Perl_debug_log,
a0288114 1611 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1612 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1613 len0, len0, s0,
2a782b5b 1614 PL_colors[1],
9e55ce06 1615 len0 > 60 ? "..." : "",
2a782b5b 1616 PL_colors[0],
9e55ce06
JH
1617 (int)(len1 > 60 ? 60 : len1),
1618 s1, PL_colors[1],
1619 (len1 > 60 ? "..." : "")
2a782b5b
JH
1620 );
1621 });
6eb5f6b9
JH
1622
1623 /* Simplest case: anchored match need be tried only once. */
1624 /* [unless only anchor is BOL and multiline is set] */
1625 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1626 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1627 goto got_it;
7fba1cd6 1628 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1629 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1630 {
1631 char *end;
1632
1633 if (minlen)
1634 dontbother = minlen - 1;
1aa99e6b 1635 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1636 /* for multiline we only have to try after newlines */
33b8afdf 1637 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1638 if (s == startpos)
1639 goto after_try;
1640 while (1) {
3b0527fe 1641 if (regtry(&reginfo, s))
6eb5f6b9
JH
1642 goto got_it;
1643 after_try:
1644 if (s >= end)
1645 goto phooey;
1646 if (prog->reganch & RE_USE_INTUIT) {
1647 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1648 if (!s)
1649 goto phooey;
1650 }
1651 else
1652 s++;
1653 }
1654 } else {
1655 if (s > startpos)
1656 s--;
1657 while (s < end) {
1658 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1659 if (regtry(&reginfo, s))
6eb5f6b9
JH
1660 goto got_it;
1661 }
1662 }
1663 }
1664 }
1665 goto phooey;
1666 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1667 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1668 goto got_it;
1669 goto phooey;
1670 }
1671
1672 /* Messy cases: unanchored match. */
33b8afdf 1673 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1674 /* we have /x+whatever/ */
1675 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1676 char ch;
bf93d4cc
GS
1677#ifdef DEBUGGING
1678 int did_match = 0;
1679#endif
33b8afdf
JH
1680 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1681 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1682 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1683
1aa99e6b 1684 if (do_utf8) {
4cadc6a9 1685 REXEC_FBC_SCAN(
6eb5f6b9 1686 if (*s == ch) {
a3621e74 1687 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1688 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1689 s += UTF8SKIP(s);
1690 while (s < strend && *s == ch)
1691 s += UTF8SKIP(s);
1692 }
4cadc6a9 1693 );
6eb5f6b9
JH
1694 }
1695 else {
4cadc6a9 1696 REXEC_FBC_SCAN(
6eb5f6b9 1697 if (*s == ch) {
a3621e74 1698 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1699 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1700 s++;
1701 while (s < strend && *s == ch)
1702 s++;
1703 }
4cadc6a9 1704 );
6eb5f6b9 1705 }
a3621e74 1706 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1707 PerlIO_printf(Perl_debug_log,
b7953727
JH
1708 "Did not find anchored character...\n")
1709 );
6eb5f6b9 1710 }
a0714e2c
SS
1711 else if (prog->anchored_substr != NULL
1712 || prog->anchored_utf8 != NULL
1713 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1714 && prog->float_max_offset < strend - s)) {
1715 SV *must;
1716 I32 back_max;
1717 I32 back_min;
1718 char *last;
6eb5f6b9 1719 char *last1; /* Last position checked before */
bf93d4cc
GS
1720#ifdef DEBUGGING
1721 int did_match = 0;
1722#endif
33b8afdf
JH
1723 if (prog->anchored_substr || prog->anchored_utf8) {
1724 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1725 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1726 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1727 back_max = back_min = prog->anchored_offset;
1728 } else {
1729 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1730 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1731 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1732 back_max = prog->float_max_offset;
1733 back_min = prog->float_min_offset;
1734 }
1735 if (must == &PL_sv_undef)
1736 /* could not downgrade utf8 check substring, so must fail */
1737 goto phooey;
1738
1739 last = HOP3c(strend, /* Cannot start after this */
1740 -(I32)(CHR_SVLEN(must)
1741 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1742
1743 if (s > PL_bostr)
1744 last1 = HOPc(s, -1);
1745 else
1746 last1 = s - 1; /* bogus */
1747
a0288114 1748 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1749 check_substr==must. */
1750 scream_pos = -1;
1751 dontbother = end_shift;
1752 strend = HOPc(strend, -dontbother);
1753 while ( (s <= last) &&
9041c2e3 1754 ((flags & REXEC_SCREAM)
1aa99e6b 1755 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1756 end_shift, &scream_pos, 0))
1aa99e6b 1757 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1758 (unsigned char*)strend, must,
7fba1cd6 1759 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1760 /* we may be pointing at the wrong string */
1761 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1762 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1763 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1764 if (HOPc(s, -back_max) > last1) {
1765 last1 = HOPc(s, -back_min);
1766 s = HOPc(s, -back_max);
1767 }
1768 else {
52657f30 1769 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1770
1771 last1 = HOPc(s, -back_min);
52657f30 1772 s = t;
6eb5f6b9 1773 }
1aa99e6b 1774 if (do_utf8) {
6eb5f6b9 1775 while (s <= last1) {
3b0527fe 1776 if (regtry(&reginfo, s))
6eb5f6b9
JH
1777 goto got_it;
1778 s += UTF8SKIP(s);
1779 }
1780 }
1781 else {
1782 while (s <= last1) {
3b0527fe 1783 if (regtry(&reginfo, s))
6eb5f6b9
JH
1784 goto got_it;
1785 s++;
1786 }
1787 }
1788 }
a3621e74 1789 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1790 PerlIO_printf(Perl_debug_log,
a0288114 1791 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1792 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1793 ? "anchored" : "floating"),
1794 PL_colors[0],
1795 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1796 SvPVX_const(must),
b7953727
JH
1797 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1798 );
6eb5f6b9
JH
1799 goto phooey;
1800 }
155aba94 1801 else if ((c = prog->regstclass)) {
f14c76ed 1802 if (minlen) {
be8e71aa 1803 const OPCODE op = OP(prog->regstclass);
66e933ab 1804 /* don't bother with what can't match */
07be1b83 1805 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
1806 strend = HOPc(strend, -(minlen - 1));
1807 }
a3621e74 1808 DEBUG_EXECUTE_r({
be8e71aa 1809 SV * const prop = sv_newmortal();
32fc9b6a 1810 regprop(prog, prop, c);
0df25f3d
YO
1811 {
1812 RE_PV_DISPLAY_DECL(s0,len0,UTF,
1813 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
1814 RE_PV_DISPLAY_DECL(s1,len1,UTF,
1815 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
1816 PerlIO_printf(Perl_debug_log,
1817 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
1818 len0, len0, s0,
1819 len1, len1, s1, (int)(strend - s));
1820 }
ffc61ed2 1821 });
3b0527fe 1822 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1823 goto got_it;
07be1b83 1824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1825 }
1826 else {
1827 dontbother = 0;
a0714e2c 1828 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1829 /* Trim the end. */
d6a28714 1830 char *last;
33b8afdf
JH
1831 SV* float_real;
1832
1833 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1834 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1835 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1836
1837 if (flags & REXEC_SCREAM) {
33b8afdf 1838 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1839 end_shift, &scream_pos, 1); /* last one */
1840 if (!last)
ffc61ed2 1841 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1842 /* we may be pointing at the wrong string */
1843 else if (RX_MATCH_COPIED(prog))
3f7c398e 1844 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1845 }
d6a28714
JH
1846 else {
1847 STRLEN len;
cfd0369c 1848 const char * const little = SvPV_const(float_real, len);
d6a28714 1849
33b8afdf 1850 if (SvTAIL(float_real)) {
d6a28714
JH
1851 if (memEQ(strend - len + 1, little, len - 1))
1852 last = strend - len + 1;
7fba1cd6 1853 else if (!multiline)
9041c2e3 1854 last = memEQ(strend - len, little, len)
bd61b366 1855 ? strend - len : NULL;
b8c5462f 1856 else
d6a28714
JH
1857 goto find_last;
1858 } else {
1859 find_last:
9041c2e3 1860 if (len)
d6a28714 1861 last = rninstr(s, strend, little, little + len);
b8c5462f 1862 else
a0288114 1863 last = strend; /* matching "$" */
b8c5462f 1864 }
b8c5462f 1865 }
bf93d4cc 1866 if (last == NULL) {
a3621e74 1867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1868 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1869 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1870 goto phooey; /* Should not happen! */
1871 }
d6a28714
JH
1872 dontbother = strend - last + prog->float_min_offset;
1873 }
1874 if (minlen && (dontbother < minlen))
1875 dontbother = minlen - 1;
1876 strend -= dontbother; /* this one's always in bytes! */
1877 /* We don't know much -- general case. */
1aa99e6b 1878 if (do_utf8) {
d6a28714 1879 for (;;) {
3b0527fe 1880 if (regtry(&reginfo, s))
d6a28714
JH
1881 goto got_it;
1882 if (s >= strend)
1883 break;
b8c5462f 1884 s += UTF8SKIP(s);
d6a28714
JH
1885 };
1886 }
1887 else {
1888 do {
3b0527fe 1889 if (regtry(&reginfo, s))
d6a28714
JH
1890 goto got_it;
1891 } while (s++ < strend);
1892 }
1893 }
1894
1895 /* Failure. */
1896 goto phooey;
1897
1898got_it:
1899 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1900
1901 if (PL_reg_eval_set) {
1902 /* Preserve the current value of $^R */
1903 if (oreplsv != GvSV(PL_replgv))
1904 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1905 restored, the value remains
1906 the same. */
4f639d21 1907 restore_pos(aTHX_ prog);
d6a28714
JH
1908 }
1909
1910 /* make sure $`, $&, $', and $digit will work later */
1911 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1912 RX_MATCH_COPY_FREE(prog);
d6a28714 1913 if (flags & REXEC_COPY_STR) {
be8e71aa 1914 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1915#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1916 if ((SvIsCOW(sv)
1917 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1918 if (DEBUG_C_TEST) {
1919 PerlIO_printf(Perl_debug_log,
1920 "Copy on write: regexp capture, type %d\n",
1921 (int) SvTYPE(sv));
1922 }
1923 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1924 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1925 assert (SvPOKp(prog->saved_copy));
1926 } else
1927#endif
1928 {
1929 RX_MATCH_COPIED_on(prog);
1930 s = savepvn(strbeg, i);
1931 prog->subbeg = s;
1932 }
d6a28714 1933 prog->sublen = i;
d6a28714
JH
1934 }
1935 else {
1936 prog->subbeg = strbeg;
1937 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1938 }
1939 }
9041c2e3 1940
d6a28714
JH
1941 return 1;
1942
1943phooey:
a3621e74 1944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1945 PL_colors[4], PL_colors[5]));
d6a28714 1946 if (PL_reg_eval_set)
4f639d21 1947 restore_pos(aTHX_ prog);
d6a28714
JH
1948 return 0;
1949}
1950
1951/*
1952 - regtry - try match at specific point
1953 */
1954STATIC I32 /* 0 failure, 1 success */
3b0527fe 1955S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1956{
97aff369 1957 dVAR;
d6a28714
JH
1958 register I32 *sp;
1959 register I32 *ep;
1960 CHECKPOINT lastcp;
3b0527fe 1961 regexp *prog = reginfo->prog;
a3621e74 1962 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1963
02db2b7b
IZ
1964#ifdef DEBUGGING
1965 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1966#endif
d6a28714
JH
1967 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1968 MAGIC *mg;
1969
1970 PL_reg_eval_set = RS_init;
a3621e74 1971 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
1972 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1973 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1974 ));
e8347627 1975 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1976 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1977 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1978 SAVETMPS;
1979 /* Apparently this is not needed, judging by wantarray. */
e8347627 1980 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1981 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1982
3b0527fe 1983 if (reginfo->sv) {
d6a28714 1984 /* Make $_ available to executed code. */
3b0527fe 1985 if (reginfo->sv != DEFSV) {
59f00321 1986 SAVE_DEFSV;
3b0527fe 1987 DEFSV = reginfo->sv;
b8c5462f 1988 }
d6a28714 1989
3b0527fe
DM
1990 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1991 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 1992 /* prepare for quick setting of pos */
d300d9fa
NC
1993#ifdef PERL_OLD_COPY_ON_WRITE
1994 if (SvIsCOW(sv))
1995 sv_force_normal_flags(sv, 0);
1996#endif
3dab1dad 1997 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 1998 &PL_vtbl_mglob, NULL, 0);
d6a28714 1999 mg->mg_len = -1;
b8c5462f 2000 }
d6a28714
JH
2001 PL_reg_magic = mg;
2002 PL_reg_oldpos = mg->mg_len;
4f639d21 2003 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2004 }
09687e5a 2005 if (!PL_reg_curpm) {
a02a5408 2006 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2007#ifdef USE_ITHREADS
2008 {
be8e71aa 2009 SV* const repointer = newSViv(0);
577e12cc 2010 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2011 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2012 av_push(PL_regex_padav,repointer);
2013 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2014 PL_regex_pad = AvARRAY(PL_regex_padav);
2015 }
2016#endif
2017 }
aaa362c4 2018 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2019 PL_reg_oldcurpm = PL_curpm;
2020 PL_curpm = PL_reg_curpm;
2021 if (RX_MATCH_COPIED(prog)) {
2022 /* Here is a serious problem: we cannot rewrite subbeg,
2023 since it may be needed if this match fails. Thus
2024 $` inside (?{}) could fail... */
2025 PL_reg_oldsaved = prog->subbeg;
2026 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2027#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2028 PL_nrs = prog->saved_copy;
2029#endif
d6a28714
JH
2030 RX_MATCH_COPIED_off(prog);
2031 }
2032 else
bd61b366 2033 PL_reg_oldsaved = NULL;
d6a28714
JH
2034 prog->subbeg = PL_bostr;
2035 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2036 }
973dddac 2037 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2038 PL_reginput = startpos;
2039 PL_regstartp = prog->startp;
2040 PL_regendp = prog->endp;
2041 PL_reglastparen = &prog->lastparen;
a01268b5 2042 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2043 prog->lastparen = 0;
03994de8 2044 prog->lastcloseparen = 0;
d6a28714 2045 PL_regsize = 0;
a3621e74 2046 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2047 if (PL_reg_start_tmpl <= prog->nparens) {
2048 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2049 if(PL_reg_start_tmp)
2050 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2051 else
a02a5408 2052 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2053 }
2054
2055 /* XXXX What this code is doing here?!!! There should be no need
2056 to do this again and again, PL_reglastparen should take care of
3dd2943c 2057 this! --ilya*/
dafc8851
JH
2058
2059 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2060 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2061 * PL_reglastparen), is not needed at all by the test suite
2062 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2063 * enough, for building DynaLoader, or otherwise this
2064 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2065 * will happen. Meanwhile, this code *is* needed for the
2066 * above-mentioned test suite tests to succeed. The common theme
2067 * on those tests seems to be returning null fields from matches.
2068 * --jhi */
dafc8851 2069#if 1
d6a28714
JH
2070 sp = prog->startp;
2071 ep = prog->endp;
2072 if (prog->nparens) {
097eb12c 2073 register I32 i;
eb160463 2074 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2075 *++sp = -1;
2076 *++ep = -1;
2077 }
2078 }
dafc8851 2079#endif
02db2b7b 2080 REGCP_SET(lastcp);
3b0527fe 2081 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2082 prog->endp[0] = PL_reginput - PL_bostr;
2083 return 1;
2084 }
02db2b7b 2085 REGCP_UNWIND(lastcp);
d6a28714
JH
2086 return 0;
2087}
2088
02db2b7b 2089
8ba1375e
MJD
2090#define sayYES goto yes
2091#define sayNO goto no
e0f9d4a8 2092#define sayNO_ANYOF goto no_anyof
8ba1375e 2093#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2094#define sayNO_FINAL goto no_final
2095#define sayNO_SILENT goto do_no
2096#define saySAME(x) if (x) goto yes; else goto no
2097
3ab3c9b4
HS
2098#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2099#define POSCACHE_SEEN 1 /* we know what we're caching */
2100#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2101
3ab3c9b4 2102#define CACHEsayYES STMT_START { \
d8319b27 2103 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2104 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2105 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2106 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2107 } \
2108 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2109 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2110 } \
2111 else { \
3ab3c9b4
HS
2112 /* cache records failure, but this is success */ \
2113 DEBUG_r( \
2114 PerlIO_printf(Perl_debug_log, \
2115 "%*s (remove success from failure cache)\n", \
2116 REPORT_CODE_OFF+PL_regindent*2, "") \
2117 ); \
d8319b27 2118 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2119 } \
2120 } \
2121 sayYES; \
2122} STMT_END
7409bbd3 2123
3ab3c9b4 2124#define CACHEsayNO STMT_START { \
d8319b27 2125 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2126 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2127 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2128 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2129 } \
2130 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2131 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2132 } \
2133 else { \
3ab3c9b4
HS
2134 /* cache records success, but this is failure */ \
2135 DEBUG_r( \
2136 PerlIO_printf(Perl_debug_log, \
2137 "%*s (remove failure from success cache)\n", \
2138 REPORT_CODE_OFF+PL_regindent*2, "") \
2139 ); \
d8319b27 2140 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2141 } \
2142 } \
2143 sayNO; \
2144} STMT_END
2145
a3621e74
YO
2146/* this is used to determine how far from the left messages like
2147 'failed...' are printed. Currently 29 makes these messages line
2148 up with the opcode they refer to. Earlier perls used 25 which
2149 left these messages outdented making reviewing a debug output
2150 quite difficult.
2151*/
2152#define REPORT_CODE_OFF 29
2153
2154
2155/* Make sure there is a test for this +1 options in re_tests */
2156#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2157
40a82448
DM
2158#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2159#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2160
86545054
DM
2161#define SLAB_FIRST(s) (&(s)->states[0])
2162#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2163
5d9a96ca
DM
2164/* grab a new slab and return the first slot in it */
2165
2166STATIC regmatch_state *
2167S_push_slab(pTHX)
2168{
54df2634
NC
2169#if PERL_VERSION < 9
2170 dMY_CXT;
2171#endif
5d9a96ca
DM
2172 regmatch_slab *s = PL_regmatch_slab->next;
2173 if (!s) {
2174 Newx(s, 1, regmatch_slab);
2175 s->prev = PL_regmatch_slab;
2176 s->next = NULL;
2177 PL_regmatch_slab->next = s;
2178 }
2179 PL_regmatch_slab = s;
86545054 2180 return SLAB_FIRST(s);
5d9a96ca 2181}
5b47454d 2182
95b24440
DM
2183/* simulate a recursive call to regmatch */
2184
2185#define REGMATCH(ns, where) \
5d9a96ca
DM
2186 st->scan = scan; \
2187 scan = (ns); \
2188 st->resume_state = resume_##where; \
95b24440
DM
2189 goto start_recurse; \
2190 resume_point_##where:
2191
40a82448
DM
2192/* push a new state then goto it */
2193
2194#define PUSH_STATE_GOTO(state, node) \
2195 scan = node; \
2196 st->resume_state = state; \
2197 goto push_state;
2198
2199/* push a new state with success backtracking, then goto it */
2200
2201#define PUSH_YES_STATE_GOTO(state, node) \
2202 scan = node; \
2203 st->resume_state = state; \
2204 goto push_yes_state;
2205
aa283a38 2206
aa283a38 2207
d6a28714
JH
2208/*
2209 - regmatch - main matching routine
2210 *
2211 * Conceptually the strategy is simple: check to see whether the current
2212 * node matches, call self recursively to see whether the rest matches,
2213 * and then act accordingly. In practice we make some effort to avoid
2214 * recursion, in particular by going through "ordinary" nodes (that don't
2215 * need to know whether the rest of the match failed) by a loop instead of
2216 * by recursion.
2217 */
2218/* [lwall] I've hoisted the register declarations to the outer block in order to
2219 * maybe save a little bit of pushing and popping on the stack. It also takes
2220 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2221 *
2222 * This function used to be heavily recursive, but since this had the
2223 * effect of blowing the CPU stack on complex regexes, it has been
2224 * restructured to be iterative, and to save state onto the heap rather
2225 * than the stack. Essentially whereever regmatch() used to be called, it
2226 * pushes the current state, notes where to return, then jumps back into
2227 * the main loop.
2228 *
2229 * Originally the structure of this function used to look something like
2230
2231 S_regmatch() {
2232 int a = 1, b = 2;
2233 ...
2234 while (scan != NULL) {
5d9a96ca 2235 a++; // do stuff with a and b
95b24440
DM
2236 ...
2237 switch (OP(scan)) {
2238 case FOO: {
2239 int local = 3;
2240 ...
2241 if (regmatch(...)) // recurse
2242 goto yes;
2243 }
2244 ...
2245 }
2246 }
2247 yes:
2248 return 1;
2249 }
2250
2251 * Now it looks something like this:
2252
5d9a96ca 2253 typedef struct {
95b24440
DM
2254 int a, b, local;
2255 int resume_state;
5d9a96ca 2256 } regmatch_state;
95b24440
DM
2257
2258 S_regmatch() {
5d9a96ca
DM
2259 regmatch_state *st = new();
2260 int depth=0;
2261 st->a++; // do stuff with a and b
95b24440
DM
2262 ...
2263 while (scan != NULL) {
2264 ...
2265 switch (OP(scan)) {
2266 case FOO: {
5d9a96ca 2267 st->local = 3;
95b24440 2268 ...
5d9a96ca
DM
2269 st->scan = scan;
2270 scan = ...;
2271 st->resume_state = resume_FOO;
2272 goto start_recurse; // recurse
95b24440 2273
5d9a96ca
DM
2274 resume_point_FOO:
2275 if (result)
95b24440
DM
2276 goto yes;
2277 }
2278 ...
2279 }
5d9a96ca
DM
2280 start_recurse:
2281 st = new(); push a new state
2282 st->a = 1; st->b = 2;
2283 depth++;
95b24440 2284 }
5d9a96ca 2285 yes:
95b24440 2286 result = 1;
5d9a96ca
DM
2287 if (depth--) {
2288 st = pop();
95b24440
DM
2289 switch (resume_state) {
2290 case resume_FOO:
2291 goto resume_point_FOO;
2292 ...
2293 }
2294 }
2295 return result
2296 }
2297
2298 * WARNING: this means that any line in this function that contains a
2299 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2300 * regmatch() using gotos instead. Thus the values of any local variables
2301 * not saved in the regmatch_state structure will have been lost when
2302 * execution resumes on the next line .
5d9a96ca
DM
2303 *
2304 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2305 * PL_regmatch_state always points to the currently active state, and
2306 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2307 * The first time regmatch is called, the first slab is allocated, and is
2308 * never freed until interpreter desctruction. When the slab is full,
2309 * a new one is allocated chained to the end. At exit from regmatch, slabs
2310 * allocated since entry are freed.
d6a28714 2311 */
95b24440 2312
40a82448 2313/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2314#define TRIE_next (REGNODE_MAX+1)
2315#define TRIE_next_fail (REGNODE_MAX+2)
2316#define EVAL_A (REGNODE_MAX+3)
2317#define EVAL_A_fail (REGNODE_MAX+4)
2318#define resume_CURLYX (REGNODE_MAX+5)
2319#define resume_WHILEM1 (REGNODE_MAX+6)
2320#define resume_WHILEM2 (REGNODE_MAX+7)
2321#define resume_WHILEM3 (REGNODE_MAX+8)
2322#define resume_WHILEM4 (REGNODE_MAX+9)
2323#define resume_WHILEM5 (REGNODE_MAX+10)
2324#define resume_WHILEM6 (REGNODE_MAX+11)
2325#define BRANCH_next (REGNODE_MAX+12)
2326#define BRANCH_next_fail (REGNODE_MAX+13)
2327#define CURLYM_A (REGNODE_MAX+14)
2328#define CURLYM_A_fail (REGNODE_MAX+15)
2329#define CURLYM_B (REGNODE_MAX+16)
2330#define CURLYM_B_fail (REGNODE_MAX+17)
2331#define IFMATCH_A (REGNODE_MAX+18)
2332#define IFMATCH_A_fail (REGNODE_MAX+19)
2333#define CURLY_B_min_known (REGNODE_MAX+20)
2334#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2335#define CURLY_B_min (REGNODE_MAX+22)
2336#define CURLY_B_min_fail (REGNODE_MAX+23)
2337#define CURLY_B_max (REGNODE_MAX+24)
2338#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2339
2340
3dab1dad 2341#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2342
3df15adc
YO
2343#ifdef DEBUGGING
2344
2345STATIC void
07be1b83
YO
2346S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2347{
2348 const int docolor = *PL_colors[0];
2349 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2350 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2351 /* The part of the string before starttry has one color
2352 (pref0_len chars), between starttry and current
2353 position another one (pref_len - pref0_len chars),
2354 after the current position the third one.
2355 We assume that pref0_len <= pref_len, otherwise we
2356 decrease pref0_len. */
2357 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2358 ? (5 + taill) - l : locinput - PL_bostr;
2359 int pref0_len;
2360
2361 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2362 pref_len++;
2363 pref0_len = pref_len - (locinput - PL_reg_starttry);
2364 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2365 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2366 ? (5 + taill) - pref_len : PL_regeol - locinput);
2367 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2368 l--;
2369 if (pref0_len < 0)
2370 pref0_len = 0;
2371 if (pref0_len > pref_len)
2372 pref0_len = pref_len;
2373 {
3df15adc 2374 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d
YO
2375
2376 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2377 (locinput - pref_len),pref0_len, 60);
2378
2379 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2380 (locinput - pref_len + pref0_len),
0df25f3d
YO
2381 pref_len - pref0_len, 60);
2382
2383 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2384 locinput, PL_regeol - locinput, 60);
2385
3df15adc 2386 PerlIO_printf(Perl_debug_log,
07be1b83
YO
2387 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2388 (IV)(locinput - PL_bostr),
2389 PL_colors[4],
2390 len0, s0,
2391 PL_colors[5],
2392 PL_colors[2],
2393 len1, s1,
2394 PL_colors[3],
2395 (docolor ? "" : "> <"),
2396 PL_colors[0],
2397 len2, s2,
2398 PL_colors[1],
2399 15 - l - pref_len + 1,
2400 "");
2401 }
2402}
3df15adc 2403
07be1b83
YO
2404#endif
2405
d6a28714 2406STATIC I32 /* 0 failure, 1 success */
3b0527fe 2407S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2408{
54df2634
NC
2409#if PERL_VERSION < 9
2410 dMY_CXT;
2411#endif
27da23d5 2412 dVAR;
95b24440 2413 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2414 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2415
3b0527fe
DM
2416 regexp *rex = reginfo->prog;
2417
5d9a96ca
DM
2418 regmatch_slab *orig_slab;
2419 regmatch_state *orig_state;
a3621e74 2420
5d9a96ca
DM
2421 /* the current state. This is a cached copy of PL_regmatch_state */
2422 register regmatch_state *st;
95b24440 2423
5d9a96ca
DM
2424 /* cache heavy used fields of st in registers */
2425 register regnode *scan;
2426 register regnode *next;
2427 register I32 n = 0; /* initialize to shut up compiler warning */
2428 register char *locinput = PL_reginput;
95b24440 2429
5d9a96ca
DM
2430 /* these variables are NOT saved during a recusive RFEGMATCH: */
2431 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2432 bool result; /* return value of S_regmatch */
5d9a96ca 2433 int depth = 0; /* depth of recursion */
77cb431f
DM
2434 regmatch_state *yes_state = NULL; /* state to pop to on success of
2435 subpattern */
40a82448 2436 U32 state_num;
95b24440
DM
2437
2438#ifdef DEBUGGING
e68ec53f 2439 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2440 PL_regindent++;
2441#endif
2442
5d9a96ca
DM
2443 /* on first ever call to regmatch, allocate first slab */
2444 if (!PL_regmatch_slab) {
2445 Newx(PL_regmatch_slab, 1, regmatch_slab);
2446 PL_regmatch_slab->prev = NULL;
2447 PL_regmatch_slab->next = NULL;
86545054 2448 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2449 }
2450
2451 /* remember current high-water mark for exit */
2452 /* XXX this should be done with SAVE* instead */
2453 orig_slab = PL_regmatch_slab;
2454 orig_state = PL_regmatch_state;
2455
2456 /* grab next free state slot */
2457 st = ++PL_regmatch_state;
86545054 2458 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2459 st = PL_regmatch_state = S_push_slab(aTHX);
2460
2461 st->minmod = 0;
2462 st->sw = 0;
2463 st->logical = 0;
5d9a96ca 2464 st->cc = NULL;
d6a28714
JH
2465 /* Note that nextchr is a byte even in UTF */
2466 nextchr = UCHARAT(locinput);
2467 scan = prog;
2468 while (scan != NULL) {
8ba1375e 2469
a3621e74 2470 DEBUG_EXECUTE_r( {
6136c704 2471 SV * const prop = sv_newmortal();
07be1b83 2472 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2473 regprop(rex, prop, scan);
07be1b83
YO
2474
2475 PerlIO_printf(Perl_debug_log,
2476 "%3"IVdf":%*s%s(%"IVdf")\n",
2477 (IV)(scan - rex->program), PL_regindent*2, "",
2478 SvPVX_const(prop),
2479 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2480 });
d6a28714
JH
2481
2482 next = scan + NEXT_OFF(scan);
2483 if (next == scan)
2484 next = NULL;
40a82448 2485 state_num = OP(scan);
d6a28714 2486
40a82448
DM
2487 reenter_switch:
2488 switch (state_num) {
d6a28714 2489 case BOL:
7fba1cd6 2490 if (locinput == PL_bostr)
d6a28714 2491 {
3b0527fe 2492 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2493 break;
2494 }
d6a28714
JH
2495 sayNO;
2496 case MBOL:
12d33761
HS
2497 if (locinput == PL_bostr ||
2498 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2499 {
b8c5462f
JH
2500 break;
2501 }
d6a28714
JH
2502 sayNO;
2503 case SBOL:
c2a73568 2504 if (locinput == PL_bostr)
b8c5462f 2505 break;
d6a28714
JH
2506 sayNO;
2507 case GPOS:
3b0527fe 2508 if (locinput == reginfo->ganch)
d6a28714
JH
2509 break;
2510 sayNO;
2511 case EOL:
d6a28714
JH
2512 goto seol;
2513 case MEOL:
d6a28714 2514 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2515 sayNO;
b8c5462f 2516 break;
d6a28714
JH
2517 case SEOL:
2518 seol:
2519 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2520 sayNO;
d6a28714 2521 if (PL_regeol - locinput > 1)
b8c5462f 2522 sayNO;
b8c5462f 2523 break;
d6a28714
JH
2524 case EOS:
2525 if (PL_regeol != locinput)
b8c5462f 2526 sayNO;
d6a28714 2527 break;
ffc61ed2 2528 case SANY:
d6a28714 2529 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2530 sayNO;
f33976b4
DB
2531 if (do_utf8) {
2532 locinput += PL_utf8skip[nextchr];
2533 if (locinput > PL_regeol)
2534 sayNO;
2535 nextchr = UCHARAT(locinput);
2536 }
2537 else
2538 nextchr = UCHARAT(++locinput);
2539 break;
2540 case CANY:
2541 if (!nextchr && locinput >= PL_regeol)
2542 sayNO;
b8c5462f 2543 nextchr = UCHARAT(++locinput);
a0d0e21e 2544 break;
ffc61ed2 2545 case REG_ANY:
1aa99e6b
IH
2546 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2547 sayNO;
2548 if (do_utf8) {
b8c5462f 2549 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2550 if (locinput > PL_regeol)
2551 sayNO;
a0ed51b3 2552 nextchr = UCHARAT(locinput);
a0ed51b3 2553 }
1aa99e6b
IH
2554 else
2555 nextchr = UCHARAT(++locinput);
a0ed51b3 2556 break;
166ba7cd
DM
2557
2558#undef ST
2559#define ST st->u.trie
2560
5b47454d 2561 case TRIE:
3dab1dad 2562 {
07be1b83 2563 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2564 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2565 trie_type = do_utf8 ?
2566 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2567 : trie_plain;
2568
2569 /* what trie are we using right now */
be8e71aa 2570 reg_trie_data * const trie
3dab1dad
YO
2571 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2572 U32 state = trie->startstate;
166ba7cd
DM
2573
2574 U8 *uc = ( U8* )locinput;
2575 U16 charid = 0;
2576 U32 base = 0;
2577 UV uvc = 0;
2578 STRLEN len = 0;
2579 STRLEN foldlen = 0;
2580 U8 *uscan = (U8*)NULL;
2581 STRLEN bufflen=0;
2582 SV *sv_accept_buff = NULL;
2583 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2584
2585 ST.accepted = 0; /* how many accepting states we have seen */
2586 ST.B = next;
2587#ifdef DEBUGGING
2588 ST.me = scan;
2589#endif
3dab1dad
YO
2590
2591 if (trie->bitmap && trie_type != trie_utf8_fold &&
2592 !TRIE_BITMAP_TEST(trie,*locinput)
2593 ) {
2594 if (trie->states[ state ].wordnum) {
2595 DEBUG_EXECUTE_r(
2596 PerlIO_printf(Perl_debug_log,
2597 "%*s %smatched empty string...%s\n",
2598 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2599 );
2600 break;
2601 } else {
2602 DEBUG_EXECUTE_r(
2603 PerlIO_printf(Perl_debug_log,
2604 "%*s %sfailed to match start class...%s\n",
2605 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2606 );
2607 sayNO_SILENT;
2608 }
2609 }
166ba7cd 2610
07be1b83
YO
2611 /*
2612 traverse the TRIE keeping track of all accepting states
2613 we transition through until we get to a failing node.
2614 */
2615
a3621e74
YO
2616 while ( state && uc <= (U8*)PL_regeol ) {
2617
5b47454d 2618 if (trie->states[ state ].wordnum) {
166ba7cd 2619 if (!ST.accepted ) {
5b47454d
DM
2620 ENTER;
2621 SAVETMPS;
2622 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2623 sv_accept_buff=newSV(bufflen *
2624 sizeof(reg_trie_accepted) - 1);
2625 SvCUR_set(sv_accept_buff,
2626 sizeof(reg_trie_accepted));
2627 SvPOK_on(sv_accept_buff);
2628 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2629 SAVETMPS;
2630 ST.accept_buff =
5b47454d
DM
2631 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2632 }
2633 else {
166ba7cd 2634 if (ST.accepted >= bufflen) {
5b47454d 2635 bufflen *= 2;
166ba7cd 2636 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2637 SvGROW(sv_accept_buff,
2638 bufflen * sizeof(reg_trie_accepted));
2639 }
2640 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2641 + sizeof(reg_trie_accepted));
2642 }
166ba7cd
DM
2643 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2644 ST.accept_buff[ST.accepted].endpos = uc;
2645 ++ST.accepted;
5b47454d 2646 }
a3621e74
YO
2647
2648 base = trie->states[ state ].trans.base;
2649
07be1b83
YO
2650 DEBUG_TRIE_EXECUTE_r({
2651 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2652 PerlIO_printf( Perl_debug_log,
e4584336 2653 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2654 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2655 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2656 });
a3621e74
YO
2657
2658 if ( base ) {
4cadc6a9
YO
2659 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2660 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2661
5b47454d
DM
2662 if (charid &&
2663 (base + charid > trie->uniquecharcount )
2664 && (base + charid - 1 - trie->uniquecharcount
2665 < trie->lasttrans)
2666 && trie->trans[base + charid - 1 -
2667 trie->uniquecharcount].check == state)
2668 {
2669 state = trie->trans[base + charid - 1 -
2670 trie->uniquecharcount ].next;
2671 }
2672 else {
2673 state = 0;
2674 }
2675 uc += len;
2676
2677 }
2678 else {
a3621e74
YO
2679 state = 0;
2680 }
2681 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2682 PerlIO_printf( Perl_debug_log,
2683 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2684 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2685 );
2686 }
166ba7cd 2687 if (!ST.accepted )
a3621e74 2688 sayNO;
a3621e74 2689
166ba7cd
DM
2690 DEBUG_EXECUTE_r(
2691 PerlIO_printf( Perl_debug_log,
2692 "%*s %sgot %"IVdf" possible matches%s\n",
2693 REPORT_CODE_OFF + PL_regindent * 2, "",
2694 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2695 );
2696 }
2697
2698 /* FALL THROUGH */
2699
2700 case TRIE_next_fail: /* we failed - try next alterative */
2701
2702 if ( ST.accepted == 1 ) {
2703 /* only one choice left - just continue */
2704 DEBUG_EXECUTE_r({
2705 reg_trie_data * const trie
2706 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2707 SV ** const tmp = RX_DEBUG(reginfo->prog)
2708 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2709 : NULL;
2710 PerlIO_printf( Perl_debug_log,
2711 "%*s %sonly one match left: #%d <%s>%s\n",
2712 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2713 ST.accept_buff[ 0 ].wordnum,
2714 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2715 PL_colors[5] );
2716 });
2717 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2718 /* in this case we free tmps/leave before we call regmatch
2719 as we wont be using accept_buff again. */
2720 FREETMPS;
2721 LEAVE;
2722 locinput = PL_reginput;
2723 nextchr = UCHARAT(locinput);
2724 scan = ST.B;
2725 continue; /* execute rest of RE */
2726 }
2727
2728 if (!ST.accepted-- ) {
2729 FREETMPS;
2730 LEAVE;
2731 sayNO;
2732 }
2733
a3621e74 2734 /*
166ba7cd
DM
2735 There are at least two accepting states left. Presumably
2736 the number of accepting states is going to be low,
2737 typically two. So we simply scan through to find the one
2738 with lowest wordnum. Once we find it, we swap the last
2739 state into its place and decrement the size. We then try to
2740 match the rest of the pattern at the point where the word
2741 ends. If we succeed, control just continues along the
2742 regex; if we fail we return here to try the next accepting
2743 state
2744 */
a3621e74 2745
166ba7cd
DM
2746 {
2747 U32 best = 0;
2748 U32 cur;
2749 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2750 DEBUG_TRIE_EXECUTE_r(
f2278c82 2751 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2752 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2753 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2754 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2755 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2756 );
2757
2758 if (ST.accept_buff[cur].wordnum <
2759 ST.accept_buff[best].wordnum)
2760 best = cur;
a3621e74 2761 }
166ba7cd
DM
2762
2763 DEBUG_EXECUTE_r({
2764 reg_trie_data * const trie
2765 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2766 SV ** const tmp = RX_DEBUG(reginfo->prog)
2767 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2768 : NULL;
2769 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2770 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2771 ST.accept_buff[best].wordnum,
2772 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2773 PL_colors[5] );
2774 });
2775
2776 if ( best<ST.accepted ) {
2777 reg_trie_accepted tmp = ST.accept_buff[ best ];
2778 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2779 ST.accept_buff[ ST.accepted ] = tmp;
2780 best = ST.accepted;
a3621e74 2781 }
166ba7cd
DM
2782 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2783 }
2784 PUSH_STATE_GOTO(TRIE_next, ST.B);
2785 /* NOTREACHED */
2786
2787#undef ST
2788
95b24440
DM
2789 case EXACT: {
2790 char *s = STRING(scan);
5d9a96ca 2791 st->ln = STR_LEN(scan);
eb160463 2792 if (do_utf8 != UTF) {
bc517b45 2793 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2794 char *l = locinput;
be8e71aa 2795 const char * const e = s + st->ln;
a72c7584 2796
5ff6fc6d
JH
2797 if (do_utf8) {
2798 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2799 while (s < e) {
a3b680e6 2800 STRLEN ulen;
1aa99e6b 2801 if (l >= PL_regeol)
5ff6fc6d
JH
2802 sayNO;
2803 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2804 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2805 uniflags))
5ff6fc6d 2806 sayNO;
bc517b45 2807 l += ulen;
5ff6fc6d 2808 s ++;
1aa99e6b 2809 }
5ff6fc6d
JH
2810 }
2811 else {
2812 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2813 while (s < e) {
a3b680e6 2814 STRLEN ulen;
1aa99e6b
IH
2815 if (l >= PL_regeol)
2816 sayNO;
5ff6fc6d 2817 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2818 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2819 uniflags))
1aa99e6b 2820 sayNO;
bc517b45 2821 s += ulen;
a72c7584 2822 l ++;
1aa99e6b 2823 }
5ff6fc6d 2824 }
1aa99e6b
IH
2825 locinput = l;
2826 nextchr = UCHARAT(locinput);
2827 break;
2828 }
bc517b45 2829 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2830 /* Inline the first character, for speed. */
2831 if (UCHARAT(s) != nextchr)
2832 sayNO;
5d9a96ca 2833 if (PL_regeol - locinput < st->ln)
d6a28714 2834 sayNO;
5d9a96ca 2835 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2836 sayNO;
5d9a96ca 2837 locinput += st->ln;
d6a28714
JH
2838 nextchr = UCHARAT(locinput);
2839 break;
95b24440 2840 }
d6a28714 2841 case EXACTFL:
b8c5462f
JH
2842 PL_reg_flags |= RF_tainted;
2843 /* FALL THROUGH */
95b24440 2844 case EXACTF: {
be8e71aa 2845 char * const s = STRING(scan);
5d9a96ca 2846 st->ln = STR_LEN(scan);
d6a28714 2847
d07ddd77
JH
2848 if (do_utf8 || UTF) {
2849 /* Either target or the pattern are utf8. */
be8e71aa 2850 const char * const l = locinput;
d07ddd77 2851 char *e = PL_regeol;
bc517b45 2852
5d9a96ca 2853 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2854 l, &e, 0, do_utf8)) {
5486206c
JH
2855 /* One more case for the sharp s:
2856 * pack("U0U*", 0xDF) =~ /ss/i,
2857 * the 0xC3 0x9F are the UTF-8
2858 * byte sequence for the U+00DF. */
2859 if (!(do_utf8 &&
2860 toLOWER(s[0]) == 's' &&
5d9a96ca 2861 st->ln >= 2 &&
5486206c
JH
2862 toLOWER(s[1]) == 's' &&
2863 (U8)l[0] == 0xC3 &&
2864 e - l >= 2 &&
2865 (U8)l[1] == 0x9F))
2866 sayNO;
2867 }
d07ddd77
JH
2868 locinput = e;
2869 nextchr = UCHARAT(locinput);
2870 break;
a0ed51b3 2871 }
d6a28714 2872
bc517b45
JH
2873 /* Neither the target and the pattern are utf8. */
2874
d6a28714
JH
2875 /* Inline the first character, for speed. */
2876 if (UCHARAT(s) != nextchr &&
2877 UCHARAT(s) != ((OP(scan) == EXACTF)
2878 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2879 sayNO;
5d9a96ca 2880 if (PL_regeol - locinput < st->ln)
b8c5462f 2881 sayNO;
5d9a96ca
DM
2882 if (st->ln > 1 && (OP(scan) == EXACTF
2883 ? ibcmp(s, locinput, st->ln)
2884 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2885 sayNO;
5d9a96ca 2886 locinput += st->ln;
d6a28714 2887 nextchr = UCHARAT(locinput);
a0d0e21e 2888 break;
95b24440 2889 }
d6a28714 2890 case ANYOF:
ffc61ed2 2891 if (do_utf8) {
9e55ce06
JH
2892 STRLEN inclasslen = PL_regeol - locinput;
2893
32fc9b6a 2894 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2895 sayNO_ANYOF;
ffc61ed2
JH
2896 if (locinput >= PL_regeol)
2897 sayNO;
0f0076b4 2898 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2899 nextchr = UCHARAT(locinput);
e0f9d4a8 2900 break;
ffc61ed2
JH
2901 }
2902 else {
2903 if (nextchr < 0)
2904 nextchr = UCHARAT(locinput);
32fc9b6a 2905 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2906 sayNO_ANYOF;
ffc61ed2
JH
2907 if (!nextchr && locinput >= PL_regeol)
2908 sayNO;
2909 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2910 break;
2911 }
2912 no_anyof:
2913 /* If we might have the case of the German sharp s
2914 * in a casefolding Unicode character class. */
2915
ebc501f0
JH
2916 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2917 locinput += SHARP_S_SKIP;
e0f9d4a8 2918 nextchr = UCHARAT(locinput);
ffc61ed2 2919 }
e0f9d4a8
JH
2920 else
2921 sayNO;
b8c5462f 2922 break;
d6a28714 2923 case ALNUML:
b8c5462f
JH
2924 PL_reg_flags |= RF_tainted;
2925 /* FALL THROUGH */
d6a28714 2926 case ALNUM:
b8c5462f 2927 if (!nextchr)
4633a7c4 2928 sayNO;
ffc61ed2 2929 if (do_utf8) {
1a4fad37 2930 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2931 if (!(OP(scan) == ALNUM
bb7a0f54 2932 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2933 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2934 {
2935 sayNO;
a0ed51b3 2936 }
b8c5462f 2937 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2938 nextchr = UCHARAT(locinput);
2939 break;
2940 }
ffc61ed2 2941 if (!(OP(scan) == ALNUM
d6a28714 2942 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2943 sayNO;
b8c5462f 2944 nextchr = UCHARAT(++locinput);
a0d0e21e 2945 break;
d6a28714 2946 case NALNUML:
b8c5462f
JH
2947 PL_reg_flags |= RF_tainted;
2948 /* FALL THROUGH */
d6a28714
JH
2949 case NALNUM:
2950 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2951 sayNO;
ffc61ed2 2952 if (do_utf8) {
1a4fad37 2953 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2954 if (OP(scan) == NALNUM
bb7a0f54 2955 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2956 : isALNUM_LC_utf8((U8*)locinput))
2957 {
b8c5462f 2958 sayNO;
d6a28714 2959 }
b8c5462f
JH
2960 locinput += PL_utf8skip[nextchr];
2961 nextchr = UCHARAT(locinput);
2962 break;
2963 }
ffc61ed2 2964 if (OP(scan) == NALNUM
d6a28714 2965 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2966 sayNO;
76e3520e 2967 nextchr = UCHARAT(++locinput);
a0d0e21e 2968 break;
d6a28714
JH
2969 case BOUNDL:
2970 case NBOUNDL:
3280af22 2971 PL_reg_flags |= RF_tainted;
bbce6d69 2972 /* FALL THROUGH */
d6a28714
JH
2973 case BOUND:
2974 case NBOUND:
2975 /* was last char in word? */
ffc61ed2 2976 if (do_utf8) {
12d33761 2977 if (locinput == PL_bostr)
5d9a96ca 2978 st->ln = '\n';
ffc61ed2 2979 else {
a3b680e6 2980 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2981
4ad0818d 2982 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
2983 }
2984 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2985 st->ln = isALNUM_uni(st->ln);
1a4fad37 2986 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2987 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2988 }
2989 else {
5d9a96ca 2990 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
2991 n = isALNUM_LC_utf8((U8*)locinput);
2992 }
a0ed51b3 2993 }
d6a28714 2994 else {
5d9a96ca 2995 st->ln = (locinput != PL_bostr) ?
12d33761 2996 UCHARAT(locinput - 1) : '\n';
ffc61ed2 2997 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2998 st->ln = isALNUM(st->ln);
ffc61ed2
JH
2999 n = isALNUM(nextchr);
3000 }
3001 else {
5d9a96ca 3002 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3003 n = isALNUM_LC(nextchr);
3004 }
d6a28714 3005 }
5d9a96ca 3006 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3007 OP(scan) == BOUNDL))
3008 sayNO;
a0ed51b3 3009 break;
d6a28714 3010 case SPACEL:
3280af22 3011 PL_reg_flags |= RF_tainted;
bbce6d69 3012 /* FALL THROUGH */
d6a28714 3013 case SPACE:
9442cb0e 3014 if (!nextchr)
4633a7c4 3015 sayNO;
1aa99e6b 3016 if (do_utf8) {
fd400ab9 3017 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3018 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3019 if (!(OP(scan) == SPACE
bb7a0f54 3020 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3021 : isSPACE_LC_utf8((U8*)locinput)))
3022 {
3023 sayNO;
3024 }
3025 locinput += PL_utf8skip[nextchr];
3026 nextchr = UCHARAT(locinput);
3027 break;
d6a28714 3028 }
ffc61ed2
JH
3029 if (!(OP(scan) == SPACE
3030 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3031 sayNO;
3032 nextchr = UCHARAT(++locinput);
3033 }
3034 else {
3035 if (!(OP(scan) == SPACE
3036 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3037 sayNO;
3038 nextchr = UCHARAT(++locinput);
a0ed51b3 3039 }
a0ed51b3 3040 break;
d6a28714 3041 case NSPACEL:
3280af22 3042 PL_reg_flags |= RF_tainted;
bbce6d69 3043 /* FALL THROUGH */
d6a28714 3044 case NSPACE:
9442cb0e 3045 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3046 sayNO;
1aa99e6b 3047 if (do_utf8) {
1a4fad37 3048 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3049 if (OP(scan) == NSPACE
bb7a0f54 3050 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3051 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3052 {
3053 sayNO;
3054 }
3055 locinput += PL_utf8skip[nextchr];
3056 nextchr = UCHARAT(locinput);
3057 break;
a0ed51b3 3058 }
ffc61ed2 3059 if (OP(scan) == NSPACE
d6a28714 3060 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3061 sayNO;
76e3520e 3062 nextchr = UCHARAT(++locinput);
a0d0e21e 3063 break;
d6a28714 3064 case DIGITL:
a0ed51b3
LW
3065 PL_reg_flags |= RF_tainted;
3066 /* FALL THROUGH */
d6a28714 3067 case DIGIT:
9442cb0e 3068 if (!nextchr)
a0ed51b3 3069 sayNO;
1aa99e6b 3070 if (do_utf8) {
1a4fad37 3071 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3072 if (!(OP(scan) == DIGIT
bb7a0f54 3073 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3074 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3075 {
a0ed51b3 3076 sayNO;
dfe13c55 3077 }
6f06b55f 3078 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3079 nextchr = UCHARAT(locinput);
3080 break;
3081 }
ffc61ed2 3082 if (!(OP(scan) == DIGIT
9442cb0e 3083 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3084 sayNO;
3085 nextchr = UCHARAT(++locinput);
3086 break;
d6a28714 3087 case NDIGITL:
b8c5462f
JH
3088 PL_reg_flags |= RF_tainted;
3089 /* FALL THROUGH */
d6a28714 3090 case NDIGIT:
9442cb0e 3091 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3092 sayNO;
1aa99e6b 3093 if (do_utf8) {
1a4fad37 3094 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3095 if (OP(scan) == NDIGIT
bb7a0f54 3096 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3097 : isDIGIT_LC_utf8((U8*)locinput))
3098 {
a0ed51b3 3099 sayNO;
9442cb0e 3100 }
6f06b55f 3101 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3102 nextchr = UCHARAT(locinput);
3103 break;
3104 }
ffc61ed2 3105 if (OP(scan) == NDIGIT
9442cb0e 3106 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3107 sayNO;
3108 nextchr = UCHARAT(++locinput);
3109 break;
3110 case CLUMP:
b7c83a7e 3111 if (locinput >= PL_regeol)
a0ed51b3 3112 sayNO;
b7c83a7e 3113 if (do_utf8) {
1a4fad37 3114 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3115 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3116 sayNO;
3117 locinput += PL_utf8skip[nextchr];
3118 while (locinput < PL_regeol &&
3119 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3120 locinput += UTF8SKIP(locinput);
3121 if (locinput > PL_regeol)
3122 sayNO;
eb08e2da
JH
3123 }
3124 else
3125 locinput++;
a0ed51b3
LW
3126 nextchr = UCHARAT(locinput);
3127 break;
c8756f30 3128 case REFFL:
3280af22 3129 PL_reg_flags |= RF_tainted;
c8756f30 3130 /* FALL THROUGH */
c277df42 3131 case REF:
95b24440
DM
3132 case REFF: {
3133 char *s;
c277df42 3134 n = ARG(scan); /* which paren pair */
5d9a96ca 3135 st->ln = PL_regstartp[n];
2c2d71f5 3136 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3137 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3138 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3139 if (st->ln == PL_regendp[n])
a0d0e21e 3140 break;
a0ed51b3 3141
5d9a96ca 3142 s = PL_bostr + st->ln;
1aa99e6b 3143 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3144 char *l = locinput;
a3b680e6 3145 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3146 /*
3147 * Note that we can't do the "other character" lookup trick as
3148 * in the 8-bit case (no pun intended) because in Unicode we
3149 * have to map both upper and title case to lower case.
3150 */
3151 if (OP(scan) == REFF) {
3152 while (s < e) {
a3b680e6
AL
3153 STRLEN ulen1, ulen2;
3154 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3155 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3156
a0ed51b3
LW
3157 if (l >= PL_regeol)
3158 sayNO;
a2a2844f
JH
3159 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3160 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3161 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3162 sayNO;
a2a2844f
JH
3163 s += ulen1;
3164 l += ulen2;
a0ed51b3
LW
3165 }
3166 }
3167 locinput = l;
3168 nextchr = UCHARAT(locinput);
3169 break;
3170 }
3171
a0d0e21e 3172 /* Inline the first character, for speed. */
76e3520e 3173 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3174 (OP(scan) == REF ||
3175 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3176 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3177 sayNO;
5d9a96ca
DM
3178 st->ln = PL_regendp[n] - st->ln;
3179 if (locinput + st->ln > PL_regeol)
4633a7c4 3180 sayNO;
5d9a96ca
DM
3181 if (st->ln > 1 && (OP(scan) == REF
3182 ? memNE(s, locinput, st->ln)
c8756f30 3183 : (OP(scan) == REFF
5d9a96ca
DM
3184 ? ibcmp(s, locinput, st->ln)
3185 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3186 sayNO;
5d9a96ca 3187 locinput += st->ln;
76e3520e 3188 nextchr = UCHARAT(locinput);
a0d0e21e 3189 break;
95b24440 3190 }
a0d0e21e
LW
3191
3192 case NOTHING:
c277df42 3193 case TAIL:
a0d0e21e
LW
3194 break;
3195 case BACK:
3196 break;
40a82448
DM
3197
3198#undef ST
3199#define ST st->u.eval
3200
3201 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3202 {
c277df42 3203 SV *ret;
8e5e9ebe 3204 {
4aabdb9b
DM
3205 /* execute the code in the {...} */
3206 dSP;
6136c704 3207 SV ** const before = SP;
4aabdb9b
DM
3208 OP_4tree * const oop = PL_op;
3209 COP * const ocurcop = PL_curcop;
3210 PAD *old_comppad;
4aabdb9b
DM
3211
3212 n = ARG(scan);
32fc9b6a 3213 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3214 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3215 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3216 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3217
8e5e9ebe
RGS
3218 CALLRUNOPS(aTHX); /* Scalar context. */
3219 SPAGAIN;
3220 if (SP == before)
075aa684 3221 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3222 else {
3223 ret = POPs;
3224 PUTBACK;
3225 }
4aabdb9b
DM
3226
3227 PL_op = oop;
3228 PAD_RESTORE_LOCAL(old_comppad);
3229 PL_curcop = ocurcop;
3230 if (!st->logical) {
3231 /* /(?{...})/ */
3232 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3233 break;
3234 }
8e5e9ebe 3235 }
4aabdb9b
DM
3236 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3237 regexp *re;
4aabdb9b 3238 {
4f639d21
DM
3239 /* extract RE object from returned value; compiling if
3240 * necessary */
3241
6136c704 3242 MAGIC *mg = NULL;
be8e71aa 3243 const SV *sv;
faf82a0b
AE
3244 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3245 mg = mg_find(sv, PERL_MAGIC_qr);
3246 else if (SvSMAGICAL(ret)) {
3247 if (SvGMAGICAL(ret))
3248 sv_unmagic(ret, PERL_MAGIC_qr);
3249 else
3250 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3251 }
faf82a0b 3252
0f5d15d6
IZ
3253 if (mg) {
3254 re = (regexp *)mg->mg_obj;
df0003d4 3255 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3256 }
3257 else {
3258 STRLEN len;
6136c704 3259 const char * const t = SvPV_const(ret, len);
0f5d15d6 3260 PMOP pm;
a3b680e6 3261 const I32 osize = PL_regsize;
0f5d15d6 3262
5fcd1c1b 3263 Zero(&pm, 1, PMOP);
4aabdb9b 3264 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3265 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3266 if (!(SvFLAGS(ret)
faf82a0b
AE
3267 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3268 | SVs_GMG)))
14befaf4
DM
3269 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3270 PERL_MAGIC_qr,0,0);
0f5d15d6 3271 PL_regsize = osize;
0f5d15d6 3272 }
4aabdb9b 3273 }
aa283a38
DM
3274
3275 /* run the pattern returned from (??{...}) */
3276
4aabdb9b
DM
3277 DEBUG_EXECUTE_r(
3278 PerlIO_printf(Perl_debug_log,
3279 "Entering embedded \"%s%.60s%s%s\"\n",
3280 PL_colors[0],
3281 re->precomp,
3282 PL_colors[1],
3283 (strlen(re->precomp) > 60 ? "..." : ""))
3284 );
2c2d71f5 3285
40a82448
DM
3286 ST.cp = regcppush(0); /* Save *all* the positions. */
3287 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3288 *PL_reglastparen = 0;
3289 *PL_reglastcloseparen = 0;
4aabdb9b 3290 PL_reginput = locinput;
4aabdb9b
DM
3291
3292 /* XXXX This is too dramatic a measure... */
3293 PL_reg_maxiter = 0;
3294
5d9a96ca 3295 st->logical = 0;
40a82448 3296 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3297 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3298 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3299 ST.prev_rex = rex;
aa283a38 3300 rex = re;
aa283a38 3301
40a82448 3302 ST.B = next;
aa283a38 3303 /* now continue from first node in postoned RE */
40a82448 3304 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3305 /* NOTREACHED */
a0ed51b3 3306 }
4aabdb9b
DM
3307 /* /(?(?{...})X|Y)/ */
3308 st->sw = SvTRUE(ret);
3309 st->logical = 0;
c277df42
IZ
3310 break;
3311 }
40a82448
DM
3312
3313 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3314 if (ST.toggleutf)
3315 PL_reg_flags ^= RF_utf8;
3316 ReREFCNT_dec(rex);
3317 rex = ST.prev_rex;
3318 /* XXXX This is too dramatic a measure... */
3319 PL_reg_maxiter = 0;
3320 /* Restore parens of the caller without popping the
3321 * savestack */
3322 {
3323 const I32 tmp = PL_savestack_ix;
3324 PL_savestack_ix = ST.lastcp;
3325 regcppop(rex);
3326 PL_savestack_ix = tmp;
3327 }
3328 PL_reginput = locinput;
3329 /* continue at the node following the (??{...}) */
3330 scan = ST.B;
3331 continue;
3332
3333 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3334 /* Restore state to the outer re then re-throw the failure */
3335 if (ST.toggleutf)
3336 PL_reg_flags ^= RF_utf8;
3337 ReREFCNT_dec(rex);
3338 rex = ST.prev_rex;
3339
3340 /* XXXX This is too dramatic a measure... */
3341 PL_reg_maxiter = 0;
3342
3343 PL_reginput = locinput;
3344 REGCP_UNWIND(ST.lastcp);
3345 regcppop(rex);
3346 sayNO_SILENT;
3347
3348#undef ST
3349
a0d0e21e 3350 case OPEN:
c277df42 3351 n = ARG(scan); /* which paren pair */
3280af22
NIS
3352 PL_reg_start_tmp[n] = locinput;
3353 if (n > PL_regsize)
3354 PL_regsize = n;
a0d0e21e
LW
3355 break;
3356 case CLOSE:
c277df42 3357 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3358 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3359 PL_regendp[n] = locinput - PL_bostr;
eb160463 3360 if (n > (I32)*PL_reglastparen)
3280af22 3361 *PL_reglastparen = n;
a01268b5 3362 *PL_reglastcloseparen = n;
a0d0e21e 3363 break;
c277df42
IZ
3364 case GROUPP:
3365 n = ARG(scan); /* which paren pair */
5d9a96ca 3366 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3367 break;
3368 case IFTHEN:
2c2d71f5 3369 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3370 if (st->sw)
c277df42
IZ
3371 next = NEXTOPER(NEXTOPER(scan));
3372 else {
3373 next = scan + ARG(scan);
3374 if (OP(next) == IFTHEN) /* Fake one. */
3375 next = NEXTOPER(NEXTOPER(next));
3376 }
3377 break;
3378 case LOGICAL:
5d9a96ca 3379 st->logical = scan->flags;
c277df42 3380 break;
2ab05381 3381/*******************************************************************
a0374537
DM
3382 cc points to the regmatch_state associated with the most recent CURLYX.
3383 This struct contains info about the innermost (...)* loop (an
3384 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3385
3386 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3387
95b24440 3388 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3389
a0374537 3390 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3391 with the starting point at WHILEM node;
2ab05381
IZ
3392
3393 3) Each hit of WHILEM node tries to match A and Z (in the order
3394 depending on the current iteration, min/max of {min,max} and
3395 greediness). The information about where are nodes for "A"
a0374537 3396 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3397 was already matched, and greediness.
3398
3399 4) After A matches, the same WHILEM node is hit again.
3400
95b24440 3401 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3402 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3403 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3404 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3405 of the external loop.
3406
a0374537 3407 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3408 and whatever it mentions via ->next, and additional attached trees
3409 corresponding to temporarily unset infoblocks as in "5" above.
3410
95b24440 3411 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3412 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3413 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3414 infoblocks are drawn below the "reset" infoblock.
3415
3416 In fact in the picture below we do not show failed matches for Z and T
3417 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3418 more obvious *why* one needs to *temporary* unset infoblocks.]
3419
3420 Matched REx position InfoBlocks Comment
3421 (Y(A)*?Z)*?T x
3422 Y(A)*?Z)*?T x <- O
3423 Y (A)*?Z)*?T x <- O
3424 Y A)*?Z)*?T x <- O <- I
3425 YA )*?Z)*?T x <- O <- I
3426 YA A)*?Z)*?T x <- O <- I
3427 YAA )*?Z)*?T x <- O <- I
3428 YAA Z)*?T x <- O # Temporary unset I
3429 I
3430
3431 YAAZ Y(A)*?Z)*?T x <- O
3432 I
3433
3434 YAAZY (A)*?Z)*?T x <- O
3435 I
3436
3437 YAAZY A)*?Z)*?T x <- O <- I
3438 I
3439
3440 YAAZYA )*?Z)*?T x <- O <- I
3441 I
3442
3443 YAAZYA Z)*?T x <- O # Temporary unset I
3444 I,I
3445
3446 YAAZYAZ )*?T x <- O
3447 I,I
3448
3449 YAAZYAZ T x # Temporary unset O
3450 O
3451 I,I
3452
3453 YAAZYAZT x
3454 O
3455 I,I
3456 *******************************************************************/
95b24440 3457
a0d0e21e 3458 case CURLYX: {
cb434fcc
IZ
3459 /* No need to save/restore up to this paren */
3460 I32 parenfloor = scan->flags;
c277df42 3461
c2b7afd3
NC
3462 /* Dave says:
3463
3464 CURLYX and WHILEM are always paired: they're the moral
3465 equivalent of pp_enteriter anbd pp_iter.
3466
3467 The only time next could be null is if the node tree is
3468 corrupt. This was mentioned on p5p a few days ago.
3469
3470 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3471 So we'll assert that this is true:
3472 */
3473 assert(next);
30b2893d 3474 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3475 next += ARG(next);
cb434fcc
IZ
3476 /* XXXX Probably it is better to teach regpush to support
3477 parenfloor > PL_regsize... */
eb160463 3478 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3479 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3480
d8319b27
DM
3481 st->u.curlyx.cp = PL_savestack_ix;
3482 st->u.curlyx.outercc = st->cc;
a0374537
DM
3483 st->cc = st;
3484 /* these fields contain the state of the current curly.
3485 * they are accessed by subsequent WHILEMs;
3486 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3487 st->u.curlyx.parenfloor = parenfloor;
3488 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3489 st->u.curlyx.min = ARG1(scan);
3490 st->u.curlyx.max = ARG2(scan);
3491 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3492 st->u.curlyx.lastloc = 0;
a0374537
DM
3493 /* st->next and st->minmod are also read by WHILEM */
3494
3280af22 3495 PL_reginput = locinput;
95b24440
DM
3496 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3497 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3498 regcpblow(st->u.curlyx.cp);
3499 st->cc = st->u.curlyx.outercc;
95b24440 3500 saySAME(result);
a0d0e21e 3501 }
5f66b61c 3502 /* NOTREACHED */
a0d0e21e
LW
3503 case WHILEM: {
3504 /*
3505 * This is really hard to understand, because after we match
3506 * what we're trying to match, we must make sure the rest of
2c2d71f5 3507 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3508 * to go back UP the parse tree by recursing ever deeper. And
3509 * if it fails, we have to reset our parent's current state
3510 * that we can try again after backing off.
3511 */
3512
c2b7afd3
NC
3513 /* Dave says:
3514
3515 st->cc gets initialised by CURLYX ready for use by WHILEM.
3516 So again, unless somethings been corrupted, st->cc cannot
3517 be null at that point in WHILEM.
3518
3519 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3520 So we'll assert that this is true:
3521 */
3522 assert(st->cc);
d8319b27
DM
3523 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3524 st->u.whilem.cache_offset = 0;
3525 st->u.whilem.cache_bit = 0;
c277df42 3526
d8319b27 3527 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3528 PL_reginput = locinput;
a0d0e21e 3529
a3621e74 3530 DEBUG_EXECUTE_r(
9041c2e3 3531 PerlIO_printf(Perl_debug_log,
91f3b821 3532 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3533 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3534 (long)n, (long)st->cc->u.curlyx.min,
3535 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3536 );
4633a7c4 3537
a0d0e21e
LW
3538 /* If degenerate scan matches "", assume scan done. */
3539
d8319b27
DM
3540 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3541 st->u.whilem.savecc = st->cc;
3542 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3543 if (st->cc)
d8319b27 3544 st->ln = st->cc->u.curlyx.cur;
a3621e74 3545 DEBUG_EXECUTE_r(
c3464db5
DD
3546 PerlIO_printf(Perl_debug_log,
3547 "%*s empty match detected, try continuation...\n",
3280af22 3548 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3549 );
d8319b27 3550 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3551 /*** all unsaved local vars undefined at this point */
d8319b27 3552 st->cc = st->u.whilem.savecc;
95b24440 3553 if (result)
4633a7c4 3554 sayYES;
d8319b27
DM
3555 if (st->cc->u.curlyx.outercc)
3556 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3557 sayNO;
a0d0e21e
LW
3558 }
3559
3560 /* First just match a string of min scans. */
3561
d8319b27
DM
3562 if (n < st->cc->u.curlyx.min) {
3563 st->cc->u.curlyx.cur = n;
3564 st->cc->u.curlyx.lastloc = locinput;
3565 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3566 /*** all unsaved local vars undefined at this point */
3567 if (result)
4633a7c4 3568 sayYES;
d8319b27
DM
3569 st->cc->u.curlyx.cur = n - 1;
3570 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3571 sayNO;
a0d0e21e
LW
3572 }
3573
2c2d71f5
JH
3574 if (scan->flags) {
3575 /* Check whether we already were at this position.
3576 Postpone detection until we know the match is not
3577 *that* much linear. */
3578 if (!PL_reg_maxiter) {
3579 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3580 /* possible overflow for long strings and many CURLYX's */
3581 if (PL_reg_maxiter < 0)
3582 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3583 PL_reg_leftiter = PL_reg_maxiter;
3584 }
3585 if (PL_reg_leftiter-- == 0) {
a3b680e6 3586 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3587 if (PL_reg_poscache) {
eb160463 3588 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3589 Renew(PL_reg_poscache, size, char);
3590 PL_reg_poscache_size = size;
3591 }
3592 Zero(PL_reg_poscache, size, char);
3593 }
3594 else {
3595 PL_reg_poscache_size = size;
a02a5408 3596 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3597 }
a3621e74 3598 DEBUG_EXECUTE_r(
2c2d71f5
JH
3599 PerlIO_printf(Perl_debug_log,
3600 "%sDetected a super-linear match, switching on caching%s...\n",
3601 PL_colors[4], PL_colors[5])
3602 );
3603 }
3604 if (PL_reg_leftiter < 0) {
d8319b27 3605 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3606
d8319b27
DM
3607 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3608 + st->u.whilem.cache_offset * (scan->flags>>4);
3609 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3610 st->u.whilem.cache_offset /= 8;
3611 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3612 DEBUG_EXECUTE_r(
2c2d71f5
JH
3613 PerlIO_printf(Perl_debug_log,
3614 "%*s already tried at this position...\n",
3615 REPORT_CODE_OFF+PL_regindent*2, "")
3616 );
3ab3c9b4
HS
3617 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3618 /* cache records success */
c2b0868c
HS
3619 sayYES;
3620 else
3ab3c9b4 3621 /* cache records failure */
c2b0868c 3622 sayNO_SILENT;
2c2d71f5 3623 }
2c2d71f5
JH
3624 }
3625 }
3626
a0d0e21e
LW
3627 /* Prefer next over scan for minimal matching. */
3628
5d9a96ca 3629 if (st->cc->minmod) {
d8319b27
DM
3630 st->u.whilem.savecc = st->cc;
3631 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3632 if (st->cc)
d8319b27
DM
3633 st->ln = st->cc->u.curlyx.cur;
3634 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3635 REGCP_SET(st->u.whilem.lastcp);
3636 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3637 /*** all unsaved local vars undefined at this point */
d8319b27 3638 st->cc = st->u.whilem.savecc;
95b24440 3639 if (result) {
d8319b27 3640 regcpblow(st->u.whilem.cp);
3ab3c9b4 3641 CACHEsayYES; /* All done. */
5f05dabc 3642 }
d8319b27 3643 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3644 regcppop(rex);
d8319b27
DM
3645 if (st->cc->u.curlyx.outercc)
3646 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3647
d8319b27 3648 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3649 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3650 && !(PL_reg_flags & RF_warned)) {
3651 PL_reg_flags |= RF_warned;
9014280d 3652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3653 "Complex regular subexpression recursion",
3654 REG_INFTY - 1);
c277df42 3655 }
3ab3c9b4 3656 CACHEsayNO;
c277df42 3657 }
a687059c 3658
a3621e74 3659 DEBUG_EXECUTE_r(
c3464db5
DD
3660 PerlIO_printf(Perl_debug_log,
3661 "%*s trying longer...\n",
3280af22 3662 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3663 );
a0d0e21e 3664 /* Try scanning more and see if it helps. */
3280af22 3665 PL_reginput = locinput;
d8319b27
DM
3666 st->cc->u.curlyx.cur = n;
3667 st->cc->u.curlyx.lastloc = locinput;
3668 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3669 REGCP_SET(st->u.whilem.lastcp);
3670 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3671 /*** all unsaved local vars undefined at this point */
3672 if (result) {
d8319b27 3673 regcpblow(st->u.whilem.cp);
3ab3c9b4 3674 CACHEsayYES;
5f05dabc 3675 }
d8319b27 3676 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3677 regcppop(rex);
d8319b27
DM
3678 st->cc->u.curlyx.cur = n - 1;
3679 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3680 CACHEsayNO;
a0d0e21e
LW
3681 }
3682
3683 /* Prefer scan over next for maximal matching. */
3684
d8319b27
DM
3685 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3686 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3687 st->cc->u.curlyx.cur = n;
3688 st->cc->u.curlyx.lastloc = locinput;
3689 REGCP_SET(st->u.whilem.lastcp);
3690 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3691 /*** all unsaved local vars undefined at this point */
3692 if (result) {
d8319b27 3693 regcpblow(st->u.whilem.cp);
3ab3c9b4 3694 CACHEsayYES;
5f05dabc 3695 }
d8319b27 3696 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3697 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3698 PL_reginput = locinput;
a3621e74 3699 DEBUG_EXECUTE_r(
c3464db5
DD
3700 PerlIO_printf(Perl_debug_log,
3701 "%*s failed, try continuation...\n",
3280af22 3702 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3703 );
3704 }
9041c2e3 3705 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3706 && !(PL_reg_flags & RF_warned)) {
3280af22 3707 PL_reg_flags |= RF_warned;
9014280d 3708 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3709 "Complex regular subexpression recursion",
3710 REG_INFTY - 1);
a0d0e21e
LW
3711 }
3712
3713 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3714 st->u.whilem.savecc = st->cc;
3715 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3716 if (st->cc)
d8319b27
DM
3717 st->ln = st->cc->u.curlyx.cur;
3718 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3719 /*** all unsaved local vars undefined at this point */
d8319b27 3720 st->cc = st->u.whilem.savecc;
95b24440 3721 if (result)
3ab3c9b4 3722 CACHEsayYES;
d8319b27
DM
3723 if (st->cc->u.curlyx.outercc)
3724 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3725 st->cc->u.curlyx.cur = n - 1;
3726 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3727 CACHEsayNO;
a0d0e21e 3728 }
5f66b61c 3729 /* NOTREACHED */
40a82448
DM
3730
3731#undef ST
3732#define ST st->u.branch
3733
3734 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3735 next = scan + ARG(scan);
3736 if (next == scan)
3737 next = NULL;
40a82448
DM
3738 scan = NEXTOPER(scan);
3739 /* FALL THROUGH */
c277df42 3740
40a82448
DM
3741 case BRANCH: /* /(...|A|...)/ */
3742 scan = NEXTOPER(scan); /* scan now points to inner node */
3743 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3744 /* last branch; skip state push and jump direct to node */
3745 continue;
3746 ST.lastparen = *PL_reglastparen;
3747 ST.next_branch = next;
3748 REGCP_SET(ST.cp);
3749 PL_reginput = locinput;
02db2b7b 3750
40a82448
DM
3751 /* Now go into the branch */
3752 PUSH_STATE_GOTO(BRANCH_next, scan);
3753 /* NOTREACHED */
3754
3755 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3756 REGCP_UNWIND(ST.cp);
3757 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3758 PL_regendp[n] = -1;
3759 *PL_reglastparen = n;
3760 scan = ST.next_branch;
3761 /* no more branches? */
3762 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3763 sayNO;
3764 continue; /* execute next BRANCH[J] op */
3765 /* NOTREACHED */
3766
a0d0e21e 3767 case MINMOD:
5d9a96ca 3768 st->minmod = 1;
a0d0e21e 3769 break;
40a82448
DM
3770
3771#undef ST
3772#define ST st->u.curlym
3773
3774 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3775
3776 /* This is an optimisation of CURLYX that enables us to push
3777 * only a single backtracking state, no matter now many matches
3778 * there are in {m,n}. It relies on the pattern being constant
3779 * length, with no parens to influence future backrefs
3780 */
3781
3782 ST.me = scan;
dc45a647 3783 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3784
3785 /* if paren positive, emulate an OPEN/CLOSE around A */
3786 if (ST.me->flags) {
3787 I32 paren = ST.me->flags;
3788 if (paren > PL_regsize)
3789 PL_regsize = paren;
3790 if (paren > (I32)*PL_reglastparen)
3791 *PL_reglastparen = paren;
c277df42 3792 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3793 }
40a82448
DM
3794 ST.A = scan;
3795 ST.B = next;
3796 ST.alen = 0;
3797 ST.count = 0;
3798 ST.minmod = st->minmod;
3799 st->minmod = 0;
3800 ST.c1 = CHRTEST_UNINIT;
3801 REGCP_SET(ST.cp);
6407bf3b 3802
40a82448
DM
3803 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3804 goto curlym_do_B;
3805
3806 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3807 PL_reginput = locinput;
40a82448
DM
3808 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3809 /* NOTREACHED */
5f80c4cf 3810
40a82448
DM
3811 case CURLYM_A: /* we've just matched an A */
3812 locinput = st->locinput;
3813 nextchr = UCHARAT(locinput);
3814
3815 ST.count++;
3816 /* after first match, determine A's length: u.curlym.alen */
3817 if (ST.count == 1) {
3818 if (PL_reg_match_utf8) {
3819 char *s = locinput;
3820 while (s < PL_reginput) {
3821 ST.alen++;
3822 s += UTF8SKIP(s);
3823 }
3824 }
3825 else {
3826 ST.alen = PL_reginput - locinput;
3827 }
3828 if (ST.alen == 0)
3829 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3830 }
0cadcf80
DM
3831 DEBUG_EXECUTE_r(
3832 PerlIO_printf(Perl_debug_log,
40a82448 3833 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 3834 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 3835 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3836 );
3837
40a82448
DM
3838 locinput = PL_reginput;
3839 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3840 goto curlym_do_A; /* try to match another A */
3841 goto curlym_do_B; /* try to match B */
5f80c4cf 3842
40a82448
DM
3843 case CURLYM_A_fail: /* just failed to match an A */
3844 REGCP_UNWIND(ST.cp);
3845 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3846 sayNO;
0cadcf80 3847
40a82448
DM
3848 curlym_do_B: /* execute the B in /A{m,n}B/ */
3849 PL_reginput = locinput;
3850 if (ST.c1 == CHRTEST_UNINIT) {
3851 /* calculate c1 and c2 for possible match of 1st char
3852 * following curly */
3853 ST.c1 = ST.c2 = CHRTEST_VOID;
3854 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3855 regnode *text_node = ST.B;
3856 if (! HAS_TEXT(text_node))
3857 FIND_NEXT_IMPT(text_node);
3858 if (HAS_TEXT(text_node)
3859 && PL_regkind[OP(text_node)] != REF)
3860 {
3861 ST.c1 = (U8)*STRING(text_node);
3862 ST.c2 =
3863 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3864 ? PL_fold[ST.c1]
3865 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3866 ? PL_fold_locale[ST.c1]
3867 : ST.c1;
c277df42 3868 }
c277df42 3869 }
40a82448
DM
3870 }
3871
3872 DEBUG_EXECUTE_r(
3873 PerlIO_printf(Perl_debug_log,
3874 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3875 (int)(REPORT_CODE_OFF+PL_regindent*2),
3876 "", (IV)ST.count)
3877 );
3878 if (ST.c1 != CHRTEST_VOID
3879 && UCHARAT(PL_reginput) != ST.c1
3880 && UCHARAT(PL_reginput) != ST.c2)
3881 {
3882 /* simulate B failing */
3883 state_num = CURLYM_B_fail;
3884 goto reenter_switch;
3885 }
3886
3887 if (ST.me->flags) {
3888 /* mark current A as captured */
3889 I32 paren = ST.me->flags;
3890 if (ST.count) {
3891 PL_regstartp[paren]
3892 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3893 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3894 }
40a82448
DM
3895 else
3896 PL_regendp[paren] = -1;
c277df42 3897 }
40a82448 3898 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 3899 /* NOTREACHED */
40a82448
DM
3900
3901 case CURLYM_B_fail: /* just failed to match a B */
3902 REGCP_UNWIND(ST.cp);
3903 if (ST.minmod) {
3904 if (ST.count == ARG2(ST.me) /* max */)
3905 sayNO;
3906 goto curlym_do_A; /* try to match a further A */
3907 }
3908 /* backtrack one A */
3909 if (ST.count == ARG1(ST.me) /* min */)
3910 sayNO;
3911 ST.count--;
3912 locinput = HOPc(locinput, -ST.alen);
3913 goto curlym_do_B; /* try to match B */
3914
c255a977
DM
3915#undef ST
3916#define ST st->u.curly
40a82448 3917
c255a977
DM
3918#define CURLY_SETPAREN(paren, success) \
3919 if (paren) { \
3920 if (success) { \
3921 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3922 PL_regendp[paren] = locinput - PL_bostr; \
3923 } \
3924 else \
3925 PL_regendp[paren] = -1; \
3926 }
3927
3928 case STAR: /* /A*B/ where A is width 1 */
3929 ST.paren = 0;
3930 ST.min = 0;
3931 ST.max = REG_INFTY;
a0d0e21e
LW
3932 scan = NEXTOPER(scan);
3933 goto repeat;
c255a977
DM
3934 case PLUS: /* /A+B/ where A is width 1 */
3935 ST.paren = 0;
3936 ST.min = 1;
3937 ST.max = REG_INFTY;
c277df42 3938 scan = NEXTOPER(scan);
c255a977
DM
3939 goto repeat;
3940 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3941 ST.paren = scan->flags; /* Which paren to set */
3942 if (ST.paren > PL_regsize)
3943 PL_regsize = ST.paren;
3944 if (ST.paren > (I32)*PL_reglastparen)
3945 *PL_reglastparen = ST.paren;
3946 ST.min = ARG1(scan); /* min to match */
3947 ST.max = ARG2(scan); /* max to match */
3948 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3949 goto repeat;
3950 case CURLY: /* /A{m,n}B/ where A is width 1 */
3951 ST.paren = 0;
3952 ST.min = ARG1(scan); /* min to match */
3953 ST.max = ARG2(scan); /* max to match */
3954 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 3955 repeat:
a0d0e21e
LW
3956 /*
3957 * Lookahead to avoid useless match attempts
3958 * when we know what character comes next.
c255a977 3959 *
5f80c4cf
JP
3960 * Used to only do .*x and .*?x, but now it allows
3961 * for )'s, ('s and (?{ ... })'s to be in the way
3962 * of the quantifier and the EXACT-like node. -- japhy
3963 */
3964
c255a977
DM
3965 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3966 sayNO;
cca55fe3 3967 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3968 U8 *s;
3969 regnode *text_node = next;
3970
3dab1dad
YO
3971 if (! HAS_TEXT(text_node))
3972 FIND_NEXT_IMPT(text_node);
5f80c4cf 3973
9e137952 3974 if (! HAS_TEXT(text_node))
c255a977 3975 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 3976 else {
3dab1dad 3977 if (PL_regkind[OP(text_node)] == REF) {
c255a977 3978 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 3979 goto assume_ok_easy;
cca55fe3 3980 }
be8e71aa
YO
3981 else
3982 s = (U8*)STRING(text_node);
5f80c4cf
JP
3983
3984 if (!UTF) {
c255a977 3985 ST.c2 = ST.c1 = *s;
f65d3ee7 3986 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 3987 ST.c2 = PL_fold[ST.c1];
f65d3ee7 3988 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 3989 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 3990 }
5f80c4cf 3991 else { /* UTF */
f65d3ee7 3992 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3993 STRLEN ulen1, ulen2;
89ebb4a3
JH
3994 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3995 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3996
3997 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3998 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3999
c255a977 4000 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4001 uniflags);
c255a977 4002 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4003 uniflags);
5f80c4cf
JP
4004 }
4005 else {
c255a977 4006 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4007 uniflags);
5f80c4cf 4008 }
1aa99e6b
IH
4009 }
4010 }
bbce6d69 4011 }
a0d0e21e 4012 else
c255a977 4013 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4014 assume_ok_easy:
c255a977
DM
4015
4016 ST.A = scan;
4017 ST.B = next;
3280af22 4018 PL_reginput = locinput;
5d9a96ca
DM
4019 if (st->minmod) {
4020 st->minmod = 0;
c255a977 4021 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4022 sayNO;
c255a977 4023 ST.count = ST.min;
a0ed51b3 4024 locinput = PL_reginput;
c255a977
DM
4025 REGCP_SET(ST.cp);
4026 if (ST.c1 == CHRTEST_VOID)
4027 goto curly_try_B_min;
4028
4029 ST.oldloc = locinput;
4030
4031 /* set ST.maxpos to the furthest point along the
4032 * string that could possibly match */
4033 if (ST.max == REG_INFTY) {
4034 ST.maxpos = PL_regeol - 1;
4035 if (do_utf8)
4036 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4037 ST.maxpos--;
4038 }
4039 else if (do_utf8) {
4040 int m = ST.max - ST.min;
4041 for (ST.maxpos = locinput;
4042 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4043 ST.maxpos += UTF8SKIP(ST.maxpos);
4044 }
4045 else {
4046 ST.maxpos = locinput + ST.max - ST.min;
4047 if (ST.maxpos >= PL_regeol)
4048 ST.maxpos = PL_regeol - 1;
4049 }
4050 goto curly_try_B_min_known;
4051
4052 }
4053 else {
4054 ST.count = regrepeat(rex, ST.A, ST.max);
4055 locinput = PL_reginput;
4056 if (ST.count < ST.min)
4057 sayNO;
4058 if ((ST.count > ST.min)
4059 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4060 {
4061 /* A{m,n} must come at the end of the string, there's
4062 * no point in backing off ... */
4063 ST.min = ST.count;
4064 /* ...except that $ and \Z can match before *and* after
4065 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4066 We may back off by one in this case. */
4067 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4068 ST.min--;
4069 }
4070 REGCP_SET(ST.cp);
4071 goto curly_try_B_max;
4072 }
4073 /* NOTREACHED */
4074
4075
4076 case CURLY_B_min_known_fail:
4077 /* failed to find B in a non-greedy match where c1,c2 valid */
4078 if (ST.paren && ST.count)
4079 PL_regendp[ST.paren] = -1;
4080
4081 PL_reginput = locinput; /* Could be reset... */
4082 REGCP_UNWIND(ST.cp);
4083 /* Couldn't or didn't -- move forward. */
4084 ST.oldloc = locinput;
4085 if (do_utf8)
4086 locinput += UTF8SKIP(locinput);
4087 else
4088 locinput++;
4089 ST.count++;
4090 curly_try_B_min_known:
4091 /* find the next place where 'B' could work, then call B */
4092 {
4093 int n;
4094 if (do_utf8) {
4095 n = (ST.oldloc == locinput) ? 0 : 1;
4096 if (ST.c1 == ST.c2) {
4097 STRLEN len;
4098 /* set n to utf8_distance(oldloc, locinput) */
4099 while (locinput <= ST.maxpos &&
4100 utf8n_to_uvchr((U8*)locinput,
4101 UTF8_MAXBYTES, &len,
4102 uniflags) != (UV)ST.c1) {
4103 locinput += len;
4104 n++;
4105 }
1aa99e6b
IH
4106 }
4107 else {
c255a977
DM
4108 /* set n to utf8_distance(oldloc, locinput) */
4109 while (locinput <= ST.maxpos) {
4110 STRLEN len;
4111 const UV c = utf8n_to_uvchr((U8*)locinput,
4112 UTF8_MAXBYTES, &len,
4113 uniflags);
4114 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4115 break;
4116 locinput += len;
4117 n++;
1aa99e6b 4118 }
0fe9bf95
IZ
4119 }
4120 }
c255a977
DM
4121 else {
4122 if (ST.c1 == ST.c2) {
4123 while (locinput <= ST.maxpos &&
4124 UCHARAT(locinput) != ST.c1)
4125 locinput++;
bbce6d69 4126 }
c255a977
DM
4127 else {
4128 while (locinput <= ST.maxpos
4129 && UCHARAT(locinput) != ST.c1
4130 && UCHARAT(locinput) != ST.c2)
4131 locinput++;
a0ed51b3 4132 }
c255a977
DM
4133 n = locinput - ST.oldloc;
4134 }
4135 if (locinput > ST.maxpos)
4136 sayNO;
4137 /* PL_reginput == oldloc now */
4138 if (n) {
4139 ST.count += n;
4140 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4141 sayNO;
a0d0e21e 4142 }
c255a977
DM
4143 PL_reginput = locinput;
4144 CURLY_SETPAREN(ST.paren, ST.count);
4145 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4146 }
c255a977
DM
4147 /* NOTREACHED */
4148
4149
4150 case CURLY_B_min_fail:
4151 /* failed to find B in a non-greedy match where c1,c2 invalid */
4152 if (ST.paren && ST.count)
4153 PL_regendp[ST.paren] = -1;
4154
4155 REGCP_UNWIND(ST.cp);
4156 /* failed -- move forward one */
4157 PL_reginput = locinput;
4158 if (regrepeat(rex, ST.A, 1)) {
4159 ST.count++;
a0ed51b3 4160 locinput = PL_reginput;
c255a977
DM
4161 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4162 ST.count > 0)) /* count overflow ? */
15272685 4163 {
c255a977
DM
4164 curly_try_B_min:
4165 CURLY_SETPAREN(ST.paren, ST.count);
4166 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4167 }
4168 }
4633a7c4 4169 sayNO;
c255a977
DM
4170 /* NOTREACHED */
4171
4172
4173 curly_try_B_max:
4174 /* a successful greedy match: now try to match B */
4175 {
4176 UV c = 0;
4177 if (ST.c1 != CHRTEST_VOID)
4178 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4179 UTF8_MAXBYTES, 0, uniflags)
466787eb 4180 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4181 /* If it could work, try it. */
4182 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4183 CURLY_SETPAREN(ST.paren, ST.count);
4184 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4185 /* NOTREACHED */
4186 }
4187 }
4188 /* FALL THROUGH */
4189 case CURLY_B_max_fail:
4190 /* failed to find B in a greedy match */
4191 if (ST.paren && ST.count)
4192 PL_regendp[ST.paren] = -1;
4193
4194 REGCP_UNWIND(ST.cp);
4195 /* back up. */
4196 if (--ST.count < ST.min)
4197 sayNO;
4198 PL_reginput = locinput = HOPc(locinput, -1);
4199 goto curly_try_B_max;
4200
4201#undef ST
4202
4203
a0d0e21e 4204 case END:
3b0527fe 4205 if (locinput < reginfo->till) {
a3621e74 4206 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4207 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4208 PL_colors[4],
4209 (long)(locinput - PL_reg_starttry),
3b0527fe 4210 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4211 PL_colors[5]));
4212 sayNO_FINAL; /* Cannot match: too short. */
4213 }
4214 PL_reginput = locinput; /* put where regtry can find it */
4215 sayYES_FINAL; /* Success! */
dad79028
DM
4216
4217 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4218 DEBUG_EXECUTE_r(
4219 PerlIO_printf(Perl_debug_log,
4220 "%*s %ssubpattern success...%s\n",
4221 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4222 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4223 sayYES_FINAL; /* Success! */
4224
40a82448
DM
4225#undef ST
4226#define ST st->u.ifmatch
4227
4228 case SUSPEND: /* (?>A) */
4229 ST.wanted = 1;
9fe1d20c 4230 PL_reginput = locinput;
9041c2e3 4231 goto do_ifmatch;
dad79028 4232
40a82448
DM
4233 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4234 ST.wanted = 0;
dad79028
DM
4235 goto ifmatch_trivial_fail_test;
4236
40a82448
DM
4237 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4238 ST.wanted = 1;
dad79028 4239 ifmatch_trivial_fail_test:
a0ed51b3 4240 if (scan->flags) {
52657f30 4241 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4242 if (!s) {
4243 /* trivial fail */
4244 if (st->logical) {
4245 st->logical = 0;
40a82448 4246 st->sw = 1 - (bool)ST.wanted;
dad79028 4247 }
40a82448 4248 else if (ST.wanted)
dad79028
DM
4249 sayNO;
4250 next = scan + ARG(scan);
4251 if (next == scan)
4252 next = NULL;
4253 break;
4254 }
efb30f32 4255 PL_reginput = s;
a0ed51b3
LW
4256 }
4257 else
4258 PL_reginput = locinput;
4259
c277df42 4260 do_ifmatch:
40a82448
DM
4261 ST.me = scan;
4262 /* execute body of (?...A) */
4263 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4264 /* NOTREACHED */
4265
4266 case IFMATCH_A_fail: /* body of (?...A) failed */
4267 ST.wanted = !ST.wanted;
4268 /* FALL THROUGH */
4269
4270 case IFMATCH_A: /* body of (?...A) succeeded */
4271 if (st->logical) {
4272 st->logical = 0;
4273 st->sw = (bool)ST.wanted;
4274 }
4275 else if (!ST.wanted)
4276 sayNO;
4277
4278 if (OP(ST.me) == SUSPEND)
4279 locinput = PL_reginput;
4280 else {
4281 locinput = PL_reginput = st->locinput;
4282 nextchr = UCHARAT(locinput);
4283 }
4284 scan = ST.me + ARG(ST.me);
4285 if (scan == ST.me)
4286 scan = NULL;
4287 continue; /* execute B */
4288
4289#undef ST
dad79028 4290
c277df42 4291 case LONGJMP:
c277df42
IZ
4292 next = scan + ARG(scan);
4293 if (next == scan)
4294 next = NULL;
a0d0e21e
LW
4295 break;
4296 default:
b900a521 4297 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4298 PTR2UV(scan), OP(scan));
cea2e8a9 4299 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4300 }
95b24440 4301
a0d0e21e 4302 scan = next;
95b24440
DM
4303 continue;
4304 /* NOTREACHED */
4305
40a82448
DM
4306 push_yes_state:
4307 /* push a state that backtracks on success */
4308 st->u.yes.prev_yes_state = yes_state;
4309 yes_state = st;
4310 /* FALL THROUGH */
4311 push_state:
4312 /* push a new regex state, then continue at scan */
4313 {
4314 regmatch_state *newst;
4315
4316 depth++;
4317 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4318 "PUSH STATE(%d)\n", depth));
4319 st->locinput = locinput;
4320 newst = st+1;
4321 if (newst > SLAB_LAST(PL_regmatch_slab))
4322 newst = S_push_slab(aTHX);
4323 PL_regmatch_state = newst;
4324 newst->cc = st->cc;
4325 /* XXX probably don't need to initialise these */
4326 newst->minmod = 0;
4327 newst->sw = 0;
4328 newst->logical = 0;
4329
4330 locinput = PL_reginput;
4331 nextchr = UCHARAT(locinput);
4332 st = newst;
4333 continue;
4334 /* NOTREACHED */
4335 }
4336
95b24440
DM
4337 /* simulate recursively calling regmatch(), but without actually
4338 * recursing - ie save the current state on the heap rather than on
4339 * the stack, then re-enter the loop. This avoids complex regexes
4340 * blowing the processor stack */
4341
4342 start_recurse:
4343 {
5d9a96ca
DM
4344 /* push new state */
4345 regmatch_state *oldst = st;
4346
4347 depth++;
40a82448 4348 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4349
4350 /* grab the next free state slot */
4351 st++;
86545054 4352 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4353 st = S_push_slab(aTHX);
4354 PL_regmatch_state = st;
4355
4356 oldst->next = next;
4357 oldst->n = n;
4358 oldst->locinput = locinput;
5d9a96ca
DM
4359
4360 st->cc = oldst->cc;
95b24440
DM
4361 locinput = PL_reginput;
4362 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4363 st->minmod = 0;
4364 st->sw = 0;
4365 st->logical = 0;
95b24440
DM
4366#ifdef DEBUGGING
4367 PL_regindent++;
4368#endif
4369 }
a0d0e21e 4370 }
a687059c 4371
aa283a38
DM
4372
4373
a0d0e21e
LW
4374 /*
4375 * We get here only if there's trouble -- normally "case END" is
4376 * the terminating point.
4377 */
cea2e8a9 4378 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4379 /*NOTREACHED*/
4633a7c4
LW
4380 sayNO;
4381
7821416a 4382yes_final:
77cb431f
DM
4383
4384 if (yes_state) {
4385 /* we have successfully completed a subexpression, but we must now
4386 * pop to the state marked by yes_state and continue from there */
4387
4388 assert(st != yes_state);
4389 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4390 || yes_state > SLAB_LAST(PL_regmatch_slab))
4391 {
4392 /* not in this slab, pop slab */
4393 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4394 PL_regmatch_slab = PL_regmatch_slab->prev;
4395 st = SLAB_LAST(PL_regmatch_slab);
4396 }
4397 depth -= (st - yes_state);
40a82448
DM
4398 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4399 depth+1, depth+(st - yes_state)));
77cb431f
DM
4400 st = yes_state;
4401 yes_state = st->u.yes.prev_yes_state;
4402 PL_regmatch_state = st;
4403
4404 switch (st->resume_state) {
40a82448
DM
4405 case IFMATCH_A:
4406 case CURLYM_A:
40a82448
DM
4407 case EVAL_A:
4408 state_num = st->resume_state;
4409 goto reenter_switch;
4410
4411 case CURLYM_B:
166ba7cd
DM
4412 case BRANCH_next:
4413 case TRIE_next:
c255a977 4414 case CURLY_B_max:
77cb431f 4415 default:
40a82448 4416 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4417 }
4418 }
4419
a3621e74 4420 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4421 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4422yes:
4423#ifdef DEBUGGING
3280af22 4424 PL_regindent--;
4633a7c4 4425#endif
02db2b7b 4426
95b24440 4427 result = 1;
aa283a38 4428 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4429 * will disappear when REGFMATCH goes */
aa283a38
DM
4430 if (depth) {
4431 /* restore previous state and re-enter */
40a82448
DM
4432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4433 depth--;
4434 st--;
4435 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4436 PL_regmatch_slab = PL_regmatch_slab->prev;
4437 st = SLAB_LAST(PL_regmatch_slab);
4438 }
4439 PL_regmatch_state = st;
4440 scan = st->scan;
4441 next = st->next;
4442 n = st->n;
4443 locinput= st->locinput;
4444 nextchr = UCHARAT(locinput);
aa283a38
DM
4445
4446 switch (st->resume_state) {
aa283a38
DM
4447 case resume_CURLYX:
4448 goto resume_point_CURLYX;
4449 case resume_WHILEM1:
4450 goto resume_point_WHILEM1;
4451 case resume_WHILEM2:
4452 goto resume_point_WHILEM2;
4453 case resume_WHILEM3:
4454 goto resume_point_WHILEM3;
4455 case resume_WHILEM4:
4456 goto resume_point_WHILEM4;
4457 case resume_WHILEM5:
4458 goto resume_point_WHILEM5;
4459 case resume_WHILEM6:
4460 goto resume_point_WHILEM6;
77cb431f 4461
166ba7cd 4462 case TRIE_next:
40a82448
DM
4463 case CURLYM_A:
4464 case CURLYM_B:
4465 case EVAL_A:
4466 case IFMATCH_A:
4467 case BRANCH_next:
c255a977
DM
4468 case CURLY_B_max:
4469 case CURLY_B_min:
4470 case CURLY_B_min_known:
40a82448
DM
4471 break;
4472
aa283a38
DM
4473 default:
4474 Perl_croak(aTHX_ "regexp resume memory corruption");
4475 }
4476 }
4477 goto final_exit;
4633a7c4
LW
4478
4479no:
a3621e74 4480 DEBUG_EXECUTE_r(
7821416a
IZ
4481 PerlIO_printf(Perl_debug_log,
4482 "%*s %sfailed...%s\n",
e4584336 4483 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4484 );
7821416a
IZ
4485no_final:
4486do_no:
aa283a38 4487
4633a7c4 4488#ifdef DEBUGGING
3280af22 4489 PL_regindent--;
4633a7c4 4490#endif
95b24440 4491 result = 0;
5d9a96ca 4492
aa283a38
DM
4493 if (depth) {
4494 /* there's a previous state to backtrack to */
40a82448
DM
4495 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4496 depth--;
4497 st--;
4498 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4499 PL_regmatch_slab = PL_regmatch_slab->prev;
4500 st = SLAB_LAST(PL_regmatch_slab);
4501 }
4502 PL_regmatch_state = st;
4503 scan = st->scan;
4504 next = st->next;
4505 n = st->n;
4506 locinput= st->locinput;
4507 nextchr = UCHARAT(locinput);
4508
5d9a96ca 4509 switch (st->resume_state) {
95b24440
DM
4510 case resume_CURLYX:
4511 goto resume_point_CURLYX;
4512 case resume_WHILEM1:
4513 goto resume_point_WHILEM1;
4514 case resume_WHILEM2:
4515 goto resume_point_WHILEM2;
4516 case resume_WHILEM3:
4517 goto resume_point_WHILEM3;
4518 case resume_WHILEM4:
4519 goto resume_point_WHILEM4;
4520 case resume_WHILEM5:
4521 goto resume_point_WHILEM5;
4522 case resume_WHILEM6:
4523 goto resume_point_WHILEM6;
dad79028 4524
166ba7cd 4525 case TRIE_next:
40a82448
DM
4526 case EVAL_A:
4527 case BRANCH_next:
4528 case CURLYM_A:
4529 case CURLYM_B:
4530 case IFMATCH_A:
c255a977
DM
4531 case CURLY_B_max:
4532 case CURLY_B_min:
4533 case CURLY_B_min_known:
40a82448
DM
4534 if (yes_state == st)
4535 yes_state = st->u.yes.prev_yes_state;
4536 state_num = st->resume_state + 1; /* failure = success + 1 */
4537 goto reenter_switch;
dad79028 4538
95b24440
DM
4539 default:
4540 Perl_croak(aTHX_ "regexp resume memory corruption");
4541 }
95b24440 4542 }
aa283a38
DM
4543
4544final_exit:
4545
5d9a96ca
DM
4546 /* restore original high-water mark */
4547 PL_regmatch_slab = orig_slab;
4548 PL_regmatch_state = orig_state;
4549
4550 /* free all slabs above current one */
4551 if (orig_slab->next) {
c4fd8992 4552 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4553 orig_slab->next = NULL;
4554 while (sl) {
c4fd8992 4555 regmatch_slab * const osl = sl;
5d9a96ca 4556 sl = sl->next;
ad65c075 4557 Safefree(osl);
5d9a96ca
DM
4558 }
4559 }
4560
95b24440
DM
4561 return result;
4562
a687059c
LW
4563}
4564
4565/*
4566 - regrepeat - repeatedly match something simple, report how many
4567 */
4568/*
4569 * [This routine now assumes that it will only match on things of length 1.
4570 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4571 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4572 */
76e3520e 4573STATIC I32
32fc9b6a 4574S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4575{
27da23d5 4576 dVAR;
a0d0e21e 4577 register char *scan;
a0d0e21e 4578 register I32 c;
3280af22 4579 register char *loceol = PL_regeol;
a0ed51b3 4580 register I32 hardcount = 0;
53c4c00c 4581 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4582
3280af22 4583 scan = PL_reginput;
faf11cac
HS
4584 if (max == REG_INFTY)
4585 max = I32_MAX;
4586 else if (max < loceol - scan)
7f596f4c 4587 loceol = scan + max;
a0d0e21e 4588 switch (OP(p)) {
22c35a8c 4589 case REG_ANY:
1aa99e6b 4590 if (do_utf8) {
ffc61ed2 4591 loceol = PL_regeol;
1aa99e6b 4592 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4593 scan += UTF8SKIP(scan);
4594 hardcount++;
4595 }
4596 } else {
4597 while (scan < loceol && *scan != '\n')
4598 scan++;
a0ed51b3
LW
4599 }
4600 break;
ffc61ed2 4601 case SANY:
def8e4ea
JH
4602 if (do_utf8) {
4603 loceol = PL_regeol;
a0804c9e 4604 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4605 scan += UTF8SKIP(scan);
4606 hardcount++;
4607 }
4608 }
4609 else
4610 scan = loceol;
a0ed51b3 4611 break;
f33976b4
DB
4612 case CANY:
4613 scan = loceol;
4614 break;
090f7165
JH
4615 case EXACT: /* length of string is 1 */
4616 c = (U8)*STRING(p);
4617 while (scan < loceol && UCHARAT(scan) == c)
4618 scan++;
bbce6d69 4619 break;
4620 case EXACTF: /* length of string is 1 */
cd439c50 4621 c = (U8)*STRING(p);
bbce6d69 4622 while (scan < loceol &&
22c35a8c 4623 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4624 scan++;
4625 break;
4626 case EXACTFL: /* length of string is 1 */
3280af22 4627 PL_reg_flags |= RF_tainted;
cd439c50 4628 c = (U8)*STRING(p);
bbce6d69 4629 while (scan < loceol &&
22c35a8c 4630 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4631 scan++;
4632 break;
4633 case ANYOF:
ffc61ed2
JH
4634 if (do_utf8) {
4635 loceol = PL_regeol;
cfc92286 4636 while (hardcount < max && scan < loceol &&
32fc9b6a 4637 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4638 scan += UTF8SKIP(scan);
4639 hardcount++;
4640 }
4641 } else {
32fc9b6a 4642 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4643 scan++;
4644 }
a0d0e21e
LW
4645 break;
4646 case ALNUM:
1aa99e6b 4647 if (do_utf8) {
ffc61ed2 4648 loceol = PL_regeol;
1a4fad37 4649 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4650 while (hardcount < max && scan < loceol &&
3568d838 4651 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4652 scan += UTF8SKIP(scan);
4653 hardcount++;
4654 }
4655 } else {
4656 while (scan < loceol && isALNUM(*scan))
4657 scan++;
a0ed51b3
LW
4658 }
4659 break;
bbce6d69 4660 case ALNUML:
3280af22 4661 PL_reg_flags |= RF_tainted;
1aa99e6b 4662 if (do_utf8) {
ffc61ed2 4663 loceol = PL_regeol;
1aa99e6b
IH
4664 while (hardcount < max && scan < loceol &&
4665 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4666 scan += UTF8SKIP(scan);
4667 hardcount++;
4668 }
4669 } else {
4670 while (scan < loceol && isALNUM_LC(*scan))
4671 scan++;
a0ed51b3
LW
4672 }
4673 break;
a0d0e21e 4674 case NALNUM:
1aa99e6b 4675 if (do_utf8) {
ffc61ed2 4676 loceol = PL_regeol;
1a4fad37 4677 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4678 while (hardcount < max && scan < loceol &&
3568d838 4679 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4680 scan += UTF8SKIP(scan);
4681 hardcount++;
4682 }
4683 } else {
4684 while (scan < loceol && !isALNUM(*scan))
4685 scan++;
a0ed51b3
LW
4686 }
4687 break;
bbce6d69 4688 case NALNUML:
3280af22 4689 PL_reg_flags |= RF_tainted;
1aa99e6b 4690 if (do_utf8) {
ffc61ed2 4691 loceol = PL_regeol;
1aa99e6b
IH
4692 while (hardcount < max && scan < loceol &&
4693 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4694 scan += UTF8SKIP(scan);
4695 hardcount++;
4696 }
4697 } else {
4698 while (scan < loceol && !isALNUM_LC(*scan))
4699 scan++;
a0ed51b3
LW
4700 }
4701 break;
a0d0e21e 4702 case SPACE:
1aa99e6b 4703 if (do_utf8) {
ffc61ed2 4704 loceol = PL_regeol;
1a4fad37 4705 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4706 while (hardcount < max && scan < loceol &&
3568d838
JH
4707 (*scan == ' ' ||
4708 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4709 scan += UTF8SKIP(scan);
4710 hardcount++;
4711 }
4712 } else {
4713 while (scan < loceol && isSPACE(*scan))
4714 scan++;
a0ed51b3
LW
4715 }
4716 break;
bbce6d69 4717 case SPACEL:
3280af22 4718 PL_reg_flags |= RF_tainted;
1aa99e6b 4719 if (do_utf8) {
ffc61ed2 4720 loceol = PL_regeol;
1aa99e6b 4721 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4722 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4723 scan += UTF8SKIP(scan);
4724 hardcount++;
4725 }
4726 } else {
4727 while (scan < loceol && isSPACE_LC(*scan))
4728 scan++;
a0ed51b3
LW
4729 }
4730 break;
a0d0e21e 4731 case NSPACE:
1aa99e6b 4732 if (do_utf8) {
ffc61ed2 4733 loceol = PL_regeol;
1a4fad37 4734 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4735 while (hardcount < max && scan < loceol &&
3568d838
JH
4736 !(*scan == ' ' ||
4737 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4738 scan += UTF8SKIP(scan);
4739 hardcount++;
4740 }
4741 } else {
4742 while (scan < loceol && !isSPACE(*scan))
4743 scan++;
4744 break;
a0ed51b3 4745 }
bbce6d69 4746 case NSPACEL:
3280af22 4747 PL_reg_flags |= RF_tainted;
1aa99e6b 4748 if (do_utf8) {
ffc61ed2 4749 loceol = PL_regeol;
1aa99e6b 4750 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4751 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4752 scan += UTF8SKIP(scan);
4753 hardcount++;
4754 }
4755 } else {
4756 while (scan < loceol && !isSPACE_LC(*scan))
4757 scan++;
a0ed51b3
LW
4758 }
4759 break;
a0d0e21e 4760 case DIGIT:
1aa99e6b 4761 if (do_utf8) {
ffc61ed2 4762 loceol = PL_regeol;
1a4fad37 4763 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4764 while (hardcount < max && scan < loceol &&
3568d838 4765 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4766 scan += UTF8SKIP(scan);
4767 hardcount++;
4768 }
4769 } else {
4770 while (scan < loceol && isDIGIT(*scan))
4771 scan++;
a0ed51b3
LW
4772 }
4773 break;
a0d0e21e 4774 case NDIGIT:
1aa99e6b 4775 if (do_utf8) {
ffc61ed2 4776 loceol = PL_regeol;
1a4fad37 4777 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4778 while (hardcount < max && scan < loceol &&
3568d838 4779 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4780 scan += UTF8SKIP(scan);
4781 hardcount++;
4782 }
4783 } else {
4784 while (scan < loceol && !isDIGIT(*scan))
4785 scan++;
a0ed51b3
LW
4786 }
4787 break;
a0d0e21e
LW
4788 default: /* Called on something of 0 width. */
4789 break; /* So match right here or not at all. */
4790 }
a687059c 4791
a0ed51b3
LW
4792 if (hardcount)
4793 c = hardcount;
4794 else
4795 c = scan - PL_reginput;
3280af22 4796 PL_reginput = scan;
a687059c 4797
a3621e74 4798 DEBUG_r({
e68ec53f 4799 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4800 DEBUG_EXECUTE_r({
e68ec53f
YO
4801 SV * const prop = sv_newmortal();
4802 regprop(prog, prop, p);
4803 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4804 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4805 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4806 });
be8e71aa 4807 });
9041c2e3 4808
a0d0e21e 4809 return(c);
a687059c
LW
4810}
4811
c277df42 4812
be8e71aa 4813#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4814/*
ffc61ed2
JH
4815- regclass_swash - prepare the utf8 swash
4816*/
4817
4818SV *
32fc9b6a 4819Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4820{
97aff369 4821 dVAR;
9e55ce06
JH
4822 SV *sw = NULL;
4823 SV *si = NULL;
4824 SV *alt = NULL;
3dab1dad 4825 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4826
4f639d21 4827 if (data && data->count) {
a3b680e6 4828 const U32 n = ARG(node);
ffc61ed2 4829
4f639d21
DM
4830 if (data->what[n] == 's') {
4831 SV * const rv = (SV*)data->data[n];
890ce7af 4832 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4833 SV **const ary = AvARRAY(av);
9e55ce06 4834 SV **a, **b;
9041c2e3 4835
711a919c 4836 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4837 * documentation of these array elements. */
4838
b11f357e 4839 si = *ary;
8f7f7219 4840 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4841 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4842
ffc61ed2
JH
4843 if (a)
4844 sw = *a;
4845 else if (si && doinit) {
4846 sw = swash_init("utf8", "", si, 1, 0);
4847 (void)av_store(av, 1, sw);
4848 }
9e55ce06
JH
4849 if (b)
4850 alt = *b;
ffc61ed2
JH
4851 }
4852 }
4853
9e55ce06
JH
4854 if (listsvp)
4855 *listsvp = si;
4856 if (altsvp)
4857 *altsvp = alt;
ffc61ed2
JH
4858
4859 return sw;
4860}
76234dfb 4861#endif
ffc61ed2
JH
4862
4863/*
ba7b4546 4864 - reginclass - determine if a character falls into a character class
832705d4
JH
4865
4866 The n is the ANYOF regnode, the p is the target string, lenp
4867 is pointer to the maximum length of how far to go in the p
4868 (if the lenp is zero, UTF8SKIP(p) is used),
4869 do_utf8 tells whether the target string is in UTF-8.
4870
bbce6d69 4871 */
4872
76e3520e 4873STATIC bool
32fc9b6a 4874S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4875{
27da23d5 4876 dVAR;
a3b680e6 4877 const char flags = ANYOF_FLAGS(n);
bbce6d69 4878 bool match = FALSE;
cc07378b 4879 UV c = *p;
ae9ddab8 4880 STRLEN len = 0;
9e55ce06 4881 STRLEN plen;
1aa99e6b 4882
19f67299
TS
4883 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4884 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4885 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4886 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
4887 if (len == (STRLEN)-1)
4888 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4889 }
bbce6d69 4890
0f0076b4 4891 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4892 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4893 if (lenp)
4894 *lenp = 0;
ffc61ed2 4895 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4896 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4897 match = TRUE;
bbce6d69 4898 }
3568d838 4899 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4900 match = TRUE;
ffc61ed2 4901 if (!match) {
9e55ce06 4902 AV *av;
32fc9b6a 4903 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4904
4905 if (sw) {
3568d838 4906 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4907 match = TRUE;
4908 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4909 if (!match && lenp && av) {
4910 I32 i;
9e55ce06 4911 for (i = 0; i <= av_len(av); i++) {
890ce7af 4912 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4913 STRLEN len;
890ce7af 4914 const char * const s = SvPV_const(sv, len);
9e55ce06 4915
061b10df 4916 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4917 *lenp = len;
4918 match = TRUE;
4919 break;
4920 }
4921 }
4922 }
4923 if (!match) {
89ebb4a3 4924 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4925 STRLEN tmplen;
4926
9e55ce06
JH
4927 to_utf8_fold(p, tmpbuf, &tmplen);
4928 if (swash_fetch(sw, tmpbuf, do_utf8))
4929 match = TRUE;
4930 }
ffc61ed2
JH
4931 }
4932 }
bbce6d69 4933 }
9e55ce06 4934 if (match && lenp && *lenp == 0)
0f0076b4 4935 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4936 }
1aa99e6b 4937 if (!match && c < 256) {
ffc61ed2
JH
4938 if (ANYOF_BITMAP_TEST(n, c))
4939 match = TRUE;
4940 else if (flags & ANYOF_FOLD) {
eb160463 4941 U8 f;
a0ed51b3 4942
ffc61ed2
JH
4943 if (flags & ANYOF_LOCALE) {
4944 PL_reg_flags |= RF_tainted;
4945 f = PL_fold_locale[c];
4946 }
4947 else
4948 f = PL_fold[c];
4949 if (f != c && ANYOF_BITMAP_TEST(n, f))
4950 match = TRUE;
4951 }
4952
4953 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4954 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4955 if (
4956 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4957 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4958 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4959 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4960 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4961 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4962 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4963 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4964 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4965 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4966 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4967 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4968 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4969 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4970 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4971 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4972 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4973 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4974 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4975 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4976 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4977 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4978 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4979 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4980 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4981 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4982 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4983 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4984 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4985 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4986 ) /* How's that for a conditional? */
4987 {
4988 match = TRUE;
4989 }
a0ed51b3 4990 }
a0ed51b3
LW
4991 }
4992
a0ed51b3
LW
4993 return (flags & ANYOF_INVERT) ? !match : match;
4994}
161b471a 4995
dfe13c55 4996STATIC U8 *
0ce71af7 4997S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 4998{
97aff369 4999 dVAR;
a0ed51b3 5000 if (off >= 0) {
1aa99e6b 5001 while (off-- && s < lim) {
ffc61ed2 5002 /* XXX could check well-formedness here */
a0ed51b3 5003 s += UTF8SKIP(s);
ffc61ed2 5004 }
a0ed51b3
LW
5005 }
5006 else {
5007 while (off++) {
1aa99e6b 5008 if (s > lim) {
a0ed51b3 5009 s--;
ffc61ed2 5010 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5011 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5012 s--;
ffc61ed2
JH
5013 }
5014 /* XXX could check well-formedness here */
a0ed51b3
LW
5015 }
5016 }
5017 }
5018 return s;
5019}
161b471a 5020
dfe13c55 5021STATIC U8 *
0ce71af7 5022S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5023{
97aff369 5024 dVAR;
a0ed51b3 5025 if (off >= 0) {
1aa99e6b 5026 while (off-- && s < lim) {
ffc61ed2 5027 /* XXX could check well-formedness here */
a0ed51b3 5028 s += UTF8SKIP(s);
ffc61ed2 5029 }
a0ed51b3 5030 if (off >= 0)
3dab1dad 5031 return NULL;
a0ed51b3
LW
5032 }
5033 else {
5034 while (off++) {
1aa99e6b 5035 if (s > lim) {
a0ed51b3 5036 s--;
ffc61ed2 5037 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5038 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5039 s--;
ffc61ed2
JH
5040 }
5041 /* XXX could check well-formedness here */
a0ed51b3
LW
5042 }
5043 else
5044 break;
5045 }
5046 if (off <= 0)
3dab1dad 5047 return NULL;
a0ed51b3
LW
5048 }
5049 return s;
5050}
51371543 5051
51371543 5052static void
acfe0abc 5053restore_pos(pTHX_ void *arg)
51371543 5054{
97aff369 5055 dVAR;
097eb12c 5056 regexp * const rex = (regexp *)arg;
51371543
GS
5057 if (PL_reg_eval_set) {
5058 if (PL_reg_oldsaved) {
4f639d21
DM
5059 rex->subbeg = PL_reg_oldsaved;
5060 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5061#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5062 rex->saved_copy = PL_nrs;
ed252734 5063#endif
4f639d21 5064 RX_MATCH_COPIED_on(rex);
51371543
GS
5065 }
5066 PL_reg_magic->mg_len = PL_reg_oldpos;
5067 PL_reg_eval_set = 0;
5068 PL_curpm = PL_reg_oldcurpm;
5069 }
5070}
33b8afdf
JH
5071
5072STATIC void
5073S_to_utf8_substr(pTHX_ register regexp *prog)
5074{
33b8afdf 5075 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5076 SV* const sv = newSVsv(prog->float_substr);
5077 prog->float_utf8 = sv;
33b8afdf
JH
5078 sv_utf8_upgrade(sv);
5079 if (SvTAIL(prog->float_substr))
5080 SvTAIL_on(sv);
5081 if (prog->float_substr == prog->check_substr)
5082 prog->check_utf8 = sv;
5083 }
5084 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5085 SV* const sv = newSVsv(prog->anchored_substr);
5086 prog->anchored_utf8 = sv;
33b8afdf
JH
5087 sv_utf8_upgrade(sv);
5088 if (SvTAIL(prog->anchored_substr))
5089 SvTAIL_on(sv);
5090 if (prog->anchored_substr == prog->check_substr)
5091 prog->check_utf8 = sv;
5092 }
5093}
5094
5095STATIC void
5096S_to_byte_substr(pTHX_ register regexp *prog)
5097{
97aff369 5098 dVAR;
33b8afdf 5099 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5100 SV* sv = newSVsv(prog->float_utf8);
5101 prog->float_substr = sv;
33b8afdf
JH
5102 if (sv_utf8_downgrade(sv, TRUE)) {
5103 if (SvTAIL(prog->float_utf8))
5104 SvTAIL_on(sv);
5105 } else {
5106 SvREFCNT_dec(sv);
5107 prog->float_substr = sv = &PL_sv_undef;
5108 }
5109 if (prog->float_utf8 == prog->check_utf8)
5110 prog->check_substr = sv;
5111 }
5112 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5113 SV* sv = newSVsv(prog->anchored_utf8);
5114 prog->anchored_substr = sv;
33b8afdf
JH
5115 if (sv_utf8_downgrade(sv, TRUE)) {
5116 if (SvTAIL(prog->anchored_utf8))
5117 SvTAIL_on(sv);
5118 } else {
5119 SvREFCNT_dec(sv);
5120 prog->anchored_substr = sv = &PL_sv_undef;
5121 }
5122 if (prog->anchored_utf8 == prog->check_utf8)
5123 prog->check_substr = sv;
5124 }
5125}
66610fdd
RGS
5126
5127/*
5128 * Local variables:
5129 * c-indentation-style: bsd
5130 * c-basic-offset: 4
5131 * indent-tabs-mode: t
5132 * End:
5133 *
37442d52
RGS
5134 * ex: set ts=8 sts=4 sw=4 noet:
5135 */