This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
-DPERL_GLOBAL_STRUCT_PRIVATE tweaks (done_sanity_check global? yech!)
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
81#define RF_utf8 8 /* String contains multibyte chars? */
82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
a3621e74 198# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 201
a3621e74 202# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 203 PerlIO_printf(Perl_debug_log, \
faccc32b 204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 206
76e3520e 207STATIC char *
097eb12c 208S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 209{
97aff369 210 dVAR;
b1ce53c5 211 I32 i;
a0d0e21e 212 char *input;
b1ce53c5 213
a3621e74
YO
214 GET_RE_DEBUG_FLAGS_DECL;
215
b1ce53c5 216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 217 i = SSPOPINT;
b1ce53c5
JH
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 220 input = (char *) SSPOPPTR;
a01268b5 221 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
b1ce53c5
JH
224
225 /* Now restore the parentheses context. */
41123dfd
JH
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 228 I32 tmps;
097eb12c 229 U32 paren = (U32)SSPOPINT;
3280af22 230 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
231 PL_regstartp[paren] = SSPOPINT;
232 tmps = SSPOPINT;
3280af22
NIS
233 if (paren <= *PL_reglastparen)
234 PL_regendp[paren] = tmps;
a3621e74 235 DEBUG_EXECUTE_r(
c3464db5 236 PerlIO_printf(Perl_debug_log,
b900a521 237 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 238 (UV)paren, (IV)PL_regstartp[paren],
b900a521 239 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 240 (IV)PL_regendp[paren],
3280af22 241 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 242 );
a0d0e21e 243 }
a3621e74 244 DEBUG_EXECUTE_r(
bb7a0f54 245 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 246 PerlIO_printf(Perl_debug_log,
faccc32b 247 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 248 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
249 }
250 );
daf18116 251#if 1
dafc8851
JH
252 /* It would seem that the similar code in regtry()
253 * already takes care of this, and in fact it is in
254 * a better location to since this code can #if 0-ed out
255 * but the code in regtry() is needed or otherwise tests
256 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
257 * (as of patchlevel 7877) will fail. Then again,
258 * this code seems to be necessary or otherwise
259 * building DynaLoader will fail:
260 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
261 * --jhi */
bb7a0f54 262 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
263 if (i > PL_regsize)
264 PL_regstartp[i] = -1;
265 PL_regendp[i] = -1;
a0d0e21e 266 }
dafc8851 267#endif
a0d0e21e
LW
268 return input;
269}
270
02db2b7b 271#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 272
a687059c 273/*
e50aee73 274 * pregexec and friends
a687059c
LW
275 */
276
76234dfb 277#ifndef PERL_IN_XSUB_RE
a687059c 278/*
c277df42 279 - pregexec - match a regexp against a string
a687059c 280 */
c277df42 281I32
864dbfa3 282Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 283 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
284/* strend: pointer to null at end of string */
285/* strbeg: real beginning of string */
286/* minend: end of match must be >=minend after stringarg. */
287/* nosave: For optimizations. */
288{
289 return
9041c2e3 290 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
291 nosave ? 0 : REXEC_COPY_STR);
292}
76234dfb 293#endif
22e551b9 294
9041c2e3 295/*
cad2e5aa
JH
296 * Need to implement the following flags for reg_anch:
297 *
298 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
299 * USE_INTUIT_ML
300 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
301 * INTUIT_AUTORITATIVE_ML
302 * INTUIT_ONCE_NOML - Intuit can match in one location only.
303 * INTUIT_ONCE_ML
304 *
305 * Another flag for this function: SECOND_TIME (so that float substrs
306 * with giant delta may be not rechecked).
307 */
308
309/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
310
3f7c398e 311/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
312 Otherwise, only SvCUR(sv) is used to get strbeg. */
313
314/* XXXX We assume that strpos is strbeg unless sv. */
315
6eb5f6b9
JH
316/* XXXX Some places assume that there is a fixed substring.
317 An update may be needed if optimizer marks as "INTUITable"
318 RExen without fixed substrings. Similarly, it is assumed that
319 lengths of all the strings are no more than minlen, thus they
320 cannot come from lookahead.
321 (Or minlen should take into account lookahead.) */
322
2c2d71f5
JH
323/* A failure to find a constant substring means that there is no need to make
324 an expensive call to REx engine, thus we celebrate a failure. Similarly,
325 finding a substring too deep into the string means that less calls to
30944b6d
IZ
326 regtry() should be needed.
327
328 REx compiler's optimizer found 4 possible hints:
329 a) Anchored substring;
330 b) Fixed substring;
331 c) Whether we are anchored (beginning-of-line or \G);
332 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 333 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
334 string which does not contradict any of them.
335 */
2c2d71f5 336
6eb5f6b9
JH
337/* Most of decisions we do here should have been done at compile time.
338 The nodes of the REx which we used for the search should have been
339 deleted from the finite automaton. */
340
cad2e5aa
JH
341char *
342Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
343 char *strend, U32 flags, re_scream_pos_data *data)
344{
97aff369 345 dVAR;
b7953727 346 register I32 start_shift = 0;
cad2e5aa 347 /* Should be nonnegative! */
b7953727 348 register I32 end_shift = 0;
2c2d71f5
JH
349 register char *s;
350 register SV *check;
a1933d95 351 char *strbeg;
cad2e5aa 352 char *t;
a3b680e6 353 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 354 I32 ml_anch;
bd61b366
SS
355 register char *other_last = NULL; /* other substr checked before this */
356 char *check_at = NULL; /* check substr found at this pos */
1df70142 357 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 358#ifdef DEBUGGING
890ce7af 359 const char * const i_strpos = strpos;
30944b6d 360#endif
a3621e74
YO
361
362 GET_RE_DEBUG_FLAGS_DECL;
363
a30b2f1f 364 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 365
b8d68ded 366 if (prog->reganch & ROPT_UTF8) {
a3621e74 367 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
368 "UTF-8 regex...\n"));
369 PL_reg_flags |= RF_utf8;
370 }
371
a3621e74 372 DEBUG_EXECUTE_r({
0df25f3d
YO
373 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
374 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
375
2a782b5b
JH
376 if (!PL_colorset)
377 reginitcolors();
b8d68ded 378 if (PL_reg_match_utf8)
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 380 "UTF-8 target...\n"));
2a782b5b 381 PerlIO_printf(Perl_debug_log,
a0288114 382 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 383 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
384 prog->precomp,
385 PL_colors[1],
386 (strlen(prog->precomp) > 60 ? "..." : ""),
387 PL_colors[0],
388 (int)(len > 60 ? 60 : len),
389 s, PL_colors[1],
390 (len > 60 ? "..." : "")
391 );
392 });
cad2e5aa 393
c344f387
JH
394 /* CHR_DIST() would be more correct here but it makes things slow. */
395 if (prog->minlen > strend - strpos) {
a3621e74 396 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 397 "String too short... [re_intuit_start]\n"));
cad2e5aa 398 goto fail;
2c2d71f5 399 }
a1933d95 400 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 401 PL_regeol = strend;
33b8afdf
JH
402 if (do_utf8) {
403 if (!prog->check_utf8 && prog->check_substr)
404 to_utf8_substr(prog);
405 check = prog->check_utf8;
406 } else {
407 if (!prog->check_substr && prog->check_utf8)
408 to_byte_substr(prog);
409 check = prog->check_substr;
410 }
411 if (check == &PL_sv_undef) {
a3621e74 412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
413 "Non-utf string cannot match utf check string\n"));
414 goto fail;
415 }
2c2d71f5 416 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
417 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
418 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 419 && !multiline ) ); /* Check after \n? */
cad2e5aa 420
7e25d62c
JH
421 if (!ml_anch) {
422 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
423 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 424 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
425 && sv && !SvROK(sv)
426 && (strpos != strbeg)) {
a3621e74 427 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
428 goto fail;
429 }
430 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 431 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 432 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
433 I32 slen;
434
1aa99e6b 435 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
436 if (SvTAIL(check)) {
437 slen = SvCUR(check); /* >= 1 */
cad2e5aa 438
9041c2e3 439 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 440 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 442 goto fail_finish;
cad2e5aa
JH
443 }
444 /* Now should match s[0..slen-2] */
445 slen--;
3f7c398e 446 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 447 || (slen > 1
3f7c398e 448 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 449 report_neq:
a3621e74 450 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
451 goto fail_finish;
452 }
cad2e5aa 453 }
3f7c398e 454 else if (*SvPVX_const(check) != *s
653099ff 455 || ((slen = SvCUR(check)) > 1
3f7c398e 456 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 457 goto report_neq;
c315bfe8 458 check_at = s;
2c2d71f5 459 goto success_at_start;
7e25d62c 460 }
cad2e5aa 461 }
2c2d71f5 462 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 463 s = strpos;
2c2d71f5 464 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 465 end_shift = prog->minlen - start_shift -
653099ff 466 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 467 if (!ml_anch) {
a3b680e6 468 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 469 - (SvTAIL(check) != 0);
a3b680e6 470 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
471
472 if (end_shift < eshift)
473 end_shift = eshift;
474 }
cad2e5aa 475 }
2c2d71f5 476 else { /* Can match at random position */
cad2e5aa
JH
477 ml_anch = 0;
478 s = strpos;
2c2d71f5
JH
479 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
480 /* Should be nonnegative! */
481 end_shift = prog->minlen - start_shift -
653099ff 482 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
483 }
484
2c2d71f5 485#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 486 if (end_shift < 0)
6bbae5e6 487 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
488#endif
489
2c2d71f5
JH
490 restart:
491 /* Find a possible match in the region s..strend by looking for
492 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 493 if (flags & REXEC_SCREAM) {
cad2e5aa 494 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 495 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 496
2c2d71f5
JH
497 if (PL_screamfirst[BmRARE(check)] >= 0
498 || ( BmRARE(check) == '\n'
499 && (BmPREVIOUS(check) == SvCUR(check) - 1)
500 && SvTAIL(check) ))
9041c2e3 501 s = screaminstr(sv, check,
2c2d71f5 502 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 503 else
2c2d71f5 504 goto fail_finish;
4addbd3b
HS
505 /* we may be pointing at the wrong string */
506 if (s && RX_MATCH_COPIED(prog))
3f7c398e 507 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
508 if (data)
509 *data->scream_olds = s;
510 }
f33976b4 511 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
512 s = fbm_instr((U8*)(s + start_shift),
513 (U8*)(strend - end_shift),
7fba1cd6 514 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 515 else
1aa99e6b
IH
516 s = fbm_instr(HOP3(s, start_shift, strend),
517 HOP3(strend, -end_shift, strbeg),
7fba1cd6 518 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
519
520 /* Update the count-of-usability, remove useless subpatterns,
521 unshift s. */
2c2d71f5 522
a0288114 523 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 524 (s ? "Found" : "Did not find"),
33b8afdf 525 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 526 PL_colors[0],
7b0972df 527 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 528 SvPVX_const(check),
2c2d71f5
JH
529 PL_colors[1], (SvTAIL(check) ? "$" : ""),
530 (s ? " at offset " : "...\n") ) );
531
532 if (!s)
533 goto fail_finish;
534
6eb5f6b9
JH
535 check_at = s;
536
2c2d71f5 537 /* Finish the diagnostic message */
a3621e74 538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
539
540 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
541 Start with the other substr.
542 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 543 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
544 *always* match. Probably should be marked during compile...
545 Probably it is right to do no SCREAM here...
546 */
547
33b8afdf 548 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 549 /* Take into account the "other" substring. */
2c2d71f5
JH
550 /* XXXX May be hopelessly wrong for UTF... */
551 if (!other_last)
6eb5f6b9 552 other_last = strpos;
33b8afdf 553 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
554 do_other_anchored:
555 {
890ce7af
AL
556 char * const last = HOP3c(s, -start_shift, strbeg);
557 char *last1, *last2;
be8e71aa 558 char * const saved_s = s;
33b8afdf 559 SV* must;
2c2d71f5 560
2c2d71f5
JH
561 t = s - prog->check_offset_max;
562 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 563 && (!do_utf8
0ce71af7 564 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 565 && t > strpos)))
6f207bd3 566 NOOP;
2c2d71f5
JH
567 else
568 t = strpos;
1aa99e6b 569 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
570 if (t < other_last) /* These positions already checked */
571 t = other_last;
1aa99e6b 572 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
573 if (last < last1)
574 last1 = last;
575 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
576 /* On end-of-str: see comment below. */
33b8afdf
JH
577 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
578 if (must == &PL_sv_undef) {
579 s = (char*)NULL;
a3621e74 580 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
581 }
582 else
583 s = fbm_instr(
584 (unsigned char*)t,
585 HOP3(HOP3(last1, prog->anchored_offset, strend)
586 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
587 must,
7fba1cd6 588 multiline ? FBMrf_MULTILINE : 0
33b8afdf 589 );
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 591 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
592 (s ? "Found" : "Contradicts"),
593 PL_colors[0],
33b8afdf
JH
594 (int)(SvCUR(must)
595 - (SvTAIL(must)!=0)),
3f7c398e 596 SvPVX_const(must),
33b8afdf 597 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
598 if (!s) {
599 if (last1 >= last2) {
a3621e74 600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
601 ", giving up...\n"));
602 goto fail_finish;
603 }
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 605 ", trying floating at offset %ld...\n",
be8e71aa 606 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
607 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
608 s = HOP3c(last, 1, strend);
2c2d71f5
JH
609 goto restart;
610 }
611 else {
a3621e74 612 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 613 (long)(s - i_strpos)));
1aa99e6b
IH
614 t = HOP3c(s, -prog->anchored_offset, strbeg);
615 other_last = HOP3c(s, 1, strend);
be8e71aa 616 s = saved_s;
2c2d71f5
JH
617 if (t == strpos)
618 goto try_at_start;
2c2d71f5
JH
619 goto try_at_offset;
620 }
30944b6d 621 }
2c2d71f5
JH
622 }
623 else { /* Take into account the floating substring. */
33b8afdf 624 char *last, *last1;
be8e71aa 625 char * const saved_s = s;
33b8afdf
JH
626 SV* must;
627
628 t = HOP3c(s, -start_shift, strbeg);
629 last1 = last =
630 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
631 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
632 last = HOP3c(t, prog->float_max_offset, strend);
633 s = HOP3c(t, prog->float_min_offset, strend);
634 if (s < other_last)
635 s = other_last;
2c2d71f5 636 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
637 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
638 /* fbm_instr() takes into account exact value of end-of-str
639 if the check is SvTAIL(ed). Since false positives are OK,
640 and end-of-str is not later than strend we are OK. */
641 if (must == &PL_sv_undef) {
642 s = (char*)NULL;
a3621e74 643 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
644 }
645 else
2c2d71f5 646 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
647 (unsigned char*)last + SvCUR(must)
648 - (SvTAIL(must)!=0),
7fba1cd6 649 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
651 (s ? "Found" : "Contradicts"),
652 PL_colors[0],
653 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 654 SvPVX_const(must),
33b8afdf
JH
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
656 if (!s) {
657 if (last1 == last) {
a3621e74 658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
659 ", giving up...\n"));
660 goto fail_finish;
2c2d71f5 661 }
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 663 ", trying anchored starting at offset %ld...\n",
be8e71aa 664 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
665 other_last = last;
666 s = HOP3c(t, 1, strend);
667 goto restart;
668 }
669 else {
a3621e74 670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
671 (long)(s - i_strpos)));
672 other_last = s; /* Fix this later. --Hugo */
be8e71aa 673 s = saved_s;
33b8afdf
JH
674 if (t == strpos)
675 goto try_at_start;
676 goto try_at_offset;
677 }
2c2d71f5 678 }
cad2e5aa 679 }
2c2d71f5
JH
680
681 t = s - prog->check_offset_max;
2c2d71f5 682 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 683 && (!do_utf8
0ce71af7 684 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 685 && t > strpos))) {
2c2d71f5
JH
686 /* Fixed substring is found far enough so that the match
687 cannot start at strpos. */
688 try_at_offset:
cad2e5aa 689 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
690 /* Eventually fbm_*() should handle this, but often
691 anchored_offset is not 0, so this check will not be wasted. */
692 /* XXXX In the code below we prefer to look for "^" even in
693 presence of anchored substrings. And we search even
694 beyond the found float position. These pessimizations
695 are historical artefacts only. */
696 find_anchor:
2c2d71f5 697 while (t < strend - prog->minlen) {
cad2e5aa 698 if (*t == '\n') {
4ee3650e 699 if (t < check_at - prog->check_offset_min) {
33b8afdf 700 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
701 /* Since we moved from the found position,
702 we definitely contradict the found anchored
30944b6d
IZ
703 substr. Due to the above check we do not
704 contradict "check" substr.
705 Thus we can arrive here only if check substr
706 is float. Redo checking for "other"=="fixed".
707 */
9041c2e3 708 strpos = t + 1;
a3621e74 709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 710 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
711 goto do_other_anchored;
712 }
4ee3650e
GS
713 /* We don't contradict the found floating substring. */
714 /* XXXX Why not check for STCLASS? */
cad2e5aa 715 s = t + 1;
a3621e74 716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 717 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
718 goto set_useful;
719 }
4ee3650e
GS
720 /* Position contradicts check-string */
721 /* XXXX probably better to look for check-string
722 than for "\n", so one should lower the limit for t? */
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 725 other_last = strpos = s = t + 1;
cad2e5aa
JH
726 goto restart;
727 }
728 t++;
729 }
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 731 PL_colors[0], PL_colors[1]));
2c2d71f5 732 goto fail_finish;
cad2e5aa 733 }
f5952150 734 else {
a3621e74 735 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 736 PL_colors[0], PL_colors[1]));
f5952150 737 }
cad2e5aa
JH
738 s = t;
739 set_useful:
33b8afdf 740 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
741 }
742 else {
f5952150 743 /* The found string does not prohibit matching at strpos,
2c2d71f5 744 - no optimization of calling REx engine can be performed,
f5952150
GS
745 unless it was an MBOL and we are not after MBOL,
746 or a future STCLASS check will fail this. */
2c2d71f5
JH
747 try_at_start:
748 /* Even in this situation we may use MBOL flag if strpos is offset
749 wrt the start of the string. */
05b4157f 750 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 751 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
752 /* May be due to an implicit anchor of m{.*foo} */
753 && !(prog->reganch & ROPT_IMPLICIT))
754 {
cad2e5aa
JH
755 t = strpos;
756 goto find_anchor;
757 }
a3621e74 758 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 759 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 760 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 761 );
2c2d71f5 762 success_at_start:
30944b6d 763 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
764 && (do_utf8 ? (
765 prog->check_utf8 /* Could be deleted already */
766 && --BmUSEFUL(prog->check_utf8) < 0
767 && (prog->check_utf8 == prog->float_utf8)
768 ) : (
769 prog->check_substr /* Could be deleted already */
770 && --BmUSEFUL(prog->check_substr) < 0
771 && (prog->check_substr == prog->float_substr)
772 )))
66e933ab 773 {
cad2e5aa 774 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 775 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
776 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
777 if (do_utf8 ? prog->check_substr : prog->check_utf8)
778 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
779 prog->check_substr = prog->check_utf8 = NULL; /* disable */
780 prog->float_substr = prog->float_utf8 = NULL; /* clear */
781 check = NULL; /* abort */
cad2e5aa 782 s = strpos;
3cf5c195
IZ
783 /* XXXX This is a remnant of the old implementation. It
784 looks wasteful, since now INTUIT can use many
6eb5f6b9 785 other heuristics. */
cad2e5aa
JH
786 prog->reganch &= ~RE_USE_INTUIT;
787 }
788 else
789 s = strpos;
790 }
791
6eb5f6b9
JH
792 /* Last resort... */
793 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 794 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
795 /* minlen == 0 is possible if regstclass is \b or \B,
796 and the fixed substr is ''$.
797 Since minlen is already taken into account, s+1 is before strend;
798 accidentally, minlen >= 1 guaranties no false positives at s + 1
799 even for \b or \B. But (minlen? 1 : 0) below assumes that
800 regstclass does not come from lookahead... */
801 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
802 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 803 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 804 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 805 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 806 : 1);
07be1b83 807 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 808 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 809 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
810 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
811 cl_l, strend)
812 : strend);
07be1b83
YO
813 /*if (OP(prog->regstclass) == TRIE)
814 endpos++;*/
6eb5f6b9 815 t = s;
3b0527fe 816 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
817 if (!s) {
818#ifdef DEBUGGING
cbbf8932 819 const char *what = NULL;
6eb5f6b9
JH
820#endif
821 if (endpos == strend) {
a3621e74 822 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
823 "Could not match STCLASS...\n") );
824 goto fail;
825 }
a3621e74 826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 827 "This position contradicts STCLASS...\n") );
653099ff
GS
828 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
829 goto fail;
6eb5f6b9 830 /* Contradict one of substrings */
33b8afdf
JH
831 if (prog->anchored_substr || prog->anchored_utf8) {
832 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 833 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 834 hop_and_restart:
1aa99e6b 835 s = HOP3c(t, 1, strend);
66e933ab
GS
836 if (s + start_shift + end_shift > strend) {
837 /* XXXX Should be taken into account earlier? */
a3621e74 838 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
839 "Could not match STCLASS...\n") );
840 goto fail;
841 }
5e39e1e5
HS
842 if (!check)
843 goto giveup;
a3621e74 844 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 845 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
846 what, (long)(s + start_shift - i_strpos)) );
847 goto restart;
848 }
66e933ab 849 /* Have both, check_string is floating */
6eb5f6b9
JH
850 if (t + start_shift >= check_at) /* Contradicts floating=check */
851 goto retry_floating_check;
852 /* Recheck anchored substring, but not floating... */
9041c2e3 853 s = check_at;
5e39e1e5
HS
854 if (!check)
855 goto giveup;
a3621e74 856 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 857 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
858 (long)(other_last - i_strpos)) );
859 goto do_other_anchored;
860 }
60e71179
GS
861 /* Another way we could have checked stclass at the
862 current position only: */
863 if (ml_anch) {
864 s = t = t + 1;
5e39e1e5
HS
865 if (!check)
866 goto giveup;
a3621e74 867 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 868 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 869 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 870 goto try_at_offset;
66e933ab 871 }
33b8afdf 872 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 873 goto fail;
6eb5f6b9
JH
874 /* Check is floating subtring. */
875 retry_floating_check:
876 t = check_at - start_shift;
a3621e74 877 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
878 goto hop_and_restart;
879 }
b7953727 880 if (t != s) {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 882 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
883 (long)(t - i_strpos), (long)(s - i_strpos))
884 );
885 }
886 else {
a3621e74 887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
888 "Does not contradict STCLASS...\n");
889 );
890 }
6eb5f6b9 891 }
5e39e1e5 892 giveup:
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
894 PL_colors[4], (check ? "Guessed" : "Giving up"),
895 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 896 return s;
2c2d71f5
JH
897
898 fail_finish: /* Substring not found */
33b8afdf
JH
899 if (prog->check_substr || prog->check_utf8) /* could be removed already */
900 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 901 fail:
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 903 PL_colors[4], PL_colors[5]));
bd61b366 904 return NULL;
cad2e5aa 905}
9661b544 906
6eb5f6b9 907/* We know what class REx starts with. Try to find this position... */
3b0527fe 908/* if reginfo is NULL, its a dryrun */
07be1b83
YO
909/* annoyingly all the vars in this routine have different names from their counterparts
910 in regmatch. /grrr */
3b0527fe 911
4cadc6a9
YO
912#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
913foldlen, foldbuf, uniflags) STMT_START { \
914 switch (trie_type) { \
915 case trie_utf8_fold: \
916 if ( foldlen>0 ) { \
917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
918 foldlen -= len; \
919 uscan += len; \
920 len=0; \
921 } else { \
922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
923 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
924 foldlen -= UNISKIP( uvc ); \
925 uscan = foldbuf + UNISKIP( uvc ); \
926 } \
927 break; \
928 case trie_utf8: \
929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
930 break; \
931 case trie_plain: \
932 uvc = (UV)*uc; \
933 len = 1; \
934 } \
935 \
936 if (uvc < 256) { \
937 charid = trie->charmap[ uvc ]; \
938 } \
939 else { \
940 charid = 0; \
941 if (trie->widecharmap) { \
942 SV** const svpp = hv_fetch(trie->widecharmap, \
943 (char*)&uvc, sizeof(UV), 0); \
944 if (svpp) \
945 charid = (U16)SvIV(*svpp); \
946 } \
947 } \
948} STMT_END
949
950#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
951 if ( (CoNd) \
952 && (ln == len || \
953 ibcmp_utf8(s, NULL, 0, do_utf8, \
954 m, NULL, ln, (bool)UTF)) \
955 && (!reginfo || regtry(reginfo, s)) ) \
956 goto got_it; \
957 else { \
958 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
959 uvchr_to_utf8(tmpbuf, c); \
960 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
961 if ( f != c \
962 && (f == c1 || f == c2) \
963 && (ln == foldlen || \
964 !ibcmp_utf8((char *) foldbuf, \
965 NULL, foldlen, do_utf8, \
966 m, \
967 NULL, ln, (bool)UTF)) \
968 && (!reginfo || regtry(reginfo, s)) ) \
969 goto got_it; \
970 } \
971 s += len
972
973#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
974STMT_START { \
975 while (s <= e) { \
976 if ( (CoNd) \
977 && (ln == 1 || !(OP(c) == EXACTF \
978 ? ibcmp(s, m, ln) \
979 : ibcmp_locale(s, m, ln))) \
980 && (!reginfo || regtry(reginfo, s)) ) \
981 goto got_it; \
982 s++; \
983 } \
984} STMT_END
985
986#define REXEC_FBC_UTF8_SCAN(CoDe) \
987STMT_START { \
988 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
989 CoDe \
990 s += uskip; \
991 } \
992} STMT_END
993
994#define REXEC_FBC_SCAN(CoDe) \
995STMT_START { \
996 while (s < strend) { \
997 CoDe \
998 s++; \
999 } \
1000} STMT_END
1001
1002#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1003REXEC_FBC_UTF8_SCAN( \
1004 if (CoNd) { \
1005 if (tmp && (!reginfo || regtry(reginfo, s))) \
1006 goto got_it; \
1007 else \
1008 tmp = doevery; \
1009 } \
1010 else \
1011 tmp = 1; \
1012)
1013
1014#define REXEC_FBC_CLASS_SCAN(CoNd) \
1015REXEC_FBC_SCAN( \
1016 if (CoNd) { \
1017 if (tmp && (!reginfo || regtry(reginfo, s))) \
1018 goto got_it; \
1019 else \
1020 tmp = doevery; \
1021 } \
1022 else \
1023 tmp = 1; \
1024)
1025
1026#define REXEC_FBC_TRYIT \
1027if ((!reginfo || regtry(reginfo, s))) \
1028 goto got_it
1029
1030#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1031 if (do_utf8) { \
1032 UtFpReLoAd; \
1033 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1034 } \
1035 else { \
1036 REXEC_FBC_CLASS_SCAN(CoNd); \
1037 } \
1038 break
1039
1040#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1041 PL_reg_flags |= RF_tainted; \
1042 if (do_utf8) { \
1043 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1044 } \
1045 else { \
1046 REXEC_FBC_CLASS_SCAN(CoNd); \
1047 } \
1048 break
1049
3c3eec57 1050STATIC char *
07be1b83
YO
1051S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1052 const char *strend, const regmatch_info *reginfo)
a687059c 1053{
27da23d5 1054 dVAR;
1df70142 1055 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1056 char *m;
d8093b23 1057 STRLEN ln;
5dab1207 1058 STRLEN lnc;
078c425b 1059 register STRLEN uskip;
d8093b23
G
1060 unsigned int c1;
1061 unsigned int c2;
6eb5f6b9
JH
1062 char *e;
1063 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1064 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1065
6eb5f6b9
JH
1066 /* We know what class it must start with. */
1067 switch (OP(c)) {
6eb5f6b9 1068 case ANYOF:
388cc4de 1069 if (do_utf8) {
4cadc6a9 1070 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1071 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1072 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1073 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1074 }
1075 else {
1076 while (s < strend) {
1077 STRLEN skip = 1;
1078
32fc9b6a 1079 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1080 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1081 /* The assignment of 2 is intentional:
1082 * for the folded sharp s, the skip is 2. */
1083 (skip = SHARP_S_SKIP))) {
3b0527fe 1084 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1085 goto got_it;
1086 else
1087 tmp = doevery;
1088 }
1089 else
1090 tmp = 1;
1091 s += skip;
1092 }
a0d0e21e 1093 }
6eb5f6b9 1094 break;
f33976b4 1095 case CANY:
4cadc6a9 1096 REXEC_FBC_SCAN(
3b0527fe 1097 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1098 goto got_it;
1099 else
1100 tmp = doevery;
4cadc6a9 1101 );
f33976b4 1102 break;
6eb5f6b9 1103 case EXACTF:
5dab1207
NIS
1104 m = STRING(c);
1105 ln = STR_LEN(c); /* length to match in octets/bytes */
1106 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1107 if (UTF) {
a2a2844f 1108 STRLEN ulen1, ulen2;
5dab1207 1109 U8 *sm = (U8 *) m;
89ebb4a3
JH
1110 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1111 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1112 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1113
1114 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1115 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1116
89ebb4a3 1117 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1118 0, uniflags);
89ebb4a3 1119 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1120 0, uniflags);
5dab1207
NIS
1121 lnc = 0;
1122 while (sm < ((U8 *) m + ln)) {
1123 lnc++;
1124 sm += UTF8SKIP(sm);
1125 }
1aa99e6b
IH
1126 }
1127 else {
1128 c1 = *(U8*)m;
1129 c2 = PL_fold[c1];
1130 }
6eb5f6b9
JH
1131 goto do_exactf;
1132 case EXACTFL:
5dab1207
NIS
1133 m = STRING(c);
1134 ln = STR_LEN(c);
1135 lnc = (I32) ln;
d8093b23 1136 c1 = *(U8*)m;
6eb5f6b9
JH
1137 c2 = PL_fold_locale[c1];
1138 do_exactf:
db12adc6 1139 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1140
3b0527fe 1141 if (!reginfo && e < s)
6eb5f6b9 1142 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1143
60a8b682
JH
1144 /* The idea in the EXACTF* cases is to first find the
1145 * first character of the EXACTF* node and then, if
1146 * necessary, case-insensitively compare the full
1147 * text of the node. The c1 and c2 are the first
1148 * characters (though in Unicode it gets a bit
1149 * more complicated because there are more cases
7f16dd3d
JH
1150 * than just upper and lower: one needs to use
1151 * the so-called folding case for case-insensitive
1152 * matching (called "loose matching" in Unicode).
1153 * ibcmp_utf8() will do just that. */
60a8b682 1154
1aa99e6b 1155 if (do_utf8) {
575cac57 1156 UV c, f;
89ebb4a3 1157 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1158 STRLEN len, foldlen;
4ad0818d 1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1160 if (c1 == c2) {
5dab1207
NIS
1161 /* Upper and lower of 1st char are equal -
1162 * probably not a "letter". */
1aa99e6b 1163 while (s <= e) {
89ebb4a3 1164 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1165 uniflags);
4cadc6a9 1166 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1167 }
09091399
JH
1168 }
1169 else {
1aa99e6b 1170 while (s <= e) {
89ebb4a3 1171 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1172 uniflags);
80aecb99 1173
60a8b682 1174 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1175 * Note that not all the possible combinations
1176 * are handled here: some of them are handled
1177 * by the standard folding rules, and some of
1178 * them (the character class or ANYOF cases)
1179 * are handled during compiletime in
1180 * regexec.c:S_regclass(). */
880bd946
JH
1181 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1182 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1183 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1184
4cadc6a9 1185 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1186 }
09091399 1187 }
1aa99e6b
IH
1188 }
1189 else {
1190 if (c1 == c2)
4cadc6a9 1191 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1192 else
4cadc6a9 1193 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1194 }
1195 break;
bbce6d69 1196 case BOUNDL:
3280af22 1197 PL_reg_flags |= RF_tainted;
bbce6d69 1198 /* FALL THROUGH */
a0d0e21e 1199 case BOUND:
ffc61ed2 1200 if (do_utf8) {
12d33761 1201 if (s == PL_bostr)
ffc61ed2
JH
1202 tmp = '\n';
1203 else {
6136c704 1204 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1205 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1206 }
1207 tmp = ((OP(c) == BOUND ?
9041c2e3 1208 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1209 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1210 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1211 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1212 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1213 isALNUM_LC_utf8((U8*)s)))
1214 {
1215 tmp = !tmp;
4cadc6a9 1216 REXEC_FBC_TRYIT;
a687059c 1217 }
4cadc6a9 1218 );
a0d0e21e 1219 }
667bb95a 1220 else {
12d33761 1221 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1222 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1223 REXEC_FBC_SCAN(
ffc61ed2
JH
1224 if (tmp ==
1225 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1226 tmp = !tmp;
4cadc6a9 1227 REXEC_FBC_TRYIT;
a0ed51b3 1228 }
4cadc6a9 1229 );
a0ed51b3 1230 }
3b0527fe 1231 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1232 goto got_it;
1233 break;
bbce6d69 1234 case NBOUNDL:
3280af22 1235 PL_reg_flags |= RF_tainted;
bbce6d69 1236 /* FALL THROUGH */
a0d0e21e 1237 case NBOUND:
ffc61ed2 1238 if (do_utf8) {
12d33761 1239 if (s == PL_bostr)
ffc61ed2
JH
1240 tmp = '\n';
1241 else {
6136c704 1242 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1243 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1244 }
1245 tmp = ((OP(c) == NBOUND ?
9041c2e3 1246 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1247 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1248 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1249 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1250 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1251 isALNUM_LC_utf8((U8*)s)))
1252 tmp = !tmp;
4cadc6a9
YO
1253 else REXEC_FBC_TRYIT;
1254 );
a0d0e21e 1255 }
667bb95a 1256 else {
12d33761 1257 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1258 tmp = ((OP(c) == NBOUND ?
1259 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1260 REXEC_FBC_SCAN(
ffc61ed2
JH
1261 if (tmp ==
1262 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1263 tmp = !tmp;
4cadc6a9
YO
1264 else REXEC_FBC_TRYIT;
1265 );
a0ed51b3 1266 }
3b0527fe 1267 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1268 goto got_it;
1269 break;
a0d0e21e 1270 case ALNUM:
4cadc6a9
YO
1271 REXEC_FBC_CSCAN_PRELOAD(
1272 LOAD_UTF8_CHARCLASS_ALNUM(),
1273 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1274 isALNUM(*s)
1275 );
bbce6d69 1276 case ALNUML:
4cadc6a9
YO
1277 REXEC_FBC_CSCAN_TAINT(
1278 isALNUM_LC_utf8((U8*)s),
1279 isALNUM_LC(*s)
1280 );
a0d0e21e 1281 case NALNUM:
4cadc6a9
YO
1282 REXEC_FBC_CSCAN_PRELOAD(
1283 LOAD_UTF8_CHARCLASS_ALNUM(),
1284 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1285 !isALNUM(*s)
1286 );
bbce6d69 1287 case NALNUML:
4cadc6a9
YO
1288 REXEC_FBC_CSCAN_TAINT(
1289 !isALNUM_LC_utf8((U8*)s),
1290 !isALNUM_LC(*s)
1291 );
a0d0e21e 1292 case SPACE:
4cadc6a9
YO
1293 REXEC_FBC_CSCAN_PRELOAD(
1294 LOAD_UTF8_CHARCLASS_SPACE(),
1295 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1296 isSPACE(*s)
1297 );
bbce6d69 1298 case SPACEL:
4cadc6a9
YO
1299 REXEC_FBC_CSCAN_TAINT(
1300 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1301 isSPACE_LC(*s)
1302 );
a0d0e21e 1303 case NSPACE:
4cadc6a9
YO
1304 REXEC_FBC_CSCAN_PRELOAD(
1305 LOAD_UTF8_CHARCLASS_SPACE(),
1306 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1307 !isSPACE(*s)
1308 );
bbce6d69 1309 case NSPACEL:
4cadc6a9
YO
1310 REXEC_FBC_CSCAN_TAINT(
1311 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1312 !isSPACE_LC(*s)
1313 );
a0d0e21e 1314 case DIGIT:
4cadc6a9
YO
1315 REXEC_FBC_CSCAN_PRELOAD(
1316 LOAD_UTF8_CHARCLASS_DIGIT(),
1317 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1318 isDIGIT(*s)
1319 );
b8c5462f 1320 case DIGITL:
4cadc6a9
YO
1321 REXEC_FBC_CSCAN_TAINT(
1322 isDIGIT_LC_utf8((U8*)s),
1323 isDIGIT_LC(*s)
1324 );
a0d0e21e 1325 case NDIGIT:
4cadc6a9
YO
1326 REXEC_FBC_CSCAN_PRELOAD(
1327 LOAD_UTF8_CHARCLASS_DIGIT(),
1328 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1329 !isDIGIT(*s)
1330 );
b8c5462f 1331 case NDIGITL:
4cadc6a9
YO
1332 REXEC_FBC_CSCAN_TAINT(
1333 !isDIGIT_LC_utf8((U8*)s),
1334 !isDIGIT_LC(*s)
1335 );
07be1b83
YO
1336 case TRIE:
1337 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1338 {
1339 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1340 trie_type = do_utf8 ?
1341 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1342 : trie_plain;
1343 /* what trie are we using right now */
1344 reg_ac_data *aho
1345 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1346 reg_trie_data *trie=aho->trie;
1347
1348 const char *last_start = strend - trie->minlen;
6148ee25 1349#ifdef DEBUGGING
07be1b83 1350 const char *real_start = s;
6148ee25 1351#endif
07be1b83 1352 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1353 SV *sv_points;
1354 U8 **points; /* map of where we were in the input string
1355 when reading a given string. For ASCII this
1356 is unnecessary overhead as the relationship
1357 is always 1:1, but for unicode, especially
1358 case folded unicode this is not true. */
f9e705e8 1359 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1360
1361 GET_RE_DEBUG_FLAGS_DECL;
1362
be8e71aa
YO
1363 /* We can't just allocate points here. We need to wrap it in
1364 * an SV so it gets freed properly if there is a croak while
1365 * running the match */
1366 ENTER;
1367 SAVETMPS;
1368 sv_points=newSV(maxlen * sizeof(U8 *));
1369 SvCUR_set(sv_points,
1370 maxlen * sizeof(U8 *));
1371 SvPOK_on(sv_points);
1372 sv_2mortal(sv_points);
1373 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1374
1375 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1376 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1377 s++;
1378 }
1379 }
1380
1381 while (s <= last_start) {
1382 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1383 U8 *uc = (U8*)s;
1384 U16 charid = 0;
1385 U32 base = 1;
1386 U32 state = 1;
1387 UV uvc = 0;
1388 STRLEN len = 0;
1389 STRLEN foldlen = 0;
1390 U8 *uscan = (U8*)NULL;
1391 U8 *leftmost = NULL;
1392
1393 U32 pointpos = 0;
1394
1395 while ( state && uc <= (U8*)strend ) {
1396 int failed=0;
1397 if (aho->states[ state ].wordnum) {
1398 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1399 if (!leftmost || lpos < leftmost)
1400 leftmost= lpos;
1401 if (base==0) break;
1402 }
1403 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1404 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1405 uvc, charid, foldlen, foldbuf, uniflags);
07be1b83
YO
1406 DEBUG_TRIE_EXECUTE_r(
1407 PerlIO_printf(Perl_debug_log,
1408 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1409 (int)((const char*)uc - real_start), charid, uvc)
1410 );
1411 uc += len;
1412
1413 do {
6148ee25 1414#ifdef DEBUGGING
07be1b83 1415 U32 word = aho->states[ state ].wordnum;
6148ee25 1416#endif
07be1b83
YO
1417 base = aho->states[ state ].trans.base;
1418
1419 DEBUG_TRIE_EXECUTE_r(
1420 PerlIO_printf( Perl_debug_log,
1421 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1422 failed ? "Fail transition to " : "",
1423 state, base, uvc, word)
1424 );
1425 if ( base ) {
1426 U32 tmp;
1427 if (charid &&
1428 (base + charid > trie->uniquecharcount )
1429 && (base + charid - 1 - trie->uniquecharcount
1430 < trie->lasttrans)
1431 && trie->trans[base + charid - 1 -
1432 trie->uniquecharcount].check == state
1433 && (tmp=trie->trans[base + charid - 1 -
1434 trie->uniquecharcount ].next))
1435 {
1436 state = tmp;
1437 break;
1438 }
1439 else {
1440 failed++;
1441 if ( state == 1 )
1442 break;
1443 else
1444 state = aho->fail[state];
1445 }
1446 }
1447 else {
1448 /* we must be accepting here */
1449 failed++;
1450 break;
1451 }
1452 } while(state);
1453 if (failed) {
1454 if (leftmost)
1455 break;
1456 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1457 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1458 uc++;
1459 }
1460 }
1461 }
1462 }
1463 if ( aho->states[ state ].wordnum ) {
1464 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1465 if (!leftmost || lpos < leftmost)
1466 leftmost = lpos;
1467 }
1468 DEBUG_TRIE_EXECUTE_r(
1469 PerlIO_printf( Perl_debug_log,
1470 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1471 "All done: ",
1472 state, base, uvc)
1473 );
1474 if (leftmost) {
1475 s = (char*)leftmost;
be8e71aa
YO
1476 if (!reginfo || regtry(reginfo, s)) {
1477 FREETMPS;
1478 LEAVE;
07be1b83 1479 goto got_it;
be8e71aa 1480 }
07be1b83
YO
1481 s = HOPc(s,1);
1482 } else {
1483 break;
1484 }
1485 }
be8e71aa
YO
1486 FREETMPS;
1487 LEAVE;
07be1b83
YO
1488 }
1489 break;
b3c9acc1 1490 default:
3c3eec57
GS
1491 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1492 break;
d6a28714 1493 }
6eb5f6b9
JH
1494 return 0;
1495 got_it:
1496 return s;
1497}
1498
1499/*
1500 - regexec_flags - match a regexp against a string
1501 */
1502I32
1503Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1504 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1505/* strend: pointer to null at end of string */
1506/* strbeg: real beginning of string */
1507/* minend: end of match must be >=minend after stringarg. */
1508/* data: May be used for some additional optimizations. */
1509/* nosave: For optimizations. */
1510{
97aff369 1511 dVAR;
6eb5f6b9
JH
1512 register char *s;
1513 register regnode *c;
1514 register char *startpos = stringarg;
6eb5f6b9
JH
1515 I32 minlen; /* must match at least this many chars */
1516 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1517 I32 end_shift = 0; /* Same for the end. */ /* CC */
1518 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1519 char *scream_olds = NULL;
3dab1dad 1520 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1521 const bool do_utf8 = DO_UTF8(sv);
2757e526 1522 I32 multiline;
0df25f3d 1523
3b0527fe 1524 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1525
1526 GET_RE_DEBUG_FLAGS_DECL;
1527
9d4ba2ae 1528 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1529
1530 /* Be paranoid... */
1531 if (prog == NULL || startpos == NULL) {
1532 Perl_croak(aTHX_ "NULL regexp parameter");
1533 return 0;
1534 }
1535
2757e526 1536 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1537 reginfo.prog = prog;
2757e526 1538
bac06658
JH
1539 RX_MATCH_UTF8_set(prog, do_utf8);
1540
6eb5f6b9 1541 minlen = prog->minlen;
61a36c01 1542 if (strend - startpos < minlen) {
a3621e74 1543 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1544 "String too short [regexec_flags]...\n"));
1545 goto phooey;
1aa99e6b 1546 }
6eb5f6b9 1547
6eb5f6b9
JH
1548 /* Check validity of program. */
1549 if (UCHARAT(prog->program) != REG_MAGIC) {
1550 Perl_croak(aTHX_ "corrupted regexp program");
1551 }
1552
1553 PL_reg_flags = 0;
1554 PL_reg_eval_set = 0;
1555 PL_reg_maxiter = 0;
1556
1557 if (prog->reganch & ROPT_UTF8)
1558 PL_reg_flags |= RF_utf8;
1559
1560 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1561 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1562 PL_bostr = strbeg;
3b0527fe 1563 reginfo.sv = sv;
6eb5f6b9
JH
1564
1565 /* Mark end of line for $ (and such) */
1566 PL_regeol = strend;
1567
1568 /* see how far we have to get to not match where we matched before */
3b0527fe 1569 reginfo.till = startpos+minend;
6eb5f6b9 1570
6eb5f6b9
JH
1571 /* If there is a "must appear" string, look for it. */
1572 s = startpos;
1573
3b0527fe 1574 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1575 MAGIC *mg;
1576
1577 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1578 reginfo.ganch = startpos;
6eb5f6b9
JH
1579 else if (sv && SvTYPE(sv) >= SVt_PVMG
1580 && SvMAGIC(sv)
14befaf4
DM
1581 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1582 && mg->mg_len >= 0) {
3b0527fe 1583 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1584 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1585 if (s > reginfo.ganch)
6eb5f6b9 1586 goto phooey;
3b0527fe 1587 s = reginfo.ganch;
6eb5f6b9
JH
1588 }
1589 }
1590 else /* pos() not defined */
3b0527fe 1591 reginfo.ganch = strbeg;
6eb5f6b9
JH
1592 }
1593
a0714e2c 1594 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1595 re_scream_pos_data d;
1596
1597 d.scream_olds = &scream_olds;
1598 d.scream_pos = &scream_pos;
1599 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1600 if (!s) {
a3621e74 1601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1602 goto phooey; /* not present */
3fa9c3d7 1603 }
6eb5f6b9
JH
1604 }
1605
a3621e74 1606 DEBUG_EXECUTE_r({
0df25f3d
YO
1607 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1608 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1609 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1610 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1611
2a782b5b
JH
1612 if (!PL_colorset)
1613 reginitcolors();
1614 PerlIO_printf(Perl_debug_log,
a0288114 1615 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1616 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1617 len0, len0, s0,
2a782b5b 1618 PL_colors[1],
9e55ce06 1619 len0 > 60 ? "..." : "",
2a782b5b 1620 PL_colors[0],
9e55ce06
JH
1621 (int)(len1 > 60 ? 60 : len1),
1622 s1, PL_colors[1],
1623 (len1 > 60 ? "..." : "")
2a782b5b
JH
1624 );
1625 });
6eb5f6b9
JH
1626
1627 /* Simplest case: anchored match need be tried only once. */
1628 /* [unless only anchor is BOL and multiline is set] */
1629 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1630 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1631 goto got_it;
7fba1cd6 1632 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1633 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1634 {
1635 char *end;
1636
1637 if (minlen)
1638 dontbother = minlen - 1;
1aa99e6b 1639 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1640 /* for multiline we only have to try after newlines */
33b8afdf 1641 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1642 if (s == startpos)
1643 goto after_try;
1644 while (1) {
3b0527fe 1645 if (regtry(&reginfo, s))
6eb5f6b9
JH
1646 goto got_it;
1647 after_try:
1648 if (s >= end)
1649 goto phooey;
1650 if (prog->reganch & RE_USE_INTUIT) {
1651 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1652 if (!s)
1653 goto phooey;
1654 }
1655 else
1656 s++;
1657 }
1658 } else {
1659 if (s > startpos)
1660 s--;
1661 while (s < end) {
1662 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1663 if (regtry(&reginfo, s))
6eb5f6b9
JH
1664 goto got_it;
1665 }
1666 }
1667 }
1668 }
1669 goto phooey;
1670 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1671 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1672 goto got_it;
1673 goto phooey;
1674 }
1675
1676 /* Messy cases: unanchored match. */
33b8afdf 1677 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1678 /* we have /x+whatever/ */
1679 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1680 char ch;
bf93d4cc
GS
1681#ifdef DEBUGGING
1682 int did_match = 0;
1683#endif
33b8afdf
JH
1684 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1685 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1686 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1687
1aa99e6b 1688 if (do_utf8) {
4cadc6a9 1689 REXEC_FBC_SCAN(
6eb5f6b9 1690 if (*s == ch) {
a3621e74 1691 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1692 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1693 s += UTF8SKIP(s);
1694 while (s < strend && *s == ch)
1695 s += UTF8SKIP(s);
1696 }
4cadc6a9 1697 );
6eb5f6b9
JH
1698 }
1699 else {
4cadc6a9 1700 REXEC_FBC_SCAN(
6eb5f6b9 1701 if (*s == ch) {
a3621e74 1702 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1703 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1704 s++;
1705 while (s < strend && *s == ch)
1706 s++;
1707 }
4cadc6a9 1708 );
6eb5f6b9 1709 }
a3621e74 1710 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1711 PerlIO_printf(Perl_debug_log,
b7953727
JH
1712 "Did not find anchored character...\n")
1713 );
6eb5f6b9 1714 }
a0714e2c
SS
1715 else if (prog->anchored_substr != NULL
1716 || prog->anchored_utf8 != NULL
1717 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1718 && prog->float_max_offset < strend - s)) {
1719 SV *must;
1720 I32 back_max;
1721 I32 back_min;
1722 char *last;
6eb5f6b9 1723 char *last1; /* Last position checked before */
bf93d4cc
GS
1724#ifdef DEBUGGING
1725 int did_match = 0;
1726#endif
33b8afdf
JH
1727 if (prog->anchored_substr || prog->anchored_utf8) {
1728 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1729 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1730 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1731 back_max = back_min = prog->anchored_offset;
1732 } else {
1733 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1734 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1735 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1736 back_max = prog->float_max_offset;
1737 back_min = prog->float_min_offset;
1738 }
1739 if (must == &PL_sv_undef)
1740 /* could not downgrade utf8 check substring, so must fail */
1741 goto phooey;
1742
1743 last = HOP3c(strend, /* Cannot start after this */
1744 -(I32)(CHR_SVLEN(must)
1745 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1746
1747 if (s > PL_bostr)
1748 last1 = HOPc(s, -1);
1749 else
1750 last1 = s - 1; /* bogus */
1751
a0288114 1752 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1753 check_substr==must. */
1754 scream_pos = -1;
1755 dontbother = end_shift;
1756 strend = HOPc(strend, -dontbother);
1757 while ( (s <= last) &&
9041c2e3 1758 ((flags & REXEC_SCREAM)
1aa99e6b 1759 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1760 end_shift, &scream_pos, 0))
1aa99e6b 1761 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1762 (unsigned char*)strend, must,
7fba1cd6 1763 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1764 /* we may be pointing at the wrong string */
1765 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1766 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1767 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1768 if (HOPc(s, -back_max) > last1) {
1769 last1 = HOPc(s, -back_min);
1770 s = HOPc(s, -back_max);
1771 }
1772 else {
52657f30 1773 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1774
1775 last1 = HOPc(s, -back_min);
52657f30 1776 s = t;
6eb5f6b9 1777 }
1aa99e6b 1778 if (do_utf8) {
6eb5f6b9 1779 while (s <= last1) {
3b0527fe 1780 if (regtry(&reginfo, s))
6eb5f6b9
JH
1781 goto got_it;
1782 s += UTF8SKIP(s);
1783 }
1784 }
1785 else {
1786 while (s <= last1) {
3b0527fe 1787 if (regtry(&reginfo, s))
6eb5f6b9
JH
1788 goto got_it;
1789 s++;
1790 }
1791 }
1792 }
a3621e74 1793 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1794 PerlIO_printf(Perl_debug_log,
a0288114 1795 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1796 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1797 ? "anchored" : "floating"),
1798 PL_colors[0],
1799 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1800 SvPVX_const(must),
b7953727
JH
1801 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1802 );
6eb5f6b9
JH
1803 goto phooey;
1804 }
155aba94 1805 else if ((c = prog->regstclass)) {
f14c76ed 1806 if (minlen) {
be8e71aa 1807 const OPCODE op = OP(prog->regstclass);
66e933ab 1808 /* don't bother with what can't match */
07be1b83 1809 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
1810 strend = HOPc(strend, -(minlen - 1));
1811 }
a3621e74 1812 DEBUG_EXECUTE_r({
be8e71aa 1813 SV * const prop = sv_newmortal();
32fc9b6a 1814 regprop(prog, prop, c);
0df25f3d
YO
1815 {
1816 RE_PV_DISPLAY_DECL(s0,len0,UTF,
1817 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
1818 RE_PV_DISPLAY_DECL(s1,len1,UTF,
1819 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
1820 PerlIO_printf(Perl_debug_log,
1821 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
1822 len0, len0, s0,
1823 len1, len1, s1, (int)(strend - s));
1824 }
ffc61ed2 1825 });
3b0527fe 1826 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1827 goto got_it;
07be1b83 1828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1829 }
1830 else {
1831 dontbother = 0;
a0714e2c 1832 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1833 /* Trim the end. */
d6a28714 1834 char *last;
33b8afdf
JH
1835 SV* float_real;
1836
1837 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1838 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1839 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1840
1841 if (flags & REXEC_SCREAM) {
33b8afdf 1842 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1843 end_shift, &scream_pos, 1); /* last one */
1844 if (!last)
ffc61ed2 1845 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1846 /* we may be pointing at the wrong string */
1847 else if (RX_MATCH_COPIED(prog))
3f7c398e 1848 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1849 }
d6a28714
JH
1850 else {
1851 STRLEN len;
cfd0369c 1852 const char * const little = SvPV_const(float_real, len);
d6a28714 1853
33b8afdf 1854 if (SvTAIL(float_real)) {
d6a28714
JH
1855 if (memEQ(strend - len + 1, little, len - 1))
1856 last = strend - len + 1;
7fba1cd6 1857 else if (!multiline)
9041c2e3 1858 last = memEQ(strend - len, little, len)
bd61b366 1859 ? strend - len : NULL;
b8c5462f 1860 else
d6a28714
JH
1861 goto find_last;
1862 } else {
1863 find_last:
9041c2e3 1864 if (len)
d6a28714 1865 last = rninstr(s, strend, little, little + len);
b8c5462f 1866 else
a0288114 1867 last = strend; /* matching "$" */
b8c5462f 1868 }
b8c5462f 1869 }
bf93d4cc 1870 if (last == NULL) {
a3621e74 1871 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1872 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1873 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1874 goto phooey; /* Should not happen! */
1875 }
d6a28714
JH
1876 dontbother = strend - last + prog->float_min_offset;
1877 }
1878 if (minlen && (dontbother < minlen))
1879 dontbother = minlen - 1;
1880 strend -= dontbother; /* this one's always in bytes! */
1881 /* We don't know much -- general case. */
1aa99e6b 1882 if (do_utf8) {
d6a28714 1883 for (;;) {
3b0527fe 1884 if (regtry(&reginfo, s))
d6a28714
JH
1885 goto got_it;
1886 if (s >= strend)
1887 break;
b8c5462f 1888 s += UTF8SKIP(s);
d6a28714
JH
1889 };
1890 }
1891 else {
1892 do {
3b0527fe 1893 if (regtry(&reginfo, s))
d6a28714
JH
1894 goto got_it;
1895 } while (s++ < strend);
1896 }
1897 }
1898
1899 /* Failure. */
1900 goto phooey;
1901
1902got_it:
1903 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1904
1905 if (PL_reg_eval_set) {
1906 /* Preserve the current value of $^R */
1907 if (oreplsv != GvSV(PL_replgv))
1908 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1909 restored, the value remains
1910 the same. */
4f639d21 1911 restore_pos(aTHX_ prog);
d6a28714
JH
1912 }
1913
1914 /* make sure $`, $&, $', and $digit will work later */
1915 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1916 RX_MATCH_COPY_FREE(prog);
d6a28714 1917 if (flags & REXEC_COPY_STR) {
be8e71aa 1918 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1919#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1920 if ((SvIsCOW(sv)
1921 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1922 if (DEBUG_C_TEST) {
1923 PerlIO_printf(Perl_debug_log,
1924 "Copy on write: regexp capture, type %d\n",
1925 (int) SvTYPE(sv));
1926 }
1927 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1928 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1929 assert (SvPOKp(prog->saved_copy));
1930 } else
1931#endif
1932 {
1933 RX_MATCH_COPIED_on(prog);
1934 s = savepvn(strbeg, i);
1935 prog->subbeg = s;
1936 }
d6a28714 1937 prog->sublen = i;
d6a28714
JH
1938 }
1939 else {
1940 prog->subbeg = strbeg;
1941 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1942 }
1943 }
9041c2e3 1944
d6a28714
JH
1945 return 1;
1946
1947phooey:
a3621e74 1948 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1949 PL_colors[4], PL_colors[5]));
d6a28714 1950 if (PL_reg_eval_set)
4f639d21 1951 restore_pos(aTHX_ prog);
d6a28714
JH
1952 return 0;
1953}
1954
1955/*
1956 - regtry - try match at specific point
1957 */
1958STATIC I32 /* 0 failure, 1 success */
3b0527fe 1959S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1960{
97aff369 1961 dVAR;
d6a28714
JH
1962 register I32 *sp;
1963 register I32 *ep;
1964 CHECKPOINT lastcp;
3b0527fe 1965 regexp *prog = reginfo->prog;
a3621e74 1966 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1967
02db2b7b
IZ
1968#ifdef DEBUGGING
1969 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1970#endif
d6a28714
JH
1971 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1972 MAGIC *mg;
1973
1974 PL_reg_eval_set = RS_init;
a3621e74 1975 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
1976 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1977 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1978 ));
e8347627 1979 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1980 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1981 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1982 SAVETMPS;
1983 /* Apparently this is not needed, judging by wantarray. */
e8347627 1984 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1985 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1986
3b0527fe 1987 if (reginfo->sv) {
d6a28714 1988 /* Make $_ available to executed code. */
3b0527fe 1989 if (reginfo->sv != DEFSV) {
59f00321 1990 SAVE_DEFSV;
3b0527fe 1991 DEFSV = reginfo->sv;
b8c5462f 1992 }
d6a28714 1993
3b0527fe
DM
1994 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1995 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 1996 /* prepare for quick setting of pos */
d300d9fa
NC
1997#ifdef PERL_OLD_COPY_ON_WRITE
1998 if (SvIsCOW(sv))
1999 sv_force_normal_flags(sv, 0);
2000#endif
3dab1dad 2001 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2002 &PL_vtbl_mglob, NULL, 0);
d6a28714 2003 mg->mg_len = -1;
b8c5462f 2004 }
d6a28714
JH
2005 PL_reg_magic = mg;
2006 PL_reg_oldpos = mg->mg_len;
4f639d21 2007 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2008 }
09687e5a 2009 if (!PL_reg_curpm) {
a02a5408 2010 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2011#ifdef USE_ITHREADS
2012 {
be8e71aa 2013 SV* const repointer = newSViv(0);
577e12cc 2014 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2015 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2016 av_push(PL_regex_padav,repointer);
2017 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2018 PL_regex_pad = AvARRAY(PL_regex_padav);
2019 }
2020#endif
2021 }
aaa362c4 2022 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2023 PL_reg_oldcurpm = PL_curpm;
2024 PL_curpm = PL_reg_curpm;
2025 if (RX_MATCH_COPIED(prog)) {
2026 /* Here is a serious problem: we cannot rewrite subbeg,
2027 since it may be needed if this match fails. Thus
2028 $` inside (?{}) could fail... */
2029 PL_reg_oldsaved = prog->subbeg;
2030 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2031#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2032 PL_nrs = prog->saved_copy;
2033#endif
d6a28714
JH
2034 RX_MATCH_COPIED_off(prog);
2035 }
2036 else
bd61b366 2037 PL_reg_oldsaved = NULL;
d6a28714
JH
2038 prog->subbeg = PL_bostr;
2039 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2040 }
973dddac 2041 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2042 PL_reginput = startpos;
2043 PL_regstartp = prog->startp;
2044 PL_regendp = prog->endp;
2045 PL_reglastparen = &prog->lastparen;
a01268b5 2046 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2047 prog->lastparen = 0;
03994de8 2048 prog->lastcloseparen = 0;
d6a28714 2049 PL_regsize = 0;
a3621e74 2050 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2051 if (PL_reg_start_tmpl <= prog->nparens) {
2052 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2053 if(PL_reg_start_tmp)
2054 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2055 else
a02a5408 2056 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2057 }
2058
2059 /* XXXX What this code is doing here?!!! There should be no need
2060 to do this again and again, PL_reglastparen should take care of
3dd2943c 2061 this! --ilya*/
dafc8851
JH
2062
2063 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2064 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2065 * PL_reglastparen), is not needed at all by the test suite
2066 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2067 * enough, for building DynaLoader, or otherwise this
2068 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2069 * will happen. Meanwhile, this code *is* needed for the
2070 * above-mentioned test suite tests to succeed. The common theme
2071 * on those tests seems to be returning null fields from matches.
2072 * --jhi */
dafc8851 2073#if 1
d6a28714
JH
2074 sp = prog->startp;
2075 ep = prog->endp;
2076 if (prog->nparens) {
097eb12c 2077 register I32 i;
eb160463 2078 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2079 *++sp = -1;
2080 *++ep = -1;
2081 }
2082 }
dafc8851 2083#endif
02db2b7b 2084 REGCP_SET(lastcp);
3b0527fe 2085 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2086 prog->endp[0] = PL_reginput - PL_bostr;
2087 return 1;
2088 }
02db2b7b 2089 REGCP_UNWIND(lastcp);
d6a28714
JH
2090 return 0;
2091}
2092
02db2b7b 2093
8ba1375e
MJD
2094#define sayYES goto yes
2095#define sayNO goto no
e0f9d4a8 2096#define sayNO_ANYOF goto no_anyof
8ba1375e 2097#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2098#define sayNO_FINAL goto no_final
2099#define sayNO_SILENT goto do_no
2100#define saySAME(x) if (x) goto yes; else goto no
2101
3ab3c9b4
HS
2102#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2103#define POSCACHE_SEEN 1 /* we know what we're caching */
2104#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2105
3ab3c9b4 2106#define CACHEsayYES STMT_START { \
d8319b27 2107 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2108 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2109 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2110 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2111 } \
2112 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2113 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2114 } \
2115 else { \
3ab3c9b4
HS
2116 /* cache records failure, but this is success */ \
2117 DEBUG_r( \
2118 PerlIO_printf(Perl_debug_log, \
2119 "%*s (remove success from failure cache)\n", \
2120 REPORT_CODE_OFF+PL_regindent*2, "") \
2121 ); \
d8319b27 2122 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2123 } \
2124 } \
2125 sayYES; \
2126} STMT_END
7409bbd3 2127
3ab3c9b4 2128#define CACHEsayNO STMT_START { \
d8319b27 2129 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2130 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2131 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2132 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2133 } \
2134 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2135 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2136 } \
2137 else { \
3ab3c9b4
HS
2138 /* cache records success, but this is failure */ \
2139 DEBUG_r( \
2140 PerlIO_printf(Perl_debug_log, \
2141 "%*s (remove failure from success cache)\n", \
2142 REPORT_CODE_OFF+PL_regindent*2, "") \
2143 ); \
d8319b27 2144 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2145 } \
2146 } \
2147 sayNO; \
2148} STMT_END
2149
a3621e74
YO
2150/* this is used to determine how far from the left messages like
2151 'failed...' are printed. Currently 29 makes these messages line
2152 up with the opcode they refer to. Earlier perls used 25 which
2153 left these messages outdented making reviewing a debug output
2154 quite difficult.
2155*/
2156#define REPORT_CODE_OFF 29
2157
2158
2159/* Make sure there is a test for this +1 options in re_tests */
2160#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2161
40a82448
DM
2162#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2163#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2164
86545054
DM
2165#define SLAB_FIRST(s) (&(s)->states[0])
2166#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2167
5d9a96ca
DM
2168/* grab a new slab and return the first slot in it */
2169
2170STATIC regmatch_state *
2171S_push_slab(pTHX)
2172{
54df2634
NC
2173#if PERL_VERSION < 9
2174 dMY_CXT;
2175#endif
5d9a96ca
DM
2176 regmatch_slab *s = PL_regmatch_slab->next;
2177 if (!s) {
2178 Newx(s, 1, regmatch_slab);
2179 s->prev = PL_regmatch_slab;
2180 s->next = NULL;
2181 PL_regmatch_slab->next = s;
2182 }
2183 PL_regmatch_slab = s;
86545054 2184 return SLAB_FIRST(s);
5d9a96ca 2185}
5b47454d 2186
95b24440
DM
2187/* simulate a recursive call to regmatch */
2188
2189#define REGMATCH(ns, where) \
5d9a96ca
DM
2190 st->scan = scan; \
2191 scan = (ns); \
2192 st->resume_state = resume_##where; \
95b24440
DM
2193 goto start_recurse; \
2194 resume_point_##where:
2195
40a82448
DM
2196/* push a new state then goto it */
2197
2198#define PUSH_STATE_GOTO(state, node) \
2199 scan = node; \
2200 st->resume_state = state; \
2201 goto push_state;
2202
2203/* push a new state with success backtracking, then goto it */
2204
2205#define PUSH_YES_STATE_GOTO(state, node) \
2206 scan = node; \
2207 st->resume_state = state; \
2208 goto push_yes_state;
2209
aa283a38 2210
aa283a38 2211
d6a28714
JH
2212/*
2213 - regmatch - main matching routine
2214 *
2215 * Conceptually the strategy is simple: check to see whether the current
2216 * node matches, call self recursively to see whether the rest matches,
2217 * and then act accordingly. In practice we make some effort to avoid
2218 * recursion, in particular by going through "ordinary" nodes (that don't
2219 * need to know whether the rest of the match failed) by a loop instead of
2220 * by recursion.
2221 */
2222/* [lwall] I've hoisted the register declarations to the outer block in order to
2223 * maybe save a little bit of pushing and popping on the stack. It also takes
2224 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2225 *
2226 * This function used to be heavily recursive, but since this had the
2227 * effect of blowing the CPU stack on complex regexes, it has been
2228 * restructured to be iterative, and to save state onto the heap rather
2229 * than the stack. Essentially whereever regmatch() used to be called, it
2230 * pushes the current state, notes where to return, then jumps back into
2231 * the main loop.
2232 *
2233 * Originally the structure of this function used to look something like
2234
2235 S_regmatch() {
2236 int a = 1, b = 2;
2237 ...
2238 while (scan != NULL) {
5d9a96ca 2239 a++; // do stuff with a and b
95b24440
DM
2240 ...
2241 switch (OP(scan)) {
2242 case FOO: {
2243 int local = 3;
2244 ...
2245 if (regmatch(...)) // recurse
2246 goto yes;
2247 }
2248 ...
2249 }
2250 }
2251 yes:
2252 return 1;
2253 }
2254
2255 * Now it looks something like this:
2256
5d9a96ca 2257 typedef struct {
95b24440
DM
2258 int a, b, local;
2259 int resume_state;
5d9a96ca 2260 } regmatch_state;
95b24440
DM
2261
2262 S_regmatch() {
5d9a96ca
DM
2263 regmatch_state *st = new();
2264 int depth=0;
2265 st->a++; // do stuff with a and b
95b24440
DM
2266 ...
2267 while (scan != NULL) {
2268 ...
2269 switch (OP(scan)) {
2270 case FOO: {
5d9a96ca 2271 st->local = 3;
95b24440 2272 ...
5d9a96ca
DM
2273 st->scan = scan;
2274 scan = ...;
2275 st->resume_state = resume_FOO;
2276 goto start_recurse; // recurse
95b24440 2277
5d9a96ca
DM
2278 resume_point_FOO:
2279 if (result)
95b24440
DM
2280 goto yes;
2281 }
2282 ...
2283 }
5d9a96ca
DM
2284 start_recurse:
2285 st = new(); push a new state
2286 st->a = 1; st->b = 2;
2287 depth++;
95b24440 2288 }
5d9a96ca 2289 yes:
95b24440 2290 result = 1;
5d9a96ca
DM
2291 if (depth--) {
2292 st = pop();
95b24440
DM
2293 switch (resume_state) {
2294 case resume_FOO:
2295 goto resume_point_FOO;
2296 ...
2297 }
2298 }
2299 return result
2300 }
2301
2302 * WARNING: this means that any line in this function that contains a
2303 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2304 * regmatch() using gotos instead. Thus the values of any local variables
2305 * not saved in the regmatch_state structure will have been lost when
2306 * execution resumes on the next line .
5d9a96ca
DM
2307 *
2308 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2309 * PL_regmatch_state always points to the currently active state, and
2310 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2311 * The first time regmatch is called, the first slab is allocated, and is
2312 * never freed until interpreter desctruction. When the slab is full,
2313 * a new one is allocated chained to the end. At exit from regmatch, slabs
2314 * allocated since entry are freed.
d6a28714 2315 */
95b24440 2316
40a82448 2317/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2318#define TRIE_next (REGNODE_MAX+1)
2319#define TRIE_next_fail (REGNODE_MAX+2)
2320#define EVAL_A (REGNODE_MAX+3)
2321#define EVAL_A_fail (REGNODE_MAX+4)
2322#define resume_CURLYX (REGNODE_MAX+5)
2323#define resume_WHILEM1 (REGNODE_MAX+6)
2324#define resume_WHILEM2 (REGNODE_MAX+7)
2325#define resume_WHILEM3 (REGNODE_MAX+8)
2326#define resume_WHILEM4 (REGNODE_MAX+9)
2327#define resume_WHILEM5 (REGNODE_MAX+10)
2328#define resume_WHILEM6 (REGNODE_MAX+11)
2329#define BRANCH_next (REGNODE_MAX+12)
2330#define BRANCH_next_fail (REGNODE_MAX+13)
2331#define CURLYM_A (REGNODE_MAX+14)
2332#define CURLYM_A_fail (REGNODE_MAX+15)
2333#define CURLYM_B (REGNODE_MAX+16)
2334#define CURLYM_B_fail (REGNODE_MAX+17)
2335#define IFMATCH_A (REGNODE_MAX+18)
2336#define IFMATCH_A_fail (REGNODE_MAX+19)
2337#define CURLY_B_min_known (REGNODE_MAX+20)
2338#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2339#define CURLY_B_min (REGNODE_MAX+22)
2340#define CURLY_B_min_fail (REGNODE_MAX+23)
2341#define CURLY_B_max (REGNODE_MAX+24)
2342#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2343
2344
3dab1dad 2345#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2346
3df15adc
YO
2347#ifdef DEBUGGING
2348
2349STATIC void
07be1b83
YO
2350S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2351{
2352 const int docolor = *PL_colors[0];
2353 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2354 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2355 /* The part of the string before starttry has one color
2356 (pref0_len chars), between starttry and current
2357 position another one (pref_len - pref0_len chars),
2358 after the current position the third one.
2359 We assume that pref0_len <= pref_len, otherwise we
2360 decrease pref0_len. */
2361 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2362 ? (5 + taill) - l : locinput - PL_bostr;
2363 int pref0_len;
2364
2365 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2366 pref_len++;
2367 pref0_len = pref_len - (locinput - PL_reg_starttry);
2368 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2369 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2370 ? (5 + taill) - pref_len : PL_regeol - locinput);
2371 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2372 l--;
2373 if (pref0_len < 0)
2374 pref0_len = 0;
2375 if (pref0_len > pref_len)
2376 pref0_len = pref_len;
2377 {
3df15adc 2378 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d
YO
2379
2380 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2381 (locinput - pref_len),pref0_len, 60);
2382
2383 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2384 (locinput - pref_len + pref0_len),
0df25f3d
YO
2385 pref_len - pref0_len, 60);
2386
2387 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2388 locinput, PL_regeol - locinput, 60);
2389
3df15adc 2390 PerlIO_printf(Perl_debug_log,
07be1b83
YO
2391 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2392 (IV)(locinput - PL_bostr),
2393 PL_colors[4],
2394 len0, s0,
2395 PL_colors[5],
2396 PL_colors[2],
2397 len1, s1,
2398 PL_colors[3],
2399 (docolor ? "" : "> <"),
2400 PL_colors[0],
2401 len2, s2,
2402 PL_colors[1],
2403 15 - l - pref_len + 1,
2404 "");
2405 }
2406}
3df15adc 2407
07be1b83
YO
2408#endif
2409
d6a28714 2410STATIC I32 /* 0 failure, 1 success */
3b0527fe 2411S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2412{
54df2634
NC
2413#if PERL_VERSION < 9
2414 dMY_CXT;
2415#endif
27da23d5 2416 dVAR;
95b24440 2417 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2418 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2419
3b0527fe
DM
2420 regexp *rex = reginfo->prog;
2421
5d9a96ca
DM
2422 regmatch_slab *orig_slab;
2423 regmatch_state *orig_state;
a3621e74 2424
5d9a96ca
DM
2425 /* the current state. This is a cached copy of PL_regmatch_state */
2426 register regmatch_state *st;
95b24440 2427
5d9a96ca
DM
2428 /* cache heavy used fields of st in registers */
2429 register regnode *scan;
2430 register regnode *next;
2431 register I32 n = 0; /* initialize to shut up compiler warning */
2432 register char *locinput = PL_reginput;
95b24440 2433
5d9a96ca
DM
2434 /* these variables are NOT saved during a recusive RFEGMATCH: */
2435 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2436 bool result; /* return value of S_regmatch */
5d9a96ca 2437 int depth = 0; /* depth of recursion */
77cb431f
DM
2438 regmatch_state *yes_state = NULL; /* state to pop to on success of
2439 subpattern */
40a82448 2440 U32 state_num;
95b24440
DM
2441
2442#ifdef DEBUGGING
e68ec53f 2443 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2444 PL_regindent++;
2445#endif
2446
5d9a96ca
DM
2447 /* on first ever call to regmatch, allocate first slab */
2448 if (!PL_regmatch_slab) {
2449 Newx(PL_regmatch_slab, 1, regmatch_slab);
2450 PL_regmatch_slab->prev = NULL;
2451 PL_regmatch_slab->next = NULL;
86545054 2452 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2453 }
2454
2455 /* remember current high-water mark for exit */
2456 /* XXX this should be done with SAVE* instead */
2457 orig_slab = PL_regmatch_slab;
2458 orig_state = PL_regmatch_state;
2459
2460 /* grab next free state slot */
2461 st = ++PL_regmatch_state;
86545054 2462 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2463 st = PL_regmatch_state = S_push_slab(aTHX);
2464
2465 st->minmod = 0;
2466 st->sw = 0;
2467 st->logical = 0;
5d9a96ca 2468 st->cc = NULL;
d6a28714
JH
2469 /* Note that nextchr is a byte even in UTF */
2470 nextchr = UCHARAT(locinput);
2471 scan = prog;
2472 while (scan != NULL) {
8ba1375e 2473
a3621e74 2474 DEBUG_EXECUTE_r( {
6136c704 2475 SV * const prop = sv_newmortal();
07be1b83 2476 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2477 regprop(rex, prop, scan);
07be1b83
YO
2478
2479 PerlIO_printf(Perl_debug_log,
2480 "%3"IVdf":%*s%s(%"IVdf")\n",
2481 (IV)(scan - rex->program), PL_regindent*2, "",
2482 SvPVX_const(prop),
2483 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2484 });
d6a28714
JH
2485
2486 next = scan + NEXT_OFF(scan);
2487 if (next == scan)
2488 next = NULL;
40a82448 2489 state_num = OP(scan);
d6a28714 2490
40a82448
DM
2491 reenter_switch:
2492 switch (state_num) {
d6a28714 2493 case BOL:
7fba1cd6 2494 if (locinput == PL_bostr)
d6a28714 2495 {
3b0527fe 2496 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2497 break;
2498 }
d6a28714
JH
2499 sayNO;
2500 case MBOL:
12d33761
HS
2501 if (locinput == PL_bostr ||
2502 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2503 {
b8c5462f
JH
2504 break;
2505 }
d6a28714
JH
2506 sayNO;
2507 case SBOL:
c2a73568 2508 if (locinput == PL_bostr)
b8c5462f 2509 break;
d6a28714
JH
2510 sayNO;
2511 case GPOS:
3b0527fe 2512 if (locinput == reginfo->ganch)
d6a28714
JH
2513 break;
2514 sayNO;
2515 case EOL:
d6a28714
JH
2516 goto seol;
2517 case MEOL:
d6a28714 2518 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2519 sayNO;
b8c5462f 2520 break;
d6a28714
JH
2521 case SEOL:
2522 seol:
2523 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2524 sayNO;
d6a28714 2525 if (PL_regeol - locinput > 1)
b8c5462f 2526 sayNO;
b8c5462f 2527 break;
d6a28714
JH
2528 case EOS:
2529 if (PL_regeol != locinput)
b8c5462f 2530 sayNO;
d6a28714 2531 break;
ffc61ed2 2532 case SANY:
d6a28714 2533 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2534 sayNO;
f33976b4
DB
2535 if (do_utf8) {
2536 locinput += PL_utf8skip[nextchr];
2537 if (locinput > PL_regeol)
2538 sayNO;
2539 nextchr = UCHARAT(locinput);
2540 }
2541 else
2542 nextchr = UCHARAT(++locinput);
2543 break;
2544 case CANY:
2545 if (!nextchr && locinput >= PL_regeol)
2546 sayNO;
b8c5462f 2547 nextchr = UCHARAT(++locinput);
a0d0e21e 2548 break;
ffc61ed2 2549 case REG_ANY:
1aa99e6b
IH
2550 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2551 sayNO;
2552 if (do_utf8) {
b8c5462f 2553 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2554 if (locinput > PL_regeol)
2555 sayNO;
a0ed51b3 2556 nextchr = UCHARAT(locinput);
a0ed51b3 2557 }
1aa99e6b
IH
2558 else
2559 nextchr = UCHARAT(++locinput);
a0ed51b3 2560 break;
166ba7cd
DM
2561
2562#undef ST
2563#define ST st->u.trie
2564
5b47454d 2565 case TRIE:
3dab1dad 2566 {
07be1b83 2567 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2568 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2569 trie_type = do_utf8 ?
2570 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2571 : trie_plain;
2572
2573 /* what trie are we using right now */
be8e71aa 2574 reg_trie_data * const trie
3dab1dad
YO
2575 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2576 U32 state = trie->startstate;
166ba7cd
DM
2577
2578 U8 *uc = ( U8* )locinput;
2579 U16 charid = 0;
2580 U32 base = 0;
2581 UV uvc = 0;
2582 STRLEN len = 0;
2583 STRLEN foldlen = 0;
2584 U8 *uscan = (U8*)NULL;
2585 STRLEN bufflen=0;
2586 SV *sv_accept_buff = NULL;
2587 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2588
2589 ST.accepted = 0; /* how many accepting states we have seen */
2590 ST.B = next;
2591#ifdef DEBUGGING
2592 ST.me = scan;
2593#endif
3dab1dad
YO
2594
2595 if (trie->bitmap && trie_type != trie_utf8_fold &&
2596 !TRIE_BITMAP_TEST(trie,*locinput)
2597 ) {
2598 if (trie->states[ state ].wordnum) {
2599 DEBUG_EXECUTE_r(
2600 PerlIO_printf(Perl_debug_log,
2601 "%*s %smatched empty string...%s\n",
2602 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2603 );
2604 break;
2605 } else {
2606 DEBUG_EXECUTE_r(
2607 PerlIO_printf(Perl_debug_log,
2608 "%*s %sfailed to match start class...%s\n",
2609 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2610 );
2611 sayNO_SILENT;
2612 }
2613 }
166ba7cd 2614
07be1b83
YO
2615 /*
2616 traverse the TRIE keeping track of all accepting states
2617 we transition through until we get to a failing node.
2618 */
2619
a3621e74
YO
2620 while ( state && uc <= (U8*)PL_regeol ) {
2621
5b47454d 2622 if (trie->states[ state ].wordnum) {
166ba7cd 2623 if (!ST.accepted ) {
5b47454d
DM
2624 ENTER;
2625 SAVETMPS;
2626 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2627 sv_accept_buff=newSV(bufflen *
2628 sizeof(reg_trie_accepted) - 1);
2629 SvCUR_set(sv_accept_buff,
2630 sizeof(reg_trie_accepted));
2631 SvPOK_on(sv_accept_buff);
2632 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2633 SAVETMPS;
2634 ST.accept_buff =
5b47454d
DM
2635 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2636 }
2637 else {
166ba7cd 2638 if (ST.accepted >= bufflen) {
5b47454d 2639 bufflen *= 2;
166ba7cd 2640 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2641 SvGROW(sv_accept_buff,
2642 bufflen * sizeof(reg_trie_accepted));
2643 }
2644 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2645 + sizeof(reg_trie_accepted));
2646 }
166ba7cd
DM
2647 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2648 ST.accept_buff[ST.accepted].endpos = uc;
2649 ++ST.accepted;
5b47454d 2650 }
a3621e74
YO
2651
2652 base = trie->states[ state ].trans.base;
2653
07be1b83
YO
2654 DEBUG_TRIE_EXECUTE_r({
2655 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2656 PerlIO_printf( Perl_debug_log,
e4584336 2657 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2658 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2659 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2660 });
a3621e74
YO
2661
2662 if ( base ) {
4cadc6a9
YO
2663 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2664 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2665
5b47454d
DM
2666 if (charid &&
2667 (base + charid > trie->uniquecharcount )
2668 && (base + charid - 1 - trie->uniquecharcount
2669 < trie->lasttrans)
2670 && trie->trans[base + charid - 1 -
2671 trie->uniquecharcount].check == state)
2672 {
2673 state = trie->trans[base + charid - 1 -
2674 trie->uniquecharcount ].next;
2675 }
2676 else {
2677 state = 0;
2678 }
2679 uc += len;
2680
2681 }
2682 else {
a3621e74
YO
2683 state = 0;
2684 }
2685 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2686 PerlIO_printf( Perl_debug_log,
2687 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2688 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2689 );
2690 }
166ba7cd 2691 if (!ST.accepted )
a3621e74 2692 sayNO;
a3621e74 2693
166ba7cd
DM
2694 DEBUG_EXECUTE_r(
2695 PerlIO_printf( Perl_debug_log,
2696 "%*s %sgot %"IVdf" possible matches%s\n",
2697 REPORT_CODE_OFF + PL_regindent * 2, "",
2698 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2699 );
2700 }
2701
2702 /* FALL THROUGH */
2703
2704 case TRIE_next_fail: /* we failed - try next alterative */
2705
2706 if ( ST.accepted == 1 ) {
2707 /* only one choice left - just continue */
2708 DEBUG_EXECUTE_r({
2709 reg_trie_data * const trie
2710 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2711 SV ** const tmp = RX_DEBUG(reginfo->prog)
2712 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2713 : NULL;
2714 PerlIO_printf( Perl_debug_log,
2715 "%*s %sonly one match left: #%d <%s>%s\n",
2716 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2717 ST.accept_buff[ 0 ].wordnum,
2718 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2719 PL_colors[5] );
2720 });
2721 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2722 /* in this case we free tmps/leave before we call regmatch
2723 as we wont be using accept_buff again. */
2724 FREETMPS;
2725 LEAVE;
2726 locinput = PL_reginput;
2727 nextchr = UCHARAT(locinput);
2728 scan = ST.B;
2729 continue; /* execute rest of RE */
2730 }
2731
2732 if (!ST.accepted-- ) {
2733 FREETMPS;
2734 LEAVE;
2735 sayNO;
2736 }
2737
a3621e74 2738 /*
166ba7cd
DM
2739 There are at least two accepting states left. Presumably
2740 the number of accepting states is going to be low,
2741 typically two. So we simply scan through to find the one
2742 with lowest wordnum. Once we find it, we swap the last
2743 state into its place and decrement the size. We then try to
2744 match the rest of the pattern at the point where the word
2745 ends. If we succeed, control just continues along the
2746 regex; if we fail we return here to try the next accepting
2747 state
2748 */
a3621e74 2749
166ba7cd
DM
2750 {
2751 U32 best = 0;
2752 U32 cur;
2753 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2754 DEBUG_TRIE_EXECUTE_r(
f2278c82 2755 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2756 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2757 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2758 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2759 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2760 );
2761
2762 if (ST.accept_buff[cur].wordnum <
2763 ST.accept_buff[best].wordnum)
2764 best = cur;
a3621e74 2765 }
166ba7cd
DM
2766
2767 DEBUG_EXECUTE_r({
2768 reg_trie_data * const trie
2769 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2770 SV ** const tmp = RX_DEBUG(reginfo->prog)
2771 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2772 : NULL;
2773 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2774 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2775 ST.accept_buff[best].wordnum,
2776 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2777 PL_colors[5] );
2778 });
2779
2780 if ( best<ST.accepted ) {
2781 reg_trie_accepted tmp = ST.accept_buff[ best ];
2782 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2783 ST.accept_buff[ ST.accepted ] = tmp;
2784 best = ST.accepted;
a3621e74 2785 }
166ba7cd
DM
2786 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2787 }
2788 PUSH_STATE_GOTO(TRIE_next, ST.B);
2789 /* NOTREACHED */
2790
2791#undef ST
2792
95b24440
DM
2793 case EXACT: {
2794 char *s = STRING(scan);
5d9a96ca 2795 st->ln = STR_LEN(scan);
eb160463 2796 if (do_utf8 != UTF) {
bc517b45 2797 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2798 char *l = locinput;
be8e71aa 2799 const char * const e = s + st->ln;
a72c7584 2800
5ff6fc6d
JH
2801 if (do_utf8) {
2802 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2803 while (s < e) {
a3b680e6 2804 STRLEN ulen;
1aa99e6b 2805 if (l >= PL_regeol)
5ff6fc6d
JH
2806 sayNO;
2807 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2808 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2809 uniflags))
5ff6fc6d 2810 sayNO;
bc517b45 2811 l += ulen;
5ff6fc6d 2812 s ++;
1aa99e6b 2813 }
5ff6fc6d
JH
2814 }
2815 else {
2816 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2817 while (s < e) {
a3b680e6 2818 STRLEN ulen;
1aa99e6b
IH
2819 if (l >= PL_regeol)
2820 sayNO;
5ff6fc6d 2821 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2822 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2823 uniflags))
1aa99e6b 2824 sayNO;
bc517b45 2825 s += ulen;
a72c7584 2826 l ++;
1aa99e6b 2827 }
5ff6fc6d 2828 }
1aa99e6b
IH
2829 locinput = l;
2830 nextchr = UCHARAT(locinput);
2831 break;
2832 }
bc517b45 2833 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2834 /* Inline the first character, for speed. */
2835 if (UCHARAT(s) != nextchr)
2836 sayNO;
5d9a96ca 2837 if (PL_regeol - locinput < st->ln)
d6a28714 2838 sayNO;
5d9a96ca 2839 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2840 sayNO;
5d9a96ca 2841 locinput += st->ln;
d6a28714
JH
2842 nextchr = UCHARAT(locinput);
2843 break;
95b24440 2844 }
d6a28714 2845 case EXACTFL:
b8c5462f
JH
2846 PL_reg_flags |= RF_tainted;
2847 /* FALL THROUGH */
95b24440 2848 case EXACTF: {
be8e71aa 2849 char * const s = STRING(scan);
5d9a96ca 2850 st->ln = STR_LEN(scan);
d6a28714 2851
d07ddd77
JH
2852 if (do_utf8 || UTF) {
2853 /* Either target or the pattern are utf8. */
be8e71aa 2854 const char * const l = locinput;
d07ddd77 2855 char *e = PL_regeol;
bc517b45 2856
5d9a96ca 2857 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2858 l, &e, 0, do_utf8)) {
5486206c
JH
2859 /* One more case for the sharp s:
2860 * pack("U0U*", 0xDF) =~ /ss/i,
2861 * the 0xC3 0x9F are the UTF-8
2862 * byte sequence for the U+00DF. */
2863 if (!(do_utf8 &&
2864 toLOWER(s[0]) == 's' &&
5d9a96ca 2865 st->ln >= 2 &&
5486206c
JH
2866 toLOWER(s[1]) == 's' &&
2867 (U8)l[0] == 0xC3 &&
2868 e - l >= 2 &&
2869 (U8)l[1] == 0x9F))
2870 sayNO;
2871 }
d07ddd77
JH
2872 locinput = e;
2873 nextchr = UCHARAT(locinput);
2874 break;
a0ed51b3 2875 }
d6a28714 2876
bc517b45
JH
2877 /* Neither the target and the pattern are utf8. */
2878
d6a28714
JH
2879 /* Inline the first character, for speed. */
2880 if (UCHARAT(s) != nextchr &&
2881 UCHARAT(s) != ((OP(scan) == EXACTF)
2882 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2883 sayNO;
5d9a96ca 2884 if (PL_regeol - locinput < st->ln)
b8c5462f 2885 sayNO;
5d9a96ca
DM
2886 if (st->ln > 1 && (OP(scan) == EXACTF
2887 ? ibcmp(s, locinput, st->ln)
2888 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2889 sayNO;
5d9a96ca 2890 locinput += st->ln;
d6a28714 2891 nextchr = UCHARAT(locinput);
a0d0e21e 2892 break;
95b24440 2893 }
d6a28714 2894 case ANYOF:
ffc61ed2 2895 if (do_utf8) {
9e55ce06
JH
2896 STRLEN inclasslen = PL_regeol - locinput;
2897
32fc9b6a 2898 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2899 sayNO_ANYOF;
ffc61ed2
JH
2900 if (locinput >= PL_regeol)
2901 sayNO;
0f0076b4 2902 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2903 nextchr = UCHARAT(locinput);
e0f9d4a8 2904 break;
ffc61ed2
JH
2905 }
2906 else {
2907 if (nextchr < 0)
2908 nextchr = UCHARAT(locinput);
32fc9b6a 2909 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2910 sayNO_ANYOF;
ffc61ed2
JH
2911 if (!nextchr && locinput >= PL_regeol)
2912 sayNO;
2913 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2914 break;
2915 }
2916 no_anyof:
2917 /* If we might have the case of the German sharp s
2918 * in a casefolding Unicode character class. */
2919
ebc501f0
JH
2920 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2921 locinput += SHARP_S_SKIP;
e0f9d4a8 2922 nextchr = UCHARAT(locinput);
ffc61ed2 2923 }
e0f9d4a8
JH
2924 else
2925 sayNO;
b8c5462f 2926 break;
d6a28714 2927 case ALNUML:
b8c5462f
JH
2928 PL_reg_flags |= RF_tainted;
2929 /* FALL THROUGH */
d6a28714 2930 case ALNUM:
b8c5462f 2931 if (!nextchr)
4633a7c4 2932 sayNO;
ffc61ed2 2933 if (do_utf8) {
1a4fad37 2934 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2935 if (!(OP(scan) == ALNUM
bb7a0f54 2936 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2937 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2938 {
2939 sayNO;
a0ed51b3 2940 }
b8c5462f 2941 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2942 nextchr = UCHARAT(locinput);
2943 break;
2944 }
ffc61ed2 2945 if (!(OP(scan) == ALNUM
d6a28714 2946 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2947 sayNO;
b8c5462f 2948 nextchr = UCHARAT(++locinput);
a0d0e21e 2949 break;
d6a28714 2950 case NALNUML:
b8c5462f
JH
2951 PL_reg_flags |= RF_tainted;
2952 /* FALL THROUGH */
d6a28714
JH
2953 case NALNUM:
2954 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2955 sayNO;
ffc61ed2 2956 if (do_utf8) {
1a4fad37 2957 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2958 if (OP(scan) == NALNUM
bb7a0f54 2959 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2960 : isALNUM_LC_utf8((U8*)locinput))
2961 {
b8c5462f 2962 sayNO;
d6a28714 2963 }
b8c5462f
JH
2964 locinput += PL_utf8skip[nextchr];
2965 nextchr = UCHARAT(locinput);
2966 break;
2967 }
ffc61ed2 2968 if (OP(scan) == NALNUM
d6a28714 2969 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2970 sayNO;
76e3520e 2971 nextchr = UCHARAT(++locinput);
a0d0e21e 2972 break;
d6a28714
JH
2973 case BOUNDL:
2974 case NBOUNDL:
3280af22 2975 PL_reg_flags |= RF_tainted;
bbce6d69 2976 /* FALL THROUGH */
d6a28714
JH
2977 case BOUND:
2978 case NBOUND:
2979 /* was last char in word? */
ffc61ed2 2980 if (do_utf8) {
12d33761 2981 if (locinput == PL_bostr)
5d9a96ca 2982 st->ln = '\n';
ffc61ed2 2983 else {
a3b680e6 2984 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2985
4ad0818d 2986 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
2987 }
2988 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2989 st->ln = isALNUM_uni(st->ln);
1a4fad37 2990 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2991 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2992 }
2993 else {
5d9a96ca 2994 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
2995 n = isALNUM_LC_utf8((U8*)locinput);
2996 }
a0ed51b3 2997 }
d6a28714 2998 else {
5d9a96ca 2999 st->ln = (locinput != PL_bostr) ?
12d33761 3000 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3001 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3002 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3003 n = isALNUM(nextchr);
3004 }
3005 else {
5d9a96ca 3006 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3007 n = isALNUM_LC(nextchr);
3008 }
d6a28714 3009 }
5d9a96ca 3010 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3011 OP(scan) == BOUNDL))
3012 sayNO;
a0ed51b3 3013 break;
d6a28714 3014 case SPACEL:
3280af22 3015 PL_reg_flags |= RF_tainted;
bbce6d69 3016 /* FALL THROUGH */
d6a28714 3017 case SPACE:
9442cb0e 3018 if (!nextchr)
4633a7c4 3019 sayNO;
1aa99e6b 3020 if (do_utf8) {
fd400ab9 3021 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3022 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3023 if (!(OP(scan) == SPACE
bb7a0f54 3024 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3025 : isSPACE_LC_utf8((U8*)locinput)))
3026 {
3027 sayNO;
3028 }
3029 locinput += PL_utf8skip[nextchr];
3030 nextchr = UCHARAT(locinput);
3031 break;
d6a28714 3032 }
ffc61ed2
JH
3033 if (!(OP(scan) == SPACE
3034 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3035 sayNO;
3036 nextchr = UCHARAT(++locinput);
3037 }
3038 else {
3039 if (!(OP(scan) == SPACE
3040 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3041 sayNO;
3042 nextchr = UCHARAT(++locinput);
a0ed51b3 3043 }
a0ed51b3 3044 break;
d6a28714 3045 case NSPACEL:
3280af22 3046 PL_reg_flags |= RF_tainted;
bbce6d69 3047 /* FALL THROUGH */
d6a28714 3048 case NSPACE:
9442cb0e 3049 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3050 sayNO;
1aa99e6b 3051 if (do_utf8) {
1a4fad37 3052 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3053 if (OP(scan) == NSPACE
bb7a0f54 3054 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3055 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3056 {
3057 sayNO;
3058 }
3059 locinput += PL_utf8skip[nextchr];
3060 nextchr = UCHARAT(locinput);
3061 break;
a0ed51b3 3062 }
ffc61ed2 3063 if (OP(scan) == NSPACE
d6a28714 3064 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3065 sayNO;
76e3520e 3066 nextchr = UCHARAT(++locinput);
a0d0e21e 3067 break;
d6a28714 3068 case DIGITL:
a0ed51b3
LW
3069 PL_reg_flags |= RF_tainted;
3070 /* FALL THROUGH */
d6a28714 3071 case DIGIT:
9442cb0e 3072 if (!nextchr)
a0ed51b3 3073 sayNO;
1aa99e6b 3074 if (do_utf8) {
1a4fad37 3075 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3076 if (!(OP(scan) == DIGIT
bb7a0f54 3077 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3078 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3079 {
a0ed51b3 3080 sayNO;
dfe13c55 3081 }
6f06b55f 3082 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3083 nextchr = UCHARAT(locinput);
3084 break;
3085 }
ffc61ed2 3086 if (!(OP(scan) == DIGIT
9442cb0e 3087 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3088 sayNO;
3089 nextchr = UCHARAT(++locinput);
3090 break;
d6a28714 3091 case NDIGITL:
b8c5462f
JH
3092 PL_reg_flags |= RF_tainted;
3093 /* FALL THROUGH */
d6a28714 3094 case NDIGIT:
9442cb0e 3095 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3096 sayNO;
1aa99e6b 3097 if (do_utf8) {
1a4fad37 3098 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3099 if (OP(scan) == NDIGIT
bb7a0f54 3100 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3101 : isDIGIT_LC_utf8((U8*)locinput))
3102 {
a0ed51b3 3103 sayNO;
9442cb0e 3104 }
6f06b55f 3105 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3106 nextchr = UCHARAT(locinput);
3107 break;
3108 }
ffc61ed2 3109 if (OP(scan) == NDIGIT
9442cb0e 3110 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3111 sayNO;
3112 nextchr = UCHARAT(++locinput);
3113 break;
3114 case CLUMP:
b7c83a7e 3115 if (locinput >= PL_regeol)
a0ed51b3 3116 sayNO;
b7c83a7e 3117 if (do_utf8) {
1a4fad37 3118 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3119 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3120 sayNO;
3121 locinput += PL_utf8skip[nextchr];
3122 while (locinput < PL_regeol &&
3123 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3124 locinput += UTF8SKIP(locinput);
3125 if (locinput > PL_regeol)
3126 sayNO;
eb08e2da
JH
3127 }
3128 else
3129 locinput++;
a0ed51b3
LW
3130 nextchr = UCHARAT(locinput);
3131 break;
c8756f30 3132 case REFFL:
3280af22 3133 PL_reg_flags |= RF_tainted;
c8756f30 3134 /* FALL THROUGH */
c277df42 3135 case REF:
95b24440
DM
3136 case REFF: {
3137 char *s;
c277df42 3138 n = ARG(scan); /* which paren pair */
5d9a96ca 3139 st->ln = PL_regstartp[n];
2c2d71f5 3140 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3141 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3142 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3143 if (st->ln == PL_regendp[n])
a0d0e21e 3144 break;
a0ed51b3 3145
5d9a96ca 3146 s = PL_bostr + st->ln;
1aa99e6b 3147 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3148 char *l = locinput;
a3b680e6 3149 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3150 /*
3151 * Note that we can't do the "other character" lookup trick as
3152 * in the 8-bit case (no pun intended) because in Unicode we
3153 * have to map both upper and title case to lower case.
3154 */
3155 if (OP(scan) == REFF) {
3156 while (s < e) {
a3b680e6
AL
3157 STRLEN ulen1, ulen2;
3158 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3159 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3160
a0ed51b3
LW
3161 if (l >= PL_regeol)
3162 sayNO;
a2a2844f
JH
3163 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3164 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3165 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3166 sayNO;
a2a2844f
JH
3167 s += ulen1;
3168 l += ulen2;
a0ed51b3
LW
3169 }
3170 }
3171 locinput = l;
3172 nextchr = UCHARAT(locinput);
3173 break;
3174 }
3175
a0d0e21e 3176 /* Inline the first character, for speed. */
76e3520e 3177 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3178 (OP(scan) == REF ||
3179 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3180 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3181 sayNO;
5d9a96ca
DM
3182 st->ln = PL_regendp[n] - st->ln;
3183 if (locinput + st->ln > PL_regeol)
4633a7c4 3184 sayNO;
5d9a96ca
DM
3185 if (st->ln > 1 && (OP(scan) == REF
3186 ? memNE(s, locinput, st->ln)
c8756f30 3187 : (OP(scan) == REFF
5d9a96ca
DM
3188 ? ibcmp(s, locinput, st->ln)
3189 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3190 sayNO;
5d9a96ca 3191 locinput += st->ln;
76e3520e 3192 nextchr = UCHARAT(locinput);
a0d0e21e 3193 break;
95b24440 3194 }
a0d0e21e
LW
3195
3196 case NOTHING:
c277df42 3197 case TAIL:
a0d0e21e
LW
3198 break;
3199 case BACK:
3200 break;
40a82448
DM
3201
3202#undef ST
3203#define ST st->u.eval
3204
3205 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3206 {
c277df42 3207 SV *ret;
8e5e9ebe 3208 {
4aabdb9b
DM
3209 /* execute the code in the {...} */
3210 dSP;
6136c704 3211 SV ** const before = SP;
4aabdb9b
DM
3212 OP_4tree * const oop = PL_op;
3213 COP * const ocurcop = PL_curcop;
3214 PAD *old_comppad;
4aabdb9b
DM
3215
3216 n = ARG(scan);
32fc9b6a 3217 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3218 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3219 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3220 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3221
8e5e9ebe
RGS
3222 CALLRUNOPS(aTHX); /* Scalar context. */
3223 SPAGAIN;
3224 if (SP == before)
075aa684 3225 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3226 else {
3227 ret = POPs;
3228 PUTBACK;
3229 }
4aabdb9b
DM
3230
3231 PL_op = oop;
3232 PAD_RESTORE_LOCAL(old_comppad);
3233 PL_curcop = ocurcop;
3234 if (!st->logical) {
3235 /* /(?{...})/ */
3236 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3237 break;
3238 }
8e5e9ebe 3239 }
4aabdb9b
DM
3240 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3241 regexp *re;
4aabdb9b 3242 {
4f639d21
DM
3243 /* extract RE object from returned value; compiling if
3244 * necessary */
3245
6136c704 3246 MAGIC *mg = NULL;
be8e71aa 3247 const SV *sv;
faf82a0b
AE
3248 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3249 mg = mg_find(sv, PERL_MAGIC_qr);
3250 else if (SvSMAGICAL(ret)) {
3251 if (SvGMAGICAL(ret))
3252 sv_unmagic(ret, PERL_MAGIC_qr);
3253 else
3254 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3255 }
faf82a0b 3256
0f5d15d6
IZ
3257 if (mg) {
3258 re = (regexp *)mg->mg_obj;
df0003d4 3259 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3260 }
3261 else {
3262 STRLEN len;
6136c704 3263 const char * const t = SvPV_const(ret, len);
0f5d15d6 3264 PMOP pm;
a3b680e6 3265 const I32 osize = PL_regsize;
0f5d15d6 3266
5fcd1c1b 3267 Zero(&pm, 1, PMOP);
4aabdb9b 3268 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3269 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3270 if (!(SvFLAGS(ret)
faf82a0b
AE
3271 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3272 | SVs_GMG)))
14befaf4
DM
3273 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3274 PERL_MAGIC_qr,0,0);
0f5d15d6 3275 PL_regsize = osize;
0f5d15d6 3276 }
4aabdb9b 3277 }
aa283a38
DM
3278
3279 /* run the pattern returned from (??{...}) */
3280
4aabdb9b
DM
3281 DEBUG_EXECUTE_r(
3282 PerlIO_printf(Perl_debug_log,
3283 "Entering embedded \"%s%.60s%s%s\"\n",
3284 PL_colors[0],
3285 re->precomp,
3286 PL_colors[1],
3287 (strlen(re->precomp) > 60 ? "..." : ""))
3288 );
2c2d71f5 3289
40a82448
DM
3290 ST.cp = regcppush(0); /* Save *all* the positions. */
3291 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3292 *PL_reglastparen = 0;
3293 *PL_reglastcloseparen = 0;
4aabdb9b 3294 PL_reginput = locinput;
4aabdb9b
DM
3295
3296 /* XXXX This is too dramatic a measure... */
3297 PL_reg_maxiter = 0;
3298
5d9a96ca 3299 st->logical = 0;
40a82448 3300 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3301 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3302 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3303 ST.prev_rex = rex;
aa283a38 3304 rex = re;
aa283a38 3305
40a82448 3306 ST.B = next;
aa283a38 3307 /* now continue from first node in postoned RE */
40a82448 3308 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3309 /* NOTREACHED */
a0ed51b3 3310 }
4aabdb9b
DM
3311 /* /(?(?{...})X|Y)/ */
3312 st->sw = SvTRUE(ret);
3313 st->logical = 0;
c277df42
IZ
3314 break;
3315 }
40a82448
DM
3316
3317 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3318 if (ST.toggleutf)
3319 PL_reg_flags ^= RF_utf8;
3320 ReREFCNT_dec(rex);
3321 rex = ST.prev_rex;
3322 /* XXXX This is too dramatic a measure... */
3323 PL_reg_maxiter = 0;
3324 /* Restore parens of the caller without popping the
3325 * savestack */
3326 {
3327 const I32 tmp = PL_savestack_ix;
3328 PL_savestack_ix = ST.lastcp;
3329 regcppop(rex);
3330 PL_savestack_ix = tmp;
3331 }
3332 PL_reginput = locinput;
3333 /* continue at the node following the (??{...}) */
3334 scan = ST.B;
3335 continue;
3336
3337 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3338 /* Restore state to the outer re then re-throw the failure */
3339 if (ST.toggleutf)
3340 PL_reg_flags ^= RF_utf8;
3341 ReREFCNT_dec(rex);
3342 rex = ST.prev_rex;
3343
3344 /* XXXX This is too dramatic a measure... */
3345 PL_reg_maxiter = 0;
3346
3347 PL_reginput = locinput;
3348 REGCP_UNWIND(ST.lastcp);
3349 regcppop(rex);
3350 sayNO_SILENT;
3351
3352#undef ST
3353
a0d0e21e 3354 case OPEN:
c277df42 3355 n = ARG(scan); /* which paren pair */
3280af22
NIS
3356 PL_reg_start_tmp[n] = locinput;
3357 if (n > PL_regsize)
3358 PL_regsize = n;
a0d0e21e
LW
3359 break;
3360 case CLOSE:
c277df42 3361 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3362 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3363 PL_regendp[n] = locinput - PL_bostr;
eb160463 3364 if (n > (I32)*PL_reglastparen)
3280af22 3365 *PL_reglastparen = n;
a01268b5 3366 *PL_reglastcloseparen = n;
a0d0e21e 3367 break;
c277df42
IZ
3368 case GROUPP:
3369 n = ARG(scan); /* which paren pair */
5d9a96ca 3370 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3371 break;
3372 case IFTHEN:
2c2d71f5 3373 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3374 if (st->sw)
c277df42
IZ
3375 next = NEXTOPER(NEXTOPER(scan));
3376 else {
3377 next = scan + ARG(scan);
3378 if (OP(next) == IFTHEN) /* Fake one. */
3379 next = NEXTOPER(NEXTOPER(next));
3380 }
3381 break;
3382 case LOGICAL:
5d9a96ca 3383 st->logical = scan->flags;
c277df42 3384 break;
2ab05381 3385/*******************************************************************
a0374537
DM
3386 cc points to the regmatch_state associated with the most recent CURLYX.
3387 This struct contains info about the innermost (...)* loop (an
3388 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3389
3390 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3391
95b24440 3392 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3393
a0374537 3394 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3395 with the starting point at WHILEM node;
2ab05381
IZ
3396
3397 3) Each hit of WHILEM node tries to match A and Z (in the order
3398 depending on the current iteration, min/max of {min,max} and
3399 greediness). The information about where are nodes for "A"
a0374537 3400 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3401 was already matched, and greediness.
3402
3403 4) After A matches, the same WHILEM node is hit again.
3404
95b24440 3405 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3406 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3407 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3408 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3409 of the external loop.
3410
a0374537 3411 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3412 and whatever it mentions via ->next, and additional attached trees
3413 corresponding to temporarily unset infoblocks as in "5" above.
3414
95b24440 3415 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3416 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3417 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3418 infoblocks are drawn below the "reset" infoblock.
3419
3420 In fact in the picture below we do not show failed matches for Z and T
3421 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3422 more obvious *why* one needs to *temporary* unset infoblocks.]
3423
3424 Matched REx position InfoBlocks Comment
3425 (Y(A)*?Z)*?T x
3426 Y(A)*?Z)*?T x <- O
3427 Y (A)*?Z)*?T x <- O
3428 Y A)*?Z)*?T x <- O <- I
3429 YA )*?Z)*?T x <- O <- I
3430 YA A)*?Z)*?T x <- O <- I
3431 YAA )*?Z)*?T x <- O <- I
3432 YAA Z)*?T x <- O # Temporary unset I
3433 I
3434
3435 YAAZ Y(A)*?Z)*?T x <- O
3436 I
3437
3438 YAAZY (A)*?Z)*?T x <- O
3439 I
3440
3441 YAAZY A)*?Z)*?T x <- O <- I
3442 I
3443
3444 YAAZYA )*?Z)*?T x <- O <- I
3445 I
3446
3447 YAAZYA Z)*?T x <- O # Temporary unset I
3448 I,I
3449
3450 YAAZYAZ )*?T x <- O
3451 I,I
3452
3453 YAAZYAZ T x # Temporary unset O
3454 O
3455 I,I
3456
3457 YAAZYAZT x
3458 O
3459 I,I
3460 *******************************************************************/
95b24440 3461
a0d0e21e 3462 case CURLYX: {
cb434fcc
IZ
3463 /* No need to save/restore up to this paren */
3464 I32 parenfloor = scan->flags;
c277df42 3465
c2b7afd3
NC
3466 /* Dave says:
3467
3468 CURLYX and WHILEM are always paired: they're the moral
3469 equivalent of pp_enteriter anbd pp_iter.
3470
3471 The only time next could be null is if the node tree is
3472 corrupt. This was mentioned on p5p a few days ago.
3473
3474 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3475 So we'll assert that this is true:
3476 */
3477 assert(next);
30b2893d 3478 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3479 next += ARG(next);
cb434fcc
IZ
3480 /* XXXX Probably it is better to teach regpush to support
3481 parenfloor > PL_regsize... */
eb160463 3482 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3483 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3484
d8319b27
DM
3485 st->u.curlyx.cp = PL_savestack_ix;
3486 st->u.curlyx.outercc = st->cc;
a0374537
DM
3487 st->cc = st;
3488 /* these fields contain the state of the current curly.
3489 * they are accessed by subsequent WHILEMs;
3490 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3491 st->u.curlyx.parenfloor = parenfloor;
3492 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3493 st->u.curlyx.min = ARG1(scan);
3494 st->u.curlyx.max = ARG2(scan);
3495 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3496 st->u.curlyx.lastloc = 0;
a0374537
DM
3497 /* st->next and st->minmod are also read by WHILEM */
3498
3280af22 3499 PL_reginput = locinput;
95b24440
DM
3500 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3501 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3502 regcpblow(st->u.curlyx.cp);
3503 st->cc = st->u.curlyx.outercc;
95b24440 3504 saySAME(result);
a0d0e21e 3505 }
5f66b61c 3506 /* NOTREACHED */
a0d0e21e
LW
3507 case WHILEM: {
3508 /*
3509 * This is really hard to understand, because after we match
3510 * what we're trying to match, we must make sure the rest of
2c2d71f5 3511 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3512 * to go back UP the parse tree by recursing ever deeper. And
3513 * if it fails, we have to reset our parent's current state
3514 * that we can try again after backing off.
3515 */
3516
c2b7afd3
NC
3517 /* Dave says:
3518
3519 st->cc gets initialised by CURLYX ready for use by WHILEM.
3520 So again, unless somethings been corrupted, st->cc cannot
3521 be null at that point in WHILEM.
3522
3523 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3524 So we'll assert that this is true:
3525 */
3526 assert(st->cc);
d8319b27
DM
3527 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3528 st->u.whilem.cache_offset = 0;
3529 st->u.whilem.cache_bit = 0;
c277df42 3530
d8319b27 3531 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3532 PL_reginput = locinput;
a0d0e21e 3533
a3621e74 3534 DEBUG_EXECUTE_r(
9041c2e3 3535 PerlIO_printf(Perl_debug_log,
91f3b821 3536 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3537 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3538 (long)n, (long)st->cc->u.curlyx.min,
3539 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3540 );
4633a7c4 3541
a0d0e21e
LW
3542 /* If degenerate scan matches "", assume scan done. */
3543
d8319b27
DM
3544 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3545 st->u.whilem.savecc = st->cc;
3546 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3547 if (st->cc)
d8319b27 3548 st->ln = st->cc->u.curlyx.cur;
a3621e74 3549 DEBUG_EXECUTE_r(
c3464db5
DD
3550 PerlIO_printf(Perl_debug_log,
3551 "%*s empty match detected, try continuation...\n",
3280af22 3552 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3553 );
d8319b27 3554 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3555 /*** all unsaved local vars undefined at this point */
d8319b27 3556 st->cc = st->u.whilem.savecc;
95b24440 3557 if (result)
4633a7c4 3558 sayYES;
d8319b27
DM
3559 if (st->cc->u.curlyx.outercc)
3560 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3561 sayNO;
a0d0e21e
LW
3562 }
3563
3564 /* First just match a string of min scans. */
3565
d8319b27
DM
3566 if (n < st->cc->u.curlyx.min) {
3567 st->cc->u.curlyx.cur = n;
3568 st->cc->u.curlyx.lastloc = locinput;
3569 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3570 /*** all unsaved local vars undefined at this point */
3571 if (result)
4633a7c4 3572 sayYES;
d8319b27
DM
3573 st->cc->u.curlyx.cur = n - 1;
3574 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3575 sayNO;
a0d0e21e
LW
3576 }
3577
2c2d71f5
JH
3578 if (scan->flags) {
3579 /* Check whether we already were at this position.
3580 Postpone detection until we know the match is not
3581 *that* much linear. */
3582 if (!PL_reg_maxiter) {
3583 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3584 /* possible overflow for long strings and many CURLYX's */
3585 if (PL_reg_maxiter < 0)
3586 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3587 PL_reg_leftiter = PL_reg_maxiter;
3588 }
3589 if (PL_reg_leftiter-- == 0) {
a3b680e6 3590 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3591 if (PL_reg_poscache) {
eb160463 3592 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3593 Renew(PL_reg_poscache, size, char);
3594 PL_reg_poscache_size = size;
3595 }
3596 Zero(PL_reg_poscache, size, char);
3597 }
3598 else {
3599 PL_reg_poscache_size = size;
a02a5408 3600 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3601 }
a3621e74 3602 DEBUG_EXECUTE_r(
2c2d71f5
JH
3603 PerlIO_printf(Perl_debug_log,
3604 "%sDetected a super-linear match, switching on caching%s...\n",
3605 PL_colors[4], PL_colors[5])
3606 );
3607 }
3608 if (PL_reg_leftiter < 0) {
d8319b27 3609 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3610
d8319b27
DM
3611 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3612 + st->u.whilem.cache_offset * (scan->flags>>4);
3613 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3614 st->u.whilem.cache_offset /= 8;
3615 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3616 DEBUG_EXECUTE_r(
2c2d71f5
JH
3617 PerlIO_printf(Perl_debug_log,
3618 "%*s already tried at this position...\n",
3619 REPORT_CODE_OFF+PL_regindent*2, "")
3620 );
3ab3c9b4
HS
3621 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3622 /* cache records success */
c2b0868c
HS
3623 sayYES;
3624 else
3ab3c9b4 3625 /* cache records failure */
c2b0868c 3626 sayNO_SILENT;
2c2d71f5 3627 }
2c2d71f5
JH
3628 }
3629 }
3630
a0d0e21e
LW
3631 /* Prefer next over scan for minimal matching. */
3632
5d9a96ca 3633 if (st->cc->minmod) {
d8319b27
DM
3634 st->u.whilem.savecc = st->cc;
3635 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3636 if (st->cc)
d8319b27
DM
3637 st->ln = st->cc->u.curlyx.cur;
3638 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3639 REGCP_SET(st->u.whilem.lastcp);
3640 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3641 /*** all unsaved local vars undefined at this point */
d8319b27 3642 st->cc = st->u.whilem.savecc;
95b24440 3643 if (result) {
d8319b27 3644 regcpblow(st->u.whilem.cp);
3ab3c9b4 3645 CACHEsayYES; /* All done. */
5f05dabc 3646 }
d8319b27 3647 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3648 regcppop(rex);
d8319b27
DM
3649 if (st->cc->u.curlyx.outercc)
3650 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3651
d8319b27 3652 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3653 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3654 && !(PL_reg_flags & RF_warned)) {
3655 PL_reg_flags |= RF_warned;
9014280d 3656 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3657 "Complex regular subexpression recursion",
3658 REG_INFTY - 1);
c277df42 3659 }
3ab3c9b4 3660 CACHEsayNO;
c277df42 3661 }
a687059c 3662
a3621e74 3663 DEBUG_EXECUTE_r(
c3464db5
DD
3664 PerlIO_printf(Perl_debug_log,
3665 "%*s trying longer...\n",
3280af22 3666 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3667 );
a0d0e21e 3668 /* Try scanning more and see if it helps. */
3280af22 3669 PL_reginput = locinput;
d8319b27
DM
3670 st->cc->u.curlyx.cur = n;
3671 st->cc->u.curlyx.lastloc = locinput;
3672 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3673 REGCP_SET(st->u.whilem.lastcp);
3674 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3675 /*** all unsaved local vars undefined at this point */
3676 if (result) {
d8319b27 3677 regcpblow(st->u.whilem.cp);
3ab3c9b4 3678 CACHEsayYES;
5f05dabc 3679 }
d8319b27 3680 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3681 regcppop(rex);
d8319b27
DM
3682 st->cc->u.curlyx.cur = n - 1;
3683 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3684 CACHEsayNO;
a0d0e21e
LW
3685 }
3686
3687 /* Prefer scan over next for maximal matching. */
3688
d8319b27
DM
3689 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3690 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3691 st->cc->u.curlyx.cur = n;
3692 st->cc->u.curlyx.lastloc = locinput;
3693 REGCP_SET(st->u.whilem.lastcp);
3694 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3695 /*** all unsaved local vars undefined at this point */
3696 if (result) {
d8319b27 3697 regcpblow(st->u.whilem.cp);
3ab3c9b4 3698 CACHEsayYES;
5f05dabc 3699 }
d8319b27 3700 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3701 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3702 PL_reginput = locinput;
a3621e74 3703 DEBUG_EXECUTE_r(
c3464db5
DD
3704 PerlIO_printf(Perl_debug_log,
3705 "%*s failed, try continuation...\n",
3280af22 3706 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3707 );
3708 }
9041c2e3 3709 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3710 && !(PL_reg_flags & RF_warned)) {
3280af22 3711 PL_reg_flags |= RF_warned;
9014280d 3712 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3713 "Complex regular subexpression recursion",
3714 REG_INFTY - 1);
a0d0e21e
LW
3715 }
3716
3717 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3718 st->u.whilem.savecc = st->cc;
3719 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3720 if (st->cc)
d8319b27
DM
3721 st->ln = st->cc->u.curlyx.cur;
3722 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3723 /*** all unsaved local vars undefined at this point */
d8319b27 3724 st->cc = st->u.whilem.savecc;
95b24440 3725 if (result)
3ab3c9b4 3726 CACHEsayYES;
d8319b27
DM
3727 if (st->cc->u.curlyx.outercc)
3728 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
3729 st->cc->u.curlyx.cur = n - 1;
3730 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3731 CACHEsayNO;
a0d0e21e 3732 }
5f66b61c 3733 /* NOTREACHED */
40a82448
DM
3734
3735#undef ST
3736#define ST st->u.branch
3737
3738 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
3739 next = scan + ARG(scan);
3740 if (next == scan)
3741 next = NULL;
40a82448
DM
3742 scan = NEXTOPER(scan);
3743 /* FALL THROUGH */
c277df42 3744
40a82448
DM
3745 case BRANCH: /* /(...|A|...)/ */
3746 scan = NEXTOPER(scan); /* scan now points to inner node */
3747 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
3748 /* last branch; skip state push and jump direct to node */
3749 continue;
3750 ST.lastparen = *PL_reglastparen;
3751 ST.next_branch = next;
3752 REGCP_SET(ST.cp);
3753 PL_reginput = locinput;
02db2b7b 3754
40a82448
DM
3755 /* Now go into the branch */
3756 PUSH_STATE_GOTO(BRANCH_next, scan);
3757 /* NOTREACHED */
3758
3759 case BRANCH_next_fail: /* that branch failed; try the next, if any */
3760 REGCP_UNWIND(ST.cp);
3761 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3762 PL_regendp[n] = -1;
3763 *PL_reglastparen = n;
3764 scan = ST.next_branch;
3765 /* no more branches? */
3766 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
3767 sayNO;
3768 continue; /* execute next BRANCH[J] op */
3769 /* NOTREACHED */
3770
a0d0e21e 3771 case MINMOD:
5d9a96ca 3772 st->minmod = 1;
a0d0e21e 3773 break;
40a82448
DM
3774
3775#undef ST
3776#define ST st->u.curlym
3777
3778 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
3779
3780 /* This is an optimisation of CURLYX that enables us to push
3781 * only a single backtracking state, no matter now many matches
3782 * there are in {m,n}. It relies on the pattern being constant
3783 * length, with no parens to influence future backrefs
3784 */
3785
3786 ST.me = scan;
dc45a647 3787 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
3788
3789 /* if paren positive, emulate an OPEN/CLOSE around A */
3790 if (ST.me->flags) {
3791 I32 paren = ST.me->flags;
3792 if (paren > PL_regsize)
3793 PL_regsize = paren;
3794 if (paren > (I32)*PL_reglastparen)
3795 *PL_reglastparen = paren;
c277df42 3796 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 3797 }
40a82448
DM
3798 ST.A = scan;
3799 ST.B = next;
3800 ST.alen = 0;
3801 ST.count = 0;
3802 ST.minmod = st->minmod;
3803 st->minmod = 0;
3804 ST.c1 = CHRTEST_UNINIT;
3805 REGCP_SET(ST.cp);
6407bf3b 3806
40a82448
DM
3807 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
3808 goto curlym_do_B;
3809
3810 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 3811 PL_reginput = locinput;
40a82448
DM
3812 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
3813 /* NOTREACHED */
5f80c4cf 3814
40a82448
DM
3815 case CURLYM_A: /* we've just matched an A */
3816 locinput = st->locinput;
3817 nextchr = UCHARAT(locinput);
3818
3819 ST.count++;
3820 /* after first match, determine A's length: u.curlym.alen */
3821 if (ST.count == 1) {
3822 if (PL_reg_match_utf8) {
3823 char *s = locinput;
3824 while (s < PL_reginput) {
3825 ST.alen++;
3826 s += UTF8SKIP(s);
3827 }
3828 }
3829 else {
3830 ST.alen = PL_reginput - locinput;
3831 }
3832 if (ST.alen == 0)
3833 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
3834 }
0cadcf80
DM
3835 DEBUG_EXECUTE_r(
3836 PerlIO_printf(Perl_debug_log,
40a82448 3837 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 3838 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 3839 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
3840 );
3841
40a82448
DM
3842 locinput = PL_reginput;
3843 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
3844 goto curlym_do_A; /* try to match another A */
3845 goto curlym_do_B; /* try to match B */
5f80c4cf 3846
40a82448
DM
3847 case CURLYM_A_fail: /* just failed to match an A */
3848 REGCP_UNWIND(ST.cp);
3849 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
3850 sayNO;
0cadcf80 3851
40a82448
DM
3852 curlym_do_B: /* execute the B in /A{m,n}B/ */
3853 PL_reginput = locinput;
3854 if (ST.c1 == CHRTEST_UNINIT) {
3855 /* calculate c1 and c2 for possible match of 1st char
3856 * following curly */
3857 ST.c1 = ST.c2 = CHRTEST_VOID;
3858 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
3859 regnode *text_node = ST.B;
3860 if (! HAS_TEXT(text_node))
3861 FIND_NEXT_IMPT(text_node);
3862 if (HAS_TEXT(text_node)
3863 && PL_regkind[OP(text_node)] != REF)
3864 {
3865 ST.c1 = (U8)*STRING(text_node);
3866 ST.c2 =
3867 (OP(text_node) == EXACTF || OP(text_node) == REFF)
3868 ? PL_fold[ST.c1]
3869 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3870 ? PL_fold_locale[ST.c1]
3871 : ST.c1;
c277df42 3872 }
c277df42 3873 }
40a82448
DM
3874 }
3875
3876 DEBUG_EXECUTE_r(
3877 PerlIO_printf(Perl_debug_log,
3878 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
3879 (int)(REPORT_CODE_OFF+PL_regindent*2),
3880 "", (IV)ST.count)
3881 );
3882 if (ST.c1 != CHRTEST_VOID
3883 && UCHARAT(PL_reginput) != ST.c1
3884 && UCHARAT(PL_reginput) != ST.c2)
3885 {
3886 /* simulate B failing */
3887 state_num = CURLYM_B_fail;
3888 goto reenter_switch;
3889 }
3890
3891 if (ST.me->flags) {
3892 /* mark current A as captured */
3893 I32 paren = ST.me->flags;
3894 if (ST.count) {
3895 PL_regstartp[paren]
3896 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
3897 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3898 }
40a82448
DM
3899 else
3900 PL_regendp[paren] = -1;
c277df42 3901 }
40a82448 3902 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 3903 /* NOTREACHED */
40a82448
DM
3904
3905 case CURLYM_B_fail: /* just failed to match a B */
3906 REGCP_UNWIND(ST.cp);
3907 if (ST.minmod) {
3908 if (ST.count == ARG2(ST.me) /* max */)
3909 sayNO;
3910 goto curlym_do_A; /* try to match a further A */
3911 }
3912 /* backtrack one A */
3913 if (ST.count == ARG1(ST.me) /* min */)
3914 sayNO;
3915 ST.count--;
3916 locinput = HOPc(locinput, -ST.alen);
3917 goto curlym_do_B; /* try to match B */
3918
c255a977
DM
3919#undef ST
3920#define ST st->u.curly
40a82448 3921
c255a977
DM
3922#define CURLY_SETPAREN(paren, success) \
3923 if (paren) { \
3924 if (success) { \
3925 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
3926 PL_regendp[paren] = locinput - PL_bostr; \
3927 } \
3928 else \
3929 PL_regendp[paren] = -1; \
3930 }
3931
3932 case STAR: /* /A*B/ where A is width 1 */
3933 ST.paren = 0;
3934 ST.min = 0;
3935 ST.max = REG_INFTY;
a0d0e21e
LW
3936 scan = NEXTOPER(scan);
3937 goto repeat;
c255a977
DM
3938 case PLUS: /* /A+B/ where A is width 1 */
3939 ST.paren = 0;
3940 ST.min = 1;
3941 ST.max = REG_INFTY;
c277df42 3942 scan = NEXTOPER(scan);
c255a977
DM
3943 goto repeat;
3944 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
3945 ST.paren = scan->flags; /* Which paren to set */
3946 if (ST.paren > PL_regsize)
3947 PL_regsize = ST.paren;
3948 if (ST.paren > (I32)*PL_reglastparen)
3949 *PL_reglastparen = ST.paren;
3950 ST.min = ARG1(scan); /* min to match */
3951 ST.max = ARG2(scan); /* max to match */
3952 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3953 goto repeat;
3954 case CURLY: /* /A{m,n}B/ where A is width 1 */
3955 ST.paren = 0;
3956 ST.min = ARG1(scan); /* min to match */
3957 ST.max = ARG2(scan); /* max to match */
3958 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 3959 repeat:
a0d0e21e
LW
3960 /*
3961 * Lookahead to avoid useless match attempts
3962 * when we know what character comes next.
c255a977 3963 *
5f80c4cf
JP
3964 * Used to only do .*x and .*?x, but now it allows
3965 * for )'s, ('s and (?{ ... })'s to be in the way
3966 * of the quantifier and the EXACT-like node. -- japhy
3967 */
3968
c255a977
DM
3969 if (ST.min > ST.max) /* XXX make this a compile-time check? */
3970 sayNO;
cca55fe3 3971 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3972 U8 *s;
3973 regnode *text_node = next;
3974
3dab1dad
YO
3975 if (! HAS_TEXT(text_node))
3976 FIND_NEXT_IMPT(text_node);
5f80c4cf 3977
9e137952 3978 if (! HAS_TEXT(text_node))
c255a977 3979 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 3980 else {
3dab1dad 3981 if (PL_regkind[OP(text_node)] == REF) {
c255a977 3982 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 3983 goto assume_ok_easy;
cca55fe3 3984 }
be8e71aa
YO
3985 else
3986 s = (U8*)STRING(text_node);
5f80c4cf
JP
3987
3988 if (!UTF) {
c255a977 3989 ST.c2 = ST.c1 = *s;
f65d3ee7 3990 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 3991 ST.c2 = PL_fold[ST.c1];
f65d3ee7 3992 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 3993 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 3994 }
5f80c4cf 3995 else { /* UTF */
f65d3ee7 3996 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3997 STRLEN ulen1, ulen2;
89ebb4a3
JH
3998 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3999 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4000
4001 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4002 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4003
c255a977 4004 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4005 uniflags);
c255a977 4006 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4007 uniflags);
5f80c4cf
JP
4008 }
4009 else {
c255a977 4010 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4011 uniflags);
5f80c4cf 4012 }
1aa99e6b
IH
4013 }
4014 }
bbce6d69 4015 }
a0d0e21e 4016 else
c255a977 4017 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4018 assume_ok_easy:
c255a977
DM
4019
4020 ST.A = scan;
4021 ST.B = next;
3280af22 4022 PL_reginput = locinput;
5d9a96ca
DM
4023 if (st->minmod) {
4024 st->minmod = 0;
c255a977 4025 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4026 sayNO;
c255a977 4027 ST.count = ST.min;
a0ed51b3 4028 locinput = PL_reginput;
c255a977
DM
4029 REGCP_SET(ST.cp);
4030 if (ST.c1 == CHRTEST_VOID)
4031 goto curly_try_B_min;
4032
4033 ST.oldloc = locinput;
4034
4035 /* set ST.maxpos to the furthest point along the
4036 * string that could possibly match */
4037 if (ST.max == REG_INFTY) {
4038 ST.maxpos = PL_regeol - 1;
4039 if (do_utf8)
4040 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4041 ST.maxpos--;
4042 }
4043 else if (do_utf8) {
4044 int m = ST.max - ST.min;
4045 for (ST.maxpos = locinput;
4046 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4047 ST.maxpos += UTF8SKIP(ST.maxpos);
4048 }
4049 else {
4050 ST.maxpos = locinput + ST.max - ST.min;
4051 if (ST.maxpos >= PL_regeol)
4052 ST.maxpos = PL_regeol - 1;
4053 }
4054 goto curly_try_B_min_known;
4055
4056 }
4057 else {
4058 ST.count = regrepeat(rex, ST.A, ST.max);
4059 locinput = PL_reginput;
4060 if (ST.count < ST.min)
4061 sayNO;
4062 if ((ST.count > ST.min)
4063 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4064 {
4065 /* A{m,n} must come at the end of the string, there's
4066 * no point in backing off ... */
4067 ST.min = ST.count;
4068 /* ...except that $ and \Z can match before *and* after
4069 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4070 We may back off by one in this case. */
4071 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4072 ST.min--;
4073 }
4074 REGCP_SET(ST.cp);
4075 goto curly_try_B_max;
4076 }
4077 /* NOTREACHED */
4078
4079
4080 case CURLY_B_min_known_fail:
4081 /* failed to find B in a non-greedy match where c1,c2 valid */
4082 if (ST.paren && ST.count)
4083 PL_regendp[ST.paren] = -1;
4084
4085 PL_reginput = locinput; /* Could be reset... */
4086 REGCP_UNWIND(ST.cp);
4087 /* Couldn't or didn't -- move forward. */
4088 ST.oldloc = locinput;
4089 if (do_utf8)
4090 locinput += UTF8SKIP(locinput);
4091 else
4092 locinput++;
4093 ST.count++;
4094 curly_try_B_min_known:
4095 /* find the next place where 'B' could work, then call B */
4096 {
4097 int n;
4098 if (do_utf8) {
4099 n = (ST.oldloc == locinput) ? 0 : 1;
4100 if (ST.c1 == ST.c2) {
4101 STRLEN len;
4102 /* set n to utf8_distance(oldloc, locinput) */
4103 while (locinput <= ST.maxpos &&
4104 utf8n_to_uvchr((U8*)locinput,
4105 UTF8_MAXBYTES, &len,
4106 uniflags) != (UV)ST.c1) {
4107 locinput += len;
4108 n++;
4109 }
1aa99e6b
IH
4110 }
4111 else {
c255a977
DM
4112 /* set n to utf8_distance(oldloc, locinput) */
4113 while (locinput <= ST.maxpos) {
4114 STRLEN len;
4115 const UV c = utf8n_to_uvchr((U8*)locinput,
4116 UTF8_MAXBYTES, &len,
4117 uniflags);
4118 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4119 break;
4120 locinput += len;
4121 n++;
1aa99e6b 4122 }
0fe9bf95
IZ
4123 }
4124 }
c255a977
DM
4125 else {
4126 if (ST.c1 == ST.c2) {
4127 while (locinput <= ST.maxpos &&
4128 UCHARAT(locinput) != ST.c1)
4129 locinput++;
bbce6d69 4130 }
c255a977
DM
4131 else {
4132 while (locinput <= ST.maxpos
4133 && UCHARAT(locinput) != ST.c1
4134 && UCHARAT(locinput) != ST.c2)
4135 locinput++;
a0ed51b3 4136 }
c255a977
DM
4137 n = locinput - ST.oldloc;
4138 }
4139 if (locinput > ST.maxpos)
4140 sayNO;
4141 /* PL_reginput == oldloc now */
4142 if (n) {
4143 ST.count += n;
4144 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4145 sayNO;
a0d0e21e 4146 }
c255a977
DM
4147 PL_reginput = locinput;
4148 CURLY_SETPAREN(ST.paren, ST.count);
4149 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4150 }
c255a977
DM
4151 /* NOTREACHED */
4152
4153
4154 case CURLY_B_min_fail:
4155 /* failed to find B in a non-greedy match where c1,c2 invalid */
4156 if (ST.paren && ST.count)
4157 PL_regendp[ST.paren] = -1;
4158
4159 REGCP_UNWIND(ST.cp);
4160 /* failed -- move forward one */
4161 PL_reginput = locinput;
4162 if (regrepeat(rex, ST.A, 1)) {
4163 ST.count++;
a0ed51b3 4164 locinput = PL_reginput;
c255a977
DM
4165 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4166 ST.count > 0)) /* count overflow ? */
15272685 4167 {
c255a977
DM
4168 curly_try_B_min:
4169 CURLY_SETPAREN(ST.paren, ST.count);
4170 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4171 }
4172 }
4633a7c4 4173 sayNO;
c255a977
DM
4174 /* NOTREACHED */
4175
4176
4177 curly_try_B_max:
4178 /* a successful greedy match: now try to match B */
4179 {
4180 UV c = 0;
4181 if (ST.c1 != CHRTEST_VOID)
4182 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4183 UTF8_MAXBYTES, 0, uniflags)
466787eb 4184 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4185 /* If it could work, try it. */
4186 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4187 CURLY_SETPAREN(ST.paren, ST.count);
4188 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4189 /* NOTREACHED */
4190 }
4191 }
4192 /* FALL THROUGH */
4193 case CURLY_B_max_fail:
4194 /* failed to find B in a greedy match */
4195 if (ST.paren && ST.count)
4196 PL_regendp[ST.paren] = -1;
4197
4198 REGCP_UNWIND(ST.cp);
4199 /* back up. */
4200 if (--ST.count < ST.min)
4201 sayNO;
4202 PL_reginput = locinput = HOPc(locinput, -1);
4203 goto curly_try_B_max;
4204
4205#undef ST
4206
4207
a0d0e21e 4208 case END:
3b0527fe 4209 if (locinput < reginfo->till) {
a3621e74 4210 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4211 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4212 PL_colors[4],
4213 (long)(locinput - PL_reg_starttry),
3b0527fe 4214 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4215 PL_colors[5]));
4216 sayNO_FINAL; /* Cannot match: too short. */
4217 }
4218 PL_reginput = locinput; /* put where regtry can find it */
4219 sayYES_FINAL; /* Success! */
dad79028
DM
4220
4221 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4222 DEBUG_EXECUTE_r(
4223 PerlIO_printf(Perl_debug_log,
4224 "%*s %ssubpattern success...%s\n",
4225 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4226 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4227 sayYES_FINAL; /* Success! */
4228
40a82448
DM
4229#undef ST
4230#define ST st->u.ifmatch
4231
4232 case SUSPEND: /* (?>A) */
4233 ST.wanted = 1;
9fe1d20c 4234 PL_reginput = locinput;
9041c2e3 4235 goto do_ifmatch;
dad79028 4236
40a82448
DM
4237 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4238 ST.wanted = 0;
dad79028
DM
4239 goto ifmatch_trivial_fail_test;
4240
40a82448
DM
4241 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4242 ST.wanted = 1;
dad79028 4243 ifmatch_trivial_fail_test:
a0ed51b3 4244 if (scan->flags) {
52657f30 4245 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4246 if (!s) {
4247 /* trivial fail */
4248 if (st->logical) {
4249 st->logical = 0;
40a82448 4250 st->sw = 1 - (bool)ST.wanted;
dad79028 4251 }
40a82448 4252 else if (ST.wanted)
dad79028
DM
4253 sayNO;
4254 next = scan + ARG(scan);
4255 if (next == scan)
4256 next = NULL;
4257 break;
4258 }
efb30f32 4259 PL_reginput = s;
a0ed51b3
LW
4260 }
4261 else
4262 PL_reginput = locinput;
4263
c277df42 4264 do_ifmatch:
40a82448
DM
4265 ST.me = scan;
4266 /* execute body of (?...A) */
4267 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4268 /* NOTREACHED */
4269
4270 case IFMATCH_A_fail: /* body of (?...A) failed */
4271 ST.wanted = !ST.wanted;
4272 /* FALL THROUGH */
4273
4274 case IFMATCH_A: /* body of (?...A) succeeded */
4275 if (st->logical) {
4276 st->logical = 0;
4277 st->sw = (bool)ST.wanted;
4278 }
4279 else if (!ST.wanted)
4280 sayNO;
4281
4282 if (OP(ST.me) == SUSPEND)
4283 locinput = PL_reginput;
4284 else {
4285 locinput = PL_reginput = st->locinput;
4286 nextchr = UCHARAT(locinput);
4287 }
4288 scan = ST.me + ARG(ST.me);
4289 if (scan == ST.me)
4290 scan = NULL;
4291 continue; /* execute B */
4292
4293#undef ST
dad79028 4294
c277df42 4295 case LONGJMP:
c277df42
IZ
4296 next = scan + ARG(scan);
4297 if (next == scan)
4298 next = NULL;
a0d0e21e
LW
4299 break;
4300 default:
b900a521 4301 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4302 PTR2UV(scan), OP(scan));
cea2e8a9 4303 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4304 }
95b24440 4305
a0d0e21e 4306 scan = next;
95b24440
DM
4307 continue;
4308 /* NOTREACHED */
4309
40a82448
DM
4310 push_yes_state:
4311 /* push a state that backtracks on success */
4312 st->u.yes.prev_yes_state = yes_state;
4313 yes_state = st;
4314 /* FALL THROUGH */
4315 push_state:
4316 /* push a new regex state, then continue at scan */
4317 {
4318 regmatch_state *newst;
4319
4320 depth++;
4321 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4322 "PUSH STATE(%d)\n", depth));
4323 st->locinput = locinput;
4324 newst = st+1;
4325 if (newst > SLAB_LAST(PL_regmatch_slab))
4326 newst = S_push_slab(aTHX);
4327 PL_regmatch_state = newst;
4328 newst->cc = st->cc;
4329 /* XXX probably don't need to initialise these */
4330 newst->minmod = 0;
4331 newst->sw = 0;
4332 newst->logical = 0;
4333
4334 locinput = PL_reginput;
4335 nextchr = UCHARAT(locinput);
4336 st = newst;
4337 continue;
4338 /* NOTREACHED */
4339 }
4340
95b24440
DM
4341 /* simulate recursively calling regmatch(), but without actually
4342 * recursing - ie save the current state on the heap rather than on
4343 * the stack, then re-enter the loop. This avoids complex regexes
4344 * blowing the processor stack */
4345
4346 start_recurse:
4347 {
5d9a96ca
DM
4348 /* push new state */
4349 regmatch_state *oldst = st;
4350
4351 depth++;
40a82448 4352 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4353
4354 /* grab the next free state slot */
4355 st++;
86545054 4356 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4357 st = S_push_slab(aTHX);
4358 PL_regmatch_state = st;
4359
4360 oldst->next = next;
4361 oldst->n = n;
4362 oldst->locinput = locinput;
5d9a96ca
DM
4363
4364 st->cc = oldst->cc;
95b24440
DM
4365 locinput = PL_reginput;
4366 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4367 st->minmod = 0;
4368 st->sw = 0;
4369 st->logical = 0;
95b24440
DM
4370#ifdef DEBUGGING
4371 PL_regindent++;
4372#endif
4373 }
a0d0e21e 4374 }
a687059c 4375
aa283a38
DM
4376
4377
a0d0e21e
LW
4378 /*
4379 * We get here only if there's trouble -- normally "case END" is
4380 * the terminating point.
4381 */
cea2e8a9 4382 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4383 /*NOTREACHED*/
4633a7c4
LW
4384 sayNO;
4385
7821416a 4386yes_final:
77cb431f
DM
4387
4388 if (yes_state) {
4389 /* we have successfully completed a subexpression, but we must now
4390 * pop to the state marked by yes_state and continue from there */
4391
4392 assert(st != yes_state);
4393 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4394 || yes_state > SLAB_LAST(PL_regmatch_slab))
4395 {
4396 /* not in this slab, pop slab */
4397 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4398 PL_regmatch_slab = PL_regmatch_slab->prev;
4399 st = SLAB_LAST(PL_regmatch_slab);
4400 }
4401 depth -= (st - yes_state);
40a82448
DM
4402 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4403 depth+1, depth+(st - yes_state)));
77cb431f
DM
4404 st = yes_state;
4405 yes_state = st->u.yes.prev_yes_state;
4406 PL_regmatch_state = st;
4407
4408 switch (st->resume_state) {
40a82448
DM
4409 case IFMATCH_A:
4410 case CURLYM_A:
40a82448
DM
4411 case EVAL_A:
4412 state_num = st->resume_state;
4413 goto reenter_switch;
4414
4415 case CURLYM_B:
166ba7cd
DM
4416 case BRANCH_next:
4417 case TRIE_next:
c255a977 4418 case CURLY_B_max:
77cb431f 4419 default:
40a82448 4420 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4421 }
4422 }
4423
a3621e74 4424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4425 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4426yes:
4427#ifdef DEBUGGING
3280af22 4428 PL_regindent--;
4633a7c4 4429#endif
02db2b7b 4430
95b24440 4431 result = 1;
aa283a38 4432 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4433 * will disappear when REGFMATCH goes */
aa283a38
DM
4434 if (depth) {
4435 /* restore previous state and re-enter */
40a82448
DM
4436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4437 depth--;
4438 st--;
4439 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4440 PL_regmatch_slab = PL_regmatch_slab->prev;
4441 st = SLAB_LAST(PL_regmatch_slab);
4442 }
4443 PL_regmatch_state = st;
4444 scan = st->scan;
4445 next = st->next;
4446 n = st->n;
4447 locinput= st->locinput;
4448 nextchr = UCHARAT(locinput);
aa283a38
DM
4449
4450 switch (st->resume_state) {
aa283a38
DM
4451 case resume_CURLYX:
4452 goto resume_point_CURLYX;
4453 case resume_WHILEM1:
4454 goto resume_point_WHILEM1;
4455 case resume_WHILEM2:
4456 goto resume_point_WHILEM2;
4457 case resume_WHILEM3:
4458 goto resume_point_WHILEM3;
4459 case resume_WHILEM4:
4460 goto resume_point_WHILEM4;
4461 case resume_WHILEM5:
4462 goto resume_point_WHILEM5;
4463 case resume_WHILEM6:
4464 goto resume_point_WHILEM6;
77cb431f 4465
166ba7cd 4466 case TRIE_next:
40a82448
DM
4467 case CURLYM_A:
4468 case CURLYM_B:
4469 case EVAL_A:
4470 case IFMATCH_A:
4471 case BRANCH_next:
c255a977
DM
4472 case CURLY_B_max:
4473 case CURLY_B_min:
4474 case CURLY_B_min_known:
40a82448
DM
4475 break;
4476
aa283a38
DM
4477 default:
4478 Perl_croak(aTHX_ "regexp resume memory corruption");
4479 }
4480 }
4481 goto final_exit;
4633a7c4
LW
4482
4483no:
a3621e74 4484 DEBUG_EXECUTE_r(
7821416a
IZ
4485 PerlIO_printf(Perl_debug_log,
4486 "%*s %sfailed...%s\n",
e4584336 4487 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4488 );
7821416a
IZ
4489no_final:
4490do_no:
aa283a38 4491
4633a7c4 4492#ifdef DEBUGGING
3280af22 4493 PL_regindent--;
4633a7c4 4494#endif
95b24440 4495 result = 0;
5d9a96ca 4496
aa283a38
DM
4497 if (depth) {
4498 /* there's a previous state to backtrack to */
40a82448
DM
4499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4500 depth--;
4501 st--;
4502 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4503 PL_regmatch_slab = PL_regmatch_slab->prev;
4504 st = SLAB_LAST(PL_regmatch_slab);
4505 }
4506 PL_regmatch_state = st;
4507 scan = st->scan;
4508 next = st->next;
4509 n = st->n;
4510 locinput= st->locinput;
4511 nextchr = UCHARAT(locinput);
4512
5d9a96ca 4513 switch (st->resume_state) {
95b24440
DM
4514 case resume_CURLYX:
4515 goto resume_point_CURLYX;
4516 case resume_WHILEM1:
4517 goto resume_point_WHILEM1;
4518 case resume_WHILEM2:
4519 goto resume_point_WHILEM2;
4520 case resume_WHILEM3:
4521 goto resume_point_WHILEM3;
4522 case resume_WHILEM4:
4523 goto resume_point_WHILEM4;
4524 case resume_WHILEM5:
4525 goto resume_point_WHILEM5;
4526 case resume_WHILEM6:
4527 goto resume_point_WHILEM6;
dad79028 4528
166ba7cd 4529 case TRIE_next:
40a82448
DM
4530 case EVAL_A:
4531 case BRANCH_next:
4532 case CURLYM_A:
4533 case CURLYM_B:
4534 case IFMATCH_A:
c255a977
DM
4535 case CURLY_B_max:
4536 case CURLY_B_min:
4537 case CURLY_B_min_known:
40a82448
DM
4538 if (yes_state == st)
4539 yes_state = st->u.yes.prev_yes_state;
4540 state_num = st->resume_state + 1; /* failure = success + 1 */
4541 goto reenter_switch;
dad79028 4542
95b24440
DM
4543 default:
4544 Perl_croak(aTHX_ "regexp resume memory corruption");
4545 }
95b24440 4546 }
aa283a38
DM
4547
4548final_exit:
4549
5d9a96ca
DM
4550 /* restore original high-water mark */
4551 PL_regmatch_slab = orig_slab;
4552 PL_regmatch_state = orig_state;
4553
4554 /* free all slabs above current one */
4555 if (orig_slab->next) {
c4fd8992 4556 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4557 orig_slab->next = NULL;
4558 while (sl) {
c4fd8992 4559 regmatch_slab * const osl = sl;
5d9a96ca 4560 sl = sl->next;
ad65c075 4561 Safefree(osl);
5d9a96ca
DM
4562 }
4563 }
4564
95b24440
DM
4565 return result;
4566
a687059c
LW
4567}
4568
4569/*
4570 - regrepeat - repeatedly match something simple, report how many
4571 */
4572/*
4573 * [This routine now assumes that it will only match on things of length 1.
4574 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4575 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4576 */
76e3520e 4577STATIC I32
32fc9b6a 4578S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4579{
27da23d5 4580 dVAR;
a0d0e21e 4581 register char *scan;
a0d0e21e 4582 register I32 c;
3280af22 4583 register char *loceol = PL_regeol;
a0ed51b3 4584 register I32 hardcount = 0;
53c4c00c 4585 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4586
3280af22 4587 scan = PL_reginput;
faf11cac
HS
4588 if (max == REG_INFTY)
4589 max = I32_MAX;
4590 else if (max < loceol - scan)
7f596f4c 4591 loceol = scan + max;
a0d0e21e 4592 switch (OP(p)) {
22c35a8c 4593 case REG_ANY:
1aa99e6b 4594 if (do_utf8) {
ffc61ed2 4595 loceol = PL_regeol;
1aa99e6b 4596 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4597 scan += UTF8SKIP(scan);
4598 hardcount++;
4599 }
4600 } else {
4601 while (scan < loceol && *scan != '\n')
4602 scan++;
a0ed51b3
LW
4603 }
4604 break;
ffc61ed2 4605 case SANY:
def8e4ea
JH
4606 if (do_utf8) {
4607 loceol = PL_regeol;
a0804c9e 4608 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4609 scan += UTF8SKIP(scan);
4610 hardcount++;
4611 }
4612 }
4613 else
4614 scan = loceol;
a0ed51b3 4615 break;
f33976b4
DB
4616 case CANY:
4617 scan = loceol;
4618 break;
090f7165
JH
4619 case EXACT: /* length of string is 1 */
4620 c = (U8)*STRING(p);
4621 while (scan < loceol && UCHARAT(scan) == c)
4622 scan++;
bbce6d69 4623 break;
4624 case EXACTF: /* length of string is 1 */
cd439c50 4625 c = (U8)*STRING(p);
bbce6d69 4626 while (scan < loceol &&
22c35a8c 4627 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4628 scan++;
4629 break;
4630 case EXACTFL: /* length of string is 1 */
3280af22 4631 PL_reg_flags |= RF_tainted;
cd439c50 4632 c = (U8)*STRING(p);
bbce6d69 4633 while (scan < loceol &&
22c35a8c 4634 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4635 scan++;
4636 break;
4637 case ANYOF:
ffc61ed2
JH
4638 if (do_utf8) {
4639 loceol = PL_regeol;
cfc92286 4640 while (hardcount < max && scan < loceol &&
32fc9b6a 4641 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4642 scan += UTF8SKIP(scan);
4643 hardcount++;
4644 }
4645 } else {
32fc9b6a 4646 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4647 scan++;
4648 }
a0d0e21e
LW
4649 break;
4650 case ALNUM:
1aa99e6b 4651 if (do_utf8) {
ffc61ed2 4652 loceol = PL_regeol;
1a4fad37 4653 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4654 while (hardcount < max && scan < loceol &&
3568d838 4655 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4656 scan += UTF8SKIP(scan);
4657 hardcount++;
4658 }
4659 } else {
4660 while (scan < loceol && isALNUM(*scan))
4661 scan++;
a0ed51b3
LW
4662 }
4663 break;
bbce6d69 4664 case ALNUML:
3280af22 4665 PL_reg_flags |= RF_tainted;
1aa99e6b 4666 if (do_utf8) {
ffc61ed2 4667 loceol = PL_regeol;
1aa99e6b
IH
4668 while (hardcount < max && scan < loceol &&
4669 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4670 scan += UTF8SKIP(scan);
4671 hardcount++;
4672 }
4673 } else {
4674 while (scan < loceol && isALNUM_LC(*scan))
4675 scan++;
a0ed51b3
LW
4676 }
4677 break;
a0d0e21e 4678 case NALNUM:
1aa99e6b 4679 if (do_utf8) {
ffc61ed2 4680 loceol = PL_regeol;
1a4fad37 4681 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4682 while (hardcount < max && scan < loceol &&
3568d838 4683 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4684 scan += UTF8SKIP(scan);
4685 hardcount++;
4686 }
4687 } else {
4688 while (scan < loceol && !isALNUM(*scan))
4689 scan++;
a0ed51b3
LW
4690 }
4691 break;
bbce6d69 4692 case NALNUML:
3280af22 4693 PL_reg_flags |= RF_tainted;
1aa99e6b 4694 if (do_utf8) {
ffc61ed2 4695 loceol = PL_regeol;
1aa99e6b
IH
4696 while (hardcount < max && scan < loceol &&
4697 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4698 scan += UTF8SKIP(scan);
4699 hardcount++;
4700 }
4701 } else {
4702 while (scan < loceol && !isALNUM_LC(*scan))
4703 scan++;
a0ed51b3
LW
4704 }
4705 break;
a0d0e21e 4706 case SPACE:
1aa99e6b 4707 if (do_utf8) {
ffc61ed2 4708 loceol = PL_regeol;
1a4fad37 4709 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4710 while (hardcount < max && scan < loceol &&
3568d838
JH
4711 (*scan == ' ' ||
4712 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4713 scan += UTF8SKIP(scan);
4714 hardcount++;
4715 }
4716 } else {
4717 while (scan < loceol && isSPACE(*scan))
4718 scan++;
a0ed51b3
LW
4719 }
4720 break;
bbce6d69 4721 case SPACEL:
3280af22 4722 PL_reg_flags |= RF_tainted;
1aa99e6b 4723 if (do_utf8) {
ffc61ed2 4724 loceol = PL_regeol;
1aa99e6b 4725 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4726 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4727 scan += UTF8SKIP(scan);
4728 hardcount++;
4729 }
4730 } else {
4731 while (scan < loceol && isSPACE_LC(*scan))
4732 scan++;
a0ed51b3
LW
4733 }
4734 break;
a0d0e21e 4735 case NSPACE:
1aa99e6b 4736 if (do_utf8) {
ffc61ed2 4737 loceol = PL_regeol;
1a4fad37 4738 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4739 while (hardcount < max && scan < loceol &&
3568d838
JH
4740 !(*scan == ' ' ||
4741 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4742 scan += UTF8SKIP(scan);
4743 hardcount++;
4744 }
4745 } else {
4746 while (scan < loceol && !isSPACE(*scan))
4747 scan++;
4748 break;
a0ed51b3 4749 }
bbce6d69 4750 case NSPACEL:
3280af22 4751 PL_reg_flags |= RF_tainted;
1aa99e6b 4752 if (do_utf8) {
ffc61ed2 4753 loceol = PL_regeol;
1aa99e6b 4754 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4755 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4756 scan += UTF8SKIP(scan);
4757 hardcount++;
4758 }
4759 } else {
4760 while (scan < loceol && !isSPACE_LC(*scan))
4761 scan++;
a0ed51b3
LW
4762 }
4763 break;
a0d0e21e 4764 case DIGIT:
1aa99e6b 4765 if (do_utf8) {
ffc61ed2 4766 loceol = PL_regeol;
1a4fad37 4767 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4768 while (hardcount < max && scan < loceol &&
3568d838 4769 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4770 scan += UTF8SKIP(scan);
4771 hardcount++;
4772 }
4773 } else {
4774 while (scan < loceol && isDIGIT(*scan))
4775 scan++;
a0ed51b3
LW
4776 }
4777 break;
a0d0e21e 4778 case NDIGIT:
1aa99e6b 4779 if (do_utf8) {
ffc61ed2 4780 loceol = PL_regeol;
1a4fad37 4781 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 4782 while (hardcount < max && scan < loceol &&
3568d838 4783 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4784 scan += UTF8SKIP(scan);
4785 hardcount++;
4786 }
4787 } else {
4788 while (scan < loceol && !isDIGIT(*scan))
4789 scan++;
a0ed51b3
LW
4790 }
4791 break;
a0d0e21e
LW
4792 default: /* Called on something of 0 width. */
4793 break; /* So match right here or not at all. */
4794 }
a687059c 4795
a0ed51b3
LW
4796 if (hardcount)
4797 c = hardcount;
4798 else
4799 c = scan - PL_reginput;
3280af22 4800 PL_reginput = scan;
a687059c 4801
a3621e74 4802 DEBUG_r({
e68ec53f 4803 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 4804 DEBUG_EXECUTE_r({
e68ec53f
YO
4805 SV * const prop = sv_newmortal();
4806 regprop(prog, prop, p);
4807 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
4808 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4809 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 4810 });
be8e71aa 4811 });
9041c2e3 4812
a0d0e21e 4813 return(c);
a687059c
LW
4814}
4815
c277df42 4816
be8e71aa 4817#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 4818/*
ffc61ed2
JH
4819- regclass_swash - prepare the utf8 swash
4820*/
4821
4822SV *
32fc9b6a 4823Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4824{
97aff369 4825 dVAR;
9e55ce06
JH
4826 SV *sw = NULL;
4827 SV *si = NULL;
4828 SV *alt = NULL;
3dab1dad 4829 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 4830
4f639d21 4831 if (data && data->count) {
a3b680e6 4832 const U32 n = ARG(node);
ffc61ed2 4833
4f639d21
DM
4834 if (data->what[n] == 's') {
4835 SV * const rv = (SV*)data->data[n];
890ce7af 4836 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 4837 SV **const ary = AvARRAY(av);
9e55ce06 4838 SV **a, **b;
9041c2e3 4839
711a919c 4840 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
4841 * documentation of these array elements. */
4842
b11f357e 4843 si = *ary;
8f7f7219 4844 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
4845 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4846
ffc61ed2
JH
4847 if (a)
4848 sw = *a;
4849 else if (si && doinit) {
4850 sw = swash_init("utf8", "", si, 1, 0);
4851 (void)av_store(av, 1, sw);
4852 }
9e55ce06
JH
4853 if (b)
4854 alt = *b;
ffc61ed2
JH
4855 }
4856 }
4857
9e55ce06
JH
4858 if (listsvp)
4859 *listsvp = si;
4860 if (altsvp)
4861 *altsvp = alt;
ffc61ed2
JH
4862
4863 return sw;
4864}
76234dfb 4865#endif
ffc61ed2
JH
4866
4867/*
ba7b4546 4868 - reginclass - determine if a character falls into a character class
832705d4
JH
4869
4870 The n is the ANYOF regnode, the p is the target string, lenp
4871 is pointer to the maximum length of how far to go in the p
4872 (if the lenp is zero, UTF8SKIP(p) is used),
4873 do_utf8 tells whether the target string is in UTF-8.
4874
bbce6d69 4875 */
4876
76e3520e 4877STATIC bool
32fc9b6a 4878S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4879{
27da23d5 4880 dVAR;
a3b680e6 4881 const char flags = ANYOF_FLAGS(n);
bbce6d69 4882 bool match = FALSE;
cc07378b 4883 UV c = *p;
ae9ddab8 4884 STRLEN len = 0;
9e55ce06 4885 STRLEN plen;
1aa99e6b 4886
19f67299
TS
4887 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4888 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
4889 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
4890 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
4891 if (len == (STRLEN)-1)
4892 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4893 }
bbce6d69 4894
0f0076b4 4895 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4896 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4897 if (lenp)
4898 *lenp = 0;
ffc61ed2 4899 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4900 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4901 match = TRUE;
bbce6d69 4902 }
3568d838 4903 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4904 match = TRUE;
ffc61ed2 4905 if (!match) {
9e55ce06 4906 AV *av;
32fc9b6a 4907 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4908
4909 if (sw) {
3568d838 4910 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4911 match = TRUE;
4912 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4913 if (!match && lenp && av) {
4914 I32 i;
9e55ce06 4915 for (i = 0; i <= av_len(av); i++) {
890ce7af 4916 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 4917 STRLEN len;
890ce7af 4918 const char * const s = SvPV_const(sv, len);
9e55ce06 4919
061b10df 4920 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4921 *lenp = len;
4922 match = TRUE;
4923 break;
4924 }
4925 }
4926 }
4927 if (!match) {
89ebb4a3 4928 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4929 STRLEN tmplen;
4930
9e55ce06
JH
4931 to_utf8_fold(p, tmpbuf, &tmplen);
4932 if (swash_fetch(sw, tmpbuf, do_utf8))
4933 match = TRUE;
4934 }
ffc61ed2
JH
4935 }
4936 }
bbce6d69 4937 }
9e55ce06 4938 if (match && lenp && *lenp == 0)
0f0076b4 4939 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4940 }
1aa99e6b 4941 if (!match && c < 256) {
ffc61ed2
JH
4942 if (ANYOF_BITMAP_TEST(n, c))
4943 match = TRUE;
4944 else if (flags & ANYOF_FOLD) {
eb160463 4945 U8 f;
a0ed51b3 4946
ffc61ed2
JH
4947 if (flags & ANYOF_LOCALE) {
4948 PL_reg_flags |= RF_tainted;
4949 f = PL_fold_locale[c];
4950 }
4951 else
4952 f = PL_fold[c];
4953 if (f != c && ANYOF_BITMAP_TEST(n, f))
4954 match = TRUE;
4955 }
4956
4957 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4958 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4959 if (
4960 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4961 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4962 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4963 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4964 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4965 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4966 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4967 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4968 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4969 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4970 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4971 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4972 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4973 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4974 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4975 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4976 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4977 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4978 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4979 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4980 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4981 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4982 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4983 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4984 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4985 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4986 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4987 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4988 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4989 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4990 ) /* How's that for a conditional? */
4991 {
4992 match = TRUE;
4993 }
a0ed51b3 4994 }
a0ed51b3
LW
4995 }
4996
a0ed51b3
LW
4997 return (flags & ANYOF_INVERT) ? !match : match;
4998}
161b471a 4999
dfe13c55 5000STATIC U8 *
0ce71af7 5001S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5002{
97aff369 5003 dVAR;
a0ed51b3 5004 if (off >= 0) {
1aa99e6b 5005 while (off-- && s < lim) {
ffc61ed2 5006 /* XXX could check well-formedness here */
a0ed51b3 5007 s += UTF8SKIP(s);
ffc61ed2 5008 }
a0ed51b3
LW
5009 }
5010 else {
5011 while (off++) {
1aa99e6b 5012 if (s > lim) {
a0ed51b3 5013 s--;
ffc61ed2 5014 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5015 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5016 s--;
ffc61ed2
JH
5017 }
5018 /* XXX could check well-formedness here */
a0ed51b3
LW
5019 }
5020 }
5021 }
5022 return s;
5023}
161b471a 5024
dfe13c55 5025STATIC U8 *
0ce71af7 5026S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5027{
97aff369 5028 dVAR;
a0ed51b3 5029 if (off >= 0) {
1aa99e6b 5030 while (off-- && s < lim) {
ffc61ed2 5031 /* XXX could check well-formedness here */
a0ed51b3 5032 s += UTF8SKIP(s);
ffc61ed2 5033 }
a0ed51b3 5034 if (off >= 0)
3dab1dad 5035 return NULL;
a0ed51b3
LW
5036 }
5037 else {
5038 while (off++) {
1aa99e6b 5039 if (s > lim) {
a0ed51b3 5040 s--;
ffc61ed2 5041 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5042 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5043 s--;
ffc61ed2
JH
5044 }
5045 /* XXX could check well-formedness here */
a0ed51b3
LW
5046 }
5047 else
5048 break;
5049 }
5050 if (off <= 0)
3dab1dad 5051 return NULL;
a0ed51b3
LW
5052 }
5053 return s;
5054}
51371543 5055
51371543 5056static void
acfe0abc 5057restore_pos(pTHX_ void *arg)
51371543 5058{
97aff369 5059 dVAR;
097eb12c 5060 regexp * const rex = (regexp *)arg;
51371543
GS
5061 if (PL_reg_eval_set) {
5062 if (PL_reg_oldsaved) {
4f639d21
DM
5063 rex->subbeg = PL_reg_oldsaved;
5064 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5065#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5066 rex->saved_copy = PL_nrs;
ed252734 5067#endif
4f639d21 5068 RX_MATCH_COPIED_on(rex);
51371543
GS
5069 }
5070 PL_reg_magic->mg_len = PL_reg_oldpos;
5071 PL_reg_eval_set = 0;
5072 PL_curpm = PL_reg_oldcurpm;
5073 }
5074}
33b8afdf
JH
5075
5076STATIC void
5077S_to_utf8_substr(pTHX_ register regexp *prog)
5078{
33b8afdf 5079 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5080 SV* const sv = newSVsv(prog->float_substr);
5081 prog->float_utf8 = sv;
33b8afdf
JH
5082 sv_utf8_upgrade(sv);
5083 if (SvTAIL(prog->float_substr))
5084 SvTAIL_on(sv);
5085 if (prog->float_substr == prog->check_substr)
5086 prog->check_utf8 = sv;
5087 }
5088 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5089 SV* const sv = newSVsv(prog->anchored_substr);
5090 prog->anchored_utf8 = sv;
33b8afdf
JH
5091 sv_utf8_upgrade(sv);
5092 if (SvTAIL(prog->anchored_substr))
5093 SvTAIL_on(sv);
5094 if (prog->anchored_substr == prog->check_substr)
5095 prog->check_utf8 = sv;
5096 }
5097}
5098
5099STATIC void
5100S_to_byte_substr(pTHX_ register regexp *prog)
5101{
97aff369 5102 dVAR;
33b8afdf 5103 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5104 SV* sv = newSVsv(prog->float_utf8);
5105 prog->float_substr = sv;
33b8afdf
JH
5106 if (sv_utf8_downgrade(sv, TRUE)) {
5107 if (SvTAIL(prog->float_utf8))
5108 SvTAIL_on(sv);
5109 } else {
5110 SvREFCNT_dec(sv);
5111 prog->float_substr = sv = &PL_sv_undef;
5112 }
5113 if (prog->float_utf8 == prog->check_utf8)
5114 prog->check_substr = sv;
5115 }
5116 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5117 SV* sv = newSVsv(prog->anchored_utf8);
5118 prog->anchored_substr = sv;
33b8afdf
JH
5119 if (sv_utf8_downgrade(sv, TRUE)) {
5120 if (SvTAIL(prog->anchored_utf8))
5121 SvTAIL_on(sv);
5122 } else {
5123 SvREFCNT_dec(sv);
5124 prog->anchored_substr = sv = &PL_sv_undef;
5125 }
5126 if (prog->anchored_utf8 == prog->check_utf8)
5127 prog->check_substr = sv;
5128 }
5129}
66610fdd
RGS
5130
5131/*
5132 * Local variables:
5133 * c-indentation-style: bsd
5134 * c-basic-offset: 4
5135 * indent-tabs-mode: t
5136 * End:
5137 *
37442d52
RGS
5138 * ex: set ts=8 sts=4 sw=4 noet:
5139 */