This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
2c2d71f5 572 char *s1 = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
1aa99e6b
IH
620 (long)(HOP3c(s1, 1, strend) - i_strpos)));
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
30944b6d 630 s = s1;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf
JH
638 char *last, *last1;
639 char *s1 = s;
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
677 ", trying anchored starting at offset %ld...\n",
678 (long)(s1 + 1 - i_strpos)));
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
687 s = s1;
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 808 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
07be1b83 821 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
07be1b83
YO
827 /*if (OP(prog->regstclass) == TRIE)
828 endpos++;*/
6eb5f6b9 829 t = s;
3b0527fe 830 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
831 if (!s) {
832#ifdef DEBUGGING
cbbf8932 833 const char *what = NULL;
6eb5f6b9
JH
834#endif
835 if (endpos == strend) {
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
837 "Could not match STCLASS...\n") );
838 goto fail;
839 }
a3621e74 840 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 841 "This position contradicts STCLASS...\n") );
653099ff
GS
842 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
843 goto fail;
6eb5f6b9 844 /* Contradict one of substrings */
33b8afdf
JH
845 if (prog->anchored_substr || prog->anchored_utf8) {
846 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 847 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 848 hop_and_restart:
1aa99e6b 849 s = HOP3c(t, 1, strend);
66e933ab
GS
850 if (s + start_shift + end_shift > strend) {
851 /* XXXX Should be taken into account earlier? */
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
853 "Could not match STCLASS...\n") );
854 goto fail;
855 }
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
860 what, (long)(s + start_shift - i_strpos)) );
861 goto restart;
862 }
66e933ab 863 /* Have both, check_string is floating */
6eb5f6b9
JH
864 if (t + start_shift >= check_at) /* Contradicts floating=check */
865 goto retry_floating_check;
866 /* Recheck anchored substring, but not floating... */
9041c2e3 867 s = check_at;
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
872 (long)(other_last - i_strpos)) );
873 goto do_other_anchored;
874 }
60e71179
GS
875 /* Another way we could have checked stclass at the
876 current position only: */
877 if (ml_anch) {
878 s = t = t + 1;
5e39e1e5
HS
879 if (!check)
880 goto giveup;
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 882 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 883 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 884 goto try_at_offset;
66e933ab 885 }
33b8afdf 886 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 887 goto fail;
6eb5f6b9
JH
888 /* Check is floating subtring. */
889 retry_floating_check:
890 t = check_at - start_shift;
a3621e74 891 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
892 goto hop_and_restart;
893 }
b7953727 894 if (t != s) {
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 896 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
897 (long)(t - i_strpos), (long)(s - i_strpos))
898 );
899 }
900 else {
a3621e74 901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
902 "Does not contradict STCLASS...\n");
903 );
904 }
6eb5f6b9 905 }
5e39e1e5 906 giveup:
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
908 PL_colors[4], (check ? "Guessed" : "Giving up"),
909 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 910 return s;
2c2d71f5
JH
911
912 fail_finish: /* Substring not found */
33b8afdf
JH
913 if (prog->check_substr || prog->check_utf8) /* could be removed already */
914 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 915 fail:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 917 PL_colors[4], PL_colors[5]));
bd61b366 918 return NULL;
cad2e5aa 919}
9661b544 920
6eb5f6b9 921/* We know what class REx starts with. Try to find this position... */
3b0527fe 922/* if reginfo is NULL, its a dryrun */
07be1b83
YO
923/* annoyingly all the vars in this routine have different names from their counterparts
924 in regmatch. /grrr */
3b0527fe 925
3c3eec57 926STATIC char *
07be1b83
YO
927S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
928 const char *strend, const regmatch_info *reginfo)
a687059c 929{
27da23d5 930 dVAR;
1df70142 931 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 932 char *m;
d8093b23 933 STRLEN ln;
5dab1207 934 STRLEN lnc;
078c425b 935 register STRLEN uskip;
d8093b23
G
936 unsigned int c1;
937 unsigned int c2;
6eb5f6b9
JH
938 char *e;
939 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 940 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 941
6eb5f6b9
JH
942 /* We know what class it must start with. */
943 switch (OP(c)) {
6eb5f6b9 944 case ANYOF:
388cc4de 945 if (do_utf8) {
078c425b 946 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
947 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
948 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
949 reginclass(prog, c, (U8*)s, 0, do_utf8) :
950 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 951 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
952 goto got_it;
953 else
954 tmp = doevery;
955 }
956 else
957 tmp = 1;
078c425b 958 s += uskip;
388cc4de
HS
959 }
960 }
961 else {
962 while (s < strend) {
963 STRLEN skip = 1;
964
32fc9b6a 965 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
966 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
967 /* The assignment of 2 is intentional:
968 * for the folded sharp s, the skip is 2. */
969 (skip = SHARP_S_SKIP))) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += skip;
978 }
a0d0e21e 979 }
6eb5f6b9 980 break;
f33976b4
DB
981 case CANY:
982 while (s < strend) {
3b0527fe 983 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
984 goto got_it;
985 else
986 tmp = doevery;
987 s++;
988 }
989 break;
6eb5f6b9 990 case EXACTF:
5dab1207
NIS
991 m = STRING(c);
992 ln = STR_LEN(c); /* length to match in octets/bytes */
993 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
5dab1207 996 U8 *sm = (U8 *) m;
89ebb4a3
JH
997 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
998 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 999 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1000
1001 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1002 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1003
89ebb4a3 1004 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1005 0, uniflags);
89ebb4a3 1006 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1007 0, uniflags);
5dab1207
NIS
1008 lnc = 0;
1009 while (sm < ((U8 *) m + ln)) {
1010 lnc++;
1011 sm += UTF8SKIP(sm);
1012 }
1aa99e6b
IH
1013 }
1014 else {
1015 c1 = *(U8*)m;
1016 c2 = PL_fold[c1];
1017 }
6eb5f6b9
JH
1018 goto do_exactf;
1019 case EXACTFL:
5dab1207
NIS
1020 m = STRING(c);
1021 ln = STR_LEN(c);
1022 lnc = (I32) ln;
d8093b23 1023 c1 = *(U8*)m;
6eb5f6b9
JH
1024 c2 = PL_fold_locale[c1];
1025 do_exactf:
db12adc6 1026 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1027
3b0527fe 1028 if (!reginfo && e < s)
6eb5f6b9 1029 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1030
60a8b682
JH
1031 /* The idea in the EXACTF* cases is to first find the
1032 * first character of the EXACTF* node and then, if
1033 * necessary, case-insensitively compare the full
1034 * text of the node. The c1 and c2 are the first
1035 * characters (though in Unicode it gets a bit
1036 * more complicated because there are more cases
7f16dd3d
JH
1037 * than just upper and lower: one needs to use
1038 * the so-called folding case for case-insensitive
1039 * matching (called "loose matching" in Unicode).
1040 * ibcmp_utf8() will do just that. */
60a8b682 1041
1aa99e6b 1042 if (do_utf8) {
575cac57 1043 UV c, f;
89ebb4a3 1044 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1045 STRLEN len, foldlen;
4ad0818d 1046 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1047 if (c1 == c2) {
5dab1207
NIS
1048 /* Upper and lower of 1st char are equal -
1049 * probably not a "letter". */
1aa99e6b 1050 while (s <= e) {
89ebb4a3 1051 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1052 uniflags);
80aecb99
JH
1053 if ( c == c1
1054 && (ln == len ||
3dab1dad
YO
1055 ibcmp_utf8(s, NULL, 0, do_utf8,
1056 m, NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1058 goto got_it;
80aecb99 1059 else {
1df70142 1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1061 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1063 if ( f != c
1064 && (f == c1 || f == c2)
1065 && (ln == foldlen ||
66423254 1066 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1067 NULL, foldlen, do_utf8,
d07ddd77 1068 m,
3dab1dad 1069 NULL, ln, (bool)UTF))
3b0527fe 1070 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1071 goto got_it;
1072 }
1aa99e6b
IH
1073 s += len;
1074 }
09091399
JH
1075 }
1076 else {
1aa99e6b 1077 while (s <= e) {
89ebb4a3 1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1079 uniflags);
80aecb99 1080
60a8b682 1081 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1082 * Note that not all the possible combinations
1083 * are handled here: some of them are handled
1084 * by the standard folding rules, and some of
1085 * them (the character class or ANYOF cases)
1086 * are handled during compiletime in
1087 * regexec.c:S_regclass(). */
880bd946
JH
1088 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1089 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1090 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1091
1092 if ( (c == c1 || c == c2)
1093 && (ln == len ||
3dab1dad
YO
1094 ibcmp_utf8(s, NULL, 0, do_utf8,
1095 m, NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1097 goto got_it;
80aecb99 1098 else {
1df70142 1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
a6872d42 1105 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1106 NULL, foldlen, do_utf8,
d07ddd77 1107 m,
3dab1dad 1108 NULL, ln, (bool)UTF))
3b0527fe 1109 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399 1114 }
1aa99e6b
IH
1115 }
1116 else {
1117 if (c1 == c2)
1118 while (s <= e) {
1119 if ( *(U8*)s == c1
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
1127 else
1128 while (s <= e) {
1129 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
b3c9acc1
IZ
1137 }
1138 break;
bbce6d69 1139 case BOUNDL:
3280af22 1140 PL_reg_flags |= RF_tainted;
bbce6d69 1141 /* FALL THROUGH */
a0d0e21e 1142 case BOUND:
ffc61ed2 1143 if (do_utf8) {
12d33761 1144 if (s == PL_bostr)
ffc61ed2
JH
1145 tmp = '\n';
1146 else {
6136c704 1147 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1148 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1149 }
1150 tmp = ((OP(c) == BOUND ?
9041c2e3 1151 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1152 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1153 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1154 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1155 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1156 isALNUM_LC_utf8((U8*)s)))
1157 {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
078c425b 1162 s += uskip;
a687059c 1163 }
a0d0e21e 1164 }
667bb95a 1165 else {
12d33761 1166 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1167 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1168 while (s < strend) {
1169 if (tmp ==
1170 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1171 tmp = !tmp;
3b0527fe 1172 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1173 goto got_it;
1174 }
1175 s++;
a0ed51b3 1176 }
a0ed51b3 1177 }
3b0527fe 1178 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1179 goto got_it;
1180 break;
bbce6d69 1181 case NBOUNDL:
3280af22 1182 PL_reg_flags |= RF_tainted;
bbce6d69 1183 /* FALL THROUGH */
a0d0e21e 1184 case NBOUND:
ffc61ed2 1185 if (do_utf8) {
12d33761 1186 if (s == PL_bostr)
ffc61ed2
JH
1187 tmp = '\n';
1188 else {
6136c704 1189 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == NBOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1194 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1197 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1201 goto got_it;
078c425b 1202 s += uskip;
ffc61ed2 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == NBOUND ?
1208 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1210 if (tmp ==
1211 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1212 tmp = !tmp;
3b0527fe 1213 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 s++;
1216 }
a0ed51b3 1217 }
3b0527fe 1218 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
a0d0e21e 1221 case ALNUM:
ffc61ed2 1222 if (do_utf8) {
1a4fad37 1223 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1225 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
bbce6d69 1231 else
ffc61ed2 1232 tmp = 1;
078c425b 1233 s += uskip;
bbce6d69 1234 }
bbce6d69 1235 }
ffc61ed2
JH
1236 else {
1237 while (s < strend) {
1238 if (isALNUM(*s)) {
3b0527fe 1239 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
a0ed51b3 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s++;
a0ed51b3 1247 }
a0ed51b3
LW
1248 }
1249 break;
bbce6d69 1250 case ALNUML:
3280af22 1251 PL_reg_flags |= RF_tainted;
ffc61ed2 1252 if (do_utf8) {
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1254 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a687059c 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
a0d0e21e 1263 }
a0d0e21e 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM_LC(*s)) {
3b0527fe 1268 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
a0d0e21e 1279 case NALNUM:
ffc61ed2 1280 if (do_utf8) {
1a4fad37 1281 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1283 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
bbce6d69 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
bbce6d69 1292 }
bbce6d69 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (!isALNUM(*s)) {
3b0527fe 1297 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
bbce6d69 1308 case NALNUML:
3280af22 1309 PL_reg_flags |= RF_tainted;
ffc61ed2 1310 if (do_utf8) {
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1312 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a687059c 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
a687059c 1321 }
a0d0e21e 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM_LC(*s)) {
3b0527fe 1326 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
a0d0e21e 1337 case SPACE:
ffc61ed2 1338 if (do_utf8) {
1a4fad37 1339 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1341 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0d0e21e 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
2304df62 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (isSPACE(*s)) {
3b0527fe 1355 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
bbce6d69 1366 case SPACEL:
3280af22 1367 PL_reg_flags |= RF_tainted;
ffc61ed2 1368 if (do_utf8) {
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1370 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
bbce6d69 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
bbce6d69 1379 }
bbce6d69 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE_LC(*s)) {
3b0527fe 1384 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
a0d0e21e 1395 case NSPACE:
ffc61ed2 1396 if (do_utf8) {
1a4fad37 1397 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1399 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0d0e21e 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
a687059c 1408 }
a0d0e21e 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (!isSPACE(*s)) {
3b0527fe 1413 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
bbce6d69 1424 case NSPACEL:
3280af22 1425 PL_reg_flags |= RF_tainted;
ffc61ed2 1426 if (do_utf8) {
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1428 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
bbce6d69 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
bbce6d69 1437 }
bbce6d69 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE_LC(*s)) {
3b0527fe 1442 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
a0d0e21e 1453 case DIGIT:
ffc61ed2 1454 if (do_utf8) {
1a4fad37 1455 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1457 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0d0e21e 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
2b69d0c2 1466 }
a0d0e21e 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (isDIGIT(*s)) {
3b0527fe 1471 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
b8c5462f
JH
1482 case DIGITL:
1483 PL_reg_flags |= RF_tainted;
ffc61ed2 1484 if (do_utf8) {
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1486 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
b8c5462f 1495 }
b8c5462f 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT_LC(*s)) {
3b0527fe 1500 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
b8c5462f 1508 }
b8c5462f
JH
1509 }
1510 break;
a0d0e21e 1511 case NDIGIT:
ffc61ed2 1512 if (do_utf8) {
1a4fad37 1513 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1515 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0d0e21e 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
a687059c 1524 }
a0d0e21e 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (!isDIGIT(*s)) {
3b0527fe 1529 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0ed51b3 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
a0ed51b3 1537 }
a0ed51b3
LW
1538 }
1539 break;
b8c5462f
JH
1540 case NDIGITL:
1541 PL_reg_flags |= RF_tainted;
ffc61ed2 1542 if (do_utf8) {
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1544 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
b8c5462f 1553 }
a0ed51b3 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT_LC(*s)) {
3b0527fe 1558 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
cf93c79d 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
b8c5462f 1566 }
b8c5462f
JH
1567 }
1568 break;
07be1b83
YO
1569 case TRIE:
1570 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1571 {
1572 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1573 trie_type = do_utf8 ?
1574 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1575 : trie_plain;
1576 /* what trie are we using right now */
1577 reg_ac_data *aho
1578 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1579 reg_trie_data *trie=aho->trie;
1580
1581 const char *last_start = strend - trie->minlen;
1582 const char *real_start = s;
1583 STRLEN maxlen = trie->maxlen;
1584 U8 **points;
1585
1586 GET_RE_DEBUG_FLAGS_DECL;
1587
1588 Newxz(points,maxlen,U8 *);
1589
1590 if (trie->bitmap && trie_type != trie_utf8_fold) {
1591 while (!TRIE_BITMAP_TEST(trie,*s) && s <= last_start ) {
1592 s++;
1593 }
1594 }
1595
1596 while (s <= last_start) {
1597 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1598 U8 *uc = (U8*)s;
1599 U16 charid = 0;
1600 U32 base = 1;
1601 U32 state = 1;
1602 UV uvc = 0;
1603 STRLEN len = 0;
1604 STRLEN foldlen = 0;
1605 U8 *uscan = (U8*)NULL;
1606 U8 *leftmost = NULL;
1607
1608 U32 pointpos = 0;
1609
1610 while ( state && uc <= (U8*)strend ) {
1611 int failed=0;
1612 if (aho->states[ state ].wordnum) {
1613 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1614 if (!leftmost || lpos < leftmost)
1615 leftmost= lpos;
1616 if (base==0) break;
1617 }
1618 points[pointpos++ % maxlen]= uc;
1619 switch (trie_type) {
1620 case trie_utf8_fold:
1621 if ( foldlen>0 ) {
1622 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1623 foldlen -= len;
1624 uscan += len;
1625 len=0;
1626 } else {
1627 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1628 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1629 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1630 foldlen -= UNISKIP( uvc );
1631 uscan = foldbuf + UNISKIP( uvc );
1632 }
1633 break;
1634 case trie_utf8:
1635 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1636 &len, uniflags );
1637 break;
1638 case trie_plain:
1639 uvc = (UV)*uc;
1640 len = 1;
1641 }
1642
1643 if (uvc < 256) {
1644 charid = trie->charmap[ uvc ];
1645 }
1646 else {
1647 charid = 0;
1648 if (trie->widecharmap) {
1649 SV** const svpp = hv_fetch(trie->widecharmap,
1650 (char*)&uvc, sizeof(UV), 0);
1651 if (svpp)
1652 charid = (U16)SvIV(*svpp);
1653 }
1654 }
1655 DEBUG_TRIE_EXECUTE_r(
1656 PerlIO_printf(Perl_debug_log,
1657 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1658 (int)((const char*)uc - real_start), charid, uvc)
1659 );
1660 uc += len;
1661
1662 do {
1663 U32 word = aho->states[ state ].wordnum;
1664 base = aho->states[ state ].trans.base;
1665
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log,
1668 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1669 failed ? "Fail transition to " : "",
1670 state, base, uvc, word)
1671 );
1672 if ( base ) {
1673 U32 tmp;
1674 if (charid &&
1675 (base + charid > trie->uniquecharcount )
1676 && (base + charid - 1 - trie->uniquecharcount
1677 < trie->lasttrans)
1678 && trie->trans[base + charid - 1 -
1679 trie->uniquecharcount].check == state
1680 && (tmp=trie->trans[base + charid - 1 -
1681 trie->uniquecharcount ].next))
1682 {
1683 state = tmp;
1684 break;
1685 }
1686 else {
1687 failed++;
1688 if ( state == 1 )
1689 break;
1690 else
1691 state = aho->fail[state];
1692 }
1693 }
1694 else {
1695 /* we must be accepting here */
1696 failed++;
1697 break;
1698 }
1699 } while(state);
1700 if (failed) {
1701 if (leftmost)
1702 break;
1703 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1704 while (!TRIE_BITMAP_TEST(trie,*uc) && uc <= (U8*)last_start ) {
1705 uc++;
1706 }
1707 }
1708 }
1709 }
1710 if ( aho->states[ state ].wordnum ) {
1711 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1712 if (!leftmost || lpos < leftmost)
1713 leftmost = lpos;
1714 }
1715 DEBUG_TRIE_EXECUTE_r(
1716 PerlIO_printf( Perl_debug_log,
1717 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1718 "All done: ",
1719 state, base, uvc)
1720 );
1721 if (leftmost) {
1722 s = (char*)leftmost;
1723 if (!reginfo || regtry(reginfo, s))
1724 goto got_it;
1725 s = HOPc(s,1);
1726 } else {
1727 break;
1728 }
1729 }
1730 }
1731 break;
b3c9acc1 1732 default:
3c3eec57
GS
1733 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1734 break;
d6a28714 1735 }
6eb5f6b9
JH
1736 return 0;
1737 got_it:
1738 return s;
1739}
1740
1741/*
1742 - regexec_flags - match a regexp against a string
1743 */
1744I32
1745Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1746 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1747/* strend: pointer to null at end of string */
1748/* strbeg: real beginning of string */
1749/* minend: end of match must be >=minend after stringarg. */
1750/* data: May be used for some additional optimizations. */
1751/* nosave: For optimizations. */
1752{
97aff369 1753 dVAR;
6eb5f6b9
JH
1754 register char *s;
1755 register regnode *c;
1756 register char *startpos = stringarg;
6eb5f6b9
JH
1757 I32 minlen; /* must match at least this many chars */
1758 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1759 I32 end_shift = 0; /* Same for the end. */ /* CC */
1760 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1761 char *scream_olds = NULL;
3dab1dad 1762 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1763 const bool do_utf8 = DO_UTF8(sv);
2757e526 1764 I32 multiline;
2a782b5b 1765#ifdef DEBUGGING
2757e526
JH
1766 SV* dsv0;
1767 SV* dsv1;
2a782b5b 1768#endif
3b0527fe 1769 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1770
1771 GET_RE_DEBUG_FLAGS_DECL;
1772
9d4ba2ae 1773 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1774
1775 /* Be paranoid... */
1776 if (prog == NULL || startpos == NULL) {
1777 Perl_croak(aTHX_ "NULL regexp parameter");
1778 return 0;
1779 }
1780
2757e526 1781 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1782 reginfo.prog = prog;
2757e526
JH
1783
1784#ifdef DEBUGGING
1785 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1786 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1787#endif
1788
bac06658
JH
1789 RX_MATCH_UTF8_set(prog, do_utf8);
1790
6eb5f6b9 1791 minlen = prog->minlen;
61a36c01 1792 if (strend - startpos < minlen) {
a3621e74 1793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1794 "String too short [regexec_flags]...\n"));
1795 goto phooey;
1aa99e6b 1796 }
6eb5f6b9 1797
6eb5f6b9
JH
1798 /* Check validity of program. */
1799 if (UCHARAT(prog->program) != REG_MAGIC) {
1800 Perl_croak(aTHX_ "corrupted regexp program");
1801 }
1802
1803 PL_reg_flags = 0;
1804 PL_reg_eval_set = 0;
1805 PL_reg_maxiter = 0;
1806
1807 if (prog->reganch & ROPT_UTF8)
1808 PL_reg_flags |= RF_utf8;
1809
1810 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1811 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1812 PL_bostr = strbeg;
3b0527fe 1813 reginfo.sv = sv;
6eb5f6b9
JH
1814
1815 /* Mark end of line for $ (and such) */
1816 PL_regeol = strend;
1817
1818 /* see how far we have to get to not match where we matched before */
3b0527fe 1819 reginfo.till = startpos+minend;
6eb5f6b9 1820
6eb5f6b9
JH
1821 /* If there is a "must appear" string, look for it. */
1822 s = startpos;
1823
3b0527fe 1824 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1825 MAGIC *mg;
1826
1827 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1828 reginfo.ganch = startpos;
6eb5f6b9
JH
1829 else if (sv && SvTYPE(sv) >= SVt_PVMG
1830 && SvMAGIC(sv)
14befaf4
DM
1831 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1832 && mg->mg_len >= 0) {
3b0527fe 1833 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1834 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1835 if (s > reginfo.ganch)
6eb5f6b9 1836 goto phooey;
3b0527fe 1837 s = reginfo.ganch;
6eb5f6b9
JH
1838 }
1839 }
1840 else /* pos() not defined */
3b0527fe 1841 reginfo.ganch = strbeg;
6eb5f6b9
JH
1842 }
1843
a0714e2c 1844 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1845 re_scream_pos_data d;
1846
1847 d.scream_olds = &scream_olds;
1848 d.scream_pos = &scream_pos;
1849 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1850 if (!s) {
a3621e74 1851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1852 goto phooey; /* not present */
3fa9c3d7 1853 }
6eb5f6b9
JH
1854 }
1855
a3621e74 1856 DEBUG_EXECUTE_r({
1df70142
AL
1857 const char * const s0 = UTF
1858 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1859 UNI_DISPLAY_REGEX)
1860 : prog->precomp;
bb7a0f54 1861 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1862 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1863 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1864 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1865 if (!PL_colorset)
1866 reginitcolors();
1867 PerlIO_printf(Perl_debug_log,
a0288114 1868 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1869 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1870 len0, len0, s0,
2a782b5b 1871 PL_colors[1],
9e55ce06 1872 len0 > 60 ? "..." : "",
2a782b5b 1873 PL_colors[0],
9e55ce06
JH
1874 (int)(len1 > 60 ? 60 : len1),
1875 s1, PL_colors[1],
1876 (len1 > 60 ? "..." : "")
2a782b5b
JH
1877 );
1878 });
6eb5f6b9
JH
1879
1880 /* Simplest case: anchored match need be tried only once. */
1881 /* [unless only anchor is BOL and multiline is set] */
1882 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1883 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1884 goto got_it;
7fba1cd6 1885 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1886 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1887 {
1888 char *end;
1889
1890 if (minlen)
1891 dontbother = minlen - 1;
1aa99e6b 1892 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1893 /* for multiline we only have to try after newlines */
33b8afdf 1894 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1895 if (s == startpos)
1896 goto after_try;
1897 while (1) {
3b0527fe 1898 if (regtry(&reginfo, s))
6eb5f6b9
JH
1899 goto got_it;
1900 after_try:
1901 if (s >= end)
1902 goto phooey;
1903 if (prog->reganch & RE_USE_INTUIT) {
1904 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1905 if (!s)
1906 goto phooey;
1907 }
1908 else
1909 s++;
1910 }
1911 } else {
1912 if (s > startpos)
1913 s--;
1914 while (s < end) {
1915 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1916 if (regtry(&reginfo, s))
6eb5f6b9
JH
1917 goto got_it;
1918 }
1919 }
1920 }
1921 }
1922 goto phooey;
1923 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1924 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1925 goto got_it;
1926 goto phooey;
1927 }
1928
1929 /* Messy cases: unanchored match. */
33b8afdf 1930 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1931 /* we have /x+whatever/ */
1932 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1933 char ch;
bf93d4cc
GS
1934#ifdef DEBUGGING
1935 int did_match = 0;
1936#endif
33b8afdf
JH
1937 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1938 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1939 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1940
1aa99e6b 1941 if (do_utf8) {
6eb5f6b9
JH
1942 while (s < strend) {
1943 if (*s == ch) {
a3621e74 1944 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1945 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1946 s += UTF8SKIP(s);
1947 while (s < strend && *s == ch)
1948 s += UTF8SKIP(s);
1949 }
1950 s += UTF8SKIP(s);
1951 }
1952 }
1953 else {
1954 while (s < strend) {
1955 if (*s == ch) {
a3621e74 1956 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1957 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1958 s++;
1959 while (s < strend && *s == ch)
1960 s++;
1961 }
1962 s++;
1963 }
1964 }
a3621e74 1965 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1966 PerlIO_printf(Perl_debug_log,
b7953727
JH
1967 "Did not find anchored character...\n")
1968 );
6eb5f6b9 1969 }
a0714e2c
SS
1970 else if (prog->anchored_substr != NULL
1971 || prog->anchored_utf8 != NULL
1972 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1973 && prog->float_max_offset < strend - s)) {
1974 SV *must;
1975 I32 back_max;
1976 I32 back_min;
1977 char *last;
6eb5f6b9 1978 char *last1; /* Last position checked before */
bf93d4cc
GS
1979#ifdef DEBUGGING
1980 int did_match = 0;
1981#endif
33b8afdf
JH
1982 if (prog->anchored_substr || prog->anchored_utf8) {
1983 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1984 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1985 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1986 back_max = back_min = prog->anchored_offset;
1987 } else {
1988 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1989 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1990 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1991 back_max = prog->float_max_offset;
1992 back_min = prog->float_min_offset;
1993 }
1994 if (must == &PL_sv_undef)
1995 /* could not downgrade utf8 check substring, so must fail */
1996 goto phooey;
1997
1998 last = HOP3c(strend, /* Cannot start after this */
1999 -(I32)(CHR_SVLEN(must)
2000 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2001
2002 if (s > PL_bostr)
2003 last1 = HOPc(s, -1);
2004 else
2005 last1 = s - 1; /* bogus */
2006
a0288114 2007 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2008 check_substr==must. */
2009 scream_pos = -1;
2010 dontbother = end_shift;
2011 strend = HOPc(strend, -dontbother);
2012 while ( (s <= last) &&
9041c2e3 2013 ((flags & REXEC_SCREAM)
1aa99e6b 2014 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2015 end_shift, &scream_pos, 0))
1aa99e6b 2016 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2017 (unsigned char*)strend, must,
7fba1cd6 2018 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2019 /* we may be pointing at the wrong string */
2020 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2021 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2022 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2023 if (HOPc(s, -back_max) > last1) {
2024 last1 = HOPc(s, -back_min);
2025 s = HOPc(s, -back_max);
2026 }
2027 else {
52657f30 2028 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2029
2030 last1 = HOPc(s, -back_min);
52657f30 2031 s = t;
6eb5f6b9 2032 }
1aa99e6b 2033 if (do_utf8) {
6eb5f6b9 2034 while (s <= last1) {
3b0527fe 2035 if (regtry(&reginfo, s))
6eb5f6b9
JH
2036 goto got_it;
2037 s += UTF8SKIP(s);
2038 }
2039 }
2040 else {
2041 while (s <= last1) {
3b0527fe 2042 if (regtry(&reginfo, s))
6eb5f6b9
JH
2043 goto got_it;
2044 s++;
2045 }
2046 }
2047 }
a3621e74 2048 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2049 PerlIO_printf(Perl_debug_log,
a0288114 2050 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2051 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2052 ? "anchored" : "floating"),
2053 PL_colors[0],
2054 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2055 SvPVX_const(must),
b7953727
JH
2056 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2057 );
6eb5f6b9
JH
2058 goto phooey;
2059 }
155aba94 2060 else if ((c = prog->regstclass)) {
f14c76ed 2061 if (minlen) {
07be1b83 2062 U8 op = OP(prog->regstclass);
66e933ab 2063 /* don't bother with what can't match */
07be1b83 2064 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2065 strend = HOPc(strend, -(minlen - 1));
2066 }
a3621e74 2067 DEBUG_EXECUTE_r({
ffc61ed2 2068 SV *prop = sv_newmortal();
cfd0369c
NC
2069 const char *s0;
2070 const char *s1;
9e55ce06
JH
2071 int len0;
2072 int len1;
2073
32fc9b6a 2074 regprop(prog, prop, c);
9e55ce06 2075 s0 = UTF ?
3f7c398e 2076 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2077 UNI_DISPLAY_REGEX) :
cfd0369c 2078 SvPVX_const(prop);
9e55ce06
JH
2079 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2080 s1 = UTF ?
c728cb41 2081 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2082 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2083 PerlIO_printf(Perl_debug_log,
07be1b83 2084 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2085 len0, len0, s0,
07be1b83 2086 len1, len1, s1, (int)(strend - s));
ffc61ed2 2087 });
3b0527fe 2088 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2089 goto got_it;
07be1b83 2090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2091 }
2092 else {
2093 dontbother = 0;
a0714e2c 2094 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2095 /* Trim the end. */
d6a28714 2096 char *last;
33b8afdf
JH
2097 SV* float_real;
2098
2099 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2100 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2101 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2102
2103 if (flags & REXEC_SCREAM) {
33b8afdf 2104 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2105 end_shift, &scream_pos, 1); /* last one */
2106 if (!last)
ffc61ed2 2107 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2108 /* we may be pointing at the wrong string */
2109 else if (RX_MATCH_COPIED(prog))
3f7c398e 2110 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2111 }
d6a28714
JH
2112 else {
2113 STRLEN len;
cfd0369c 2114 const char * const little = SvPV_const(float_real, len);
d6a28714 2115
33b8afdf 2116 if (SvTAIL(float_real)) {
d6a28714
JH
2117 if (memEQ(strend - len + 1, little, len - 1))
2118 last = strend - len + 1;
7fba1cd6 2119 else if (!multiline)
9041c2e3 2120 last = memEQ(strend - len, little, len)
bd61b366 2121 ? strend - len : NULL;
b8c5462f 2122 else
d6a28714
JH
2123 goto find_last;
2124 } else {
2125 find_last:
9041c2e3 2126 if (len)
d6a28714 2127 last = rninstr(s, strend, little, little + len);
b8c5462f 2128 else
a0288114 2129 last = strend; /* matching "$" */
b8c5462f 2130 }
b8c5462f 2131 }
bf93d4cc 2132 if (last == NULL) {
a3621e74 2133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2134 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2135 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2136 goto phooey; /* Should not happen! */
2137 }
d6a28714
JH
2138 dontbother = strend - last + prog->float_min_offset;
2139 }
2140 if (minlen && (dontbother < minlen))
2141 dontbother = minlen - 1;
2142 strend -= dontbother; /* this one's always in bytes! */
2143 /* We don't know much -- general case. */
1aa99e6b 2144 if (do_utf8) {
d6a28714 2145 for (;;) {
3b0527fe 2146 if (regtry(&reginfo, s))
d6a28714
JH
2147 goto got_it;
2148 if (s >= strend)
2149 break;
b8c5462f 2150 s += UTF8SKIP(s);
d6a28714
JH
2151 };
2152 }
2153 else {
2154 do {
3b0527fe 2155 if (regtry(&reginfo, s))
d6a28714
JH
2156 goto got_it;
2157 } while (s++ < strend);
2158 }
2159 }
2160
2161 /* Failure. */
2162 goto phooey;
2163
2164got_it:
2165 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2166
2167 if (PL_reg_eval_set) {
2168 /* Preserve the current value of $^R */
2169 if (oreplsv != GvSV(PL_replgv))
2170 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2171 restored, the value remains
2172 the same. */
4f639d21 2173 restore_pos(aTHX_ prog);
d6a28714
JH
2174 }
2175
2176 /* make sure $`, $&, $', and $digit will work later */
2177 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2178 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2179 if (flags & REXEC_COPY_STR) {
2180 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2181#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2182 if ((SvIsCOW(sv)
2183 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2184 if (DEBUG_C_TEST) {
2185 PerlIO_printf(Perl_debug_log,
2186 "Copy on write: regexp capture, type %d\n",
2187 (int) SvTYPE(sv));
2188 }
2189 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2190 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2191 assert (SvPOKp(prog->saved_copy));
2192 } else
2193#endif
2194 {
2195 RX_MATCH_COPIED_on(prog);
2196 s = savepvn(strbeg, i);
2197 prog->subbeg = s;
2198 }
d6a28714 2199 prog->sublen = i;
d6a28714
JH
2200 }
2201 else {
2202 prog->subbeg = strbeg;
2203 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2204 }
2205 }
9041c2e3 2206
d6a28714
JH
2207 return 1;
2208
2209phooey:
a3621e74 2210 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2211 PL_colors[4], PL_colors[5]));
d6a28714 2212 if (PL_reg_eval_set)
4f639d21 2213 restore_pos(aTHX_ prog);
d6a28714
JH
2214 return 0;
2215}
2216
2217/*
2218 - regtry - try match at specific point
2219 */
2220STATIC I32 /* 0 failure, 1 success */
3b0527fe 2221S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2222{
97aff369 2223 dVAR;
d6a28714
JH
2224 register I32 *sp;
2225 register I32 *ep;
2226 CHECKPOINT lastcp;
3b0527fe 2227 regexp *prog = reginfo->prog;
a3621e74 2228 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2229
02db2b7b
IZ
2230#ifdef DEBUGGING
2231 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2232#endif
d6a28714
JH
2233 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2234 MAGIC *mg;
2235
2236 PL_reg_eval_set = RS_init;
a3621e74 2237 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2238 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2239 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2240 ));
e8347627 2241 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2242 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2243 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2244 SAVETMPS;
2245 /* Apparently this is not needed, judging by wantarray. */
e8347627 2246 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2247 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2248
3b0527fe 2249 if (reginfo->sv) {
d6a28714 2250 /* Make $_ available to executed code. */
3b0527fe 2251 if (reginfo->sv != DEFSV) {
59f00321 2252 SAVE_DEFSV;
3b0527fe 2253 DEFSV = reginfo->sv;
b8c5462f 2254 }
d6a28714 2255
3b0527fe
DM
2256 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2257 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2258 /* prepare for quick setting of pos */
d300d9fa
NC
2259#ifdef PERL_OLD_COPY_ON_WRITE
2260 if (SvIsCOW(sv))
2261 sv_force_normal_flags(sv, 0);
2262#endif
3dab1dad 2263 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2264 &PL_vtbl_mglob, NULL, 0);
d6a28714 2265 mg->mg_len = -1;
b8c5462f 2266 }
d6a28714
JH
2267 PL_reg_magic = mg;
2268 PL_reg_oldpos = mg->mg_len;
4f639d21 2269 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2270 }
09687e5a 2271 if (!PL_reg_curpm) {
a02a5408 2272 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2273#ifdef USE_ITHREADS
2274 {
2275 SV* repointer = newSViv(0);
577e12cc 2276 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2277 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2278 av_push(PL_regex_padav,repointer);
2279 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2280 PL_regex_pad = AvARRAY(PL_regex_padav);
2281 }
2282#endif
2283 }
aaa362c4 2284 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2285 PL_reg_oldcurpm = PL_curpm;
2286 PL_curpm = PL_reg_curpm;
2287 if (RX_MATCH_COPIED(prog)) {
2288 /* Here is a serious problem: we cannot rewrite subbeg,
2289 since it may be needed if this match fails. Thus
2290 $` inside (?{}) could fail... */
2291 PL_reg_oldsaved = prog->subbeg;
2292 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2293#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2294 PL_nrs = prog->saved_copy;
2295#endif
d6a28714
JH
2296 RX_MATCH_COPIED_off(prog);
2297 }
2298 else
bd61b366 2299 PL_reg_oldsaved = NULL;
d6a28714
JH
2300 prog->subbeg = PL_bostr;
2301 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2302 }
973dddac 2303 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2304 PL_reginput = startpos;
2305 PL_regstartp = prog->startp;
2306 PL_regendp = prog->endp;
2307 PL_reglastparen = &prog->lastparen;
a01268b5 2308 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2309 prog->lastparen = 0;
03994de8 2310 prog->lastcloseparen = 0;
d6a28714 2311 PL_regsize = 0;
a3621e74 2312 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2313 if (PL_reg_start_tmpl <= prog->nparens) {
2314 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2315 if(PL_reg_start_tmp)
2316 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2317 else
a02a5408 2318 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2319 }
2320
2321 /* XXXX What this code is doing here?!!! There should be no need
2322 to do this again and again, PL_reglastparen should take care of
3dd2943c 2323 this! --ilya*/
dafc8851
JH
2324
2325 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2326 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2327 * PL_reglastparen), is not needed at all by the test suite
2328 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2329 * enough, for building DynaLoader, or otherwise this
2330 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2331 * will happen. Meanwhile, this code *is* needed for the
2332 * above-mentioned test suite tests to succeed. The common theme
2333 * on those tests seems to be returning null fields from matches.
2334 * --jhi */
dafc8851 2335#if 1
d6a28714
JH
2336 sp = prog->startp;
2337 ep = prog->endp;
2338 if (prog->nparens) {
097eb12c 2339 register I32 i;
eb160463 2340 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2341 *++sp = -1;
2342 *++ep = -1;
2343 }
2344 }
dafc8851 2345#endif
02db2b7b 2346 REGCP_SET(lastcp);
3b0527fe 2347 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2348 prog->endp[0] = PL_reginput - PL_bostr;
2349 return 1;
2350 }
02db2b7b 2351 REGCP_UNWIND(lastcp);
d6a28714
JH
2352 return 0;
2353}
2354
02db2b7b
IZ
2355#define RE_UNWIND_BRANCH 1
2356#define RE_UNWIND_BRANCHJ 2
2357
2358union re_unwind_t;
2359
2360typedef struct { /* XX: makes sense to enlarge it... */
2361 I32 type;
2362 I32 prev;
2363 CHECKPOINT lastcp;
2364} re_unwind_generic_t;
2365
2366typedef struct {
2367 I32 type;
2368 I32 prev;
2369 CHECKPOINT lastcp;
2370 I32 lastparen;
2371 regnode *next;
2372 char *locinput;
2373 I32 nextchr;
3a2830be 2374 int minmod;
02db2b7b
IZ
2375#ifdef DEBUGGING
2376 int regindent;
2377#endif
2378} re_unwind_branch_t;
2379
2380typedef union re_unwind_t {
2381 I32 type;
2382 re_unwind_generic_t generic;
2383 re_unwind_branch_t branch;
2384} re_unwind_t;
2385
8ba1375e
MJD
2386#define sayYES goto yes
2387#define sayNO goto no
e0f9d4a8 2388#define sayNO_ANYOF goto no_anyof
8ba1375e 2389#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2390#define sayNO_FINAL goto no_final
2391#define sayNO_SILENT goto do_no
2392#define saySAME(x) if (x) goto yes; else goto no
2393
3ab3c9b4
HS
2394#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2395#define POSCACHE_SEEN 1 /* we know what we're caching */
2396#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2397
3ab3c9b4 2398#define CACHEsayYES STMT_START { \
d8319b27 2399 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2400 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2401 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2403 } \
2404 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2405 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2406 } \
2407 else { \
3ab3c9b4
HS
2408 /* cache records failure, but this is success */ \
2409 DEBUG_r( \
2410 PerlIO_printf(Perl_debug_log, \
2411 "%*s (remove success from failure cache)\n", \
2412 REPORT_CODE_OFF+PL_regindent*2, "") \
2413 ); \
d8319b27 2414 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2415 } \
2416 } \
2417 sayYES; \
2418} STMT_END
7409bbd3 2419
3ab3c9b4 2420#define CACHEsayNO STMT_START { \
d8319b27 2421 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2422 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2423 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2425 } \
2426 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2427 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2428 } \
2429 else { \
3ab3c9b4
HS
2430 /* cache records success, but this is failure */ \
2431 DEBUG_r( \
2432 PerlIO_printf(Perl_debug_log, \
2433 "%*s (remove failure from success cache)\n", \
2434 REPORT_CODE_OFF+PL_regindent*2, "") \
2435 ); \
d8319b27 2436 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2437 } \
2438 } \
2439 sayNO; \
2440} STMT_END
2441
a3621e74
YO
2442/* this is used to determine how far from the left messages like
2443 'failed...' are printed. Currently 29 makes these messages line
2444 up with the opcode they refer to. Earlier perls used 25 which
2445 left these messages outdented making reviewing a debug output
2446 quite difficult.
2447*/
2448#define REPORT_CODE_OFF 29
2449
2450
2451/* Make sure there is a test for this +1 options in re_tests */
2452#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2453
9e137952
DM
2454/* this value indiciates that the c1/c2 "next char" test should be skipped */
2455#define CHRTEST_VOID -1000
2456
86545054
DM
2457#define SLAB_FIRST(s) (&(s)->states[0])
2458#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2459
5d9a96ca
DM
2460/* grab a new slab and return the first slot in it */
2461
2462STATIC regmatch_state *
2463S_push_slab(pTHX)
2464{
54df2634
NC
2465#if PERL_VERSION < 9
2466 dMY_CXT;
2467#endif
5d9a96ca
DM
2468 regmatch_slab *s = PL_regmatch_slab->next;
2469 if (!s) {
2470 Newx(s, 1, regmatch_slab);
2471 s->prev = PL_regmatch_slab;
2472 s->next = NULL;
2473 PL_regmatch_slab->next = s;
2474 }
2475 PL_regmatch_slab = s;
86545054 2476 return SLAB_FIRST(s);
5d9a96ca 2477}
5b47454d 2478
95b24440
DM
2479/* simulate a recursive call to regmatch */
2480
2481#define REGMATCH(ns, where) \
5d9a96ca
DM
2482 st->scan = scan; \
2483 scan = (ns); \
2484 st->resume_state = resume_##where; \
95b24440
DM
2485 goto start_recurse; \
2486 resume_point_##where:
2487
aa283a38
DM
2488
2489/* push a new regex state. Set newst to point to it */
2490
2491#define PUSH_STATE(newst, resume) \
2492 depth++; \
2493 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2494 st->scan = scan; \
2495 st->next = next; \
2496 st->n = n; \
2497 st->locinput = locinput; \
2498 st->resume_state = resume; \
2499 newst = st+1; \
86545054 2500 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2501 newst = S_push_slab(aTHX); \
2502 PL_regmatch_state = newst; \
2503 newst->cc = 0; \
2504 newst->minmod = 0; \
2505 newst->sw = 0; \
2506 newst->logical = 0; \
2507 newst->unwind = 0; \
2508 locinput = PL_reginput; \
2509 nextchr = UCHARAT(locinput);
2510
2511#define POP_STATE \
2512 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2513 depth--; \
2514 st--; \
86545054 2515 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2516 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2517 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2518 } \
2519 PL_regmatch_state = st; \
2520 scan = st->scan; \
2521 next = st->next; \
2522 n = st->n; \
2523 locinput = st->locinput; \
2524 nextchr = UCHARAT(locinput);
2525
d6a28714
JH
2526/*
2527 - regmatch - main matching routine
2528 *
2529 * Conceptually the strategy is simple: check to see whether the current
2530 * node matches, call self recursively to see whether the rest matches,
2531 * and then act accordingly. In practice we make some effort to avoid
2532 * recursion, in particular by going through "ordinary" nodes (that don't
2533 * need to know whether the rest of the match failed) by a loop instead of
2534 * by recursion.
2535 */
2536/* [lwall] I've hoisted the register declarations to the outer block in order to
2537 * maybe save a little bit of pushing and popping on the stack. It also takes
2538 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2539 *
2540 * This function used to be heavily recursive, but since this had the
2541 * effect of blowing the CPU stack on complex regexes, it has been
2542 * restructured to be iterative, and to save state onto the heap rather
2543 * than the stack. Essentially whereever regmatch() used to be called, it
2544 * pushes the current state, notes where to return, then jumps back into
2545 * the main loop.
2546 *
2547 * Originally the structure of this function used to look something like
2548
2549 S_regmatch() {
2550 int a = 1, b = 2;
2551 ...
2552 while (scan != NULL) {
5d9a96ca 2553 a++; // do stuff with a and b
95b24440
DM
2554 ...
2555 switch (OP(scan)) {
2556 case FOO: {
2557 int local = 3;
2558 ...
2559 if (regmatch(...)) // recurse
2560 goto yes;
2561 }
2562 ...
2563 }
2564 }
2565 yes:
2566 return 1;
2567 }
2568
2569 * Now it looks something like this:
2570
5d9a96ca 2571 typedef struct {
95b24440
DM
2572 int a, b, local;
2573 int resume_state;
5d9a96ca 2574 } regmatch_state;
95b24440
DM
2575
2576 S_regmatch() {
5d9a96ca
DM
2577 regmatch_state *st = new();
2578 int depth=0;
2579 st->a++; // do stuff with a and b
95b24440
DM
2580 ...
2581 while (scan != NULL) {
2582 ...
2583 switch (OP(scan)) {
2584 case FOO: {
5d9a96ca 2585 st->local = 3;
95b24440 2586 ...
5d9a96ca
DM
2587 st->scan = scan;
2588 scan = ...;
2589 st->resume_state = resume_FOO;
2590 goto start_recurse; // recurse
95b24440 2591
5d9a96ca
DM
2592 resume_point_FOO:
2593 if (result)
95b24440
DM
2594 goto yes;
2595 }
2596 ...
2597 }
5d9a96ca
DM
2598 start_recurse:
2599 st = new(); push a new state
2600 st->a = 1; st->b = 2;
2601 depth++;
95b24440 2602 }
5d9a96ca 2603 yes:
95b24440 2604 result = 1;
5d9a96ca
DM
2605 if (depth--) {
2606 st = pop();
95b24440
DM
2607 switch (resume_state) {
2608 case resume_FOO:
2609 goto resume_point_FOO;
2610 ...
2611 }
2612 }
2613 return result
2614 }
2615
2616 * WARNING: this means that any line in this function that contains a
2617 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2618 * regmatch() using gotos instead. Thus the values of any local variables
2619 * not saved in the regmatch_state structure will have been lost when
2620 * execution resumes on the next line .
5d9a96ca
DM
2621 *
2622 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2623 * PL_regmatch_state always points to the currently active state, and
2624 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2625 * The first time regmatch is called, the first slab is allocated, and is
2626 * never freed until interpreter desctruction. When the slab is full,
2627 * a new one is allocated chained to the end. At exit from regmatch, slabs
2628 * allocated since entry are freed.
d6a28714 2629 */
95b24440 2630
3dab1dad 2631#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2632
07be1b83
YO
2633#ifdef DEBUGGING
2634STATIC void
2635S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2636{
2637 const int docolor = *PL_colors[0];
2638 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2639 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2640 /* The part of the string before starttry has one color
2641 (pref0_len chars), between starttry and current
2642 position another one (pref_len - pref0_len chars),
2643 after the current position the third one.
2644 We assume that pref0_len <= pref_len, otherwise we
2645 decrease pref0_len. */
2646 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2647 ? (5 + taill) - l : locinput - PL_bostr;
2648 int pref0_len;
2649
2650 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2651 pref_len++;
2652 pref0_len = pref_len - (locinput - PL_reg_starttry);
2653 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2654 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2655 ? (5 + taill) - pref_len : PL_regeol - locinput);
2656 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2657 l--;
2658 if (pref0_len < 0)
2659 pref0_len = 0;
2660 if (pref0_len > pref_len)
2661 pref0_len = pref_len;
2662 {
2663 const char * const s0 =
2664 do_utf8 && OP(scan) != CANY ?
2665 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2666 pref0_len, 60, UNI_DISPLAY_REGEX) :
2667 locinput - pref_len;
2668 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2669 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2670 pv_uni_display(PERL_DEBUG_PAD(1),
2671 (U8*)(locinput - pref_len + pref0_len),
2672 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2673 locinput - pref_len + pref0_len;
2674 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2675 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2676 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2677 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2678 locinput;
2679 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2680 PerlIO_printf(Perl_debug_log,
2681 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2682 (IV)(locinput - PL_bostr),
2683 PL_colors[4],
2684 len0, s0,
2685 PL_colors[5],
2686 PL_colors[2],
2687 len1, s1,
2688 PL_colors[3],
2689 (docolor ? "" : "> <"),
2690 PL_colors[0],
2691 len2, s2,
2692 PL_colors[1],
2693 15 - l - pref_len + 1,
2694 "");
2695 }
2696}
2697#endif
2698
d6a28714 2699STATIC I32 /* 0 failure, 1 success */
3b0527fe 2700S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2701{
54df2634
NC
2702#if PERL_VERSION < 9
2703 dMY_CXT;
2704#endif
27da23d5 2705 dVAR;
95b24440 2706 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2707 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2708
3b0527fe
DM
2709 regexp *rex = reginfo->prog;
2710
5d9a96ca
DM
2711 regmatch_slab *orig_slab;
2712 regmatch_state *orig_state;
a3621e74 2713
5d9a96ca
DM
2714 /* the current state. This is a cached copy of PL_regmatch_state */
2715 register regmatch_state *st;
95b24440 2716
5d9a96ca
DM
2717 /* cache heavy used fields of st in registers */
2718 register regnode *scan;
2719 register regnode *next;
2720 register I32 n = 0; /* initialize to shut up compiler warning */
2721 register char *locinput = PL_reginput;
95b24440 2722
5d9a96ca
DM
2723 /* these variables are NOT saved during a recusive RFEGMATCH: */
2724 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2725 bool result; /* return value of S_regmatch */
2726 regnode *inner; /* Next node in internal branch. */
2727 int depth = 0; /* depth of recursion */
aa283a38 2728 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2730 subpattern */
95b24440
DM
2731
2732#ifdef DEBUGGING
ab74612d 2733 SV *re_debug_flags = NULL;
a3621e74 2734 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2735 PL_regindent++;
2736#endif
2737
5d9a96ca
DM
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
86545054 2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2744 }
2745
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2750
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
86545054 2753 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2755
2756 st->minmod = 0;
2757 st->sw = 0;
2758 st->logical = 0;
2759 st->unwind = 0;
2760 st->cc = NULL;
d6a28714
JH
2761 /* Note that nextchr is a byte even in UTF */
2762 nextchr = UCHARAT(locinput);
2763 scan = prog;
2764 while (scan != NULL) {
8ba1375e 2765
a3621e74 2766 DEBUG_EXECUTE_r( {
6136c704 2767 SV * const prop = sv_newmortal();
07be1b83 2768 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2769 regprop(rex, prop, scan);
07be1b83
YO
2770
2771 PerlIO_printf(Perl_debug_log,
2772 "%3"IVdf":%*s%s(%"IVdf")\n",
2773 (IV)(scan - rex->program), PL_regindent*2, "",
2774 SvPVX_const(prop),
2775 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2776 });
d6a28714
JH
2777
2778 next = scan + NEXT_OFF(scan);
2779 if (next == scan)
2780 next = NULL;
2781
2782 switch (OP(scan)) {
2783 case BOL:
7fba1cd6 2784 if (locinput == PL_bostr)
d6a28714 2785 {
3b0527fe 2786 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2787 break;
2788 }
d6a28714
JH
2789 sayNO;
2790 case MBOL:
12d33761
HS
2791 if (locinput == PL_bostr ||
2792 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2793 {
b8c5462f
JH
2794 break;
2795 }
d6a28714
JH
2796 sayNO;
2797 case SBOL:
c2a73568 2798 if (locinput == PL_bostr)
b8c5462f 2799 break;
d6a28714
JH
2800 sayNO;
2801 case GPOS:
3b0527fe 2802 if (locinput == reginfo->ganch)
d6a28714
JH
2803 break;
2804 sayNO;
2805 case EOL:
d6a28714
JH
2806 goto seol;
2807 case MEOL:
d6a28714 2808 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2809 sayNO;
b8c5462f 2810 break;
d6a28714
JH
2811 case SEOL:
2812 seol:
2813 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2814 sayNO;
d6a28714 2815 if (PL_regeol - locinput > 1)
b8c5462f 2816 sayNO;
b8c5462f 2817 break;
d6a28714
JH
2818 case EOS:
2819 if (PL_regeol != locinput)
b8c5462f 2820 sayNO;
d6a28714 2821 break;
ffc61ed2 2822 case SANY:
d6a28714 2823 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2824 sayNO;
f33976b4
DB
2825 if (do_utf8) {
2826 locinput += PL_utf8skip[nextchr];
2827 if (locinput > PL_regeol)
2828 sayNO;
2829 nextchr = UCHARAT(locinput);
2830 }
2831 else
2832 nextchr = UCHARAT(++locinput);
2833 break;
2834 case CANY:
2835 if (!nextchr && locinput >= PL_regeol)
2836 sayNO;
b8c5462f 2837 nextchr = UCHARAT(++locinput);
a0d0e21e 2838 break;
ffc61ed2 2839 case REG_ANY:
1aa99e6b
IH
2840 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2841 sayNO;
2842 if (do_utf8) {
b8c5462f 2843 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2844 if (locinput > PL_regeol)
2845 sayNO;
a0ed51b3 2846 nextchr = UCHARAT(locinput);
a0ed51b3 2847 }
1aa99e6b
IH
2848 else
2849 nextchr = UCHARAT(++locinput);
a0ed51b3 2850 break;
5b47454d 2851 case TRIE:
3dab1dad 2852 {
07be1b83 2853 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2854 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2855 trie_type = do_utf8 ?
2856 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2857 : trie_plain;
2858
2859 /* what trie are we using right now */
2860 reg_trie_data *trie
2861 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2862 U32 state = trie->startstate;
2863
2864 if (trie->bitmap && trie_type != trie_utf8_fold &&
2865 !TRIE_BITMAP_TEST(trie,*locinput)
2866 ) {
2867 if (trie->states[ state ].wordnum) {
2868 DEBUG_EXECUTE_r(
2869 PerlIO_printf(Perl_debug_log,
2870 "%*s %smatched empty string...%s\n",
2871 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2872 );
2873 break;
2874 } else {
2875 DEBUG_EXECUTE_r(
2876 PerlIO_printf(Perl_debug_log,
2877 "%*s %sfailed to match start class...%s\n",
2878 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2879 );
2880 sayNO_SILENT;
2881 }
2882 }
a3621e74 2883 {
07be1b83
YO
2884 /*
2885 traverse the TRIE keeping track of all accepting states
2886 we transition through until we get to a failing node.
2887 */
2888
a3621e74 2889 U8 *uc = ( U8* )locinput;
a3621e74
YO
2890 U16 charid = 0;
2891 U32 base = 0;
2892 UV uvc = 0;
2893 STRLEN len = 0;
2894 STRLEN foldlen = 0;
a3621e74
YO
2895 U8 *uscan = (U8*)NULL;
2896 STRLEN bufflen=0;
95b24440 2897 SV *sv_accept_buff = NULL;
5b47454d 2898
d8319b27 2899 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2900 result = 0;
a3621e74
YO
2901
2902 while ( state && uc <= (U8*)PL_regeol ) {
2903
5b47454d 2904 if (trie->states[ state ].wordnum) {
d8319b27 2905 if (!st->u.trie.accepted ) {
5b47454d
DM
2906 ENTER;
2907 SAVETMPS;
2908 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2909 sv_accept_buff=newSV(bufflen *
2910 sizeof(reg_trie_accepted) - 1);
2911 SvCUR_set(sv_accept_buff,
2912 sizeof(reg_trie_accepted));
2913 SvPOK_on(sv_accept_buff);
2914 sv_2mortal(sv_accept_buff);
d8319b27 2915 st->u.trie.accept_buff =
5b47454d
DM
2916 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2917 }
2918 else {
d8319b27 2919 if (st->u.trie.accepted >= bufflen) {
5b47454d 2920 bufflen *= 2;
d8319b27 2921 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2922 SvGROW(sv_accept_buff,
2923 bufflen * sizeof(reg_trie_accepted));
2924 }
2925 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2926 + sizeof(reg_trie_accepted));
2927 }
d8319b27
DM
2928 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2929 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2930 ++st->u.trie.accepted;
5b47454d 2931 }
a3621e74
YO
2932
2933 base = trie->states[ state ].trans.base;
2934
07be1b83
YO
2935 DEBUG_TRIE_EXECUTE_r({
2936 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2937 PerlIO_printf( Perl_debug_log,
e4584336 2938 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2939 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2940 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2941 });
a3621e74
YO
2942
2943 if ( base ) {
5b47454d 2944 switch (trie_type) {
3dab1dad 2945 case trie_utf8_fold:
a3621e74
YO
2946 if ( foldlen>0 ) {
2947 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2948 foldlen -= len;
2949 uscan += len;
2950 len=0;
2951 } else {
1df70142 2952 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2953 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2954 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2955 foldlen -= UNISKIP( uvc );
2956 uscan = foldbuf + UNISKIP( uvc );
2957 }
5b47454d
DM
2958 break;
2959 case trie_utf8:
2960 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2961 &len, uniflags );
2962 break;
2963 case trie_plain:
e4584336 2964 uvc = (UV)*uc;
a3621e74
YO
2965 len = 1;
2966 }
2967
5b47454d
DM
2968 if (uvc < 256) {
2969 charid = trie->charmap[ uvc ];
2970 }
2971 else {
2972 charid = 0;
2973 if (trie->widecharmap) {
3dab1dad 2974 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2975 (char*)&uvc, sizeof(UV), 0);
2976 if (svpp)
2977 charid = (U16)SvIV(*svpp);
2978 }
2979 }
a3621e74 2980
5b47454d
DM
2981 if (charid &&
2982 (base + charid > trie->uniquecharcount )
2983 && (base + charid - 1 - trie->uniquecharcount
2984 < trie->lasttrans)
2985 && trie->trans[base + charid - 1 -
2986 trie->uniquecharcount].check == state)
2987 {
2988 state = trie->trans[base + charid - 1 -
2989 trie->uniquecharcount ].next;
2990 }
2991 else {
2992 state = 0;
2993 }
2994 uc += len;
2995
2996 }
2997 else {
a3621e74
YO
2998 state = 0;
2999 }
3000 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3001 PerlIO_printf( Perl_debug_log,
3002 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3003 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3004 );
3005 }
d8319b27 3006 if (!st->u.trie.accepted )
a3621e74 3007 sayNO;
a3621e74
YO
3008
3009 /*
3010 There was at least one accepting state that we
3011 transitioned through. Presumably the number of accepting
3012 states is going to be low, typically one or two. So we
3013 simply scan through to find the one with lowest wordnum.
3014 Once we find it, we swap the last state into its place
3015 and decrement the size. We then try to match the rest of
3016 the pattern at the point where the word ends, if we
3017 succeed then we end the loop, otherwise the loop
3018 eventually terminates once all of the accepting states
3019 have been tried.
3020 */
a3621e74 3021
d8319b27 3022 if ( st->u.trie.accepted == 1 ) {
f2278c82
YO
3023 DEBUG_EXECUTE_r({
3024 SV ** const tmp = RX_DEBUG(reginfo->prog)
3025 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3026 : NULL;
3027 PerlIO_printf( Perl_debug_log,
3028 "%*s %sonly one match : #%d <%s>%s\n",
3029 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3030 st->u.trie.accept_buff[ 0 ].wordnum,
3031 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3032 PL_colors[5] );
3033 });
d8319b27 3034 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3035 /* in this case we free tmps/leave before we call regmatch
3036 as we wont be using accept_buff again. */
3037 FREETMPS;
3038 LEAVE;
07be1b83 3039 /* do we need this? why dont we just do a break? */
95b24440
DM
3040 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3041 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3042 } else {
3043 DEBUG_EXECUTE_r(
e4584336 3044 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3045 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3046 PL_colors[5] );
3047 );
d8319b27 3048 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3049 U32 best = 0;
3050 U32 cur;
d8319b27 3051 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3052 DEBUG_TRIE_EXECUTE_r(
3053 PerlIO_printf( Perl_debug_log,
3054 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3055 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3056 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3057 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3058 );
a3621e74 3059
d8319b27
DM
3060 if (st->u.trie.accept_buff[cur].wordnum <
3061 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3062 best = cur;
a3621e74 3063 }
f2278c82
YO
3064 DEBUG_EXECUTE_r({
3065 reg_trie_data * const trie = (reg_trie_data*)
3066 rex->data->data[ARG(scan)];
3067 SV ** const tmp = RX_DEBUG(reginfo->prog)
3068 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3069 : NULL;
3070 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3071 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3072 st->u.trie.accept_buff[best].wordnum,
3073 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3074 PL_colors[5] );
3075 });
d8319b27
DM
3076 if ( best<st->u.trie.accepted ) {
3077 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3078 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3079 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3080 best = st->u.trie.accepted;
a3621e74 3081 }
d8319b27 3082 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3083
3084 /*
3085 as far as I can tell we only need the SAVETMPS/FREETMPS
3086 for re's with EVAL in them but I'm leaving them in for
3087 all until I can be sure.
3088 */
3089 SAVETMPS;
95b24440
DM
3090 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3091 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3092 FREETMPS;
3093 }
3094 FREETMPS;
3095 LEAVE;
3096 }
3097
95b24440 3098 if (result) {
a3621e74
YO
3099 sayYES;
3100 } else {
3101 sayNO;
3102 }
3dab1dad 3103 }}
a3621e74 3104 /* unreached codepoint */
95b24440
DM
3105 case EXACT: {
3106 char *s = STRING(scan);
5d9a96ca 3107 st->ln = STR_LEN(scan);
eb160463 3108 if (do_utf8 != UTF) {
bc517b45 3109 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3110 char *l = locinput;
5d9a96ca 3111 const char *e = s + st->ln;
a72c7584 3112
5ff6fc6d
JH
3113 if (do_utf8) {
3114 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3115 while (s < e) {
a3b680e6 3116 STRLEN ulen;
1aa99e6b 3117 if (l >= PL_regeol)
5ff6fc6d
JH
3118 sayNO;
3119 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3120 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3121 uniflags))
5ff6fc6d 3122 sayNO;
bc517b45 3123 l += ulen;
5ff6fc6d 3124 s ++;
1aa99e6b 3125 }
5ff6fc6d
JH
3126 }
3127 else {
3128 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3129 while (s < e) {
a3b680e6 3130 STRLEN ulen;
1aa99e6b
IH
3131 if (l >= PL_regeol)
3132 sayNO;
5ff6fc6d 3133 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3134 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3135 uniflags))
1aa99e6b 3136 sayNO;
bc517b45 3137 s += ulen;
a72c7584 3138 l ++;
1aa99e6b 3139 }
5ff6fc6d 3140 }
1aa99e6b
IH
3141 locinput = l;
3142 nextchr = UCHARAT(locinput);
3143 break;
3144 }
bc517b45 3145 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3146 /* Inline the first character, for speed. */
3147 if (UCHARAT(s) != nextchr)
3148 sayNO;
5d9a96ca 3149 if (PL_regeol - locinput < st->ln)
d6a28714 3150 sayNO;
5d9a96ca 3151 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3152 sayNO;
5d9a96ca 3153 locinput += st->ln;
d6a28714
JH
3154 nextchr = UCHARAT(locinput);
3155 break;
95b24440 3156 }
d6a28714 3157 case EXACTFL:
b8c5462f
JH
3158 PL_reg_flags |= RF_tainted;
3159 /* FALL THROUGH */
95b24440
DM
3160 case EXACTF: {
3161 char *s = STRING(scan);
5d9a96ca 3162 st->ln = STR_LEN(scan);
d6a28714 3163
d07ddd77
JH
3164 if (do_utf8 || UTF) {
3165 /* Either target or the pattern are utf8. */
d6a28714 3166 char *l = locinput;
d07ddd77 3167 char *e = PL_regeol;
bc517b45 3168
5d9a96ca 3169 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3170 l, &e, 0, do_utf8)) {
5486206c
JH
3171 /* One more case for the sharp s:
3172 * pack("U0U*", 0xDF) =~ /ss/i,
3173 * the 0xC3 0x9F are the UTF-8
3174 * byte sequence for the U+00DF. */
3175 if (!(do_utf8 &&
3176 toLOWER(s[0]) == 's' &&
5d9a96ca 3177 st->ln >= 2 &&
5486206c
JH
3178 toLOWER(s[1]) == 's' &&
3179 (U8)l[0] == 0xC3 &&
3180 e - l >= 2 &&
3181 (U8)l[1] == 0x9F))
3182 sayNO;
3183 }
d07ddd77
JH
3184 locinput = e;
3185 nextchr = UCHARAT(locinput);
3186 break;
a0ed51b3 3187 }
d6a28714 3188
bc517b45
JH
3189 /* Neither the target and the pattern are utf8. */
3190
d6a28714
JH
3191 /* Inline the first character, for speed. */
3192 if (UCHARAT(s) != nextchr &&
3193 UCHARAT(s) != ((OP(scan) == EXACTF)
3194 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3195 sayNO;
5d9a96ca 3196 if (PL_regeol - locinput < st->ln)
b8c5462f 3197 sayNO;
5d9a96ca
DM
3198 if (st->ln > 1 && (OP(scan) == EXACTF
3199 ? ibcmp(s, locinput, st->ln)
3200 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3201 sayNO;
5d9a96ca 3202 locinput += st->ln;
d6a28714 3203 nextchr = UCHARAT(locinput);
a0d0e21e 3204 break;
95b24440 3205 }
d6a28714 3206 case ANYOF:
ffc61ed2 3207 if (do_utf8) {
9e55ce06
JH
3208 STRLEN inclasslen = PL_regeol - locinput;
3209
32fc9b6a 3210 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3211 sayNO_ANYOF;
ffc61ed2
JH
3212 if (locinput >= PL_regeol)
3213 sayNO;
0f0076b4 3214 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3215 nextchr = UCHARAT(locinput);
e0f9d4a8 3216 break;
ffc61ed2
JH
3217 }
3218 else {
3219 if (nextchr < 0)
3220 nextchr = UCHARAT(locinput);
32fc9b6a 3221 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3222 sayNO_ANYOF;
ffc61ed2
JH
3223 if (!nextchr && locinput >= PL_regeol)
3224 sayNO;
3225 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3226 break;
3227 }
3228 no_anyof:
3229 /* If we might have the case of the German sharp s
3230 * in a casefolding Unicode character class. */
3231
ebc501f0
JH
3232 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3233 locinput += SHARP_S_SKIP;
e0f9d4a8 3234 nextchr = UCHARAT(locinput);
ffc61ed2 3235 }
e0f9d4a8
JH
3236 else
3237 sayNO;
b8c5462f 3238 break;
d6a28714 3239 case ALNUML:
b8c5462f
JH
3240 PL_reg_flags |= RF_tainted;
3241 /* FALL THROUGH */
d6a28714 3242 case ALNUM:
b8c5462f 3243 if (!nextchr)
4633a7c4 3244 sayNO;
ffc61ed2 3245 if (do_utf8) {
1a4fad37 3246 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3247 if (!(OP(scan) == ALNUM
bb7a0f54 3248 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3249 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3250 {
3251 sayNO;
a0ed51b3 3252 }
b8c5462f 3253 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3254 nextchr = UCHARAT(locinput);
3255 break;
3256 }
ffc61ed2 3257 if (!(OP(scan) == ALNUM
d6a28714 3258 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3259 sayNO;
b8c5462f 3260 nextchr = UCHARAT(++locinput);
a0d0e21e 3261 break;
d6a28714 3262 case NALNUML:
b8c5462f
JH
3263 PL_reg_flags |= RF_tainted;
3264 /* FALL THROUGH */
d6a28714
JH
3265 case NALNUM:
3266 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3267 sayNO;
ffc61ed2 3268 if (do_utf8) {
1a4fad37 3269 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3270 if (OP(scan) == NALNUM
bb7a0f54 3271 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3272 : isALNUM_LC_utf8((U8*)locinput))
3273 {
b8c5462f 3274 sayNO;
d6a28714 3275 }
b8c5462f
JH
3276 locinput += PL_utf8skip[nextchr];
3277 nextchr = UCHARAT(locinput);
3278 break;
3279 }
ffc61ed2 3280 if (OP(scan) == NALNUM
d6a28714 3281 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3282 sayNO;
76e3520e 3283 nextchr = UCHARAT(++locinput);
a0d0e21e 3284 break;
d6a28714
JH
3285 case BOUNDL:
3286 case NBOUNDL:
3280af22 3287 PL_reg_flags |= RF_tainted;
bbce6d69 3288 /* FALL THROUGH */
d6a28714
JH
3289 case BOUND:
3290 case NBOUND:
3291 /* was last char in word? */
ffc61ed2 3292 if (do_utf8) {
12d33761 3293 if (locinput == PL_bostr)
5d9a96ca 3294 st->ln = '\n';
ffc61ed2 3295 else {
a3b680e6 3296 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3297
4ad0818d 3298 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3299 }
3300 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3301 st->ln = isALNUM_uni(st->ln);
1a4fad37 3302 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3303 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3304 }
3305 else {
5d9a96ca 3306 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3307 n = isALNUM_LC_utf8((U8*)locinput);
3308 }
a0ed51b3 3309 }
d6a28714 3310 else {
5d9a96ca 3311 st->ln = (locinput != PL_bostr) ?
12d33761 3312 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3313 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3314 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3315 n = isALNUM(nextchr);
3316 }
3317 else {
5d9a96ca 3318 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3319 n = isALNUM_LC(nextchr);
3320 }
d6a28714 3321 }
5d9a96ca 3322 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3323 OP(scan) == BOUNDL))
3324 sayNO;
a0ed51b3 3325 break;
d6a28714 3326 case SPACEL:
3280af22 3327 PL_reg_flags |= RF_tainted;
bbce6d69 3328 /* FALL THROUGH */
d6a28714 3329 case SPACE:
9442cb0e 3330 if (!nextchr)
4633a7c4 3331 sayNO;
1aa99e6b 3332 if (do_utf8) {
fd400ab9 3333 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3334 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3335 if (!(OP(scan) == SPACE
bb7a0f54 3336 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3337 : isSPACE_LC_utf8((U8*)locinput)))
3338 {
3339 sayNO;
3340 }
3341 locinput += PL_utf8skip[nextchr];
3342 nextchr = UCHARAT(locinput);
3343 break;
d6a28714 3344 }
ffc61ed2
JH
3345 if (!(OP(scan) == SPACE
3346 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3347 sayNO;
3348 nextchr = UCHARAT(++locinput);
3349 }
3350 else {
3351 if (!(OP(scan) == SPACE
3352 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3353 sayNO;
3354 nextchr = UCHARAT(++locinput);
a0ed51b3 3355 }
a0ed51b3 3356 break;
d6a28714 3357 case NSPACEL:
3280af22 3358 PL_reg_flags |= RF_tainted;
bbce6d69 3359 /* FALL THROUGH */
d6a28714 3360 case NSPACE:
9442cb0e 3361 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3362 sayNO;
1aa99e6b 3363 if (do_utf8) {
1a4fad37 3364 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3365 if (OP(scan) == NSPACE
bb7a0f54 3366 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3367 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3368 {
3369 sayNO;
3370 }
3371 locinput += PL_utf8skip[nextchr];
3372 nextchr = UCHARAT(locinput);
3373 break;
a0ed51b3 3374 }
ffc61ed2 3375 if (OP(scan) == NSPACE
d6a28714 3376 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3377 sayNO;
76e3520e 3378 nextchr = UCHARAT(++locinput);
a0d0e21e 3379 break;
d6a28714 3380 case DIGITL:
a0ed51b3
LW
3381 PL_reg_flags |= RF_tainted;
3382 /* FALL THROUGH */
d6a28714 3383 case DIGIT:
9442cb0e 3384 if (!nextchr)
a0ed51b3 3385 sayNO;
1aa99e6b 3386 if (do_utf8) {
1a4fad37 3387 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3388 if (!(OP(scan) == DIGIT
bb7a0f54 3389 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3390 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3391 {
a0ed51b3 3392 sayNO;
dfe13c55 3393 }
6f06b55f 3394 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3395 nextchr = UCHARAT(locinput);
3396 break;
3397 }
ffc61ed2 3398 if (!(OP(scan) == DIGIT
9442cb0e 3399 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3400 sayNO;
3401 nextchr = UCHARAT(++locinput);
3402 break;
d6a28714 3403 case NDIGITL:
b8c5462f
JH
3404 PL_reg_flags |= RF_tainted;
3405 /* FALL THROUGH */
d6a28714 3406 case NDIGIT:
9442cb0e 3407 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3408 sayNO;
1aa99e6b 3409 if (do_utf8) {
1a4fad37 3410 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3411 if (OP(scan) == NDIGIT
bb7a0f54 3412 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3413 : isDIGIT_LC_utf8((U8*)locinput))
3414 {
a0ed51b3 3415 sayNO;
9442cb0e 3416 }
6f06b55f 3417 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3418 nextchr = UCHARAT(locinput);
3419 break;
3420 }
ffc61ed2 3421 if (OP(scan) == NDIGIT
9442cb0e 3422 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3423 sayNO;
3424 nextchr = UCHARAT(++locinput);
3425 break;
3426 case CLUMP:
b7c83a7e 3427 if (locinput >= PL_regeol)
a0ed51b3 3428 sayNO;
b7c83a7e 3429 if (do_utf8) {
1a4fad37 3430 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3431 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3432 sayNO;
3433 locinput += PL_utf8skip[nextchr];
3434 while (locinput < PL_regeol &&
3435 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3436 locinput += UTF8SKIP(locinput);
3437 if (locinput > PL_regeol)
3438 sayNO;
eb08e2da
JH
3439 }
3440 else
3441 locinput++;
a0ed51b3
LW
3442 nextchr = UCHARAT(locinput);
3443 break;
c8756f30 3444 case REFFL:
3280af22 3445 PL_reg_flags |= RF_tainted;
c8756f30 3446 /* FALL THROUGH */
c277df42 3447 case REF:
95b24440
DM
3448 case REFF: {
3449 char *s;
c277df42 3450 n = ARG(scan); /* which paren pair */
5d9a96ca 3451 st->ln = PL_regstartp[n];
2c2d71f5 3452 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3453 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3454 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3455 if (st->ln == PL_regendp[n])
a0d0e21e 3456 break;
a0ed51b3 3457
5d9a96ca 3458 s = PL_bostr + st->ln;
1aa99e6b 3459 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3460 char *l = locinput;
a3b680e6 3461 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3462 /*
3463 * Note that we can't do the "other character" lookup trick as
3464 * in the 8-bit case (no pun intended) because in Unicode we
3465 * have to map both upper and title case to lower case.
3466 */
3467 if (OP(scan) == REFF) {
3468 while (s < e) {
a3b680e6
AL
3469 STRLEN ulen1, ulen2;
3470 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3471 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3472
a0ed51b3
LW
3473 if (l >= PL_regeol)
3474 sayNO;
a2a2844f
JH
3475 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3476 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3477 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3478 sayNO;
a2a2844f
JH
3479 s += ulen1;
3480 l += ulen2;
a0ed51b3
LW
3481 }
3482 }
3483 locinput = l;
3484 nextchr = UCHARAT(locinput);
3485 break;
3486 }
3487
a0d0e21e 3488 /* Inline the first character, for speed. */
76e3520e 3489 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3490 (OP(scan) == REF ||
3491 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3492 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3493 sayNO;
5d9a96ca
DM
3494 st->ln = PL_regendp[n] - st->ln;
3495 if (locinput + st->ln > PL_regeol)
4633a7c4 3496 sayNO;
5d9a96ca
DM
3497 if (st->ln > 1 && (OP(scan) == REF
3498 ? memNE(s, locinput, st->ln)
c8756f30 3499 : (OP(scan) == REFF
5d9a96ca
DM
3500 ? ibcmp(s, locinput, st->ln)
3501 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3502 sayNO;
5d9a96ca 3503 locinput += st->ln;
76e3520e 3504 nextchr = UCHARAT(locinput);
a0d0e21e 3505 break;
95b24440 3506 }
a0d0e21e
LW
3507
3508 case NOTHING:
c277df42 3509 case TAIL:
a0d0e21e
LW
3510 break;
3511 case BACK:
3512 break;
c277df42
IZ
3513 case EVAL:
3514 {
c277df42 3515 SV *ret;
8e5e9ebe 3516 {
4aabdb9b
DM
3517 /* execute the code in the {...} */
3518 dSP;
6136c704 3519 SV ** const before = SP;
4aabdb9b
DM
3520 OP_4tree * const oop = PL_op;
3521 COP * const ocurcop = PL_curcop;
3522 PAD *old_comppad;
4aabdb9b
DM
3523
3524 n = ARG(scan);
32fc9b6a 3525 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3526 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3527 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3528 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3529
8e5e9ebe
RGS
3530 CALLRUNOPS(aTHX); /* Scalar context. */
3531 SPAGAIN;
3532 if (SP == before)
075aa684 3533 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3534 else {
3535 ret = POPs;
3536 PUTBACK;
3537 }
4aabdb9b
DM
3538
3539 PL_op = oop;
3540 PAD_RESTORE_LOCAL(old_comppad);
3541 PL_curcop = ocurcop;
3542 if (!st->logical) {
3543 /* /(?{...})/ */
3544 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3545 break;
3546 }
8e5e9ebe 3547 }
4aabdb9b
DM
3548 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3549 regexp *re;
4aabdb9b 3550 {
4f639d21
DM
3551 /* extract RE object from returned value; compiling if
3552 * necessary */
3553
6136c704 3554 MAGIC *mg = NULL;
4aabdb9b 3555 SV *sv;
faf82a0b
AE
3556 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3557 mg = mg_find(sv, PERL_MAGIC_qr);
3558 else if (SvSMAGICAL(ret)) {
3559 if (SvGMAGICAL(ret))
3560 sv_unmagic(ret, PERL_MAGIC_qr);
3561 else
3562 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3563 }
faf82a0b 3564
0f5d15d6
IZ
3565 if (mg) {
3566 re = (regexp *)mg->mg_obj;
df0003d4 3567 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3568 }
3569 else {
3570 STRLEN len;
6136c704 3571 const char * const t = SvPV_const(ret, len);
0f5d15d6 3572 PMOP pm;
a3b680e6 3573 const I32 osize = PL_regsize;
0f5d15d6 3574
5fcd1c1b 3575 Zero(&pm, 1, PMOP);
4aabdb9b 3576 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3577 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3578 if (!(SvFLAGS(ret)
faf82a0b
AE
3579 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3580 | SVs_GMG)))
14befaf4
DM
3581 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3582 PERL_MAGIC_qr,0,0);
0f5d15d6 3583 PL_regsize = osize;
0f5d15d6 3584 }
4aabdb9b 3585 }
aa283a38
DM
3586
3587 /* run the pattern returned from (??{...}) */
3588
4aabdb9b
DM
3589 DEBUG_EXECUTE_r(
3590 PerlIO_printf(Perl_debug_log,
3591 "Entering embedded \"%s%.60s%s%s\"\n",
3592 PL_colors[0],
3593 re->precomp,
3594 PL_colors[1],
3595 (strlen(re->precomp) > 60 ? "..." : ""))
3596 );
2c2d71f5 3597
4aabdb9b
DM
3598 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
3599 REGCP_SET(st->u.eval.lastcp);
4aabdb9b
DM
3600 *PL_reglastparen = 0;
3601 *PL_reglastcloseparen = 0;
4aabdb9b 3602 PL_reginput = locinput;
4aabdb9b
DM
3603
3604 /* XXXX This is too dramatic a measure... */
3605 PL_reg_maxiter = 0;
3606
5d9a96ca 3607 st->logical = 0;
aa283a38
DM
3608 st->u.eval.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3609 ((re->reganch & ROPT_UTF8) != 0);
3610 if (st->u.eval.toggleutf) PL_reg_flags ^= RF_utf8;
3611 st->u.eval.prev_rex = rex;
aa283a38 3612 rex = re;
aa283a38 3613
77cb431f
DM
3614 /* resume to current state on success */
3615 st->u.yes.prev_yes_state = yes_state;
3616 yes_state = st;
aa283a38
DM
3617 PUSH_STATE(newst, resume_EVAL);
3618 st = newst;
3619
3620 /* now continue from first node in postoned RE */
3621 next = re->program + 1;
3622 break;
4aabdb9b 3623 /* NOTREACHED */
a0ed51b3 3624 }
4aabdb9b
DM
3625 /* /(?(?{...})X|Y)/ */
3626 st->sw = SvTRUE(ret);
3627 st->logical = 0;
c277df42
IZ
3628 break;
3629 }
a0d0e21e 3630 case OPEN:
c277df42 3631 n = ARG(scan); /* which paren pair */
3280af22
NIS
3632 PL_reg_start_tmp[n] = locinput;
3633 if (n > PL_regsize)
3634 PL_regsize = n;
a0d0e21e
LW
3635 break;
3636 case CLOSE:
c277df42 3637 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3638 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3639 PL_regendp[n] = locinput - PL_bostr;
eb160463 3640 if (n > (I32)*PL_reglastparen)
3280af22 3641 *PL_reglastparen = n;
a01268b5 3642 *PL_reglastcloseparen = n;
a0d0e21e 3643 break;
c277df42
IZ
3644 case GROUPP:
3645 n = ARG(scan); /* which paren pair */
5d9a96ca 3646 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3647 break;
3648 case IFTHEN:
2c2d71f5 3649 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3650 if (st->sw)
c277df42
IZ
3651 next = NEXTOPER(NEXTOPER(scan));
3652 else {
3653 next = scan + ARG(scan);
3654 if (OP(next) == IFTHEN) /* Fake one. */
3655 next = NEXTOPER(NEXTOPER(next));
3656 }
3657 break;
3658 case LOGICAL:
5d9a96ca 3659 st->logical = scan->flags;
c277df42 3660 break;
2ab05381 3661/*******************************************************************
a0374537
DM
3662 cc points to the regmatch_state associated with the most recent CURLYX.
3663 This struct contains info about the innermost (...)* loop (an
3664 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3665
3666 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3667
95b24440 3668 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3669
a0374537 3670 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3671 with the starting point at WHILEM node;
2ab05381
IZ
3672
3673 3) Each hit of WHILEM node tries to match A and Z (in the order
3674 depending on the current iteration, min/max of {min,max} and
3675 greediness). The information about where are nodes for "A"
a0374537 3676 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3677 was already matched, and greediness.
3678
3679 4) After A matches, the same WHILEM node is hit again.
3680
95b24440 3681 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3682 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3683 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3684 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3685 of the external loop.
3686
a0374537 3687 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3688 and whatever it mentions via ->next, and additional attached trees
3689 corresponding to temporarily unset infoblocks as in "5" above.
3690
95b24440 3691 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3692 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3693 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3694 infoblocks are drawn below the "reset" infoblock.
3695
3696 In fact in the picture below we do not show failed matches for Z and T
3697 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3698 more obvious *why* one needs to *temporary* unset infoblocks.]
3699
3700 Matched REx position InfoBlocks Comment
3701 (Y(A)*?Z)*?T x
3702 Y(A)*?Z)*?T x <- O
3703 Y (A)*?Z)*?T x <- O
3704 Y A)*?Z)*?T x <- O <- I
3705 YA )*?Z)*?T x <- O <- I
3706 YA A)*?Z)*?T x <- O <- I
3707 YAA )*?Z)*?T x <- O <- I
3708 YAA Z)*?T x <- O # Temporary unset I
3709 I
3710
3711 YAAZ Y(A)*?Z)*?T x <- O
3712 I
3713
3714 YAAZY (A)*?Z)*?T x <- O
3715 I
3716
3717 YAAZY A)*?Z)*?T x <- O <- I
3718 I
3719
3720 YAAZYA )*?Z)*?T x <- O <- I
3721 I
3722
3723 YAAZYA Z)*?T x <- O # Temporary unset I
3724 I,I
3725
3726 YAAZYAZ )*?T x <- O
3727 I,I
3728
3729 YAAZYAZ T x # Temporary unset O
3730 O
3731 I,I
3732
3733 YAAZYAZT x
3734 O
3735 I,I
3736 *******************************************************************/
95b24440 3737
a0d0e21e 3738 case CURLYX: {
cb434fcc
IZ
3739 /* No need to save/restore up to this paren */
3740 I32 parenfloor = scan->flags;
c277df42 3741
c2b7afd3
NC
3742 /* Dave says:
3743
3744 CURLYX and WHILEM are always paired: they're the moral
3745 equivalent of pp_enteriter anbd pp_iter.
3746
3747 The only time next could be null is if the node tree is
3748 corrupt. This was mentioned on p5p a few days ago.
3749
3750 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3751 So we'll assert that this is true:
3752 */
3753 assert(next);
30b2893d 3754 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3755 next += ARG(next);
cb434fcc
IZ
3756 /* XXXX Probably it is better to teach regpush to support
3757 parenfloor > PL_regsize... */
eb160463 3758 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3759 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3760
d8319b27
DM
3761 st->u.curlyx.cp = PL_savestack_ix;
3762 st->u.curlyx.outercc = st->cc;
a0374537
DM
3763 st->cc = st;
3764 /* these fields contain the state of the current curly.
3765 * they are accessed by subsequent WHILEMs;
3766 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3767 st->u.curlyx.parenfloor = parenfloor;
3768 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3769 st->u.curlyx.min = ARG1(scan);
3770 st->u.curlyx.max = ARG2(scan);
3771 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3772 st->u.curlyx.lastloc = 0;
a0374537
DM
3773 /* st->next and st->minmod are also read by WHILEM */
3774
3280af22 3775 PL_reginput = locinput;
95b24440
DM
3776 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3777 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3778 regcpblow(st->u.curlyx.cp);
3779 st->cc = st->u.curlyx.outercc;
95b24440 3780 saySAME(result);
a0d0e21e 3781 }
5f66b61c 3782 /* NOTREACHED */
a0d0e21e
LW
3783 case WHILEM: {
3784 /*
3785 * This is really hard to understand, because after we match
3786 * what we're trying to match, we must make sure the rest of
2c2d71f5 3787 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3788 * to go back UP the parse tree by recursing ever deeper. And
3789 * if it fails, we have to reset our parent's current state
3790 * that we can try again after backing off.
3791 */
3792
c2b7afd3
NC
3793 /* Dave says:
3794
3795 st->cc gets initialised by CURLYX ready for use by WHILEM.
3796 So again, unless somethings been corrupted, st->cc cannot
3797 be null at that point in WHILEM.
3798
3799 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3800 So we'll assert that this is true:
3801 */
3802 assert(st->cc);
d8319b27
DM
3803 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3804 st->u.whilem.cache_offset = 0;
3805 st->u.whilem.cache_bit = 0;
c277df42 3806
d8319b27 3807 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3808 PL_reginput = locinput;
a0d0e21e 3809
a3621e74 3810 DEBUG_EXECUTE_r(
9041c2e3 3811 PerlIO_printf(Perl_debug_log,
91f3b821 3812 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3813 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3814 (long)n, (long)st->cc->u.curlyx.min,
3815 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3816 );
4633a7c4 3817
a0d0e21e
LW
3818 /* If degenerate scan matches "", assume scan done. */
3819
d8319b27
DM
3820 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3821 st->u.whilem.savecc = st->cc;
3822 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3823 if (st->cc)
d8319b27 3824 st->ln = st->cc->u.curlyx.cur;
a3621e74 3825 DEBUG_EXECUTE_r(
c3464db5
DD
3826 PerlIO_printf(Perl_debug_log,
3827 "%*s empty match detected, try continuation...\n",
3280af22 3828 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3829 );
d8319b27 3830 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3831 /*** all unsaved local vars undefined at this point */
d8319b27 3832 st->cc = st->u.whilem.savecc;
95b24440 3833 if (result)
4633a7c4 3834 sayYES;
d8319b27
DM
3835 if (st->cc->u.curlyx.outercc)
3836 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3837 sayNO;
a0d0e21e
LW
3838 }
3839
3840 /* First just match a string of min scans. */
3841
d8319b27
DM
3842 if (n < st->cc->u.curlyx.min) {
3843 st->cc->u.curlyx.cur = n;
3844 st->cc->u.curlyx.lastloc = locinput;
3845 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3846 /*** all unsaved local vars undefined at this point */
3847 if (result)
4633a7c4 3848 sayYES;
d8319b27
DM
3849 st->cc->u.curlyx.cur = n - 1;
3850 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3851 sayNO;
a0d0e21e
LW
3852 }
3853
2c2d71f5
JH
3854 if (scan->flags) {
3855 /* Check whether we already were at this position.
3856 Postpone detection until we know the match is not
3857 *that* much linear. */
3858 if (!PL_reg_maxiter) {
3859 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3860 /* possible overflow for long strings and many CURLYX's */
3861 if (PL_reg_maxiter < 0)
3862 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3863 PL_reg_leftiter = PL_reg_maxiter;
3864 }
3865 if (PL_reg_leftiter-- == 0) {
a3b680e6 3866 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3867 if (PL_reg_poscache) {
eb160463 3868 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3869 Renew(PL_reg_poscache, size, char);
3870 PL_reg_poscache_size = size;
3871 }
3872 Zero(PL_reg_poscache, size, char);
3873 }
3874 else {
3875 PL_reg_poscache_size = size;
a02a5408 3876 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3877 }
a3621e74 3878 DEBUG_EXECUTE_r(
2c2d71f5
JH
3879 PerlIO_printf(Perl_debug_log,
3880 "%sDetected a super-linear match, switching on caching%s...\n",
3881 PL_colors[4], PL_colors[5])
3882 );
3883 }
3884 if (PL_reg_leftiter < 0) {
d8319b27 3885 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3886
d8319b27
DM
3887 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3888 + st->u.whilem.cache_offset * (scan->flags>>4);
3889 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3890 st->u.whilem.cache_offset /= 8;
3891 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3892 DEBUG_EXECUTE_r(
2c2d71f5
JH
3893 PerlIO_printf(Perl_debug_log,
3894 "%*s already tried at this position...\n",
3895 REPORT_CODE_OFF+PL_regindent*2, "")
3896 );
3ab3c9b4
HS
3897 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3898 /* cache records success */
c2b0868c
HS
3899 sayYES;
3900 else
3ab3c9b4 3901 /* cache records failure */
c2b0868c 3902 sayNO_SILENT;
2c2d71f5 3903 }
2c2d71f5
JH
3904 }
3905 }
3906
a0d0e21e
LW
3907 /* Prefer next over scan for minimal matching. */
3908
5d9a96ca 3909 if (st->cc->minmod) {
d8319b27
DM
3910 st->u.whilem.savecc = st->cc;
3911 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3912 if (st->cc)
d8319b27
DM
3913 st->ln = st->cc->u.curlyx.cur;
3914 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3915 REGCP_SET(st->u.whilem.lastcp);
3916 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3917 /*** all unsaved local vars undefined at this point */
d8319b27 3918 st->cc = st->u.whilem.savecc;
95b24440 3919 if (result) {
d8319b27 3920 regcpblow(st->u.whilem.cp);
3ab3c9b4 3921 CACHEsayYES; /* All done. */
5f05dabc 3922 }
d8319b27 3923 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3924 regcppop(rex);
d8319b27
DM
3925 if (st->cc->u.curlyx.outercc)
3926 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3927
d8319b27 3928 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3929 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3930 && !(PL_reg_flags & RF_warned)) {
3931 PL_reg_flags |= RF_warned;
9014280d 3932 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3933 "Complex regular subexpression recursion",
3934 REG_INFTY - 1);
c277df42 3935 }
3ab3c9b4 3936 CACHEsayNO;
c277df42 3937 }
a687059c 3938
a3621e74 3939 DEBUG_EXECUTE_r(
c3464db5
DD
3940 PerlIO_printf(Perl_debug_log,
3941 "%*s trying longer...\n",
3280af22 3942 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3943 );
a0d0e21e 3944 /* Try scanning more and see if it helps. */
3280af22 3945 PL_reginput = locinput;
d8319b27
DM
3946 st->cc->u.curlyx.cur = n;
3947 st->cc->u.curlyx.lastloc = locinput;
3948 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3949 REGCP_SET(st->u.whilem.lastcp);
3950 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3951 /*** all unsaved local vars undefined at this point */
3952 if (result) {
d8319b27 3953 regcpblow(st->u.whilem.cp);
3ab3c9b4 3954 CACHEsayYES;
5f05dabc 3955 }
d8319b27 3956 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3957 regcppop(rex);
d8319b27
DM
3958 st->cc->u.curlyx.cur = n - 1;
3959 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3960 CACHEsayNO;
a0d0e21e
LW
3961 }
3962
3963 /* Prefer scan over next for maximal matching. */
3964
d8319b27
DM
3965 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3966 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3967 st->cc->u.curlyx.cur = n;
3968 st->cc->u.curlyx.lastloc = locinput;
3969 REGCP_SET(st->u.whilem.lastcp);
3970 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3971 /*** all unsaved local vars undefined at this point */
3972 if (result) {
d8319b27 3973 regcpblow(st->u.whilem.cp);
3ab3c9b4 3974 CACHEsayYES;
5f05dabc 3975 }
d8319b27 3976 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3977 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3978 PL_reginput = locinput;
a3621e74 3979 DEBUG_EXECUTE_r(
c3464db5
DD
3980 PerlIO_printf(Perl_debug_log,
3981 "%*s failed, try continuation...\n",
3280af22 3982 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3983 );
3984 }
9041c2e3 3985 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3986 && !(PL_reg_flags & RF_warned)) {
3280af22 3987 PL_reg_flags |= RF_warned;
9014280d 3988 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3989 "Complex regular subexpression recursion",
3990 REG_INFTY - 1);
a0d0e21e
LW
3991 }
3992
3993 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
3994 st->u.whilem.savecc = st->cc;
3995 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3996 if (st->cc)
d8319b27
DM
3997 st->ln = st->cc->u.curlyx.cur;
3998 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 3999 /*** all unsaved local vars undefined at this point */
d8319b27 4000 st->cc = st->u.whilem.savecc;
95b24440 4001 if (result)
3ab3c9b4 4002 CACHEsayYES;
d8319b27
DM
4003 if (st->cc->u.curlyx.outercc)
4004 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4005 st->cc->u.curlyx.cur = n - 1;
4006 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4007 CACHEsayNO;
a0d0e21e 4008 }
5f66b61c 4009 /* NOTREACHED */
9041c2e3 4010 case BRANCHJ:
c277df42
IZ
4011 next = scan + ARG(scan);
4012 if (next == scan)
4013 next = NULL;
4014 inner = NEXTOPER(NEXTOPER(scan));
4015 goto do_branch;
9041c2e3 4016 case BRANCH:
c277df42
IZ
4017 inner = NEXTOPER(scan);
4018 do_branch:
4019 {
e822a8b4
DM
4020 I32 type;
4021 type = OP(scan);
ae5031b3 4022 if (!next || OP(next) != type) /* No choice. */
c277df42 4023 next = inner; /* Avoid recursion. */
a0d0e21e 4024 else {
a3b680e6 4025 const I32 lastparen = *PL_reglastparen;
02db2b7b 4026 /* Put unwinding data on stack */
6136c704
AL
4027 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
4028 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
4029
5d9a96ca
DM
4030 uw->prev = st->unwind;
4031 st->unwind = unwind1;
e822a8b4 4032 uw->type = ((type == BRANCH)
02db2b7b
IZ
4033 ? RE_UNWIND_BRANCH
4034 : RE_UNWIND_BRANCHJ);
4035 uw->lastparen = lastparen;
4036 uw->next = next;
4037 uw->locinput = locinput;
4038 uw->nextchr = nextchr;
3a2830be 4039 uw->minmod = st->minmod;
02db2b7b
IZ
4040#ifdef DEBUGGING
4041 uw->regindent = ++PL_regindent;
4042#endif
c277df42 4043
02db2b7b
IZ
4044 REGCP_SET(uw->lastcp);
4045
4046 /* Now go into the first branch */
4047 next = inner;
a687059c 4048 }
a0d0e21e
LW
4049 }
4050 break;
4051 case MINMOD:
5d9a96ca 4052 st->minmod = 1;
a0d0e21e 4053 break;
c277df42
IZ
4054 case CURLYM:
4055 {
d8319b27 4056 st->u.curlym.l = st->u.curlym.matches = 0;
9041c2e3 4057
c277df42 4058 /* We suppose that the next guy does not need
0e788c72 4059 backtracking: in particular, it is of constant non-zero length,
c277df42 4060 and has no parenths to influence future backrefs. */
5d9a96ca 4061 st->ln = ARG1(scan); /* min to match */
c277df42 4062 n = ARG2(scan); /* max to match */
d8319b27
DM
4063 st->u.curlym.paren = scan->flags;
4064 if (st->u.curlym.paren) {
4065 if (st->u.curlym.paren > PL_regsize)
4066 PL_regsize = st->u.curlym.paren;
4067 if (st->u.curlym.paren > (I32)*PL_reglastparen)
4068 *PL_reglastparen = st->u.curlym.paren;
c277df42 4069 }
dc45a647 4070 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
d8319b27 4071 if (st->u.curlym.paren)
c277df42 4072 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 4073 PL_reginput = locinput;
d8319b27 4074 st->u.curlym.maxwanted = st->minmod ? st->ln : n;
0cadcf80
DM
4075 while (PL_reginput < PL_regeol && st->u.curlym.matches < st->u.curlym.maxwanted) {
4076 /* resume to current state on success */
4077 st->u.yes.prev_yes_state = yes_state;
4078 yes_state = st;
4079 REGMATCH(scan, CURLYM1);
4080 yes_state = st->u.yes.prev_yes_state;
4081 /*** all unsaved local vars undefined at this point */
4082 if (!result)
4083 break;
4084 /* on first match, determine length, u.curlym.l */
4085 if (!st->u.curlym.matches++) {
4086 if (PL_reg_match_utf8) {
4087 char *s = locinput;
4088 while (s < PL_reginput) {
4089 st->u.curlym.l++;
4090 s += UTF8SKIP(s);
6407bf3b
DM
4091 }
4092 }
0cadcf80
DM
4093 else {
4094 st->u.curlym.l = PL_reginput - locinput;
4095 }
4096 if (st->u.curlym.l == 0) {
4097 st->u.curlym.matches = st->u.curlym.maxwanted;
4098 break;
4099 }
6407bf3b 4100 }
0cadcf80 4101 locinput = PL_reginput;
6407bf3b
DM
4102 }
4103
4104 PL_reginput = locinput;
0cadcf80 4105 if (st->u.curlym.matches < st->ln) {
5d9a96ca 4106 st->minmod = 0;
0cadcf80
DM
4107 sayNO;
4108 }
5f80c4cf 4109
0cadcf80
DM
4110 DEBUG_EXECUTE_r(
4111 PerlIO_printf(Perl_debug_log,
4112 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
4113 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
4114 (IV) st->u.curlym.matches, (IV)st->u.curlym.l)
4115 );
4116
4117 /* calculate c1 and c1 for possible match of 1st char
4118 * following curly */
9e137952 4119 st->u.curlym.c1 = st->u.curlym.c2 = CHRTEST_VOID;
0cadcf80
DM
4120 if (HAS_TEXT(next) || JUMPABLE(next)) {
4121 regnode *text_node = next;
3dab1dad
YO
4122 if (! HAS_TEXT(text_node))
4123 FIND_NEXT_IMPT(text_node);
0cadcf80 4124 if (HAS_TEXT(text_node)
3dab1dad 4125 && PL_regkind[OP(text_node)] != REF)
0cadcf80
DM
4126 {
4127 st->u.curlym.c1 = (U8)*STRING(text_node);
4128 st->u.curlym.c2 =
4129 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4130 ? PL_fold[st->u.curlym.c1]
4131 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4132 ? PL_fold_locale[st->u.curlym.c1]
4133 : st->u.curlym.c1;
4134 }
4135 }
5f80c4cf 4136
0cadcf80
DM
4137 REGCP_SET(st->u.curlym.lastcp);
4138
4139 st->u.curlym.minmod = st->minmod;
4140 st->minmod = 0;
4141 while (st->u.curlym.matches >= st->ln
4142 && (st->u.curlym.matches <= n
4143 /* for REG_INFTY, ln could overflow to negative */
4144 || (n == REG_INFTY && st->u.curlym.matches >= 0)))
4145 {
4146 /* If it could work, try it. */
9e137952 4147 if (st->u.curlym.c1 == CHRTEST_VOID ||
0cadcf80
DM
4148 UCHARAT(PL_reginput) == st->u.curlym.c1 ||
4149 UCHARAT(PL_reginput) == st->u.curlym.c2)
4150 {
4151 DEBUG_EXECUTE_r(
4152 PerlIO_printf(Perl_debug_log,
4153 "%*s trying tail with matches=%"IVdf"...\n",
4154 (int)(REPORT_CODE_OFF+PL_regindent*2),
4155 "", (IV)st->u.curlym.matches)
4156 );
4157 if (st->u.curlym.paren) {
4158 if (st->u.curlym.matches) {
4159 PL_regstartp[st->u.curlym.paren]
4160 = HOPc(PL_reginput, -st->u.curlym.l) - PL_bostr;
4161 PL_regendp[st->u.curlym.paren] = PL_reginput - PL_bostr;
cca55fe3 4162 }
5f80c4cf 4163 else
0cadcf80 4164 PL_regendp[st->u.curlym.paren] = -1;
5f80c4cf 4165 }
0cadcf80
DM
4166 /* resume to current state on success */
4167 st->u.yes.prev_yes_state = yes_state;
4168 yes_state = st;
4169 REGMATCH(next, CURLYM2);
4170 yes_state = st->u.yes.prev_yes_state;
4171 /*** all unsaved local vars undefined at this point */
4172 if (result)
4173 /* XXX tmp sayYES; */
4174 sayYES_FINAL;
4175 REGCP_UNWIND(st->u.curlym.lastcp);
a0ed51b3 4176 }
0cadcf80
DM
4177 /* Couldn't or didn't -- move forward/backward. */
4178 if (st->u.curlym.minmod) {
3280af22 4179 PL_reginput = locinput;
dad79028
DM
4180 /* resume to current state on success */
4181 st->u.yes.prev_yes_state = yes_state;
4182 yes_state = st;
95b24440 4183 REGMATCH(scan, CURLYM3);
dad79028 4184 yes_state = st->u.yes.prev_yes_state;
95b24440
DM
4185 /*** all unsaved local vars undefined at this point */
4186 if (result) {
0cadcf80 4187 st->u.curlym.matches++;
3280af22 4188 locinput = PL_reginput;
c277df42
IZ
4189 }
4190 else
4191 sayNO;
4192 }
0cadcf80 4193 else {
d8319b27
DM
4194 st->u.curlym.matches--;
4195 locinput = HOPc(locinput, -st->u.curlym.l);
3280af22 4196 PL_reginput = locinput;
c277df42
IZ
4197 }
4198 }
4199 sayNO;
5f66b61c 4200 /* NOTREACHED */
c277df42
IZ
4201 break;
4202 }
4203 case CURLYN:
d8319b27
DM
4204 st->u.plus.paren = scan->flags; /* Which paren to set */
4205 if (st->u.plus.paren > PL_regsize)
4206 PL_regsize = st->u.plus.paren;
4207 if (st->u.plus.paren > (I32)*PL_reglastparen)
4208 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4209 st->ln = ARG1(scan); /* min to match */
c277df42 4210 n = ARG2(scan); /* max to match */
dc45a647 4211 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4212 goto repeat;
a0d0e21e 4213 case CURLY:
d8319b27 4214 st->u.plus.paren = 0;
5d9a96ca 4215 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4216 n = ARG2(scan); /* max to match */
dc45a647 4217 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4218 goto repeat;
4219 case STAR:
5d9a96ca 4220 st->ln = 0;
c277df42 4221 n = REG_INFTY;
a0d0e21e 4222 scan = NEXTOPER(scan);
d8319b27 4223 st->u.plus.paren = 0;
a0d0e21e
LW
4224 goto repeat;
4225 case PLUS:
5d9a96ca 4226 st->ln = 1;
c277df42
IZ
4227 n = REG_INFTY;
4228 scan = NEXTOPER(scan);
d8319b27 4229 st->u.plus.paren = 0;
c277df42 4230 repeat:
a0d0e21e
LW
4231 /*
4232 * Lookahead to avoid useless match attempts
4233 * when we know what character comes next.
4234 */
5f80c4cf
JP
4235
4236 /*
4237 * Used to only do .*x and .*?x, but now it allows
4238 * for )'s, ('s and (?{ ... })'s to be in the way
4239 * of the quantifier and the EXACT-like node. -- japhy
4240 */
4241
cca55fe3 4242 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4243 U8 *s;
4244 regnode *text_node = next;
4245
3dab1dad
YO
4246 if (! HAS_TEXT(text_node))
4247 FIND_NEXT_IMPT(text_node);
5f80c4cf 4248
9e137952
DM
4249 if (! HAS_TEXT(text_node))
4250 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4251 else {
3dab1dad 4252 if (PL_regkind[OP(text_node)] == REF) {
9e137952 4253 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4254 goto assume_ok_easy;
cca55fe3
JP
4255 }
4256 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
4257
4258 if (!UTF) {
d8319b27 4259 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4260 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4261 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4262 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4263 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4264 }
5f80c4cf 4265 else { /* UTF */
f65d3ee7 4266 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4267 STRLEN ulen1, ulen2;
89ebb4a3
JH
4268 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4269 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4270
4271 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4272 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4273
d8319b27 4274 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4275 uniflags);
d8319b27 4276 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4277 uniflags);
5f80c4cf
JP
4278 }
4279 else {
d8319b27 4280 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4281 uniflags);
5f80c4cf 4282 }
1aa99e6b
IH
4283 }
4284 }
bbce6d69 4285 }
a0d0e21e 4286 else
9e137952 4287 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4288 assume_ok_easy:
3280af22 4289 PL_reginput = locinput;
5d9a96ca
DM
4290 if (st->minmod) {
4291 st->minmod = 0;
32fc9b6a 4292 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4293 sayNO;
a0ed51b3 4294 locinput = PL_reginput;
d8319b27 4295 REGCP_SET(st->u.plus.lastcp);
9e137952 4296 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4297 st->u.plus.old = locinput;
4298 st->u.plus.count = 0;
0fe9bf95 4299
1aa99e6b 4300 if (n == REG_INFTY) {
d8319b27 4301 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4302 if (do_utf8)
d8319b27
DM
4303 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4304 st->u.plus.e--;
1aa99e6b
IH
4305 }
4306 else if (do_utf8) {
5d9a96ca 4307 int m = n - st->ln;
d8319b27
DM
4308 for (st->u.plus.e = locinput;
4309 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4310 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4311 }
4312 else {
d8319b27
DM
4313 st->u.plus.e = locinput + n - st->ln;
4314 if (st->u.plus.e >= PL_regeol)
4315 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4316 }
0fe9bf95
IZ
4317 while (1) {
4318 /* Find place 'next' could work */
1aa99e6b 4319 if (!do_utf8) {
d8319b27
DM
4320 if (st->u.plus.c1 == st->u.plus.c2) {
4321 while (locinput <= st->u.plus.e &&
4322 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4323 locinput++;
4324 } else {
d8319b27
DM
4325 while (locinput <= st->u.plus.e
4326 && UCHARAT(locinput) != st->u.plus.c1
4327 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4328 locinput++;
4329 }
d8319b27 4330 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4331 }
4332 else {
d8319b27 4333 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4334 STRLEN len;
872c91ae
JH
4335 /* count initialised to
4336 * utf8_distance(old, locinput) */
d8319b27 4337 while (locinput <= st->u.plus.e &&
872c91ae 4338 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4339 UTF8_MAXBYTES, &len,
d8319b27 4340 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4341 locinput += len;
d8319b27 4342 st->u.plus.count++;
b2f2f093 4343 }
1aa99e6b 4344 } else {
872c91ae
JH
4345 /* count initialised to
4346 * utf8_distance(old, locinput) */
d8319b27 4347 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4348 STRLEN len;
4349 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4350 UTF8_MAXBYTES, &len,
041457d9 4351 uniflags);
d8319b27 4352 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4353 break;
b2f2f093 4354 locinput += len;
d8319b27 4355 st->u.plus.count++;
1aa99e6b
IH
4356 }
4357 }
0fe9bf95 4358 }
d8319b27 4359 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4360 sayNO;
4361 /* PL_reginput == old now */
d8319b27 4362 if (locinput != st->u.plus.old) {
5d9a96ca 4363 st->ln = 1; /* Did some */
32fc9b6a 4364 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4365 sayNO;
4366 }
4367 /* PL_reginput == locinput now */
d8319b27 4368 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4369 /*** all unsaved local vars undefined at this point */
0fe9bf95 4370 PL_reginput = locinput; /* Could be reset... */
d8319b27 4371 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4372 /* Couldn't or didn't -- move forward. */
d8319b27 4373 st->u.plus.old = locinput;
1aa99e6b
IH
4374 if (do_utf8)
4375 locinput += UTF8SKIP(locinput);
4376 else
4377 locinput++;
d8319b27 4378 st->u.plus.count = 1;
0fe9bf95
IZ
4379 }
4380 }
4381 else
5d9a96ca 4382 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4383 UV c;
9e137952 4384 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4385 if (do_utf8)
872c91ae 4386 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4387 UTF8_MAXBYTES, 0,
041457d9 4388 uniflags);
1aa99e6b 4389 else
9041c2e3 4390 c = UCHARAT(PL_reginput);
2390ecbc 4391 /* If it could work, try it. */
d8319b27 4392 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
2390ecbc 4393 {
d8319b27 4394 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4395 /*** all unsaved local vars undefined at this point */
d8319b27 4396 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4397 }
1aa99e6b 4398 }
a0d0e21e 4399 /* If it could work, try it. */
9e137952 4400 else if (st->u.plus.c1 == CHRTEST_VOID)
bbce6d69 4401 {
d8319b27 4402 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4403 /*** all unsaved local vars undefined at this point */
d8319b27 4404 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4405 }
c277df42 4406 /* Couldn't or didn't -- move forward. */
a0ed51b3 4407 PL_reginput = locinput;
32fc9b6a 4408 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4409 st->ln++;
a0ed51b3
LW
4410 locinput = PL_reginput;
4411 }
4412 else
4633a7c4 4413 sayNO;
a0d0e21e
LW
4414 }
4415 }
4416 else {
32fc9b6a 4417 n = regrepeat(rex, scan, n);
a0ed51b3 4418 locinput = PL_reginput;
3dab1dad
YO
4419 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4420 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
15272685 4421 {
5d9a96ca 4422 st->ln = n; /* why back off? */
1aeab75a
GS
4423 /* ...because $ and \Z can match before *and* after
4424 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4425 We should back off by one in this case. */
4426 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4427 st->ln--;
1aeab75a 4428 }
d8319b27 4429 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4430 {
8fa7f367 4431 UV c = 0;
5d9a96ca 4432 while (n >= st->ln) {
9e137952 4433 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4434 if (do_utf8)
872c91ae 4435 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4436 UTF8_MAXBYTES, 0,
041457d9 4437 uniflags);
1aa99e6b 4438 else
9041c2e3 4439 c = UCHARAT(PL_reginput);
1aa99e6b 4440 }
c277df42 4441 /* If it could work, try it. */
9e137952 4442 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
c277df42 4443 {
d8319b27 4444 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
95b24440 4445 /*** all unsaved local vars undefined at this point */
d8319b27 4446 REGCP_UNWIND(st->u.plus.lastcp);
c277df42
IZ
4447 }
4448 /* Couldn't or didn't -- back up. */
4449 n--;
dfe13c55 4450 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4451 }
a0d0e21e
LW
4452 }
4453 }
4633a7c4 4454 sayNO;
c277df42 4455 break;
a0d0e21e 4456 case END:
3b0527fe 4457 if (locinput < reginfo->till) {
a3621e74 4458 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4459 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4460 PL_colors[4],
4461 (long)(locinput - PL_reg_starttry),
3b0527fe 4462 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4463 PL_colors[5]));
4464 sayNO_FINAL; /* Cannot match: too short. */
4465 }
4466 PL_reginput = locinput; /* put where regtry can find it */
4467 sayYES_FINAL; /* Success! */
dad79028
DM
4468
4469 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4470 DEBUG_EXECUTE_r(
4471 PerlIO_printf(Perl_debug_log,
4472 "%*s %ssubpattern success...%s\n",
4473 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4474 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4475 sayYES_FINAL; /* Success! */
4476
4477 case SUSPEND: /* (?>FOO) */
4478 st->u.ifmatch.wanted = 1;
9fe1d20c 4479 PL_reginput = locinput;
9041c2e3 4480 goto do_ifmatch;
dad79028
DM
4481
4482 case UNLESSM: /* -ve lookaround: (?!FOO), or with flags, (?<!foo) */
4483 st->u.ifmatch.wanted = 0;
4484 goto ifmatch_trivial_fail_test;
4485
4486 case IFMATCH: /* +ve lookaround: (?=FOO), or with flags, (?<=foo) */
4487 st->u.ifmatch.wanted = 1;
4488 ifmatch_trivial_fail_test:
a0ed51b3 4489 if (scan->flags) {
52657f30 4490 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4491 if (!s) {
4492 /* trivial fail */
4493 if (st->logical) {
4494 st->logical = 0;
3dab1dad 4495 st->sw = 1 - (bool)st->u.ifmatch.wanted;
dad79028
DM
4496 }
4497 else if (st->u.ifmatch.wanted)
4498 sayNO;
4499 next = scan + ARG(scan);
4500 if (next == scan)
4501 next = NULL;
4502 break;
4503 }
efb30f32 4504 PL_reginput = s;
a0ed51b3
LW
4505 }
4506 else
4507 PL_reginput = locinput;
4508
c277df42 4509 do_ifmatch:
dad79028
DM
4510 /* resume to current state on success */
4511 st->u.yes.prev_yes_state = yes_state;
4512 yes_state = st;
4513 PUSH_STATE(newst, resume_IFMATCH);
4514 st = newst;
4515 next = NEXTOPER(NEXTOPER(scan));
4516 break;
4517
c277df42 4518 case LONGJMP:
c277df42
IZ
4519 next = scan + ARG(scan);
4520 if (next == scan)
4521 next = NULL;
a0d0e21e
LW
4522 break;
4523 default:
b900a521 4524 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4525 PTR2UV(scan), OP(scan));
cea2e8a9 4526 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4527 }
95b24440 4528
02db2b7b 4529 reenter:
a0d0e21e 4530 scan = next;
95b24440
DM
4531 continue;
4532 /* NOTREACHED */
4533
4534 /* simulate recursively calling regmatch(), but without actually
4535 * recursing - ie save the current state on the heap rather than on
4536 * the stack, then re-enter the loop. This avoids complex regexes
4537 * blowing the processor stack */
4538
4539 start_recurse:
4540 {
5d9a96ca
DM
4541 /* push new state */
4542 regmatch_state *oldst = st;
4543
4544 depth++;
4545
4546 /* grab the next free state slot */
4547 st++;
86545054 4548 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4549 st = S_push_slab(aTHX);
4550 PL_regmatch_state = st;
4551
4552 oldst->next = next;
4553 oldst->n = n;
4554 oldst->locinput = locinput;
5d9a96ca
DM
4555
4556 st->cc = oldst->cc;
95b24440
DM
4557 locinput = PL_reginput;
4558 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4559 st->minmod = 0;
4560 st->sw = 0;
4561 st->logical = 0;
4562 st->unwind = 0;
95b24440
DM
4563#ifdef DEBUGGING
4564 PL_regindent++;
4565#endif
4566 }
a0d0e21e 4567 }
a687059c 4568
aa283a38
DM
4569
4570
a0d0e21e
LW
4571 /*
4572 * We get here only if there's trouble -- normally "case END" is
4573 * the terminating point.
4574 */
cea2e8a9 4575 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4576 /*NOTREACHED*/
4633a7c4
LW
4577 sayNO;
4578
7821416a 4579yes_final:
77cb431f
DM
4580
4581 if (yes_state) {
4582 /* we have successfully completed a subexpression, but we must now
4583 * pop to the state marked by yes_state and continue from there */
4584
dad79028 4585 /*XXX tmp for CURLYM*/
c4fd8992
AL
4586 regmatch_slab * const oslab = PL_regmatch_slab;
4587 regmatch_state * const ost = st;
4588 regmatch_state * const oys = yes_state;
dad79028
DM
4589 int odepth = depth;
4590
77cb431f
DM
4591 assert(st != yes_state);
4592 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4593 || yes_state > SLAB_LAST(PL_regmatch_slab))
4594 {
4595 /* not in this slab, pop slab */
4596 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4597 PL_regmatch_slab = PL_regmatch_slab->prev;
4598 st = SLAB_LAST(PL_regmatch_slab);
4599 }
4600 depth -= (st - yes_state);
dad79028 4601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE TO (%d)\n", depth));
77cb431f
DM
4602 st = yes_state;
4603 yes_state = st->u.yes.prev_yes_state;
4604 PL_regmatch_state = st;
4605
4606 switch (st->resume_state) {
4607 case resume_EVAL:
4608 if (st->u.eval.toggleutf)
4609 PL_reg_flags ^= RF_utf8;
4610 ReREFCNT_dec(rex);
4611 rex = st->u.eval.prev_rex;
4612 /* XXXX This is too dramatic a measure... */
4613 PL_reg_maxiter = 0;
4614 /* Restore parens of the caller without popping the
4615 * savestack */
4616 {
c4fd8992 4617 const I32 tmp = PL_savestack_ix;
77cb431f
DM
4618 PL_savestack_ix = st->u.eval.lastcp;
4619 regcppop(rex);
4620 PL_savestack_ix = tmp;
4621 }
4622 PL_reginput = locinput;
4623 /* continue at the node following the (??{...}) */
4624 next = st->next;
4625 goto reenter;
4626
dad79028
DM
4627 case resume_IFMATCH:
4628 if (st->logical) {
4629 st->logical = 0;
3dab1dad 4630 st->sw = (bool)st->u.ifmatch.wanted;
dad79028
DM
4631 }
4632 else if (!st->u.ifmatch.wanted)
4633 sayNO;
4634
4635 if (OP(st->scan) == SUSPEND)
4636 locinput = PL_reginput;
4637 else {
4638 locinput = PL_reginput = st->locinput;
4639 nextchr = UCHARAT(locinput);
4640 }
4641 next = st->scan + ARG(st->scan);
4642 if (next == st->scan)
4643 next = NULL;
4644 goto reenter;
4645
4646 /* XXX tmp don't handle yes_state yet */
4647 case resume_CURLYM1:
4648 case resume_CURLYM2:
4649 case resume_CURLYM3:
dad79028
DM
4650 PL_regmatch_slab =oslab;
4651 st = ost;
4652 PL_regmatch_state = st;
4653 depth = odepth;
4654 yes_state = oys;
4655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "XXX revering a CURLYM\n"));
4656 goto yes;
4657
77cb431f
DM
4658 default:
4659 Perl_croak(aTHX_ "unexpected yes reume state");
4660 }
4661 }
4662
a3621e74 4663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4664 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4665yes:
4666#ifdef DEBUGGING
3280af22 4667 PL_regindent--;
4633a7c4 4668#endif
02db2b7b 4669
95b24440 4670 result = 1;
aa283a38 4671 /* XXX this is duplicate(ish) code to that in the do_no section.
77cb431f
DM
4672 * eventually a yes should just pop the stack back to the current
4673 * yes_state */
aa283a38
DM
4674 if (depth) {
4675 /* restore previous state and re-enter */
4676 POP_STATE;
4677
4678 switch (st->resume_state) {
4679 case resume_TRIE1:
4680 goto resume_point_TRIE1;
4681 case resume_TRIE2:
4682 goto resume_point_TRIE2;
aa283a38
DM
4683 case resume_CURLYX:
4684 goto resume_point_CURLYX;
4685 case resume_WHILEM1:
4686 goto resume_point_WHILEM1;
4687 case resume_WHILEM2:
4688 goto resume_point_WHILEM2;
4689 case resume_WHILEM3:
4690 goto resume_point_WHILEM3;
4691 case resume_WHILEM4:
4692 goto resume_point_WHILEM4;
4693 case resume_WHILEM5:
4694 goto resume_point_WHILEM5;
4695 case resume_WHILEM6:
4696 goto resume_point_WHILEM6;
4697 case resume_CURLYM1:
4698 goto resume_point_CURLYM1;
4699 case resume_CURLYM2:
4700 goto resume_point_CURLYM2;
4701 case resume_CURLYM3:
4702 goto resume_point_CURLYM3;
aa283a38
DM
4703 case resume_PLUS1:
4704 goto resume_point_PLUS1;
4705 case resume_PLUS2:
4706 goto resume_point_PLUS2;
4707 case resume_PLUS3:
4708 goto resume_point_PLUS3;
4709 case resume_PLUS4:
4710 goto resume_point_PLUS4;
77cb431f 4711
dad79028 4712 case resume_IFMATCH:
77cb431f 4713 case resume_EVAL:
aa283a38
DM
4714 default:
4715 Perl_croak(aTHX_ "regexp resume memory corruption");
4716 }
4717 }
4718 goto final_exit;
4633a7c4
LW
4719
4720no:
a3621e74 4721 DEBUG_EXECUTE_r(
7821416a
IZ
4722 PerlIO_printf(Perl_debug_log,
4723 "%*s %sfailed...%s\n",
e4584336 4724 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a
IZ
4725 );
4726 goto do_no;
4727no_final:
4728do_no:
5d9a96ca
DM
4729 if (st->unwind) {
4730 re_unwind_t * const uw = SSPTRt(st->unwind,re_unwind_t);
02db2b7b
IZ
4731
4732 switch (uw->type) {
4733 case RE_UNWIND_BRANCH:
4734 case RE_UNWIND_BRANCHJ:
4735 {
6136c704 4736 re_unwind_branch_t * const uwb = &(uw->branch);
a3b680e6 4737 const I32 lastparen = uwb->lastparen;
9041c2e3 4738
02db2b7b
IZ
4739 REGCP_UNWIND(uwb->lastcp);
4740 for (n = *PL_reglastparen; n > lastparen; n--)
4741 PL_regendp[n] = -1;
4742 *PL_reglastparen = n;
4743 scan = next = uwb->next;
3a2830be 4744 st->minmod = uwb->minmod;
9041c2e3
NIS
4745 if ( !scan ||
4746 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b 4747 ? BRANCH : BRANCHJ) ) { /* Failure */
5d9a96ca 4748 st->unwind = uwb->prev;
02db2b7b
IZ
4749#ifdef DEBUGGING
4750 PL_regindent--;
4751#endif
4752 goto do_no;
4753 }
4754 /* Have more choice yet. Reuse the same uwb. */
02db2b7b
IZ
4755 if ((n = (uwb->type == RE_UNWIND_BRANCH
4756 ? NEXT_OFF(next) : ARG(next))))
4757 next += n;
4758 else
4759 next = NULL; /* XXXX Needn't unwinding in this case... */
4760 uwb->next = next;
4761 next = NEXTOPER(scan);
4762 if (uwb->type == RE_UNWIND_BRANCHJ)
4763 next = NEXTOPER(next);
4764 locinput = uwb->locinput;
4765 nextchr = uwb->nextchr;
4766#ifdef DEBUGGING
4767 PL_regindent = uwb->regindent;
4768#endif
4769
4770 goto reenter;
4771 }
5f66b61c 4772 /* NOTREACHED */
02db2b7b
IZ
4773 default:
4774 Perl_croak(aTHX_ "regexp unwind memory corruption");
4775 }
5f66b61c 4776 /* NOTREACHED */
02db2b7b 4777 }
aa283a38 4778
4633a7c4 4779#ifdef DEBUGGING
3280af22 4780 PL_regindent--;
4633a7c4 4781#endif
95b24440 4782 result = 0;
5d9a96ca 4783
aa283a38
DM
4784 if (depth) {
4785 /* there's a previous state to backtrack to */
4786 POP_STATE;
5d9a96ca 4787 switch (st->resume_state) {
95b24440
DM
4788 case resume_TRIE1:
4789 goto resume_point_TRIE1;
4790 case resume_TRIE2:
4791 goto resume_point_TRIE2;
aa283a38
DM
4792 case resume_EVAL:
4793 /* we have failed an (??{...}). Restore state to the outer re
4794 * then re-throw the failure */
4795 if (st->u.eval.toggleutf)
4796 PL_reg_flags ^= RF_utf8;
4797 ReREFCNT_dec(rex);
4798 rex = st->u.eval.prev_rex;
77cb431f 4799 yes_state = st->u.yes.prev_yes_state;
aa283a38
DM
4800
4801 /* XXXX This is too dramatic a measure... */
4802 PL_reg_maxiter = 0;
4803
4804 PL_reginput = locinput;
4805 REGCP_UNWIND(st->u.eval.lastcp);
4806 regcppop(rex);
4807 goto do_no;
4808
95b24440
DM
4809 case resume_CURLYX:
4810 goto resume_point_CURLYX;
4811 case resume_WHILEM1:
4812 goto resume_point_WHILEM1;
4813 case resume_WHILEM2:
4814 goto resume_point_WHILEM2;
4815 case resume_WHILEM3:
4816 goto resume_point_WHILEM3;
4817 case resume_WHILEM4:
4818 goto resume_point_WHILEM4;
4819 case resume_WHILEM5:
4820 goto resume_point_WHILEM5;
4821 case resume_WHILEM6:
4822 goto resume_point_WHILEM6;
4823 case resume_CURLYM1:
4824 goto resume_point_CURLYM1;
4825 case resume_CURLYM2:
4826 goto resume_point_CURLYM2;
4827 case resume_CURLYM3:
4828 goto resume_point_CURLYM3;
95b24440 4829 case resume_IFMATCH:
dad79028
DM
4830 yes_state = st->u.yes.prev_yes_state;
4831 if (st->logical) {
4832 st->logical = 0;
4833 st->sw = !st->u.ifmatch.wanted;
4834 }
4835 else if (st->u.ifmatch.wanted)
4836 sayNO;
4837
4838 assert(OP(scan) != SUSPEND); /* XXX DAPM tmp */
4839 locinput = PL_reginput = st->locinput;
4840 nextchr = UCHARAT(locinput);
4841 next = scan + ARG(scan);
4842 if (next == scan)
4843 next = NULL;
4844 goto reenter;
4845
95b24440
DM
4846 case resume_PLUS1:
4847 goto resume_point_PLUS1;
4848 case resume_PLUS2:
4849 goto resume_point_PLUS2;
4850 case resume_PLUS3:
4851 goto resume_point_PLUS3;
4852 case resume_PLUS4:
4853 goto resume_point_PLUS4;
95b24440
DM
4854 default:
4855 Perl_croak(aTHX_ "regexp resume memory corruption");
4856 }
95b24440 4857 }
aa283a38
DM
4858
4859final_exit:
4860
5d9a96ca
DM
4861 /* restore original high-water mark */
4862 PL_regmatch_slab = orig_slab;
4863 PL_regmatch_state = orig_state;
4864
4865 /* free all slabs above current one */
4866 if (orig_slab->next) {
c4fd8992 4867 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4868 orig_slab->next = NULL;
4869 while (sl) {
c4fd8992 4870 regmatch_slab * const osl = sl;
5d9a96ca 4871 sl = sl->next;
ad65c075 4872 Safefree(osl);
5d9a96ca
DM
4873 }
4874 }
4875
95b24440
DM
4876 return result;
4877
a687059c
LW
4878}
4879
4880/*
4881 - regrepeat - repeatedly match something simple, report how many
4882 */
4883/*
4884 * [This routine now assumes that it will only match on things of length 1.
4885 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4886 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4887 */
76e3520e 4888STATIC I32
32fc9b6a 4889S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4890{
27da23d5 4891 dVAR;
a0d0e21e 4892 register char *scan;
a0d0e21e 4893 register I32 c;
3280af22 4894 register char *loceol = PL_regeol;
a0ed51b3 4895 register I32 hardcount = 0;
53c4c00c 4896 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4897
3280af22 4898 scan = PL_reginput;
faf11cac
HS
4899 if (max == REG_INFTY)
4900 max = I32_MAX;
4901 else if (max < loceol - scan)
7f596f4c 4902 loceol = scan + max;
a0d0e21e 4903 switch (OP(p)) {
22c35a8c 4904 case REG_ANY:
1aa99e6b 4905 if (do_utf8) {
ffc61ed2 4906 loceol = PL_regeol;
1aa99e6b 4907 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4908 scan += UTF8SKIP(scan);
4909 hardcount++;
4910 }
4911 } else {
4912 while (scan < loceol && *scan != '\n')
4913 scan++;
a0ed51b3
LW
4914 }
4915 break;
ffc61ed2 4916 case SANY:
def8e4ea
JH
4917 if (do_utf8) {
4918 loceol = PL_regeol;
a0804c9e 4919 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4920 scan += UTF8SKIP(scan);
4921 hardcount++;
4922 }
4923 }
4924 else
4925 scan = loceol;
a0ed51b3 4926 break;
f33976b4
DB
4927 case CANY:
4928 scan = loceol;
4929 break;
090f7165
JH
4930 case EXACT: /* length of string is 1 */
4931 c = (U8)*STRING(p);
4932 while (scan < loceol && UCHARAT(scan) == c)
4933 scan++;
bbce6d69 4934 break;
4935 case EXACTF: /* length of string is 1 */
cd439c50 4936 c = (U8)*STRING(p);
bbce6d69 4937 while (scan < loceol &&
22c35a8c 4938 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4939 scan++;
4940 break;
4941 case EXACTFL: /* length of string is 1 */
3280af22 4942 PL_reg_flags |= RF_tainted;
cd439c50 4943 c = (U8)*STRING(p);
bbce6d69 4944 while (scan < loceol &&
22c35a8c 4945 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4946 scan++;
4947 break;
4948 case ANYOF:
ffc61ed2
JH
4949 if (do_utf8) {
4950 loceol = PL_regeol;
cfc92286 4951 while (hardcount < max && scan < loceol &&
32fc9b6a 4952 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4953 scan += UTF8SKIP(scan);
4954 hardcount++;
4955 }
4956 } else {
32fc9b6a 4957 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4958 scan++;
4959 }
a0d0e21e
LW
4960 break;
4961 case ALNUM:
1aa99e6b 4962 if (do_utf8) {
ffc61ed2 4963 loceol = PL_regeol;
1a4fad37 4964 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4965 while (hardcount < max && scan < loceol &&
3568d838 4966 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4967 scan += UTF8SKIP(scan);
4968 hardcount++;
4969 }
4970 } else {
4971 while (scan < loceol && isALNUM(*scan))
4972 scan++;
a0ed51b3
LW
4973 }
4974 break;
bbce6d69 4975 case ALNUML:
3280af22 4976 PL_reg_flags |= RF_tainted;
1aa99e6b 4977 if (do_utf8) {
ffc61ed2 4978 loceol = PL_regeol;
1aa99e6b
IH
4979 while (hardcount < max && scan < loceol &&
4980 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4981 scan += UTF8SKIP(scan);
4982 hardcount++;
4983 }
4984 } else {
4985 while (scan < loceol && isALNUM_LC(*scan))
4986 scan++;
a0ed51b3
LW
4987 }
4988 break;
a0d0e21e 4989 case NALNUM:
1aa99e6b 4990 if (do_utf8) {
ffc61ed2 4991 loceol = PL_regeol;
1a4fad37 4992 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4993 while (hardcount < max && scan < loceol &&
3568d838 4994 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4995 scan += UTF8SKIP(scan);
4996 hardcount++;
4997 }
4998 } else {
4999 while (scan < loceol && !isALNUM(*scan))
5000 scan++;
a0ed51b3
LW
5001 }
5002 break;
bbce6d69 5003 case NALNUML:
3280af22 5004 PL_reg_flags |= RF_tainted;
1aa99e6b 5005 if (do_utf8) {
ffc61ed2 5006 loceol = PL_regeol;
1aa99e6b
IH
5007 while (hardcount < max && scan < loceol &&
5008 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5009 scan += UTF8SKIP(scan);
5010 hardcount++;
5011 }
5012 } else {
5013 while (scan < loceol && !isALNUM_LC(*scan))
5014 scan++;
a0ed51b3
LW
5015 }
5016 break;
a0d0e21e 5017 case SPACE:
1aa99e6b 5018 if (do_utf8) {
ffc61ed2 5019 loceol = PL_regeol;
1a4fad37 5020 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5021 while (hardcount < max && scan < loceol &&
3568d838
JH
5022 (*scan == ' ' ||
5023 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5024 scan += UTF8SKIP(scan);
5025 hardcount++;
5026 }
5027 } else {
5028 while (scan < loceol && isSPACE(*scan))
5029 scan++;
a0ed51b3
LW
5030 }
5031 break;
bbce6d69 5032 case SPACEL:
3280af22 5033 PL_reg_flags |= RF_tainted;
1aa99e6b 5034 if (do_utf8) {
ffc61ed2 5035 loceol = PL_regeol;
1aa99e6b 5036 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5037 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5038 scan += UTF8SKIP(scan);
5039 hardcount++;
5040 }
5041 } else {
5042 while (scan < loceol && isSPACE_LC(*scan))
5043 scan++;
a0ed51b3
LW
5044 }
5045 break;
a0d0e21e 5046 case NSPACE:
1aa99e6b 5047 if (do_utf8) {
ffc61ed2 5048 loceol = PL_regeol;
1a4fad37 5049 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5050 while (hardcount < max && scan < loceol &&
3568d838
JH
5051 !(*scan == ' ' ||
5052 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5053 scan += UTF8SKIP(scan);
5054 hardcount++;
5055 }
5056 } else {
5057 while (scan < loceol && !isSPACE(*scan))
5058 scan++;
5059 break;
a0ed51b3 5060 }
bbce6d69 5061 case NSPACEL:
3280af22 5062 PL_reg_flags |= RF_tainted;
1aa99e6b 5063 if (do_utf8) {
ffc61ed2 5064 loceol = PL_regeol;
1aa99e6b 5065 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5066 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5067 scan += UTF8SKIP(scan);
5068 hardcount++;
5069 }
5070 } else {
5071 while (scan < loceol && !isSPACE_LC(*scan))
5072 scan++;
a0ed51b3
LW
5073 }
5074 break;
a0d0e21e 5075 case DIGIT:
1aa99e6b 5076 if (do_utf8) {
ffc61ed2 5077 loceol = PL_regeol;
1a4fad37 5078 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5079 while (hardcount < max && scan < loceol &&
3568d838 5080 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5081 scan += UTF8SKIP(scan);
5082 hardcount++;
5083 }
5084 } else {
5085 while (scan < loceol && isDIGIT(*scan))
5086 scan++;
a0ed51b3
LW
5087 }
5088 break;
a0d0e21e 5089 case NDIGIT:
1aa99e6b 5090 if (do_utf8) {
ffc61ed2 5091 loceol = PL_regeol;
1a4fad37 5092 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5093 while (hardcount < max && scan < loceol &&
3568d838 5094 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5095 scan += UTF8SKIP(scan);
5096 hardcount++;
5097 }
5098 } else {
5099 while (scan < loceol && !isDIGIT(*scan))
5100 scan++;
a0ed51b3
LW
5101 }
5102 break;
a0d0e21e
LW
5103 default: /* Called on something of 0 width. */
5104 break; /* So match right here or not at all. */
5105 }
a687059c 5106
a0ed51b3
LW
5107 if (hardcount)
5108 c = hardcount;
5109 else
5110 c = scan - PL_reginput;
3280af22 5111 PL_reginput = scan;
a687059c 5112
a3621e74 5113 DEBUG_r({
ab74612d 5114 SV *re_debug_flags = NULL;
6136c704 5115 SV * const prop = sv_newmortal();
a3621e74
YO
5116 GET_RE_DEBUG_FLAGS;
5117 DEBUG_EXECUTE_r({
32fc9b6a 5118 regprop(prog, prop, p);
9041c2e3
NIS
5119 PerlIO_printf(Perl_debug_log,
5120 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
3f7c398e 5121 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
c277df42 5122 });
a3621e74 5123 });
9041c2e3 5124
a0d0e21e 5125 return(c);
a687059c
LW
5126}
5127
c277df42 5128
76234dfb 5129#ifndef PERL_IN_XSUB_RE
c277df42 5130/*
ffc61ed2
JH
5131- regclass_swash - prepare the utf8 swash
5132*/
5133
5134SV *
32fc9b6a 5135Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5136{
97aff369 5137 dVAR;
9e55ce06
JH
5138 SV *sw = NULL;
5139 SV *si = NULL;
5140 SV *alt = NULL;
3dab1dad 5141 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5142
4f639d21 5143 if (data && data->count) {
a3b680e6 5144 const U32 n = ARG(node);
ffc61ed2 5145
4f639d21
DM
5146 if (data->what[n] == 's') {
5147 SV * const rv = (SV*)data->data[n];
890ce7af 5148 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5149 SV **const ary = AvARRAY(av);
9e55ce06 5150 SV **a, **b;
9041c2e3 5151
711a919c 5152 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5153 * documentation of these array elements. */
5154
b11f357e 5155 si = *ary;
8f7f7219 5156 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5157 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5158
ffc61ed2
JH
5159 if (a)
5160 sw = *a;
5161 else if (si && doinit) {
5162 sw = swash_init("utf8", "", si, 1, 0);
5163 (void)av_store(av, 1, sw);
5164 }
9e55ce06
JH
5165 if (b)
5166 alt = *b;
ffc61ed2
JH
5167 }
5168 }
5169
9e55ce06
JH
5170 if (listsvp)
5171 *listsvp = si;
5172 if (altsvp)
5173 *altsvp = alt;
ffc61ed2
JH
5174
5175 return sw;
5176}
76234dfb 5177#endif
ffc61ed2
JH
5178
5179/*
ba7b4546 5180 - reginclass - determine if a character falls into a character class
832705d4
JH
5181
5182 The n is the ANYOF regnode, the p is the target string, lenp
5183 is pointer to the maximum length of how far to go in the p
5184 (if the lenp is zero, UTF8SKIP(p) is used),
5185 do_utf8 tells whether the target string is in UTF-8.
5186
bbce6d69 5187 */
5188
76e3520e 5189STATIC bool
32fc9b6a 5190S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5191{
27da23d5 5192 dVAR;
a3b680e6 5193 const char flags = ANYOF_FLAGS(n);
bbce6d69 5194 bool match = FALSE;
cc07378b 5195 UV c = *p;
ae9ddab8 5196 STRLEN len = 0;
9e55ce06 5197 STRLEN plen;
1aa99e6b 5198
19f67299
TS
5199 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5200 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5201 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5202 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5203 if (len == (STRLEN)-1)
5204 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5205 }
bbce6d69 5206
0f0076b4 5207 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5208 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5209 if (lenp)
5210 *lenp = 0;
ffc61ed2 5211 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5212 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5213 match = TRUE;
bbce6d69 5214 }
3568d838 5215 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5216 match = TRUE;
ffc61ed2 5217 if (!match) {
9e55ce06 5218 AV *av;
32fc9b6a 5219 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5220
5221 if (sw) {
3568d838 5222 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5223 match = TRUE;
5224 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5225 if (!match && lenp && av) {
5226 I32 i;
9e55ce06 5227 for (i = 0; i <= av_len(av); i++) {
890ce7af 5228 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5229 STRLEN len;
890ce7af 5230 const char * const s = SvPV_const(sv, len);
9e55ce06 5231
061b10df 5232 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5233 *lenp = len;
5234 match = TRUE;
5235 break;
5236 }
5237 }
5238 }
5239 if (!match) {
89ebb4a3 5240 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5241 STRLEN tmplen;
5242
9e55ce06
JH
5243 to_utf8_fold(p, tmpbuf, &tmplen);
5244 if (swash_fetch(sw, tmpbuf, do_utf8))
5245 match = TRUE;
5246 }
ffc61ed2
JH
5247 }
5248 }
bbce6d69 5249 }
9e55ce06 5250 if (match && lenp && *lenp == 0)
0f0076b4 5251 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5252 }
1aa99e6b 5253 if (!match && c < 256) {
ffc61ed2
JH
5254 if (ANYOF_BITMAP_TEST(n, c))
5255 match = TRUE;
5256 else if (flags & ANYOF_FOLD) {
eb160463 5257 U8 f;
a0ed51b3 5258
ffc61ed2
JH
5259 if (flags & ANYOF_LOCALE) {
5260 PL_reg_flags |= RF_tainted;
5261 f = PL_fold_locale[c];
5262 }
5263 else
5264 f = PL_fold[c];
5265 if (f != c && ANYOF_BITMAP_TEST(n, f))
5266 match = TRUE;
5267 }
5268
5269 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5270 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5271 if (
5272 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5274 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5275 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5276 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5281 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5282 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5283 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5284 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5285 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5286 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5287 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5288 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5289 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5290 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5291 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5292 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5293 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5294 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5295 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5296 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5297 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5298 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5299 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5300 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5301 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5302 ) /* How's that for a conditional? */
5303 {
5304 match = TRUE;
5305 }
a0ed51b3 5306 }
a0ed51b3
LW
5307 }
5308
a0ed51b3
LW
5309 return (flags & ANYOF_INVERT) ? !match : match;
5310}
161b471a 5311
dfe13c55 5312STATIC U8 *
0ce71af7 5313S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5314{
97aff369 5315 dVAR;
a0ed51b3 5316 if (off >= 0) {
1aa99e6b 5317 while (off-- && s < lim) {
ffc61ed2 5318 /* XXX could check well-formedness here */
a0ed51b3 5319 s += UTF8SKIP(s);
ffc61ed2 5320 }
a0ed51b3
LW
5321 }
5322 else {
5323 while (off++) {
1aa99e6b 5324 if (s > lim) {
a0ed51b3 5325 s--;
ffc61ed2 5326 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5327 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5328 s--;
ffc61ed2
JH
5329 }
5330 /* XXX could check well-formedness here */
a0ed51b3
LW
5331 }
5332 }
5333 }
5334 return s;
5335}
161b471a 5336
dfe13c55 5337STATIC U8 *
0ce71af7 5338S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5339{
97aff369 5340 dVAR;
a0ed51b3 5341 if (off >= 0) {
1aa99e6b 5342 while (off-- && s < lim) {
ffc61ed2 5343 /* XXX could check well-formedness here */
a0ed51b3 5344 s += UTF8SKIP(s);
ffc61ed2 5345 }
a0ed51b3 5346 if (off >= 0)
3dab1dad 5347 return NULL;
a0ed51b3
LW
5348 }
5349 else {
5350 while (off++) {
1aa99e6b 5351 if (s > lim) {
a0ed51b3 5352 s--;
ffc61ed2 5353 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5354 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5355 s--;
ffc61ed2
JH
5356 }
5357 /* XXX could check well-formedness here */
a0ed51b3
LW
5358 }
5359 else
5360 break;
5361 }
5362 if (off <= 0)
3dab1dad 5363 return NULL;
a0ed51b3
LW
5364 }
5365 return s;
5366}
51371543 5367
51371543 5368static void
acfe0abc 5369restore_pos(pTHX_ void *arg)
51371543 5370{
97aff369 5371 dVAR;
097eb12c 5372 regexp * const rex = (regexp *)arg;
51371543
GS
5373 if (PL_reg_eval_set) {
5374 if (PL_reg_oldsaved) {
4f639d21
DM
5375 rex->subbeg = PL_reg_oldsaved;
5376 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5377#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5378 rex->saved_copy = PL_nrs;
ed252734 5379#endif
4f639d21 5380 RX_MATCH_COPIED_on(rex);
51371543
GS
5381 }
5382 PL_reg_magic->mg_len = PL_reg_oldpos;
5383 PL_reg_eval_set = 0;
5384 PL_curpm = PL_reg_oldcurpm;
5385 }
5386}
33b8afdf
JH
5387
5388STATIC void
5389S_to_utf8_substr(pTHX_ register regexp *prog)
5390{
33b8afdf 5391 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5392 SV* const sv = newSVsv(prog->float_substr);
5393 prog->float_utf8 = sv;
33b8afdf
JH
5394 sv_utf8_upgrade(sv);
5395 if (SvTAIL(prog->float_substr))
5396 SvTAIL_on(sv);
5397 if (prog->float_substr == prog->check_substr)
5398 prog->check_utf8 = sv;
5399 }
5400 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5401 SV* const sv = newSVsv(prog->anchored_substr);
5402 prog->anchored_utf8 = sv;
33b8afdf
JH
5403 sv_utf8_upgrade(sv);
5404 if (SvTAIL(prog->anchored_substr))
5405 SvTAIL_on(sv);
5406 if (prog->anchored_substr == prog->check_substr)
5407 prog->check_utf8 = sv;
5408 }
5409}
5410
5411STATIC void
5412S_to_byte_substr(pTHX_ register regexp *prog)
5413{
97aff369 5414 dVAR;
33b8afdf 5415 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5416 SV* sv = newSVsv(prog->float_utf8);
5417 prog->float_substr = sv;
33b8afdf
JH
5418 if (sv_utf8_downgrade(sv, TRUE)) {
5419 if (SvTAIL(prog->float_utf8))
5420 SvTAIL_on(sv);
5421 } else {
5422 SvREFCNT_dec(sv);
5423 prog->float_substr = sv = &PL_sv_undef;
5424 }
5425 if (prog->float_utf8 == prog->check_utf8)
5426 prog->check_substr = sv;
5427 }
5428 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5429 SV* sv = newSVsv(prog->anchored_utf8);
5430 prog->anchored_substr = sv;
33b8afdf
JH
5431 if (sv_utf8_downgrade(sv, TRUE)) {
5432 if (SvTAIL(prog->anchored_utf8))
5433 SvTAIL_on(sv);
5434 } else {
5435 SvREFCNT_dec(sv);
5436 prog->anchored_substr = sv = &PL_sv_undef;
5437 }
5438 if (prog->anchored_utf8 == prog->check_utf8)
5439 prog->check_substr = sv;
5440 }
5441}
66610fdd
RGS
5442
5443/*
5444 * Local variables:
5445 * c-indentation-style: bsd
5446 * c-basic-offset: 4
5447 * indent-tabs-mode: t
5448 * End:
5449 *
37442d52
RGS
5450 * ex: set ts=8 sts=4 sw=4 noet:
5451 */