This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Implement handling of state variables in list assignment
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
a687059c 274/*
e50aee73 275 * pregexec and friends
a687059c
LW
276 */
277
76234dfb 278#ifndef PERL_IN_XSUB_RE
a687059c 279/*
c277df42 280 - pregexec - match a regexp against a string
a687059c 281 */
c277df42 282I32
864dbfa3 283Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 284 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
285/* strend: pointer to null at end of string */
286/* strbeg: real beginning of string */
287/* minend: end of match must be >=minend after stringarg. */
288/* nosave: For optimizations. */
289{
290 return
9041c2e3 291 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
292 nosave ? 0 : REXEC_COPY_STR);
293}
76234dfb 294#endif
22e551b9 295
9041c2e3 296/*
cad2e5aa
JH
297 * Need to implement the following flags for reg_anch:
298 *
299 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * USE_INTUIT_ML
301 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
302 * INTUIT_AUTORITATIVE_ML
303 * INTUIT_ONCE_NOML - Intuit can match in one location only.
304 * INTUIT_ONCE_ML
305 *
306 * Another flag for this function: SECOND_TIME (so that float substrs
307 * with giant delta may be not rechecked).
308 */
309
310/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311
3f7c398e 312/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
313 Otherwise, only SvCUR(sv) is used to get strbeg. */
314
315/* XXXX We assume that strpos is strbeg unless sv. */
316
6eb5f6b9
JH
317/* XXXX Some places assume that there is a fixed substring.
318 An update may be needed if optimizer marks as "INTUITable"
319 RExen without fixed substrings. Similarly, it is assumed that
320 lengths of all the strings are no more than minlen, thus they
321 cannot come from lookahead.
322 (Or minlen should take into account lookahead.) */
323
2c2d71f5
JH
324/* A failure to find a constant substring means that there is no need to make
325 an expensive call to REx engine, thus we celebrate a failure. Similarly,
326 finding a substring too deep into the string means that less calls to
30944b6d
IZ
327 regtry() should be needed.
328
329 REx compiler's optimizer found 4 possible hints:
330 a) Anchored substring;
331 b) Fixed substring;
332 c) Whether we are anchored (beginning-of-line or \G);
333 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 334 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
335 string which does not contradict any of them.
336 */
2c2d71f5 337
6eb5f6b9
JH
338/* Most of decisions we do here should have been done at compile time.
339 The nodes of the REx which we used for the search should have been
340 deleted from the finite automaton. */
341
cad2e5aa
JH
342char *
343Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
344 char *strend, U32 flags, re_scream_pos_data *data)
345{
97aff369 346 dVAR;
b7953727 347 register I32 start_shift = 0;
cad2e5aa 348 /* Should be nonnegative! */
b7953727 349 register I32 end_shift = 0;
2c2d71f5
JH
350 register char *s;
351 register SV *check;
a1933d95 352 char *strbeg;
cad2e5aa 353 char *t;
a3b680e6 354 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 355 I32 ml_anch;
bd61b366
SS
356 register char *other_last = NULL; /* other substr checked before this */
357 char *check_at = NULL; /* check substr found at this pos */
1df70142 358 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 359#ifdef DEBUGGING
890ce7af
AL
360 const char * const i_strpos = strpos;
361 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 362#endif
a3621e74
YO
363
364 GET_RE_DEBUG_FLAGS_DECL;
365
a30b2f1f 366 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 367
b8d68ded 368 if (prog->reganch & ROPT_UTF8) {
a3621e74 369 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
370 "UTF-8 regex...\n"));
371 PL_reg_flags |= RF_utf8;
372 }
373
a3621e74 374 DEBUG_EXECUTE_r({
1df70142 375 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
376 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
377 strpos;
1df70142 378 const int len = PL_reg_match_utf8 ?
bb7a0f54 379 (int)strlen(s) : strend - strpos;
2a782b5b
JH
380 if (!PL_colorset)
381 reginitcolors();
b8d68ded 382 if (PL_reg_match_utf8)
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 384 "UTF-8 target...\n"));
2a782b5b 385 PerlIO_printf(Perl_debug_log,
a0288114 386 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 387 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
388 prog->precomp,
389 PL_colors[1],
390 (strlen(prog->precomp) > 60 ? "..." : ""),
391 PL_colors[0],
392 (int)(len > 60 ? 60 : len),
393 s, PL_colors[1],
394 (len > 60 ? "..." : "")
395 );
396 });
cad2e5aa 397
c344f387
JH
398 /* CHR_DIST() would be more correct here but it makes things slow. */
399 if (prog->minlen > strend - strpos) {
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 401 "String too short... [re_intuit_start]\n"));
cad2e5aa 402 goto fail;
2c2d71f5 403 }
a1933d95 404 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 405 PL_regeol = strend;
33b8afdf
JH
406 if (do_utf8) {
407 if (!prog->check_utf8 && prog->check_substr)
408 to_utf8_substr(prog);
409 check = prog->check_utf8;
410 } else {
411 if (!prog->check_substr && prog->check_utf8)
412 to_byte_substr(prog);
413 check = prog->check_substr;
414 }
415 if (check == &PL_sv_undef) {
a3621e74 416 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
417 "Non-utf string cannot match utf check string\n"));
418 goto fail;
419 }
2c2d71f5 420 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
421 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
422 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 423 && !multiline ) ); /* Check after \n? */
cad2e5aa 424
7e25d62c
JH
425 if (!ml_anch) {
426 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
427 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 428 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
429 && sv && !SvROK(sv)
430 && (strpos != strbeg)) {
a3621e74 431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
432 goto fail;
433 }
434 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 435 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 436 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
437 I32 slen;
438
1aa99e6b 439 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
440 if (SvTAIL(check)) {
441 slen = SvCUR(check); /* >= 1 */
cad2e5aa 442
9041c2e3 443 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 444 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 446 goto fail_finish;
cad2e5aa
JH
447 }
448 /* Now should match s[0..slen-2] */
449 slen--;
3f7c398e 450 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 451 || (slen > 1
3f7c398e 452 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 453 report_neq:
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
455 goto fail_finish;
456 }
cad2e5aa 457 }
3f7c398e 458 else if (*SvPVX_const(check) != *s
653099ff 459 || ((slen = SvCUR(check)) > 1
3f7c398e 460 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 461 goto report_neq;
c315bfe8 462 check_at = s;
2c2d71f5 463 goto success_at_start;
7e25d62c 464 }
cad2e5aa 465 }
2c2d71f5 466 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 467 s = strpos;
2c2d71f5 468 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 469 end_shift = prog->minlen - start_shift -
653099ff 470 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 471 if (!ml_anch) {
a3b680e6 472 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 473 - (SvTAIL(check) != 0);
a3b680e6 474 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
475
476 if (end_shift < eshift)
477 end_shift = eshift;
478 }
cad2e5aa 479 }
2c2d71f5 480 else { /* Can match at random position */
cad2e5aa
JH
481 ml_anch = 0;
482 s = strpos;
2c2d71f5
JH
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 /* Should be nonnegative! */
485 end_shift = prog->minlen - start_shift -
653099ff 486 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
487 }
488
2c2d71f5 489#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 490 if (end_shift < 0)
6bbae5e6 491 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
492#endif
493
2c2d71f5
JH
494 restart:
495 /* Find a possible match in the region s..strend by looking for
496 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 497 if (flags & REXEC_SCREAM) {
cad2e5aa 498 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 499 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 500
2c2d71f5
JH
501 if (PL_screamfirst[BmRARE(check)] >= 0
502 || ( BmRARE(check) == '\n'
503 && (BmPREVIOUS(check) == SvCUR(check) - 1)
504 && SvTAIL(check) ))
9041c2e3 505 s = screaminstr(sv, check,
2c2d71f5 506 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 507 else
2c2d71f5 508 goto fail_finish;
4addbd3b
HS
509 /* we may be pointing at the wrong string */
510 if (s && RX_MATCH_COPIED(prog))
3f7c398e 511 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
512 if (data)
513 *data->scream_olds = s;
514 }
f33976b4 515 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
516 s = fbm_instr((U8*)(s + start_shift),
517 (U8*)(strend - end_shift),
7fba1cd6 518 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 519 else
1aa99e6b
IH
520 s = fbm_instr(HOP3(s, start_shift, strend),
521 HOP3(strend, -end_shift, strbeg),
7fba1cd6 522 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
523
524 /* Update the count-of-usability, remove useless subpatterns,
525 unshift s. */
2c2d71f5 526
a0288114 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 528 (s ? "Found" : "Did not find"),
33b8afdf 529 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 530 PL_colors[0],
7b0972df 531 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 532 SvPVX_const(check),
2c2d71f5
JH
533 PL_colors[1], (SvTAIL(check) ? "$" : ""),
534 (s ? " at offset " : "...\n") ) );
535
536 if (!s)
537 goto fail_finish;
538
6eb5f6b9
JH
539 check_at = s;
540
2c2d71f5 541 /* Finish the diagnostic message */
a3621e74 542 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
543
544 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
545 Start with the other substr.
546 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 547 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
548 *always* match. Probably should be marked during compile...
549 Probably it is right to do no SCREAM here...
550 */
551
33b8afdf 552 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 553 /* Take into account the "other" substring. */
2c2d71f5
JH
554 /* XXXX May be hopelessly wrong for UTF... */
555 if (!other_last)
6eb5f6b9 556 other_last = strpos;
33b8afdf 557 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
558 do_other_anchored:
559 {
890ce7af
AL
560 char * const last = HOP3c(s, -start_shift, strbeg);
561 char *last1, *last2;
be8e71aa 562 char * const saved_s = s;
33b8afdf 563 SV* must;
2c2d71f5 564
2c2d71f5
JH
565 t = s - prog->check_offset_max;
566 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 567 && (!do_utf8
0ce71af7 568 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 569 && t > strpos)))
6f207bd3 570 NOOP;
2c2d71f5
JH
571 else
572 t = strpos;
1aa99e6b 573 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
574 if (t < other_last) /* These positions already checked */
575 t = other_last;
1aa99e6b 576 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
577 if (last < last1)
578 last1 = last;
579 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
580 /* On end-of-str: see comment below. */
33b8afdf
JH
581 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
582 if (must == &PL_sv_undef) {
583 s = (char*)NULL;
a3621e74 584 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
585 }
586 else
587 s = fbm_instr(
588 (unsigned char*)t,
589 HOP3(HOP3(last1, prog->anchored_offset, strend)
590 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
591 must,
7fba1cd6 592 multiline ? FBMrf_MULTILINE : 0
33b8afdf 593 );
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 595 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
596 (s ? "Found" : "Contradicts"),
597 PL_colors[0],
33b8afdf
JH
598 (int)(SvCUR(must)
599 - (SvTAIL(must)!=0)),
3f7c398e 600 SvPVX_const(must),
33b8afdf 601 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
602 if (!s) {
603 if (last1 >= last2) {
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
605 ", giving up...\n"));
606 goto fail_finish;
607 }
a3621e74 608 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 609 ", trying floating at offset %ld...\n",
be8e71aa 610 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
611 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
612 s = HOP3c(last, 1, strend);
2c2d71f5
JH
613 goto restart;
614 }
615 else {
a3621e74 616 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 617 (long)(s - i_strpos)));
1aa99e6b
IH
618 t = HOP3c(s, -prog->anchored_offset, strbeg);
619 other_last = HOP3c(s, 1, strend);
be8e71aa 620 s = saved_s;
2c2d71f5
JH
621 if (t == strpos)
622 goto try_at_start;
2c2d71f5
JH
623 goto try_at_offset;
624 }
30944b6d 625 }
2c2d71f5
JH
626 }
627 else { /* Take into account the floating substring. */
33b8afdf 628 char *last, *last1;
be8e71aa 629 char * const saved_s = s;
33b8afdf
JH
630 SV* must;
631
632 t = HOP3c(s, -start_shift, strbeg);
633 last1 = last =
634 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
635 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
636 last = HOP3c(t, prog->float_max_offset, strend);
637 s = HOP3c(t, prog->float_min_offset, strend);
638 if (s < other_last)
639 s = other_last;
2c2d71f5 640 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
641 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
642 /* fbm_instr() takes into account exact value of end-of-str
643 if the check is SvTAIL(ed). Since false positives are OK,
644 and end-of-str is not later than strend we are OK. */
645 if (must == &PL_sv_undef) {
646 s = (char*)NULL;
a3621e74 647 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
648 }
649 else
2c2d71f5 650 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
651 (unsigned char*)last + SvCUR(must)
652 - (SvTAIL(must)!=0),
7fba1cd6 653 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
655 (s ? "Found" : "Contradicts"),
656 PL_colors[0],
657 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf
JH
659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
660 if (!s) {
661 if (last1 == last) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
663 ", giving up...\n"));
664 goto fail_finish;
2c2d71f5 665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 667 ", trying anchored starting at offset %ld...\n",
be8e71aa 668 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
669 other_last = last;
670 s = HOP3c(t, 1, strend);
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
675 (long)(s - i_strpos)));
676 other_last = s; /* Fix this later. --Hugo */
be8e71aa 677 s = saved_s;
33b8afdf
JH
678 if (t == strpos)
679 goto try_at_start;
680 goto try_at_offset;
681 }
2c2d71f5 682 }
cad2e5aa 683 }
2c2d71f5
JH
684
685 t = s - prog->check_offset_max;
2c2d71f5 686 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 687 && (!do_utf8
0ce71af7 688 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 689 && t > strpos))) {
2c2d71f5
JH
690 /* Fixed substring is found far enough so that the match
691 cannot start at strpos. */
692 try_at_offset:
cad2e5aa 693 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
694 /* Eventually fbm_*() should handle this, but often
695 anchored_offset is not 0, so this check will not be wasted. */
696 /* XXXX In the code below we prefer to look for "^" even in
697 presence of anchored substrings. And we search even
698 beyond the found float position. These pessimizations
699 are historical artefacts only. */
700 find_anchor:
2c2d71f5 701 while (t < strend - prog->minlen) {
cad2e5aa 702 if (*t == '\n') {
4ee3650e 703 if (t < check_at - prog->check_offset_min) {
33b8afdf 704 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
705 /* Since we moved from the found position,
706 we definitely contradict the found anchored
30944b6d
IZ
707 substr. Due to the above check we do not
708 contradict "check" substr.
709 Thus we can arrive here only if check substr
710 is float. Redo checking for "other"=="fixed".
711 */
9041c2e3 712 strpos = t + 1;
a3621e74 713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 714 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
715 goto do_other_anchored;
716 }
4ee3650e
GS
717 /* We don't contradict the found floating substring. */
718 /* XXXX Why not check for STCLASS? */
cad2e5aa 719 s = t + 1;
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 721 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
722 goto set_useful;
723 }
4ee3650e
GS
724 /* Position contradicts check-string */
725 /* XXXX probably better to look for check-string
726 than for "\n", so one should lower the limit for t? */
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 728 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 729 other_last = strpos = s = t + 1;
cad2e5aa
JH
730 goto restart;
731 }
732 t++;
733 }
a3621e74 734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 735 PL_colors[0], PL_colors[1]));
2c2d71f5 736 goto fail_finish;
cad2e5aa 737 }
f5952150 738 else {
a3621e74 739 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 740 PL_colors[0], PL_colors[1]));
f5952150 741 }
cad2e5aa
JH
742 s = t;
743 set_useful:
33b8afdf 744 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
745 }
746 else {
f5952150 747 /* The found string does not prohibit matching at strpos,
2c2d71f5 748 - no optimization of calling REx engine can be performed,
f5952150
GS
749 unless it was an MBOL and we are not after MBOL,
750 or a future STCLASS check will fail this. */
2c2d71f5
JH
751 try_at_start:
752 /* Even in this situation we may use MBOL flag if strpos is offset
753 wrt the start of the string. */
05b4157f 754 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 755 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
756 /* May be due to an implicit anchor of m{.*foo} */
757 && !(prog->reganch & ROPT_IMPLICIT))
758 {
cad2e5aa
JH
759 t = strpos;
760 goto find_anchor;
761 }
a3621e74 762 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 763 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 764 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 765 );
2c2d71f5 766 success_at_start:
30944b6d 767 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
768 && (do_utf8 ? (
769 prog->check_utf8 /* Could be deleted already */
770 && --BmUSEFUL(prog->check_utf8) < 0
771 && (prog->check_utf8 == prog->float_utf8)
772 ) : (
773 prog->check_substr /* Could be deleted already */
774 && --BmUSEFUL(prog->check_substr) < 0
775 && (prog->check_substr == prog->float_substr)
776 )))
66e933ab 777 {
cad2e5aa 778 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
780 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
781 if (do_utf8 ? prog->check_substr : prog->check_utf8)
782 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
783 prog->check_substr = prog->check_utf8 = NULL; /* disable */
784 prog->float_substr = prog->float_utf8 = NULL; /* clear */
785 check = NULL; /* abort */
cad2e5aa 786 s = strpos;
3cf5c195
IZ
787 /* XXXX This is a remnant of the old implementation. It
788 looks wasteful, since now INTUIT can use many
6eb5f6b9 789 other heuristics. */
cad2e5aa
JH
790 prog->reganch &= ~RE_USE_INTUIT;
791 }
792 else
793 s = strpos;
794 }
795
6eb5f6b9
JH
796 /* Last resort... */
797 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 798 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
799 /* minlen == 0 is possible if regstclass is \b or \B,
800 and the fixed substr is ''$.
801 Since minlen is already taken into account, s+1 is before strend;
802 accidentally, minlen >= 1 guaranties no false positives at s + 1
803 even for \b or \B. But (minlen? 1 : 0) below assumes that
804 regstclass does not come from lookahead... */
805 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
806 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 807 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 808 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 809 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 810 : 1);
07be1b83 811 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 812 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 813 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
814 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
815 cl_l, strend)
816 : strend);
07be1b83
YO
817 /*if (OP(prog->regstclass) == TRIE)
818 endpos++;*/
6eb5f6b9 819 t = s;
3b0527fe 820 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
821 if (!s) {
822#ifdef DEBUGGING
cbbf8932 823 const char *what = NULL;
6eb5f6b9
JH
824#endif
825 if (endpos == strend) {
a3621e74 826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
827 "Could not match STCLASS...\n") );
828 goto fail;
829 }
a3621e74 830 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 831 "This position contradicts STCLASS...\n") );
653099ff
GS
832 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
833 goto fail;
6eb5f6b9 834 /* Contradict one of substrings */
33b8afdf
JH
835 if (prog->anchored_substr || prog->anchored_utf8) {
836 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 837 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 838 hop_and_restart:
1aa99e6b 839 s = HOP3c(t, 1, strend);
66e933ab
GS
840 if (s + start_shift + end_shift > strend) {
841 /* XXXX Should be taken into account earlier? */
a3621e74 842 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
843 "Could not match STCLASS...\n") );
844 goto fail;
845 }
5e39e1e5
HS
846 if (!check)
847 goto giveup;
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 849 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
850 what, (long)(s + start_shift - i_strpos)) );
851 goto restart;
852 }
66e933ab 853 /* Have both, check_string is floating */
6eb5f6b9
JH
854 if (t + start_shift >= check_at) /* Contradicts floating=check */
855 goto retry_floating_check;
856 /* Recheck anchored substring, but not floating... */
9041c2e3 857 s = check_at;
5e39e1e5
HS
858 if (!check)
859 goto giveup;
a3621e74 860 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 861 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
862 (long)(other_last - i_strpos)) );
863 goto do_other_anchored;
864 }
60e71179
GS
865 /* Another way we could have checked stclass at the
866 current position only: */
867 if (ml_anch) {
868 s = t = t + 1;
5e39e1e5
HS
869 if (!check)
870 goto giveup;
a3621e74 871 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 872 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 873 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 874 goto try_at_offset;
66e933ab 875 }
33b8afdf 876 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 877 goto fail;
6eb5f6b9
JH
878 /* Check is floating subtring. */
879 retry_floating_check:
880 t = check_at - start_shift;
a3621e74 881 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
882 goto hop_and_restart;
883 }
b7953727 884 if (t != s) {
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 886 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
887 (long)(t - i_strpos), (long)(s - i_strpos))
888 );
889 }
890 else {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
892 "Does not contradict STCLASS...\n");
893 );
894 }
6eb5f6b9 895 }
5e39e1e5 896 giveup:
a3621e74 897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
898 PL_colors[4], (check ? "Guessed" : "Giving up"),
899 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 900 return s;
2c2d71f5
JH
901
902 fail_finish: /* Substring not found */
33b8afdf
JH
903 if (prog->check_substr || prog->check_utf8) /* could be removed already */
904 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 905 fail:
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 907 PL_colors[4], PL_colors[5]));
bd61b366 908 return NULL;
cad2e5aa 909}
9661b544 910
6eb5f6b9 911/* We know what class REx starts with. Try to find this position... */
3b0527fe 912/* if reginfo is NULL, its a dryrun */
07be1b83
YO
913/* annoyingly all the vars in this routine have different names from their counterparts
914 in regmatch. /grrr */
3b0527fe 915
3c3eec57 916STATIC char *
07be1b83
YO
917S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
918 const char *strend, const regmatch_info *reginfo)
a687059c 919{
27da23d5 920 dVAR;
1df70142 921 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 922 char *m;
d8093b23 923 STRLEN ln;
5dab1207 924 STRLEN lnc;
078c425b 925 register STRLEN uskip;
d8093b23
G
926 unsigned int c1;
927 unsigned int c2;
6eb5f6b9
JH
928 char *e;
929 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 930 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 931
6eb5f6b9
JH
932 /* We know what class it must start with. */
933 switch (OP(c)) {
6eb5f6b9 934 case ANYOF:
388cc4de 935 if (do_utf8) {
078c425b 936 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
937 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
938 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
939 reginclass(prog, c, (U8*)s, 0, do_utf8) :
940 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 941 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
942 goto got_it;
943 else
944 tmp = doevery;
945 }
946 else
947 tmp = 1;
078c425b 948 s += uskip;
388cc4de
HS
949 }
950 }
951 else {
952 while (s < strend) {
953 STRLEN skip = 1;
954
32fc9b6a 955 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
956 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
957 /* The assignment of 2 is intentional:
958 * for the folded sharp s, the skip is 2. */
959 (skip = SHARP_S_SKIP))) {
3b0527fe 960 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
961 goto got_it;
962 else
963 tmp = doevery;
964 }
965 else
966 tmp = 1;
967 s += skip;
968 }
a0d0e21e 969 }
6eb5f6b9 970 break;
f33976b4
DB
971 case CANY:
972 while (s < strend) {
3b0527fe 973 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
974 goto got_it;
975 else
976 tmp = doevery;
977 s++;
978 }
979 break;
6eb5f6b9 980 case EXACTF:
5dab1207
NIS
981 m = STRING(c);
982 ln = STR_LEN(c); /* length to match in octets/bytes */
983 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 984 if (UTF) {
a2a2844f 985 STRLEN ulen1, ulen2;
5dab1207 986 U8 *sm = (U8 *) m;
89ebb4a3
JH
987 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
988 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 989 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
990
991 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
992 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
993
89ebb4a3 994 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 995 0, uniflags);
89ebb4a3 996 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 997 0, uniflags);
5dab1207
NIS
998 lnc = 0;
999 while (sm < ((U8 *) m + ln)) {
1000 lnc++;
1001 sm += UTF8SKIP(sm);
1002 }
1aa99e6b
IH
1003 }
1004 else {
1005 c1 = *(U8*)m;
1006 c2 = PL_fold[c1];
1007 }
6eb5f6b9
JH
1008 goto do_exactf;
1009 case EXACTFL:
5dab1207
NIS
1010 m = STRING(c);
1011 ln = STR_LEN(c);
1012 lnc = (I32) ln;
d8093b23 1013 c1 = *(U8*)m;
6eb5f6b9
JH
1014 c2 = PL_fold_locale[c1];
1015 do_exactf:
db12adc6 1016 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1017
3b0527fe 1018 if (!reginfo && e < s)
6eb5f6b9 1019 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1020
60a8b682
JH
1021 /* The idea in the EXACTF* cases is to first find the
1022 * first character of the EXACTF* node and then, if
1023 * necessary, case-insensitively compare the full
1024 * text of the node. The c1 and c2 are the first
1025 * characters (though in Unicode it gets a bit
1026 * more complicated because there are more cases
7f16dd3d
JH
1027 * than just upper and lower: one needs to use
1028 * the so-called folding case for case-insensitive
1029 * matching (called "loose matching" in Unicode).
1030 * ibcmp_utf8() will do just that. */
60a8b682 1031
1aa99e6b 1032 if (do_utf8) {
575cac57 1033 UV c, f;
89ebb4a3 1034 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1035 STRLEN len, foldlen;
4ad0818d 1036 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1037 if (c1 == c2) {
5dab1207
NIS
1038 /* Upper and lower of 1st char are equal -
1039 * probably not a "letter". */
1aa99e6b 1040 while (s <= e) {
89ebb4a3 1041 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1042 uniflags);
80aecb99
JH
1043 if ( c == c1
1044 && (ln == len ||
3dab1dad
YO
1045 ibcmp_utf8(s, NULL, 0, do_utf8,
1046 m, NULL, ln, (bool)UTF))
3b0527fe 1047 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1048 goto got_it;
80aecb99 1049 else {
1df70142 1050 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1051 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1052 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1053 if ( f != c
1054 && (f == c1 || f == c2)
1055 && (ln == foldlen ||
66423254 1056 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1057 NULL, foldlen, do_utf8,
d07ddd77 1058 m,
3dab1dad 1059 NULL, ln, (bool)UTF))
3b0527fe 1060 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1061 goto got_it;
1062 }
1aa99e6b
IH
1063 s += len;
1064 }
09091399
JH
1065 }
1066 else {
1aa99e6b 1067 while (s <= e) {
89ebb4a3 1068 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1069 uniflags);
80aecb99 1070
60a8b682 1071 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1072 * Note that not all the possible combinations
1073 * are handled here: some of them are handled
1074 * by the standard folding rules, and some of
1075 * them (the character class or ANYOF cases)
1076 * are handled during compiletime in
1077 * regexec.c:S_regclass(). */
880bd946
JH
1078 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1079 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1080 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1081
1082 if ( (c == c1 || c == c2)
1083 && (ln == len ||
3dab1dad
YO
1084 ibcmp_utf8(s, NULL, 0, do_utf8,
1085 m, NULL, ln, (bool)UTF))
3b0527fe 1086 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1087 goto got_it;
80aecb99 1088 else {
1df70142 1089 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1090 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1091 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1092 if ( f != c
1093 && (f == c1 || f == c2)
1094 && (ln == foldlen ||
a6872d42 1095 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1096 NULL, foldlen, do_utf8,
d07ddd77 1097 m,
3dab1dad 1098 NULL, ln, (bool)UTF))
3b0527fe 1099 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1100 goto got_it;
1101 }
1aa99e6b
IH
1102 s += len;
1103 }
09091399 1104 }
1aa99e6b
IH
1105 }
1106 else {
1107 if (c1 == c2)
1108 while (s <= e) {
1109 if ( *(U8*)s == c1
1110 && (ln == 1 || !(OP(c) == EXACTF
1111 ? ibcmp(s, m, ln)
1112 : ibcmp_locale(s, m, ln)))
3b0527fe 1113 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1114 goto got_it;
1115 s++;
1116 }
1117 else
1118 while (s <= e) {
1119 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
b3c9acc1
IZ
1127 }
1128 break;
bbce6d69 1129 case BOUNDL:
3280af22 1130 PL_reg_flags |= RF_tainted;
bbce6d69 1131 /* FALL THROUGH */
a0d0e21e 1132 case BOUND:
ffc61ed2 1133 if (do_utf8) {
12d33761 1134 if (s == PL_bostr)
ffc61ed2
JH
1135 tmp = '\n';
1136 else {
6136c704 1137 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1138 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1139 }
1140 tmp = ((OP(c) == BOUND ?
9041c2e3 1141 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1142 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1143 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1144 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1145 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1146 isALNUM_LC_utf8((U8*)s)))
1147 {
1148 tmp = !tmp;
3b0527fe 1149 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1150 goto got_it;
1151 }
078c425b 1152 s += uskip;
a687059c 1153 }
a0d0e21e 1154 }
667bb95a 1155 else {
12d33761 1156 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1157 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1158 while (s < strend) {
1159 if (tmp ==
1160 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1161 tmp = !tmp;
3b0527fe 1162 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1163 goto got_it;
1164 }
1165 s++;
a0ed51b3 1166 }
a0ed51b3 1167 }
3b0527fe 1168 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1169 goto got_it;
1170 break;
bbce6d69 1171 case NBOUNDL:
3280af22 1172 PL_reg_flags |= RF_tainted;
bbce6d69 1173 /* FALL THROUGH */
a0d0e21e 1174 case NBOUND:
ffc61ed2 1175 if (do_utf8) {
12d33761 1176 if (s == PL_bostr)
ffc61ed2
JH
1177 tmp = '\n';
1178 else {
6136c704 1179 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1180 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1181 }
1182 tmp = ((OP(c) == NBOUND ?
9041c2e3 1183 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1184 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1185 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1186 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1187 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1188 isALNUM_LC_utf8((U8*)s)))
1189 tmp = !tmp;
3b0527fe 1190 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1191 goto got_it;
078c425b 1192 s += uskip;
ffc61ed2 1193 }
a0d0e21e 1194 }
667bb95a 1195 else {
12d33761 1196 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1197 tmp = ((OP(c) == NBOUND ?
1198 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1199 while (s < strend) {
1200 if (tmp ==
1201 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1202 tmp = !tmp;
3b0527fe 1203 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1204 goto got_it;
1205 s++;
1206 }
a0ed51b3 1207 }
3b0527fe 1208 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1209 goto got_it;
1210 break;
a0d0e21e 1211 case ALNUM:
ffc61ed2 1212 if (do_utf8) {
1a4fad37 1213 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1214 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1215 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1216 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1217 goto got_it;
1218 else
1219 tmp = doevery;
1220 }
bbce6d69 1221 else
ffc61ed2 1222 tmp = 1;
078c425b 1223 s += uskip;
bbce6d69 1224 }
bbce6d69 1225 }
ffc61ed2
JH
1226 else {
1227 while (s < strend) {
1228 if (isALNUM(*s)) {
3b0527fe 1229 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1230 goto got_it;
1231 else
1232 tmp = doevery;
1233 }
a0ed51b3 1234 else
ffc61ed2
JH
1235 tmp = 1;
1236 s++;
a0ed51b3 1237 }
a0ed51b3
LW
1238 }
1239 break;
bbce6d69 1240 case ALNUML:
3280af22 1241 PL_reg_flags |= RF_tainted;
ffc61ed2 1242 if (do_utf8) {
078c425b 1243 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1244 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1245 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1246 goto got_it;
1247 else
1248 tmp = doevery;
1249 }
a687059c 1250 else
ffc61ed2 1251 tmp = 1;
078c425b 1252 s += uskip;
a0d0e21e 1253 }
a0d0e21e 1254 }
ffc61ed2
JH
1255 else {
1256 while (s < strend) {
1257 if (isALNUM_LC(*s)) {
3b0527fe 1258 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1259 goto got_it;
1260 else
1261 tmp = doevery;
1262 }
a0ed51b3 1263 else
ffc61ed2
JH
1264 tmp = 1;
1265 s++;
a0ed51b3 1266 }
a0ed51b3
LW
1267 }
1268 break;
a0d0e21e 1269 case NALNUM:
ffc61ed2 1270 if (do_utf8) {
1a4fad37 1271 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1272 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1273 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1274 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1275 goto got_it;
1276 else
1277 tmp = doevery;
1278 }
bbce6d69 1279 else
ffc61ed2 1280 tmp = 1;
078c425b 1281 s += uskip;
bbce6d69 1282 }
bbce6d69 1283 }
ffc61ed2
JH
1284 else {
1285 while (s < strend) {
1286 if (!isALNUM(*s)) {
3b0527fe 1287 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1288 goto got_it;
1289 else
1290 tmp = doevery;
1291 }
a0ed51b3 1292 else
ffc61ed2
JH
1293 tmp = 1;
1294 s++;
a0ed51b3 1295 }
a0ed51b3
LW
1296 }
1297 break;
bbce6d69 1298 case NALNUML:
3280af22 1299 PL_reg_flags |= RF_tainted;
ffc61ed2 1300 if (do_utf8) {
078c425b 1301 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1302 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1303 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1304 goto got_it;
1305 else
1306 tmp = doevery;
1307 }
a687059c 1308 else
ffc61ed2 1309 tmp = 1;
078c425b 1310 s += uskip;
a687059c 1311 }
a0d0e21e 1312 }
ffc61ed2
JH
1313 else {
1314 while (s < strend) {
1315 if (!isALNUM_LC(*s)) {
3b0527fe 1316 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1317 goto got_it;
1318 else
1319 tmp = doevery;
1320 }
a0ed51b3 1321 else
ffc61ed2
JH
1322 tmp = 1;
1323 s++;
a0ed51b3 1324 }
a0ed51b3
LW
1325 }
1326 break;
a0d0e21e 1327 case SPACE:
ffc61ed2 1328 if (do_utf8) {
1a4fad37 1329 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1330 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1331 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1332 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1333 goto got_it;
1334 else
1335 tmp = doevery;
1336 }
a0d0e21e 1337 else
ffc61ed2 1338 tmp = 1;
078c425b 1339 s += uskip;
2304df62 1340 }
a0d0e21e 1341 }
ffc61ed2
JH
1342 else {
1343 while (s < strend) {
1344 if (isSPACE(*s)) {
3b0527fe 1345 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1346 goto got_it;
1347 else
1348 tmp = doevery;
1349 }
a0ed51b3 1350 else
ffc61ed2
JH
1351 tmp = 1;
1352 s++;
a0ed51b3 1353 }
a0ed51b3
LW
1354 }
1355 break;
bbce6d69 1356 case SPACEL:
3280af22 1357 PL_reg_flags |= RF_tainted;
ffc61ed2 1358 if (do_utf8) {
078c425b 1359 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1360 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1361 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1362 goto got_it;
1363 else
1364 tmp = doevery;
1365 }
bbce6d69 1366 else
ffc61ed2 1367 tmp = 1;
078c425b 1368 s += uskip;
bbce6d69 1369 }
bbce6d69 1370 }
ffc61ed2
JH
1371 else {
1372 while (s < strend) {
1373 if (isSPACE_LC(*s)) {
3b0527fe 1374 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1375 goto got_it;
1376 else
1377 tmp = doevery;
1378 }
a0ed51b3 1379 else
ffc61ed2
JH
1380 tmp = 1;
1381 s++;
a0ed51b3 1382 }
a0ed51b3
LW
1383 }
1384 break;
a0d0e21e 1385 case NSPACE:
ffc61ed2 1386 if (do_utf8) {
1a4fad37 1387 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1388 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1389 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1390 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1391 goto got_it;
1392 else
1393 tmp = doevery;
1394 }
a0d0e21e 1395 else
ffc61ed2 1396 tmp = 1;
078c425b 1397 s += uskip;
a687059c 1398 }
a0d0e21e 1399 }
ffc61ed2
JH
1400 else {
1401 while (s < strend) {
1402 if (!isSPACE(*s)) {
3b0527fe 1403 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1404 goto got_it;
1405 else
1406 tmp = doevery;
1407 }
a0ed51b3 1408 else
ffc61ed2
JH
1409 tmp = 1;
1410 s++;
a0ed51b3 1411 }
a0ed51b3
LW
1412 }
1413 break;
bbce6d69 1414 case NSPACEL:
3280af22 1415 PL_reg_flags |= RF_tainted;
ffc61ed2 1416 if (do_utf8) {
078c425b 1417 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1418 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1419 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1420 goto got_it;
1421 else
1422 tmp = doevery;
1423 }
bbce6d69 1424 else
ffc61ed2 1425 tmp = 1;
078c425b 1426 s += uskip;
bbce6d69 1427 }
bbce6d69 1428 }
ffc61ed2
JH
1429 else {
1430 while (s < strend) {
1431 if (!isSPACE_LC(*s)) {
3b0527fe 1432 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1433 goto got_it;
1434 else
1435 tmp = doevery;
1436 }
a0ed51b3 1437 else
ffc61ed2
JH
1438 tmp = 1;
1439 s++;
a0ed51b3 1440 }
a0ed51b3
LW
1441 }
1442 break;
a0d0e21e 1443 case DIGIT:
ffc61ed2 1444 if (do_utf8) {
1a4fad37 1445 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1446 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1447 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1448 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1449 goto got_it;
1450 else
1451 tmp = doevery;
1452 }
a0d0e21e 1453 else
ffc61ed2 1454 tmp = 1;
078c425b 1455 s += uskip;
2b69d0c2 1456 }
a0d0e21e 1457 }
ffc61ed2
JH
1458 else {
1459 while (s < strend) {
1460 if (isDIGIT(*s)) {
3b0527fe 1461 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1462 goto got_it;
1463 else
1464 tmp = doevery;
1465 }
a0ed51b3 1466 else
ffc61ed2
JH
1467 tmp = 1;
1468 s++;
a0ed51b3 1469 }
a0ed51b3
LW
1470 }
1471 break;
b8c5462f
JH
1472 case DIGITL:
1473 PL_reg_flags |= RF_tainted;
ffc61ed2 1474 if (do_utf8) {
078c425b 1475 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1476 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1477 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1478 goto got_it;
1479 else
1480 tmp = doevery;
1481 }
b8c5462f 1482 else
ffc61ed2 1483 tmp = 1;
078c425b 1484 s += uskip;
b8c5462f 1485 }
b8c5462f 1486 }
ffc61ed2
JH
1487 else {
1488 while (s < strend) {
1489 if (isDIGIT_LC(*s)) {
3b0527fe 1490 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1491 goto got_it;
1492 else
1493 tmp = doevery;
1494 }
b8c5462f 1495 else
ffc61ed2
JH
1496 tmp = 1;
1497 s++;
b8c5462f 1498 }
b8c5462f
JH
1499 }
1500 break;
a0d0e21e 1501 case NDIGIT:
ffc61ed2 1502 if (do_utf8) {
1a4fad37 1503 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1504 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1505 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1506 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1507 goto got_it;
1508 else
1509 tmp = doevery;
1510 }
a0d0e21e 1511 else
ffc61ed2 1512 tmp = 1;
078c425b 1513 s += uskip;
a687059c 1514 }
a0d0e21e 1515 }
ffc61ed2
JH
1516 else {
1517 while (s < strend) {
1518 if (!isDIGIT(*s)) {
3b0527fe 1519 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1520 goto got_it;
1521 else
1522 tmp = doevery;
1523 }
a0ed51b3 1524 else
ffc61ed2
JH
1525 tmp = 1;
1526 s++;
a0ed51b3 1527 }
a0ed51b3
LW
1528 }
1529 break;
b8c5462f
JH
1530 case NDIGITL:
1531 PL_reg_flags |= RF_tainted;
ffc61ed2 1532 if (do_utf8) {
078c425b 1533 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1534 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1535 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1536 goto got_it;
1537 else
1538 tmp = doevery;
1539 }
b8c5462f 1540 else
ffc61ed2 1541 tmp = 1;
078c425b 1542 s += uskip;
b8c5462f 1543 }
a0ed51b3 1544 }
ffc61ed2
JH
1545 else {
1546 while (s < strend) {
1547 if (!isDIGIT_LC(*s)) {
3b0527fe 1548 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1549 goto got_it;
1550 else
1551 tmp = doevery;
1552 }
cf93c79d 1553 else
ffc61ed2
JH
1554 tmp = 1;
1555 s++;
b8c5462f 1556 }
b8c5462f
JH
1557 }
1558 break;
07be1b83
YO
1559 case TRIE:
1560 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1561 {
1562 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1563 trie_type = do_utf8 ?
1564 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1565 : trie_plain;
1566 /* what trie are we using right now */
1567 reg_ac_data *aho
1568 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1569 reg_trie_data *trie=aho->trie;
1570
1571 const char *last_start = strend - trie->minlen;
1572 const char *real_start = s;
1573 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1574 SV *sv_points;
1575 U8 **points; /* map of where we were in the input string
1576 when reading a given string. For ASCII this
1577 is unnecessary overhead as the relationship
1578 is always 1:1, but for unicode, especially
1579 case folded unicode this is not true. */
f9e705e8 1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1581
1582 GET_RE_DEBUG_FLAGS_DECL;
1583
be8e71aa
YO
1584 /* We can't just allocate points here. We need to wrap it in
1585 * an SV so it gets freed properly if there is a croak while
1586 * running the match */
1587 ENTER;
1588 SAVETMPS;
1589 sv_points=newSV(maxlen * sizeof(U8 *));
1590 SvCUR_set(sv_points,
1591 maxlen * sizeof(U8 *));
1592 SvPOK_on(sv_points);
1593 sv_2mortal(sv_points);
1594 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1595
1596 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1597 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1598 s++;
1599 }
1600 }
1601
1602 while (s <= last_start) {
1603 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1604 U8 *uc = (U8*)s;
1605 U16 charid = 0;
1606 U32 base = 1;
1607 U32 state = 1;
1608 UV uvc = 0;
1609 STRLEN len = 0;
1610 STRLEN foldlen = 0;
1611 U8 *uscan = (U8*)NULL;
1612 U8 *leftmost = NULL;
1613
1614 U32 pointpos = 0;
1615
1616 while ( state && uc <= (U8*)strend ) {
1617 int failed=0;
1618 if (aho->states[ state ].wordnum) {
1619 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1620 if (!leftmost || lpos < leftmost)
1621 leftmost= lpos;
1622 if (base==0) break;
1623 }
1624 points[pointpos++ % maxlen]= uc;
1625 switch (trie_type) {
1626 case trie_utf8_fold:
1627 if ( foldlen>0 ) {
1628 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1629 foldlen -= len;
1630 uscan += len;
1631 len=0;
1632 } else {
07be1b83
YO
1633 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1634 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1635 foldlen -= UNISKIP( uvc );
1636 uscan = foldbuf + UNISKIP( uvc );
1637 }
1638 break;
1639 case trie_utf8:
1640 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1641 &len, uniflags );
1642 break;
1643 case trie_plain:
1644 uvc = (UV)*uc;
1645 len = 1;
1646 }
1647
1648 if (uvc < 256) {
1649 charid = trie->charmap[ uvc ];
1650 }
1651 else {
1652 charid = 0;
1653 if (trie->widecharmap) {
1654 SV** const svpp = hv_fetch(trie->widecharmap,
1655 (char*)&uvc, sizeof(UV), 0);
1656 if (svpp)
1657 charid = (U16)SvIV(*svpp);
1658 }
1659 }
1660 DEBUG_TRIE_EXECUTE_r(
1661 PerlIO_printf(Perl_debug_log,
1662 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1663 (int)((const char*)uc - real_start), charid, uvc)
1664 );
1665 uc += len;
1666
1667 do {
1668 U32 word = aho->states[ state ].wordnum;
1669 base = aho->states[ state ].trans.base;
1670
1671 DEBUG_TRIE_EXECUTE_r(
1672 PerlIO_printf( Perl_debug_log,
1673 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1674 failed ? "Fail transition to " : "",
1675 state, base, uvc, word)
1676 );
1677 if ( base ) {
1678 U32 tmp;
1679 if (charid &&
1680 (base + charid > trie->uniquecharcount )
1681 && (base + charid - 1 - trie->uniquecharcount
1682 < trie->lasttrans)
1683 && trie->trans[base + charid - 1 -
1684 trie->uniquecharcount].check == state
1685 && (tmp=trie->trans[base + charid - 1 -
1686 trie->uniquecharcount ].next))
1687 {
1688 state = tmp;
1689 break;
1690 }
1691 else {
1692 failed++;
1693 if ( state == 1 )
1694 break;
1695 else
1696 state = aho->fail[state];
1697 }
1698 }
1699 else {
1700 /* we must be accepting here */
1701 failed++;
1702 break;
1703 }
1704 } while(state);
1705 if (failed) {
1706 if (leftmost)
1707 break;
1708 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1709 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1710 uc++;
1711 }
1712 }
1713 }
1714 }
1715 if ( aho->states[ state ].wordnum ) {
1716 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1717 if (!leftmost || lpos < leftmost)
1718 leftmost = lpos;
1719 }
1720 DEBUG_TRIE_EXECUTE_r(
1721 PerlIO_printf( Perl_debug_log,
1722 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1723 "All done: ",
1724 state, base, uvc)
1725 );
1726 if (leftmost) {
1727 s = (char*)leftmost;
be8e71aa
YO
1728 if (!reginfo || regtry(reginfo, s)) {
1729 FREETMPS;
1730 LEAVE;
07be1b83 1731 goto got_it;
be8e71aa 1732 }
07be1b83
YO
1733 s = HOPc(s,1);
1734 } else {
1735 break;
1736 }
1737 }
be8e71aa
YO
1738 FREETMPS;
1739 LEAVE;
07be1b83
YO
1740 }
1741 break;
b3c9acc1 1742 default:
3c3eec57
GS
1743 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1744 break;
d6a28714 1745 }
6eb5f6b9
JH
1746 return 0;
1747 got_it:
1748 return s;
1749}
1750
1751/*
1752 - regexec_flags - match a regexp against a string
1753 */
1754I32
1755Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1756 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1757/* strend: pointer to null at end of string */
1758/* strbeg: real beginning of string */
1759/* minend: end of match must be >=minend after stringarg. */
1760/* data: May be used for some additional optimizations. */
1761/* nosave: For optimizations. */
1762{
97aff369 1763 dVAR;
6eb5f6b9
JH
1764 register char *s;
1765 register regnode *c;
1766 register char *startpos = stringarg;
6eb5f6b9
JH
1767 I32 minlen; /* must match at least this many chars */
1768 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1769 I32 end_shift = 0; /* Same for the end. */ /* CC */
1770 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1771 char *scream_olds = NULL;
3dab1dad 1772 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1773 const bool do_utf8 = DO_UTF8(sv);
2757e526 1774 I32 multiline;
2a782b5b 1775#ifdef DEBUGGING
2757e526
JH
1776 SV* dsv0;
1777 SV* dsv1;
2a782b5b 1778#endif
3b0527fe 1779 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1780
1781 GET_RE_DEBUG_FLAGS_DECL;
1782
9d4ba2ae 1783 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1784
1785 /* Be paranoid... */
1786 if (prog == NULL || startpos == NULL) {
1787 Perl_croak(aTHX_ "NULL regexp parameter");
1788 return 0;
1789 }
1790
2757e526 1791 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1792 reginfo.prog = prog;
2757e526
JH
1793
1794#ifdef DEBUGGING
1795 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1796 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1797#endif
1798
bac06658
JH
1799 RX_MATCH_UTF8_set(prog, do_utf8);
1800
6eb5f6b9 1801 minlen = prog->minlen;
61a36c01 1802 if (strend - startpos < minlen) {
a3621e74 1803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1804 "String too short [regexec_flags]...\n"));
1805 goto phooey;
1aa99e6b 1806 }
6eb5f6b9 1807
6eb5f6b9
JH
1808 /* Check validity of program. */
1809 if (UCHARAT(prog->program) != REG_MAGIC) {
1810 Perl_croak(aTHX_ "corrupted regexp program");
1811 }
1812
1813 PL_reg_flags = 0;
1814 PL_reg_eval_set = 0;
1815 PL_reg_maxiter = 0;
1816
1817 if (prog->reganch & ROPT_UTF8)
1818 PL_reg_flags |= RF_utf8;
1819
1820 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1821 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1822 PL_bostr = strbeg;
3b0527fe 1823 reginfo.sv = sv;
6eb5f6b9
JH
1824
1825 /* Mark end of line for $ (and such) */
1826 PL_regeol = strend;
1827
1828 /* see how far we have to get to not match where we matched before */
3b0527fe 1829 reginfo.till = startpos+minend;
6eb5f6b9 1830
6eb5f6b9
JH
1831 /* If there is a "must appear" string, look for it. */
1832 s = startpos;
1833
3b0527fe 1834 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1835 MAGIC *mg;
1836
1837 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1838 reginfo.ganch = startpos;
6eb5f6b9
JH
1839 else if (sv && SvTYPE(sv) >= SVt_PVMG
1840 && SvMAGIC(sv)
14befaf4
DM
1841 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1842 && mg->mg_len >= 0) {
3b0527fe 1843 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1844 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1845 if (s > reginfo.ganch)
6eb5f6b9 1846 goto phooey;
3b0527fe 1847 s = reginfo.ganch;
6eb5f6b9
JH
1848 }
1849 }
1850 else /* pos() not defined */
3b0527fe 1851 reginfo.ganch = strbeg;
6eb5f6b9
JH
1852 }
1853
a0714e2c 1854 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1855 re_scream_pos_data d;
1856
1857 d.scream_olds = &scream_olds;
1858 d.scream_pos = &scream_pos;
1859 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1860 if (!s) {
a3621e74 1861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1862 goto phooey; /* not present */
3fa9c3d7 1863 }
6eb5f6b9
JH
1864 }
1865
a3621e74 1866 DEBUG_EXECUTE_r({
1df70142
AL
1867 const char * const s0 = UTF
1868 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1869 UNI_DISPLAY_REGEX)
1870 : prog->precomp;
bb7a0f54 1871 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1872 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1873 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1874 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1875 if (!PL_colorset)
1876 reginitcolors();
1877 PerlIO_printf(Perl_debug_log,
a0288114 1878 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1879 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1880 len0, len0, s0,
2a782b5b 1881 PL_colors[1],
9e55ce06 1882 len0 > 60 ? "..." : "",
2a782b5b 1883 PL_colors[0],
9e55ce06
JH
1884 (int)(len1 > 60 ? 60 : len1),
1885 s1, PL_colors[1],
1886 (len1 > 60 ? "..." : "")
2a782b5b
JH
1887 );
1888 });
6eb5f6b9
JH
1889
1890 /* Simplest case: anchored match need be tried only once. */
1891 /* [unless only anchor is BOL and multiline is set] */
1892 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1893 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1894 goto got_it;
7fba1cd6 1895 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1896 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1897 {
1898 char *end;
1899
1900 if (minlen)
1901 dontbother = minlen - 1;
1aa99e6b 1902 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1903 /* for multiline we only have to try after newlines */
33b8afdf 1904 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1905 if (s == startpos)
1906 goto after_try;
1907 while (1) {
3b0527fe 1908 if (regtry(&reginfo, s))
6eb5f6b9
JH
1909 goto got_it;
1910 after_try:
1911 if (s >= end)
1912 goto phooey;
1913 if (prog->reganch & RE_USE_INTUIT) {
1914 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1915 if (!s)
1916 goto phooey;
1917 }
1918 else
1919 s++;
1920 }
1921 } else {
1922 if (s > startpos)
1923 s--;
1924 while (s < end) {
1925 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1926 if (regtry(&reginfo, s))
6eb5f6b9
JH
1927 goto got_it;
1928 }
1929 }
1930 }
1931 }
1932 goto phooey;
1933 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1934 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1935 goto got_it;
1936 goto phooey;
1937 }
1938
1939 /* Messy cases: unanchored match. */
33b8afdf 1940 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1941 /* we have /x+whatever/ */
1942 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1943 char ch;
bf93d4cc
GS
1944#ifdef DEBUGGING
1945 int did_match = 0;
1946#endif
33b8afdf
JH
1947 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1948 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1949 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1950
1aa99e6b 1951 if (do_utf8) {
6eb5f6b9
JH
1952 while (s < strend) {
1953 if (*s == ch) {
a3621e74 1954 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1955 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1956 s += UTF8SKIP(s);
1957 while (s < strend && *s == ch)
1958 s += UTF8SKIP(s);
1959 }
1960 s += UTF8SKIP(s);
1961 }
1962 }
1963 else {
1964 while (s < strend) {
1965 if (*s == ch) {
a3621e74 1966 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1967 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1968 s++;
1969 while (s < strend && *s == ch)
1970 s++;
1971 }
1972 s++;
1973 }
1974 }
a3621e74 1975 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1976 PerlIO_printf(Perl_debug_log,
b7953727
JH
1977 "Did not find anchored character...\n")
1978 );
6eb5f6b9 1979 }
a0714e2c
SS
1980 else if (prog->anchored_substr != NULL
1981 || prog->anchored_utf8 != NULL
1982 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1983 && prog->float_max_offset < strend - s)) {
1984 SV *must;
1985 I32 back_max;
1986 I32 back_min;
1987 char *last;
6eb5f6b9 1988 char *last1; /* Last position checked before */
bf93d4cc
GS
1989#ifdef DEBUGGING
1990 int did_match = 0;
1991#endif
33b8afdf
JH
1992 if (prog->anchored_substr || prog->anchored_utf8) {
1993 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1994 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1995 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1996 back_max = back_min = prog->anchored_offset;
1997 } else {
1998 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1999 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2000 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2001 back_max = prog->float_max_offset;
2002 back_min = prog->float_min_offset;
2003 }
2004 if (must == &PL_sv_undef)
2005 /* could not downgrade utf8 check substring, so must fail */
2006 goto phooey;
2007
2008 last = HOP3c(strend, /* Cannot start after this */
2009 -(I32)(CHR_SVLEN(must)
2010 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2011
2012 if (s > PL_bostr)
2013 last1 = HOPc(s, -1);
2014 else
2015 last1 = s - 1; /* bogus */
2016
a0288114 2017 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2018 check_substr==must. */
2019 scream_pos = -1;
2020 dontbother = end_shift;
2021 strend = HOPc(strend, -dontbother);
2022 while ( (s <= last) &&
9041c2e3 2023 ((flags & REXEC_SCREAM)
1aa99e6b 2024 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2025 end_shift, &scream_pos, 0))
1aa99e6b 2026 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2027 (unsigned char*)strend, must,
7fba1cd6 2028 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2029 /* we may be pointing at the wrong string */
2030 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2031 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2032 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2033 if (HOPc(s, -back_max) > last1) {
2034 last1 = HOPc(s, -back_min);
2035 s = HOPc(s, -back_max);
2036 }
2037 else {
52657f30 2038 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2039
2040 last1 = HOPc(s, -back_min);
52657f30 2041 s = t;
6eb5f6b9 2042 }
1aa99e6b 2043 if (do_utf8) {
6eb5f6b9 2044 while (s <= last1) {
3b0527fe 2045 if (regtry(&reginfo, s))
6eb5f6b9
JH
2046 goto got_it;
2047 s += UTF8SKIP(s);
2048 }
2049 }
2050 else {
2051 while (s <= last1) {
3b0527fe 2052 if (regtry(&reginfo, s))
6eb5f6b9
JH
2053 goto got_it;
2054 s++;
2055 }
2056 }
2057 }
a3621e74 2058 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2059 PerlIO_printf(Perl_debug_log,
a0288114 2060 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2061 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2062 ? "anchored" : "floating"),
2063 PL_colors[0],
2064 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2065 SvPVX_const(must),
b7953727
JH
2066 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2067 );
6eb5f6b9
JH
2068 goto phooey;
2069 }
155aba94 2070 else if ((c = prog->regstclass)) {
f14c76ed 2071 if (minlen) {
be8e71aa 2072 const OPCODE op = OP(prog->regstclass);
66e933ab 2073 /* don't bother with what can't match */
07be1b83 2074 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2075 strend = HOPc(strend, -(minlen - 1));
2076 }
a3621e74 2077 DEBUG_EXECUTE_r({
be8e71aa 2078 SV * const prop = sv_newmortal();
cfd0369c
NC
2079 const char *s0;
2080 const char *s1;
9e55ce06
JH
2081 int len0;
2082 int len1;
2083
32fc9b6a 2084 regprop(prog, prop, c);
9e55ce06 2085 s0 = UTF ?
3f7c398e 2086 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2087 UNI_DISPLAY_REGEX) :
cfd0369c 2088 SvPVX_const(prop);
9e55ce06
JH
2089 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2090 s1 = UTF ?
c728cb41 2091 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2092 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2093 PerlIO_printf(Perl_debug_log,
07be1b83 2094 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2095 len0, len0, s0,
07be1b83 2096 len1, len1, s1, (int)(strend - s));
ffc61ed2 2097 });
3b0527fe 2098 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2099 goto got_it;
07be1b83 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2101 }
2102 else {
2103 dontbother = 0;
a0714e2c 2104 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2105 /* Trim the end. */
d6a28714 2106 char *last;
33b8afdf
JH
2107 SV* float_real;
2108
2109 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2110 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2111 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2112
2113 if (flags & REXEC_SCREAM) {
33b8afdf 2114 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2115 end_shift, &scream_pos, 1); /* last one */
2116 if (!last)
ffc61ed2 2117 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2118 /* we may be pointing at the wrong string */
2119 else if (RX_MATCH_COPIED(prog))
3f7c398e 2120 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2121 }
d6a28714
JH
2122 else {
2123 STRLEN len;
cfd0369c 2124 const char * const little = SvPV_const(float_real, len);
d6a28714 2125
33b8afdf 2126 if (SvTAIL(float_real)) {
d6a28714
JH
2127 if (memEQ(strend - len + 1, little, len - 1))
2128 last = strend - len + 1;
7fba1cd6 2129 else if (!multiline)
9041c2e3 2130 last = memEQ(strend - len, little, len)
bd61b366 2131 ? strend - len : NULL;
b8c5462f 2132 else
d6a28714
JH
2133 goto find_last;
2134 } else {
2135 find_last:
9041c2e3 2136 if (len)
d6a28714 2137 last = rninstr(s, strend, little, little + len);
b8c5462f 2138 else
a0288114 2139 last = strend; /* matching "$" */
b8c5462f 2140 }
b8c5462f 2141 }
bf93d4cc 2142 if (last == NULL) {
a3621e74 2143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2144 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2145 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2146 goto phooey; /* Should not happen! */
2147 }
d6a28714
JH
2148 dontbother = strend - last + prog->float_min_offset;
2149 }
2150 if (minlen && (dontbother < minlen))
2151 dontbother = minlen - 1;
2152 strend -= dontbother; /* this one's always in bytes! */
2153 /* We don't know much -- general case. */
1aa99e6b 2154 if (do_utf8) {
d6a28714 2155 for (;;) {
3b0527fe 2156 if (regtry(&reginfo, s))
d6a28714
JH
2157 goto got_it;
2158 if (s >= strend)
2159 break;
b8c5462f 2160 s += UTF8SKIP(s);
d6a28714
JH
2161 };
2162 }
2163 else {
2164 do {
3b0527fe 2165 if (regtry(&reginfo, s))
d6a28714
JH
2166 goto got_it;
2167 } while (s++ < strend);
2168 }
2169 }
2170
2171 /* Failure. */
2172 goto phooey;
2173
2174got_it:
2175 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2176
2177 if (PL_reg_eval_set) {
2178 /* Preserve the current value of $^R */
2179 if (oreplsv != GvSV(PL_replgv))
2180 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2181 restored, the value remains
2182 the same. */
4f639d21 2183 restore_pos(aTHX_ prog);
d6a28714
JH
2184 }
2185
2186 /* make sure $`, $&, $', and $digit will work later */
2187 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2188 RX_MATCH_COPY_FREE(prog);
d6a28714 2189 if (flags & REXEC_COPY_STR) {
be8e71aa 2190 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2191#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2192 if ((SvIsCOW(sv)
2193 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2194 if (DEBUG_C_TEST) {
2195 PerlIO_printf(Perl_debug_log,
2196 "Copy on write: regexp capture, type %d\n",
2197 (int) SvTYPE(sv));
2198 }
2199 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2200 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2201 assert (SvPOKp(prog->saved_copy));
2202 } else
2203#endif
2204 {
2205 RX_MATCH_COPIED_on(prog);
2206 s = savepvn(strbeg, i);
2207 prog->subbeg = s;
2208 }
d6a28714 2209 prog->sublen = i;
d6a28714
JH
2210 }
2211 else {
2212 prog->subbeg = strbeg;
2213 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2214 }
2215 }
9041c2e3 2216
d6a28714
JH
2217 return 1;
2218
2219phooey:
a3621e74 2220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2221 PL_colors[4], PL_colors[5]));
d6a28714 2222 if (PL_reg_eval_set)
4f639d21 2223 restore_pos(aTHX_ prog);
d6a28714
JH
2224 return 0;
2225}
2226
2227/*
2228 - regtry - try match at specific point
2229 */
2230STATIC I32 /* 0 failure, 1 success */
3b0527fe 2231S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2232{
97aff369 2233 dVAR;
d6a28714
JH
2234 register I32 *sp;
2235 register I32 *ep;
2236 CHECKPOINT lastcp;
3b0527fe 2237 regexp *prog = reginfo->prog;
a3621e74 2238 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2239
02db2b7b
IZ
2240#ifdef DEBUGGING
2241 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2242#endif
d6a28714
JH
2243 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2244 MAGIC *mg;
2245
2246 PL_reg_eval_set = RS_init;
a3621e74 2247 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2248 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2249 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2250 ));
e8347627 2251 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2252 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2253 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2254 SAVETMPS;
2255 /* Apparently this is not needed, judging by wantarray. */
e8347627 2256 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2257 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2258
3b0527fe 2259 if (reginfo->sv) {
d6a28714 2260 /* Make $_ available to executed code. */
3b0527fe 2261 if (reginfo->sv != DEFSV) {
59f00321 2262 SAVE_DEFSV;
3b0527fe 2263 DEFSV = reginfo->sv;
b8c5462f 2264 }
d6a28714 2265
3b0527fe
DM
2266 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2267 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2268 /* prepare for quick setting of pos */
d300d9fa
NC
2269#ifdef PERL_OLD_COPY_ON_WRITE
2270 if (SvIsCOW(sv))
2271 sv_force_normal_flags(sv, 0);
2272#endif
3dab1dad 2273 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2274 &PL_vtbl_mglob, NULL, 0);
d6a28714 2275 mg->mg_len = -1;
b8c5462f 2276 }
d6a28714
JH
2277 PL_reg_magic = mg;
2278 PL_reg_oldpos = mg->mg_len;
4f639d21 2279 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2280 }
09687e5a 2281 if (!PL_reg_curpm) {
a02a5408 2282 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2283#ifdef USE_ITHREADS
2284 {
be8e71aa 2285 SV* const repointer = newSViv(0);
577e12cc 2286 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2287 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2288 av_push(PL_regex_padav,repointer);
2289 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2290 PL_regex_pad = AvARRAY(PL_regex_padav);
2291 }
2292#endif
2293 }
aaa362c4 2294 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2295 PL_reg_oldcurpm = PL_curpm;
2296 PL_curpm = PL_reg_curpm;
2297 if (RX_MATCH_COPIED(prog)) {
2298 /* Here is a serious problem: we cannot rewrite subbeg,
2299 since it may be needed if this match fails. Thus
2300 $` inside (?{}) could fail... */
2301 PL_reg_oldsaved = prog->subbeg;
2302 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2303#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2304 PL_nrs = prog->saved_copy;
2305#endif
d6a28714
JH
2306 RX_MATCH_COPIED_off(prog);
2307 }
2308 else
bd61b366 2309 PL_reg_oldsaved = NULL;
d6a28714
JH
2310 prog->subbeg = PL_bostr;
2311 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2312 }
973dddac 2313 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2314 PL_reginput = startpos;
2315 PL_regstartp = prog->startp;
2316 PL_regendp = prog->endp;
2317 PL_reglastparen = &prog->lastparen;
a01268b5 2318 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2319 prog->lastparen = 0;
03994de8 2320 prog->lastcloseparen = 0;
d6a28714 2321 PL_regsize = 0;
a3621e74 2322 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2323 if (PL_reg_start_tmpl <= prog->nparens) {
2324 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2325 if(PL_reg_start_tmp)
2326 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2327 else
a02a5408 2328 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2329 }
2330
2331 /* XXXX What this code is doing here?!!! There should be no need
2332 to do this again and again, PL_reglastparen should take care of
3dd2943c 2333 this! --ilya*/
dafc8851
JH
2334
2335 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2336 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2337 * PL_reglastparen), is not needed at all by the test suite
2338 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2339 * enough, for building DynaLoader, or otherwise this
2340 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2341 * will happen. Meanwhile, this code *is* needed for the
2342 * above-mentioned test suite tests to succeed. The common theme
2343 * on those tests seems to be returning null fields from matches.
2344 * --jhi */
dafc8851 2345#if 1
d6a28714
JH
2346 sp = prog->startp;
2347 ep = prog->endp;
2348 if (prog->nparens) {
097eb12c 2349 register I32 i;
eb160463 2350 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2351 *++sp = -1;
2352 *++ep = -1;
2353 }
2354 }
dafc8851 2355#endif
02db2b7b 2356 REGCP_SET(lastcp);
3b0527fe 2357 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2358 prog->endp[0] = PL_reginput - PL_bostr;
2359 return 1;
2360 }
02db2b7b 2361 REGCP_UNWIND(lastcp);
d6a28714
JH
2362 return 0;
2363}
2364
02db2b7b 2365
8ba1375e
MJD
2366#define sayYES goto yes
2367#define sayNO goto no
e0f9d4a8 2368#define sayNO_ANYOF goto no_anyof
8ba1375e 2369#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2370#define sayNO_FINAL goto no_final
2371#define sayNO_SILENT goto do_no
2372#define saySAME(x) if (x) goto yes; else goto no
2373
3ab3c9b4
HS
2374#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2375#define POSCACHE_SEEN 1 /* we know what we're caching */
2376#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2377
3ab3c9b4 2378#define CACHEsayYES STMT_START { \
d8319b27 2379 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2380 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2381 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2382 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2383 } \
2384 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2385 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2386 } \
2387 else { \
3ab3c9b4
HS
2388 /* cache records failure, but this is success */ \
2389 DEBUG_r( \
2390 PerlIO_printf(Perl_debug_log, \
2391 "%*s (remove success from failure cache)\n", \
2392 REPORT_CODE_OFF+PL_regindent*2, "") \
2393 ); \
d8319b27 2394 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2395 } \
2396 } \
2397 sayYES; \
2398} STMT_END
7409bbd3 2399
3ab3c9b4 2400#define CACHEsayNO STMT_START { \
d8319b27 2401 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2402 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2403 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2404 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2405 } \
2406 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2407 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2408 } \
2409 else { \
3ab3c9b4
HS
2410 /* cache records success, but this is failure */ \
2411 DEBUG_r( \
2412 PerlIO_printf(Perl_debug_log, \
2413 "%*s (remove failure from success cache)\n", \
2414 REPORT_CODE_OFF+PL_regindent*2, "") \
2415 ); \
d8319b27 2416 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2417 } \
2418 } \
2419 sayNO; \
2420} STMT_END
2421
a3621e74
YO
2422/* this is used to determine how far from the left messages like
2423 'failed...' are printed. Currently 29 makes these messages line
2424 up with the opcode they refer to. Earlier perls used 25 which
2425 left these messages outdented making reviewing a debug output
2426 quite difficult.
2427*/
2428#define REPORT_CODE_OFF 29
2429
2430
2431/* Make sure there is a test for this +1 options in re_tests */
2432#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2433
40a82448
DM
2434#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2435#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2436
86545054
DM
2437#define SLAB_FIRST(s) (&(s)->states[0])
2438#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2439
5d9a96ca
DM
2440/* grab a new slab and return the first slot in it */
2441
2442STATIC regmatch_state *
2443S_push_slab(pTHX)
2444{
54df2634
NC
2445#if PERL_VERSION < 9
2446 dMY_CXT;
2447#endif
5d9a96ca
DM
2448 regmatch_slab *s = PL_regmatch_slab->next;
2449 if (!s) {
2450 Newx(s, 1, regmatch_slab);
2451 s->prev = PL_regmatch_slab;
2452 s->next = NULL;
2453 PL_regmatch_slab->next = s;
2454 }
2455 PL_regmatch_slab = s;
86545054 2456 return SLAB_FIRST(s);
5d9a96ca 2457}
5b47454d 2458
95b24440
DM
2459/* simulate a recursive call to regmatch */
2460
2461#define REGMATCH(ns, where) \
5d9a96ca
DM
2462 st->scan = scan; \
2463 scan = (ns); \
2464 st->resume_state = resume_##where; \
95b24440
DM
2465 goto start_recurse; \
2466 resume_point_##where:
2467
40a82448
DM
2468/* push a new state then goto it */
2469
2470#define PUSH_STATE_GOTO(state, node) \
2471 scan = node; \
2472 st->resume_state = state; \
2473 goto push_state;
2474
2475/* push a new state with success backtracking, then goto it */
2476
2477#define PUSH_YES_STATE_GOTO(state, node) \
2478 scan = node; \
2479 st->resume_state = state; \
2480 goto push_yes_state;
2481
aa283a38 2482
aa283a38 2483
d6a28714
JH
2484/*
2485 - regmatch - main matching routine
2486 *
2487 * Conceptually the strategy is simple: check to see whether the current
2488 * node matches, call self recursively to see whether the rest matches,
2489 * and then act accordingly. In practice we make some effort to avoid
2490 * recursion, in particular by going through "ordinary" nodes (that don't
2491 * need to know whether the rest of the match failed) by a loop instead of
2492 * by recursion.
2493 */
2494/* [lwall] I've hoisted the register declarations to the outer block in order to
2495 * maybe save a little bit of pushing and popping on the stack. It also takes
2496 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2497 *
2498 * This function used to be heavily recursive, but since this had the
2499 * effect of blowing the CPU stack on complex regexes, it has been
2500 * restructured to be iterative, and to save state onto the heap rather
2501 * than the stack. Essentially whereever regmatch() used to be called, it
2502 * pushes the current state, notes where to return, then jumps back into
2503 * the main loop.
2504 *
2505 * Originally the structure of this function used to look something like
2506
2507 S_regmatch() {
2508 int a = 1, b = 2;
2509 ...
2510 while (scan != NULL) {
5d9a96ca 2511 a++; // do stuff with a and b
95b24440
DM
2512 ...
2513 switch (OP(scan)) {
2514 case FOO: {
2515 int local = 3;
2516 ...
2517 if (regmatch(...)) // recurse
2518 goto yes;
2519 }
2520 ...
2521 }
2522 }
2523 yes:
2524 return 1;
2525 }
2526
2527 * Now it looks something like this:
2528
5d9a96ca 2529 typedef struct {
95b24440
DM
2530 int a, b, local;
2531 int resume_state;
5d9a96ca 2532 } regmatch_state;
95b24440
DM
2533
2534 S_regmatch() {
5d9a96ca
DM
2535 regmatch_state *st = new();
2536 int depth=0;
2537 st->a++; // do stuff with a and b
95b24440
DM
2538 ...
2539 while (scan != NULL) {
2540 ...
2541 switch (OP(scan)) {
2542 case FOO: {
5d9a96ca 2543 st->local = 3;
95b24440 2544 ...
5d9a96ca
DM
2545 st->scan = scan;
2546 scan = ...;
2547 st->resume_state = resume_FOO;
2548 goto start_recurse; // recurse
95b24440 2549
5d9a96ca
DM
2550 resume_point_FOO:
2551 if (result)
95b24440
DM
2552 goto yes;
2553 }
2554 ...
2555 }
5d9a96ca
DM
2556 start_recurse:
2557 st = new(); push a new state
2558 st->a = 1; st->b = 2;
2559 depth++;
95b24440 2560 }
5d9a96ca 2561 yes:
95b24440 2562 result = 1;
5d9a96ca
DM
2563 if (depth--) {
2564 st = pop();
95b24440
DM
2565 switch (resume_state) {
2566 case resume_FOO:
2567 goto resume_point_FOO;
2568 ...
2569 }
2570 }
2571 return result
2572 }
2573
2574 * WARNING: this means that any line in this function that contains a
2575 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2576 * regmatch() using gotos instead. Thus the values of any local variables
2577 * not saved in the regmatch_state structure will have been lost when
2578 * execution resumes on the next line .
5d9a96ca
DM
2579 *
2580 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2581 * PL_regmatch_state always points to the currently active state, and
2582 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2583 * The first time regmatch is called, the first slab is allocated, and is
2584 * never freed until interpreter desctruction. When the slab is full,
2585 * a new one is allocated chained to the end. At exit from regmatch, slabs
2586 * allocated since entry are freed.
d6a28714 2587 */
95b24440 2588
40a82448 2589/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2590#define TRIE_next (REGNODE_MAX+1)
2591#define TRIE_next_fail (REGNODE_MAX+2)
2592#define EVAL_A (REGNODE_MAX+3)
2593#define EVAL_A_fail (REGNODE_MAX+4)
2594#define resume_CURLYX (REGNODE_MAX+5)
2595#define resume_WHILEM1 (REGNODE_MAX+6)
2596#define resume_WHILEM2 (REGNODE_MAX+7)
2597#define resume_WHILEM3 (REGNODE_MAX+8)
2598#define resume_WHILEM4 (REGNODE_MAX+9)
2599#define resume_WHILEM5 (REGNODE_MAX+10)
2600#define resume_WHILEM6 (REGNODE_MAX+11)
2601#define BRANCH_next (REGNODE_MAX+12)
2602#define BRANCH_next_fail (REGNODE_MAX+13)
2603#define CURLYM_A (REGNODE_MAX+14)
2604#define CURLYM_A_fail (REGNODE_MAX+15)
2605#define CURLYM_B (REGNODE_MAX+16)
2606#define CURLYM_B_fail (REGNODE_MAX+17)
2607#define IFMATCH_A (REGNODE_MAX+18)
2608#define IFMATCH_A_fail (REGNODE_MAX+19)
2609#define CURLY_B_min_known (REGNODE_MAX+20)
2610#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2611#define CURLY_B_min (REGNODE_MAX+22)
2612#define CURLY_B_min_fail (REGNODE_MAX+23)
2613#define CURLY_B_max (REGNODE_MAX+24)
2614#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2615
2616
3dab1dad 2617#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2618
07be1b83
YO
2619#ifdef DEBUGGING
2620STATIC void
2621S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2622{
2623 const int docolor = *PL_colors[0];
2624 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2625 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2626 /* The part of the string before starttry has one color
2627 (pref0_len chars), between starttry and current
2628 position another one (pref_len - pref0_len chars),
2629 after the current position the third one.
2630 We assume that pref0_len <= pref_len, otherwise we
2631 decrease pref0_len. */
2632 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2633 ? (5 + taill) - l : locinput - PL_bostr;
2634 int pref0_len;
2635
2636 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2637 pref_len++;
2638 pref0_len = pref_len - (locinput - PL_reg_starttry);
2639 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2640 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2641 ? (5 + taill) - pref_len : PL_regeol - locinput);
2642 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2643 l--;
2644 if (pref0_len < 0)
2645 pref0_len = 0;
2646 if (pref0_len > pref_len)
2647 pref0_len = pref_len;
2648 {
2649 const char * const s0 =
2650 do_utf8 && OP(scan) != CANY ?
2651 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2652 pref0_len, 60, UNI_DISPLAY_REGEX) :
2653 locinput - pref_len;
2654 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2655 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2656 pv_uni_display(PERL_DEBUG_PAD(1),
2657 (U8*)(locinput - pref_len + pref0_len),
2658 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2659 locinput - pref_len + pref0_len;
2660 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2661 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2662 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2663 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2664 locinput;
2665 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2666 PerlIO_printf(Perl_debug_log,
2667 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2668 (IV)(locinput - PL_bostr),
2669 PL_colors[4],
2670 len0, s0,
2671 PL_colors[5],
2672 PL_colors[2],
2673 len1, s1,
2674 PL_colors[3],
2675 (docolor ? "" : "> <"),
2676 PL_colors[0],
2677 len2, s2,
2678 PL_colors[1],
2679 15 - l - pref_len + 1,
2680 "");
2681 }
2682}
2683#endif
2684
d6a28714 2685STATIC I32 /* 0 failure, 1 success */
3b0527fe 2686S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2687{
54df2634
NC
2688#if PERL_VERSION < 9
2689 dMY_CXT;
2690#endif
27da23d5 2691 dVAR;
95b24440 2692 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2693 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2694
3b0527fe
DM
2695 regexp *rex = reginfo->prog;
2696
5d9a96ca
DM
2697 regmatch_slab *orig_slab;
2698 regmatch_state *orig_state;
a3621e74 2699
5d9a96ca
DM
2700 /* the current state. This is a cached copy of PL_regmatch_state */
2701 register regmatch_state *st;
95b24440 2702
5d9a96ca
DM
2703 /* cache heavy used fields of st in registers */
2704 register regnode *scan;
2705 register regnode *next;
2706 register I32 n = 0; /* initialize to shut up compiler warning */
2707 register char *locinput = PL_reginput;
95b24440 2708
5d9a96ca
DM
2709 /* these variables are NOT saved during a recusive RFEGMATCH: */
2710 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2711 bool result; /* return value of S_regmatch */
5d9a96ca 2712 int depth = 0; /* depth of recursion */
77cb431f
DM
2713 regmatch_state *yes_state = NULL; /* state to pop to on success of
2714 subpattern */
40a82448 2715 U32 state_num;
95b24440
DM
2716
2717#ifdef DEBUGGING
e68ec53f 2718 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2719 PL_regindent++;
2720#endif
2721
5d9a96ca
DM
2722 /* on first ever call to regmatch, allocate first slab */
2723 if (!PL_regmatch_slab) {
2724 Newx(PL_regmatch_slab, 1, regmatch_slab);
2725 PL_regmatch_slab->prev = NULL;
2726 PL_regmatch_slab->next = NULL;
86545054 2727 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2728 }
2729
2730 /* remember current high-water mark for exit */
2731 /* XXX this should be done with SAVE* instead */
2732 orig_slab = PL_regmatch_slab;
2733 orig_state = PL_regmatch_state;
2734
2735 /* grab next free state slot */
2736 st = ++PL_regmatch_state;
86545054 2737 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2738 st = PL_regmatch_state = S_push_slab(aTHX);
2739
2740 st->minmod = 0;
2741 st->sw = 0;
2742 st->logical = 0;
5d9a96ca 2743 st->cc = NULL;
d6a28714
JH
2744 /* Note that nextchr is a byte even in UTF */
2745 nextchr = UCHARAT(locinput);
2746 scan = prog;
2747 while (scan != NULL) {
8ba1375e 2748
a3621e74 2749 DEBUG_EXECUTE_r( {
6136c704 2750 SV * const prop = sv_newmortal();
07be1b83 2751 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2752 regprop(rex, prop, scan);
07be1b83
YO
2753
2754 PerlIO_printf(Perl_debug_log,
2755 "%3"IVdf":%*s%s(%"IVdf")\n",
2756 (IV)(scan - rex->program), PL_regindent*2, "",
2757 SvPVX_const(prop),
2758 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2759 });
d6a28714
JH
2760
2761 next = scan + NEXT_OFF(scan);
2762 if (next == scan)
2763 next = NULL;
40a82448 2764 state_num = OP(scan);
d6a28714 2765
40a82448
DM
2766 reenter_switch:
2767 switch (state_num) {
d6a28714 2768 case BOL:
7fba1cd6 2769 if (locinput == PL_bostr)
d6a28714 2770 {
3b0527fe 2771 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2772 break;
2773 }
d6a28714
JH
2774 sayNO;
2775 case MBOL:
12d33761
HS
2776 if (locinput == PL_bostr ||
2777 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2778 {
b8c5462f
JH
2779 break;
2780 }
d6a28714
JH
2781 sayNO;
2782 case SBOL:
c2a73568 2783 if (locinput == PL_bostr)
b8c5462f 2784 break;
d6a28714
JH
2785 sayNO;
2786 case GPOS:
3b0527fe 2787 if (locinput == reginfo->ganch)
d6a28714
JH
2788 break;
2789 sayNO;
2790 case EOL:
d6a28714
JH
2791 goto seol;
2792 case MEOL:
d6a28714 2793 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2794 sayNO;
b8c5462f 2795 break;
d6a28714
JH
2796 case SEOL:
2797 seol:
2798 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2799 sayNO;
d6a28714 2800 if (PL_regeol - locinput > 1)
b8c5462f 2801 sayNO;
b8c5462f 2802 break;
d6a28714
JH
2803 case EOS:
2804 if (PL_regeol != locinput)
b8c5462f 2805 sayNO;
d6a28714 2806 break;
ffc61ed2 2807 case SANY:
d6a28714 2808 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2809 sayNO;
f33976b4
DB
2810 if (do_utf8) {
2811 locinput += PL_utf8skip[nextchr];
2812 if (locinput > PL_regeol)
2813 sayNO;
2814 nextchr = UCHARAT(locinput);
2815 }
2816 else
2817 nextchr = UCHARAT(++locinput);
2818 break;
2819 case CANY:
2820 if (!nextchr && locinput >= PL_regeol)
2821 sayNO;
b8c5462f 2822 nextchr = UCHARAT(++locinput);
a0d0e21e 2823 break;
ffc61ed2 2824 case REG_ANY:
1aa99e6b
IH
2825 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2826 sayNO;
2827 if (do_utf8) {
b8c5462f 2828 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2829 if (locinput > PL_regeol)
2830 sayNO;
a0ed51b3 2831 nextchr = UCHARAT(locinput);
a0ed51b3 2832 }
1aa99e6b
IH
2833 else
2834 nextchr = UCHARAT(++locinput);
a0ed51b3 2835 break;
166ba7cd
DM
2836
2837#undef ST
2838#define ST st->u.trie
2839
5b47454d 2840 case TRIE:
3dab1dad 2841 {
07be1b83 2842 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2843 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2844 trie_type = do_utf8 ?
2845 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2846 : trie_plain;
2847
2848 /* what trie are we using right now */
be8e71aa 2849 reg_trie_data * const trie
3dab1dad
YO
2850 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2851 U32 state = trie->startstate;
166ba7cd
DM
2852
2853 U8 *uc = ( U8* )locinput;
2854 U16 charid = 0;
2855 U32 base = 0;
2856 UV uvc = 0;
2857 STRLEN len = 0;
2858 STRLEN foldlen = 0;
2859 U8 *uscan = (U8*)NULL;
2860 STRLEN bufflen=0;
2861 SV *sv_accept_buff = NULL;
2862 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2863
2864 ST.accepted = 0; /* how many accepting states we have seen */
2865 ST.B = next;
2866#ifdef DEBUGGING
2867 ST.me = scan;
2868#endif
3dab1dad
YO
2869
2870 if (trie->bitmap && trie_type != trie_utf8_fold &&
2871 !TRIE_BITMAP_TEST(trie,*locinput)
2872 ) {
2873 if (trie->states[ state ].wordnum) {
2874 DEBUG_EXECUTE_r(
2875 PerlIO_printf(Perl_debug_log,
2876 "%*s %smatched empty string...%s\n",
2877 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2878 );
2879 break;
2880 } else {
2881 DEBUG_EXECUTE_r(
2882 PerlIO_printf(Perl_debug_log,
2883 "%*s %sfailed to match start class...%s\n",
2884 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2885 );
2886 sayNO_SILENT;
2887 }
2888 }
166ba7cd 2889
07be1b83
YO
2890 /*
2891 traverse the TRIE keeping track of all accepting states
2892 we transition through until we get to a failing node.
2893 */
2894
a3621e74
YO
2895 while ( state && uc <= (U8*)PL_regeol ) {
2896
5b47454d 2897 if (trie->states[ state ].wordnum) {
166ba7cd 2898 if (!ST.accepted ) {
5b47454d
DM
2899 ENTER;
2900 SAVETMPS;
2901 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2902 sv_accept_buff=newSV(bufflen *
2903 sizeof(reg_trie_accepted) - 1);
2904 SvCUR_set(sv_accept_buff,
2905 sizeof(reg_trie_accepted));
2906 SvPOK_on(sv_accept_buff);
2907 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2908 SAVETMPS;
2909 ST.accept_buff =
5b47454d
DM
2910 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2911 }
2912 else {
166ba7cd 2913 if (ST.accepted >= bufflen) {
5b47454d 2914 bufflen *= 2;
166ba7cd 2915 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2916 SvGROW(sv_accept_buff,
2917 bufflen * sizeof(reg_trie_accepted));
2918 }
2919 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2920 + sizeof(reg_trie_accepted));
2921 }
166ba7cd
DM
2922 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2923 ST.accept_buff[ST.accepted].endpos = uc;
2924 ++ST.accepted;
5b47454d 2925 }
a3621e74
YO
2926
2927 base = trie->states[ state ].trans.base;
2928
07be1b83
YO
2929 DEBUG_TRIE_EXECUTE_r({
2930 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2931 PerlIO_printf( Perl_debug_log,
e4584336 2932 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2933 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2934 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2935 });
a3621e74
YO
2936
2937 if ( base ) {
5b47454d 2938 switch (trie_type) {
3dab1dad 2939 case trie_utf8_fold:
a3621e74
YO
2940 if ( foldlen>0 ) {
2941 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2942 foldlen -= len;
2943 uscan += len;
2944 len=0;
2945 } else {
2946 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2947 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2948 foldlen -= UNISKIP( uvc );
2949 uscan = foldbuf + UNISKIP( uvc );
2950 }
5b47454d
DM
2951 break;
2952 case trie_utf8:
2953 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2954 &len, uniflags );
2955 break;
2956 case trie_plain:
e4584336 2957 uvc = (UV)*uc;
a3621e74
YO
2958 len = 1;
2959 }
2960
5b47454d
DM
2961 if (uvc < 256) {
2962 charid = trie->charmap[ uvc ];
2963 }
2964 else {
2965 charid = 0;
2966 if (trie->widecharmap) {
3dab1dad 2967 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2968 (char*)&uvc, sizeof(UV), 0);
2969 if (svpp)
2970 charid = (U16)SvIV(*svpp);
2971 }
2972 }
a3621e74 2973
5b47454d
DM
2974 if (charid &&
2975 (base + charid > trie->uniquecharcount )
2976 && (base + charid - 1 - trie->uniquecharcount
2977 < trie->lasttrans)
2978 && trie->trans[base + charid - 1 -
2979 trie->uniquecharcount].check == state)
2980 {
2981 state = trie->trans[base + charid - 1 -
2982 trie->uniquecharcount ].next;
2983 }
2984 else {
2985 state = 0;
2986 }
2987 uc += len;
2988
2989 }
2990 else {
a3621e74
YO
2991 state = 0;
2992 }
2993 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2994 PerlIO_printf( Perl_debug_log,
2995 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2996 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2997 );
2998 }
166ba7cd 2999 if (!ST.accepted )
a3621e74 3000 sayNO;
a3621e74 3001
166ba7cd
DM
3002 DEBUG_EXECUTE_r(
3003 PerlIO_printf( Perl_debug_log,
3004 "%*s %sgot %"IVdf" possible matches%s\n",
3005 REPORT_CODE_OFF + PL_regindent * 2, "",
3006 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3007 );
3008 }
3009
3010 /* FALL THROUGH */
3011
3012 case TRIE_next_fail: /* we failed - try next alterative */
3013
3014 if ( ST.accepted == 1 ) {
3015 /* only one choice left - just continue */
3016 DEBUG_EXECUTE_r({
3017 reg_trie_data * const trie
3018 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3019 SV ** const tmp = RX_DEBUG(reginfo->prog)
3020 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
3021 : NULL;
3022 PerlIO_printf( Perl_debug_log,
3023 "%*s %sonly one match left: #%d <%s>%s\n",
3024 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3025 ST.accept_buff[ 0 ].wordnum,
3026 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3027 PL_colors[5] );
3028 });
3029 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3030 /* in this case we free tmps/leave before we call regmatch
3031 as we wont be using accept_buff again. */
3032 FREETMPS;
3033 LEAVE;
3034 locinput = PL_reginput;
3035 nextchr = UCHARAT(locinput);
3036 scan = ST.B;
3037 continue; /* execute rest of RE */
3038 }
3039
3040 if (!ST.accepted-- ) {
3041 FREETMPS;
3042 LEAVE;
3043 sayNO;
3044 }
3045
a3621e74 3046 /*
166ba7cd
DM
3047 There are at least two accepting states left. Presumably
3048 the number of accepting states is going to be low,
3049 typically two. So we simply scan through to find the one
3050 with lowest wordnum. Once we find it, we swap the last
3051 state into its place and decrement the size. We then try to
3052 match the rest of the pattern at the point where the word
3053 ends. If we succeed, control just continues along the
3054 regex; if we fail we return here to try the next accepting
3055 state
3056 */
a3621e74 3057
166ba7cd
DM
3058 {
3059 U32 best = 0;
3060 U32 cur;
3061 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3062 DEBUG_TRIE_EXECUTE_r(
f2278c82 3063 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
3064 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3065 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3066 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3067 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3068 );
3069
3070 if (ST.accept_buff[cur].wordnum <
3071 ST.accept_buff[best].wordnum)
3072 best = cur;
a3621e74 3073 }
166ba7cd
DM
3074
3075 DEBUG_EXECUTE_r({
3076 reg_trie_data * const trie
3077 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3078 SV ** const tmp = RX_DEBUG(reginfo->prog)
3079 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3080 : NULL;
3081 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3082 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3083 ST.accept_buff[best].wordnum,
3084 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3085 PL_colors[5] );
3086 });
3087
3088 if ( best<ST.accepted ) {
3089 reg_trie_accepted tmp = ST.accept_buff[ best ];
3090 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3091 ST.accept_buff[ ST.accepted ] = tmp;
3092 best = ST.accepted;
a3621e74 3093 }
166ba7cd
DM
3094 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3095 }
3096 PUSH_STATE_GOTO(TRIE_next, ST.B);
3097 /* NOTREACHED */
3098
3099#undef ST
3100
95b24440
DM
3101 case EXACT: {
3102 char *s = STRING(scan);
5d9a96ca 3103 st->ln = STR_LEN(scan);
eb160463 3104 if (do_utf8 != UTF) {
bc517b45 3105 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3106 char *l = locinput;
be8e71aa 3107 const char * const e = s + st->ln;
a72c7584 3108
5ff6fc6d
JH
3109 if (do_utf8) {
3110 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3111 while (s < e) {
a3b680e6 3112 STRLEN ulen;
1aa99e6b 3113 if (l >= PL_regeol)
5ff6fc6d
JH
3114 sayNO;
3115 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3116 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3117 uniflags))
5ff6fc6d 3118 sayNO;
bc517b45 3119 l += ulen;
5ff6fc6d 3120 s ++;
1aa99e6b 3121 }
5ff6fc6d
JH
3122 }
3123 else {
3124 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3125 while (s < e) {
a3b680e6 3126 STRLEN ulen;
1aa99e6b
IH
3127 if (l >= PL_regeol)
3128 sayNO;
5ff6fc6d 3129 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3130 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3131 uniflags))
1aa99e6b 3132 sayNO;
bc517b45 3133 s += ulen;
a72c7584 3134 l ++;
1aa99e6b 3135 }
5ff6fc6d 3136 }
1aa99e6b
IH
3137 locinput = l;
3138 nextchr = UCHARAT(locinput);
3139 break;
3140 }
bc517b45 3141 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3142 /* Inline the first character, for speed. */
3143 if (UCHARAT(s) != nextchr)
3144 sayNO;
5d9a96ca 3145 if (PL_regeol - locinput < st->ln)
d6a28714 3146 sayNO;
5d9a96ca 3147 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3148 sayNO;
5d9a96ca 3149 locinput += st->ln;
d6a28714
JH
3150 nextchr = UCHARAT(locinput);
3151 break;
95b24440 3152 }
d6a28714 3153 case EXACTFL:
b8c5462f
JH
3154 PL_reg_flags |= RF_tainted;
3155 /* FALL THROUGH */
95b24440 3156 case EXACTF: {
be8e71aa 3157 char * const s = STRING(scan);
5d9a96ca 3158 st->ln = STR_LEN(scan);
d6a28714 3159
d07ddd77
JH
3160 if (do_utf8 || UTF) {
3161 /* Either target or the pattern are utf8. */
be8e71aa 3162 const char * const l = locinput;
d07ddd77 3163 char *e = PL_regeol;
bc517b45 3164
5d9a96ca 3165 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3166 l, &e, 0, do_utf8)) {
5486206c
JH
3167 /* One more case for the sharp s:
3168 * pack("U0U*", 0xDF) =~ /ss/i,
3169 * the 0xC3 0x9F are the UTF-8
3170 * byte sequence for the U+00DF. */
3171 if (!(do_utf8 &&
3172 toLOWER(s[0]) == 's' &&
5d9a96ca 3173 st->ln >= 2 &&
5486206c
JH
3174 toLOWER(s[1]) == 's' &&
3175 (U8)l[0] == 0xC3 &&
3176 e - l >= 2 &&
3177 (U8)l[1] == 0x9F))
3178 sayNO;
3179 }
d07ddd77
JH
3180 locinput = e;
3181 nextchr = UCHARAT(locinput);
3182 break;
a0ed51b3 3183 }
d6a28714 3184
bc517b45
JH
3185 /* Neither the target and the pattern are utf8. */
3186
d6a28714
JH
3187 /* Inline the first character, for speed. */
3188 if (UCHARAT(s) != nextchr &&
3189 UCHARAT(s) != ((OP(scan) == EXACTF)
3190 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3191 sayNO;
5d9a96ca 3192 if (PL_regeol - locinput < st->ln)
b8c5462f 3193 sayNO;
5d9a96ca
DM
3194 if (st->ln > 1 && (OP(scan) == EXACTF
3195 ? ibcmp(s, locinput, st->ln)
3196 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3197 sayNO;
5d9a96ca 3198 locinput += st->ln;
d6a28714 3199 nextchr = UCHARAT(locinput);
a0d0e21e 3200 break;
95b24440 3201 }
d6a28714 3202 case ANYOF:
ffc61ed2 3203 if (do_utf8) {
9e55ce06
JH
3204 STRLEN inclasslen = PL_regeol - locinput;
3205
32fc9b6a 3206 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3207 sayNO_ANYOF;
ffc61ed2
JH
3208 if (locinput >= PL_regeol)
3209 sayNO;
0f0076b4 3210 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3211 nextchr = UCHARAT(locinput);
e0f9d4a8 3212 break;
ffc61ed2
JH
3213 }
3214 else {
3215 if (nextchr < 0)
3216 nextchr = UCHARAT(locinput);
32fc9b6a 3217 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3218 sayNO_ANYOF;
ffc61ed2
JH
3219 if (!nextchr && locinput >= PL_regeol)
3220 sayNO;
3221 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3222 break;
3223 }
3224 no_anyof:
3225 /* If we might have the case of the German sharp s
3226 * in a casefolding Unicode character class. */
3227
ebc501f0
JH
3228 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3229 locinput += SHARP_S_SKIP;
e0f9d4a8 3230 nextchr = UCHARAT(locinput);
ffc61ed2 3231 }
e0f9d4a8
JH
3232 else
3233 sayNO;
b8c5462f 3234 break;
d6a28714 3235 case ALNUML:
b8c5462f
JH
3236 PL_reg_flags |= RF_tainted;
3237 /* FALL THROUGH */
d6a28714 3238 case ALNUM:
b8c5462f 3239 if (!nextchr)
4633a7c4 3240 sayNO;
ffc61ed2 3241 if (do_utf8) {
1a4fad37 3242 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3243 if (!(OP(scan) == ALNUM
bb7a0f54 3244 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3245 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3246 {
3247 sayNO;
a0ed51b3 3248 }
b8c5462f 3249 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3250 nextchr = UCHARAT(locinput);
3251 break;
3252 }
ffc61ed2 3253 if (!(OP(scan) == ALNUM
d6a28714 3254 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3255 sayNO;
b8c5462f 3256 nextchr = UCHARAT(++locinput);
a0d0e21e 3257 break;
d6a28714 3258 case NALNUML:
b8c5462f
JH
3259 PL_reg_flags |= RF_tainted;
3260 /* FALL THROUGH */
d6a28714
JH
3261 case NALNUM:
3262 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3263 sayNO;
ffc61ed2 3264 if (do_utf8) {
1a4fad37 3265 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3266 if (OP(scan) == NALNUM
bb7a0f54 3267 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3268 : isALNUM_LC_utf8((U8*)locinput))
3269 {
b8c5462f 3270 sayNO;
d6a28714 3271 }
b8c5462f
JH
3272 locinput += PL_utf8skip[nextchr];
3273 nextchr = UCHARAT(locinput);
3274 break;
3275 }
ffc61ed2 3276 if (OP(scan) == NALNUM
d6a28714 3277 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3278 sayNO;
76e3520e 3279 nextchr = UCHARAT(++locinput);
a0d0e21e 3280 break;
d6a28714
JH
3281 case BOUNDL:
3282 case NBOUNDL:
3280af22 3283 PL_reg_flags |= RF_tainted;
bbce6d69 3284 /* FALL THROUGH */
d6a28714
JH
3285 case BOUND:
3286 case NBOUND:
3287 /* was last char in word? */
ffc61ed2 3288 if (do_utf8) {
12d33761 3289 if (locinput == PL_bostr)
5d9a96ca 3290 st->ln = '\n';
ffc61ed2 3291 else {
a3b680e6 3292 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3293
4ad0818d 3294 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3295 }
3296 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3297 st->ln = isALNUM_uni(st->ln);
1a4fad37 3298 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3299 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3300 }
3301 else {
5d9a96ca 3302 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3303 n = isALNUM_LC_utf8((U8*)locinput);
3304 }
a0ed51b3 3305 }
d6a28714 3306 else {
5d9a96ca 3307 st->ln = (locinput != PL_bostr) ?
12d33761 3308 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3309 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3310 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3311 n = isALNUM(nextchr);
3312 }
3313 else {
5d9a96ca 3314 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3315 n = isALNUM_LC(nextchr);
3316 }
d6a28714 3317 }
5d9a96ca 3318 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3319 OP(scan) == BOUNDL))
3320 sayNO;
a0ed51b3 3321 break;
d6a28714 3322 case SPACEL:
3280af22 3323 PL_reg_flags |= RF_tainted;
bbce6d69 3324 /* FALL THROUGH */
d6a28714 3325 case SPACE:
9442cb0e 3326 if (!nextchr)
4633a7c4 3327 sayNO;
1aa99e6b 3328 if (do_utf8) {
fd400ab9 3329 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3330 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3331 if (!(OP(scan) == SPACE
bb7a0f54 3332 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3333 : isSPACE_LC_utf8((U8*)locinput)))
3334 {
3335 sayNO;
3336 }
3337 locinput += PL_utf8skip[nextchr];
3338 nextchr = UCHARAT(locinput);
3339 break;
d6a28714 3340 }
ffc61ed2
JH
3341 if (!(OP(scan) == SPACE
3342 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3343 sayNO;
3344 nextchr = UCHARAT(++locinput);
3345 }
3346 else {
3347 if (!(OP(scan) == SPACE
3348 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3349 sayNO;
3350 nextchr = UCHARAT(++locinput);
a0ed51b3 3351 }
a0ed51b3 3352 break;
d6a28714 3353 case NSPACEL:
3280af22 3354 PL_reg_flags |= RF_tainted;
bbce6d69 3355 /* FALL THROUGH */
d6a28714 3356 case NSPACE:
9442cb0e 3357 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3358 sayNO;
1aa99e6b 3359 if (do_utf8) {
1a4fad37 3360 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3361 if (OP(scan) == NSPACE
bb7a0f54 3362 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3363 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3364 {
3365 sayNO;
3366 }
3367 locinput += PL_utf8skip[nextchr];
3368 nextchr = UCHARAT(locinput);
3369 break;
a0ed51b3 3370 }
ffc61ed2 3371 if (OP(scan) == NSPACE
d6a28714 3372 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3373 sayNO;
76e3520e 3374 nextchr = UCHARAT(++locinput);
a0d0e21e 3375 break;
d6a28714 3376 case DIGITL:
a0ed51b3
LW
3377 PL_reg_flags |= RF_tainted;
3378 /* FALL THROUGH */
d6a28714 3379 case DIGIT:
9442cb0e 3380 if (!nextchr)
a0ed51b3 3381 sayNO;
1aa99e6b 3382 if (do_utf8) {
1a4fad37 3383 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3384 if (!(OP(scan) == DIGIT
bb7a0f54 3385 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3386 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3387 {
a0ed51b3 3388 sayNO;
dfe13c55 3389 }
6f06b55f 3390 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3391 nextchr = UCHARAT(locinput);
3392 break;
3393 }
ffc61ed2 3394 if (!(OP(scan) == DIGIT
9442cb0e 3395 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3396 sayNO;
3397 nextchr = UCHARAT(++locinput);
3398 break;
d6a28714 3399 case NDIGITL:
b8c5462f
JH
3400 PL_reg_flags |= RF_tainted;
3401 /* FALL THROUGH */
d6a28714 3402 case NDIGIT:
9442cb0e 3403 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3404 sayNO;
1aa99e6b 3405 if (do_utf8) {
1a4fad37 3406 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3407 if (OP(scan) == NDIGIT
bb7a0f54 3408 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3409 : isDIGIT_LC_utf8((U8*)locinput))
3410 {
a0ed51b3 3411 sayNO;
9442cb0e 3412 }
6f06b55f 3413 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3414 nextchr = UCHARAT(locinput);
3415 break;
3416 }
ffc61ed2 3417 if (OP(scan) == NDIGIT
9442cb0e 3418 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3419 sayNO;
3420 nextchr = UCHARAT(++locinput);
3421 break;
3422 case CLUMP:
b7c83a7e 3423 if (locinput >= PL_regeol)
a0ed51b3 3424 sayNO;
b7c83a7e 3425 if (do_utf8) {
1a4fad37 3426 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3427 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3428 sayNO;
3429 locinput += PL_utf8skip[nextchr];
3430 while (locinput < PL_regeol &&
3431 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3432 locinput += UTF8SKIP(locinput);
3433 if (locinput > PL_regeol)
3434 sayNO;
eb08e2da
JH
3435 }
3436 else
3437 locinput++;
a0ed51b3
LW
3438 nextchr = UCHARAT(locinput);
3439 break;
c8756f30 3440 case REFFL:
3280af22 3441 PL_reg_flags |= RF_tainted;
c8756f30 3442 /* FALL THROUGH */
c277df42 3443 case REF:
95b24440
DM
3444 case REFF: {
3445 char *s;
c277df42 3446 n = ARG(scan); /* which paren pair */
5d9a96ca 3447 st->ln = PL_regstartp[n];
2c2d71f5 3448 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3449 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3450 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3451 if (st->ln == PL_regendp[n])
a0d0e21e 3452 break;
a0ed51b3 3453
5d9a96ca 3454 s = PL_bostr + st->ln;
1aa99e6b 3455 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3456 char *l = locinput;
a3b680e6 3457 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3458 /*
3459 * Note that we can't do the "other character" lookup trick as
3460 * in the 8-bit case (no pun intended) because in Unicode we
3461 * have to map both upper and title case to lower case.
3462 */
3463 if (OP(scan) == REFF) {
3464 while (s < e) {
a3b680e6
AL
3465 STRLEN ulen1, ulen2;
3466 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3467 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3468
a0ed51b3
LW
3469 if (l >= PL_regeol)
3470 sayNO;
a2a2844f
JH
3471 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3472 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3473 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3474 sayNO;
a2a2844f
JH
3475 s += ulen1;
3476 l += ulen2;
a0ed51b3
LW
3477 }
3478 }
3479 locinput = l;
3480 nextchr = UCHARAT(locinput);
3481 break;
3482 }
3483
a0d0e21e 3484 /* Inline the first character, for speed. */
76e3520e 3485 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3486 (OP(scan) == REF ||
3487 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3488 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3489 sayNO;
5d9a96ca
DM
3490 st->ln = PL_regendp[n] - st->ln;
3491 if (locinput + st->ln > PL_regeol)
4633a7c4 3492 sayNO;
5d9a96ca
DM
3493 if (st->ln > 1 && (OP(scan) == REF
3494 ? memNE(s, locinput, st->ln)
c8756f30 3495 : (OP(scan) == REFF
5d9a96ca
DM
3496 ? ibcmp(s, locinput, st->ln)
3497 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3498 sayNO;
5d9a96ca 3499 locinput += st->ln;
76e3520e 3500 nextchr = UCHARAT(locinput);
a0d0e21e 3501 break;
95b24440 3502 }
a0d0e21e
LW
3503
3504 case NOTHING:
c277df42 3505 case TAIL:
a0d0e21e
LW
3506 break;
3507 case BACK:
3508 break;
40a82448
DM
3509
3510#undef ST
3511#define ST st->u.eval
3512
3513 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3514 {
c277df42 3515 SV *ret;
8e5e9ebe 3516 {
4aabdb9b
DM
3517 /* execute the code in the {...} */
3518 dSP;
6136c704 3519 SV ** const before = SP;
4aabdb9b
DM
3520 OP_4tree * const oop = PL_op;
3521 COP * const ocurcop = PL_curcop;
3522 PAD *old_comppad;
4aabdb9b
DM
3523
3524 n = ARG(scan);
32fc9b6a 3525 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3526 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3527 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3528 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3529
8e5e9ebe
RGS
3530 CALLRUNOPS(aTHX); /* Scalar context. */
3531 SPAGAIN;
3532 if (SP == before)
075aa684 3533 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3534 else {
3535 ret = POPs;
3536 PUTBACK;
3537 }
4aabdb9b
DM
3538
3539 PL_op = oop;
3540 PAD_RESTORE_LOCAL(old_comppad);
3541 PL_curcop = ocurcop;
3542 if (!st->logical) {
3543 /* /(?{...})/ */
3544 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3545 break;
3546 }
8e5e9ebe 3547 }
4aabdb9b
DM
3548 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3549 regexp *re;
4aabdb9b 3550 {
4f639d21
DM
3551 /* extract RE object from returned value; compiling if
3552 * necessary */
3553
6136c704 3554 MAGIC *mg = NULL;
be8e71aa 3555 const SV *sv;
faf82a0b
AE
3556 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3557 mg = mg_find(sv, PERL_MAGIC_qr);
3558 else if (SvSMAGICAL(ret)) {
3559 if (SvGMAGICAL(ret))
3560 sv_unmagic(ret, PERL_MAGIC_qr);
3561 else
3562 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3563 }
faf82a0b 3564
0f5d15d6
IZ
3565 if (mg) {
3566 re = (regexp *)mg->mg_obj;
df0003d4 3567 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3568 }
3569 else {
3570 STRLEN len;
6136c704 3571 const char * const t = SvPV_const(ret, len);
0f5d15d6 3572 PMOP pm;
a3b680e6 3573 const I32 osize = PL_regsize;
0f5d15d6 3574
5fcd1c1b 3575 Zero(&pm, 1, PMOP);
4aabdb9b 3576 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3577 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3578 if (!(SvFLAGS(ret)
faf82a0b
AE
3579 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3580 | SVs_GMG)))
14befaf4
DM
3581 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3582 PERL_MAGIC_qr,0,0);
0f5d15d6 3583 PL_regsize = osize;
0f5d15d6 3584 }
4aabdb9b 3585 }
aa283a38
DM
3586
3587 /* run the pattern returned from (??{...}) */
3588
4aabdb9b
DM
3589 DEBUG_EXECUTE_r(
3590 PerlIO_printf(Perl_debug_log,
3591 "Entering embedded \"%s%.60s%s%s\"\n",
3592 PL_colors[0],
3593 re->precomp,
3594 PL_colors[1],
3595 (strlen(re->precomp) > 60 ? "..." : ""))
3596 );
2c2d71f5 3597
40a82448
DM
3598 ST.cp = regcppush(0); /* Save *all* the positions. */
3599 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3600 *PL_reglastparen = 0;
3601 *PL_reglastcloseparen = 0;
4aabdb9b 3602 PL_reginput = locinput;
4aabdb9b
DM
3603
3604 /* XXXX This is too dramatic a measure... */
3605 PL_reg_maxiter = 0;
3606
5d9a96ca 3607 st->logical = 0;
40a82448 3608 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3609 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3610 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3611 ST.prev_rex = rex;
aa283a38 3612 rex = re;
aa283a38 3613
40a82448 3614 ST.B = next;
aa283a38 3615 /* now continue from first node in postoned RE */
40a82448 3616 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3617 /* NOTREACHED */
a0ed51b3 3618 }
4aabdb9b
DM
3619 /* /(?(?{...})X|Y)/ */
3620 st->sw = SvTRUE(ret);
3621 st->logical = 0;
c277df42
IZ
3622 break;
3623 }
40a82448
DM
3624
3625 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3626 if (ST.toggleutf)
3627 PL_reg_flags ^= RF_utf8;
3628 ReREFCNT_dec(rex);
3629 rex = ST.prev_rex;
3630 /* XXXX This is too dramatic a measure... */
3631 PL_reg_maxiter = 0;
3632 /* Restore parens of the caller without popping the
3633 * savestack */
3634 {
3635 const I32 tmp = PL_savestack_ix;
3636 PL_savestack_ix = ST.lastcp;
3637 regcppop(rex);
3638 PL_savestack_ix = tmp;
3639 }
3640 PL_reginput = locinput;
3641 /* continue at the node following the (??{...}) */
3642 scan = ST.B;
3643 continue;
3644
3645 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3646 /* Restore state to the outer re then re-throw the failure */
3647 if (ST.toggleutf)
3648 PL_reg_flags ^= RF_utf8;
3649 ReREFCNT_dec(rex);
3650 rex = ST.prev_rex;
3651
3652 /* XXXX This is too dramatic a measure... */
3653 PL_reg_maxiter = 0;
3654
3655 PL_reginput = locinput;
3656 REGCP_UNWIND(ST.lastcp);
3657 regcppop(rex);
3658 sayNO_SILENT;
3659
3660#undef ST
3661
a0d0e21e 3662 case OPEN:
c277df42 3663 n = ARG(scan); /* which paren pair */
3280af22
NIS
3664 PL_reg_start_tmp[n] = locinput;
3665 if (n > PL_regsize)
3666 PL_regsize = n;
a0d0e21e
LW
3667 break;
3668 case CLOSE:
c277df42 3669 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3670 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3671 PL_regendp[n] = locinput - PL_bostr;
eb160463 3672 if (n > (I32)*PL_reglastparen)
3280af22 3673 *PL_reglastparen = n;
a01268b5 3674 *PL_reglastcloseparen = n;
a0d0e21e 3675 break;
c277df42
IZ
3676 case GROUPP:
3677 n = ARG(scan); /* which paren pair */
5d9a96ca 3678 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3679 break;
3680 case IFTHEN:
2c2d71f5 3681 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3682 if (st->sw)
c277df42
IZ
3683 next = NEXTOPER(NEXTOPER(scan));
3684 else {
3685 next = scan + ARG(scan);
3686 if (OP(next) == IFTHEN) /* Fake one. */
3687 next = NEXTOPER(NEXTOPER(next));
3688 }
3689 break;
3690 case LOGICAL:
5d9a96ca 3691 st->logical = scan->flags;
c277df42 3692 break;
2ab05381 3693/*******************************************************************
a0374537
DM
3694 cc points to the regmatch_state associated with the most recent CURLYX.
3695 This struct contains info about the innermost (...)* loop (an
3696 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3697
3698 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3699
95b24440 3700 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3701
a0374537 3702 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3703 with the starting point at WHILEM node;
2ab05381
IZ
3704
3705 3) Each hit of WHILEM node tries to match A and Z (in the order
3706 depending on the current iteration, min/max of {min,max} and
3707 greediness). The information about where are nodes for "A"
a0374537 3708 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3709 was already matched, and greediness.
3710
3711 4) After A matches, the same WHILEM node is hit again.
3712
95b24440 3713 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3714 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3715 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3716 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3717 of the external loop.
3718
a0374537 3719 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3720 and whatever it mentions via ->next, and additional attached trees
3721 corresponding to temporarily unset infoblocks as in "5" above.
3722
95b24440 3723 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3724 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3725 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3726 infoblocks are drawn below the "reset" infoblock.
3727
3728 In fact in the picture below we do not show failed matches for Z and T
3729 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3730 more obvious *why* one needs to *temporary* unset infoblocks.]
3731
3732 Matched REx position InfoBlocks Comment
3733 (Y(A)*?Z)*?T x
3734 Y(A)*?Z)*?T x <- O
3735 Y (A)*?Z)*?T x <- O
3736 Y A)*?Z)*?T x <- O <- I
3737 YA )*?Z)*?T x <- O <- I
3738 YA A)*?Z)*?T x <- O <- I
3739 YAA )*?Z)*?T x <- O <- I
3740 YAA Z)*?T x <- O # Temporary unset I
3741 I
3742
3743 YAAZ Y(A)*?Z)*?T x <- O
3744 I
3745
3746 YAAZY (A)*?Z)*?T x <- O
3747 I
3748
3749 YAAZY A)*?Z)*?T x <- O <- I
3750 I
3751
3752 YAAZYA )*?Z)*?T x <- O <- I
3753 I
3754
3755 YAAZYA Z)*?T x <- O # Temporary unset I
3756 I,I
3757
3758 YAAZYAZ )*?T x <- O
3759 I,I
3760
3761 YAAZYAZ T x # Temporary unset O
3762 O
3763 I,I
3764
3765 YAAZYAZT x
3766 O
3767 I,I
3768 *******************************************************************/
95b24440 3769
a0d0e21e 3770 case CURLYX: {
cb434fcc
IZ
3771 /* No need to save/restore up to this paren */
3772 I32 parenfloor = scan->flags;
c277df42 3773
c2b7afd3
NC
3774 /* Dave says:
3775
3776 CURLYX and WHILEM are always paired: they're the moral
3777 equivalent of pp_enteriter anbd pp_iter.
3778
3779 The only time next could be null is if the node tree is
3780 corrupt. This was mentioned on p5p a few days ago.
3781
3782 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3783 So we'll assert that this is true:
3784 */
3785 assert(next);
30b2893d 3786 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3787 next += ARG(next);
cb434fcc
IZ
3788 /* XXXX Probably it is better to teach regpush to support
3789 parenfloor > PL_regsize... */
eb160463 3790 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3791 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3792
d8319b27
DM
3793 st->u.curlyx.cp = PL_savestack_ix;
3794 st->u.curlyx.outercc = st->cc;
a0374537
DM
3795 st->cc = st;
3796 /* these fields contain the state of the current curly.
3797 * they are accessed by subsequent WHILEMs;
3798 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3799 st->u.curlyx.parenfloor = parenfloor;
3800 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3801 st->u.curlyx.min = ARG1(scan);
3802 st->u.curlyx.max = ARG2(scan);
3803 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3804 st->u.curlyx.lastloc = 0;
a0374537
DM
3805 /* st->next and st->minmod are also read by WHILEM */
3806
3280af22 3807 PL_reginput = locinput;
95b24440
DM
3808 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3809 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3810 regcpblow(st->u.curlyx.cp);
3811 st->cc = st->u.curlyx.outercc;
95b24440 3812 saySAME(result);
a0d0e21e 3813 }
5f66b61c 3814 /* NOTREACHED */
a0d0e21e
LW
3815 case WHILEM: {
3816 /*
3817 * This is really hard to understand, because after we match
3818 * what we're trying to match, we must make sure the rest of
2c2d71f5 3819 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3820 * to go back UP the parse tree by recursing ever deeper. And
3821 * if it fails, we have to reset our parent's current state
3822 * that we can try again after backing off.
3823 */
3824
c2b7afd3
NC
3825 /* Dave says:
3826
3827 st->cc gets initialised by CURLYX ready for use by WHILEM.
3828 So again, unless somethings been corrupted, st->cc cannot
3829 be null at that point in WHILEM.
3830
3831 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3832 So we'll assert that this is true:
3833 */
3834 assert(st->cc);
d8319b27
DM
3835 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3836 st->u.whilem.cache_offset = 0;
3837 st->u.whilem.cache_bit = 0;
c277df42 3838
d8319b27 3839 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3840 PL_reginput = locinput;
a0d0e21e 3841
a3621e74 3842 DEBUG_EXECUTE_r(
9041c2e3 3843 PerlIO_printf(Perl_debug_log,
91f3b821 3844 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3845 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3846 (long)n, (long)st->cc->u.curlyx.min,
3847 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3848 );
4633a7c4 3849
a0d0e21e
LW
3850 /* If degenerate scan matches "", assume scan done. */
3851
d8319b27
DM
3852 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3853 st->u.whilem.savecc = st->cc;
3854 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3855 if (st->cc)
d8319b27 3856 st->ln = st->cc->u.curlyx.cur;
a3621e74 3857 DEBUG_EXECUTE_r(
c3464db5
DD
3858 PerlIO_printf(Perl_debug_log,
3859 "%*s empty match detected, try continuation...\n",
3280af22 3860 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3861 );
d8319b27 3862 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3863 /*** all unsaved local vars undefined at this point */
d8319b27 3864 st->cc = st->u.whilem.savecc;
95b24440 3865 if (result)
4633a7c4 3866 sayYES;
d8319b27
DM
3867 if (st->cc->u.curlyx.outercc)
3868 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3869 sayNO;
a0d0e21e
LW
3870 }
3871
3872 /* First just match a string of min scans. */
3873
d8319b27
DM
3874 if (n < st->cc->u.curlyx.min) {
3875 st->cc->u.curlyx.cur = n;
3876 st->cc->u.curlyx.lastloc = locinput;
3877 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3878 /*** all unsaved local vars undefined at this point */
3879 if (result)
4633a7c4 3880 sayYES;
d8319b27
DM
3881 st->cc->u.curlyx.cur = n - 1;
3882 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3883 sayNO;
a0d0e21e
LW
3884 }
3885
2c2d71f5
JH
3886 if (scan->flags) {
3887 /* Check whether we already were at this position.
3888 Postpone detection until we know the match is not
3889 *that* much linear. */
3890 if (!PL_reg_maxiter) {
3891 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3892 /* possible overflow for long strings and many CURLYX's */
3893 if (PL_reg_maxiter < 0)
3894 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3895 PL_reg_leftiter = PL_reg_maxiter;
3896 }
3897 if (PL_reg_leftiter-- == 0) {
a3b680e6 3898 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3899 if (PL_reg_poscache) {
eb160463 3900 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3901 Renew(PL_reg_poscache, size, char);
3902 PL_reg_poscache_size = size;
3903 }
3904 Zero(PL_reg_poscache, size, char);
3905 }
3906 else {
3907 PL_reg_poscache_size = size;
a02a5408 3908 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3909 }
a3621e74 3910 DEBUG_EXECUTE_r(
2c2d71f5
JH
3911 PerlIO_printf(Perl_debug_log,
3912 "%sDetected a super-linear match, switching on caching%s...\n",
3913 PL_colors[4], PL_colors[5])
3914 );
3915 }
3916 if (PL_reg_leftiter < 0) {
d8319b27 3917 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3918
d8319b27
DM
3919 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3920 + st->u.whilem.cache_offset * (scan->flags>>4);
3921 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3922 st->u.whilem.cache_offset /= 8;
3923 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3924 DEBUG_EXECUTE_r(
2c2d71f5
JH
3925 PerlIO_printf(Perl_debug_log,
3926 "%*s already tried at this position...\n",
3927 REPORT_CODE_OFF+PL_regindent*2, "")
3928 );
3ab3c9b4
HS
3929 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3930 /* cache records success */
c2b0868c
HS
3931 sayYES;
3932 else
3ab3c9b4 3933 /* cache records failure */
c2b0868c 3934 sayNO_SILENT;
2c2d71f5 3935 }
2c2d71f5
JH
3936 }
3937 }
3938
a0d0e21e
LW
3939 /* Prefer next over scan for minimal matching. */
3940
5d9a96ca 3941 if (st->cc->minmod) {
d8319b27
DM
3942 st->u.whilem.savecc = st->cc;
3943 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3944 if (st->cc)
d8319b27
DM
3945 st->ln = st->cc->u.curlyx.cur;
3946 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3947 REGCP_SET(st->u.whilem.lastcp);
3948 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3949 /*** all unsaved local vars undefined at this point */
d8319b27 3950 st->cc = st->u.whilem.savecc;
95b24440 3951 if (result) {
d8319b27 3952 regcpblow(st->u.whilem.cp);
3ab3c9b4 3953 CACHEsayYES; /* All done. */
5f05dabc 3954 }
d8319b27 3955 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3956 regcppop(rex);
d8319b27
DM
3957 if (st->cc->u.curlyx.outercc)
3958 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3959
d8319b27 3960 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3961 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3962 && !(PL_reg_flags & RF_warned)) {
3963 PL_reg_flags |= RF_warned;
9014280d 3964 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3965 "Complex regular subexpression recursion",
3966 REG_INFTY - 1);
c277df42 3967 }
3ab3c9b4 3968 CACHEsayNO;
c277df42 3969 }
a687059c 3970
a3621e74 3971 DEBUG_EXECUTE_r(
c3464db5
DD
3972 PerlIO_printf(Perl_debug_log,
3973 "%*s trying longer...\n",
3280af22 3974 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3975 );
a0d0e21e 3976 /* Try scanning more and see if it helps. */
3280af22 3977 PL_reginput = locinput;
d8319b27
DM
3978 st->cc->u.curlyx.cur = n;
3979 st->cc->u.curlyx.lastloc = locinput;
3980 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3981 REGCP_SET(st->u.whilem.lastcp);
3982 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3983 /*** all unsaved local vars undefined at this point */
3984 if (result) {
d8319b27 3985 regcpblow(st->u.whilem.cp);
3ab3c9b4 3986 CACHEsayYES;
5f05dabc 3987 }
d8319b27 3988 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3989 regcppop(rex);
d8319b27
DM
3990 st->cc->u.curlyx.cur = n - 1;
3991 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3992 CACHEsayNO;
a0d0e21e
LW
3993 }
3994
3995 /* Prefer scan over next for maximal matching. */
3996
d8319b27
DM
3997 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3998 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3999 st->cc->u.curlyx.cur = n;
4000 st->cc->u.curlyx.lastloc = locinput;
4001 REGCP_SET(st->u.whilem.lastcp);
4002 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
4003 /*** all unsaved local vars undefined at this point */
4004 if (result) {
d8319b27 4005 regcpblow(st->u.whilem.cp);
3ab3c9b4 4006 CACHEsayYES;
5f05dabc 4007 }
d8319b27 4008 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 4009 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 4010 PL_reginput = locinput;
a3621e74 4011 DEBUG_EXECUTE_r(
c3464db5
DD
4012 PerlIO_printf(Perl_debug_log,
4013 "%*s failed, try continuation...\n",
3280af22 4014 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4015 );
4016 }
9041c2e3 4017 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4018 && !(PL_reg_flags & RF_warned)) {
3280af22 4019 PL_reg_flags |= RF_warned;
9014280d 4020 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4021 "Complex regular subexpression recursion",
4022 REG_INFTY - 1);
a0d0e21e
LW
4023 }
4024
4025 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4026 st->u.whilem.savecc = st->cc;
4027 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4028 if (st->cc)
d8319b27
DM
4029 st->ln = st->cc->u.curlyx.cur;
4030 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4031 /*** all unsaved local vars undefined at this point */
d8319b27 4032 st->cc = st->u.whilem.savecc;
95b24440 4033 if (result)
3ab3c9b4 4034 CACHEsayYES;
d8319b27
DM
4035 if (st->cc->u.curlyx.outercc)
4036 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4037 st->cc->u.curlyx.cur = n - 1;
4038 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4039 CACHEsayNO;
a0d0e21e 4040 }
5f66b61c 4041 /* NOTREACHED */
40a82448
DM
4042
4043#undef ST
4044#define ST st->u.branch
4045
4046 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4047 next = scan + ARG(scan);
4048 if (next == scan)
4049 next = NULL;
40a82448
DM
4050 scan = NEXTOPER(scan);
4051 /* FALL THROUGH */
c277df42 4052
40a82448
DM
4053 case BRANCH: /* /(...|A|...)/ */
4054 scan = NEXTOPER(scan); /* scan now points to inner node */
4055 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4056 /* last branch; skip state push and jump direct to node */
4057 continue;
4058 ST.lastparen = *PL_reglastparen;
4059 ST.next_branch = next;
4060 REGCP_SET(ST.cp);
4061 PL_reginput = locinput;
02db2b7b 4062
40a82448
DM
4063 /* Now go into the branch */
4064 PUSH_STATE_GOTO(BRANCH_next, scan);
4065 /* NOTREACHED */
4066
4067 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4068 REGCP_UNWIND(ST.cp);
4069 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4070 PL_regendp[n] = -1;
4071 *PL_reglastparen = n;
4072 scan = ST.next_branch;
4073 /* no more branches? */
4074 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4075 sayNO;
4076 continue; /* execute next BRANCH[J] op */
4077 /* NOTREACHED */
4078
a0d0e21e 4079 case MINMOD:
5d9a96ca 4080 st->minmod = 1;
a0d0e21e 4081 break;
40a82448
DM
4082
4083#undef ST
4084#define ST st->u.curlym
4085
4086 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4087
4088 /* This is an optimisation of CURLYX that enables us to push
4089 * only a single backtracking state, no matter now many matches
4090 * there are in {m,n}. It relies on the pattern being constant
4091 * length, with no parens to influence future backrefs
4092 */
4093
4094 ST.me = scan;
dc45a647 4095 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4096
4097 /* if paren positive, emulate an OPEN/CLOSE around A */
4098 if (ST.me->flags) {
4099 I32 paren = ST.me->flags;
4100 if (paren > PL_regsize)
4101 PL_regsize = paren;
4102 if (paren > (I32)*PL_reglastparen)
4103 *PL_reglastparen = paren;
c277df42 4104 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4105 }
40a82448
DM
4106 ST.A = scan;
4107 ST.B = next;
4108 ST.alen = 0;
4109 ST.count = 0;
4110 ST.minmod = st->minmod;
4111 st->minmod = 0;
4112 ST.c1 = CHRTEST_UNINIT;
4113 REGCP_SET(ST.cp);
6407bf3b 4114
40a82448
DM
4115 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4116 goto curlym_do_B;
4117
4118 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4119 PL_reginput = locinput;
40a82448
DM
4120 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4121 /* NOTREACHED */
5f80c4cf 4122
40a82448
DM
4123 case CURLYM_A: /* we've just matched an A */
4124 locinput = st->locinput;
4125 nextchr = UCHARAT(locinput);
4126
4127 ST.count++;
4128 /* after first match, determine A's length: u.curlym.alen */
4129 if (ST.count == 1) {
4130 if (PL_reg_match_utf8) {
4131 char *s = locinput;
4132 while (s < PL_reginput) {
4133 ST.alen++;
4134 s += UTF8SKIP(s);
4135 }
4136 }
4137 else {
4138 ST.alen = PL_reginput - locinput;
4139 }
4140 if (ST.alen == 0)
4141 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4142 }
0cadcf80
DM
4143 DEBUG_EXECUTE_r(
4144 PerlIO_printf(Perl_debug_log,
40a82448 4145 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 4146 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 4147 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4148 );
4149
40a82448
DM
4150 locinput = PL_reginput;
4151 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4152 goto curlym_do_A; /* try to match another A */
4153 goto curlym_do_B; /* try to match B */
5f80c4cf 4154
40a82448
DM
4155 case CURLYM_A_fail: /* just failed to match an A */
4156 REGCP_UNWIND(ST.cp);
4157 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4158 sayNO;
0cadcf80 4159
40a82448
DM
4160 curlym_do_B: /* execute the B in /A{m,n}B/ */
4161 PL_reginput = locinput;
4162 if (ST.c1 == CHRTEST_UNINIT) {
4163 /* calculate c1 and c2 for possible match of 1st char
4164 * following curly */
4165 ST.c1 = ST.c2 = CHRTEST_VOID;
4166 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4167 regnode *text_node = ST.B;
4168 if (! HAS_TEXT(text_node))
4169 FIND_NEXT_IMPT(text_node);
4170 if (HAS_TEXT(text_node)
4171 && PL_regkind[OP(text_node)] != REF)
4172 {
4173 ST.c1 = (U8)*STRING(text_node);
4174 ST.c2 =
4175 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4176 ? PL_fold[ST.c1]
4177 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4178 ? PL_fold_locale[ST.c1]
4179 : ST.c1;
c277df42 4180 }
c277df42 4181 }
40a82448
DM
4182 }
4183
4184 DEBUG_EXECUTE_r(
4185 PerlIO_printf(Perl_debug_log,
4186 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4187 (int)(REPORT_CODE_OFF+PL_regindent*2),
4188 "", (IV)ST.count)
4189 );
4190 if (ST.c1 != CHRTEST_VOID
4191 && UCHARAT(PL_reginput) != ST.c1
4192 && UCHARAT(PL_reginput) != ST.c2)
4193 {
4194 /* simulate B failing */
4195 state_num = CURLYM_B_fail;
4196 goto reenter_switch;
4197 }
4198
4199 if (ST.me->flags) {
4200 /* mark current A as captured */
4201 I32 paren = ST.me->flags;
4202 if (ST.count) {
4203 PL_regstartp[paren]
4204 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4205 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4206 }
40a82448
DM
4207 else
4208 PL_regendp[paren] = -1;
c277df42 4209 }
40a82448 4210 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4211 /* NOTREACHED */
40a82448
DM
4212
4213 case CURLYM_B_fail: /* just failed to match a B */
4214 REGCP_UNWIND(ST.cp);
4215 if (ST.minmod) {
4216 if (ST.count == ARG2(ST.me) /* max */)
4217 sayNO;
4218 goto curlym_do_A; /* try to match a further A */
4219 }
4220 /* backtrack one A */
4221 if (ST.count == ARG1(ST.me) /* min */)
4222 sayNO;
4223 ST.count--;
4224 locinput = HOPc(locinput, -ST.alen);
4225 goto curlym_do_B; /* try to match B */
4226
c255a977
DM
4227#undef ST
4228#define ST st->u.curly
40a82448 4229
c255a977
DM
4230#define CURLY_SETPAREN(paren, success) \
4231 if (paren) { \
4232 if (success) { \
4233 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4234 PL_regendp[paren] = locinput - PL_bostr; \
4235 } \
4236 else \
4237 PL_regendp[paren] = -1; \
4238 }
4239
4240 case STAR: /* /A*B/ where A is width 1 */
4241 ST.paren = 0;
4242 ST.min = 0;
4243 ST.max = REG_INFTY;
a0d0e21e
LW
4244 scan = NEXTOPER(scan);
4245 goto repeat;
c255a977
DM
4246 case PLUS: /* /A+B/ where A is width 1 */
4247 ST.paren = 0;
4248 ST.min = 1;
4249 ST.max = REG_INFTY;
c277df42 4250 scan = NEXTOPER(scan);
c255a977
DM
4251 goto repeat;
4252 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4253 ST.paren = scan->flags; /* Which paren to set */
4254 if (ST.paren > PL_regsize)
4255 PL_regsize = ST.paren;
4256 if (ST.paren > (I32)*PL_reglastparen)
4257 *PL_reglastparen = ST.paren;
4258 ST.min = ARG1(scan); /* min to match */
4259 ST.max = ARG2(scan); /* max to match */
4260 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4261 goto repeat;
4262 case CURLY: /* /A{m,n}B/ where A is width 1 */
4263 ST.paren = 0;
4264 ST.min = ARG1(scan); /* min to match */
4265 ST.max = ARG2(scan); /* max to match */
4266 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4267 repeat:
a0d0e21e
LW
4268 /*
4269 * Lookahead to avoid useless match attempts
4270 * when we know what character comes next.
c255a977 4271 *
5f80c4cf
JP
4272 * Used to only do .*x and .*?x, but now it allows
4273 * for )'s, ('s and (?{ ... })'s to be in the way
4274 * of the quantifier and the EXACT-like node. -- japhy
4275 */
4276
c255a977
DM
4277 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4278 sayNO;
cca55fe3 4279 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4280 U8 *s;
4281 regnode *text_node = next;
4282
3dab1dad
YO
4283 if (! HAS_TEXT(text_node))
4284 FIND_NEXT_IMPT(text_node);
5f80c4cf 4285
9e137952 4286 if (! HAS_TEXT(text_node))
c255a977 4287 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4288 else {
3dab1dad 4289 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4290 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4291 goto assume_ok_easy;
cca55fe3 4292 }
be8e71aa
YO
4293 else
4294 s = (U8*)STRING(text_node);
5f80c4cf
JP
4295
4296 if (!UTF) {
c255a977 4297 ST.c2 = ST.c1 = *s;
f65d3ee7 4298 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4299 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4300 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4301 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4302 }
5f80c4cf 4303 else { /* UTF */
f65d3ee7 4304 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4305 STRLEN ulen1, ulen2;
89ebb4a3
JH
4306 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4307 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4308
4309 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4310 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4311
c255a977 4312 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4313 uniflags);
c255a977 4314 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4315 uniflags);
5f80c4cf
JP
4316 }
4317 else {
c255a977 4318 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4319 uniflags);
5f80c4cf 4320 }
1aa99e6b
IH
4321 }
4322 }
bbce6d69 4323 }
a0d0e21e 4324 else
c255a977 4325 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4326 assume_ok_easy:
c255a977
DM
4327
4328 ST.A = scan;
4329 ST.B = next;
3280af22 4330 PL_reginput = locinput;
5d9a96ca
DM
4331 if (st->minmod) {
4332 st->minmod = 0;
c255a977 4333 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4334 sayNO;
c255a977 4335 ST.count = ST.min;
a0ed51b3 4336 locinput = PL_reginput;
c255a977
DM
4337 REGCP_SET(ST.cp);
4338 if (ST.c1 == CHRTEST_VOID)
4339 goto curly_try_B_min;
4340
4341 ST.oldloc = locinput;
4342
4343 /* set ST.maxpos to the furthest point along the
4344 * string that could possibly match */
4345 if (ST.max == REG_INFTY) {
4346 ST.maxpos = PL_regeol - 1;
4347 if (do_utf8)
4348 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4349 ST.maxpos--;
4350 }
4351 else if (do_utf8) {
4352 int m = ST.max - ST.min;
4353 for (ST.maxpos = locinput;
4354 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4355 ST.maxpos += UTF8SKIP(ST.maxpos);
4356 }
4357 else {
4358 ST.maxpos = locinput + ST.max - ST.min;
4359 if (ST.maxpos >= PL_regeol)
4360 ST.maxpos = PL_regeol - 1;
4361 }
4362 goto curly_try_B_min_known;
4363
4364 }
4365 else {
4366 ST.count = regrepeat(rex, ST.A, ST.max);
4367 locinput = PL_reginput;
4368 if (ST.count < ST.min)
4369 sayNO;
4370 if ((ST.count > ST.min)
4371 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4372 {
4373 /* A{m,n} must come at the end of the string, there's
4374 * no point in backing off ... */
4375 ST.min = ST.count;
4376 /* ...except that $ and \Z can match before *and* after
4377 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4378 We may back off by one in this case. */
4379 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4380 ST.min--;
4381 }
4382 REGCP_SET(ST.cp);
4383 goto curly_try_B_max;
4384 }
4385 /* NOTREACHED */
4386
4387
4388 case CURLY_B_min_known_fail:
4389 /* failed to find B in a non-greedy match where c1,c2 valid */
4390 if (ST.paren && ST.count)
4391 PL_regendp[ST.paren] = -1;
4392
4393 PL_reginput = locinput; /* Could be reset... */
4394 REGCP_UNWIND(ST.cp);
4395 /* Couldn't or didn't -- move forward. */
4396 ST.oldloc = locinput;
4397 if (do_utf8)
4398 locinput += UTF8SKIP(locinput);
4399 else
4400 locinput++;
4401 ST.count++;
4402 curly_try_B_min_known:
4403 /* find the next place where 'B' could work, then call B */
4404 {
4405 int n;
4406 if (do_utf8) {
4407 n = (ST.oldloc == locinput) ? 0 : 1;
4408 if (ST.c1 == ST.c2) {
4409 STRLEN len;
4410 /* set n to utf8_distance(oldloc, locinput) */
4411 while (locinput <= ST.maxpos &&
4412 utf8n_to_uvchr((U8*)locinput,
4413 UTF8_MAXBYTES, &len,
4414 uniflags) != (UV)ST.c1) {
4415 locinput += len;
4416 n++;
4417 }
1aa99e6b
IH
4418 }
4419 else {
c255a977
DM
4420 /* set n to utf8_distance(oldloc, locinput) */
4421 while (locinput <= ST.maxpos) {
4422 STRLEN len;
4423 const UV c = utf8n_to_uvchr((U8*)locinput,
4424 UTF8_MAXBYTES, &len,
4425 uniflags);
4426 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4427 break;
4428 locinput += len;
4429 n++;
1aa99e6b 4430 }
0fe9bf95
IZ
4431 }
4432 }
c255a977
DM
4433 else {
4434 if (ST.c1 == ST.c2) {
4435 while (locinput <= ST.maxpos &&
4436 UCHARAT(locinput) != ST.c1)
4437 locinput++;
bbce6d69 4438 }
c255a977
DM
4439 else {
4440 while (locinput <= ST.maxpos
4441 && UCHARAT(locinput) != ST.c1
4442 && UCHARAT(locinput) != ST.c2)
4443 locinput++;
a0ed51b3 4444 }
c255a977
DM
4445 n = locinput - ST.oldloc;
4446 }
4447 if (locinput > ST.maxpos)
4448 sayNO;
4449 /* PL_reginput == oldloc now */
4450 if (n) {
4451 ST.count += n;
4452 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4453 sayNO;
a0d0e21e 4454 }
c255a977
DM
4455 PL_reginput = locinput;
4456 CURLY_SETPAREN(ST.paren, ST.count);
4457 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4458 }
c255a977
DM
4459 /* NOTREACHED */
4460
4461
4462 case CURLY_B_min_fail:
4463 /* failed to find B in a non-greedy match where c1,c2 invalid */
4464 if (ST.paren && ST.count)
4465 PL_regendp[ST.paren] = -1;
4466
4467 REGCP_UNWIND(ST.cp);
4468 /* failed -- move forward one */
4469 PL_reginput = locinput;
4470 if (regrepeat(rex, ST.A, 1)) {
4471 ST.count++;
a0ed51b3 4472 locinput = PL_reginput;
c255a977
DM
4473 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4474 ST.count > 0)) /* count overflow ? */
15272685 4475 {
c255a977
DM
4476 curly_try_B_min:
4477 CURLY_SETPAREN(ST.paren, ST.count);
4478 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4479 }
4480 }
4633a7c4 4481 sayNO;
c255a977
DM
4482 /* NOTREACHED */
4483
4484
4485 curly_try_B_max:
4486 /* a successful greedy match: now try to match B */
4487 {
4488 UV c = 0;
4489 if (ST.c1 != CHRTEST_VOID)
4490 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4491 UTF8_MAXBYTES, 0, uniflags)
466787eb 4492 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4493 /* If it could work, try it. */
4494 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4495 CURLY_SETPAREN(ST.paren, ST.count);
4496 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4497 /* NOTREACHED */
4498 }
4499 }
4500 /* FALL THROUGH */
4501 case CURLY_B_max_fail:
4502 /* failed to find B in a greedy match */
4503 if (ST.paren && ST.count)
4504 PL_regendp[ST.paren] = -1;
4505
4506 REGCP_UNWIND(ST.cp);
4507 /* back up. */
4508 if (--ST.count < ST.min)
4509 sayNO;
4510 PL_reginput = locinput = HOPc(locinput, -1);
4511 goto curly_try_B_max;
4512
4513#undef ST
4514
4515
a0d0e21e 4516 case END:
3b0527fe 4517 if (locinput < reginfo->till) {
a3621e74 4518 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4519 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4520 PL_colors[4],
4521 (long)(locinput - PL_reg_starttry),
3b0527fe 4522 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4523 PL_colors[5]));
4524 sayNO_FINAL; /* Cannot match: too short. */
4525 }
4526 PL_reginput = locinput; /* put where regtry can find it */
4527 sayYES_FINAL; /* Success! */
dad79028
DM
4528
4529 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4530 DEBUG_EXECUTE_r(
4531 PerlIO_printf(Perl_debug_log,
4532 "%*s %ssubpattern success...%s\n",
4533 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4534 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4535 sayYES_FINAL; /* Success! */
4536
40a82448
DM
4537#undef ST
4538#define ST st->u.ifmatch
4539
4540 case SUSPEND: /* (?>A) */
4541 ST.wanted = 1;
9fe1d20c 4542 PL_reginput = locinput;
9041c2e3 4543 goto do_ifmatch;
dad79028 4544
40a82448
DM
4545 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4546 ST.wanted = 0;
dad79028
DM
4547 goto ifmatch_trivial_fail_test;
4548
40a82448
DM
4549 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4550 ST.wanted = 1;
dad79028 4551 ifmatch_trivial_fail_test:
a0ed51b3 4552 if (scan->flags) {
52657f30 4553 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4554 if (!s) {
4555 /* trivial fail */
4556 if (st->logical) {
4557 st->logical = 0;
40a82448 4558 st->sw = 1 - (bool)ST.wanted;
dad79028 4559 }
40a82448 4560 else if (ST.wanted)
dad79028
DM
4561 sayNO;
4562 next = scan + ARG(scan);
4563 if (next == scan)
4564 next = NULL;
4565 break;
4566 }
efb30f32 4567 PL_reginput = s;
a0ed51b3
LW
4568 }
4569 else
4570 PL_reginput = locinput;
4571
c277df42 4572 do_ifmatch:
40a82448
DM
4573 ST.me = scan;
4574 /* execute body of (?...A) */
4575 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4576 /* NOTREACHED */
4577
4578 case IFMATCH_A_fail: /* body of (?...A) failed */
4579 ST.wanted = !ST.wanted;
4580 /* FALL THROUGH */
4581
4582 case IFMATCH_A: /* body of (?...A) succeeded */
4583 if (st->logical) {
4584 st->logical = 0;
4585 st->sw = (bool)ST.wanted;
4586 }
4587 else if (!ST.wanted)
4588 sayNO;
4589
4590 if (OP(ST.me) == SUSPEND)
4591 locinput = PL_reginput;
4592 else {
4593 locinput = PL_reginput = st->locinput;
4594 nextchr = UCHARAT(locinput);
4595 }
4596 scan = ST.me + ARG(ST.me);
4597 if (scan == ST.me)
4598 scan = NULL;
4599 continue; /* execute B */
4600
4601#undef ST
dad79028 4602
c277df42 4603 case LONGJMP:
c277df42
IZ
4604 next = scan + ARG(scan);
4605 if (next == scan)
4606 next = NULL;
a0d0e21e
LW
4607 break;
4608 default:
b900a521 4609 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4610 PTR2UV(scan), OP(scan));
cea2e8a9 4611 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4612 }
95b24440 4613
a0d0e21e 4614 scan = next;
95b24440
DM
4615 continue;
4616 /* NOTREACHED */
4617
40a82448
DM
4618 push_yes_state:
4619 /* push a state that backtracks on success */
4620 st->u.yes.prev_yes_state = yes_state;
4621 yes_state = st;
4622 /* FALL THROUGH */
4623 push_state:
4624 /* push a new regex state, then continue at scan */
4625 {
4626 regmatch_state *newst;
4627
4628 depth++;
4629 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4630 "PUSH STATE(%d)\n", depth));
4631 st->locinput = locinput;
4632 newst = st+1;
4633 if (newst > SLAB_LAST(PL_regmatch_slab))
4634 newst = S_push_slab(aTHX);
4635 PL_regmatch_state = newst;
4636 newst->cc = st->cc;
4637 /* XXX probably don't need to initialise these */
4638 newst->minmod = 0;
4639 newst->sw = 0;
4640 newst->logical = 0;
4641
4642 locinput = PL_reginput;
4643 nextchr = UCHARAT(locinput);
4644 st = newst;
4645 continue;
4646 /* NOTREACHED */
4647 }
4648
95b24440
DM
4649 /* simulate recursively calling regmatch(), but without actually
4650 * recursing - ie save the current state on the heap rather than on
4651 * the stack, then re-enter the loop. This avoids complex regexes
4652 * blowing the processor stack */
4653
4654 start_recurse:
4655 {
5d9a96ca
DM
4656 /* push new state */
4657 regmatch_state *oldst = st;
4658
4659 depth++;
40a82448 4660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4661
4662 /* grab the next free state slot */
4663 st++;
86545054 4664 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4665 st = S_push_slab(aTHX);
4666 PL_regmatch_state = st;
4667
4668 oldst->next = next;
4669 oldst->n = n;
4670 oldst->locinput = locinput;
5d9a96ca
DM
4671
4672 st->cc = oldst->cc;
95b24440
DM
4673 locinput = PL_reginput;
4674 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4675 st->minmod = 0;
4676 st->sw = 0;
4677 st->logical = 0;
95b24440
DM
4678#ifdef DEBUGGING
4679 PL_regindent++;
4680#endif
4681 }
a0d0e21e 4682 }
a687059c 4683
aa283a38
DM
4684
4685
a0d0e21e
LW
4686 /*
4687 * We get here only if there's trouble -- normally "case END" is
4688 * the terminating point.
4689 */
cea2e8a9 4690 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4691 /*NOTREACHED*/
4633a7c4
LW
4692 sayNO;
4693
7821416a 4694yes_final:
77cb431f
DM
4695
4696 if (yes_state) {
4697 /* we have successfully completed a subexpression, but we must now
4698 * pop to the state marked by yes_state and continue from there */
4699
4700 assert(st != yes_state);
4701 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4702 || yes_state > SLAB_LAST(PL_regmatch_slab))
4703 {
4704 /* not in this slab, pop slab */
4705 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4706 PL_regmatch_slab = PL_regmatch_slab->prev;
4707 st = SLAB_LAST(PL_regmatch_slab);
4708 }
4709 depth -= (st - yes_state);
40a82448
DM
4710 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4711 depth+1, depth+(st - yes_state)));
77cb431f
DM
4712 st = yes_state;
4713 yes_state = st->u.yes.prev_yes_state;
4714 PL_regmatch_state = st;
4715
4716 switch (st->resume_state) {
40a82448
DM
4717 case IFMATCH_A:
4718 case CURLYM_A:
40a82448
DM
4719 case EVAL_A:
4720 state_num = st->resume_state;
4721 goto reenter_switch;
4722
4723 case CURLYM_B:
166ba7cd
DM
4724 case BRANCH_next:
4725 case TRIE_next:
c255a977 4726 case CURLY_B_max:
77cb431f 4727 default:
40a82448 4728 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4729 }
4730 }
4731
a3621e74 4732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4733 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4734yes:
4735#ifdef DEBUGGING
3280af22 4736 PL_regindent--;
4633a7c4 4737#endif
02db2b7b 4738
95b24440 4739 result = 1;
aa283a38 4740 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4741 * will disappear when REGFMATCH goes */
aa283a38
DM
4742 if (depth) {
4743 /* restore previous state and re-enter */
40a82448
DM
4744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4745 depth--;
4746 st--;
4747 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4748 PL_regmatch_slab = PL_regmatch_slab->prev;
4749 st = SLAB_LAST(PL_regmatch_slab);
4750 }
4751 PL_regmatch_state = st;
4752 scan = st->scan;
4753 next = st->next;
4754 n = st->n;
4755 locinput= st->locinput;
4756 nextchr = UCHARAT(locinput);
aa283a38
DM
4757
4758 switch (st->resume_state) {
aa283a38
DM
4759 case resume_CURLYX:
4760 goto resume_point_CURLYX;
4761 case resume_WHILEM1:
4762 goto resume_point_WHILEM1;
4763 case resume_WHILEM2:
4764 goto resume_point_WHILEM2;
4765 case resume_WHILEM3:
4766 goto resume_point_WHILEM3;
4767 case resume_WHILEM4:
4768 goto resume_point_WHILEM4;
4769 case resume_WHILEM5:
4770 goto resume_point_WHILEM5;
4771 case resume_WHILEM6:
4772 goto resume_point_WHILEM6;
77cb431f 4773
166ba7cd 4774 case TRIE_next:
40a82448
DM
4775 case CURLYM_A:
4776 case CURLYM_B:
4777 case EVAL_A:
4778 case IFMATCH_A:
4779 case BRANCH_next:
c255a977
DM
4780 case CURLY_B_max:
4781 case CURLY_B_min:
4782 case CURLY_B_min_known:
40a82448
DM
4783 break;
4784
aa283a38
DM
4785 default:
4786 Perl_croak(aTHX_ "regexp resume memory corruption");
4787 }
4788 }
4789 goto final_exit;
4633a7c4
LW
4790
4791no:
a3621e74 4792 DEBUG_EXECUTE_r(
7821416a
IZ
4793 PerlIO_printf(Perl_debug_log,
4794 "%*s %sfailed...%s\n",
e4584336 4795 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4796 );
7821416a
IZ
4797no_final:
4798do_no:
aa283a38 4799
4633a7c4 4800#ifdef DEBUGGING
3280af22 4801 PL_regindent--;
4633a7c4 4802#endif
95b24440 4803 result = 0;
5d9a96ca 4804
aa283a38
DM
4805 if (depth) {
4806 /* there's a previous state to backtrack to */
40a82448
DM
4807 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4808 depth--;
4809 st--;
4810 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4811 PL_regmatch_slab = PL_regmatch_slab->prev;
4812 st = SLAB_LAST(PL_regmatch_slab);
4813 }
4814 PL_regmatch_state = st;
4815 scan = st->scan;
4816 next = st->next;
4817 n = st->n;
4818 locinput= st->locinput;
4819 nextchr = UCHARAT(locinput);
4820
5d9a96ca 4821 switch (st->resume_state) {
95b24440
DM
4822 case resume_CURLYX:
4823 goto resume_point_CURLYX;
4824 case resume_WHILEM1:
4825 goto resume_point_WHILEM1;
4826 case resume_WHILEM2:
4827 goto resume_point_WHILEM2;
4828 case resume_WHILEM3:
4829 goto resume_point_WHILEM3;
4830 case resume_WHILEM4:
4831 goto resume_point_WHILEM4;
4832 case resume_WHILEM5:
4833 goto resume_point_WHILEM5;
4834 case resume_WHILEM6:
4835 goto resume_point_WHILEM6;
dad79028 4836
166ba7cd 4837 case TRIE_next:
40a82448
DM
4838 case EVAL_A:
4839 case BRANCH_next:
4840 case CURLYM_A:
4841 case CURLYM_B:
4842 case IFMATCH_A:
c255a977
DM
4843 case CURLY_B_max:
4844 case CURLY_B_min:
4845 case CURLY_B_min_known:
40a82448
DM
4846 if (yes_state == st)
4847 yes_state = st->u.yes.prev_yes_state;
4848 state_num = st->resume_state + 1; /* failure = success + 1 */
4849 goto reenter_switch;
dad79028 4850
95b24440
DM
4851 default:
4852 Perl_croak(aTHX_ "regexp resume memory corruption");
4853 }
95b24440 4854 }
aa283a38
DM
4855
4856final_exit:
4857
5d9a96ca
DM
4858 /* restore original high-water mark */
4859 PL_regmatch_slab = orig_slab;
4860 PL_regmatch_state = orig_state;
4861
4862 /* free all slabs above current one */
4863 if (orig_slab->next) {
c4fd8992 4864 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4865 orig_slab->next = NULL;
4866 while (sl) {
c4fd8992 4867 regmatch_slab * const osl = sl;
5d9a96ca 4868 sl = sl->next;
ad65c075 4869 Safefree(osl);
5d9a96ca
DM
4870 }
4871 }
4872
95b24440
DM
4873 return result;
4874
a687059c
LW
4875}
4876
4877/*
4878 - regrepeat - repeatedly match something simple, report how many
4879 */
4880/*
4881 * [This routine now assumes that it will only match on things of length 1.
4882 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4883 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4884 */
76e3520e 4885STATIC I32
32fc9b6a 4886S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4887{
27da23d5 4888 dVAR;
a0d0e21e 4889 register char *scan;
a0d0e21e 4890 register I32 c;
3280af22 4891 register char *loceol = PL_regeol;
a0ed51b3 4892 register I32 hardcount = 0;
53c4c00c 4893 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4894
3280af22 4895 scan = PL_reginput;
faf11cac
HS
4896 if (max == REG_INFTY)
4897 max = I32_MAX;
4898 else if (max < loceol - scan)
7f596f4c 4899 loceol = scan + max;
a0d0e21e 4900 switch (OP(p)) {
22c35a8c 4901 case REG_ANY:
1aa99e6b 4902 if (do_utf8) {
ffc61ed2 4903 loceol = PL_regeol;
1aa99e6b 4904 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4905 scan += UTF8SKIP(scan);
4906 hardcount++;
4907 }
4908 } else {
4909 while (scan < loceol && *scan != '\n')
4910 scan++;
a0ed51b3
LW
4911 }
4912 break;
ffc61ed2 4913 case SANY:
def8e4ea
JH
4914 if (do_utf8) {
4915 loceol = PL_regeol;
a0804c9e 4916 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4917 scan += UTF8SKIP(scan);
4918 hardcount++;
4919 }
4920 }
4921 else
4922 scan = loceol;
a0ed51b3 4923 break;
f33976b4
DB
4924 case CANY:
4925 scan = loceol;
4926 break;
090f7165
JH
4927 case EXACT: /* length of string is 1 */
4928 c = (U8)*STRING(p);
4929 while (scan < loceol && UCHARAT(scan) == c)
4930 scan++;
bbce6d69 4931 break;
4932 case EXACTF: /* length of string is 1 */
cd439c50 4933 c = (U8)*STRING(p);
bbce6d69 4934 while (scan < loceol &&
22c35a8c 4935 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4936 scan++;
4937 break;
4938 case EXACTFL: /* length of string is 1 */
3280af22 4939 PL_reg_flags |= RF_tainted;
cd439c50 4940 c = (U8)*STRING(p);
bbce6d69 4941 while (scan < loceol &&
22c35a8c 4942 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4943 scan++;
4944 break;
4945 case ANYOF:
ffc61ed2
JH
4946 if (do_utf8) {
4947 loceol = PL_regeol;
cfc92286 4948 while (hardcount < max && scan < loceol &&
32fc9b6a 4949 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4950 scan += UTF8SKIP(scan);
4951 hardcount++;
4952 }
4953 } else {
32fc9b6a 4954 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4955 scan++;
4956 }
a0d0e21e
LW
4957 break;
4958 case ALNUM:
1aa99e6b 4959 if (do_utf8) {
ffc61ed2 4960 loceol = PL_regeol;
1a4fad37 4961 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4962 while (hardcount < max && scan < loceol &&
3568d838 4963 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4964 scan += UTF8SKIP(scan);
4965 hardcount++;
4966 }
4967 } else {
4968 while (scan < loceol && isALNUM(*scan))
4969 scan++;
a0ed51b3
LW
4970 }
4971 break;
bbce6d69 4972 case ALNUML:
3280af22 4973 PL_reg_flags |= RF_tainted;
1aa99e6b 4974 if (do_utf8) {
ffc61ed2 4975 loceol = PL_regeol;
1aa99e6b
IH
4976 while (hardcount < max && scan < loceol &&
4977 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4978 scan += UTF8SKIP(scan);
4979 hardcount++;
4980 }
4981 } else {
4982 while (scan < loceol && isALNUM_LC(*scan))
4983 scan++;
a0ed51b3
LW
4984 }
4985 break;
a0d0e21e 4986 case NALNUM:
1aa99e6b 4987 if (do_utf8) {
ffc61ed2 4988 loceol = PL_regeol;
1a4fad37 4989 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4990 while (hardcount < max && scan < loceol &&
3568d838 4991 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4992 scan += UTF8SKIP(scan);
4993 hardcount++;
4994 }
4995 } else {
4996 while (scan < loceol && !isALNUM(*scan))
4997 scan++;
a0ed51b3
LW
4998 }
4999 break;
bbce6d69 5000 case NALNUML:
3280af22 5001 PL_reg_flags |= RF_tainted;
1aa99e6b 5002 if (do_utf8) {
ffc61ed2 5003 loceol = PL_regeol;
1aa99e6b
IH
5004 while (hardcount < max && scan < loceol &&
5005 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5006 scan += UTF8SKIP(scan);
5007 hardcount++;
5008 }
5009 } else {
5010 while (scan < loceol && !isALNUM_LC(*scan))
5011 scan++;
a0ed51b3
LW
5012 }
5013 break;
a0d0e21e 5014 case SPACE:
1aa99e6b 5015 if (do_utf8) {
ffc61ed2 5016 loceol = PL_regeol;
1a4fad37 5017 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5018 while (hardcount < max && scan < loceol &&
3568d838
JH
5019 (*scan == ' ' ||
5020 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5021 scan += UTF8SKIP(scan);
5022 hardcount++;
5023 }
5024 } else {
5025 while (scan < loceol && isSPACE(*scan))
5026 scan++;
a0ed51b3
LW
5027 }
5028 break;
bbce6d69 5029 case SPACEL:
3280af22 5030 PL_reg_flags |= RF_tainted;
1aa99e6b 5031 if (do_utf8) {
ffc61ed2 5032 loceol = PL_regeol;
1aa99e6b 5033 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5034 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5035 scan += UTF8SKIP(scan);
5036 hardcount++;
5037 }
5038 } else {
5039 while (scan < loceol && isSPACE_LC(*scan))
5040 scan++;
a0ed51b3
LW
5041 }
5042 break;
a0d0e21e 5043 case NSPACE:
1aa99e6b 5044 if (do_utf8) {
ffc61ed2 5045 loceol = PL_regeol;
1a4fad37 5046 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5047 while (hardcount < max && scan < loceol &&
3568d838
JH
5048 !(*scan == ' ' ||
5049 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5050 scan += UTF8SKIP(scan);
5051 hardcount++;
5052 }
5053 } else {
5054 while (scan < loceol && !isSPACE(*scan))
5055 scan++;
5056 break;
a0ed51b3 5057 }
bbce6d69 5058 case NSPACEL:
3280af22 5059 PL_reg_flags |= RF_tainted;
1aa99e6b 5060 if (do_utf8) {
ffc61ed2 5061 loceol = PL_regeol;
1aa99e6b 5062 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5063 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5064 scan += UTF8SKIP(scan);
5065 hardcount++;
5066 }
5067 } else {
5068 while (scan < loceol && !isSPACE_LC(*scan))
5069 scan++;
a0ed51b3
LW
5070 }
5071 break;
a0d0e21e 5072 case DIGIT:
1aa99e6b 5073 if (do_utf8) {
ffc61ed2 5074 loceol = PL_regeol;
1a4fad37 5075 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5076 while (hardcount < max && scan < loceol &&
3568d838 5077 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5078 scan += UTF8SKIP(scan);
5079 hardcount++;
5080 }
5081 } else {
5082 while (scan < loceol && isDIGIT(*scan))
5083 scan++;
a0ed51b3
LW
5084 }
5085 break;
a0d0e21e 5086 case NDIGIT:
1aa99e6b 5087 if (do_utf8) {
ffc61ed2 5088 loceol = PL_regeol;
1a4fad37 5089 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5090 while (hardcount < max && scan < loceol &&
3568d838 5091 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5092 scan += UTF8SKIP(scan);
5093 hardcount++;
5094 }
5095 } else {
5096 while (scan < loceol && !isDIGIT(*scan))
5097 scan++;
a0ed51b3
LW
5098 }
5099 break;
a0d0e21e
LW
5100 default: /* Called on something of 0 width. */
5101 break; /* So match right here or not at all. */
5102 }
a687059c 5103
a0ed51b3
LW
5104 if (hardcount)
5105 c = hardcount;
5106 else
5107 c = scan - PL_reginput;
3280af22 5108 PL_reginput = scan;
a687059c 5109
a3621e74 5110 DEBUG_r({
e68ec53f 5111 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5112 DEBUG_EXECUTE_r({
e68ec53f
YO
5113 SV * const prop = sv_newmortal();
5114 regprop(prog, prop, p);
5115 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5116 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5117 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5118 });
be8e71aa 5119 });
9041c2e3 5120
a0d0e21e 5121 return(c);
a687059c
LW
5122}
5123
c277df42 5124
be8e71aa 5125#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5126/*
ffc61ed2
JH
5127- regclass_swash - prepare the utf8 swash
5128*/
5129
5130SV *
32fc9b6a 5131Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5132{
97aff369 5133 dVAR;
9e55ce06
JH
5134 SV *sw = NULL;
5135 SV *si = NULL;
5136 SV *alt = NULL;
3dab1dad 5137 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5138
4f639d21 5139 if (data && data->count) {
a3b680e6 5140 const U32 n = ARG(node);
ffc61ed2 5141
4f639d21
DM
5142 if (data->what[n] == 's') {
5143 SV * const rv = (SV*)data->data[n];
890ce7af 5144 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5145 SV **const ary = AvARRAY(av);
9e55ce06 5146 SV **a, **b;
9041c2e3 5147
711a919c 5148 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5149 * documentation of these array elements. */
5150
b11f357e 5151 si = *ary;
8f7f7219 5152 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5153 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5154
ffc61ed2
JH
5155 if (a)
5156 sw = *a;
5157 else if (si && doinit) {
5158 sw = swash_init("utf8", "", si, 1, 0);
5159 (void)av_store(av, 1, sw);
5160 }
9e55ce06
JH
5161 if (b)
5162 alt = *b;
ffc61ed2
JH
5163 }
5164 }
5165
9e55ce06
JH
5166 if (listsvp)
5167 *listsvp = si;
5168 if (altsvp)
5169 *altsvp = alt;
ffc61ed2
JH
5170
5171 return sw;
5172}
76234dfb 5173#endif
ffc61ed2
JH
5174
5175/*
ba7b4546 5176 - reginclass - determine if a character falls into a character class
832705d4
JH
5177
5178 The n is the ANYOF regnode, the p is the target string, lenp
5179 is pointer to the maximum length of how far to go in the p
5180 (if the lenp is zero, UTF8SKIP(p) is used),
5181 do_utf8 tells whether the target string is in UTF-8.
5182
bbce6d69 5183 */
5184
76e3520e 5185STATIC bool
32fc9b6a 5186S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5187{
27da23d5 5188 dVAR;
a3b680e6 5189 const char flags = ANYOF_FLAGS(n);
bbce6d69 5190 bool match = FALSE;
cc07378b 5191 UV c = *p;
ae9ddab8 5192 STRLEN len = 0;
9e55ce06 5193 STRLEN plen;
1aa99e6b 5194
19f67299
TS
5195 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5196 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5197 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5198 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5199 if (len == (STRLEN)-1)
5200 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5201 }
bbce6d69 5202
0f0076b4 5203 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5204 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5205 if (lenp)
5206 *lenp = 0;
ffc61ed2 5207 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5208 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5209 match = TRUE;
bbce6d69 5210 }
3568d838 5211 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5212 match = TRUE;
ffc61ed2 5213 if (!match) {
9e55ce06 5214 AV *av;
32fc9b6a 5215 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5216
5217 if (sw) {
3568d838 5218 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5219 match = TRUE;
5220 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5221 if (!match && lenp && av) {
5222 I32 i;
9e55ce06 5223 for (i = 0; i <= av_len(av); i++) {
890ce7af 5224 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5225 STRLEN len;
890ce7af 5226 const char * const s = SvPV_const(sv, len);
9e55ce06 5227
061b10df 5228 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5229 *lenp = len;
5230 match = TRUE;
5231 break;
5232 }
5233 }
5234 }
5235 if (!match) {
89ebb4a3 5236 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5237 STRLEN tmplen;
5238
9e55ce06
JH
5239 to_utf8_fold(p, tmpbuf, &tmplen);
5240 if (swash_fetch(sw, tmpbuf, do_utf8))
5241 match = TRUE;
5242 }
ffc61ed2
JH
5243 }
5244 }
bbce6d69 5245 }
9e55ce06 5246 if (match && lenp && *lenp == 0)
0f0076b4 5247 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5248 }
1aa99e6b 5249 if (!match && c < 256) {
ffc61ed2
JH
5250 if (ANYOF_BITMAP_TEST(n, c))
5251 match = TRUE;
5252 else if (flags & ANYOF_FOLD) {
eb160463 5253 U8 f;
a0ed51b3 5254
ffc61ed2
JH
5255 if (flags & ANYOF_LOCALE) {
5256 PL_reg_flags |= RF_tainted;
5257 f = PL_fold_locale[c];
5258 }
5259 else
5260 f = PL_fold[c];
5261 if (f != c && ANYOF_BITMAP_TEST(n, f))
5262 match = TRUE;
5263 }
5264
5265 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5266 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5267 if (
5268 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5274 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5275 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5276 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5281 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5282 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5283 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5284 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5285 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5286 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5287 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5288 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5289 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5290 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5291 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5292 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5293 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5294 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5295 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5296 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5297 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5298 ) /* How's that for a conditional? */
5299 {
5300 match = TRUE;
5301 }
a0ed51b3 5302 }
a0ed51b3
LW
5303 }
5304
a0ed51b3
LW
5305 return (flags & ANYOF_INVERT) ? !match : match;
5306}
161b471a 5307
dfe13c55 5308STATIC U8 *
0ce71af7 5309S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5310{
97aff369 5311 dVAR;
a0ed51b3 5312 if (off >= 0) {
1aa99e6b 5313 while (off-- && s < lim) {
ffc61ed2 5314 /* XXX could check well-formedness here */
a0ed51b3 5315 s += UTF8SKIP(s);
ffc61ed2 5316 }
a0ed51b3
LW
5317 }
5318 else {
5319 while (off++) {
1aa99e6b 5320 if (s > lim) {
a0ed51b3 5321 s--;
ffc61ed2 5322 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5323 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5324 s--;
ffc61ed2
JH
5325 }
5326 /* XXX could check well-formedness here */
a0ed51b3
LW
5327 }
5328 }
5329 }
5330 return s;
5331}
161b471a 5332
dfe13c55 5333STATIC U8 *
0ce71af7 5334S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5335{
97aff369 5336 dVAR;
a0ed51b3 5337 if (off >= 0) {
1aa99e6b 5338 while (off-- && s < lim) {
ffc61ed2 5339 /* XXX could check well-formedness here */
a0ed51b3 5340 s += UTF8SKIP(s);
ffc61ed2 5341 }
a0ed51b3 5342 if (off >= 0)
3dab1dad 5343 return NULL;
a0ed51b3
LW
5344 }
5345 else {
5346 while (off++) {
1aa99e6b 5347 if (s > lim) {
a0ed51b3 5348 s--;
ffc61ed2 5349 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5350 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5351 s--;
ffc61ed2
JH
5352 }
5353 /* XXX could check well-formedness here */
a0ed51b3
LW
5354 }
5355 else
5356 break;
5357 }
5358 if (off <= 0)
3dab1dad 5359 return NULL;
a0ed51b3
LW
5360 }
5361 return s;
5362}
51371543 5363
51371543 5364static void
acfe0abc 5365restore_pos(pTHX_ void *arg)
51371543 5366{
97aff369 5367 dVAR;
097eb12c 5368 regexp * const rex = (regexp *)arg;
51371543
GS
5369 if (PL_reg_eval_set) {
5370 if (PL_reg_oldsaved) {
4f639d21
DM
5371 rex->subbeg = PL_reg_oldsaved;
5372 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5373#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5374 rex->saved_copy = PL_nrs;
ed252734 5375#endif
4f639d21 5376 RX_MATCH_COPIED_on(rex);
51371543
GS
5377 }
5378 PL_reg_magic->mg_len = PL_reg_oldpos;
5379 PL_reg_eval_set = 0;
5380 PL_curpm = PL_reg_oldcurpm;
5381 }
5382}
33b8afdf
JH
5383
5384STATIC void
5385S_to_utf8_substr(pTHX_ register regexp *prog)
5386{
33b8afdf 5387 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5388 SV* const sv = newSVsv(prog->float_substr);
5389 prog->float_utf8 = sv;
33b8afdf
JH
5390 sv_utf8_upgrade(sv);
5391 if (SvTAIL(prog->float_substr))
5392 SvTAIL_on(sv);
5393 if (prog->float_substr == prog->check_substr)
5394 prog->check_utf8 = sv;
5395 }
5396 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5397 SV* const sv = newSVsv(prog->anchored_substr);
5398 prog->anchored_utf8 = sv;
33b8afdf
JH
5399 sv_utf8_upgrade(sv);
5400 if (SvTAIL(prog->anchored_substr))
5401 SvTAIL_on(sv);
5402 if (prog->anchored_substr == prog->check_substr)
5403 prog->check_utf8 = sv;
5404 }
5405}
5406
5407STATIC void
5408S_to_byte_substr(pTHX_ register regexp *prog)
5409{
97aff369 5410 dVAR;
33b8afdf 5411 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5412 SV* sv = newSVsv(prog->float_utf8);
5413 prog->float_substr = sv;
33b8afdf
JH
5414 if (sv_utf8_downgrade(sv, TRUE)) {
5415 if (SvTAIL(prog->float_utf8))
5416 SvTAIL_on(sv);
5417 } else {
5418 SvREFCNT_dec(sv);
5419 prog->float_substr = sv = &PL_sv_undef;
5420 }
5421 if (prog->float_utf8 == prog->check_utf8)
5422 prog->check_substr = sv;
5423 }
5424 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5425 SV* sv = newSVsv(prog->anchored_utf8);
5426 prog->anchored_substr = sv;
33b8afdf
JH
5427 if (sv_utf8_downgrade(sv, TRUE)) {
5428 if (SvTAIL(prog->anchored_utf8))
5429 SvTAIL_on(sv);
5430 } else {
5431 SvREFCNT_dec(sv);
5432 prog->anchored_substr = sv = &PL_sv_undef;
5433 }
5434 if (prog->anchored_utf8 == prog->check_utf8)
5435 prog->check_substr = sv;
5436 }
5437}
66610fdd
RGS
5438
5439/*
5440 * Local variables:
5441 * c-indentation-style: bsd
5442 * c-basic-offset: 4
5443 * indent-tabs-mode: t
5444 * End:
5445 *
37442d52
RGS
5446 * ex: set ts=8 sts=4 sw=4 noet:
5447 */