This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix the implementation of list assignment to state().
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
a687059c 274/*
e50aee73 275 * pregexec and friends
a687059c
LW
276 */
277
76234dfb 278#ifndef PERL_IN_XSUB_RE
a687059c 279/*
c277df42 280 - pregexec - match a regexp against a string
a687059c 281 */
c277df42 282I32
864dbfa3 283Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 284 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
285/* strend: pointer to null at end of string */
286/* strbeg: real beginning of string */
287/* minend: end of match must be >=minend after stringarg. */
288/* nosave: For optimizations. */
289{
290 return
9041c2e3 291 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
292 nosave ? 0 : REXEC_COPY_STR);
293}
76234dfb 294#endif
22e551b9 295
9041c2e3 296/*
cad2e5aa
JH
297 * Need to implement the following flags for reg_anch:
298 *
299 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * USE_INTUIT_ML
301 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
302 * INTUIT_AUTORITATIVE_ML
303 * INTUIT_ONCE_NOML - Intuit can match in one location only.
304 * INTUIT_ONCE_ML
305 *
306 * Another flag for this function: SECOND_TIME (so that float substrs
307 * with giant delta may be not rechecked).
308 */
309
310/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311
3f7c398e 312/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
313 Otherwise, only SvCUR(sv) is used to get strbeg. */
314
315/* XXXX We assume that strpos is strbeg unless sv. */
316
6eb5f6b9
JH
317/* XXXX Some places assume that there is a fixed substring.
318 An update may be needed if optimizer marks as "INTUITable"
319 RExen without fixed substrings. Similarly, it is assumed that
320 lengths of all the strings are no more than minlen, thus they
321 cannot come from lookahead.
322 (Or minlen should take into account lookahead.) */
323
2c2d71f5
JH
324/* A failure to find a constant substring means that there is no need to make
325 an expensive call to REx engine, thus we celebrate a failure. Similarly,
326 finding a substring too deep into the string means that less calls to
30944b6d
IZ
327 regtry() should be needed.
328
329 REx compiler's optimizer found 4 possible hints:
330 a) Anchored substring;
331 b) Fixed substring;
332 c) Whether we are anchored (beginning-of-line or \G);
333 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 334 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
335 string which does not contradict any of them.
336 */
2c2d71f5 337
6eb5f6b9
JH
338/* Most of decisions we do here should have been done at compile time.
339 The nodes of the REx which we used for the search should have been
340 deleted from the finite automaton. */
341
cad2e5aa
JH
342char *
343Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
344 char *strend, U32 flags, re_scream_pos_data *data)
345{
97aff369 346 dVAR;
b7953727 347 register I32 start_shift = 0;
cad2e5aa 348 /* Should be nonnegative! */
b7953727 349 register I32 end_shift = 0;
2c2d71f5
JH
350 register char *s;
351 register SV *check;
a1933d95 352 char *strbeg;
cad2e5aa 353 char *t;
a3b680e6 354 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 355 I32 ml_anch;
bd61b366
SS
356 register char *other_last = NULL; /* other substr checked before this */
357 char *check_at = NULL; /* check substr found at this pos */
1df70142 358 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 359#ifdef DEBUGGING
890ce7af
AL
360 const char * const i_strpos = strpos;
361 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 362#endif
a3621e74
YO
363
364 GET_RE_DEBUG_FLAGS_DECL;
365
a30b2f1f 366 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 367
b8d68ded 368 if (prog->reganch & ROPT_UTF8) {
a3621e74 369 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
370 "UTF-8 regex...\n"));
371 PL_reg_flags |= RF_utf8;
372 }
373
a3621e74 374 DEBUG_EXECUTE_r({
1df70142 375 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
376 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
377 strpos;
1df70142 378 const int len = PL_reg_match_utf8 ?
bb7a0f54 379 (int)strlen(s) : strend - strpos;
2a782b5b
JH
380 if (!PL_colorset)
381 reginitcolors();
b8d68ded 382 if (PL_reg_match_utf8)
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 384 "UTF-8 target...\n"));
2a782b5b 385 PerlIO_printf(Perl_debug_log,
a0288114 386 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 387 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
388 prog->precomp,
389 PL_colors[1],
390 (strlen(prog->precomp) > 60 ? "..." : ""),
391 PL_colors[0],
392 (int)(len > 60 ? 60 : len),
393 s, PL_colors[1],
394 (len > 60 ? "..." : "")
395 );
396 });
cad2e5aa 397
c344f387
JH
398 /* CHR_DIST() would be more correct here but it makes things slow. */
399 if (prog->minlen > strend - strpos) {
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 401 "String too short... [re_intuit_start]\n"));
cad2e5aa 402 goto fail;
2c2d71f5 403 }
a1933d95 404 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 405 PL_regeol = strend;
33b8afdf
JH
406 if (do_utf8) {
407 if (!prog->check_utf8 && prog->check_substr)
408 to_utf8_substr(prog);
409 check = prog->check_utf8;
410 } else {
411 if (!prog->check_substr && prog->check_utf8)
412 to_byte_substr(prog);
413 check = prog->check_substr;
414 }
415 if (check == &PL_sv_undef) {
a3621e74 416 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
417 "Non-utf string cannot match utf check string\n"));
418 goto fail;
419 }
2c2d71f5 420 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
421 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
422 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 423 && !multiline ) ); /* Check after \n? */
cad2e5aa 424
7e25d62c
JH
425 if (!ml_anch) {
426 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
427 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 428 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
429 && sv && !SvROK(sv)
430 && (strpos != strbeg)) {
a3621e74 431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
432 goto fail;
433 }
434 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 435 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 436 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
437 I32 slen;
438
1aa99e6b 439 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
440 if (SvTAIL(check)) {
441 slen = SvCUR(check); /* >= 1 */
cad2e5aa 442
9041c2e3 443 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 444 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 446 goto fail_finish;
cad2e5aa
JH
447 }
448 /* Now should match s[0..slen-2] */
449 slen--;
3f7c398e 450 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 451 || (slen > 1
3f7c398e 452 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 453 report_neq:
a3621e74 454 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
455 goto fail_finish;
456 }
cad2e5aa 457 }
3f7c398e 458 else if (*SvPVX_const(check) != *s
653099ff 459 || ((slen = SvCUR(check)) > 1
3f7c398e 460 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 461 goto report_neq;
c315bfe8 462 check_at = s;
2c2d71f5 463 goto success_at_start;
7e25d62c 464 }
cad2e5aa 465 }
2c2d71f5 466 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 467 s = strpos;
2c2d71f5 468 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 469 end_shift = prog->minlen - start_shift -
653099ff 470 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 471 if (!ml_anch) {
a3b680e6 472 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 473 - (SvTAIL(check) != 0);
a3b680e6 474 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
475
476 if (end_shift < eshift)
477 end_shift = eshift;
478 }
cad2e5aa 479 }
2c2d71f5 480 else { /* Can match at random position */
cad2e5aa
JH
481 ml_anch = 0;
482 s = strpos;
2c2d71f5
JH
483 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
484 /* Should be nonnegative! */
485 end_shift = prog->minlen - start_shift -
653099ff 486 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
487 }
488
2c2d71f5 489#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 490 if (end_shift < 0)
6bbae5e6 491 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
492#endif
493
2c2d71f5
JH
494 restart:
495 /* Find a possible match in the region s..strend by looking for
496 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 497 if (flags & REXEC_SCREAM) {
cad2e5aa 498 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 499 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 500
2c2d71f5
JH
501 if (PL_screamfirst[BmRARE(check)] >= 0
502 || ( BmRARE(check) == '\n'
503 && (BmPREVIOUS(check) == SvCUR(check) - 1)
504 && SvTAIL(check) ))
9041c2e3 505 s = screaminstr(sv, check,
2c2d71f5 506 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 507 else
2c2d71f5 508 goto fail_finish;
4addbd3b
HS
509 /* we may be pointing at the wrong string */
510 if (s && RX_MATCH_COPIED(prog))
3f7c398e 511 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
512 if (data)
513 *data->scream_olds = s;
514 }
f33976b4 515 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
516 s = fbm_instr((U8*)(s + start_shift),
517 (U8*)(strend - end_shift),
7fba1cd6 518 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 519 else
1aa99e6b
IH
520 s = fbm_instr(HOP3(s, start_shift, strend),
521 HOP3(strend, -end_shift, strbeg),
7fba1cd6 522 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
523
524 /* Update the count-of-usability, remove useless subpatterns,
525 unshift s. */
2c2d71f5 526
a0288114 527 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 528 (s ? "Found" : "Did not find"),
33b8afdf 529 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 530 PL_colors[0],
7b0972df 531 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 532 SvPVX_const(check),
2c2d71f5
JH
533 PL_colors[1], (SvTAIL(check) ? "$" : ""),
534 (s ? " at offset " : "...\n") ) );
535
536 if (!s)
537 goto fail_finish;
538
6eb5f6b9
JH
539 check_at = s;
540
2c2d71f5 541 /* Finish the diagnostic message */
a3621e74 542 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
543
544 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
545 Start with the other substr.
546 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 547 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
548 *always* match. Probably should be marked during compile...
549 Probably it is right to do no SCREAM here...
550 */
551
33b8afdf 552 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 553 /* Take into account the "other" substring. */
2c2d71f5
JH
554 /* XXXX May be hopelessly wrong for UTF... */
555 if (!other_last)
6eb5f6b9 556 other_last = strpos;
33b8afdf 557 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
558 do_other_anchored:
559 {
890ce7af
AL
560 char * const last = HOP3c(s, -start_shift, strbeg);
561 char *last1, *last2;
be8e71aa 562 char * const saved_s = s;
33b8afdf 563 SV* must;
2c2d71f5 564
2c2d71f5
JH
565 t = s - prog->check_offset_max;
566 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 567 && (!do_utf8
0ce71af7 568 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 569 && t > strpos)))
6f207bd3 570 NOOP;
2c2d71f5
JH
571 else
572 t = strpos;
1aa99e6b 573 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
574 if (t < other_last) /* These positions already checked */
575 t = other_last;
1aa99e6b 576 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
577 if (last < last1)
578 last1 = last;
579 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
580 /* On end-of-str: see comment below. */
33b8afdf
JH
581 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
582 if (must == &PL_sv_undef) {
583 s = (char*)NULL;
a3621e74 584 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
585 }
586 else
587 s = fbm_instr(
588 (unsigned char*)t,
589 HOP3(HOP3(last1, prog->anchored_offset, strend)
590 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
591 must,
7fba1cd6 592 multiline ? FBMrf_MULTILINE : 0
33b8afdf 593 );
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 595 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
596 (s ? "Found" : "Contradicts"),
597 PL_colors[0],
33b8afdf
JH
598 (int)(SvCUR(must)
599 - (SvTAIL(must)!=0)),
3f7c398e 600 SvPVX_const(must),
33b8afdf 601 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
602 if (!s) {
603 if (last1 >= last2) {
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
605 ", giving up...\n"));
606 goto fail_finish;
607 }
a3621e74 608 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 609 ", trying floating at offset %ld...\n",
be8e71aa 610 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
611 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
612 s = HOP3c(last, 1, strend);
2c2d71f5
JH
613 goto restart;
614 }
615 else {
a3621e74 616 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 617 (long)(s - i_strpos)));
1aa99e6b
IH
618 t = HOP3c(s, -prog->anchored_offset, strbeg);
619 other_last = HOP3c(s, 1, strend);
be8e71aa 620 s = saved_s;
2c2d71f5
JH
621 if (t == strpos)
622 goto try_at_start;
2c2d71f5
JH
623 goto try_at_offset;
624 }
30944b6d 625 }
2c2d71f5
JH
626 }
627 else { /* Take into account the floating substring. */
33b8afdf 628 char *last, *last1;
be8e71aa 629 char * const saved_s = s;
33b8afdf
JH
630 SV* must;
631
632 t = HOP3c(s, -start_shift, strbeg);
633 last1 = last =
634 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
635 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
636 last = HOP3c(t, prog->float_max_offset, strend);
637 s = HOP3c(t, prog->float_min_offset, strend);
638 if (s < other_last)
639 s = other_last;
2c2d71f5 640 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
641 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
642 /* fbm_instr() takes into account exact value of end-of-str
643 if the check is SvTAIL(ed). Since false positives are OK,
644 and end-of-str is not later than strend we are OK. */
645 if (must == &PL_sv_undef) {
646 s = (char*)NULL;
a3621e74 647 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
648 }
649 else
2c2d71f5 650 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
651 (unsigned char*)last + SvCUR(must)
652 - (SvTAIL(must)!=0),
7fba1cd6 653 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
655 (s ? "Found" : "Contradicts"),
656 PL_colors[0],
657 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf
JH
659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
660 if (!s) {
661 if (last1 == last) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
663 ", giving up...\n"));
664 goto fail_finish;
2c2d71f5 665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 667 ", trying anchored starting at offset %ld...\n",
be8e71aa 668 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
669 other_last = last;
670 s = HOP3c(t, 1, strend);
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
675 (long)(s - i_strpos)));
676 other_last = s; /* Fix this later. --Hugo */
be8e71aa 677 s = saved_s;
33b8afdf
JH
678 if (t == strpos)
679 goto try_at_start;
680 goto try_at_offset;
681 }
2c2d71f5 682 }
cad2e5aa 683 }
2c2d71f5
JH
684
685 t = s - prog->check_offset_max;
2c2d71f5 686 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 687 && (!do_utf8
0ce71af7 688 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 689 && t > strpos))) {
2c2d71f5
JH
690 /* Fixed substring is found far enough so that the match
691 cannot start at strpos. */
692 try_at_offset:
cad2e5aa 693 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
694 /* Eventually fbm_*() should handle this, but often
695 anchored_offset is not 0, so this check will not be wasted. */
696 /* XXXX In the code below we prefer to look for "^" even in
697 presence of anchored substrings. And we search even
698 beyond the found float position. These pessimizations
699 are historical artefacts only. */
700 find_anchor:
2c2d71f5 701 while (t < strend - prog->minlen) {
cad2e5aa 702 if (*t == '\n') {
4ee3650e 703 if (t < check_at - prog->check_offset_min) {
33b8afdf 704 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
705 /* Since we moved from the found position,
706 we definitely contradict the found anchored
30944b6d
IZ
707 substr. Due to the above check we do not
708 contradict "check" substr.
709 Thus we can arrive here only if check substr
710 is float. Redo checking for "other"=="fixed".
711 */
9041c2e3 712 strpos = t + 1;
a3621e74 713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 714 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
715 goto do_other_anchored;
716 }
4ee3650e
GS
717 /* We don't contradict the found floating substring. */
718 /* XXXX Why not check for STCLASS? */
cad2e5aa 719 s = t + 1;
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 721 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
722 goto set_useful;
723 }
4ee3650e
GS
724 /* Position contradicts check-string */
725 /* XXXX probably better to look for check-string
726 than for "\n", so one should lower the limit for t? */
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 728 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 729 other_last = strpos = s = t + 1;
cad2e5aa
JH
730 goto restart;
731 }
732 t++;
733 }
a3621e74 734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 735 PL_colors[0], PL_colors[1]));
2c2d71f5 736 goto fail_finish;
cad2e5aa 737 }
f5952150 738 else {
a3621e74 739 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 740 PL_colors[0], PL_colors[1]));
f5952150 741 }
cad2e5aa
JH
742 s = t;
743 set_useful:
33b8afdf 744 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
745 }
746 else {
f5952150 747 /* The found string does not prohibit matching at strpos,
2c2d71f5 748 - no optimization of calling REx engine can be performed,
f5952150
GS
749 unless it was an MBOL and we are not after MBOL,
750 or a future STCLASS check will fail this. */
2c2d71f5
JH
751 try_at_start:
752 /* Even in this situation we may use MBOL flag if strpos is offset
753 wrt the start of the string. */
05b4157f 754 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 755 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
756 /* May be due to an implicit anchor of m{.*foo} */
757 && !(prog->reganch & ROPT_IMPLICIT))
758 {
cad2e5aa
JH
759 t = strpos;
760 goto find_anchor;
761 }
a3621e74 762 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 763 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 764 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 765 );
2c2d71f5 766 success_at_start:
30944b6d 767 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
768 && (do_utf8 ? (
769 prog->check_utf8 /* Could be deleted already */
770 && --BmUSEFUL(prog->check_utf8) < 0
771 && (prog->check_utf8 == prog->float_utf8)
772 ) : (
773 prog->check_substr /* Could be deleted already */
774 && --BmUSEFUL(prog->check_substr) < 0
775 && (prog->check_substr == prog->float_substr)
776 )))
66e933ab 777 {
cad2e5aa 778 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
780 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
781 if (do_utf8 ? prog->check_substr : prog->check_utf8)
782 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
783 prog->check_substr = prog->check_utf8 = NULL; /* disable */
784 prog->float_substr = prog->float_utf8 = NULL; /* clear */
785 check = NULL; /* abort */
cad2e5aa 786 s = strpos;
3cf5c195
IZ
787 /* XXXX This is a remnant of the old implementation. It
788 looks wasteful, since now INTUIT can use many
6eb5f6b9 789 other heuristics. */
cad2e5aa
JH
790 prog->reganch &= ~RE_USE_INTUIT;
791 }
792 else
793 s = strpos;
794 }
795
6eb5f6b9
JH
796 /* Last resort... */
797 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 798 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
799 /* minlen == 0 is possible if regstclass is \b or \B,
800 and the fixed substr is ''$.
801 Since minlen is already taken into account, s+1 is before strend;
802 accidentally, minlen >= 1 guaranties no false positives at s + 1
803 even for \b or \B. But (minlen? 1 : 0) below assumes that
804 regstclass does not come from lookahead... */
805 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
806 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 807 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 808 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 809 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 810 : 1);
07be1b83 811 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 812 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 813 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
814 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
815 cl_l, strend)
816 : strend);
07be1b83
YO
817 /*if (OP(prog->regstclass) == TRIE)
818 endpos++;*/
6eb5f6b9 819 t = s;
3b0527fe 820 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
821 if (!s) {
822#ifdef DEBUGGING
cbbf8932 823 const char *what = NULL;
6eb5f6b9
JH
824#endif
825 if (endpos == strend) {
a3621e74 826 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
827 "Could not match STCLASS...\n") );
828 goto fail;
829 }
a3621e74 830 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 831 "This position contradicts STCLASS...\n") );
653099ff
GS
832 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
833 goto fail;
6eb5f6b9 834 /* Contradict one of substrings */
33b8afdf
JH
835 if (prog->anchored_substr || prog->anchored_utf8) {
836 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 837 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 838 hop_and_restart:
1aa99e6b 839 s = HOP3c(t, 1, strend);
66e933ab
GS
840 if (s + start_shift + end_shift > strend) {
841 /* XXXX Should be taken into account earlier? */
a3621e74 842 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
843 "Could not match STCLASS...\n") );
844 goto fail;
845 }
5e39e1e5
HS
846 if (!check)
847 goto giveup;
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 849 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
850 what, (long)(s + start_shift - i_strpos)) );
851 goto restart;
852 }
66e933ab 853 /* Have both, check_string is floating */
6eb5f6b9
JH
854 if (t + start_shift >= check_at) /* Contradicts floating=check */
855 goto retry_floating_check;
856 /* Recheck anchored substring, but not floating... */
9041c2e3 857 s = check_at;
5e39e1e5
HS
858 if (!check)
859 goto giveup;
a3621e74 860 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 861 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
862 (long)(other_last - i_strpos)) );
863 goto do_other_anchored;
864 }
60e71179
GS
865 /* Another way we could have checked stclass at the
866 current position only: */
867 if (ml_anch) {
868 s = t = t + 1;
5e39e1e5
HS
869 if (!check)
870 goto giveup;
a3621e74 871 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 872 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 873 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 874 goto try_at_offset;
66e933ab 875 }
33b8afdf 876 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 877 goto fail;
6eb5f6b9
JH
878 /* Check is floating subtring. */
879 retry_floating_check:
880 t = check_at - start_shift;
a3621e74 881 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
882 goto hop_and_restart;
883 }
b7953727 884 if (t != s) {
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 886 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
887 (long)(t - i_strpos), (long)(s - i_strpos))
888 );
889 }
890 else {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
892 "Does not contradict STCLASS...\n");
893 );
894 }
6eb5f6b9 895 }
5e39e1e5 896 giveup:
a3621e74 897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
898 PL_colors[4], (check ? "Guessed" : "Giving up"),
899 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 900 return s;
2c2d71f5
JH
901
902 fail_finish: /* Substring not found */
33b8afdf
JH
903 if (prog->check_substr || prog->check_utf8) /* could be removed already */
904 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 905 fail:
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 907 PL_colors[4], PL_colors[5]));
bd61b366 908 return NULL;
cad2e5aa 909}
9661b544 910
6eb5f6b9 911/* We know what class REx starts with. Try to find this position... */
3b0527fe 912/* if reginfo is NULL, its a dryrun */
07be1b83
YO
913/* annoyingly all the vars in this routine have different names from their counterparts
914 in regmatch. /grrr */
3b0527fe 915
3c3eec57 916STATIC char *
07be1b83
YO
917S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
918 const char *strend, const regmatch_info *reginfo)
a687059c 919{
27da23d5 920 dVAR;
1df70142 921 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 922 char *m;
d8093b23 923 STRLEN ln;
5dab1207 924 STRLEN lnc;
078c425b 925 register STRLEN uskip;
d8093b23
G
926 unsigned int c1;
927 unsigned int c2;
6eb5f6b9
JH
928 char *e;
929 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 930 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 931
6eb5f6b9
JH
932 /* We know what class it must start with. */
933 switch (OP(c)) {
6eb5f6b9 934 case ANYOF:
388cc4de 935 if (do_utf8) {
078c425b 936 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
937 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
938 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
939 reginclass(prog, c, (U8*)s, 0, do_utf8) :
940 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 941 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
942 goto got_it;
943 else
944 tmp = doevery;
945 }
946 else
947 tmp = 1;
078c425b 948 s += uskip;
388cc4de
HS
949 }
950 }
951 else {
952 while (s < strend) {
953 STRLEN skip = 1;
954
32fc9b6a 955 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
956 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
957 /* The assignment of 2 is intentional:
958 * for the folded sharp s, the skip is 2. */
959 (skip = SHARP_S_SKIP))) {
3b0527fe 960 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
961 goto got_it;
962 else
963 tmp = doevery;
964 }
965 else
966 tmp = 1;
967 s += skip;
968 }
a0d0e21e 969 }
6eb5f6b9 970 break;
f33976b4
DB
971 case CANY:
972 while (s < strend) {
3b0527fe 973 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
974 goto got_it;
975 else
976 tmp = doevery;
977 s++;
978 }
979 break;
6eb5f6b9 980 case EXACTF:
5dab1207
NIS
981 m = STRING(c);
982 ln = STR_LEN(c); /* length to match in octets/bytes */
983 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 984 if (UTF) {
a2a2844f 985 STRLEN ulen1, ulen2;
5dab1207 986 U8 *sm = (U8 *) m;
89ebb4a3
JH
987 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
988 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 989 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
990
991 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
992 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
993
89ebb4a3 994 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 995 0, uniflags);
89ebb4a3 996 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 997 0, uniflags);
5dab1207
NIS
998 lnc = 0;
999 while (sm < ((U8 *) m + ln)) {
1000 lnc++;
1001 sm += UTF8SKIP(sm);
1002 }
1aa99e6b
IH
1003 }
1004 else {
1005 c1 = *(U8*)m;
1006 c2 = PL_fold[c1];
1007 }
6eb5f6b9
JH
1008 goto do_exactf;
1009 case EXACTFL:
5dab1207
NIS
1010 m = STRING(c);
1011 ln = STR_LEN(c);
1012 lnc = (I32) ln;
d8093b23 1013 c1 = *(U8*)m;
6eb5f6b9
JH
1014 c2 = PL_fold_locale[c1];
1015 do_exactf:
db12adc6 1016 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1017
3b0527fe 1018 if (!reginfo && e < s)
6eb5f6b9 1019 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1020
60a8b682
JH
1021 /* The idea in the EXACTF* cases is to first find the
1022 * first character of the EXACTF* node and then, if
1023 * necessary, case-insensitively compare the full
1024 * text of the node. The c1 and c2 are the first
1025 * characters (though in Unicode it gets a bit
1026 * more complicated because there are more cases
7f16dd3d
JH
1027 * than just upper and lower: one needs to use
1028 * the so-called folding case for case-insensitive
1029 * matching (called "loose matching" in Unicode).
1030 * ibcmp_utf8() will do just that. */
60a8b682 1031
1aa99e6b 1032 if (do_utf8) {
575cac57 1033 UV c, f;
89ebb4a3 1034 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1035 STRLEN len, foldlen;
4ad0818d 1036 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1037 if (c1 == c2) {
5dab1207
NIS
1038 /* Upper and lower of 1st char are equal -
1039 * probably not a "letter". */
1aa99e6b 1040 while (s <= e) {
89ebb4a3 1041 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1042 uniflags);
80aecb99
JH
1043 if ( c == c1
1044 && (ln == len ||
3dab1dad
YO
1045 ibcmp_utf8(s, NULL, 0, do_utf8,
1046 m, NULL, ln, (bool)UTF))
3b0527fe 1047 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1048 goto got_it;
80aecb99 1049 else {
1df70142 1050 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1051 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1052 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1053 if ( f != c
1054 && (f == c1 || f == c2)
1055 && (ln == foldlen ||
66423254 1056 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1057 NULL, foldlen, do_utf8,
d07ddd77 1058 m,
3dab1dad 1059 NULL, ln, (bool)UTF))
3b0527fe 1060 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1061 goto got_it;
1062 }
1aa99e6b
IH
1063 s += len;
1064 }
09091399
JH
1065 }
1066 else {
1aa99e6b 1067 while (s <= e) {
89ebb4a3 1068 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1069 uniflags);
80aecb99 1070
60a8b682 1071 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1072 * Note that not all the possible combinations
1073 * are handled here: some of them are handled
1074 * by the standard folding rules, and some of
1075 * them (the character class or ANYOF cases)
1076 * are handled during compiletime in
1077 * regexec.c:S_regclass(). */
880bd946
JH
1078 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1079 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1080 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1081
1082 if ( (c == c1 || c == c2)
1083 && (ln == len ||
3dab1dad
YO
1084 ibcmp_utf8(s, NULL, 0, do_utf8,
1085 m, NULL, ln, (bool)UTF))
3b0527fe 1086 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1087 goto got_it;
80aecb99 1088 else {
1df70142 1089 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1090 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1091 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1092 if ( f != c
1093 && (f == c1 || f == c2)
1094 && (ln == foldlen ||
a6872d42 1095 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1096 NULL, foldlen, do_utf8,
d07ddd77 1097 m,
3dab1dad 1098 NULL, ln, (bool)UTF))
3b0527fe 1099 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1100 goto got_it;
1101 }
1aa99e6b
IH
1102 s += len;
1103 }
09091399 1104 }
1aa99e6b
IH
1105 }
1106 else {
1107 if (c1 == c2)
1108 while (s <= e) {
1109 if ( *(U8*)s == c1
1110 && (ln == 1 || !(OP(c) == EXACTF
1111 ? ibcmp(s, m, ln)
1112 : ibcmp_locale(s, m, ln)))
3b0527fe 1113 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1114 goto got_it;
1115 s++;
1116 }
1117 else
1118 while (s <= e) {
1119 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
b3c9acc1
IZ
1127 }
1128 break;
bbce6d69 1129 case BOUNDL:
3280af22 1130 PL_reg_flags |= RF_tainted;
bbce6d69 1131 /* FALL THROUGH */
a0d0e21e 1132 case BOUND:
ffc61ed2 1133 if (do_utf8) {
12d33761 1134 if (s == PL_bostr)
ffc61ed2
JH
1135 tmp = '\n';
1136 else {
6136c704 1137 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1138 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1139 }
1140 tmp = ((OP(c) == BOUND ?
9041c2e3 1141 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1142 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1143 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1144 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1145 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1146 isALNUM_LC_utf8((U8*)s)))
1147 {
1148 tmp = !tmp;
3b0527fe 1149 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1150 goto got_it;
1151 }
078c425b 1152 s += uskip;
a687059c 1153 }
a0d0e21e 1154 }
667bb95a 1155 else {
12d33761 1156 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1157 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1158 while (s < strend) {
1159 if (tmp ==
1160 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1161 tmp = !tmp;
3b0527fe 1162 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1163 goto got_it;
1164 }
1165 s++;
a0ed51b3 1166 }
a0ed51b3 1167 }
3b0527fe 1168 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1169 goto got_it;
1170 break;
bbce6d69 1171 case NBOUNDL:
3280af22 1172 PL_reg_flags |= RF_tainted;
bbce6d69 1173 /* FALL THROUGH */
a0d0e21e 1174 case NBOUND:
ffc61ed2 1175 if (do_utf8) {
12d33761 1176 if (s == PL_bostr)
ffc61ed2
JH
1177 tmp = '\n';
1178 else {
6136c704 1179 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1180 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1181 }
1182 tmp = ((OP(c) == NBOUND ?
9041c2e3 1183 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1184 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1185 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1186 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1187 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1188 isALNUM_LC_utf8((U8*)s)))
1189 tmp = !tmp;
3b0527fe 1190 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1191 goto got_it;
078c425b 1192 s += uskip;
ffc61ed2 1193 }
a0d0e21e 1194 }
667bb95a 1195 else {
12d33761 1196 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1197 tmp = ((OP(c) == NBOUND ?
1198 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1199 while (s < strend) {
1200 if (tmp ==
1201 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1202 tmp = !tmp;
3b0527fe 1203 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1204 goto got_it;
1205 s++;
1206 }
a0ed51b3 1207 }
3b0527fe 1208 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1209 goto got_it;
1210 break;
a0d0e21e 1211 case ALNUM:
ffc61ed2 1212 if (do_utf8) {
1a4fad37 1213 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1214 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1215 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1216 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1217 goto got_it;
1218 else
1219 tmp = doevery;
1220 }
bbce6d69 1221 else
ffc61ed2 1222 tmp = 1;
078c425b 1223 s += uskip;
bbce6d69 1224 }
bbce6d69 1225 }
ffc61ed2
JH
1226 else {
1227 while (s < strend) {
1228 if (isALNUM(*s)) {
3b0527fe 1229 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1230 goto got_it;
1231 else
1232 tmp = doevery;
1233 }
a0ed51b3 1234 else
ffc61ed2
JH
1235 tmp = 1;
1236 s++;
a0ed51b3 1237 }
a0ed51b3
LW
1238 }
1239 break;
bbce6d69 1240 case ALNUML:
3280af22 1241 PL_reg_flags |= RF_tainted;
ffc61ed2 1242 if (do_utf8) {
078c425b 1243 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1244 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1245 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1246 goto got_it;
1247 else
1248 tmp = doevery;
1249 }
a687059c 1250 else
ffc61ed2 1251 tmp = 1;
078c425b 1252 s += uskip;
a0d0e21e 1253 }
a0d0e21e 1254 }
ffc61ed2
JH
1255 else {
1256 while (s < strend) {
1257 if (isALNUM_LC(*s)) {
3b0527fe 1258 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1259 goto got_it;
1260 else
1261 tmp = doevery;
1262 }
a0ed51b3 1263 else
ffc61ed2
JH
1264 tmp = 1;
1265 s++;
a0ed51b3 1266 }
a0ed51b3
LW
1267 }
1268 break;
a0d0e21e 1269 case NALNUM:
ffc61ed2 1270 if (do_utf8) {
1a4fad37 1271 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1272 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1273 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1274 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1275 goto got_it;
1276 else
1277 tmp = doevery;
1278 }
bbce6d69 1279 else
ffc61ed2 1280 tmp = 1;
078c425b 1281 s += uskip;
bbce6d69 1282 }
bbce6d69 1283 }
ffc61ed2
JH
1284 else {
1285 while (s < strend) {
1286 if (!isALNUM(*s)) {
3b0527fe 1287 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1288 goto got_it;
1289 else
1290 tmp = doevery;
1291 }
a0ed51b3 1292 else
ffc61ed2
JH
1293 tmp = 1;
1294 s++;
a0ed51b3 1295 }
a0ed51b3
LW
1296 }
1297 break;
bbce6d69 1298 case NALNUML:
3280af22 1299 PL_reg_flags |= RF_tainted;
ffc61ed2 1300 if (do_utf8) {
078c425b 1301 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1302 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1303 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1304 goto got_it;
1305 else
1306 tmp = doevery;
1307 }
a687059c 1308 else
ffc61ed2 1309 tmp = 1;
078c425b 1310 s += uskip;
a687059c 1311 }
a0d0e21e 1312 }
ffc61ed2
JH
1313 else {
1314 while (s < strend) {
1315 if (!isALNUM_LC(*s)) {
3b0527fe 1316 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1317 goto got_it;
1318 else
1319 tmp = doevery;
1320 }
a0ed51b3 1321 else
ffc61ed2
JH
1322 tmp = 1;
1323 s++;
a0ed51b3 1324 }
a0ed51b3
LW
1325 }
1326 break;
a0d0e21e 1327 case SPACE:
ffc61ed2 1328 if (do_utf8) {
1a4fad37 1329 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1330 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1331 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1332 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1333 goto got_it;
1334 else
1335 tmp = doevery;
1336 }
a0d0e21e 1337 else
ffc61ed2 1338 tmp = 1;
078c425b 1339 s += uskip;
2304df62 1340 }
a0d0e21e 1341 }
ffc61ed2
JH
1342 else {
1343 while (s < strend) {
1344 if (isSPACE(*s)) {
3b0527fe 1345 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1346 goto got_it;
1347 else
1348 tmp = doevery;
1349 }
a0ed51b3 1350 else
ffc61ed2
JH
1351 tmp = 1;
1352 s++;
a0ed51b3 1353 }
a0ed51b3
LW
1354 }
1355 break;
bbce6d69 1356 case SPACEL:
3280af22 1357 PL_reg_flags |= RF_tainted;
ffc61ed2 1358 if (do_utf8) {
078c425b 1359 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1360 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1361 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1362 goto got_it;
1363 else
1364 tmp = doevery;
1365 }
bbce6d69 1366 else
ffc61ed2 1367 tmp = 1;
078c425b 1368 s += uskip;
bbce6d69 1369 }
bbce6d69 1370 }
ffc61ed2
JH
1371 else {
1372 while (s < strend) {
1373 if (isSPACE_LC(*s)) {
3b0527fe 1374 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1375 goto got_it;
1376 else
1377 tmp = doevery;
1378 }
a0ed51b3 1379 else
ffc61ed2
JH
1380 tmp = 1;
1381 s++;
a0ed51b3 1382 }
a0ed51b3
LW
1383 }
1384 break;
a0d0e21e 1385 case NSPACE:
ffc61ed2 1386 if (do_utf8) {
1a4fad37 1387 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1388 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1389 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1390 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1391 goto got_it;
1392 else
1393 tmp = doevery;
1394 }
a0d0e21e 1395 else
ffc61ed2 1396 tmp = 1;
078c425b 1397 s += uskip;
a687059c 1398 }
a0d0e21e 1399 }
ffc61ed2
JH
1400 else {
1401 while (s < strend) {
1402 if (!isSPACE(*s)) {
3b0527fe 1403 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1404 goto got_it;
1405 else
1406 tmp = doevery;
1407 }
a0ed51b3 1408 else
ffc61ed2
JH
1409 tmp = 1;
1410 s++;
a0ed51b3 1411 }
a0ed51b3
LW
1412 }
1413 break;
bbce6d69 1414 case NSPACEL:
3280af22 1415 PL_reg_flags |= RF_tainted;
ffc61ed2 1416 if (do_utf8) {
078c425b 1417 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1418 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1419 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1420 goto got_it;
1421 else
1422 tmp = doevery;
1423 }
bbce6d69 1424 else
ffc61ed2 1425 tmp = 1;
078c425b 1426 s += uskip;
bbce6d69 1427 }
bbce6d69 1428 }
ffc61ed2
JH
1429 else {
1430 while (s < strend) {
1431 if (!isSPACE_LC(*s)) {
3b0527fe 1432 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1433 goto got_it;
1434 else
1435 tmp = doevery;
1436 }
a0ed51b3 1437 else
ffc61ed2
JH
1438 tmp = 1;
1439 s++;
a0ed51b3 1440 }
a0ed51b3
LW
1441 }
1442 break;
a0d0e21e 1443 case DIGIT:
ffc61ed2 1444 if (do_utf8) {
1a4fad37 1445 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1446 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1447 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1448 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1449 goto got_it;
1450 else
1451 tmp = doevery;
1452 }
a0d0e21e 1453 else
ffc61ed2 1454 tmp = 1;
078c425b 1455 s += uskip;
2b69d0c2 1456 }
a0d0e21e 1457 }
ffc61ed2
JH
1458 else {
1459 while (s < strend) {
1460 if (isDIGIT(*s)) {
3b0527fe 1461 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1462 goto got_it;
1463 else
1464 tmp = doevery;
1465 }
a0ed51b3 1466 else
ffc61ed2
JH
1467 tmp = 1;
1468 s++;
a0ed51b3 1469 }
a0ed51b3
LW
1470 }
1471 break;
b8c5462f
JH
1472 case DIGITL:
1473 PL_reg_flags |= RF_tainted;
ffc61ed2 1474 if (do_utf8) {
078c425b 1475 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1476 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1477 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1478 goto got_it;
1479 else
1480 tmp = doevery;
1481 }
b8c5462f 1482 else
ffc61ed2 1483 tmp = 1;
078c425b 1484 s += uskip;
b8c5462f 1485 }
b8c5462f 1486 }
ffc61ed2
JH
1487 else {
1488 while (s < strend) {
1489 if (isDIGIT_LC(*s)) {
3b0527fe 1490 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1491 goto got_it;
1492 else
1493 tmp = doevery;
1494 }
b8c5462f 1495 else
ffc61ed2
JH
1496 tmp = 1;
1497 s++;
b8c5462f 1498 }
b8c5462f
JH
1499 }
1500 break;
a0d0e21e 1501 case NDIGIT:
ffc61ed2 1502 if (do_utf8) {
1a4fad37 1503 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1504 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1505 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1506 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1507 goto got_it;
1508 else
1509 tmp = doevery;
1510 }
a0d0e21e 1511 else
ffc61ed2 1512 tmp = 1;
078c425b 1513 s += uskip;
a687059c 1514 }
a0d0e21e 1515 }
ffc61ed2
JH
1516 else {
1517 while (s < strend) {
1518 if (!isDIGIT(*s)) {
3b0527fe 1519 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1520 goto got_it;
1521 else
1522 tmp = doevery;
1523 }
a0ed51b3 1524 else
ffc61ed2
JH
1525 tmp = 1;
1526 s++;
a0ed51b3 1527 }
a0ed51b3
LW
1528 }
1529 break;
b8c5462f
JH
1530 case NDIGITL:
1531 PL_reg_flags |= RF_tainted;
ffc61ed2 1532 if (do_utf8) {
078c425b 1533 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1534 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1535 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1536 goto got_it;
1537 else
1538 tmp = doevery;
1539 }
b8c5462f 1540 else
ffc61ed2 1541 tmp = 1;
078c425b 1542 s += uskip;
b8c5462f 1543 }
a0ed51b3 1544 }
ffc61ed2
JH
1545 else {
1546 while (s < strend) {
1547 if (!isDIGIT_LC(*s)) {
3b0527fe 1548 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1549 goto got_it;
1550 else
1551 tmp = doevery;
1552 }
cf93c79d 1553 else
ffc61ed2
JH
1554 tmp = 1;
1555 s++;
b8c5462f 1556 }
b8c5462f
JH
1557 }
1558 break;
07be1b83
YO
1559 case TRIE:
1560 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1561 {
1562 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1563 trie_type = do_utf8 ?
1564 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1565 : trie_plain;
1566 /* what trie are we using right now */
1567 reg_ac_data *aho
1568 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1569 reg_trie_data *trie=aho->trie;
1570
1571 const char *last_start = strend - trie->minlen;
1572 const char *real_start = s;
1573 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1574 SV *sv_points;
1575 U8 **points; /* map of where we were in the input string
1576 when reading a given string. For ASCII this
1577 is unnecessary overhead as the relationship
1578 is always 1:1, but for unicode, especially
1579 case folded unicode this is not true. */
f9e705e8 1580 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1581
1582 GET_RE_DEBUG_FLAGS_DECL;
1583
be8e71aa
YO
1584 /* We can't just allocate points here. We need to wrap it in
1585 * an SV so it gets freed properly if there is a croak while
1586 * running the match */
1587 ENTER;
1588 SAVETMPS;
1589 sv_points=newSV(maxlen * sizeof(U8 *));
1590 SvCUR_set(sv_points,
1591 maxlen * sizeof(U8 *));
1592 SvPOK_on(sv_points);
1593 sv_2mortal(sv_points);
1594 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1595
1596 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1597 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1598 s++;
1599 }
1600 }
1601
1602 while (s <= last_start) {
1603 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1604 U8 *uc = (U8*)s;
1605 U16 charid = 0;
1606 U32 base = 1;
1607 U32 state = 1;
1608 UV uvc = 0;
1609 STRLEN len = 0;
1610 STRLEN foldlen = 0;
1611 U8 *uscan = (U8*)NULL;
1612 U8 *leftmost = NULL;
1613
1614 U32 pointpos = 0;
1615
1616 while ( state && uc <= (U8*)strend ) {
1617 int failed=0;
1618 if (aho->states[ state ].wordnum) {
1619 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1620 if (!leftmost || lpos < leftmost)
1621 leftmost= lpos;
1622 if (base==0) break;
1623 }
1624 points[pointpos++ % maxlen]= uc;
1625 switch (trie_type) {
1626 case trie_utf8_fold:
1627 if ( foldlen>0 ) {
1628 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1629 foldlen -= len;
1630 uscan += len;
1631 len=0;
1632 } else {
07be1b83
YO
1633 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1634 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1635 foldlen -= UNISKIP( uvc );
1636 uscan = foldbuf + UNISKIP( uvc );
1637 }
1638 break;
1639 case trie_utf8:
1640 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1641 &len, uniflags );
1642 break;
1643 case trie_plain:
1644 uvc = (UV)*uc;
1645 len = 1;
1646 }
1647
1648 if (uvc < 256) {
1649 charid = trie->charmap[ uvc ];
1650 }
1651 else {
1652 charid = 0;
1653 if (trie->widecharmap) {
1654 SV** const svpp = hv_fetch(trie->widecharmap,
1655 (char*)&uvc, sizeof(UV), 0);
1656 if (svpp)
1657 charid = (U16)SvIV(*svpp);
1658 }
1659 }
1660 DEBUG_TRIE_EXECUTE_r(
1661 PerlIO_printf(Perl_debug_log,
1662 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1663 (int)((const char*)uc - real_start), charid, uvc)
1664 );
1665 uc += len;
1666
1667 do {
1668 U32 word = aho->states[ state ].wordnum;
1669 base = aho->states[ state ].trans.base;
1670
1671 DEBUG_TRIE_EXECUTE_r(
1672 PerlIO_printf( Perl_debug_log,
1673 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1674 failed ? "Fail transition to " : "",
1675 state, base, uvc, word)
1676 );
1677 if ( base ) {
1678 U32 tmp;
1679 if (charid &&
1680 (base + charid > trie->uniquecharcount )
1681 && (base + charid - 1 - trie->uniquecharcount
1682 < trie->lasttrans)
1683 && trie->trans[base + charid - 1 -
1684 trie->uniquecharcount].check == state
1685 && (tmp=trie->trans[base + charid - 1 -
1686 trie->uniquecharcount ].next))
1687 {
1688 state = tmp;
1689 break;
1690 }
1691 else {
1692 failed++;
1693 if ( state == 1 )
1694 break;
1695 else
1696 state = aho->fail[state];
1697 }
1698 }
1699 else {
1700 /* we must be accepting here */
1701 failed++;
1702 break;
1703 }
1704 } while(state);
1705 if (failed) {
1706 if (leftmost)
1707 break;
1708 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1709 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1710 uc++;
1711 }
1712 }
1713 }
1714 }
1715 if ( aho->states[ state ].wordnum ) {
1716 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1717 if (!leftmost || lpos < leftmost)
1718 leftmost = lpos;
1719 }
1720 DEBUG_TRIE_EXECUTE_r(
1721 PerlIO_printf( Perl_debug_log,
1722 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1723 "All done: ",
1724 state, base, uvc)
1725 );
1726 if (leftmost) {
1727 s = (char*)leftmost;
be8e71aa
YO
1728 if (!reginfo || regtry(reginfo, s)) {
1729 FREETMPS;
1730 LEAVE;
07be1b83 1731 goto got_it;
be8e71aa 1732 }
07be1b83
YO
1733 s = HOPc(s,1);
1734 } else {
1735 break;
1736 }
1737 }
be8e71aa
YO
1738 FREETMPS;
1739 LEAVE;
07be1b83
YO
1740 }
1741 break;
b3c9acc1 1742 default:
3c3eec57
GS
1743 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1744 break;
d6a28714 1745 }
6eb5f6b9
JH
1746 return 0;
1747 got_it:
1748 return s;
1749}
1750
1751/*
1752 - regexec_flags - match a regexp against a string
1753 */
1754I32
1755Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1756 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1757/* strend: pointer to null at end of string */
1758/* strbeg: real beginning of string */
1759/* minend: end of match must be >=minend after stringarg. */
1760/* data: May be used for some additional optimizations. */
1761/* nosave: For optimizations. */
1762{
97aff369 1763 dVAR;
6eb5f6b9
JH
1764 register char *s;
1765 register regnode *c;
1766 register char *startpos = stringarg;
6eb5f6b9
JH
1767 I32 minlen; /* must match at least this many chars */
1768 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1769 I32 end_shift = 0; /* Same for the end. */ /* CC */
1770 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1771 char *scream_olds = NULL;
3dab1dad 1772 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1773 const bool do_utf8 = DO_UTF8(sv);
2757e526 1774 I32 multiline;
2a782b5b 1775#ifdef DEBUGGING
2757e526
JH
1776 SV* dsv0;
1777 SV* dsv1;
2a782b5b 1778#endif
3b0527fe 1779 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1780
1781 GET_RE_DEBUG_FLAGS_DECL;
1782
9d4ba2ae 1783 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1784
1785 /* Be paranoid... */
1786 if (prog == NULL || startpos == NULL) {
1787 Perl_croak(aTHX_ "NULL regexp parameter");
1788 return 0;
1789 }
1790
2757e526 1791 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1792 reginfo.prog = prog;
2757e526
JH
1793
1794#ifdef DEBUGGING
1795 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1796 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1797#endif
1798
bac06658
JH
1799 RX_MATCH_UTF8_set(prog, do_utf8);
1800
6eb5f6b9 1801 minlen = prog->minlen;
61a36c01 1802 if (strend - startpos < minlen) {
a3621e74 1803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1804 "String too short [regexec_flags]...\n"));
1805 goto phooey;
1aa99e6b 1806 }
6eb5f6b9 1807
6eb5f6b9
JH
1808 /* Check validity of program. */
1809 if (UCHARAT(prog->program) != REG_MAGIC) {
1810 Perl_croak(aTHX_ "corrupted regexp program");
1811 }
1812
1813 PL_reg_flags = 0;
1814 PL_reg_eval_set = 0;
1815 PL_reg_maxiter = 0;
1816
1817 if (prog->reganch & ROPT_UTF8)
1818 PL_reg_flags |= RF_utf8;
1819
1820 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1821 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1822 PL_bostr = strbeg;
3b0527fe 1823 reginfo.sv = sv;
6eb5f6b9
JH
1824
1825 /* Mark end of line for $ (and such) */
1826 PL_regeol = strend;
1827
1828 /* see how far we have to get to not match where we matched before */
3b0527fe 1829 reginfo.till = startpos+minend;
6eb5f6b9 1830
6eb5f6b9
JH
1831 /* If there is a "must appear" string, look for it. */
1832 s = startpos;
1833
3b0527fe 1834 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1835 MAGIC *mg;
1836
1837 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1838 reginfo.ganch = startpos;
6eb5f6b9
JH
1839 else if (sv && SvTYPE(sv) >= SVt_PVMG
1840 && SvMAGIC(sv)
14befaf4
DM
1841 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1842 && mg->mg_len >= 0) {
3b0527fe 1843 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1844 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1845 if (s > reginfo.ganch)
6eb5f6b9 1846 goto phooey;
3b0527fe 1847 s = reginfo.ganch;
6eb5f6b9
JH
1848 }
1849 }
1850 else /* pos() not defined */
3b0527fe 1851 reginfo.ganch = strbeg;
6eb5f6b9
JH
1852 }
1853
a0714e2c 1854 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1855 re_scream_pos_data d;
1856
1857 d.scream_olds = &scream_olds;
1858 d.scream_pos = &scream_pos;
1859 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1860 if (!s) {
a3621e74 1861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1862 goto phooey; /* not present */
3fa9c3d7 1863 }
6eb5f6b9
JH
1864 }
1865
a3621e74 1866 DEBUG_EXECUTE_r({
1df70142
AL
1867 const char * const s0 = UTF
1868 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1869 UNI_DISPLAY_REGEX)
1870 : prog->precomp;
bb7a0f54 1871 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1872 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1873 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1874 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1875 if (!PL_colorset)
1876 reginitcolors();
1877 PerlIO_printf(Perl_debug_log,
a0288114 1878 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1879 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1880 len0, len0, s0,
2a782b5b 1881 PL_colors[1],
9e55ce06 1882 len0 > 60 ? "..." : "",
2a782b5b 1883 PL_colors[0],
9e55ce06
JH
1884 (int)(len1 > 60 ? 60 : len1),
1885 s1, PL_colors[1],
1886 (len1 > 60 ? "..." : "")
2a782b5b
JH
1887 );
1888 });
6eb5f6b9
JH
1889
1890 /* Simplest case: anchored match need be tried only once. */
1891 /* [unless only anchor is BOL and multiline is set] */
1892 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1893 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1894 goto got_it;
7fba1cd6 1895 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1896 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1897 {
1898 char *end;
1899
1900 if (minlen)
1901 dontbother = minlen - 1;
1aa99e6b 1902 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1903 /* for multiline we only have to try after newlines */
33b8afdf 1904 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1905 if (s == startpos)
1906 goto after_try;
1907 while (1) {
3b0527fe 1908 if (regtry(&reginfo, s))
6eb5f6b9
JH
1909 goto got_it;
1910 after_try:
1911 if (s >= end)
1912 goto phooey;
1913 if (prog->reganch & RE_USE_INTUIT) {
1914 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1915 if (!s)
1916 goto phooey;
1917 }
1918 else
1919 s++;
1920 }
1921 } else {
1922 if (s > startpos)
1923 s--;
1924 while (s < end) {
1925 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1926 if (regtry(&reginfo, s))
6eb5f6b9
JH
1927 goto got_it;
1928 }
1929 }
1930 }
1931 }
1932 goto phooey;
1933 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1934 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1935 goto got_it;
1936 goto phooey;
1937 }
1938
1939 /* Messy cases: unanchored match. */
33b8afdf 1940 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1941 /* we have /x+whatever/ */
1942 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1943 char ch;
bf93d4cc
GS
1944#ifdef DEBUGGING
1945 int did_match = 0;
1946#endif
33b8afdf
JH
1947 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1948 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1949 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1950
1aa99e6b 1951 if (do_utf8) {
6eb5f6b9
JH
1952 while (s < strend) {
1953 if (*s == ch) {
a3621e74 1954 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1955 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1956 s += UTF8SKIP(s);
1957 while (s < strend && *s == ch)
1958 s += UTF8SKIP(s);
1959 }
1960 s += UTF8SKIP(s);
1961 }
1962 }
1963 else {
1964 while (s < strend) {
1965 if (*s == ch) {
a3621e74 1966 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1967 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1968 s++;
1969 while (s < strend && *s == ch)
1970 s++;
1971 }
1972 s++;
1973 }
1974 }
a3621e74 1975 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1976 PerlIO_printf(Perl_debug_log,
b7953727
JH
1977 "Did not find anchored character...\n")
1978 );
6eb5f6b9 1979 }
a0714e2c
SS
1980 else if (prog->anchored_substr != NULL
1981 || prog->anchored_utf8 != NULL
1982 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1983 && prog->float_max_offset < strend - s)) {
1984 SV *must;
1985 I32 back_max;
1986 I32 back_min;
1987 char *last;
6eb5f6b9 1988 char *last1; /* Last position checked before */
bf93d4cc
GS
1989#ifdef DEBUGGING
1990 int did_match = 0;
1991#endif
33b8afdf
JH
1992 if (prog->anchored_substr || prog->anchored_utf8) {
1993 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1994 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1995 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1996 back_max = back_min = prog->anchored_offset;
1997 } else {
1998 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1999 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2000 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2001 back_max = prog->float_max_offset;
2002 back_min = prog->float_min_offset;
2003 }
2004 if (must == &PL_sv_undef)
2005 /* could not downgrade utf8 check substring, so must fail */
2006 goto phooey;
2007
2008 last = HOP3c(strend, /* Cannot start after this */
2009 -(I32)(CHR_SVLEN(must)
2010 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2011
2012 if (s > PL_bostr)
2013 last1 = HOPc(s, -1);
2014 else
2015 last1 = s - 1; /* bogus */
2016
a0288114 2017 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2018 check_substr==must. */
2019 scream_pos = -1;
2020 dontbother = end_shift;
2021 strend = HOPc(strend, -dontbother);
2022 while ( (s <= last) &&
9041c2e3 2023 ((flags & REXEC_SCREAM)
1aa99e6b 2024 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2025 end_shift, &scream_pos, 0))
1aa99e6b 2026 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2027 (unsigned char*)strend, must,
7fba1cd6 2028 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2029 /* we may be pointing at the wrong string */
2030 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2031 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2032 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2033 if (HOPc(s, -back_max) > last1) {
2034 last1 = HOPc(s, -back_min);
2035 s = HOPc(s, -back_max);
2036 }
2037 else {
52657f30 2038 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2039
2040 last1 = HOPc(s, -back_min);
52657f30 2041 s = t;
6eb5f6b9 2042 }
1aa99e6b 2043 if (do_utf8) {
6eb5f6b9 2044 while (s <= last1) {
3b0527fe 2045 if (regtry(&reginfo, s))
6eb5f6b9
JH
2046 goto got_it;
2047 s += UTF8SKIP(s);
2048 }
2049 }
2050 else {
2051 while (s <= last1) {
3b0527fe 2052 if (regtry(&reginfo, s))
6eb5f6b9
JH
2053 goto got_it;
2054 s++;
2055 }
2056 }
2057 }
a3621e74 2058 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2059 PerlIO_printf(Perl_debug_log,
a0288114 2060 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2061 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2062 ? "anchored" : "floating"),
2063 PL_colors[0],
2064 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2065 SvPVX_const(must),
b7953727
JH
2066 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2067 );
6eb5f6b9
JH
2068 goto phooey;
2069 }
155aba94 2070 else if ((c = prog->regstclass)) {
f14c76ed 2071 if (minlen) {
be8e71aa 2072 const OPCODE op = OP(prog->regstclass);
66e933ab 2073 /* don't bother with what can't match */
07be1b83 2074 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2075 strend = HOPc(strend, -(minlen - 1));
2076 }
a3621e74 2077 DEBUG_EXECUTE_r({
be8e71aa 2078 SV * const prop = sv_newmortal();
cfd0369c
NC
2079 const char *s0;
2080 const char *s1;
9e55ce06
JH
2081 int len0;
2082 int len1;
2083
32fc9b6a 2084 regprop(prog, prop, c);
9e55ce06 2085 s0 = UTF ?
3f7c398e 2086 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2087 UNI_DISPLAY_REGEX) :
cfd0369c 2088 SvPVX_const(prop);
9e55ce06
JH
2089 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2090 s1 = UTF ?
c728cb41 2091 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2092 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2093 PerlIO_printf(Perl_debug_log,
07be1b83 2094 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2095 len0, len0, s0,
07be1b83 2096 len1, len1, s1, (int)(strend - s));
ffc61ed2 2097 });
3b0527fe 2098 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2099 goto got_it;
07be1b83 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2101 }
2102 else {
2103 dontbother = 0;
a0714e2c 2104 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2105 /* Trim the end. */
d6a28714 2106 char *last;
33b8afdf
JH
2107 SV* float_real;
2108
2109 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2110 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2111 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2112
2113 if (flags & REXEC_SCREAM) {
33b8afdf 2114 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2115 end_shift, &scream_pos, 1); /* last one */
2116 if (!last)
ffc61ed2 2117 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2118 /* we may be pointing at the wrong string */
2119 else if (RX_MATCH_COPIED(prog))
3f7c398e 2120 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2121 }
d6a28714
JH
2122 else {
2123 STRLEN len;
cfd0369c 2124 const char * const little = SvPV_const(float_real, len);
d6a28714 2125
33b8afdf 2126 if (SvTAIL(float_real)) {
d6a28714
JH
2127 if (memEQ(strend - len + 1, little, len - 1))
2128 last = strend - len + 1;
7fba1cd6 2129 else if (!multiline)
9041c2e3 2130 last = memEQ(strend - len, little, len)
bd61b366 2131 ? strend - len : NULL;
b8c5462f 2132 else
d6a28714
JH
2133 goto find_last;
2134 } else {
2135 find_last:
9041c2e3 2136 if (len)
d6a28714 2137 last = rninstr(s, strend, little, little + len);
b8c5462f 2138 else
a0288114 2139 last = strend; /* matching "$" */
b8c5462f 2140 }
b8c5462f 2141 }
bf93d4cc 2142 if (last == NULL) {
a3621e74 2143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2144 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2145 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2146 goto phooey; /* Should not happen! */
2147 }
d6a28714
JH
2148 dontbother = strend - last + prog->float_min_offset;
2149 }
2150 if (minlen && (dontbother < minlen))
2151 dontbother = minlen - 1;
2152 strend -= dontbother; /* this one's always in bytes! */
2153 /* We don't know much -- general case. */
1aa99e6b 2154 if (do_utf8) {
d6a28714 2155 for (;;) {
3b0527fe 2156 if (regtry(&reginfo, s))
d6a28714
JH
2157 goto got_it;
2158 if (s >= strend)
2159 break;
b8c5462f 2160 s += UTF8SKIP(s);
d6a28714
JH
2161 };
2162 }
2163 else {
2164 do {
3b0527fe 2165 if (regtry(&reginfo, s))
d6a28714
JH
2166 goto got_it;
2167 } while (s++ < strend);
2168 }
2169 }
2170
2171 /* Failure. */
2172 goto phooey;
2173
2174got_it:
2175 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2176
2177 if (PL_reg_eval_set) {
2178 /* Preserve the current value of $^R */
2179 if (oreplsv != GvSV(PL_replgv))
2180 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2181 restored, the value remains
2182 the same. */
4f639d21 2183 restore_pos(aTHX_ prog);
d6a28714
JH
2184 }
2185
2186 /* make sure $`, $&, $', and $digit will work later */
2187 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2188 RX_MATCH_COPY_FREE(prog);
d6a28714 2189 if (flags & REXEC_COPY_STR) {
be8e71aa 2190 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2191#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2192 if ((SvIsCOW(sv)
2193 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2194 if (DEBUG_C_TEST) {
2195 PerlIO_printf(Perl_debug_log,
2196 "Copy on write: regexp capture, type %d\n",
2197 (int) SvTYPE(sv));
2198 }
2199 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2200 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2201 assert (SvPOKp(prog->saved_copy));
2202 } else
2203#endif
2204 {
2205 RX_MATCH_COPIED_on(prog);
2206 s = savepvn(strbeg, i);
2207 prog->subbeg = s;
2208 }
d6a28714 2209 prog->sublen = i;
d6a28714
JH
2210 }
2211 else {
2212 prog->subbeg = strbeg;
2213 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2214 }
2215 }
9041c2e3 2216
d6a28714
JH
2217 return 1;
2218
2219phooey:
a3621e74 2220 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2221 PL_colors[4], PL_colors[5]));
d6a28714 2222 if (PL_reg_eval_set)
4f639d21 2223 restore_pos(aTHX_ prog);
d6a28714
JH
2224 return 0;
2225}
2226
2227/*
2228 - regtry - try match at specific point
2229 */
2230STATIC I32 /* 0 failure, 1 success */
3b0527fe 2231S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2232{
97aff369 2233 dVAR;
d6a28714
JH
2234 register I32 *sp;
2235 register I32 *ep;
2236 CHECKPOINT lastcp;
3b0527fe 2237 regexp *prog = reginfo->prog;
a3621e74 2238 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2239
02db2b7b
IZ
2240#ifdef DEBUGGING
2241 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2242#endif
d6a28714
JH
2243 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2244 MAGIC *mg;
2245
2246 PL_reg_eval_set = RS_init;
a3621e74 2247 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2248 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2249 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2250 ));
e8347627 2251 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2252 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2253 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2254 SAVETMPS;
2255 /* Apparently this is not needed, judging by wantarray. */
e8347627 2256 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2257 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2258
3b0527fe 2259 if (reginfo->sv) {
d6a28714 2260 /* Make $_ available to executed code. */
3b0527fe 2261 if (reginfo->sv != DEFSV) {
59f00321 2262 SAVE_DEFSV;
3b0527fe 2263 DEFSV = reginfo->sv;
b8c5462f 2264 }
d6a28714 2265
3b0527fe
DM
2266 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2267 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2268 /* prepare for quick setting of pos */
d300d9fa
NC
2269#ifdef PERL_OLD_COPY_ON_WRITE
2270 if (SvIsCOW(sv))
2271 sv_force_normal_flags(sv, 0);
2272#endif
3dab1dad 2273 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2274 &PL_vtbl_mglob, NULL, 0);
d6a28714 2275 mg->mg_len = -1;
b8c5462f 2276 }
d6a28714
JH
2277 PL_reg_magic = mg;
2278 PL_reg_oldpos = mg->mg_len;
4f639d21 2279 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2280 }
09687e5a 2281 if (!PL_reg_curpm) {
a02a5408 2282 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2283#ifdef USE_ITHREADS
2284 {
be8e71aa 2285 SV* const repointer = newSViv(0);
577e12cc 2286 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2287 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2288 av_push(PL_regex_padav,repointer);
2289 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2290 PL_regex_pad = AvARRAY(PL_regex_padav);
2291 }
2292#endif
2293 }
aaa362c4 2294 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2295 PL_reg_oldcurpm = PL_curpm;
2296 PL_curpm = PL_reg_curpm;
2297 if (RX_MATCH_COPIED(prog)) {
2298 /* Here is a serious problem: we cannot rewrite subbeg,
2299 since it may be needed if this match fails. Thus
2300 $` inside (?{}) could fail... */
2301 PL_reg_oldsaved = prog->subbeg;
2302 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2303#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2304 PL_nrs = prog->saved_copy;
2305#endif
d6a28714
JH
2306 RX_MATCH_COPIED_off(prog);
2307 }
2308 else
bd61b366 2309 PL_reg_oldsaved = NULL;
d6a28714
JH
2310 prog->subbeg = PL_bostr;
2311 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2312 }
973dddac 2313 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2314 PL_reginput = startpos;
2315 PL_regstartp = prog->startp;
2316 PL_regendp = prog->endp;
2317 PL_reglastparen = &prog->lastparen;
a01268b5 2318 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2319 prog->lastparen = 0;
03994de8 2320 prog->lastcloseparen = 0;
d6a28714 2321 PL_regsize = 0;
a3621e74 2322 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2323 if (PL_reg_start_tmpl <= prog->nparens) {
2324 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2325 if(PL_reg_start_tmp)
2326 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2327 else
a02a5408 2328 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2329 }
2330
2331 /* XXXX What this code is doing here?!!! There should be no need
2332 to do this again and again, PL_reglastparen should take care of
3dd2943c 2333 this! --ilya*/
dafc8851
JH
2334
2335 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2336 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2337 * PL_reglastparen), is not needed at all by the test suite
2338 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2339 * enough, for building DynaLoader, or otherwise this
2340 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2341 * will happen. Meanwhile, this code *is* needed for the
2342 * above-mentioned test suite tests to succeed. The common theme
2343 * on those tests seems to be returning null fields from matches.
2344 * --jhi */
dafc8851 2345#if 1
d6a28714
JH
2346 sp = prog->startp;
2347 ep = prog->endp;
2348 if (prog->nparens) {
097eb12c 2349 register I32 i;
eb160463 2350 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2351 *++sp = -1;
2352 *++ep = -1;
2353 }
2354 }
dafc8851 2355#endif
02db2b7b 2356 REGCP_SET(lastcp);
3b0527fe 2357 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2358 prog->endp[0] = PL_reginput - PL_bostr;
2359 return 1;
2360 }
02db2b7b 2361 REGCP_UNWIND(lastcp);
d6a28714
JH
2362 return 0;
2363}
2364
02db2b7b 2365
8ba1375e
MJD
2366#define sayYES goto yes
2367#define sayNO goto no
e0f9d4a8 2368#define sayNO_ANYOF goto no_anyof
8ba1375e 2369#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2370#define sayNO_FINAL goto no_final
2371#define sayNO_SILENT goto do_no
2372#define saySAME(x) if (x) goto yes; else goto no
2373
3ab3c9b4
HS
2374#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2375#define POSCACHE_SEEN 1 /* we know what we're caching */
2376#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2377
3ab3c9b4 2378#define CACHEsayYES STMT_START { \
d8319b27 2379 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2380 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2381 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2382 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2383 } \
2384 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2385 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2386 } \
2387 else { \
3ab3c9b4
HS
2388 /* cache records failure, but this is success */ \
2389 DEBUG_r( \
2390 PerlIO_printf(Perl_debug_log, \
2391 "%*s (remove success from failure cache)\n", \
2392 REPORT_CODE_OFF+PL_regindent*2, "") \
2393 ); \
d8319b27 2394 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2395 } \
2396 } \
2397 sayYES; \
2398} STMT_END
7409bbd3 2399
3ab3c9b4 2400#define CACHEsayNO STMT_START { \
d8319b27 2401 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2402 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2403 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2404 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2405 } \
2406 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2407 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2408 } \
2409 else { \
3ab3c9b4
HS
2410 /* cache records success, but this is failure */ \
2411 DEBUG_r( \
2412 PerlIO_printf(Perl_debug_log, \
2413 "%*s (remove failure from success cache)\n", \
2414 REPORT_CODE_OFF+PL_regindent*2, "") \
2415 ); \
d8319b27 2416 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2417 } \
2418 } \
2419 sayNO; \
2420} STMT_END
2421
a3621e74
YO
2422/* this is used to determine how far from the left messages like
2423 'failed...' are printed. Currently 29 makes these messages line
2424 up with the opcode they refer to. Earlier perls used 25 which
2425 left these messages outdented making reviewing a debug output
2426 quite difficult.
2427*/
2428#define REPORT_CODE_OFF 29
2429
2430
2431/* Make sure there is a test for this +1 options in re_tests */
2432#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2433
40a82448
DM
2434#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2435#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2436
86545054
DM
2437#define SLAB_FIRST(s) (&(s)->states[0])
2438#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2439
5d9a96ca
DM
2440/* grab a new slab and return the first slot in it */
2441
2442STATIC regmatch_state *
2443S_push_slab(pTHX)
2444{
54df2634
NC
2445#if PERL_VERSION < 9
2446 dMY_CXT;
2447#endif
5d9a96ca
DM
2448 regmatch_slab *s = PL_regmatch_slab->next;
2449 if (!s) {
2450 Newx(s, 1, regmatch_slab);
2451 s->prev = PL_regmatch_slab;
2452 s->next = NULL;
2453 PL_regmatch_slab->next = s;
2454 }
2455 PL_regmatch_slab = s;
86545054 2456 return SLAB_FIRST(s);
5d9a96ca 2457}
5b47454d 2458
95b24440
DM
2459/* simulate a recursive call to regmatch */
2460
2461#define REGMATCH(ns, where) \
5d9a96ca
DM
2462 st->scan = scan; \
2463 scan = (ns); \
2464 st->resume_state = resume_##where; \
95b24440
DM
2465 goto start_recurse; \
2466 resume_point_##where:
2467
40a82448
DM
2468/* push a new state then goto it */
2469
2470#define PUSH_STATE_GOTO(state, node) \
2471 scan = node; \
2472 st->resume_state = state; \
2473 goto push_state;
2474
2475/* push a new state with success backtracking, then goto it */
2476
2477#define PUSH_YES_STATE_GOTO(state, node) \
2478 scan = node; \
2479 st->resume_state = state; \
2480 goto push_yes_state;
2481
aa283a38 2482
aa283a38 2483
d6a28714
JH
2484/*
2485 - regmatch - main matching routine
2486 *
2487 * Conceptually the strategy is simple: check to see whether the current
2488 * node matches, call self recursively to see whether the rest matches,
2489 * and then act accordingly. In practice we make some effort to avoid
2490 * recursion, in particular by going through "ordinary" nodes (that don't
2491 * need to know whether the rest of the match failed) by a loop instead of
2492 * by recursion.
2493 */
2494/* [lwall] I've hoisted the register declarations to the outer block in order to
2495 * maybe save a little bit of pushing and popping on the stack. It also takes
2496 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2497 *
2498 * This function used to be heavily recursive, but since this had the
2499 * effect of blowing the CPU stack on complex regexes, it has been
2500 * restructured to be iterative, and to save state onto the heap rather
2501 * than the stack. Essentially whereever regmatch() used to be called, it
2502 * pushes the current state, notes where to return, then jumps back into
2503 * the main loop.
2504 *
2505 * Originally the structure of this function used to look something like
2506
2507 S_regmatch() {
2508 int a = 1, b = 2;
2509 ...
2510 while (scan != NULL) {
5d9a96ca 2511 a++; // do stuff with a and b
95b24440
DM
2512 ...
2513 switch (OP(scan)) {
2514 case FOO: {
2515 int local = 3;
2516 ...
2517 if (regmatch(...)) // recurse
2518 goto yes;
2519 }
2520 ...
2521 }
2522 }
2523 yes:
2524 return 1;
2525 }
2526
2527 * Now it looks something like this:
2528
5d9a96ca 2529 typedef struct {
95b24440
DM
2530 int a, b, local;
2531 int resume_state;
5d9a96ca 2532 } regmatch_state;
95b24440
DM
2533
2534 S_regmatch() {
5d9a96ca
DM
2535 regmatch_state *st = new();
2536 int depth=0;
2537 st->a++; // do stuff with a and b
95b24440
DM
2538 ...
2539 while (scan != NULL) {
2540 ...
2541 switch (OP(scan)) {
2542 case FOO: {
5d9a96ca 2543 st->local = 3;
95b24440 2544 ...
5d9a96ca
DM
2545 st->scan = scan;
2546 scan = ...;
2547 st->resume_state = resume_FOO;
2548 goto start_recurse; // recurse
95b24440 2549
5d9a96ca
DM
2550 resume_point_FOO:
2551 if (result)
95b24440
DM
2552 goto yes;
2553 }
2554 ...
2555 }
5d9a96ca
DM
2556 start_recurse:
2557 st = new(); push a new state
2558 st->a = 1; st->b = 2;
2559 depth++;
95b24440 2560 }
5d9a96ca 2561 yes:
95b24440 2562 result = 1;
5d9a96ca
DM
2563 if (depth--) {
2564 st = pop();
95b24440
DM
2565 switch (resume_state) {
2566 case resume_FOO:
2567 goto resume_point_FOO;
2568 ...
2569 }
2570 }
2571 return result
2572 }
2573
2574 * WARNING: this means that any line in this function that contains a
2575 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2576 * regmatch() using gotos instead. Thus the values of any local variables
2577 * not saved in the regmatch_state structure will have been lost when
2578 * execution resumes on the next line .
5d9a96ca
DM
2579 *
2580 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2581 * PL_regmatch_state always points to the currently active state, and
2582 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2583 * The first time regmatch is called, the first slab is allocated, and is
2584 * never freed until interpreter desctruction. When the slab is full,
2585 * a new one is allocated chained to the end. At exit from regmatch, slabs
2586 * allocated since entry are freed.
d6a28714 2587 */
95b24440 2588
40a82448 2589/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2590#define TRIE_next (REGNODE_MAX+1)
2591#define TRIE_next_fail (REGNODE_MAX+2)
2592#define EVAL_A (REGNODE_MAX+3)
2593#define EVAL_A_fail (REGNODE_MAX+4)
2594#define resume_CURLYX (REGNODE_MAX+5)
2595#define resume_WHILEM1 (REGNODE_MAX+6)
2596#define resume_WHILEM2 (REGNODE_MAX+7)
2597#define resume_WHILEM3 (REGNODE_MAX+8)
2598#define resume_WHILEM4 (REGNODE_MAX+9)
2599#define resume_WHILEM5 (REGNODE_MAX+10)
2600#define resume_WHILEM6 (REGNODE_MAX+11)
2601#define BRANCH_next (REGNODE_MAX+12)
2602#define BRANCH_next_fail (REGNODE_MAX+13)
2603#define CURLYM_A (REGNODE_MAX+14)
2604#define CURLYM_A_fail (REGNODE_MAX+15)
2605#define CURLYM_B (REGNODE_MAX+16)
2606#define CURLYM_B_fail (REGNODE_MAX+17)
2607#define IFMATCH_A (REGNODE_MAX+18)
2608#define IFMATCH_A_fail (REGNODE_MAX+19)
2609#define CURLY_B_min_known (REGNODE_MAX+20)
2610#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2611#define CURLY_B_min (REGNODE_MAX+22)
2612#define CURLY_B_min_fail (REGNODE_MAX+23)
2613#define CURLY_B_max (REGNODE_MAX+24)
2614#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2615
2616
3dab1dad 2617#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2618
3df15adc
YO
2619#ifdef DEBUGGING
2620
2621STATIC void
07be1b83
YO
2622S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2623{
2624 const int docolor = *PL_colors[0];
2625 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2626 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2627 /* The part of the string before starttry has one color
2628 (pref0_len chars), between starttry and current
2629 position another one (pref_len - pref0_len chars),
2630 after the current position the third one.
2631 We assume that pref0_len <= pref_len, otherwise we
2632 decrease pref0_len. */
2633 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2634 ? (5 + taill) - l : locinput - PL_bostr;
2635 int pref0_len;
2636
2637 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2638 pref_len++;
2639 pref0_len = pref_len - (locinput - PL_reg_starttry);
2640 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2641 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2642 ? (5 + taill) - pref_len : PL_regeol - locinput);
2643 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2644 l--;
2645 if (pref0_len < 0)
2646 pref0_len = 0;
2647 if (pref0_len > pref_len)
2648 pref0_len = pref_len;
2649 {
3df15adc
YO
2650 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2651 const char * const s0 = is_uni ?
2652 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2653 pref0_len, 60, UNI_DISPLAY_REGEX) :
2654 pv_escape(PERL_DEBUG_PAD(0), (locinput - pref_len),
2655 pref0_len, 60, 0);
2656
2657 const int len0 = strlen(s0);
2658 const char * const s1 = is_uni ?
2659 pv_uni_display(PERL_DEBUG_PAD(1),
2660 (U8*)(locinput - pref_len + pref0_len),
2661 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2662 pv_escape(PERL_DEBUG_PAD(1),
2663 (locinput - pref_len + pref0_len),
2664 pref_len - pref0_len, 60, 0);
2665
2666 const int len1 = (int)strlen(s1);
2667 const char * const s2 = is_uni ?
2668 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2669 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2670 pv_escape(PERL_DEBUG_PAD(2), locinput,
2671 PL_regeol - locinput, 60, 0);
2672 const int len2 = (int)strlen(s2);
2673 PerlIO_printf(Perl_debug_log,
07be1b83
YO
2674 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2675 (IV)(locinput - PL_bostr),
2676 PL_colors[4],
2677 len0, s0,
2678 PL_colors[5],
2679 PL_colors[2],
2680 len1, s1,
2681 PL_colors[3],
2682 (docolor ? "" : "> <"),
2683 PL_colors[0],
2684 len2, s2,
2685 PL_colors[1],
2686 15 - l - pref_len + 1,
2687 "");
2688 }
2689}
3df15adc 2690
07be1b83
YO
2691#endif
2692
d6a28714 2693STATIC I32 /* 0 failure, 1 success */
3b0527fe 2694S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2695{
54df2634
NC
2696#if PERL_VERSION < 9
2697 dMY_CXT;
2698#endif
27da23d5 2699 dVAR;
95b24440 2700 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2701 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2702
3b0527fe
DM
2703 regexp *rex = reginfo->prog;
2704
5d9a96ca
DM
2705 regmatch_slab *orig_slab;
2706 regmatch_state *orig_state;
a3621e74 2707
5d9a96ca
DM
2708 /* the current state. This is a cached copy of PL_regmatch_state */
2709 register regmatch_state *st;
95b24440 2710
5d9a96ca
DM
2711 /* cache heavy used fields of st in registers */
2712 register regnode *scan;
2713 register regnode *next;
2714 register I32 n = 0; /* initialize to shut up compiler warning */
2715 register char *locinput = PL_reginput;
95b24440 2716
5d9a96ca
DM
2717 /* these variables are NOT saved during a recusive RFEGMATCH: */
2718 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2719 bool result; /* return value of S_regmatch */
5d9a96ca 2720 int depth = 0; /* depth of recursion */
77cb431f
DM
2721 regmatch_state *yes_state = NULL; /* state to pop to on success of
2722 subpattern */
40a82448 2723 U32 state_num;
95b24440
DM
2724
2725#ifdef DEBUGGING
e68ec53f 2726 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2727 PL_regindent++;
2728#endif
2729
5d9a96ca
DM
2730 /* on first ever call to regmatch, allocate first slab */
2731 if (!PL_regmatch_slab) {
2732 Newx(PL_regmatch_slab, 1, regmatch_slab);
2733 PL_regmatch_slab->prev = NULL;
2734 PL_regmatch_slab->next = NULL;
86545054 2735 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2736 }
2737
2738 /* remember current high-water mark for exit */
2739 /* XXX this should be done with SAVE* instead */
2740 orig_slab = PL_regmatch_slab;
2741 orig_state = PL_regmatch_state;
2742
2743 /* grab next free state slot */
2744 st = ++PL_regmatch_state;
86545054 2745 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2746 st = PL_regmatch_state = S_push_slab(aTHX);
2747
2748 st->minmod = 0;
2749 st->sw = 0;
2750 st->logical = 0;
5d9a96ca 2751 st->cc = NULL;
d6a28714
JH
2752 /* Note that nextchr is a byte even in UTF */
2753 nextchr = UCHARAT(locinput);
2754 scan = prog;
2755 while (scan != NULL) {
8ba1375e 2756
a3621e74 2757 DEBUG_EXECUTE_r( {
6136c704 2758 SV * const prop = sv_newmortal();
07be1b83 2759 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2760 regprop(rex, prop, scan);
07be1b83
YO
2761
2762 PerlIO_printf(Perl_debug_log,
2763 "%3"IVdf":%*s%s(%"IVdf")\n",
2764 (IV)(scan - rex->program), PL_regindent*2, "",
2765 SvPVX_const(prop),
2766 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2767 });
d6a28714
JH
2768
2769 next = scan + NEXT_OFF(scan);
2770 if (next == scan)
2771 next = NULL;
40a82448 2772 state_num = OP(scan);
d6a28714 2773
40a82448
DM
2774 reenter_switch:
2775 switch (state_num) {
d6a28714 2776 case BOL:
7fba1cd6 2777 if (locinput == PL_bostr)
d6a28714 2778 {
3b0527fe 2779 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2780 break;
2781 }
d6a28714
JH
2782 sayNO;
2783 case MBOL:
12d33761
HS
2784 if (locinput == PL_bostr ||
2785 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2786 {
b8c5462f
JH
2787 break;
2788 }
d6a28714
JH
2789 sayNO;
2790 case SBOL:
c2a73568 2791 if (locinput == PL_bostr)
b8c5462f 2792 break;
d6a28714
JH
2793 sayNO;
2794 case GPOS:
3b0527fe 2795 if (locinput == reginfo->ganch)
d6a28714
JH
2796 break;
2797 sayNO;
2798 case EOL:
d6a28714
JH
2799 goto seol;
2800 case MEOL:
d6a28714 2801 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2802 sayNO;
b8c5462f 2803 break;
d6a28714
JH
2804 case SEOL:
2805 seol:
2806 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2807 sayNO;
d6a28714 2808 if (PL_regeol - locinput > 1)
b8c5462f 2809 sayNO;
b8c5462f 2810 break;
d6a28714
JH
2811 case EOS:
2812 if (PL_regeol != locinput)
b8c5462f 2813 sayNO;
d6a28714 2814 break;
ffc61ed2 2815 case SANY:
d6a28714 2816 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2817 sayNO;
f33976b4
DB
2818 if (do_utf8) {
2819 locinput += PL_utf8skip[nextchr];
2820 if (locinput > PL_regeol)
2821 sayNO;
2822 nextchr = UCHARAT(locinput);
2823 }
2824 else
2825 nextchr = UCHARAT(++locinput);
2826 break;
2827 case CANY:
2828 if (!nextchr && locinput >= PL_regeol)
2829 sayNO;
b8c5462f 2830 nextchr = UCHARAT(++locinput);
a0d0e21e 2831 break;
ffc61ed2 2832 case REG_ANY:
1aa99e6b
IH
2833 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2834 sayNO;
2835 if (do_utf8) {
b8c5462f 2836 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2837 if (locinput > PL_regeol)
2838 sayNO;
a0ed51b3 2839 nextchr = UCHARAT(locinput);
a0ed51b3 2840 }
1aa99e6b
IH
2841 else
2842 nextchr = UCHARAT(++locinput);
a0ed51b3 2843 break;
166ba7cd
DM
2844
2845#undef ST
2846#define ST st->u.trie
2847
5b47454d 2848 case TRIE:
3dab1dad 2849 {
07be1b83 2850 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2851 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2852 trie_type = do_utf8 ?
2853 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2854 : trie_plain;
2855
2856 /* what trie are we using right now */
be8e71aa 2857 reg_trie_data * const trie
3dab1dad
YO
2858 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2859 U32 state = trie->startstate;
166ba7cd
DM
2860
2861 U8 *uc = ( U8* )locinput;
2862 U16 charid = 0;
2863 U32 base = 0;
2864 UV uvc = 0;
2865 STRLEN len = 0;
2866 STRLEN foldlen = 0;
2867 U8 *uscan = (U8*)NULL;
2868 STRLEN bufflen=0;
2869 SV *sv_accept_buff = NULL;
2870 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2871
2872 ST.accepted = 0; /* how many accepting states we have seen */
2873 ST.B = next;
2874#ifdef DEBUGGING
2875 ST.me = scan;
2876#endif
3dab1dad
YO
2877
2878 if (trie->bitmap && trie_type != trie_utf8_fold &&
2879 !TRIE_BITMAP_TEST(trie,*locinput)
2880 ) {
2881 if (trie->states[ state ].wordnum) {
2882 DEBUG_EXECUTE_r(
2883 PerlIO_printf(Perl_debug_log,
2884 "%*s %smatched empty string...%s\n",
2885 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2886 );
2887 break;
2888 } else {
2889 DEBUG_EXECUTE_r(
2890 PerlIO_printf(Perl_debug_log,
2891 "%*s %sfailed to match start class...%s\n",
2892 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2893 );
2894 sayNO_SILENT;
2895 }
2896 }
166ba7cd 2897
07be1b83
YO
2898 /*
2899 traverse the TRIE keeping track of all accepting states
2900 we transition through until we get to a failing node.
2901 */
2902
a3621e74
YO
2903 while ( state && uc <= (U8*)PL_regeol ) {
2904
5b47454d 2905 if (trie->states[ state ].wordnum) {
166ba7cd 2906 if (!ST.accepted ) {
5b47454d
DM
2907 ENTER;
2908 SAVETMPS;
2909 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2910 sv_accept_buff=newSV(bufflen *
2911 sizeof(reg_trie_accepted) - 1);
2912 SvCUR_set(sv_accept_buff,
2913 sizeof(reg_trie_accepted));
2914 SvPOK_on(sv_accept_buff);
2915 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2916 SAVETMPS;
2917 ST.accept_buff =
5b47454d
DM
2918 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2919 }
2920 else {
166ba7cd 2921 if (ST.accepted >= bufflen) {
5b47454d 2922 bufflen *= 2;
166ba7cd 2923 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2924 SvGROW(sv_accept_buff,
2925 bufflen * sizeof(reg_trie_accepted));
2926 }
2927 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2928 + sizeof(reg_trie_accepted));
2929 }
166ba7cd
DM
2930 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2931 ST.accept_buff[ST.accepted].endpos = uc;
2932 ++ST.accepted;
5b47454d 2933 }
a3621e74
YO
2934
2935 base = trie->states[ state ].trans.base;
2936
07be1b83
YO
2937 DEBUG_TRIE_EXECUTE_r({
2938 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2939 PerlIO_printf( Perl_debug_log,
e4584336 2940 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2941 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2942 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2943 });
a3621e74
YO
2944
2945 if ( base ) {
5b47454d 2946 switch (trie_type) {
3dab1dad 2947 case trie_utf8_fold:
a3621e74
YO
2948 if ( foldlen>0 ) {
2949 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2950 foldlen -= len;
2951 uscan += len;
2952 len=0;
2953 } else {
2954 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2955 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2956 foldlen -= UNISKIP( uvc );
2957 uscan = foldbuf + UNISKIP( uvc );
2958 }
5b47454d
DM
2959 break;
2960 case trie_utf8:
2961 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2962 &len, uniflags );
2963 break;
2964 case trie_plain:
e4584336 2965 uvc = (UV)*uc;
a3621e74
YO
2966 len = 1;
2967 }
2968
5b47454d
DM
2969 if (uvc < 256) {
2970 charid = trie->charmap[ uvc ];
2971 }
2972 else {
2973 charid = 0;
2974 if (trie->widecharmap) {
3dab1dad 2975 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2976 (char*)&uvc, sizeof(UV), 0);
2977 if (svpp)
2978 charid = (U16)SvIV(*svpp);
2979 }
2980 }
a3621e74 2981
5b47454d
DM
2982 if (charid &&
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2985 < trie->lasttrans)
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2988 {
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
2991 }
2992 else {
2993 state = 0;
2994 }
2995 uc += len;
2996
2997 }
2998 else {
a3621e74
YO
2999 state = 0;
3000 }
3001 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3005 );
3006 }
166ba7cd 3007 if (!ST.accepted )
a3621e74 3008 sayNO;
a3621e74 3009
166ba7cd
DM
3010 DEBUG_EXECUTE_r(
3011 PerlIO_printf( Perl_debug_log,
3012 "%*s %sgot %"IVdf" possible matches%s\n",
3013 REPORT_CODE_OFF + PL_regindent * 2, "",
3014 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3015 );
3016 }
3017
3018 /* FALL THROUGH */
3019
3020 case TRIE_next_fail: /* we failed - try next alterative */
3021
3022 if ( ST.accepted == 1 ) {
3023 /* only one choice left - just continue */
3024 DEBUG_EXECUTE_r({
3025 reg_trie_data * const trie
3026 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3027 SV ** const tmp = RX_DEBUG(reginfo->prog)
3028 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
3029 : NULL;
3030 PerlIO_printf( Perl_debug_log,
3031 "%*s %sonly one match left: #%d <%s>%s\n",
3032 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3033 ST.accept_buff[ 0 ].wordnum,
3034 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3035 PL_colors[5] );
3036 });
3037 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3038 /* in this case we free tmps/leave before we call regmatch
3039 as we wont be using accept_buff again. */
3040 FREETMPS;
3041 LEAVE;
3042 locinput = PL_reginput;
3043 nextchr = UCHARAT(locinput);
3044 scan = ST.B;
3045 continue; /* execute rest of RE */
3046 }
3047
3048 if (!ST.accepted-- ) {
3049 FREETMPS;
3050 LEAVE;
3051 sayNO;
3052 }
3053
a3621e74 3054 /*
166ba7cd
DM
3055 There are at least two accepting states left. Presumably
3056 the number of accepting states is going to be low,
3057 typically two. So we simply scan through to find the one
3058 with lowest wordnum. Once we find it, we swap the last
3059 state into its place and decrement the size. We then try to
3060 match the rest of the pattern at the point where the word
3061 ends. If we succeed, control just continues along the
3062 regex; if we fail we return here to try the next accepting
3063 state
3064 */
a3621e74 3065
166ba7cd
DM
3066 {
3067 U32 best = 0;
3068 U32 cur;
3069 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3070 DEBUG_TRIE_EXECUTE_r(
f2278c82 3071 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
3072 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3073 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3074 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3075 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3076 );
3077
3078 if (ST.accept_buff[cur].wordnum <
3079 ST.accept_buff[best].wordnum)
3080 best = cur;
a3621e74 3081 }
166ba7cd
DM
3082
3083 DEBUG_EXECUTE_r({
3084 reg_trie_data * const trie
3085 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3086 SV ** const tmp = RX_DEBUG(reginfo->prog)
3087 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3088 : NULL;
3089 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3090 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3091 ST.accept_buff[best].wordnum,
3092 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3093 PL_colors[5] );
3094 });
3095
3096 if ( best<ST.accepted ) {
3097 reg_trie_accepted tmp = ST.accept_buff[ best ];
3098 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3099 ST.accept_buff[ ST.accepted ] = tmp;
3100 best = ST.accepted;
a3621e74 3101 }
166ba7cd
DM
3102 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3103 }
3104 PUSH_STATE_GOTO(TRIE_next, ST.B);
3105 /* NOTREACHED */
3106
3107#undef ST
3108
95b24440
DM
3109 case EXACT: {
3110 char *s = STRING(scan);
5d9a96ca 3111 st->ln = STR_LEN(scan);
eb160463 3112 if (do_utf8 != UTF) {
bc517b45 3113 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3114 char *l = locinput;
be8e71aa 3115 const char * const e = s + st->ln;
a72c7584 3116
5ff6fc6d
JH
3117 if (do_utf8) {
3118 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3119 while (s < e) {
a3b680e6 3120 STRLEN ulen;
1aa99e6b 3121 if (l >= PL_regeol)
5ff6fc6d
JH
3122 sayNO;
3123 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3124 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3125 uniflags))
5ff6fc6d 3126 sayNO;
bc517b45 3127 l += ulen;
5ff6fc6d 3128 s ++;
1aa99e6b 3129 }
5ff6fc6d
JH
3130 }
3131 else {
3132 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3133 while (s < e) {
a3b680e6 3134 STRLEN ulen;
1aa99e6b
IH
3135 if (l >= PL_regeol)
3136 sayNO;
5ff6fc6d 3137 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3138 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3139 uniflags))
1aa99e6b 3140 sayNO;
bc517b45 3141 s += ulen;
a72c7584 3142 l ++;
1aa99e6b 3143 }
5ff6fc6d 3144 }
1aa99e6b
IH
3145 locinput = l;
3146 nextchr = UCHARAT(locinput);
3147 break;
3148 }
bc517b45 3149 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3150 /* Inline the first character, for speed. */
3151 if (UCHARAT(s) != nextchr)
3152 sayNO;
5d9a96ca 3153 if (PL_regeol - locinput < st->ln)
d6a28714 3154 sayNO;
5d9a96ca 3155 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3156 sayNO;
5d9a96ca 3157 locinput += st->ln;
d6a28714
JH
3158 nextchr = UCHARAT(locinput);
3159 break;
95b24440 3160 }
d6a28714 3161 case EXACTFL:
b8c5462f
JH
3162 PL_reg_flags |= RF_tainted;
3163 /* FALL THROUGH */
95b24440 3164 case EXACTF: {
be8e71aa 3165 char * const s = STRING(scan);
5d9a96ca 3166 st->ln = STR_LEN(scan);
d6a28714 3167
d07ddd77
JH
3168 if (do_utf8 || UTF) {
3169 /* Either target or the pattern are utf8. */
be8e71aa 3170 const char * const l = locinput;
d07ddd77 3171 char *e = PL_regeol;
bc517b45 3172
5d9a96ca 3173 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3174 l, &e, 0, do_utf8)) {
5486206c
JH
3175 /* One more case for the sharp s:
3176 * pack("U0U*", 0xDF) =~ /ss/i,
3177 * the 0xC3 0x9F are the UTF-8
3178 * byte sequence for the U+00DF. */
3179 if (!(do_utf8 &&
3180 toLOWER(s[0]) == 's' &&
5d9a96ca 3181 st->ln >= 2 &&
5486206c
JH
3182 toLOWER(s[1]) == 's' &&
3183 (U8)l[0] == 0xC3 &&
3184 e - l >= 2 &&
3185 (U8)l[1] == 0x9F))
3186 sayNO;
3187 }
d07ddd77
JH
3188 locinput = e;
3189 nextchr = UCHARAT(locinput);
3190 break;
a0ed51b3 3191 }
d6a28714 3192
bc517b45
JH
3193 /* Neither the target and the pattern are utf8. */
3194
d6a28714
JH
3195 /* Inline the first character, for speed. */
3196 if (UCHARAT(s) != nextchr &&
3197 UCHARAT(s) != ((OP(scan) == EXACTF)
3198 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3199 sayNO;
5d9a96ca 3200 if (PL_regeol - locinput < st->ln)
b8c5462f 3201 sayNO;
5d9a96ca
DM
3202 if (st->ln > 1 && (OP(scan) == EXACTF
3203 ? ibcmp(s, locinput, st->ln)
3204 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3205 sayNO;
5d9a96ca 3206 locinput += st->ln;
d6a28714 3207 nextchr = UCHARAT(locinput);
a0d0e21e 3208 break;
95b24440 3209 }
d6a28714 3210 case ANYOF:
ffc61ed2 3211 if (do_utf8) {
9e55ce06
JH
3212 STRLEN inclasslen = PL_regeol - locinput;
3213
32fc9b6a 3214 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3215 sayNO_ANYOF;
ffc61ed2
JH
3216 if (locinput >= PL_regeol)
3217 sayNO;
0f0076b4 3218 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3219 nextchr = UCHARAT(locinput);
e0f9d4a8 3220 break;
ffc61ed2
JH
3221 }
3222 else {
3223 if (nextchr < 0)
3224 nextchr = UCHARAT(locinput);
32fc9b6a 3225 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3226 sayNO_ANYOF;
ffc61ed2
JH
3227 if (!nextchr && locinput >= PL_regeol)
3228 sayNO;
3229 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3230 break;
3231 }
3232 no_anyof:
3233 /* If we might have the case of the German sharp s
3234 * in a casefolding Unicode character class. */
3235
ebc501f0
JH
3236 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3237 locinput += SHARP_S_SKIP;
e0f9d4a8 3238 nextchr = UCHARAT(locinput);
ffc61ed2 3239 }
e0f9d4a8
JH
3240 else
3241 sayNO;
b8c5462f 3242 break;
d6a28714 3243 case ALNUML:
b8c5462f
JH
3244 PL_reg_flags |= RF_tainted;
3245 /* FALL THROUGH */
d6a28714 3246 case ALNUM:
b8c5462f 3247 if (!nextchr)
4633a7c4 3248 sayNO;
ffc61ed2 3249 if (do_utf8) {
1a4fad37 3250 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3251 if (!(OP(scan) == ALNUM
bb7a0f54 3252 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3253 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3254 {
3255 sayNO;
a0ed51b3 3256 }
b8c5462f 3257 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3258 nextchr = UCHARAT(locinput);
3259 break;
3260 }
ffc61ed2 3261 if (!(OP(scan) == ALNUM
d6a28714 3262 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3263 sayNO;
b8c5462f 3264 nextchr = UCHARAT(++locinput);
a0d0e21e 3265 break;
d6a28714 3266 case NALNUML:
b8c5462f
JH
3267 PL_reg_flags |= RF_tainted;
3268 /* FALL THROUGH */
d6a28714
JH
3269 case NALNUM:
3270 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3271 sayNO;
ffc61ed2 3272 if (do_utf8) {
1a4fad37 3273 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3274 if (OP(scan) == NALNUM
bb7a0f54 3275 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3276 : isALNUM_LC_utf8((U8*)locinput))
3277 {
b8c5462f 3278 sayNO;
d6a28714 3279 }
b8c5462f
JH
3280 locinput += PL_utf8skip[nextchr];
3281 nextchr = UCHARAT(locinput);
3282 break;
3283 }
ffc61ed2 3284 if (OP(scan) == NALNUM
d6a28714 3285 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3286 sayNO;
76e3520e 3287 nextchr = UCHARAT(++locinput);
a0d0e21e 3288 break;
d6a28714
JH
3289 case BOUNDL:
3290 case NBOUNDL:
3280af22 3291 PL_reg_flags |= RF_tainted;
bbce6d69 3292 /* FALL THROUGH */
d6a28714
JH
3293 case BOUND:
3294 case NBOUND:
3295 /* was last char in word? */
ffc61ed2 3296 if (do_utf8) {
12d33761 3297 if (locinput == PL_bostr)
5d9a96ca 3298 st->ln = '\n';
ffc61ed2 3299 else {
a3b680e6 3300 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3301
4ad0818d 3302 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3303 }
3304 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3305 st->ln = isALNUM_uni(st->ln);
1a4fad37 3306 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3307 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3308 }
3309 else {
5d9a96ca 3310 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3311 n = isALNUM_LC_utf8((U8*)locinput);
3312 }
a0ed51b3 3313 }
d6a28714 3314 else {
5d9a96ca 3315 st->ln = (locinput != PL_bostr) ?
12d33761 3316 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3317 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3318 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3319 n = isALNUM(nextchr);
3320 }
3321 else {
5d9a96ca 3322 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3323 n = isALNUM_LC(nextchr);
3324 }
d6a28714 3325 }
5d9a96ca 3326 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3327 OP(scan) == BOUNDL))
3328 sayNO;
a0ed51b3 3329 break;
d6a28714 3330 case SPACEL:
3280af22 3331 PL_reg_flags |= RF_tainted;
bbce6d69 3332 /* FALL THROUGH */
d6a28714 3333 case SPACE:
9442cb0e 3334 if (!nextchr)
4633a7c4 3335 sayNO;
1aa99e6b 3336 if (do_utf8) {
fd400ab9 3337 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3338 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3339 if (!(OP(scan) == SPACE
bb7a0f54 3340 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3341 : isSPACE_LC_utf8((U8*)locinput)))
3342 {
3343 sayNO;
3344 }
3345 locinput += PL_utf8skip[nextchr];
3346 nextchr = UCHARAT(locinput);
3347 break;
d6a28714 3348 }
ffc61ed2
JH
3349 if (!(OP(scan) == SPACE
3350 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3351 sayNO;
3352 nextchr = UCHARAT(++locinput);
3353 }
3354 else {
3355 if (!(OP(scan) == SPACE
3356 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3357 sayNO;
3358 nextchr = UCHARAT(++locinput);
a0ed51b3 3359 }
a0ed51b3 3360 break;
d6a28714 3361 case NSPACEL:
3280af22 3362 PL_reg_flags |= RF_tainted;
bbce6d69 3363 /* FALL THROUGH */
d6a28714 3364 case NSPACE:
9442cb0e 3365 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3366 sayNO;
1aa99e6b 3367 if (do_utf8) {
1a4fad37 3368 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3369 if (OP(scan) == NSPACE
bb7a0f54 3370 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3371 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3372 {
3373 sayNO;
3374 }
3375 locinput += PL_utf8skip[nextchr];
3376 nextchr = UCHARAT(locinput);
3377 break;
a0ed51b3 3378 }
ffc61ed2 3379 if (OP(scan) == NSPACE
d6a28714 3380 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3381 sayNO;
76e3520e 3382 nextchr = UCHARAT(++locinput);
a0d0e21e 3383 break;
d6a28714 3384 case DIGITL:
a0ed51b3
LW
3385 PL_reg_flags |= RF_tainted;
3386 /* FALL THROUGH */
d6a28714 3387 case DIGIT:
9442cb0e 3388 if (!nextchr)
a0ed51b3 3389 sayNO;
1aa99e6b 3390 if (do_utf8) {
1a4fad37 3391 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3392 if (!(OP(scan) == DIGIT
bb7a0f54 3393 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3394 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3395 {
a0ed51b3 3396 sayNO;
dfe13c55 3397 }
6f06b55f 3398 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3399 nextchr = UCHARAT(locinput);
3400 break;
3401 }
ffc61ed2 3402 if (!(OP(scan) == DIGIT
9442cb0e 3403 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3404 sayNO;
3405 nextchr = UCHARAT(++locinput);
3406 break;
d6a28714 3407 case NDIGITL:
b8c5462f
JH
3408 PL_reg_flags |= RF_tainted;
3409 /* FALL THROUGH */
d6a28714 3410 case NDIGIT:
9442cb0e 3411 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3412 sayNO;
1aa99e6b 3413 if (do_utf8) {
1a4fad37 3414 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3415 if (OP(scan) == NDIGIT
bb7a0f54 3416 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3417 : isDIGIT_LC_utf8((U8*)locinput))
3418 {
a0ed51b3 3419 sayNO;
9442cb0e 3420 }
6f06b55f 3421 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3422 nextchr = UCHARAT(locinput);
3423 break;
3424 }
ffc61ed2 3425 if (OP(scan) == NDIGIT
9442cb0e 3426 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3427 sayNO;
3428 nextchr = UCHARAT(++locinput);
3429 break;
3430 case CLUMP:
b7c83a7e 3431 if (locinput >= PL_regeol)
a0ed51b3 3432 sayNO;
b7c83a7e 3433 if (do_utf8) {
1a4fad37 3434 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3435 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3436 sayNO;
3437 locinput += PL_utf8skip[nextchr];
3438 while (locinput < PL_regeol &&
3439 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3440 locinput += UTF8SKIP(locinput);
3441 if (locinput > PL_regeol)
3442 sayNO;
eb08e2da
JH
3443 }
3444 else
3445 locinput++;
a0ed51b3
LW
3446 nextchr = UCHARAT(locinput);
3447 break;
c8756f30 3448 case REFFL:
3280af22 3449 PL_reg_flags |= RF_tainted;
c8756f30 3450 /* FALL THROUGH */
c277df42 3451 case REF:
95b24440
DM
3452 case REFF: {
3453 char *s;
c277df42 3454 n = ARG(scan); /* which paren pair */
5d9a96ca 3455 st->ln = PL_regstartp[n];
2c2d71f5 3456 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3457 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3458 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3459 if (st->ln == PL_regendp[n])
a0d0e21e 3460 break;
a0ed51b3 3461
5d9a96ca 3462 s = PL_bostr + st->ln;
1aa99e6b 3463 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3464 char *l = locinput;
a3b680e6 3465 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3466 /*
3467 * Note that we can't do the "other character" lookup trick as
3468 * in the 8-bit case (no pun intended) because in Unicode we
3469 * have to map both upper and title case to lower case.
3470 */
3471 if (OP(scan) == REFF) {
3472 while (s < e) {
a3b680e6
AL
3473 STRLEN ulen1, ulen2;
3474 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3475 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3476
a0ed51b3
LW
3477 if (l >= PL_regeol)
3478 sayNO;
a2a2844f
JH
3479 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3480 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3481 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3482 sayNO;
a2a2844f
JH
3483 s += ulen1;
3484 l += ulen2;
a0ed51b3
LW
3485 }
3486 }
3487 locinput = l;
3488 nextchr = UCHARAT(locinput);
3489 break;
3490 }
3491
a0d0e21e 3492 /* Inline the first character, for speed. */
76e3520e 3493 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3494 (OP(scan) == REF ||
3495 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3496 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3497 sayNO;
5d9a96ca
DM
3498 st->ln = PL_regendp[n] - st->ln;
3499 if (locinput + st->ln > PL_regeol)
4633a7c4 3500 sayNO;
5d9a96ca
DM
3501 if (st->ln > 1 && (OP(scan) == REF
3502 ? memNE(s, locinput, st->ln)
c8756f30 3503 : (OP(scan) == REFF
5d9a96ca
DM
3504 ? ibcmp(s, locinput, st->ln)
3505 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3506 sayNO;
5d9a96ca 3507 locinput += st->ln;
76e3520e 3508 nextchr = UCHARAT(locinput);
a0d0e21e 3509 break;
95b24440 3510 }
a0d0e21e
LW
3511
3512 case NOTHING:
c277df42 3513 case TAIL:
a0d0e21e
LW
3514 break;
3515 case BACK:
3516 break;
40a82448
DM
3517
3518#undef ST
3519#define ST st->u.eval
3520
3521 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3522 {
c277df42 3523 SV *ret;
8e5e9ebe 3524 {
4aabdb9b
DM
3525 /* execute the code in the {...} */
3526 dSP;
6136c704 3527 SV ** const before = SP;
4aabdb9b
DM
3528 OP_4tree * const oop = PL_op;
3529 COP * const ocurcop = PL_curcop;
3530 PAD *old_comppad;
4aabdb9b
DM
3531
3532 n = ARG(scan);
32fc9b6a 3533 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3534 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3535 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3536 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3537
8e5e9ebe
RGS
3538 CALLRUNOPS(aTHX); /* Scalar context. */
3539 SPAGAIN;
3540 if (SP == before)
075aa684 3541 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3542 else {
3543 ret = POPs;
3544 PUTBACK;
3545 }
4aabdb9b
DM
3546
3547 PL_op = oop;
3548 PAD_RESTORE_LOCAL(old_comppad);
3549 PL_curcop = ocurcop;
3550 if (!st->logical) {
3551 /* /(?{...})/ */
3552 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3553 break;
3554 }
8e5e9ebe 3555 }
4aabdb9b
DM
3556 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3557 regexp *re;
4aabdb9b 3558 {
4f639d21
DM
3559 /* extract RE object from returned value; compiling if
3560 * necessary */
3561
6136c704 3562 MAGIC *mg = NULL;
be8e71aa 3563 const SV *sv;
faf82a0b
AE
3564 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3565 mg = mg_find(sv, PERL_MAGIC_qr);
3566 else if (SvSMAGICAL(ret)) {
3567 if (SvGMAGICAL(ret))
3568 sv_unmagic(ret, PERL_MAGIC_qr);
3569 else
3570 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3571 }
faf82a0b 3572
0f5d15d6
IZ
3573 if (mg) {
3574 re = (regexp *)mg->mg_obj;
df0003d4 3575 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3576 }
3577 else {
3578 STRLEN len;
6136c704 3579 const char * const t = SvPV_const(ret, len);
0f5d15d6 3580 PMOP pm;
a3b680e6 3581 const I32 osize = PL_regsize;
0f5d15d6 3582
5fcd1c1b 3583 Zero(&pm, 1, PMOP);
4aabdb9b 3584 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3585 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3586 if (!(SvFLAGS(ret)
faf82a0b
AE
3587 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3588 | SVs_GMG)))
14befaf4
DM
3589 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3590 PERL_MAGIC_qr,0,0);
0f5d15d6 3591 PL_regsize = osize;
0f5d15d6 3592 }
4aabdb9b 3593 }
aa283a38
DM
3594
3595 /* run the pattern returned from (??{...}) */
3596
4aabdb9b
DM
3597 DEBUG_EXECUTE_r(
3598 PerlIO_printf(Perl_debug_log,
3599 "Entering embedded \"%s%.60s%s%s\"\n",
3600 PL_colors[0],
3601 re->precomp,
3602 PL_colors[1],
3603 (strlen(re->precomp) > 60 ? "..." : ""))
3604 );
2c2d71f5 3605
40a82448
DM
3606 ST.cp = regcppush(0); /* Save *all* the positions. */
3607 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3608 *PL_reglastparen = 0;
3609 *PL_reglastcloseparen = 0;
4aabdb9b 3610 PL_reginput = locinput;
4aabdb9b
DM
3611
3612 /* XXXX This is too dramatic a measure... */
3613 PL_reg_maxiter = 0;
3614
5d9a96ca 3615 st->logical = 0;
40a82448 3616 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3617 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3618 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3619 ST.prev_rex = rex;
aa283a38 3620 rex = re;
aa283a38 3621
40a82448 3622 ST.B = next;
aa283a38 3623 /* now continue from first node in postoned RE */
40a82448 3624 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3625 /* NOTREACHED */
a0ed51b3 3626 }
4aabdb9b
DM
3627 /* /(?(?{...})X|Y)/ */
3628 st->sw = SvTRUE(ret);
3629 st->logical = 0;
c277df42
IZ
3630 break;
3631 }
40a82448
DM
3632
3633 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3634 if (ST.toggleutf)
3635 PL_reg_flags ^= RF_utf8;
3636 ReREFCNT_dec(rex);
3637 rex = ST.prev_rex;
3638 /* XXXX This is too dramatic a measure... */
3639 PL_reg_maxiter = 0;
3640 /* Restore parens of the caller without popping the
3641 * savestack */
3642 {
3643 const I32 tmp = PL_savestack_ix;
3644 PL_savestack_ix = ST.lastcp;
3645 regcppop(rex);
3646 PL_savestack_ix = tmp;
3647 }
3648 PL_reginput = locinput;
3649 /* continue at the node following the (??{...}) */
3650 scan = ST.B;
3651 continue;
3652
3653 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3654 /* Restore state to the outer re then re-throw the failure */
3655 if (ST.toggleutf)
3656 PL_reg_flags ^= RF_utf8;
3657 ReREFCNT_dec(rex);
3658 rex = ST.prev_rex;
3659
3660 /* XXXX This is too dramatic a measure... */
3661 PL_reg_maxiter = 0;
3662
3663 PL_reginput = locinput;
3664 REGCP_UNWIND(ST.lastcp);
3665 regcppop(rex);
3666 sayNO_SILENT;
3667
3668#undef ST
3669
a0d0e21e 3670 case OPEN:
c277df42 3671 n = ARG(scan); /* which paren pair */
3280af22
NIS
3672 PL_reg_start_tmp[n] = locinput;
3673 if (n > PL_regsize)
3674 PL_regsize = n;
a0d0e21e
LW
3675 break;
3676 case CLOSE:
c277df42 3677 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3678 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3679 PL_regendp[n] = locinput - PL_bostr;
eb160463 3680 if (n > (I32)*PL_reglastparen)
3280af22 3681 *PL_reglastparen = n;
a01268b5 3682 *PL_reglastcloseparen = n;
a0d0e21e 3683 break;
c277df42
IZ
3684 case GROUPP:
3685 n = ARG(scan); /* which paren pair */
5d9a96ca 3686 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3687 break;
3688 case IFTHEN:
2c2d71f5 3689 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3690 if (st->sw)
c277df42
IZ
3691 next = NEXTOPER(NEXTOPER(scan));
3692 else {
3693 next = scan + ARG(scan);
3694 if (OP(next) == IFTHEN) /* Fake one. */
3695 next = NEXTOPER(NEXTOPER(next));
3696 }
3697 break;
3698 case LOGICAL:
5d9a96ca 3699 st->logical = scan->flags;
c277df42 3700 break;
2ab05381 3701/*******************************************************************
a0374537
DM
3702 cc points to the regmatch_state associated with the most recent CURLYX.
3703 This struct contains info about the innermost (...)* loop (an
3704 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3705
3706 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3707
95b24440 3708 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3709
a0374537 3710 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3711 with the starting point at WHILEM node;
2ab05381
IZ
3712
3713 3) Each hit of WHILEM node tries to match A and Z (in the order
3714 depending on the current iteration, min/max of {min,max} and
3715 greediness). The information about where are nodes for "A"
a0374537 3716 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3717 was already matched, and greediness.
3718
3719 4) After A matches, the same WHILEM node is hit again.
3720
95b24440 3721 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3722 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3723 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3724 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3725 of the external loop.
3726
a0374537 3727 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3728 and whatever it mentions via ->next, and additional attached trees
3729 corresponding to temporarily unset infoblocks as in "5" above.
3730
95b24440 3731 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3732 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3733 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3734 infoblocks are drawn below the "reset" infoblock.
3735
3736 In fact in the picture below we do not show failed matches for Z and T
3737 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3738 more obvious *why* one needs to *temporary* unset infoblocks.]
3739
3740 Matched REx position InfoBlocks Comment
3741 (Y(A)*?Z)*?T x
3742 Y(A)*?Z)*?T x <- O
3743 Y (A)*?Z)*?T x <- O
3744 Y A)*?Z)*?T x <- O <- I
3745 YA )*?Z)*?T x <- O <- I
3746 YA A)*?Z)*?T x <- O <- I
3747 YAA )*?Z)*?T x <- O <- I
3748 YAA Z)*?T x <- O # Temporary unset I
3749 I
3750
3751 YAAZ Y(A)*?Z)*?T x <- O
3752 I
3753
3754 YAAZY (A)*?Z)*?T x <- O
3755 I
3756
3757 YAAZY A)*?Z)*?T x <- O <- I
3758 I
3759
3760 YAAZYA )*?Z)*?T x <- O <- I
3761 I
3762
3763 YAAZYA Z)*?T x <- O # Temporary unset I
3764 I,I
3765
3766 YAAZYAZ )*?T x <- O
3767 I,I
3768
3769 YAAZYAZ T x # Temporary unset O
3770 O
3771 I,I
3772
3773 YAAZYAZT x
3774 O
3775 I,I
3776 *******************************************************************/
95b24440 3777
a0d0e21e 3778 case CURLYX: {
cb434fcc
IZ
3779 /* No need to save/restore up to this paren */
3780 I32 parenfloor = scan->flags;
c277df42 3781
c2b7afd3
NC
3782 /* Dave says:
3783
3784 CURLYX and WHILEM are always paired: they're the moral
3785 equivalent of pp_enteriter anbd pp_iter.
3786
3787 The only time next could be null is if the node tree is
3788 corrupt. This was mentioned on p5p a few days ago.
3789
3790 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3791 So we'll assert that this is true:
3792 */
3793 assert(next);
30b2893d 3794 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3795 next += ARG(next);
cb434fcc
IZ
3796 /* XXXX Probably it is better to teach regpush to support
3797 parenfloor > PL_regsize... */
eb160463 3798 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3799 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3800
d8319b27
DM
3801 st->u.curlyx.cp = PL_savestack_ix;
3802 st->u.curlyx.outercc = st->cc;
a0374537
DM
3803 st->cc = st;
3804 /* these fields contain the state of the current curly.
3805 * they are accessed by subsequent WHILEMs;
3806 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3807 st->u.curlyx.parenfloor = parenfloor;
3808 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3809 st->u.curlyx.min = ARG1(scan);
3810 st->u.curlyx.max = ARG2(scan);
3811 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3812 st->u.curlyx.lastloc = 0;
a0374537
DM
3813 /* st->next and st->minmod are also read by WHILEM */
3814
3280af22 3815 PL_reginput = locinput;
95b24440
DM
3816 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3817 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3818 regcpblow(st->u.curlyx.cp);
3819 st->cc = st->u.curlyx.outercc;
95b24440 3820 saySAME(result);
a0d0e21e 3821 }
5f66b61c 3822 /* NOTREACHED */
a0d0e21e
LW
3823 case WHILEM: {
3824 /*
3825 * This is really hard to understand, because after we match
3826 * what we're trying to match, we must make sure the rest of
2c2d71f5 3827 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3828 * to go back UP the parse tree by recursing ever deeper. And
3829 * if it fails, we have to reset our parent's current state
3830 * that we can try again after backing off.
3831 */
3832
c2b7afd3
NC
3833 /* Dave says:
3834
3835 st->cc gets initialised by CURLYX ready for use by WHILEM.
3836 So again, unless somethings been corrupted, st->cc cannot
3837 be null at that point in WHILEM.
3838
3839 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3840 So we'll assert that this is true:
3841 */
3842 assert(st->cc);
d8319b27
DM
3843 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3844 st->u.whilem.cache_offset = 0;
3845 st->u.whilem.cache_bit = 0;
c277df42 3846
d8319b27 3847 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3848 PL_reginput = locinput;
a0d0e21e 3849
a3621e74 3850 DEBUG_EXECUTE_r(
9041c2e3 3851 PerlIO_printf(Perl_debug_log,
91f3b821 3852 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3853 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3854 (long)n, (long)st->cc->u.curlyx.min,
3855 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3856 );
4633a7c4 3857
a0d0e21e
LW
3858 /* If degenerate scan matches "", assume scan done. */
3859
d8319b27
DM
3860 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3861 st->u.whilem.savecc = st->cc;
3862 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3863 if (st->cc)
d8319b27 3864 st->ln = st->cc->u.curlyx.cur;
a3621e74 3865 DEBUG_EXECUTE_r(
c3464db5
DD
3866 PerlIO_printf(Perl_debug_log,
3867 "%*s empty match detected, try continuation...\n",
3280af22 3868 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3869 );
d8319b27 3870 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3871 /*** all unsaved local vars undefined at this point */
d8319b27 3872 st->cc = st->u.whilem.savecc;
95b24440 3873 if (result)
4633a7c4 3874 sayYES;
d8319b27
DM
3875 if (st->cc->u.curlyx.outercc)
3876 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3877 sayNO;
a0d0e21e
LW
3878 }
3879
3880 /* First just match a string of min scans. */
3881
d8319b27
DM
3882 if (n < st->cc->u.curlyx.min) {
3883 st->cc->u.curlyx.cur = n;
3884 st->cc->u.curlyx.lastloc = locinput;
3885 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3886 /*** all unsaved local vars undefined at this point */
3887 if (result)
4633a7c4 3888 sayYES;
d8319b27
DM
3889 st->cc->u.curlyx.cur = n - 1;
3890 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3891 sayNO;
a0d0e21e
LW
3892 }
3893
2c2d71f5
JH
3894 if (scan->flags) {
3895 /* Check whether we already were at this position.
3896 Postpone detection until we know the match is not
3897 *that* much linear. */
3898 if (!PL_reg_maxiter) {
3899 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3900 /* possible overflow for long strings and many CURLYX's */
3901 if (PL_reg_maxiter < 0)
3902 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3903 PL_reg_leftiter = PL_reg_maxiter;
3904 }
3905 if (PL_reg_leftiter-- == 0) {
a3b680e6 3906 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3907 if (PL_reg_poscache) {
eb160463 3908 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3909 Renew(PL_reg_poscache, size, char);
3910 PL_reg_poscache_size = size;
3911 }
3912 Zero(PL_reg_poscache, size, char);
3913 }
3914 else {
3915 PL_reg_poscache_size = size;
a02a5408 3916 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3917 }
a3621e74 3918 DEBUG_EXECUTE_r(
2c2d71f5
JH
3919 PerlIO_printf(Perl_debug_log,
3920 "%sDetected a super-linear match, switching on caching%s...\n",
3921 PL_colors[4], PL_colors[5])
3922 );
3923 }
3924 if (PL_reg_leftiter < 0) {
d8319b27 3925 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3926
d8319b27
DM
3927 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3928 + st->u.whilem.cache_offset * (scan->flags>>4);
3929 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3930 st->u.whilem.cache_offset /= 8;
3931 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3932 DEBUG_EXECUTE_r(
2c2d71f5
JH
3933 PerlIO_printf(Perl_debug_log,
3934 "%*s already tried at this position...\n",
3935 REPORT_CODE_OFF+PL_regindent*2, "")
3936 );
3ab3c9b4
HS
3937 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3938 /* cache records success */
c2b0868c
HS
3939 sayYES;
3940 else
3ab3c9b4 3941 /* cache records failure */
c2b0868c 3942 sayNO_SILENT;
2c2d71f5 3943 }
2c2d71f5
JH
3944 }
3945 }
3946
a0d0e21e
LW
3947 /* Prefer next over scan for minimal matching. */
3948
5d9a96ca 3949 if (st->cc->minmod) {
d8319b27
DM
3950 st->u.whilem.savecc = st->cc;
3951 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3952 if (st->cc)
d8319b27
DM
3953 st->ln = st->cc->u.curlyx.cur;
3954 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3955 REGCP_SET(st->u.whilem.lastcp);
3956 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3957 /*** all unsaved local vars undefined at this point */
d8319b27 3958 st->cc = st->u.whilem.savecc;
95b24440 3959 if (result) {
d8319b27 3960 regcpblow(st->u.whilem.cp);
3ab3c9b4 3961 CACHEsayYES; /* All done. */
5f05dabc 3962 }
d8319b27 3963 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3964 regcppop(rex);
d8319b27
DM
3965 if (st->cc->u.curlyx.outercc)
3966 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3967
d8319b27 3968 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3969 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3970 && !(PL_reg_flags & RF_warned)) {
3971 PL_reg_flags |= RF_warned;
9014280d 3972 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3973 "Complex regular subexpression recursion",
3974 REG_INFTY - 1);
c277df42 3975 }
3ab3c9b4 3976 CACHEsayNO;
c277df42 3977 }
a687059c 3978
a3621e74 3979 DEBUG_EXECUTE_r(
c3464db5
DD
3980 PerlIO_printf(Perl_debug_log,
3981 "%*s trying longer...\n",
3280af22 3982 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3983 );
a0d0e21e 3984 /* Try scanning more and see if it helps. */
3280af22 3985 PL_reginput = locinput;
d8319b27
DM
3986 st->cc->u.curlyx.cur = n;
3987 st->cc->u.curlyx.lastloc = locinput;
3988 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3989 REGCP_SET(st->u.whilem.lastcp);
3990 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3991 /*** all unsaved local vars undefined at this point */
3992 if (result) {
d8319b27 3993 regcpblow(st->u.whilem.cp);
3ab3c9b4 3994 CACHEsayYES;
5f05dabc 3995 }
d8319b27 3996 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3997 regcppop(rex);
d8319b27
DM
3998 st->cc->u.curlyx.cur = n - 1;
3999 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4000 CACHEsayNO;
a0d0e21e
LW
4001 }
4002
4003 /* Prefer scan over next for maximal matching. */
4004
d8319b27
DM
4005 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
4006 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
4007 st->cc->u.curlyx.cur = n;
4008 st->cc->u.curlyx.lastloc = locinput;
4009 REGCP_SET(st->u.whilem.lastcp);
4010 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
4011 /*** all unsaved local vars undefined at this point */
4012 if (result) {
d8319b27 4013 regcpblow(st->u.whilem.cp);
3ab3c9b4 4014 CACHEsayYES;
5f05dabc 4015 }
d8319b27 4016 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 4017 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 4018 PL_reginput = locinput;
a3621e74 4019 DEBUG_EXECUTE_r(
c3464db5
DD
4020 PerlIO_printf(Perl_debug_log,
4021 "%*s failed, try continuation...\n",
3280af22 4022 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4023 );
4024 }
9041c2e3 4025 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4026 && !(PL_reg_flags & RF_warned)) {
3280af22 4027 PL_reg_flags |= RF_warned;
9014280d 4028 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4029 "Complex regular subexpression recursion",
4030 REG_INFTY - 1);
a0d0e21e
LW
4031 }
4032
4033 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4034 st->u.whilem.savecc = st->cc;
4035 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4036 if (st->cc)
d8319b27
DM
4037 st->ln = st->cc->u.curlyx.cur;
4038 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4039 /*** all unsaved local vars undefined at this point */
d8319b27 4040 st->cc = st->u.whilem.savecc;
95b24440 4041 if (result)
3ab3c9b4 4042 CACHEsayYES;
d8319b27
DM
4043 if (st->cc->u.curlyx.outercc)
4044 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4045 st->cc->u.curlyx.cur = n - 1;
4046 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4047 CACHEsayNO;
a0d0e21e 4048 }
5f66b61c 4049 /* NOTREACHED */
40a82448
DM
4050
4051#undef ST
4052#define ST st->u.branch
4053
4054 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4055 next = scan + ARG(scan);
4056 if (next == scan)
4057 next = NULL;
40a82448
DM
4058 scan = NEXTOPER(scan);
4059 /* FALL THROUGH */
c277df42 4060
40a82448
DM
4061 case BRANCH: /* /(...|A|...)/ */
4062 scan = NEXTOPER(scan); /* scan now points to inner node */
4063 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4064 /* last branch; skip state push and jump direct to node */
4065 continue;
4066 ST.lastparen = *PL_reglastparen;
4067 ST.next_branch = next;
4068 REGCP_SET(ST.cp);
4069 PL_reginput = locinput;
02db2b7b 4070
40a82448
DM
4071 /* Now go into the branch */
4072 PUSH_STATE_GOTO(BRANCH_next, scan);
4073 /* NOTREACHED */
4074
4075 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4076 REGCP_UNWIND(ST.cp);
4077 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4078 PL_regendp[n] = -1;
4079 *PL_reglastparen = n;
4080 scan = ST.next_branch;
4081 /* no more branches? */
4082 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4083 sayNO;
4084 continue; /* execute next BRANCH[J] op */
4085 /* NOTREACHED */
4086
a0d0e21e 4087 case MINMOD:
5d9a96ca 4088 st->minmod = 1;
a0d0e21e 4089 break;
40a82448
DM
4090
4091#undef ST
4092#define ST st->u.curlym
4093
4094 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4095
4096 /* This is an optimisation of CURLYX that enables us to push
4097 * only a single backtracking state, no matter now many matches
4098 * there are in {m,n}. It relies on the pattern being constant
4099 * length, with no parens to influence future backrefs
4100 */
4101
4102 ST.me = scan;
dc45a647 4103 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4104
4105 /* if paren positive, emulate an OPEN/CLOSE around A */
4106 if (ST.me->flags) {
4107 I32 paren = ST.me->flags;
4108 if (paren > PL_regsize)
4109 PL_regsize = paren;
4110 if (paren > (I32)*PL_reglastparen)
4111 *PL_reglastparen = paren;
c277df42 4112 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4113 }
40a82448
DM
4114 ST.A = scan;
4115 ST.B = next;
4116 ST.alen = 0;
4117 ST.count = 0;
4118 ST.minmod = st->minmod;
4119 st->minmod = 0;
4120 ST.c1 = CHRTEST_UNINIT;
4121 REGCP_SET(ST.cp);
6407bf3b 4122
40a82448
DM
4123 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4124 goto curlym_do_B;
4125
4126 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4127 PL_reginput = locinput;
40a82448
DM
4128 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4129 /* NOTREACHED */
5f80c4cf 4130
40a82448
DM
4131 case CURLYM_A: /* we've just matched an A */
4132 locinput = st->locinput;
4133 nextchr = UCHARAT(locinput);
4134
4135 ST.count++;
4136 /* after first match, determine A's length: u.curlym.alen */
4137 if (ST.count == 1) {
4138 if (PL_reg_match_utf8) {
4139 char *s = locinput;
4140 while (s < PL_reginput) {
4141 ST.alen++;
4142 s += UTF8SKIP(s);
4143 }
4144 }
4145 else {
4146 ST.alen = PL_reginput - locinput;
4147 }
4148 if (ST.alen == 0)
4149 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4150 }
0cadcf80
DM
4151 DEBUG_EXECUTE_r(
4152 PerlIO_printf(Perl_debug_log,
40a82448 4153 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 4154 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 4155 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4156 );
4157
40a82448
DM
4158 locinput = PL_reginput;
4159 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4160 goto curlym_do_A; /* try to match another A */
4161 goto curlym_do_B; /* try to match B */
5f80c4cf 4162
40a82448
DM
4163 case CURLYM_A_fail: /* just failed to match an A */
4164 REGCP_UNWIND(ST.cp);
4165 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4166 sayNO;
0cadcf80 4167
40a82448
DM
4168 curlym_do_B: /* execute the B in /A{m,n}B/ */
4169 PL_reginput = locinput;
4170 if (ST.c1 == CHRTEST_UNINIT) {
4171 /* calculate c1 and c2 for possible match of 1st char
4172 * following curly */
4173 ST.c1 = ST.c2 = CHRTEST_VOID;
4174 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4175 regnode *text_node = ST.B;
4176 if (! HAS_TEXT(text_node))
4177 FIND_NEXT_IMPT(text_node);
4178 if (HAS_TEXT(text_node)
4179 && PL_regkind[OP(text_node)] != REF)
4180 {
4181 ST.c1 = (U8)*STRING(text_node);
4182 ST.c2 =
4183 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4184 ? PL_fold[ST.c1]
4185 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4186 ? PL_fold_locale[ST.c1]
4187 : ST.c1;
c277df42 4188 }
c277df42 4189 }
40a82448
DM
4190 }
4191
4192 DEBUG_EXECUTE_r(
4193 PerlIO_printf(Perl_debug_log,
4194 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4195 (int)(REPORT_CODE_OFF+PL_regindent*2),
4196 "", (IV)ST.count)
4197 );
4198 if (ST.c1 != CHRTEST_VOID
4199 && UCHARAT(PL_reginput) != ST.c1
4200 && UCHARAT(PL_reginput) != ST.c2)
4201 {
4202 /* simulate B failing */
4203 state_num = CURLYM_B_fail;
4204 goto reenter_switch;
4205 }
4206
4207 if (ST.me->flags) {
4208 /* mark current A as captured */
4209 I32 paren = ST.me->flags;
4210 if (ST.count) {
4211 PL_regstartp[paren]
4212 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4213 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4214 }
40a82448
DM
4215 else
4216 PL_regendp[paren] = -1;
c277df42 4217 }
40a82448 4218 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4219 /* NOTREACHED */
40a82448
DM
4220
4221 case CURLYM_B_fail: /* just failed to match a B */
4222 REGCP_UNWIND(ST.cp);
4223 if (ST.minmod) {
4224 if (ST.count == ARG2(ST.me) /* max */)
4225 sayNO;
4226 goto curlym_do_A; /* try to match a further A */
4227 }
4228 /* backtrack one A */
4229 if (ST.count == ARG1(ST.me) /* min */)
4230 sayNO;
4231 ST.count--;
4232 locinput = HOPc(locinput, -ST.alen);
4233 goto curlym_do_B; /* try to match B */
4234
c255a977
DM
4235#undef ST
4236#define ST st->u.curly
40a82448 4237
c255a977
DM
4238#define CURLY_SETPAREN(paren, success) \
4239 if (paren) { \
4240 if (success) { \
4241 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4242 PL_regendp[paren] = locinput - PL_bostr; \
4243 } \
4244 else \
4245 PL_regendp[paren] = -1; \
4246 }
4247
4248 case STAR: /* /A*B/ where A is width 1 */
4249 ST.paren = 0;
4250 ST.min = 0;
4251 ST.max = REG_INFTY;
a0d0e21e
LW
4252 scan = NEXTOPER(scan);
4253 goto repeat;
c255a977
DM
4254 case PLUS: /* /A+B/ where A is width 1 */
4255 ST.paren = 0;
4256 ST.min = 1;
4257 ST.max = REG_INFTY;
c277df42 4258 scan = NEXTOPER(scan);
c255a977
DM
4259 goto repeat;
4260 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4261 ST.paren = scan->flags; /* Which paren to set */
4262 if (ST.paren > PL_regsize)
4263 PL_regsize = ST.paren;
4264 if (ST.paren > (I32)*PL_reglastparen)
4265 *PL_reglastparen = ST.paren;
4266 ST.min = ARG1(scan); /* min to match */
4267 ST.max = ARG2(scan); /* max to match */
4268 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4269 goto repeat;
4270 case CURLY: /* /A{m,n}B/ where A is width 1 */
4271 ST.paren = 0;
4272 ST.min = ARG1(scan); /* min to match */
4273 ST.max = ARG2(scan); /* max to match */
4274 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4275 repeat:
a0d0e21e
LW
4276 /*
4277 * Lookahead to avoid useless match attempts
4278 * when we know what character comes next.
c255a977 4279 *
5f80c4cf
JP
4280 * Used to only do .*x and .*?x, but now it allows
4281 * for )'s, ('s and (?{ ... })'s to be in the way
4282 * of the quantifier and the EXACT-like node. -- japhy
4283 */
4284
c255a977
DM
4285 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4286 sayNO;
cca55fe3 4287 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4288 U8 *s;
4289 regnode *text_node = next;
4290
3dab1dad
YO
4291 if (! HAS_TEXT(text_node))
4292 FIND_NEXT_IMPT(text_node);
5f80c4cf 4293
9e137952 4294 if (! HAS_TEXT(text_node))
c255a977 4295 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4296 else {
3dab1dad 4297 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4298 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4299 goto assume_ok_easy;
cca55fe3 4300 }
be8e71aa
YO
4301 else
4302 s = (U8*)STRING(text_node);
5f80c4cf
JP
4303
4304 if (!UTF) {
c255a977 4305 ST.c2 = ST.c1 = *s;
f65d3ee7 4306 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4307 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4308 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4309 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4310 }
5f80c4cf 4311 else { /* UTF */
f65d3ee7 4312 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4313 STRLEN ulen1, ulen2;
89ebb4a3
JH
4314 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4315 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4316
4317 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4318 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4319
c255a977 4320 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4321 uniflags);
c255a977 4322 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4323 uniflags);
5f80c4cf
JP
4324 }
4325 else {
c255a977 4326 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4327 uniflags);
5f80c4cf 4328 }
1aa99e6b
IH
4329 }
4330 }
bbce6d69 4331 }
a0d0e21e 4332 else
c255a977 4333 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4334 assume_ok_easy:
c255a977
DM
4335
4336 ST.A = scan;
4337 ST.B = next;
3280af22 4338 PL_reginput = locinput;
5d9a96ca
DM
4339 if (st->minmod) {
4340 st->minmod = 0;
c255a977 4341 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4342 sayNO;
c255a977 4343 ST.count = ST.min;
a0ed51b3 4344 locinput = PL_reginput;
c255a977
DM
4345 REGCP_SET(ST.cp);
4346 if (ST.c1 == CHRTEST_VOID)
4347 goto curly_try_B_min;
4348
4349 ST.oldloc = locinput;
4350
4351 /* set ST.maxpos to the furthest point along the
4352 * string that could possibly match */
4353 if (ST.max == REG_INFTY) {
4354 ST.maxpos = PL_regeol - 1;
4355 if (do_utf8)
4356 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4357 ST.maxpos--;
4358 }
4359 else if (do_utf8) {
4360 int m = ST.max - ST.min;
4361 for (ST.maxpos = locinput;
4362 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4363 ST.maxpos += UTF8SKIP(ST.maxpos);
4364 }
4365 else {
4366 ST.maxpos = locinput + ST.max - ST.min;
4367 if (ST.maxpos >= PL_regeol)
4368 ST.maxpos = PL_regeol - 1;
4369 }
4370 goto curly_try_B_min_known;
4371
4372 }
4373 else {
4374 ST.count = regrepeat(rex, ST.A, ST.max);
4375 locinput = PL_reginput;
4376 if (ST.count < ST.min)
4377 sayNO;
4378 if ((ST.count > ST.min)
4379 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4380 {
4381 /* A{m,n} must come at the end of the string, there's
4382 * no point in backing off ... */
4383 ST.min = ST.count;
4384 /* ...except that $ and \Z can match before *and* after
4385 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4386 We may back off by one in this case. */
4387 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4388 ST.min--;
4389 }
4390 REGCP_SET(ST.cp);
4391 goto curly_try_B_max;
4392 }
4393 /* NOTREACHED */
4394
4395
4396 case CURLY_B_min_known_fail:
4397 /* failed to find B in a non-greedy match where c1,c2 valid */
4398 if (ST.paren && ST.count)
4399 PL_regendp[ST.paren] = -1;
4400
4401 PL_reginput = locinput; /* Could be reset... */
4402 REGCP_UNWIND(ST.cp);
4403 /* Couldn't or didn't -- move forward. */
4404 ST.oldloc = locinput;
4405 if (do_utf8)
4406 locinput += UTF8SKIP(locinput);
4407 else
4408 locinput++;
4409 ST.count++;
4410 curly_try_B_min_known:
4411 /* find the next place where 'B' could work, then call B */
4412 {
4413 int n;
4414 if (do_utf8) {
4415 n = (ST.oldloc == locinput) ? 0 : 1;
4416 if (ST.c1 == ST.c2) {
4417 STRLEN len;
4418 /* set n to utf8_distance(oldloc, locinput) */
4419 while (locinput <= ST.maxpos &&
4420 utf8n_to_uvchr((U8*)locinput,
4421 UTF8_MAXBYTES, &len,
4422 uniflags) != (UV)ST.c1) {
4423 locinput += len;
4424 n++;
4425 }
1aa99e6b
IH
4426 }
4427 else {
c255a977
DM
4428 /* set n to utf8_distance(oldloc, locinput) */
4429 while (locinput <= ST.maxpos) {
4430 STRLEN len;
4431 const UV c = utf8n_to_uvchr((U8*)locinput,
4432 UTF8_MAXBYTES, &len,
4433 uniflags);
4434 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4435 break;
4436 locinput += len;
4437 n++;
1aa99e6b 4438 }
0fe9bf95
IZ
4439 }
4440 }
c255a977
DM
4441 else {
4442 if (ST.c1 == ST.c2) {
4443 while (locinput <= ST.maxpos &&
4444 UCHARAT(locinput) != ST.c1)
4445 locinput++;
bbce6d69 4446 }
c255a977
DM
4447 else {
4448 while (locinput <= ST.maxpos
4449 && UCHARAT(locinput) != ST.c1
4450 && UCHARAT(locinput) != ST.c2)
4451 locinput++;
a0ed51b3 4452 }
c255a977
DM
4453 n = locinput - ST.oldloc;
4454 }
4455 if (locinput > ST.maxpos)
4456 sayNO;
4457 /* PL_reginput == oldloc now */
4458 if (n) {
4459 ST.count += n;
4460 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4461 sayNO;
a0d0e21e 4462 }
c255a977
DM
4463 PL_reginput = locinput;
4464 CURLY_SETPAREN(ST.paren, ST.count);
4465 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4466 }
c255a977
DM
4467 /* NOTREACHED */
4468
4469
4470 case CURLY_B_min_fail:
4471 /* failed to find B in a non-greedy match where c1,c2 invalid */
4472 if (ST.paren && ST.count)
4473 PL_regendp[ST.paren] = -1;
4474
4475 REGCP_UNWIND(ST.cp);
4476 /* failed -- move forward one */
4477 PL_reginput = locinput;
4478 if (regrepeat(rex, ST.A, 1)) {
4479 ST.count++;
a0ed51b3 4480 locinput = PL_reginput;
c255a977
DM
4481 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4482 ST.count > 0)) /* count overflow ? */
15272685 4483 {
c255a977
DM
4484 curly_try_B_min:
4485 CURLY_SETPAREN(ST.paren, ST.count);
4486 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4487 }
4488 }
4633a7c4 4489 sayNO;
c255a977
DM
4490 /* NOTREACHED */
4491
4492
4493 curly_try_B_max:
4494 /* a successful greedy match: now try to match B */
4495 {
4496 UV c = 0;
4497 if (ST.c1 != CHRTEST_VOID)
4498 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4499 UTF8_MAXBYTES, 0, uniflags)
466787eb 4500 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4501 /* If it could work, try it. */
4502 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4503 CURLY_SETPAREN(ST.paren, ST.count);
4504 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4505 /* NOTREACHED */
4506 }
4507 }
4508 /* FALL THROUGH */
4509 case CURLY_B_max_fail:
4510 /* failed to find B in a greedy match */
4511 if (ST.paren && ST.count)
4512 PL_regendp[ST.paren] = -1;
4513
4514 REGCP_UNWIND(ST.cp);
4515 /* back up. */
4516 if (--ST.count < ST.min)
4517 sayNO;
4518 PL_reginput = locinput = HOPc(locinput, -1);
4519 goto curly_try_B_max;
4520
4521#undef ST
4522
4523
a0d0e21e 4524 case END:
3b0527fe 4525 if (locinput < reginfo->till) {
a3621e74 4526 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4527 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4528 PL_colors[4],
4529 (long)(locinput - PL_reg_starttry),
3b0527fe 4530 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4531 PL_colors[5]));
4532 sayNO_FINAL; /* Cannot match: too short. */
4533 }
4534 PL_reginput = locinput; /* put where regtry can find it */
4535 sayYES_FINAL; /* Success! */
dad79028
DM
4536
4537 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4538 DEBUG_EXECUTE_r(
4539 PerlIO_printf(Perl_debug_log,
4540 "%*s %ssubpattern success...%s\n",
4541 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4542 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4543 sayYES_FINAL; /* Success! */
4544
40a82448
DM
4545#undef ST
4546#define ST st->u.ifmatch
4547
4548 case SUSPEND: /* (?>A) */
4549 ST.wanted = 1;
9fe1d20c 4550 PL_reginput = locinput;
9041c2e3 4551 goto do_ifmatch;
dad79028 4552
40a82448
DM
4553 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4554 ST.wanted = 0;
dad79028
DM
4555 goto ifmatch_trivial_fail_test;
4556
40a82448
DM
4557 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4558 ST.wanted = 1;
dad79028 4559 ifmatch_trivial_fail_test:
a0ed51b3 4560 if (scan->flags) {
52657f30 4561 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4562 if (!s) {
4563 /* trivial fail */
4564 if (st->logical) {
4565 st->logical = 0;
40a82448 4566 st->sw = 1 - (bool)ST.wanted;
dad79028 4567 }
40a82448 4568 else if (ST.wanted)
dad79028
DM
4569 sayNO;
4570 next = scan + ARG(scan);
4571 if (next == scan)
4572 next = NULL;
4573 break;
4574 }
efb30f32 4575 PL_reginput = s;
a0ed51b3
LW
4576 }
4577 else
4578 PL_reginput = locinput;
4579
c277df42 4580 do_ifmatch:
40a82448
DM
4581 ST.me = scan;
4582 /* execute body of (?...A) */
4583 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4584 /* NOTREACHED */
4585
4586 case IFMATCH_A_fail: /* body of (?...A) failed */
4587 ST.wanted = !ST.wanted;
4588 /* FALL THROUGH */
4589
4590 case IFMATCH_A: /* body of (?...A) succeeded */
4591 if (st->logical) {
4592 st->logical = 0;
4593 st->sw = (bool)ST.wanted;
4594 }
4595 else if (!ST.wanted)
4596 sayNO;
4597
4598 if (OP(ST.me) == SUSPEND)
4599 locinput = PL_reginput;
4600 else {
4601 locinput = PL_reginput = st->locinput;
4602 nextchr = UCHARAT(locinput);
4603 }
4604 scan = ST.me + ARG(ST.me);
4605 if (scan == ST.me)
4606 scan = NULL;
4607 continue; /* execute B */
4608
4609#undef ST
dad79028 4610
c277df42 4611 case LONGJMP:
c277df42
IZ
4612 next = scan + ARG(scan);
4613 if (next == scan)
4614 next = NULL;
a0d0e21e
LW
4615 break;
4616 default:
b900a521 4617 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4618 PTR2UV(scan), OP(scan));
cea2e8a9 4619 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4620 }
95b24440 4621
a0d0e21e 4622 scan = next;
95b24440
DM
4623 continue;
4624 /* NOTREACHED */
4625
40a82448
DM
4626 push_yes_state:
4627 /* push a state that backtracks on success */
4628 st->u.yes.prev_yes_state = yes_state;
4629 yes_state = st;
4630 /* FALL THROUGH */
4631 push_state:
4632 /* push a new regex state, then continue at scan */
4633 {
4634 regmatch_state *newst;
4635
4636 depth++;
4637 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4638 "PUSH STATE(%d)\n", depth));
4639 st->locinput = locinput;
4640 newst = st+1;
4641 if (newst > SLAB_LAST(PL_regmatch_slab))
4642 newst = S_push_slab(aTHX);
4643 PL_regmatch_state = newst;
4644 newst->cc = st->cc;
4645 /* XXX probably don't need to initialise these */
4646 newst->minmod = 0;
4647 newst->sw = 0;
4648 newst->logical = 0;
4649
4650 locinput = PL_reginput;
4651 nextchr = UCHARAT(locinput);
4652 st = newst;
4653 continue;
4654 /* NOTREACHED */
4655 }
4656
95b24440
DM
4657 /* simulate recursively calling regmatch(), but without actually
4658 * recursing - ie save the current state on the heap rather than on
4659 * the stack, then re-enter the loop. This avoids complex regexes
4660 * blowing the processor stack */
4661
4662 start_recurse:
4663 {
5d9a96ca
DM
4664 /* push new state */
4665 regmatch_state *oldst = st;
4666
4667 depth++;
40a82448 4668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4669
4670 /* grab the next free state slot */
4671 st++;
86545054 4672 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4673 st = S_push_slab(aTHX);
4674 PL_regmatch_state = st;
4675
4676 oldst->next = next;
4677 oldst->n = n;
4678 oldst->locinput = locinput;
5d9a96ca
DM
4679
4680 st->cc = oldst->cc;
95b24440
DM
4681 locinput = PL_reginput;
4682 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4683 st->minmod = 0;
4684 st->sw = 0;
4685 st->logical = 0;
95b24440
DM
4686#ifdef DEBUGGING
4687 PL_regindent++;
4688#endif
4689 }
a0d0e21e 4690 }
a687059c 4691
aa283a38
DM
4692
4693
a0d0e21e
LW
4694 /*
4695 * We get here only if there's trouble -- normally "case END" is
4696 * the terminating point.
4697 */
cea2e8a9 4698 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4699 /*NOTREACHED*/
4633a7c4
LW
4700 sayNO;
4701
7821416a 4702yes_final:
77cb431f
DM
4703
4704 if (yes_state) {
4705 /* we have successfully completed a subexpression, but we must now
4706 * pop to the state marked by yes_state and continue from there */
4707
4708 assert(st != yes_state);
4709 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4710 || yes_state > SLAB_LAST(PL_regmatch_slab))
4711 {
4712 /* not in this slab, pop slab */
4713 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4714 PL_regmatch_slab = PL_regmatch_slab->prev;
4715 st = SLAB_LAST(PL_regmatch_slab);
4716 }
4717 depth -= (st - yes_state);
40a82448
DM
4718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4719 depth+1, depth+(st - yes_state)));
77cb431f
DM
4720 st = yes_state;
4721 yes_state = st->u.yes.prev_yes_state;
4722 PL_regmatch_state = st;
4723
4724 switch (st->resume_state) {
40a82448
DM
4725 case IFMATCH_A:
4726 case CURLYM_A:
40a82448
DM
4727 case EVAL_A:
4728 state_num = st->resume_state;
4729 goto reenter_switch;
4730
4731 case CURLYM_B:
166ba7cd
DM
4732 case BRANCH_next:
4733 case TRIE_next:
c255a977 4734 case CURLY_B_max:
77cb431f 4735 default:
40a82448 4736 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4737 }
4738 }
4739
a3621e74 4740 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4741 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4742yes:
4743#ifdef DEBUGGING
3280af22 4744 PL_regindent--;
4633a7c4 4745#endif
02db2b7b 4746
95b24440 4747 result = 1;
aa283a38 4748 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4749 * will disappear when REGFMATCH goes */
aa283a38
DM
4750 if (depth) {
4751 /* restore previous state and re-enter */
40a82448
DM
4752 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4753 depth--;
4754 st--;
4755 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4756 PL_regmatch_slab = PL_regmatch_slab->prev;
4757 st = SLAB_LAST(PL_regmatch_slab);
4758 }
4759 PL_regmatch_state = st;
4760 scan = st->scan;
4761 next = st->next;
4762 n = st->n;
4763 locinput= st->locinput;
4764 nextchr = UCHARAT(locinput);
aa283a38
DM
4765
4766 switch (st->resume_state) {
aa283a38
DM
4767 case resume_CURLYX:
4768 goto resume_point_CURLYX;
4769 case resume_WHILEM1:
4770 goto resume_point_WHILEM1;
4771 case resume_WHILEM2:
4772 goto resume_point_WHILEM2;
4773 case resume_WHILEM3:
4774 goto resume_point_WHILEM3;
4775 case resume_WHILEM4:
4776 goto resume_point_WHILEM4;
4777 case resume_WHILEM5:
4778 goto resume_point_WHILEM5;
4779 case resume_WHILEM6:
4780 goto resume_point_WHILEM6;
77cb431f 4781
166ba7cd 4782 case TRIE_next:
40a82448
DM
4783 case CURLYM_A:
4784 case CURLYM_B:
4785 case EVAL_A:
4786 case IFMATCH_A:
4787 case BRANCH_next:
c255a977
DM
4788 case CURLY_B_max:
4789 case CURLY_B_min:
4790 case CURLY_B_min_known:
40a82448
DM
4791 break;
4792
aa283a38
DM
4793 default:
4794 Perl_croak(aTHX_ "regexp resume memory corruption");
4795 }
4796 }
4797 goto final_exit;
4633a7c4
LW
4798
4799no:
a3621e74 4800 DEBUG_EXECUTE_r(
7821416a
IZ
4801 PerlIO_printf(Perl_debug_log,
4802 "%*s %sfailed...%s\n",
e4584336 4803 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4804 );
7821416a
IZ
4805no_final:
4806do_no:
aa283a38 4807
4633a7c4 4808#ifdef DEBUGGING
3280af22 4809 PL_regindent--;
4633a7c4 4810#endif
95b24440 4811 result = 0;
5d9a96ca 4812
aa283a38
DM
4813 if (depth) {
4814 /* there's a previous state to backtrack to */
40a82448
DM
4815 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4816 depth--;
4817 st--;
4818 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4819 PL_regmatch_slab = PL_regmatch_slab->prev;
4820 st = SLAB_LAST(PL_regmatch_slab);
4821 }
4822 PL_regmatch_state = st;
4823 scan = st->scan;
4824 next = st->next;
4825 n = st->n;
4826 locinput= st->locinput;
4827 nextchr = UCHARAT(locinput);
4828
5d9a96ca 4829 switch (st->resume_state) {
95b24440
DM
4830 case resume_CURLYX:
4831 goto resume_point_CURLYX;
4832 case resume_WHILEM1:
4833 goto resume_point_WHILEM1;
4834 case resume_WHILEM2:
4835 goto resume_point_WHILEM2;
4836 case resume_WHILEM3:
4837 goto resume_point_WHILEM3;
4838 case resume_WHILEM4:
4839 goto resume_point_WHILEM4;
4840 case resume_WHILEM5:
4841 goto resume_point_WHILEM5;
4842 case resume_WHILEM6:
4843 goto resume_point_WHILEM6;
dad79028 4844
166ba7cd 4845 case TRIE_next:
40a82448
DM
4846 case EVAL_A:
4847 case BRANCH_next:
4848 case CURLYM_A:
4849 case CURLYM_B:
4850 case IFMATCH_A:
c255a977
DM
4851 case CURLY_B_max:
4852 case CURLY_B_min:
4853 case CURLY_B_min_known:
40a82448
DM
4854 if (yes_state == st)
4855 yes_state = st->u.yes.prev_yes_state;
4856 state_num = st->resume_state + 1; /* failure = success + 1 */
4857 goto reenter_switch;
dad79028 4858
95b24440
DM
4859 default:
4860 Perl_croak(aTHX_ "regexp resume memory corruption");
4861 }
95b24440 4862 }
aa283a38
DM
4863
4864final_exit:
4865
5d9a96ca
DM
4866 /* restore original high-water mark */
4867 PL_regmatch_slab = orig_slab;
4868 PL_regmatch_state = orig_state;
4869
4870 /* free all slabs above current one */
4871 if (orig_slab->next) {
c4fd8992 4872 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4873 orig_slab->next = NULL;
4874 while (sl) {
c4fd8992 4875 regmatch_slab * const osl = sl;
5d9a96ca 4876 sl = sl->next;
ad65c075 4877 Safefree(osl);
5d9a96ca
DM
4878 }
4879 }
4880
95b24440
DM
4881 return result;
4882
a687059c
LW
4883}
4884
4885/*
4886 - regrepeat - repeatedly match something simple, report how many
4887 */
4888/*
4889 * [This routine now assumes that it will only match on things of length 1.
4890 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4891 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4892 */
76e3520e 4893STATIC I32
32fc9b6a 4894S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4895{
27da23d5 4896 dVAR;
a0d0e21e 4897 register char *scan;
a0d0e21e 4898 register I32 c;
3280af22 4899 register char *loceol = PL_regeol;
a0ed51b3 4900 register I32 hardcount = 0;
53c4c00c 4901 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4902
3280af22 4903 scan = PL_reginput;
faf11cac
HS
4904 if (max == REG_INFTY)
4905 max = I32_MAX;
4906 else if (max < loceol - scan)
7f596f4c 4907 loceol = scan + max;
a0d0e21e 4908 switch (OP(p)) {
22c35a8c 4909 case REG_ANY:
1aa99e6b 4910 if (do_utf8) {
ffc61ed2 4911 loceol = PL_regeol;
1aa99e6b 4912 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4913 scan += UTF8SKIP(scan);
4914 hardcount++;
4915 }
4916 } else {
4917 while (scan < loceol && *scan != '\n')
4918 scan++;
a0ed51b3
LW
4919 }
4920 break;
ffc61ed2 4921 case SANY:
def8e4ea
JH
4922 if (do_utf8) {
4923 loceol = PL_regeol;
a0804c9e 4924 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4925 scan += UTF8SKIP(scan);
4926 hardcount++;
4927 }
4928 }
4929 else
4930 scan = loceol;
a0ed51b3 4931 break;
f33976b4
DB
4932 case CANY:
4933 scan = loceol;
4934 break;
090f7165
JH
4935 case EXACT: /* length of string is 1 */
4936 c = (U8)*STRING(p);
4937 while (scan < loceol && UCHARAT(scan) == c)
4938 scan++;
bbce6d69 4939 break;
4940 case EXACTF: /* length of string is 1 */
cd439c50 4941 c = (U8)*STRING(p);
bbce6d69 4942 while (scan < loceol &&
22c35a8c 4943 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4944 scan++;
4945 break;
4946 case EXACTFL: /* length of string is 1 */
3280af22 4947 PL_reg_flags |= RF_tainted;
cd439c50 4948 c = (U8)*STRING(p);
bbce6d69 4949 while (scan < loceol &&
22c35a8c 4950 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4951 scan++;
4952 break;
4953 case ANYOF:
ffc61ed2
JH
4954 if (do_utf8) {
4955 loceol = PL_regeol;
cfc92286 4956 while (hardcount < max && scan < loceol &&
32fc9b6a 4957 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4958 scan += UTF8SKIP(scan);
4959 hardcount++;
4960 }
4961 } else {
32fc9b6a 4962 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4963 scan++;
4964 }
a0d0e21e
LW
4965 break;
4966 case ALNUM:
1aa99e6b 4967 if (do_utf8) {
ffc61ed2 4968 loceol = PL_regeol;
1a4fad37 4969 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4970 while (hardcount < max && scan < loceol &&
3568d838 4971 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4972 scan += UTF8SKIP(scan);
4973 hardcount++;
4974 }
4975 } else {
4976 while (scan < loceol && isALNUM(*scan))
4977 scan++;
a0ed51b3
LW
4978 }
4979 break;
bbce6d69 4980 case ALNUML:
3280af22 4981 PL_reg_flags |= RF_tainted;
1aa99e6b 4982 if (do_utf8) {
ffc61ed2 4983 loceol = PL_regeol;
1aa99e6b
IH
4984 while (hardcount < max && scan < loceol &&
4985 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4986 scan += UTF8SKIP(scan);
4987 hardcount++;
4988 }
4989 } else {
4990 while (scan < loceol && isALNUM_LC(*scan))
4991 scan++;
a0ed51b3
LW
4992 }
4993 break;
a0d0e21e 4994 case NALNUM:
1aa99e6b 4995 if (do_utf8) {
ffc61ed2 4996 loceol = PL_regeol;
1a4fad37 4997 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4998 while (hardcount < max && scan < loceol &&
3568d838 4999 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5000 scan += UTF8SKIP(scan);
5001 hardcount++;
5002 }
5003 } else {
5004 while (scan < loceol && !isALNUM(*scan))
5005 scan++;
a0ed51b3
LW
5006 }
5007 break;
bbce6d69 5008 case NALNUML:
3280af22 5009 PL_reg_flags |= RF_tainted;
1aa99e6b 5010 if (do_utf8) {
ffc61ed2 5011 loceol = PL_regeol;
1aa99e6b
IH
5012 while (hardcount < max && scan < loceol &&
5013 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5014 scan += UTF8SKIP(scan);
5015 hardcount++;
5016 }
5017 } else {
5018 while (scan < loceol && !isALNUM_LC(*scan))
5019 scan++;
a0ed51b3
LW
5020 }
5021 break;
a0d0e21e 5022 case SPACE:
1aa99e6b 5023 if (do_utf8) {
ffc61ed2 5024 loceol = PL_regeol;
1a4fad37 5025 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5026 while (hardcount < max && scan < loceol &&
3568d838
JH
5027 (*scan == ' ' ||
5028 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5029 scan += UTF8SKIP(scan);
5030 hardcount++;
5031 }
5032 } else {
5033 while (scan < loceol && isSPACE(*scan))
5034 scan++;
a0ed51b3
LW
5035 }
5036 break;
bbce6d69 5037 case SPACEL:
3280af22 5038 PL_reg_flags |= RF_tainted;
1aa99e6b 5039 if (do_utf8) {
ffc61ed2 5040 loceol = PL_regeol;
1aa99e6b 5041 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5042 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5043 scan += UTF8SKIP(scan);
5044 hardcount++;
5045 }
5046 } else {
5047 while (scan < loceol && isSPACE_LC(*scan))
5048 scan++;
a0ed51b3
LW
5049 }
5050 break;
a0d0e21e 5051 case NSPACE:
1aa99e6b 5052 if (do_utf8) {
ffc61ed2 5053 loceol = PL_regeol;
1a4fad37 5054 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5055 while (hardcount < max && scan < loceol &&
3568d838
JH
5056 !(*scan == ' ' ||
5057 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5058 scan += UTF8SKIP(scan);
5059 hardcount++;
5060 }
5061 } else {
5062 while (scan < loceol && !isSPACE(*scan))
5063 scan++;
5064 break;
a0ed51b3 5065 }
bbce6d69 5066 case NSPACEL:
3280af22 5067 PL_reg_flags |= RF_tainted;
1aa99e6b 5068 if (do_utf8) {
ffc61ed2 5069 loceol = PL_regeol;
1aa99e6b 5070 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5071 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5072 scan += UTF8SKIP(scan);
5073 hardcount++;
5074 }
5075 } else {
5076 while (scan < loceol && !isSPACE_LC(*scan))
5077 scan++;
a0ed51b3
LW
5078 }
5079 break;
a0d0e21e 5080 case DIGIT:
1aa99e6b 5081 if (do_utf8) {
ffc61ed2 5082 loceol = PL_regeol;
1a4fad37 5083 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5084 while (hardcount < max && scan < loceol &&
3568d838 5085 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5086 scan += UTF8SKIP(scan);
5087 hardcount++;
5088 }
5089 } else {
5090 while (scan < loceol && isDIGIT(*scan))
5091 scan++;
a0ed51b3
LW
5092 }
5093 break;
a0d0e21e 5094 case NDIGIT:
1aa99e6b 5095 if (do_utf8) {
ffc61ed2 5096 loceol = PL_regeol;
1a4fad37 5097 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5098 while (hardcount < max && scan < loceol &&
3568d838 5099 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5100 scan += UTF8SKIP(scan);
5101 hardcount++;
5102 }
5103 } else {
5104 while (scan < loceol && !isDIGIT(*scan))
5105 scan++;
a0ed51b3
LW
5106 }
5107 break;
a0d0e21e
LW
5108 default: /* Called on something of 0 width. */
5109 break; /* So match right here or not at all. */
5110 }
a687059c 5111
a0ed51b3
LW
5112 if (hardcount)
5113 c = hardcount;
5114 else
5115 c = scan - PL_reginput;
3280af22 5116 PL_reginput = scan;
a687059c 5117
a3621e74 5118 DEBUG_r({
e68ec53f 5119 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5120 DEBUG_EXECUTE_r({
e68ec53f
YO
5121 SV * const prop = sv_newmortal();
5122 regprop(prog, prop, p);
5123 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5124 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5125 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5126 });
be8e71aa 5127 });
9041c2e3 5128
a0d0e21e 5129 return(c);
a687059c
LW
5130}
5131
c277df42 5132
be8e71aa 5133#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5134/*
ffc61ed2
JH
5135- regclass_swash - prepare the utf8 swash
5136*/
5137
5138SV *
32fc9b6a 5139Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5140{
97aff369 5141 dVAR;
9e55ce06
JH
5142 SV *sw = NULL;
5143 SV *si = NULL;
5144 SV *alt = NULL;
3dab1dad 5145 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5146
4f639d21 5147 if (data && data->count) {
a3b680e6 5148 const U32 n = ARG(node);
ffc61ed2 5149
4f639d21
DM
5150 if (data->what[n] == 's') {
5151 SV * const rv = (SV*)data->data[n];
890ce7af 5152 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5153 SV **const ary = AvARRAY(av);
9e55ce06 5154 SV **a, **b;
9041c2e3 5155
711a919c 5156 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5157 * documentation of these array elements. */
5158
b11f357e 5159 si = *ary;
8f7f7219 5160 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5161 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5162
ffc61ed2
JH
5163 if (a)
5164 sw = *a;
5165 else if (si && doinit) {
5166 sw = swash_init("utf8", "", si, 1, 0);
5167 (void)av_store(av, 1, sw);
5168 }
9e55ce06
JH
5169 if (b)
5170 alt = *b;
ffc61ed2
JH
5171 }
5172 }
5173
9e55ce06
JH
5174 if (listsvp)
5175 *listsvp = si;
5176 if (altsvp)
5177 *altsvp = alt;
ffc61ed2
JH
5178
5179 return sw;
5180}
76234dfb 5181#endif
ffc61ed2
JH
5182
5183/*
ba7b4546 5184 - reginclass - determine if a character falls into a character class
832705d4
JH
5185
5186 The n is the ANYOF regnode, the p is the target string, lenp
5187 is pointer to the maximum length of how far to go in the p
5188 (if the lenp is zero, UTF8SKIP(p) is used),
5189 do_utf8 tells whether the target string is in UTF-8.
5190
bbce6d69 5191 */
5192
76e3520e 5193STATIC bool
32fc9b6a 5194S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5195{
27da23d5 5196 dVAR;
a3b680e6 5197 const char flags = ANYOF_FLAGS(n);
bbce6d69 5198 bool match = FALSE;
cc07378b 5199 UV c = *p;
ae9ddab8 5200 STRLEN len = 0;
9e55ce06 5201 STRLEN plen;
1aa99e6b 5202
19f67299
TS
5203 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5204 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5205 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5206 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5207 if (len == (STRLEN)-1)
5208 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5209 }
bbce6d69 5210
0f0076b4 5211 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5212 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5213 if (lenp)
5214 *lenp = 0;
ffc61ed2 5215 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5216 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5217 match = TRUE;
bbce6d69 5218 }
3568d838 5219 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5220 match = TRUE;
ffc61ed2 5221 if (!match) {
9e55ce06 5222 AV *av;
32fc9b6a 5223 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5224
5225 if (sw) {
3568d838 5226 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5227 match = TRUE;
5228 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5229 if (!match && lenp && av) {
5230 I32 i;
9e55ce06 5231 for (i = 0; i <= av_len(av); i++) {
890ce7af 5232 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5233 STRLEN len;
890ce7af 5234 const char * const s = SvPV_const(sv, len);
9e55ce06 5235
061b10df 5236 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5237 *lenp = len;
5238 match = TRUE;
5239 break;
5240 }
5241 }
5242 }
5243 if (!match) {
89ebb4a3 5244 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5245 STRLEN tmplen;
5246
9e55ce06
JH
5247 to_utf8_fold(p, tmpbuf, &tmplen);
5248 if (swash_fetch(sw, tmpbuf, do_utf8))
5249 match = TRUE;
5250 }
ffc61ed2
JH
5251 }
5252 }
bbce6d69 5253 }
9e55ce06 5254 if (match && lenp && *lenp == 0)
0f0076b4 5255 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5256 }
1aa99e6b 5257 if (!match && c < 256) {
ffc61ed2
JH
5258 if (ANYOF_BITMAP_TEST(n, c))
5259 match = TRUE;
5260 else if (flags & ANYOF_FOLD) {
eb160463 5261 U8 f;
a0ed51b3 5262
ffc61ed2
JH
5263 if (flags & ANYOF_LOCALE) {
5264 PL_reg_flags |= RF_tainted;
5265 f = PL_fold_locale[c];
5266 }
5267 else
5268 f = PL_fold[c];
5269 if (f != c && ANYOF_BITMAP_TEST(n, f))
5270 match = TRUE;
5271 }
5272
5273 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5274 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5275 if (
5276 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5281 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5282 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5283 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5284 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5285 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5286 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5287 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5288 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5289 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5290 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5291 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5292 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5293 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5294 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5295 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5296 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5297 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5298 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5299 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5300 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5301 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5302 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5303 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5304 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5305 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5306 ) /* How's that for a conditional? */
5307 {
5308 match = TRUE;
5309 }
a0ed51b3 5310 }
a0ed51b3
LW
5311 }
5312
a0ed51b3
LW
5313 return (flags & ANYOF_INVERT) ? !match : match;
5314}
161b471a 5315
dfe13c55 5316STATIC U8 *
0ce71af7 5317S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5318{
97aff369 5319 dVAR;
a0ed51b3 5320 if (off >= 0) {
1aa99e6b 5321 while (off-- && s < lim) {
ffc61ed2 5322 /* XXX could check well-formedness here */
a0ed51b3 5323 s += UTF8SKIP(s);
ffc61ed2 5324 }
a0ed51b3
LW
5325 }
5326 else {
5327 while (off++) {
1aa99e6b 5328 if (s > lim) {
a0ed51b3 5329 s--;
ffc61ed2 5330 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5331 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5332 s--;
ffc61ed2
JH
5333 }
5334 /* XXX could check well-formedness here */
a0ed51b3
LW
5335 }
5336 }
5337 }
5338 return s;
5339}
161b471a 5340
dfe13c55 5341STATIC U8 *
0ce71af7 5342S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5343{
97aff369 5344 dVAR;
a0ed51b3 5345 if (off >= 0) {
1aa99e6b 5346 while (off-- && s < lim) {
ffc61ed2 5347 /* XXX could check well-formedness here */
a0ed51b3 5348 s += UTF8SKIP(s);
ffc61ed2 5349 }
a0ed51b3 5350 if (off >= 0)
3dab1dad 5351 return NULL;
a0ed51b3
LW
5352 }
5353 else {
5354 while (off++) {
1aa99e6b 5355 if (s > lim) {
a0ed51b3 5356 s--;
ffc61ed2 5357 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5358 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5359 s--;
ffc61ed2
JH
5360 }
5361 /* XXX could check well-formedness here */
a0ed51b3
LW
5362 }
5363 else
5364 break;
5365 }
5366 if (off <= 0)
3dab1dad 5367 return NULL;
a0ed51b3
LW
5368 }
5369 return s;
5370}
51371543 5371
51371543 5372static void
acfe0abc 5373restore_pos(pTHX_ void *arg)
51371543 5374{
97aff369 5375 dVAR;
097eb12c 5376 regexp * const rex = (regexp *)arg;
51371543
GS
5377 if (PL_reg_eval_set) {
5378 if (PL_reg_oldsaved) {
4f639d21
DM
5379 rex->subbeg = PL_reg_oldsaved;
5380 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5381#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5382 rex->saved_copy = PL_nrs;
ed252734 5383#endif
4f639d21 5384 RX_MATCH_COPIED_on(rex);
51371543
GS
5385 }
5386 PL_reg_magic->mg_len = PL_reg_oldpos;
5387 PL_reg_eval_set = 0;
5388 PL_curpm = PL_reg_oldcurpm;
5389 }
5390}
33b8afdf
JH
5391
5392STATIC void
5393S_to_utf8_substr(pTHX_ register regexp *prog)
5394{
33b8afdf 5395 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5396 SV* const sv = newSVsv(prog->float_substr);
5397 prog->float_utf8 = sv;
33b8afdf
JH
5398 sv_utf8_upgrade(sv);
5399 if (SvTAIL(prog->float_substr))
5400 SvTAIL_on(sv);
5401 if (prog->float_substr == prog->check_substr)
5402 prog->check_utf8 = sv;
5403 }
5404 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5405 SV* const sv = newSVsv(prog->anchored_substr);
5406 prog->anchored_utf8 = sv;
33b8afdf
JH
5407 sv_utf8_upgrade(sv);
5408 if (SvTAIL(prog->anchored_substr))
5409 SvTAIL_on(sv);
5410 if (prog->anchored_substr == prog->check_substr)
5411 prog->check_utf8 = sv;
5412 }
5413}
5414
5415STATIC void
5416S_to_byte_substr(pTHX_ register regexp *prog)
5417{
97aff369 5418 dVAR;
33b8afdf 5419 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5420 SV* sv = newSVsv(prog->float_utf8);
5421 prog->float_substr = sv;
33b8afdf
JH
5422 if (sv_utf8_downgrade(sv, TRUE)) {
5423 if (SvTAIL(prog->float_utf8))
5424 SvTAIL_on(sv);
5425 } else {
5426 SvREFCNT_dec(sv);
5427 prog->float_substr = sv = &PL_sv_undef;
5428 }
5429 if (prog->float_utf8 == prog->check_utf8)
5430 prog->check_substr = sv;
5431 }
5432 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5433 SV* sv = newSVsv(prog->anchored_utf8);
5434 prog->anchored_substr = sv;
33b8afdf
JH
5435 if (sv_utf8_downgrade(sv, TRUE)) {
5436 if (SvTAIL(prog->anchored_utf8))
5437 SvTAIL_on(sv);
5438 } else {
5439 SvREFCNT_dec(sv);
5440 prog->anchored_substr = sv = &PL_sv_undef;
5441 }
5442 if (prog->anchored_utf8 == prog->check_utf8)
5443 prog->check_substr = sv;
5444 }
5445}
66610fdd
RGS
5446
5447/*
5448 * Local variables:
5449 * c-indentation-style: bsd
5450 * c-basic-offset: 4
5451 * indent-tabs-mode: t
5452 * End:
5453 *
37442d52
RGS
5454 * ex: set ts=8 sts=4 sw=4 noet:
5455 */