This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
optimisation for File::Spec::abs2rel where both args are relative
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
a687059c 274/*
e50aee73 275 * pregexec and friends
a687059c
LW
276 */
277
76234dfb 278#ifndef PERL_IN_XSUB_RE
a687059c 279/*
c277df42 280 - pregexec - match a regexp against a string
a687059c 281 */
c277df42 282I32
864dbfa3 283Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 284 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
285/* strend: pointer to null at end of string */
286/* strbeg: real beginning of string */
287/* minend: end of match must be >=minend after stringarg. */
288/* nosave: For optimizations. */
289{
290 return
9041c2e3 291 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
292 nosave ? 0 : REXEC_COPY_STR);
293}
76234dfb 294#endif
22e551b9 295
9041c2e3 296/*
cad2e5aa
JH
297 * Need to implement the following flags for reg_anch:
298 *
299 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
300 * USE_INTUIT_ML
301 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
302 * INTUIT_AUTORITATIVE_ML
303 * INTUIT_ONCE_NOML - Intuit can match in one location only.
304 * INTUIT_ONCE_ML
305 *
306 * Another flag for this function: SECOND_TIME (so that float substrs
307 * with giant delta may be not rechecked).
308 */
309
310/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
311
3f7c398e 312/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
313 Otherwise, only SvCUR(sv) is used to get strbeg. */
314
315/* XXXX We assume that strpos is strbeg unless sv. */
316
6eb5f6b9
JH
317/* XXXX Some places assume that there is a fixed substring.
318 An update may be needed if optimizer marks as "INTUITable"
319 RExen without fixed substrings. Similarly, it is assumed that
320 lengths of all the strings are no more than minlen, thus they
321 cannot come from lookahead.
322 (Or minlen should take into account lookahead.) */
323
2c2d71f5
JH
324/* A failure to find a constant substring means that there is no need to make
325 an expensive call to REx engine, thus we celebrate a failure. Similarly,
326 finding a substring too deep into the string means that less calls to
30944b6d
IZ
327 regtry() should be needed.
328
329 REx compiler's optimizer found 4 possible hints:
330 a) Anchored substring;
331 b) Fixed substring;
332 c) Whether we are anchored (beginning-of-line or \G);
333 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 334 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
335 string which does not contradict any of them.
336 */
2c2d71f5 337
6eb5f6b9
JH
338/* Most of decisions we do here should have been done at compile time.
339 The nodes of the REx which we used for the search should have been
340 deleted from the finite automaton. */
341
cad2e5aa
JH
342char *
343Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
344 char *strend, U32 flags, re_scream_pos_data *data)
345{
97aff369 346 dVAR;
b7953727 347 register I32 start_shift = 0;
cad2e5aa 348 /* Should be nonnegative! */
b7953727 349 register I32 end_shift = 0;
2c2d71f5
JH
350 register char *s;
351 register SV *check;
a1933d95 352 char *strbeg;
cad2e5aa 353 char *t;
a3b680e6 354 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 355 I32 ml_anch;
bd61b366
SS
356 register char *other_last = NULL; /* other substr checked before this */
357 char *check_at = NULL; /* check substr found at this pos */
1df70142 358 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 359#ifdef DEBUGGING
890ce7af 360 const char * const i_strpos = strpos;
30944b6d 361#endif
a3621e74
YO
362
363 GET_RE_DEBUG_FLAGS_DECL;
364
a30b2f1f 365 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 366
b8d68ded 367 if (prog->reganch & ROPT_UTF8) {
a3621e74 368 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
369 "UTF-8 regex...\n"));
370 PL_reg_flags |= RF_utf8;
371 }
372
a3621e74 373 DEBUG_EXECUTE_r({
0df25f3d
YO
374 RE_PV_DISPLAY_DECL(s, len, PL_reg_match_utf8,
375 PERL_DEBUG_PAD_ZERO(0), strpos, strend - strpos, 60);
376
2a782b5b
JH
377 if (!PL_colorset)
378 reginitcolors();
b8d68ded 379 if (PL_reg_match_utf8)
a3621e74 380 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 381 "UTF-8 target...\n"));
2a782b5b 382 PerlIO_printf(Perl_debug_log,
a0288114 383 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 384 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
385 prog->precomp,
386 PL_colors[1],
387 (strlen(prog->precomp) > 60 ? "..." : ""),
388 PL_colors[0],
389 (int)(len > 60 ? 60 : len),
390 s, PL_colors[1],
391 (len > 60 ? "..." : "")
392 );
393 });
cad2e5aa 394
c344f387
JH
395 /* CHR_DIST() would be more correct here but it makes things slow. */
396 if (prog->minlen > strend - strpos) {
a3621e74 397 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 398 "String too short... [re_intuit_start]\n"));
cad2e5aa 399 goto fail;
2c2d71f5 400 }
a1933d95 401 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 402 PL_regeol = strend;
33b8afdf
JH
403 if (do_utf8) {
404 if (!prog->check_utf8 && prog->check_substr)
405 to_utf8_substr(prog);
406 check = prog->check_utf8;
407 } else {
408 if (!prog->check_substr && prog->check_utf8)
409 to_byte_substr(prog);
410 check = prog->check_substr;
411 }
412 if (check == &PL_sv_undef) {
a3621e74 413 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
414 "Non-utf string cannot match utf check string\n"));
415 goto fail;
416 }
2c2d71f5 417 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
418 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
419 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 420 && !multiline ) ); /* Check after \n? */
cad2e5aa 421
7e25d62c
JH
422 if (!ml_anch) {
423 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
424 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 425 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
426 && sv && !SvROK(sv)
427 && (strpos != strbeg)) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
429 goto fail;
430 }
431 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 432 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 433 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
434 I32 slen;
435
1aa99e6b 436 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
437 if (SvTAIL(check)) {
438 slen = SvCUR(check); /* >= 1 */
cad2e5aa 439
9041c2e3 440 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 441 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 443 goto fail_finish;
cad2e5aa
JH
444 }
445 /* Now should match s[0..slen-2] */
446 slen--;
3f7c398e 447 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 448 || (slen > 1
3f7c398e 449 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 450 report_neq:
a3621e74 451 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
452 goto fail_finish;
453 }
cad2e5aa 454 }
3f7c398e 455 else if (*SvPVX_const(check) != *s
653099ff 456 || ((slen = SvCUR(check)) > 1
3f7c398e 457 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 458 goto report_neq;
c315bfe8 459 check_at = s;
2c2d71f5 460 goto success_at_start;
7e25d62c 461 }
cad2e5aa 462 }
2c2d71f5 463 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 464 s = strpos;
2c2d71f5 465 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 466 end_shift = prog->minlen - start_shift -
653099ff 467 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 468 if (!ml_anch) {
a3b680e6 469 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 470 - (SvTAIL(check) != 0);
a3b680e6 471 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
472
473 if (end_shift < eshift)
474 end_shift = eshift;
475 }
cad2e5aa 476 }
2c2d71f5 477 else { /* Can match at random position */
cad2e5aa
JH
478 ml_anch = 0;
479 s = strpos;
2c2d71f5
JH
480 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
481 /* Should be nonnegative! */
482 end_shift = prog->minlen - start_shift -
653099ff 483 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
484 }
485
2c2d71f5 486#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 487 if (end_shift < 0)
6bbae5e6 488 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
489#endif
490
2c2d71f5
JH
491 restart:
492 /* Find a possible match in the region s..strend by looking for
493 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 494 if (flags & REXEC_SCREAM) {
cad2e5aa 495 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 496 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 497
2c2d71f5
JH
498 if (PL_screamfirst[BmRARE(check)] >= 0
499 || ( BmRARE(check) == '\n'
500 && (BmPREVIOUS(check) == SvCUR(check) - 1)
501 && SvTAIL(check) ))
9041c2e3 502 s = screaminstr(sv, check,
2c2d71f5 503 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 504 else
2c2d71f5 505 goto fail_finish;
4addbd3b
HS
506 /* we may be pointing at the wrong string */
507 if (s && RX_MATCH_COPIED(prog))
3f7c398e 508 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
509 if (data)
510 *data->scream_olds = s;
511 }
f33976b4 512 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
513 s = fbm_instr((U8*)(s + start_shift),
514 (U8*)(strend - end_shift),
7fba1cd6 515 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 516 else
1aa99e6b
IH
517 s = fbm_instr(HOP3(s, start_shift, strend),
518 HOP3(strend, -end_shift, strbeg),
7fba1cd6 519 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
520
521 /* Update the count-of-usability, remove useless subpatterns,
522 unshift s. */
2c2d71f5 523
a0288114 524 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 525 (s ? "Found" : "Did not find"),
33b8afdf 526 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 527 PL_colors[0],
7b0972df 528 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 529 SvPVX_const(check),
2c2d71f5
JH
530 PL_colors[1], (SvTAIL(check) ? "$" : ""),
531 (s ? " at offset " : "...\n") ) );
532
533 if (!s)
534 goto fail_finish;
535
6eb5f6b9
JH
536 check_at = s;
537
2c2d71f5 538 /* Finish the diagnostic message */
a3621e74 539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
540
541 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
542 Start with the other substr.
543 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 544 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
545 *always* match. Probably should be marked during compile...
546 Probably it is right to do no SCREAM here...
547 */
548
33b8afdf 549 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 550 /* Take into account the "other" substring. */
2c2d71f5
JH
551 /* XXXX May be hopelessly wrong for UTF... */
552 if (!other_last)
6eb5f6b9 553 other_last = strpos;
33b8afdf 554 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
555 do_other_anchored:
556 {
890ce7af
AL
557 char * const last = HOP3c(s, -start_shift, strbeg);
558 char *last1, *last2;
be8e71aa 559 char * const saved_s = s;
33b8afdf 560 SV* must;
2c2d71f5 561
2c2d71f5
JH
562 t = s - prog->check_offset_max;
563 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 564 && (!do_utf8
0ce71af7 565 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 566 && t > strpos)))
6f207bd3 567 NOOP;
2c2d71f5
JH
568 else
569 t = strpos;
1aa99e6b 570 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
571 if (t < other_last) /* These positions already checked */
572 t = other_last;
1aa99e6b 573 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
574 if (last < last1)
575 last1 = last;
576 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
577 /* On end-of-str: see comment below. */
33b8afdf
JH
578 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
579 if (must == &PL_sv_undef) {
580 s = (char*)NULL;
a3621e74 581 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
582 }
583 else
584 s = fbm_instr(
585 (unsigned char*)t,
586 HOP3(HOP3(last1, prog->anchored_offset, strend)
587 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
588 must,
7fba1cd6 589 multiline ? FBMrf_MULTILINE : 0
33b8afdf 590 );
a3621e74 591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 592 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
593 (s ? "Found" : "Contradicts"),
594 PL_colors[0],
33b8afdf
JH
595 (int)(SvCUR(must)
596 - (SvTAIL(must)!=0)),
3f7c398e 597 SvPVX_const(must),
33b8afdf 598 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
599 if (!s) {
600 if (last1 >= last2) {
a3621e74 601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
602 ", giving up...\n"));
603 goto fail_finish;
604 }
a3621e74 605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 606 ", trying floating at offset %ld...\n",
be8e71aa 607 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
608 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
609 s = HOP3c(last, 1, strend);
2c2d71f5
JH
610 goto restart;
611 }
612 else {
a3621e74 613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 614 (long)(s - i_strpos)));
1aa99e6b
IH
615 t = HOP3c(s, -prog->anchored_offset, strbeg);
616 other_last = HOP3c(s, 1, strend);
be8e71aa 617 s = saved_s;
2c2d71f5
JH
618 if (t == strpos)
619 goto try_at_start;
2c2d71f5
JH
620 goto try_at_offset;
621 }
30944b6d 622 }
2c2d71f5
JH
623 }
624 else { /* Take into account the floating substring. */
33b8afdf 625 char *last, *last1;
be8e71aa 626 char * const saved_s = s;
33b8afdf
JH
627 SV* must;
628
629 t = HOP3c(s, -start_shift, strbeg);
630 last1 = last =
631 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
632 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
633 last = HOP3c(t, prog->float_max_offset, strend);
634 s = HOP3c(t, prog->float_min_offset, strend);
635 if (s < other_last)
636 s = other_last;
2c2d71f5 637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
638 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
639 /* fbm_instr() takes into account exact value of end-of-str
640 if the check is SvTAIL(ed). Since false positives are OK,
641 and end-of-str is not later than strend we are OK. */
642 if (must == &PL_sv_undef) {
643 s = (char*)NULL;
a3621e74 644 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
645 }
646 else
2c2d71f5 647 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
648 (unsigned char*)last + SvCUR(must)
649 - (SvTAIL(must)!=0),
7fba1cd6 650 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 651 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
652 (s ? "Found" : "Contradicts"),
653 PL_colors[0],
654 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 655 SvPVX_const(must),
33b8afdf
JH
656 PL_colors[1], (SvTAIL(must) ? "$" : "")));
657 if (!s) {
658 if (last1 == last) {
a3621e74 659 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
660 ", giving up...\n"));
661 goto fail_finish;
2c2d71f5 662 }
a3621e74 663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 664 ", trying anchored starting at offset %ld...\n",
be8e71aa 665 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
666 other_last = last;
667 s = HOP3c(t, 1, strend);
668 goto restart;
669 }
670 else {
a3621e74 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
672 (long)(s - i_strpos)));
673 other_last = s; /* Fix this later. --Hugo */
be8e71aa 674 s = saved_s;
33b8afdf
JH
675 if (t == strpos)
676 goto try_at_start;
677 goto try_at_offset;
678 }
2c2d71f5 679 }
cad2e5aa 680 }
2c2d71f5
JH
681
682 t = s - prog->check_offset_max;
2c2d71f5 683 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 684 && (!do_utf8
0ce71af7 685 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 686 && t > strpos))) {
2c2d71f5
JH
687 /* Fixed substring is found far enough so that the match
688 cannot start at strpos. */
689 try_at_offset:
cad2e5aa 690 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
691 /* Eventually fbm_*() should handle this, but often
692 anchored_offset is not 0, so this check will not be wasted. */
693 /* XXXX In the code below we prefer to look for "^" even in
694 presence of anchored substrings. And we search even
695 beyond the found float position. These pessimizations
696 are historical artefacts only. */
697 find_anchor:
2c2d71f5 698 while (t < strend - prog->minlen) {
cad2e5aa 699 if (*t == '\n') {
4ee3650e 700 if (t < check_at - prog->check_offset_min) {
33b8afdf 701 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
702 /* Since we moved from the found position,
703 we definitely contradict the found anchored
30944b6d
IZ
704 substr. Due to the above check we do not
705 contradict "check" substr.
706 Thus we can arrive here only if check substr
707 is float. Redo checking for "other"=="fixed".
708 */
9041c2e3 709 strpos = t + 1;
a3621e74 710 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 711 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
712 goto do_other_anchored;
713 }
4ee3650e
GS
714 /* We don't contradict the found floating substring. */
715 /* XXXX Why not check for STCLASS? */
cad2e5aa 716 s = t + 1;
a3621e74 717 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 718 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
719 goto set_useful;
720 }
4ee3650e
GS
721 /* Position contradicts check-string */
722 /* XXXX probably better to look for check-string
723 than for "\n", so one should lower the limit for t? */
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 725 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 726 other_last = strpos = s = t + 1;
cad2e5aa
JH
727 goto restart;
728 }
729 t++;
730 }
a3621e74 731 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 732 PL_colors[0], PL_colors[1]));
2c2d71f5 733 goto fail_finish;
cad2e5aa 734 }
f5952150 735 else {
a3621e74 736 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 737 PL_colors[0], PL_colors[1]));
f5952150 738 }
cad2e5aa
JH
739 s = t;
740 set_useful:
33b8afdf 741 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
742 }
743 else {
f5952150 744 /* The found string does not prohibit matching at strpos,
2c2d71f5 745 - no optimization of calling REx engine can be performed,
f5952150
GS
746 unless it was an MBOL and we are not after MBOL,
747 or a future STCLASS check will fail this. */
2c2d71f5
JH
748 try_at_start:
749 /* Even in this situation we may use MBOL flag if strpos is offset
750 wrt the start of the string. */
05b4157f 751 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 752 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
753 /* May be due to an implicit anchor of m{.*foo} */
754 && !(prog->reganch & ROPT_IMPLICIT))
755 {
cad2e5aa
JH
756 t = strpos;
757 goto find_anchor;
758 }
a3621e74 759 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 760 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 761 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 762 );
2c2d71f5 763 success_at_start:
30944b6d 764 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
765 && (do_utf8 ? (
766 prog->check_utf8 /* Could be deleted already */
767 && --BmUSEFUL(prog->check_utf8) < 0
768 && (prog->check_utf8 == prog->float_utf8)
769 ) : (
770 prog->check_substr /* Could be deleted already */
771 && --BmUSEFUL(prog->check_substr) < 0
772 && (prog->check_substr == prog->float_substr)
773 )))
66e933ab 774 {
cad2e5aa 775 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
777 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
778 if (do_utf8 ? prog->check_substr : prog->check_utf8)
779 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
780 prog->check_substr = prog->check_utf8 = NULL; /* disable */
781 prog->float_substr = prog->float_utf8 = NULL; /* clear */
782 check = NULL; /* abort */
cad2e5aa 783 s = strpos;
3cf5c195
IZ
784 /* XXXX This is a remnant of the old implementation. It
785 looks wasteful, since now INTUIT can use many
6eb5f6b9 786 other heuristics. */
cad2e5aa
JH
787 prog->reganch &= ~RE_USE_INTUIT;
788 }
789 else
790 s = strpos;
791 }
792
6eb5f6b9
JH
793 /* Last resort... */
794 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 795 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
796 /* minlen == 0 is possible if regstclass is \b or \B,
797 and the fixed substr is ''$.
798 Since minlen is already taken into account, s+1 is before strend;
799 accidentally, minlen >= 1 guaranties no false positives at s + 1
800 even for \b or \B. But (minlen? 1 : 0) below assumes that
801 regstclass does not come from lookahead... */
802 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
803 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 804 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 805 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 806 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 807 : 1);
07be1b83 808 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 809 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 810 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
811 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
812 cl_l, strend)
813 : strend);
07be1b83
YO
814 /*if (OP(prog->regstclass) == TRIE)
815 endpos++;*/
6eb5f6b9 816 t = s;
3b0527fe 817 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
818 if (!s) {
819#ifdef DEBUGGING
cbbf8932 820 const char *what = NULL;
6eb5f6b9
JH
821#endif
822 if (endpos == strend) {
a3621e74 823 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
824 "Could not match STCLASS...\n") );
825 goto fail;
826 }
a3621e74 827 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 828 "This position contradicts STCLASS...\n") );
653099ff
GS
829 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
830 goto fail;
6eb5f6b9 831 /* Contradict one of substrings */
33b8afdf
JH
832 if (prog->anchored_substr || prog->anchored_utf8) {
833 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 834 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 835 hop_and_restart:
1aa99e6b 836 s = HOP3c(t, 1, strend);
66e933ab
GS
837 if (s + start_shift + end_shift > strend) {
838 /* XXXX Should be taken into account earlier? */
a3621e74 839 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
840 "Could not match STCLASS...\n") );
841 goto fail;
842 }
5e39e1e5
HS
843 if (!check)
844 goto giveup;
a3621e74 845 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 846 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
847 what, (long)(s + start_shift - i_strpos)) );
848 goto restart;
849 }
66e933ab 850 /* Have both, check_string is floating */
6eb5f6b9
JH
851 if (t + start_shift >= check_at) /* Contradicts floating=check */
852 goto retry_floating_check;
853 /* Recheck anchored substring, but not floating... */
9041c2e3 854 s = check_at;
5e39e1e5
HS
855 if (!check)
856 goto giveup;
a3621e74 857 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 858 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
859 (long)(other_last - i_strpos)) );
860 goto do_other_anchored;
861 }
60e71179
GS
862 /* Another way we could have checked stclass at the
863 current position only: */
864 if (ml_anch) {
865 s = t = t + 1;
5e39e1e5
HS
866 if (!check)
867 goto giveup;
a3621e74 868 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 869 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 870 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 871 goto try_at_offset;
66e933ab 872 }
33b8afdf 873 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 874 goto fail;
6eb5f6b9
JH
875 /* Check is floating subtring. */
876 retry_floating_check:
877 t = check_at - start_shift;
a3621e74 878 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
879 goto hop_and_restart;
880 }
b7953727 881 if (t != s) {
a3621e74 882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 883 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
884 (long)(t - i_strpos), (long)(s - i_strpos))
885 );
886 }
887 else {
a3621e74 888 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
889 "Does not contradict STCLASS...\n");
890 );
891 }
6eb5f6b9 892 }
5e39e1e5 893 giveup:
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
895 PL_colors[4], (check ? "Guessed" : "Giving up"),
896 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 897 return s;
2c2d71f5
JH
898
899 fail_finish: /* Substring not found */
33b8afdf
JH
900 if (prog->check_substr || prog->check_utf8) /* could be removed already */
901 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 902 fail:
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 904 PL_colors[4], PL_colors[5]));
bd61b366 905 return NULL;
cad2e5aa 906}
9661b544 907
6eb5f6b9 908/* We know what class REx starts with. Try to find this position... */
3b0527fe 909/* if reginfo is NULL, its a dryrun */
07be1b83
YO
910/* annoyingly all the vars in this routine have different names from their counterparts
911 in regmatch. /grrr */
3b0527fe 912
3c3eec57 913STATIC char *
07be1b83
YO
914S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
915 const char *strend, const regmatch_info *reginfo)
a687059c 916{
27da23d5 917 dVAR;
1df70142 918 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 919 char *m;
d8093b23 920 STRLEN ln;
5dab1207 921 STRLEN lnc;
078c425b 922 register STRLEN uskip;
d8093b23
G
923 unsigned int c1;
924 unsigned int c2;
6eb5f6b9
JH
925 char *e;
926 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 927 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 928
6eb5f6b9
JH
929 /* We know what class it must start with. */
930 switch (OP(c)) {
6eb5f6b9 931 case ANYOF:
388cc4de 932 if (do_utf8) {
078c425b 933 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
934 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
935 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
936 reginclass(prog, c, (U8*)s, 0, do_utf8) :
937 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 938 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
939 goto got_it;
940 else
941 tmp = doevery;
942 }
943 else
944 tmp = 1;
078c425b 945 s += uskip;
388cc4de
HS
946 }
947 }
948 else {
949 while (s < strend) {
950 STRLEN skip = 1;
951
32fc9b6a 952 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
953 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
954 /* The assignment of 2 is intentional:
955 * for the folded sharp s, the skip is 2. */
956 (skip = SHARP_S_SKIP))) {
3b0527fe 957 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
958 goto got_it;
959 else
960 tmp = doevery;
961 }
962 else
963 tmp = 1;
964 s += skip;
965 }
a0d0e21e 966 }
6eb5f6b9 967 break;
f33976b4
DB
968 case CANY:
969 while (s < strend) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
971 goto got_it;
972 else
973 tmp = doevery;
974 s++;
975 }
976 break;
6eb5f6b9 977 case EXACTF:
5dab1207
NIS
978 m = STRING(c);
979 ln = STR_LEN(c); /* length to match in octets/bytes */
980 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 981 if (UTF) {
a2a2844f 982 STRLEN ulen1, ulen2;
5dab1207 983 U8 *sm = (U8 *) m;
89ebb4a3
JH
984 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
985 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 986 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
987
988 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
989 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
990
89ebb4a3 991 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 992 0, uniflags);
89ebb4a3 993 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 994 0, uniflags);
5dab1207
NIS
995 lnc = 0;
996 while (sm < ((U8 *) m + ln)) {
997 lnc++;
998 sm += UTF8SKIP(sm);
999 }
1aa99e6b
IH
1000 }
1001 else {
1002 c1 = *(U8*)m;
1003 c2 = PL_fold[c1];
1004 }
6eb5f6b9
JH
1005 goto do_exactf;
1006 case EXACTFL:
5dab1207
NIS
1007 m = STRING(c);
1008 ln = STR_LEN(c);
1009 lnc = (I32) ln;
d8093b23 1010 c1 = *(U8*)m;
6eb5f6b9
JH
1011 c2 = PL_fold_locale[c1];
1012 do_exactf:
db12adc6 1013 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1014
3b0527fe 1015 if (!reginfo && e < s)
6eb5f6b9 1016 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1017
60a8b682
JH
1018 /* The idea in the EXACTF* cases is to first find the
1019 * first character of the EXACTF* node and then, if
1020 * necessary, case-insensitively compare the full
1021 * text of the node. The c1 and c2 are the first
1022 * characters (though in Unicode it gets a bit
1023 * more complicated because there are more cases
7f16dd3d
JH
1024 * than just upper and lower: one needs to use
1025 * the so-called folding case for case-insensitive
1026 * matching (called "loose matching" in Unicode).
1027 * ibcmp_utf8() will do just that. */
60a8b682 1028
1aa99e6b 1029 if (do_utf8) {
575cac57 1030 UV c, f;
89ebb4a3 1031 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1032 STRLEN len, foldlen;
4ad0818d 1033 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1034 if (c1 == c2) {
5dab1207
NIS
1035 /* Upper and lower of 1st char are equal -
1036 * probably not a "letter". */
1aa99e6b 1037 while (s <= e) {
89ebb4a3 1038 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1039 uniflags);
80aecb99
JH
1040 if ( c == c1
1041 && (ln == len ||
3dab1dad
YO
1042 ibcmp_utf8(s, NULL, 0, do_utf8,
1043 m, NULL, ln, (bool)UTF))
3b0527fe 1044 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1045 goto got_it;
80aecb99 1046 else {
1df70142 1047 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1048 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1049 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1050 if ( f != c
1051 && (f == c1 || f == c2)
1052 && (ln == foldlen ||
66423254 1053 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1054 NULL, foldlen, do_utf8,
d07ddd77 1055 m,
3dab1dad 1056 NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1058 goto got_it;
1059 }
1aa99e6b
IH
1060 s += len;
1061 }
09091399
JH
1062 }
1063 else {
1aa99e6b 1064 while (s <= e) {
89ebb4a3 1065 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1066 uniflags);
80aecb99 1067
60a8b682 1068 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1069 * Note that not all the possible combinations
1070 * are handled here: some of them are handled
1071 * by the standard folding rules, and some of
1072 * them (the character class or ANYOF cases)
1073 * are handled during compiletime in
1074 * regexec.c:S_regclass(). */
880bd946
JH
1075 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1076 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1077 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1078
1079 if ( (c == c1 || c == c2)
1080 && (ln == len ||
3dab1dad
YO
1081 ibcmp_utf8(s, NULL, 0, do_utf8,
1082 m, NULL, ln, (bool)UTF))
3b0527fe 1083 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1084 goto got_it;
80aecb99 1085 else {
1df70142 1086 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1087 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1088 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1089 if ( f != c
1090 && (f == c1 || f == c2)
1091 && (ln == foldlen ||
a6872d42 1092 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1093 NULL, foldlen, do_utf8,
d07ddd77 1094 m,
3dab1dad 1095 NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1097 goto got_it;
1098 }
1aa99e6b
IH
1099 s += len;
1100 }
09091399 1101 }
1aa99e6b
IH
1102 }
1103 else {
1104 if (c1 == c2)
1105 while (s <= e) {
1106 if ( *(U8*)s == c1
1107 && (ln == 1 || !(OP(c) == EXACTF
1108 ? ibcmp(s, m, ln)
1109 : ibcmp_locale(s, m, ln)))
3b0527fe 1110 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1111 goto got_it;
1112 s++;
1113 }
1114 else
1115 while (s <= e) {
1116 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1117 && (ln == 1 || !(OP(c) == EXACTF
1118 ? ibcmp(s, m, ln)
1119 : ibcmp_locale(s, m, ln)))
3b0527fe 1120 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1121 goto got_it;
1122 s++;
1123 }
b3c9acc1
IZ
1124 }
1125 break;
bbce6d69 1126 case BOUNDL:
3280af22 1127 PL_reg_flags |= RF_tainted;
bbce6d69 1128 /* FALL THROUGH */
a0d0e21e 1129 case BOUND:
ffc61ed2 1130 if (do_utf8) {
12d33761 1131 if (s == PL_bostr)
ffc61ed2
JH
1132 tmp = '\n';
1133 else {
6136c704 1134 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1135 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1136 }
1137 tmp = ((OP(c) == BOUND ?
9041c2e3 1138 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1139 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1140 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1141 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1142 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1143 isALNUM_LC_utf8((U8*)s)))
1144 {
1145 tmp = !tmp;
3b0527fe 1146 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1147 goto got_it;
1148 }
078c425b 1149 s += uskip;
a687059c 1150 }
a0d0e21e 1151 }
667bb95a 1152 else {
12d33761 1153 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1154 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1155 while (s < strend) {
1156 if (tmp ==
1157 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
1162 s++;
a0ed51b3 1163 }
a0ed51b3 1164 }
3b0527fe 1165 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1166 goto got_it;
1167 break;
bbce6d69 1168 case NBOUNDL:
3280af22 1169 PL_reg_flags |= RF_tainted;
bbce6d69 1170 /* FALL THROUGH */
a0d0e21e 1171 case NBOUND:
ffc61ed2 1172 if (do_utf8) {
12d33761 1173 if (s == PL_bostr)
ffc61ed2
JH
1174 tmp = '\n';
1175 else {
6136c704 1176 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1177 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1178 }
1179 tmp = ((OP(c) == NBOUND ?
9041c2e3 1180 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1181 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1182 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1183 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1184 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1185 isALNUM_LC_utf8((U8*)s)))
1186 tmp = !tmp;
3b0527fe 1187 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1188 goto got_it;
078c425b 1189 s += uskip;
ffc61ed2 1190 }
a0d0e21e 1191 }
667bb95a 1192 else {
12d33761 1193 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1194 tmp = ((OP(c) == NBOUND ?
1195 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1196 while (s < strend) {
1197 if (tmp ==
1198 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1201 goto got_it;
1202 s++;
1203 }
a0ed51b3 1204 }
3b0527fe 1205 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1206 goto got_it;
1207 break;
a0d0e21e 1208 case ALNUM:
ffc61ed2 1209 if (do_utf8) {
1a4fad37 1210 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1211 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1212 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1213 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 else
1216 tmp = doevery;
1217 }
bbce6d69 1218 else
ffc61ed2 1219 tmp = 1;
078c425b 1220 s += uskip;
bbce6d69 1221 }
bbce6d69 1222 }
ffc61ed2
JH
1223 else {
1224 while (s < strend) {
1225 if (isALNUM(*s)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
a0ed51b3 1231 else
ffc61ed2
JH
1232 tmp = 1;
1233 s++;
a0ed51b3 1234 }
a0ed51b3
LW
1235 }
1236 break;
bbce6d69 1237 case ALNUML:
3280af22 1238 PL_reg_flags |= RF_tainted;
ffc61ed2 1239 if (do_utf8) {
078c425b 1240 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1241 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1242 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1243 goto got_it;
1244 else
1245 tmp = doevery;
1246 }
a687059c 1247 else
ffc61ed2 1248 tmp = 1;
078c425b 1249 s += uskip;
a0d0e21e 1250 }
a0d0e21e 1251 }
ffc61ed2
JH
1252 else {
1253 while (s < strend) {
1254 if (isALNUM_LC(*s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a0ed51b3 1260 else
ffc61ed2
JH
1261 tmp = 1;
1262 s++;
a0ed51b3 1263 }
a0ed51b3
LW
1264 }
1265 break;
a0d0e21e 1266 case NALNUM:
ffc61ed2 1267 if (do_utf8) {
1a4fad37 1268 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1269 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1270 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1271 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1272 goto got_it;
1273 else
1274 tmp = doevery;
1275 }
bbce6d69 1276 else
ffc61ed2 1277 tmp = 1;
078c425b 1278 s += uskip;
bbce6d69 1279 }
bbce6d69 1280 }
ffc61ed2
JH
1281 else {
1282 while (s < strend) {
1283 if (!isALNUM(*s)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
a0ed51b3 1289 else
ffc61ed2
JH
1290 tmp = 1;
1291 s++;
a0ed51b3 1292 }
a0ed51b3
LW
1293 }
1294 break;
bbce6d69 1295 case NALNUML:
3280af22 1296 PL_reg_flags |= RF_tainted;
ffc61ed2 1297 if (do_utf8) {
078c425b 1298 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1299 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1300 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1301 goto got_it;
1302 else
1303 tmp = doevery;
1304 }
a687059c 1305 else
ffc61ed2 1306 tmp = 1;
078c425b 1307 s += uskip;
a687059c 1308 }
a0d0e21e 1309 }
ffc61ed2
JH
1310 else {
1311 while (s < strend) {
1312 if (!isALNUM_LC(*s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a0ed51b3 1318 else
ffc61ed2
JH
1319 tmp = 1;
1320 s++;
a0ed51b3 1321 }
a0ed51b3
LW
1322 }
1323 break;
a0d0e21e 1324 case SPACE:
ffc61ed2 1325 if (do_utf8) {
1a4fad37 1326 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1327 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1328 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1329 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1330 goto got_it;
1331 else
1332 tmp = doevery;
1333 }
a0d0e21e 1334 else
ffc61ed2 1335 tmp = 1;
078c425b 1336 s += uskip;
2304df62 1337 }
a0d0e21e 1338 }
ffc61ed2
JH
1339 else {
1340 while (s < strend) {
1341 if (isSPACE(*s)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0ed51b3 1347 else
ffc61ed2
JH
1348 tmp = 1;
1349 s++;
a0ed51b3 1350 }
a0ed51b3
LW
1351 }
1352 break;
bbce6d69 1353 case SPACEL:
3280af22 1354 PL_reg_flags |= RF_tainted;
ffc61ed2 1355 if (do_utf8) {
078c425b 1356 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1357 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1358 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1359 goto got_it;
1360 else
1361 tmp = doevery;
1362 }
bbce6d69 1363 else
ffc61ed2 1364 tmp = 1;
078c425b 1365 s += uskip;
bbce6d69 1366 }
bbce6d69 1367 }
ffc61ed2
JH
1368 else {
1369 while (s < strend) {
1370 if (isSPACE_LC(*s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
a0ed51b3 1376 else
ffc61ed2
JH
1377 tmp = 1;
1378 s++;
a0ed51b3 1379 }
a0ed51b3
LW
1380 }
1381 break;
a0d0e21e 1382 case NSPACE:
ffc61ed2 1383 if (do_utf8) {
1a4fad37 1384 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1385 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1386 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1387 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1388 goto got_it;
1389 else
1390 tmp = doevery;
1391 }
a0d0e21e 1392 else
ffc61ed2 1393 tmp = 1;
078c425b 1394 s += uskip;
a687059c 1395 }
a0d0e21e 1396 }
ffc61ed2
JH
1397 else {
1398 while (s < strend) {
1399 if (!isSPACE(*s)) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0ed51b3 1405 else
ffc61ed2
JH
1406 tmp = 1;
1407 s++;
a0ed51b3 1408 }
a0ed51b3
LW
1409 }
1410 break;
bbce6d69 1411 case NSPACEL:
3280af22 1412 PL_reg_flags |= RF_tainted;
ffc61ed2 1413 if (do_utf8) {
078c425b 1414 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1415 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1416 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1417 goto got_it;
1418 else
1419 tmp = doevery;
1420 }
bbce6d69 1421 else
ffc61ed2 1422 tmp = 1;
078c425b 1423 s += uskip;
bbce6d69 1424 }
bbce6d69 1425 }
ffc61ed2
JH
1426 else {
1427 while (s < strend) {
1428 if (!isSPACE_LC(*s)) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
a0ed51b3 1434 else
ffc61ed2
JH
1435 tmp = 1;
1436 s++;
a0ed51b3 1437 }
a0ed51b3
LW
1438 }
1439 break;
a0d0e21e 1440 case DIGIT:
ffc61ed2 1441 if (do_utf8) {
1a4fad37 1442 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1443 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1444 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1445 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1446 goto got_it;
1447 else
1448 tmp = doevery;
1449 }
a0d0e21e 1450 else
ffc61ed2 1451 tmp = 1;
078c425b 1452 s += uskip;
2b69d0c2 1453 }
a0d0e21e 1454 }
ffc61ed2
JH
1455 else {
1456 while (s < strend) {
1457 if (isDIGIT(*s)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0ed51b3 1463 else
ffc61ed2
JH
1464 tmp = 1;
1465 s++;
a0ed51b3 1466 }
a0ed51b3
LW
1467 }
1468 break;
b8c5462f
JH
1469 case DIGITL:
1470 PL_reg_flags |= RF_tainted;
ffc61ed2 1471 if (do_utf8) {
078c425b 1472 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1473 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1474 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1475 goto got_it;
1476 else
1477 tmp = doevery;
1478 }
b8c5462f 1479 else
ffc61ed2 1480 tmp = 1;
078c425b 1481 s += uskip;
b8c5462f 1482 }
b8c5462f 1483 }
ffc61ed2
JH
1484 else {
1485 while (s < strend) {
1486 if (isDIGIT_LC(*s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2
JH
1493 tmp = 1;
1494 s++;
b8c5462f 1495 }
b8c5462f
JH
1496 }
1497 break;
a0d0e21e 1498 case NDIGIT:
ffc61ed2 1499 if (do_utf8) {
1a4fad37 1500 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1501 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1502 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1503 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1504 goto got_it;
1505 else
1506 tmp = doevery;
1507 }
a0d0e21e 1508 else
ffc61ed2 1509 tmp = 1;
078c425b 1510 s += uskip;
a687059c 1511 }
a0d0e21e 1512 }
ffc61ed2
JH
1513 else {
1514 while (s < strend) {
1515 if (!isDIGIT(*s)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0ed51b3 1521 else
ffc61ed2
JH
1522 tmp = 1;
1523 s++;
a0ed51b3 1524 }
a0ed51b3
LW
1525 }
1526 break;
b8c5462f
JH
1527 case NDIGITL:
1528 PL_reg_flags |= RF_tainted;
ffc61ed2 1529 if (do_utf8) {
078c425b 1530 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1531 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1532 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1533 goto got_it;
1534 else
1535 tmp = doevery;
1536 }
b8c5462f 1537 else
ffc61ed2 1538 tmp = 1;
078c425b 1539 s += uskip;
b8c5462f 1540 }
a0ed51b3 1541 }
ffc61ed2
JH
1542 else {
1543 while (s < strend) {
1544 if (!isDIGIT_LC(*s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
cf93c79d 1550 else
ffc61ed2
JH
1551 tmp = 1;
1552 s++;
b8c5462f 1553 }
b8c5462f
JH
1554 }
1555 break;
07be1b83
YO
1556 case TRIE:
1557 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1558 {
1559 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1560 trie_type = do_utf8 ?
1561 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1562 : trie_plain;
1563 /* what trie are we using right now */
1564 reg_ac_data *aho
1565 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1566 reg_trie_data *trie=aho->trie;
1567
1568 const char *last_start = strend - trie->minlen;
1569 const char *real_start = s;
1570 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1571 SV *sv_points;
1572 U8 **points; /* map of where we were in the input string
1573 when reading a given string. For ASCII this
1574 is unnecessary overhead as the relationship
1575 is always 1:1, but for unicode, especially
1576 case folded unicode this is not true. */
f9e705e8 1577 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1578
1579 GET_RE_DEBUG_FLAGS_DECL;
1580
be8e71aa
YO
1581 /* We can't just allocate points here. We need to wrap it in
1582 * an SV so it gets freed properly if there is a croak while
1583 * running the match */
1584 ENTER;
1585 SAVETMPS;
1586 sv_points=newSV(maxlen * sizeof(U8 *));
1587 SvCUR_set(sv_points,
1588 maxlen * sizeof(U8 *));
1589 SvPOK_on(sv_points);
1590 sv_2mortal(sv_points);
1591 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1592
1593 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1594 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1595 s++;
1596 }
1597 }
1598
1599 while (s <= last_start) {
1600 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1601 U8 *uc = (U8*)s;
1602 U16 charid = 0;
1603 U32 base = 1;
1604 U32 state = 1;
1605 UV uvc = 0;
1606 STRLEN len = 0;
1607 STRLEN foldlen = 0;
1608 U8 *uscan = (U8*)NULL;
1609 U8 *leftmost = NULL;
1610
1611 U32 pointpos = 0;
1612
1613 while ( state && uc <= (U8*)strend ) {
1614 int failed=0;
1615 if (aho->states[ state ].wordnum) {
1616 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1617 if (!leftmost || lpos < leftmost)
1618 leftmost= lpos;
1619 if (base==0) break;
1620 }
1621 points[pointpos++ % maxlen]= uc;
1622 switch (trie_type) {
1623 case trie_utf8_fold:
1624 if ( foldlen>0 ) {
1625 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1626 foldlen -= len;
1627 uscan += len;
1628 len=0;
1629 } else {
07be1b83
YO
1630 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1631 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1632 foldlen -= UNISKIP( uvc );
1633 uscan = foldbuf + UNISKIP( uvc );
1634 }
1635 break;
1636 case trie_utf8:
1637 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1638 &len, uniflags );
1639 break;
1640 case trie_plain:
1641 uvc = (UV)*uc;
1642 len = 1;
1643 }
1644
1645 if (uvc < 256) {
1646 charid = trie->charmap[ uvc ];
1647 }
1648 else {
1649 charid = 0;
1650 if (trie->widecharmap) {
1651 SV** const svpp = hv_fetch(trie->widecharmap,
1652 (char*)&uvc, sizeof(UV), 0);
1653 if (svpp)
1654 charid = (U16)SvIV(*svpp);
1655 }
1656 }
1657 DEBUG_TRIE_EXECUTE_r(
1658 PerlIO_printf(Perl_debug_log,
1659 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1660 (int)((const char*)uc - real_start), charid, uvc)
1661 );
1662 uc += len;
1663
1664 do {
1665 U32 word = aho->states[ state ].wordnum;
1666 base = aho->states[ state ].trans.base;
1667
1668 DEBUG_TRIE_EXECUTE_r(
1669 PerlIO_printf( Perl_debug_log,
1670 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1671 failed ? "Fail transition to " : "",
1672 state, base, uvc, word)
1673 );
1674 if ( base ) {
1675 U32 tmp;
1676 if (charid &&
1677 (base + charid > trie->uniquecharcount )
1678 && (base + charid - 1 - trie->uniquecharcount
1679 < trie->lasttrans)
1680 && trie->trans[base + charid - 1 -
1681 trie->uniquecharcount].check == state
1682 && (tmp=trie->trans[base + charid - 1 -
1683 trie->uniquecharcount ].next))
1684 {
1685 state = tmp;
1686 break;
1687 }
1688 else {
1689 failed++;
1690 if ( state == 1 )
1691 break;
1692 else
1693 state = aho->fail[state];
1694 }
1695 }
1696 else {
1697 /* we must be accepting here */
1698 failed++;
1699 break;
1700 }
1701 } while(state);
1702 if (failed) {
1703 if (leftmost)
1704 break;
1705 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1706 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1707 uc++;
1708 }
1709 }
1710 }
1711 }
1712 if ( aho->states[ state ].wordnum ) {
1713 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1714 if (!leftmost || lpos < leftmost)
1715 leftmost = lpos;
1716 }
1717 DEBUG_TRIE_EXECUTE_r(
1718 PerlIO_printf( Perl_debug_log,
1719 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1720 "All done: ",
1721 state, base, uvc)
1722 );
1723 if (leftmost) {
1724 s = (char*)leftmost;
be8e71aa
YO
1725 if (!reginfo || regtry(reginfo, s)) {
1726 FREETMPS;
1727 LEAVE;
07be1b83 1728 goto got_it;
be8e71aa 1729 }
07be1b83
YO
1730 s = HOPc(s,1);
1731 } else {
1732 break;
1733 }
1734 }
be8e71aa
YO
1735 FREETMPS;
1736 LEAVE;
07be1b83
YO
1737 }
1738 break;
b3c9acc1 1739 default:
3c3eec57
GS
1740 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1741 break;
d6a28714 1742 }
6eb5f6b9
JH
1743 return 0;
1744 got_it:
1745 return s;
1746}
1747
1748/*
1749 - regexec_flags - match a regexp against a string
1750 */
1751I32
1752Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1753 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1754/* strend: pointer to null at end of string */
1755/* strbeg: real beginning of string */
1756/* minend: end of match must be >=minend after stringarg. */
1757/* data: May be used for some additional optimizations. */
1758/* nosave: For optimizations. */
1759{
97aff369 1760 dVAR;
6eb5f6b9
JH
1761 register char *s;
1762 register regnode *c;
1763 register char *startpos = stringarg;
6eb5f6b9
JH
1764 I32 minlen; /* must match at least this many chars */
1765 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1766 I32 end_shift = 0; /* Same for the end. */ /* CC */
1767 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1768 char *scream_olds = NULL;
3dab1dad 1769 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1770 const bool do_utf8 = DO_UTF8(sv);
2757e526 1771 I32 multiline;
0df25f3d 1772
3b0527fe 1773 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1774
1775 GET_RE_DEBUG_FLAGS_DECL;
1776
9d4ba2ae 1777 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1778
1779 /* Be paranoid... */
1780 if (prog == NULL || startpos == NULL) {
1781 Perl_croak(aTHX_ "NULL regexp parameter");
1782 return 0;
1783 }
1784
2757e526 1785 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1786 reginfo.prog = prog;
2757e526 1787
bac06658
JH
1788 RX_MATCH_UTF8_set(prog, do_utf8);
1789
6eb5f6b9 1790 minlen = prog->minlen;
61a36c01 1791 if (strend - startpos < minlen) {
a3621e74 1792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1793 "String too short [regexec_flags]...\n"));
1794 goto phooey;
1aa99e6b 1795 }
6eb5f6b9 1796
6eb5f6b9
JH
1797 /* Check validity of program. */
1798 if (UCHARAT(prog->program) != REG_MAGIC) {
1799 Perl_croak(aTHX_ "corrupted regexp program");
1800 }
1801
1802 PL_reg_flags = 0;
1803 PL_reg_eval_set = 0;
1804 PL_reg_maxiter = 0;
1805
1806 if (prog->reganch & ROPT_UTF8)
1807 PL_reg_flags |= RF_utf8;
1808
1809 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1810 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1811 PL_bostr = strbeg;
3b0527fe 1812 reginfo.sv = sv;
6eb5f6b9
JH
1813
1814 /* Mark end of line for $ (and such) */
1815 PL_regeol = strend;
1816
1817 /* see how far we have to get to not match where we matched before */
3b0527fe 1818 reginfo.till = startpos+minend;
6eb5f6b9 1819
6eb5f6b9
JH
1820 /* If there is a "must appear" string, look for it. */
1821 s = startpos;
1822
3b0527fe 1823 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1824 MAGIC *mg;
1825
1826 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1827 reginfo.ganch = startpos;
6eb5f6b9
JH
1828 else if (sv && SvTYPE(sv) >= SVt_PVMG
1829 && SvMAGIC(sv)
14befaf4
DM
1830 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1831 && mg->mg_len >= 0) {
3b0527fe 1832 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1833 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1834 if (s > reginfo.ganch)
6eb5f6b9 1835 goto phooey;
3b0527fe 1836 s = reginfo.ganch;
6eb5f6b9
JH
1837 }
1838 }
1839 else /* pos() not defined */
3b0527fe 1840 reginfo.ganch = strbeg;
6eb5f6b9
JH
1841 }
1842
a0714e2c 1843 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1844 re_scream_pos_data d;
1845
1846 d.scream_olds = &scream_olds;
1847 d.scream_pos = &scream_pos;
1848 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1849 if (!s) {
a3621e74 1850 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1851 goto phooey; /* not present */
3fa9c3d7 1852 }
6eb5f6b9
JH
1853 }
1854
a3621e74 1855 DEBUG_EXECUTE_r({
0df25f3d
YO
1856 RE_PV_DISPLAY_DECL(s0, len0, UTF,
1857 PERL_DEBUG_PAD_ZERO(0), prog->precomp, prog->prelen, 60);
1858 RE_PV_DISPLAY_DECL(s1, len1, do_utf8,
1859 PERL_DEBUG_PAD_ZERO(1), startpos, strend - startpos, 60);
1860
2a782b5b
JH
1861 if (!PL_colorset)
1862 reginitcolors();
1863 PerlIO_printf(Perl_debug_log,
a0288114 1864 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1865 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1866 len0, len0, s0,
2a782b5b 1867 PL_colors[1],
9e55ce06 1868 len0 > 60 ? "..." : "",
2a782b5b 1869 PL_colors[0],
9e55ce06
JH
1870 (int)(len1 > 60 ? 60 : len1),
1871 s1, PL_colors[1],
1872 (len1 > 60 ? "..." : "")
2a782b5b
JH
1873 );
1874 });
6eb5f6b9
JH
1875
1876 /* Simplest case: anchored match need be tried only once. */
1877 /* [unless only anchor is BOL and multiline is set] */
1878 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1879 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1880 goto got_it;
7fba1cd6 1881 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1882 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1883 {
1884 char *end;
1885
1886 if (minlen)
1887 dontbother = minlen - 1;
1aa99e6b 1888 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1889 /* for multiline we only have to try after newlines */
33b8afdf 1890 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1891 if (s == startpos)
1892 goto after_try;
1893 while (1) {
3b0527fe 1894 if (regtry(&reginfo, s))
6eb5f6b9
JH
1895 goto got_it;
1896 after_try:
1897 if (s >= end)
1898 goto phooey;
1899 if (prog->reganch & RE_USE_INTUIT) {
1900 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1901 if (!s)
1902 goto phooey;
1903 }
1904 else
1905 s++;
1906 }
1907 } else {
1908 if (s > startpos)
1909 s--;
1910 while (s < end) {
1911 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1912 if (regtry(&reginfo, s))
6eb5f6b9
JH
1913 goto got_it;
1914 }
1915 }
1916 }
1917 }
1918 goto phooey;
1919 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1920 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1921 goto got_it;
1922 goto phooey;
1923 }
1924
1925 /* Messy cases: unanchored match. */
33b8afdf 1926 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1927 /* we have /x+whatever/ */
1928 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1929 char ch;
bf93d4cc
GS
1930#ifdef DEBUGGING
1931 int did_match = 0;
1932#endif
33b8afdf
JH
1933 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1935 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1936
1aa99e6b 1937 if (do_utf8) {
6eb5f6b9
JH
1938 while (s < strend) {
1939 if (*s == ch) {
a3621e74 1940 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1941 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1942 s += UTF8SKIP(s);
1943 while (s < strend && *s == ch)
1944 s += UTF8SKIP(s);
1945 }
1946 s += UTF8SKIP(s);
1947 }
1948 }
1949 else {
1950 while (s < strend) {
1951 if (*s == ch) {
a3621e74 1952 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1953 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1954 s++;
1955 while (s < strend && *s == ch)
1956 s++;
1957 }
1958 s++;
1959 }
1960 }
a3621e74 1961 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1962 PerlIO_printf(Perl_debug_log,
b7953727
JH
1963 "Did not find anchored character...\n")
1964 );
6eb5f6b9 1965 }
a0714e2c
SS
1966 else if (prog->anchored_substr != NULL
1967 || prog->anchored_utf8 != NULL
1968 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1969 && prog->float_max_offset < strend - s)) {
1970 SV *must;
1971 I32 back_max;
1972 I32 back_min;
1973 char *last;
6eb5f6b9 1974 char *last1; /* Last position checked before */
bf93d4cc
GS
1975#ifdef DEBUGGING
1976 int did_match = 0;
1977#endif
33b8afdf
JH
1978 if (prog->anchored_substr || prog->anchored_utf8) {
1979 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1980 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1981 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1982 back_max = back_min = prog->anchored_offset;
1983 } else {
1984 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1985 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1986 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1987 back_max = prog->float_max_offset;
1988 back_min = prog->float_min_offset;
1989 }
1990 if (must == &PL_sv_undef)
1991 /* could not downgrade utf8 check substring, so must fail */
1992 goto phooey;
1993
1994 last = HOP3c(strend, /* Cannot start after this */
1995 -(I32)(CHR_SVLEN(must)
1996 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1997
1998 if (s > PL_bostr)
1999 last1 = HOPc(s, -1);
2000 else
2001 last1 = s - 1; /* bogus */
2002
a0288114 2003 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2004 check_substr==must. */
2005 scream_pos = -1;
2006 dontbother = end_shift;
2007 strend = HOPc(strend, -dontbother);
2008 while ( (s <= last) &&
9041c2e3 2009 ((flags & REXEC_SCREAM)
1aa99e6b 2010 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2011 end_shift, &scream_pos, 0))
1aa99e6b 2012 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2013 (unsigned char*)strend, must,
7fba1cd6 2014 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2015 /* we may be pointing at the wrong string */
2016 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2017 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2018 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2019 if (HOPc(s, -back_max) > last1) {
2020 last1 = HOPc(s, -back_min);
2021 s = HOPc(s, -back_max);
2022 }
2023 else {
52657f30 2024 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2025
2026 last1 = HOPc(s, -back_min);
52657f30 2027 s = t;
6eb5f6b9 2028 }
1aa99e6b 2029 if (do_utf8) {
6eb5f6b9 2030 while (s <= last1) {
3b0527fe 2031 if (regtry(&reginfo, s))
6eb5f6b9
JH
2032 goto got_it;
2033 s += UTF8SKIP(s);
2034 }
2035 }
2036 else {
2037 while (s <= last1) {
3b0527fe 2038 if (regtry(&reginfo, s))
6eb5f6b9
JH
2039 goto got_it;
2040 s++;
2041 }
2042 }
2043 }
a3621e74 2044 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2045 PerlIO_printf(Perl_debug_log,
a0288114 2046 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2047 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2048 ? "anchored" : "floating"),
2049 PL_colors[0],
2050 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2051 SvPVX_const(must),
b7953727
JH
2052 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2053 );
6eb5f6b9
JH
2054 goto phooey;
2055 }
155aba94 2056 else if ((c = prog->regstclass)) {
f14c76ed 2057 if (minlen) {
be8e71aa 2058 const OPCODE op = OP(prog->regstclass);
66e933ab 2059 /* don't bother with what can't match */
07be1b83 2060 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2061 strend = HOPc(strend, -(minlen - 1));
2062 }
a3621e74 2063 DEBUG_EXECUTE_r({
be8e71aa 2064 SV * const prop = sv_newmortal();
32fc9b6a 2065 regprop(prog, prop, c);
0df25f3d
YO
2066 {
2067 RE_PV_DISPLAY_DECL(s0,len0,UTF,
2068 PERL_DEBUG_PAD_ZERO(0),SvPVX_const(prop),SvCUR(prop),60);
2069 RE_PV_DISPLAY_DECL(s1,len1,UTF,
2070 PERL_DEBUG_PAD_ZERO(1),s,strend-s,60);
2071 PerlIO_printf(Perl_debug_log,
2072 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
2073 len0, len0, s0,
2074 len1, len1, s1, (int)(strend - s));
2075 }
ffc61ed2 2076 });
3b0527fe 2077 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2078 goto got_it;
07be1b83 2079 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2080 }
2081 else {
2082 dontbother = 0;
a0714e2c 2083 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2084 /* Trim the end. */
d6a28714 2085 char *last;
33b8afdf
JH
2086 SV* float_real;
2087
2088 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2089 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2090 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2091
2092 if (flags & REXEC_SCREAM) {
33b8afdf 2093 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2094 end_shift, &scream_pos, 1); /* last one */
2095 if (!last)
ffc61ed2 2096 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2097 /* we may be pointing at the wrong string */
2098 else if (RX_MATCH_COPIED(prog))
3f7c398e 2099 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2100 }
d6a28714
JH
2101 else {
2102 STRLEN len;
cfd0369c 2103 const char * const little = SvPV_const(float_real, len);
d6a28714 2104
33b8afdf 2105 if (SvTAIL(float_real)) {
d6a28714
JH
2106 if (memEQ(strend - len + 1, little, len - 1))
2107 last = strend - len + 1;
7fba1cd6 2108 else if (!multiline)
9041c2e3 2109 last = memEQ(strend - len, little, len)
bd61b366 2110 ? strend - len : NULL;
b8c5462f 2111 else
d6a28714
JH
2112 goto find_last;
2113 } else {
2114 find_last:
9041c2e3 2115 if (len)
d6a28714 2116 last = rninstr(s, strend, little, little + len);
b8c5462f 2117 else
a0288114 2118 last = strend; /* matching "$" */
b8c5462f 2119 }
b8c5462f 2120 }
bf93d4cc 2121 if (last == NULL) {
a3621e74 2122 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2123 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2124 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2125 goto phooey; /* Should not happen! */
2126 }
d6a28714
JH
2127 dontbother = strend - last + prog->float_min_offset;
2128 }
2129 if (minlen && (dontbother < minlen))
2130 dontbother = minlen - 1;
2131 strend -= dontbother; /* this one's always in bytes! */
2132 /* We don't know much -- general case. */
1aa99e6b 2133 if (do_utf8) {
d6a28714 2134 for (;;) {
3b0527fe 2135 if (regtry(&reginfo, s))
d6a28714
JH
2136 goto got_it;
2137 if (s >= strend)
2138 break;
b8c5462f 2139 s += UTF8SKIP(s);
d6a28714
JH
2140 };
2141 }
2142 else {
2143 do {
3b0527fe 2144 if (regtry(&reginfo, s))
d6a28714
JH
2145 goto got_it;
2146 } while (s++ < strend);
2147 }
2148 }
2149
2150 /* Failure. */
2151 goto phooey;
2152
2153got_it:
2154 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2155
2156 if (PL_reg_eval_set) {
2157 /* Preserve the current value of $^R */
2158 if (oreplsv != GvSV(PL_replgv))
2159 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2160 restored, the value remains
2161 the same. */
4f639d21 2162 restore_pos(aTHX_ prog);
d6a28714
JH
2163 }
2164
2165 /* make sure $`, $&, $', and $digit will work later */
2166 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2167 RX_MATCH_COPY_FREE(prog);
d6a28714 2168 if (flags & REXEC_COPY_STR) {
be8e71aa 2169 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2170#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2171 if ((SvIsCOW(sv)
2172 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2173 if (DEBUG_C_TEST) {
2174 PerlIO_printf(Perl_debug_log,
2175 "Copy on write: regexp capture, type %d\n",
2176 (int) SvTYPE(sv));
2177 }
2178 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2179 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2180 assert (SvPOKp(prog->saved_copy));
2181 } else
2182#endif
2183 {
2184 RX_MATCH_COPIED_on(prog);
2185 s = savepvn(strbeg, i);
2186 prog->subbeg = s;
2187 }
d6a28714 2188 prog->sublen = i;
d6a28714
JH
2189 }
2190 else {
2191 prog->subbeg = strbeg;
2192 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2193 }
2194 }
9041c2e3 2195
d6a28714
JH
2196 return 1;
2197
2198phooey:
a3621e74 2199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2200 PL_colors[4], PL_colors[5]));
d6a28714 2201 if (PL_reg_eval_set)
4f639d21 2202 restore_pos(aTHX_ prog);
d6a28714
JH
2203 return 0;
2204}
2205
2206/*
2207 - regtry - try match at specific point
2208 */
2209STATIC I32 /* 0 failure, 1 success */
3b0527fe 2210S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2211{
97aff369 2212 dVAR;
d6a28714
JH
2213 register I32 *sp;
2214 register I32 *ep;
2215 CHECKPOINT lastcp;
3b0527fe 2216 regexp *prog = reginfo->prog;
a3621e74 2217 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2218
02db2b7b
IZ
2219#ifdef DEBUGGING
2220 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2221#endif
d6a28714
JH
2222 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2223 MAGIC *mg;
2224
2225 PL_reg_eval_set = RS_init;
a3621e74 2226 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2227 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2228 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2229 ));
e8347627 2230 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2231 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2232 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2233 SAVETMPS;
2234 /* Apparently this is not needed, judging by wantarray. */
e8347627 2235 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2236 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2237
3b0527fe 2238 if (reginfo->sv) {
d6a28714 2239 /* Make $_ available to executed code. */
3b0527fe 2240 if (reginfo->sv != DEFSV) {
59f00321 2241 SAVE_DEFSV;
3b0527fe 2242 DEFSV = reginfo->sv;
b8c5462f 2243 }
d6a28714 2244
3b0527fe
DM
2245 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2246 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2247 /* prepare for quick setting of pos */
d300d9fa
NC
2248#ifdef PERL_OLD_COPY_ON_WRITE
2249 if (SvIsCOW(sv))
2250 sv_force_normal_flags(sv, 0);
2251#endif
3dab1dad 2252 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2253 &PL_vtbl_mglob, NULL, 0);
d6a28714 2254 mg->mg_len = -1;
b8c5462f 2255 }
d6a28714
JH
2256 PL_reg_magic = mg;
2257 PL_reg_oldpos = mg->mg_len;
4f639d21 2258 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2259 }
09687e5a 2260 if (!PL_reg_curpm) {
a02a5408 2261 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2262#ifdef USE_ITHREADS
2263 {
be8e71aa 2264 SV* const repointer = newSViv(0);
577e12cc 2265 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2266 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2267 av_push(PL_regex_padav,repointer);
2268 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2269 PL_regex_pad = AvARRAY(PL_regex_padav);
2270 }
2271#endif
2272 }
aaa362c4 2273 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2274 PL_reg_oldcurpm = PL_curpm;
2275 PL_curpm = PL_reg_curpm;
2276 if (RX_MATCH_COPIED(prog)) {
2277 /* Here is a serious problem: we cannot rewrite subbeg,
2278 since it may be needed if this match fails. Thus
2279 $` inside (?{}) could fail... */
2280 PL_reg_oldsaved = prog->subbeg;
2281 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2282#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2283 PL_nrs = prog->saved_copy;
2284#endif
d6a28714
JH
2285 RX_MATCH_COPIED_off(prog);
2286 }
2287 else
bd61b366 2288 PL_reg_oldsaved = NULL;
d6a28714
JH
2289 prog->subbeg = PL_bostr;
2290 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2291 }
973dddac 2292 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2293 PL_reginput = startpos;
2294 PL_regstartp = prog->startp;
2295 PL_regendp = prog->endp;
2296 PL_reglastparen = &prog->lastparen;
a01268b5 2297 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2298 prog->lastparen = 0;
03994de8 2299 prog->lastcloseparen = 0;
d6a28714 2300 PL_regsize = 0;
a3621e74 2301 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2302 if (PL_reg_start_tmpl <= prog->nparens) {
2303 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2304 if(PL_reg_start_tmp)
2305 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2306 else
a02a5408 2307 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2308 }
2309
2310 /* XXXX What this code is doing here?!!! There should be no need
2311 to do this again and again, PL_reglastparen should take care of
3dd2943c 2312 this! --ilya*/
dafc8851
JH
2313
2314 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2315 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2316 * PL_reglastparen), is not needed at all by the test suite
2317 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2318 * enough, for building DynaLoader, or otherwise this
2319 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2320 * will happen. Meanwhile, this code *is* needed for the
2321 * above-mentioned test suite tests to succeed. The common theme
2322 * on those tests seems to be returning null fields from matches.
2323 * --jhi */
dafc8851 2324#if 1
d6a28714
JH
2325 sp = prog->startp;
2326 ep = prog->endp;
2327 if (prog->nparens) {
097eb12c 2328 register I32 i;
eb160463 2329 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2330 *++sp = -1;
2331 *++ep = -1;
2332 }
2333 }
dafc8851 2334#endif
02db2b7b 2335 REGCP_SET(lastcp);
3b0527fe 2336 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2337 prog->endp[0] = PL_reginput - PL_bostr;
2338 return 1;
2339 }
02db2b7b 2340 REGCP_UNWIND(lastcp);
d6a28714
JH
2341 return 0;
2342}
2343
02db2b7b 2344
8ba1375e
MJD
2345#define sayYES goto yes
2346#define sayNO goto no
e0f9d4a8 2347#define sayNO_ANYOF goto no_anyof
8ba1375e 2348#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2349#define sayNO_FINAL goto no_final
2350#define sayNO_SILENT goto do_no
2351#define saySAME(x) if (x) goto yes; else goto no
2352
3ab3c9b4
HS
2353#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2354#define POSCACHE_SEEN 1 /* we know what we're caching */
2355#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2356
3ab3c9b4 2357#define CACHEsayYES STMT_START { \
d8319b27 2358 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2359 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2360 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2361 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2362 } \
2363 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2364 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2365 } \
2366 else { \
3ab3c9b4
HS
2367 /* cache records failure, but this is success */ \
2368 DEBUG_r( \
2369 PerlIO_printf(Perl_debug_log, \
2370 "%*s (remove success from failure cache)\n", \
2371 REPORT_CODE_OFF+PL_regindent*2, "") \
2372 ); \
d8319b27 2373 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2374 } \
2375 } \
2376 sayYES; \
2377} STMT_END
7409bbd3 2378
3ab3c9b4 2379#define CACHEsayNO STMT_START { \
d8319b27 2380 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2381 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2382 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2383 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2384 } \
2385 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2386 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2387 } \
2388 else { \
3ab3c9b4
HS
2389 /* cache records success, but this is failure */ \
2390 DEBUG_r( \
2391 PerlIO_printf(Perl_debug_log, \
2392 "%*s (remove failure from success cache)\n", \
2393 REPORT_CODE_OFF+PL_regindent*2, "") \
2394 ); \
d8319b27 2395 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2396 } \
2397 } \
2398 sayNO; \
2399} STMT_END
2400
a3621e74
YO
2401/* this is used to determine how far from the left messages like
2402 'failed...' are printed. Currently 29 makes these messages line
2403 up with the opcode they refer to. Earlier perls used 25 which
2404 left these messages outdented making reviewing a debug output
2405 quite difficult.
2406*/
2407#define REPORT_CODE_OFF 29
2408
2409
2410/* Make sure there is a test for this +1 options in re_tests */
2411#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2412
40a82448
DM
2413#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2414#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2415
86545054
DM
2416#define SLAB_FIRST(s) (&(s)->states[0])
2417#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2418
5d9a96ca
DM
2419/* grab a new slab and return the first slot in it */
2420
2421STATIC regmatch_state *
2422S_push_slab(pTHX)
2423{
54df2634
NC
2424#if PERL_VERSION < 9
2425 dMY_CXT;
2426#endif
5d9a96ca
DM
2427 regmatch_slab *s = PL_regmatch_slab->next;
2428 if (!s) {
2429 Newx(s, 1, regmatch_slab);
2430 s->prev = PL_regmatch_slab;
2431 s->next = NULL;
2432 PL_regmatch_slab->next = s;
2433 }
2434 PL_regmatch_slab = s;
86545054 2435 return SLAB_FIRST(s);
5d9a96ca 2436}
5b47454d 2437
95b24440
DM
2438/* simulate a recursive call to regmatch */
2439
2440#define REGMATCH(ns, where) \
5d9a96ca
DM
2441 st->scan = scan; \
2442 scan = (ns); \
2443 st->resume_state = resume_##where; \
95b24440
DM
2444 goto start_recurse; \
2445 resume_point_##where:
2446
40a82448
DM
2447/* push a new state then goto it */
2448
2449#define PUSH_STATE_GOTO(state, node) \
2450 scan = node; \
2451 st->resume_state = state; \
2452 goto push_state;
2453
2454/* push a new state with success backtracking, then goto it */
2455
2456#define PUSH_YES_STATE_GOTO(state, node) \
2457 scan = node; \
2458 st->resume_state = state; \
2459 goto push_yes_state;
2460
aa283a38 2461
aa283a38 2462
d6a28714
JH
2463/*
2464 - regmatch - main matching routine
2465 *
2466 * Conceptually the strategy is simple: check to see whether the current
2467 * node matches, call self recursively to see whether the rest matches,
2468 * and then act accordingly. In practice we make some effort to avoid
2469 * recursion, in particular by going through "ordinary" nodes (that don't
2470 * need to know whether the rest of the match failed) by a loop instead of
2471 * by recursion.
2472 */
2473/* [lwall] I've hoisted the register declarations to the outer block in order to
2474 * maybe save a little bit of pushing and popping on the stack. It also takes
2475 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2476 *
2477 * This function used to be heavily recursive, but since this had the
2478 * effect of blowing the CPU stack on complex regexes, it has been
2479 * restructured to be iterative, and to save state onto the heap rather
2480 * than the stack. Essentially whereever regmatch() used to be called, it
2481 * pushes the current state, notes where to return, then jumps back into
2482 * the main loop.
2483 *
2484 * Originally the structure of this function used to look something like
2485
2486 S_regmatch() {
2487 int a = 1, b = 2;
2488 ...
2489 while (scan != NULL) {
5d9a96ca 2490 a++; // do stuff with a and b
95b24440
DM
2491 ...
2492 switch (OP(scan)) {
2493 case FOO: {
2494 int local = 3;
2495 ...
2496 if (regmatch(...)) // recurse
2497 goto yes;
2498 }
2499 ...
2500 }
2501 }
2502 yes:
2503 return 1;
2504 }
2505
2506 * Now it looks something like this:
2507
5d9a96ca 2508 typedef struct {
95b24440
DM
2509 int a, b, local;
2510 int resume_state;
5d9a96ca 2511 } regmatch_state;
95b24440
DM
2512
2513 S_regmatch() {
5d9a96ca
DM
2514 regmatch_state *st = new();
2515 int depth=0;
2516 st->a++; // do stuff with a and b
95b24440
DM
2517 ...
2518 while (scan != NULL) {
2519 ...
2520 switch (OP(scan)) {
2521 case FOO: {
5d9a96ca 2522 st->local = 3;
95b24440 2523 ...
5d9a96ca
DM
2524 st->scan = scan;
2525 scan = ...;
2526 st->resume_state = resume_FOO;
2527 goto start_recurse; // recurse
95b24440 2528
5d9a96ca
DM
2529 resume_point_FOO:
2530 if (result)
95b24440
DM
2531 goto yes;
2532 }
2533 ...
2534 }
5d9a96ca
DM
2535 start_recurse:
2536 st = new(); push a new state
2537 st->a = 1; st->b = 2;
2538 depth++;
95b24440 2539 }
5d9a96ca 2540 yes:
95b24440 2541 result = 1;
5d9a96ca
DM
2542 if (depth--) {
2543 st = pop();
95b24440
DM
2544 switch (resume_state) {
2545 case resume_FOO:
2546 goto resume_point_FOO;
2547 ...
2548 }
2549 }
2550 return result
2551 }
2552
2553 * WARNING: this means that any line in this function that contains a
2554 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2555 * regmatch() using gotos instead. Thus the values of any local variables
2556 * not saved in the regmatch_state structure will have been lost when
2557 * execution resumes on the next line .
5d9a96ca
DM
2558 *
2559 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2560 * PL_regmatch_state always points to the currently active state, and
2561 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2562 * The first time regmatch is called, the first slab is allocated, and is
2563 * never freed until interpreter desctruction. When the slab is full,
2564 * a new one is allocated chained to the end. At exit from regmatch, slabs
2565 * allocated since entry are freed.
d6a28714 2566 */
95b24440 2567
40a82448 2568/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2569#define TRIE_next (REGNODE_MAX+1)
2570#define TRIE_next_fail (REGNODE_MAX+2)
2571#define EVAL_A (REGNODE_MAX+3)
2572#define EVAL_A_fail (REGNODE_MAX+4)
2573#define resume_CURLYX (REGNODE_MAX+5)
2574#define resume_WHILEM1 (REGNODE_MAX+6)
2575#define resume_WHILEM2 (REGNODE_MAX+7)
2576#define resume_WHILEM3 (REGNODE_MAX+8)
2577#define resume_WHILEM4 (REGNODE_MAX+9)
2578#define resume_WHILEM5 (REGNODE_MAX+10)
2579#define resume_WHILEM6 (REGNODE_MAX+11)
2580#define BRANCH_next (REGNODE_MAX+12)
2581#define BRANCH_next_fail (REGNODE_MAX+13)
2582#define CURLYM_A (REGNODE_MAX+14)
2583#define CURLYM_A_fail (REGNODE_MAX+15)
2584#define CURLYM_B (REGNODE_MAX+16)
2585#define CURLYM_B_fail (REGNODE_MAX+17)
2586#define IFMATCH_A (REGNODE_MAX+18)
2587#define IFMATCH_A_fail (REGNODE_MAX+19)
2588#define CURLY_B_min_known (REGNODE_MAX+20)
2589#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2590#define CURLY_B_min (REGNODE_MAX+22)
2591#define CURLY_B_min_fail (REGNODE_MAX+23)
2592#define CURLY_B_max (REGNODE_MAX+24)
2593#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2594
2595
3dab1dad 2596#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2597
3df15adc
YO
2598#ifdef DEBUGGING
2599
2600STATIC void
07be1b83
YO
2601S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2602{
2603 const int docolor = *PL_colors[0];
2604 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2605 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2606 /* The part of the string before starttry has one color
2607 (pref0_len chars), between starttry and current
2608 position another one (pref_len - pref0_len chars),
2609 after the current position the third one.
2610 We assume that pref0_len <= pref_len, otherwise we
2611 decrease pref0_len. */
2612 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2613 ? (5 + taill) - l : locinput - PL_bostr;
2614 int pref0_len;
2615
2616 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2617 pref_len++;
2618 pref0_len = pref_len - (locinput - PL_reg_starttry);
2619 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2620 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2621 ? (5 + taill) - pref_len : PL_regeol - locinput);
2622 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2623 l--;
2624 if (pref0_len < 0)
2625 pref0_len = 0;
2626 if (pref0_len > pref_len)
2627 pref0_len = pref_len;
2628 {
3df15adc 2629 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d
YO
2630
2631 RE_PV_DISPLAY_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2632 (locinput - pref_len),pref0_len, 60);
2633
2634 RE_PV_DISPLAY_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2635 (locinput - pref_len + pref0_len),
0df25f3d
YO
2636 pref_len - pref0_len, 60);
2637
2638 RE_PV_DISPLAY_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2639 locinput, PL_regeol - locinput, 60);
2640
3df15adc 2641 PerlIO_printf(Perl_debug_log,
07be1b83
YO
2642 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2643 (IV)(locinput - PL_bostr),
2644 PL_colors[4],
2645 len0, s0,
2646 PL_colors[5],
2647 PL_colors[2],
2648 len1, s1,
2649 PL_colors[3],
2650 (docolor ? "" : "> <"),
2651 PL_colors[0],
2652 len2, s2,
2653 PL_colors[1],
2654 15 - l - pref_len + 1,
2655 "");
2656 }
2657}
3df15adc 2658
07be1b83
YO
2659#endif
2660
d6a28714 2661STATIC I32 /* 0 failure, 1 success */
3b0527fe 2662S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2663{
54df2634
NC
2664#if PERL_VERSION < 9
2665 dMY_CXT;
2666#endif
27da23d5 2667 dVAR;
95b24440 2668 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2669 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2670
3b0527fe
DM
2671 regexp *rex = reginfo->prog;
2672
5d9a96ca
DM
2673 regmatch_slab *orig_slab;
2674 regmatch_state *orig_state;
a3621e74 2675
5d9a96ca
DM
2676 /* the current state. This is a cached copy of PL_regmatch_state */
2677 register regmatch_state *st;
95b24440 2678
5d9a96ca
DM
2679 /* cache heavy used fields of st in registers */
2680 register regnode *scan;
2681 register regnode *next;
2682 register I32 n = 0; /* initialize to shut up compiler warning */
2683 register char *locinput = PL_reginput;
95b24440 2684
5d9a96ca
DM
2685 /* these variables are NOT saved during a recusive RFEGMATCH: */
2686 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2687 bool result; /* return value of S_regmatch */
5d9a96ca 2688 int depth = 0; /* depth of recursion */
77cb431f
DM
2689 regmatch_state *yes_state = NULL; /* state to pop to on success of
2690 subpattern */
40a82448 2691 U32 state_num;
95b24440
DM
2692
2693#ifdef DEBUGGING
e68ec53f 2694 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2695 PL_regindent++;
2696#endif
2697
5d9a96ca
DM
2698 /* on first ever call to regmatch, allocate first slab */
2699 if (!PL_regmatch_slab) {
2700 Newx(PL_regmatch_slab, 1, regmatch_slab);
2701 PL_regmatch_slab->prev = NULL;
2702 PL_regmatch_slab->next = NULL;
86545054 2703 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2704 }
2705
2706 /* remember current high-water mark for exit */
2707 /* XXX this should be done with SAVE* instead */
2708 orig_slab = PL_regmatch_slab;
2709 orig_state = PL_regmatch_state;
2710
2711 /* grab next free state slot */
2712 st = ++PL_regmatch_state;
86545054 2713 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2714 st = PL_regmatch_state = S_push_slab(aTHX);
2715
2716 st->minmod = 0;
2717 st->sw = 0;
2718 st->logical = 0;
5d9a96ca 2719 st->cc = NULL;
d6a28714
JH
2720 /* Note that nextchr is a byte even in UTF */
2721 nextchr = UCHARAT(locinput);
2722 scan = prog;
2723 while (scan != NULL) {
8ba1375e 2724
a3621e74 2725 DEBUG_EXECUTE_r( {
6136c704 2726 SV * const prop = sv_newmortal();
07be1b83 2727 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2728 regprop(rex, prop, scan);
07be1b83
YO
2729
2730 PerlIO_printf(Perl_debug_log,
2731 "%3"IVdf":%*s%s(%"IVdf")\n",
2732 (IV)(scan - rex->program), PL_regindent*2, "",
2733 SvPVX_const(prop),
2734 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2735 });
d6a28714
JH
2736
2737 next = scan + NEXT_OFF(scan);
2738 if (next == scan)
2739 next = NULL;
40a82448 2740 state_num = OP(scan);
d6a28714 2741
40a82448
DM
2742 reenter_switch:
2743 switch (state_num) {
d6a28714 2744 case BOL:
7fba1cd6 2745 if (locinput == PL_bostr)
d6a28714 2746 {
3b0527fe 2747 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2748 break;
2749 }
d6a28714
JH
2750 sayNO;
2751 case MBOL:
12d33761
HS
2752 if (locinput == PL_bostr ||
2753 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2754 {
b8c5462f
JH
2755 break;
2756 }
d6a28714
JH
2757 sayNO;
2758 case SBOL:
c2a73568 2759 if (locinput == PL_bostr)
b8c5462f 2760 break;
d6a28714
JH
2761 sayNO;
2762 case GPOS:
3b0527fe 2763 if (locinput == reginfo->ganch)
d6a28714
JH
2764 break;
2765 sayNO;
2766 case EOL:
d6a28714
JH
2767 goto seol;
2768 case MEOL:
d6a28714 2769 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2770 sayNO;
b8c5462f 2771 break;
d6a28714
JH
2772 case SEOL:
2773 seol:
2774 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2775 sayNO;
d6a28714 2776 if (PL_regeol - locinput > 1)
b8c5462f 2777 sayNO;
b8c5462f 2778 break;
d6a28714
JH
2779 case EOS:
2780 if (PL_regeol != locinput)
b8c5462f 2781 sayNO;
d6a28714 2782 break;
ffc61ed2 2783 case SANY:
d6a28714 2784 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2785 sayNO;
f33976b4
DB
2786 if (do_utf8) {
2787 locinput += PL_utf8skip[nextchr];
2788 if (locinput > PL_regeol)
2789 sayNO;
2790 nextchr = UCHARAT(locinput);
2791 }
2792 else
2793 nextchr = UCHARAT(++locinput);
2794 break;
2795 case CANY:
2796 if (!nextchr && locinput >= PL_regeol)
2797 sayNO;
b8c5462f 2798 nextchr = UCHARAT(++locinput);
a0d0e21e 2799 break;
ffc61ed2 2800 case REG_ANY:
1aa99e6b
IH
2801 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2802 sayNO;
2803 if (do_utf8) {
b8c5462f 2804 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2805 if (locinput > PL_regeol)
2806 sayNO;
a0ed51b3 2807 nextchr = UCHARAT(locinput);
a0ed51b3 2808 }
1aa99e6b
IH
2809 else
2810 nextchr = UCHARAT(++locinput);
a0ed51b3 2811 break;
166ba7cd
DM
2812
2813#undef ST
2814#define ST st->u.trie
2815
5b47454d 2816 case TRIE:
3dab1dad 2817 {
07be1b83 2818 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2819 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2820 trie_type = do_utf8 ?
2821 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2822 : trie_plain;
2823
2824 /* what trie are we using right now */
be8e71aa 2825 reg_trie_data * const trie
3dab1dad
YO
2826 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2827 U32 state = trie->startstate;
166ba7cd
DM
2828
2829 U8 *uc = ( U8* )locinput;
2830 U16 charid = 0;
2831 U32 base = 0;
2832 UV uvc = 0;
2833 STRLEN len = 0;
2834 STRLEN foldlen = 0;
2835 U8 *uscan = (U8*)NULL;
2836 STRLEN bufflen=0;
2837 SV *sv_accept_buff = NULL;
2838 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2839
2840 ST.accepted = 0; /* how many accepting states we have seen */
2841 ST.B = next;
2842#ifdef DEBUGGING
2843 ST.me = scan;
2844#endif
3dab1dad
YO
2845
2846 if (trie->bitmap && trie_type != trie_utf8_fold &&
2847 !TRIE_BITMAP_TEST(trie,*locinput)
2848 ) {
2849 if (trie->states[ state ].wordnum) {
2850 DEBUG_EXECUTE_r(
2851 PerlIO_printf(Perl_debug_log,
2852 "%*s %smatched empty string...%s\n",
2853 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2854 );
2855 break;
2856 } else {
2857 DEBUG_EXECUTE_r(
2858 PerlIO_printf(Perl_debug_log,
2859 "%*s %sfailed to match start class...%s\n",
2860 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2861 );
2862 sayNO_SILENT;
2863 }
2864 }
166ba7cd 2865
07be1b83
YO
2866 /*
2867 traverse the TRIE keeping track of all accepting states
2868 we transition through until we get to a failing node.
2869 */
2870
a3621e74
YO
2871 while ( state && uc <= (U8*)PL_regeol ) {
2872
5b47454d 2873 if (trie->states[ state ].wordnum) {
166ba7cd 2874 if (!ST.accepted ) {
5b47454d
DM
2875 ENTER;
2876 SAVETMPS;
2877 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2878 sv_accept_buff=newSV(bufflen *
2879 sizeof(reg_trie_accepted) - 1);
2880 SvCUR_set(sv_accept_buff,
2881 sizeof(reg_trie_accepted));
2882 SvPOK_on(sv_accept_buff);
2883 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2884 SAVETMPS;
2885 ST.accept_buff =
5b47454d
DM
2886 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2887 }
2888 else {
166ba7cd 2889 if (ST.accepted >= bufflen) {
5b47454d 2890 bufflen *= 2;
166ba7cd 2891 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2892 SvGROW(sv_accept_buff,
2893 bufflen * sizeof(reg_trie_accepted));
2894 }
2895 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2896 + sizeof(reg_trie_accepted));
2897 }
166ba7cd
DM
2898 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2899 ST.accept_buff[ST.accepted].endpos = uc;
2900 ++ST.accepted;
5b47454d 2901 }
a3621e74
YO
2902
2903 base = trie->states[ state ].trans.base;
2904
07be1b83
YO
2905 DEBUG_TRIE_EXECUTE_r({
2906 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2907 PerlIO_printf( Perl_debug_log,
e4584336 2908 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2909 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2910 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2911 });
a3621e74
YO
2912
2913 if ( base ) {
5b47454d 2914 switch (trie_type) {
3dab1dad 2915 case trie_utf8_fold:
a3621e74
YO
2916 if ( foldlen>0 ) {
2917 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2918 foldlen -= len;
2919 uscan += len;
2920 len=0;
2921 } else {
2922 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2923 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2924 foldlen -= UNISKIP( uvc );
2925 uscan = foldbuf + UNISKIP( uvc );
2926 }
5b47454d
DM
2927 break;
2928 case trie_utf8:
2929 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2930 &len, uniflags );
2931 break;
2932 case trie_plain:
e4584336 2933 uvc = (UV)*uc;
a3621e74
YO
2934 len = 1;
2935 }
2936
5b47454d
DM
2937 if (uvc < 256) {
2938 charid = trie->charmap[ uvc ];
2939 }
2940 else {
2941 charid = 0;
2942 if (trie->widecharmap) {
3dab1dad 2943 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2944 (char*)&uvc, sizeof(UV), 0);
2945 if (svpp)
2946 charid = (U16)SvIV(*svpp);
2947 }
2948 }
a3621e74 2949
5b47454d
DM
2950 if (charid &&
2951 (base + charid > trie->uniquecharcount )
2952 && (base + charid - 1 - trie->uniquecharcount
2953 < trie->lasttrans)
2954 && trie->trans[base + charid - 1 -
2955 trie->uniquecharcount].check == state)
2956 {
2957 state = trie->trans[base + charid - 1 -
2958 trie->uniquecharcount ].next;
2959 }
2960 else {
2961 state = 0;
2962 }
2963 uc += len;
2964
2965 }
2966 else {
a3621e74
YO
2967 state = 0;
2968 }
2969 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2970 PerlIO_printf( Perl_debug_log,
2971 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2972 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2973 );
2974 }
166ba7cd 2975 if (!ST.accepted )
a3621e74 2976 sayNO;
a3621e74 2977
166ba7cd
DM
2978 DEBUG_EXECUTE_r(
2979 PerlIO_printf( Perl_debug_log,
2980 "%*s %sgot %"IVdf" possible matches%s\n",
2981 REPORT_CODE_OFF + PL_regindent * 2, "",
2982 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2983 );
2984 }
2985
2986 /* FALL THROUGH */
2987
2988 case TRIE_next_fail: /* we failed - try next alterative */
2989
2990 if ( ST.accepted == 1 ) {
2991 /* only one choice left - just continue */
2992 DEBUG_EXECUTE_r({
2993 reg_trie_data * const trie
2994 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2995 SV ** const tmp = RX_DEBUG(reginfo->prog)
2996 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2997 : NULL;
2998 PerlIO_printf( Perl_debug_log,
2999 "%*s %sonly one match left: #%d <%s>%s\n",
3000 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3001 ST.accept_buff[ 0 ].wordnum,
3002 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3003 PL_colors[5] );
3004 });
3005 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3006 /* in this case we free tmps/leave before we call regmatch
3007 as we wont be using accept_buff again. */
3008 FREETMPS;
3009 LEAVE;
3010 locinput = PL_reginput;
3011 nextchr = UCHARAT(locinput);
3012 scan = ST.B;
3013 continue; /* execute rest of RE */
3014 }
3015
3016 if (!ST.accepted-- ) {
3017 FREETMPS;
3018 LEAVE;
3019 sayNO;
3020 }
3021
a3621e74 3022 /*
166ba7cd
DM
3023 There are at least two accepting states left. Presumably
3024 the number of accepting states is going to be low,
3025 typically two. So we simply scan through to find the one
3026 with lowest wordnum. Once we find it, we swap the last
3027 state into its place and decrement the size. We then try to
3028 match the rest of the pattern at the point where the word
3029 ends. If we succeed, control just continues along the
3030 regex; if we fail we return here to try the next accepting
3031 state
3032 */
a3621e74 3033
166ba7cd
DM
3034 {
3035 U32 best = 0;
3036 U32 cur;
3037 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3038 DEBUG_TRIE_EXECUTE_r(
f2278c82 3039 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
3040 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3041 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
3042 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3043 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3044 );
3045
3046 if (ST.accept_buff[cur].wordnum <
3047 ST.accept_buff[best].wordnum)
3048 best = cur;
a3621e74 3049 }
166ba7cd
DM
3050
3051 DEBUG_EXECUTE_r({
3052 reg_trie_data * const trie
3053 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
3054 SV ** const tmp = RX_DEBUG(reginfo->prog)
3055 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
3056 : NULL;
3057 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3058 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3059 ST.accept_buff[best].wordnum,
3060 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3061 PL_colors[5] );
3062 });
3063
3064 if ( best<ST.accepted ) {
3065 reg_trie_accepted tmp = ST.accept_buff[ best ];
3066 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3067 ST.accept_buff[ ST.accepted ] = tmp;
3068 best = ST.accepted;
a3621e74 3069 }
166ba7cd
DM
3070 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3071 }
3072 PUSH_STATE_GOTO(TRIE_next, ST.B);
3073 /* NOTREACHED */
3074
3075#undef ST
3076
95b24440
DM
3077 case EXACT: {
3078 char *s = STRING(scan);
5d9a96ca 3079 st->ln = STR_LEN(scan);
eb160463 3080 if (do_utf8 != UTF) {
bc517b45 3081 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3082 char *l = locinput;
be8e71aa 3083 const char * const e = s + st->ln;
a72c7584 3084
5ff6fc6d
JH
3085 if (do_utf8) {
3086 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3087 while (s < e) {
a3b680e6 3088 STRLEN ulen;
1aa99e6b 3089 if (l >= PL_regeol)
5ff6fc6d
JH
3090 sayNO;
3091 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3092 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3093 uniflags))
5ff6fc6d 3094 sayNO;
bc517b45 3095 l += ulen;
5ff6fc6d 3096 s ++;
1aa99e6b 3097 }
5ff6fc6d
JH
3098 }
3099 else {
3100 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3101 while (s < e) {
a3b680e6 3102 STRLEN ulen;
1aa99e6b
IH
3103 if (l >= PL_regeol)
3104 sayNO;
5ff6fc6d 3105 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3106 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3107 uniflags))
1aa99e6b 3108 sayNO;
bc517b45 3109 s += ulen;
a72c7584 3110 l ++;
1aa99e6b 3111 }
5ff6fc6d 3112 }
1aa99e6b
IH
3113 locinput = l;
3114 nextchr = UCHARAT(locinput);
3115 break;
3116 }
bc517b45 3117 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3118 /* Inline the first character, for speed. */
3119 if (UCHARAT(s) != nextchr)
3120 sayNO;
5d9a96ca 3121 if (PL_regeol - locinput < st->ln)
d6a28714 3122 sayNO;
5d9a96ca 3123 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3124 sayNO;
5d9a96ca 3125 locinput += st->ln;
d6a28714
JH
3126 nextchr = UCHARAT(locinput);
3127 break;
95b24440 3128 }
d6a28714 3129 case EXACTFL:
b8c5462f
JH
3130 PL_reg_flags |= RF_tainted;
3131 /* FALL THROUGH */
95b24440 3132 case EXACTF: {
be8e71aa 3133 char * const s = STRING(scan);
5d9a96ca 3134 st->ln = STR_LEN(scan);
d6a28714 3135
d07ddd77
JH
3136 if (do_utf8 || UTF) {
3137 /* Either target or the pattern are utf8. */
be8e71aa 3138 const char * const l = locinput;
d07ddd77 3139 char *e = PL_regeol;
bc517b45 3140
5d9a96ca 3141 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3142 l, &e, 0, do_utf8)) {
5486206c
JH
3143 /* One more case for the sharp s:
3144 * pack("U0U*", 0xDF) =~ /ss/i,
3145 * the 0xC3 0x9F are the UTF-8
3146 * byte sequence for the U+00DF. */
3147 if (!(do_utf8 &&
3148 toLOWER(s[0]) == 's' &&
5d9a96ca 3149 st->ln >= 2 &&
5486206c
JH
3150 toLOWER(s[1]) == 's' &&
3151 (U8)l[0] == 0xC3 &&
3152 e - l >= 2 &&
3153 (U8)l[1] == 0x9F))
3154 sayNO;
3155 }
d07ddd77
JH
3156 locinput = e;
3157 nextchr = UCHARAT(locinput);
3158 break;
a0ed51b3 3159 }
d6a28714 3160
bc517b45
JH
3161 /* Neither the target and the pattern are utf8. */
3162
d6a28714
JH
3163 /* Inline the first character, for speed. */
3164 if (UCHARAT(s) != nextchr &&
3165 UCHARAT(s) != ((OP(scan) == EXACTF)
3166 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3167 sayNO;
5d9a96ca 3168 if (PL_regeol - locinput < st->ln)
b8c5462f 3169 sayNO;
5d9a96ca
DM
3170 if (st->ln > 1 && (OP(scan) == EXACTF
3171 ? ibcmp(s, locinput, st->ln)
3172 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3173 sayNO;
5d9a96ca 3174 locinput += st->ln;
d6a28714 3175 nextchr = UCHARAT(locinput);
a0d0e21e 3176 break;
95b24440 3177 }
d6a28714 3178 case ANYOF:
ffc61ed2 3179 if (do_utf8) {
9e55ce06
JH
3180 STRLEN inclasslen = PL_regeol - locinput;
3181
32fc9b6a 3182 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3183 sayNO_ANYOF;
ffc61ed2
JH
3184 if (locinput >= PL_regeol)
3185 sayNO;
0f0076b4 3186 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3187 nextchr = UCHARAT(locinput);
e0f9d4a8 3188 break;
ffc61ed2
JH
3189 }
3190 else {
3191 if (nextchr < 0)
3192 nextchr = UCHARAT(locinput);
32fc9b6a 3193 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3194 sayNO_ANYOF;
ffc61ed2
JH
3195 if (!nextchr && locinput >= PL_regeol)
3196 sayNO;
3197 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3198 break;
3199 }
3200 no_anyof:
3201 /* If we might have the case of the German sharp s
3202 * in a casefolding Unicode character class. */
3203
ebc501f0
JH
3204 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3205 locinput += SHARP_S_SKIP;
e0f9d4a8 3206 nextchr = UCHARAT(locinput);
ffc61ed2 3207 }
e0f9d4a8
JH
3208 else
3209 sayNO;
b8c5462f 3210 break;
d6a28714 3211 case ALNUML:
b8c5462f
JH
3212 PL_reg_flags |= RF_tainted;
3213 /* FALL THROUGH */
d6a28714 3214 case ALNUM:
b8c5462f 3215 if (!nextchr)
4633a7c4 3216 sayNO;
ffc61ed2 3217 if (do_utf8) {
1a4fad37 3218 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3219 if (!(OP(scan) == ALNUM
bb7a0f54 3220 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3221 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3222 {
3223 sayNO;
a0ed51b3 3224 }
b8c5462f 3225 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3226 nextchr = UCHARAT(locinput);
3227 break;
3228 }
ffc61ed2 3229 if (!(OP(scan) == ALNUM
d6a28714 3230 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3231 sayNO;
b8c5462f 3232 nextchr = UCHARAT(++locinput);
a0d0e21e 3233 break;
d6a28714 3234 case NALNUML:
b8c5462f
JH
3235 PL_reg_flags |= RF_tainted;
3236 /* FALL THROUGH */
d6a28714
JH
3237 case NALNUM:
3238 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3239 sayNO;
ffc61ed2 3240 if (do_utf8) {
1a4fad37 3241 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3242 if (OP(scan) == NALNUM
bb7a0f54 3243 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3244 : isALNUM_LC_utf8((U8*)locinput))
3245 {
b8c5462f 3246 sayNO;
d6a28714 3247 }
b8c5462f
JH
3248 locinput += PL_utf8skip[nextchr];
3249 nextchr = UCHARAT(locinput);
3250 break;
3251 }
ffc61ed2 3252 if (OP(scan) == NALNUM
d6a28714 3253 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3254 sayNO;
76e3520e 3255 nextchr = UCHARAT(++locinput);
a0d0e21e 3256 break;
d6a28714
JH
3257 case BOUNDL:
3258 case NBOUNDL:
3280af22 3259 PL_reg_flags |= RF_tainted;
bbce6d69 3260 /* FALL THROUGH */
d6a28714
JH
3261 case BOUND:
3262 case NBOUND:
3263 /* was last char in word? */
ffc61ed2 3264 if (do_utf8) {
12d33761 3265 if (locinput == PL_bostr)
5d9a96ca 3266 st->ln = '\n';
ffc61ed2 3267 else {
a3b680e6 3268 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3269
4ad0818d 3270 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3271 }
3272 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3273 st->ln = isALNUM_uni(st->ln);
1a4fad37 3274 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3275 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3276 }
3277 else {
5d9a96ca 3278 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3279 n = isALNUM_LC_utf8((U8*)locinput);
3280 }
a0ed51b3 3281 }
d6a28714 3282 else {
5d9a96ca 3283 st->ln = (locinput != PL_bostr) ?
12d33761 3284 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3285 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3286 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3287 n = isALNUM(nextchr);
3288 }
3289 else {
5d9a96ca 3290 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3291 n = isALNUM_LC(nextchr);
3292 }
d6a28714 3293 }
5d9a96ca 3294 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3295 OP(scan) == BOUNDL))
3296 sayNO;
a0ed51b3 3297 break;
d6a28714 3298 case SPACEL:
3280af22 3299 PL_reg_flags |= RF_tainted;
bbce6d69 3300 /* FALL THROUGH */
d6a28714 3301 case SPACE:
9442cb0e 3302 if (!nextchr)
4633a7c4 3303 sayNO;
1aa99e6b 3304 if (do_utf8) {
fd400ab9 3305 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3306 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3307 if (!(OP(scan) == SPACE
bb7a0f54 3308 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3309 : isSPACE_LC_utf8((U8*)locinput)))
3310 {
3311 sayNO;
3312 }
3313 locinput += PL_utf8skip[nextchr];
3314 nextchr = UCHARAT(locinput);
3315 break;
d6a28714 3316 }
ffc61ed2
JH
3317 if (!(OP(scan) == SPACE
3318 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3319 sayNO;
3320 nextchr = UCHARAT(++locinput);
3321 }
3322 else {
3323 if (!(OP(scan) == SPACE
3324 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3325 sayNO;
3326 nextchr = UCHARAT(++locinput);
a0ed51b3 3327 }
a0ed51b3 3328 break;
d6a28714 3329 case NSPACEL:
3280af22 3330 PL_reg_flags |= RF_tainted;
bbce6d69 3331 /* FALL THROUGH */
d6a28714 3332 case NSPACE:
9442cb0e 3333 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3334 sayNO;
1aa99e6b 3335 if (do_utf8) {
1a4fad37 3336 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3337 if (OP(scan) == NSPACE
bb7a0f54 3338 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3339 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3340 {
3341 sayNO;
3342 }
3343 locinput += PL_utf8skip[nextchr];
3344 nextchr = UCHARAT(locinput);
3345 break;
a0ed51b3 3346 }
ffc61ed2 3347 if (OP(scan) == NSPACE
d6a28714 3348 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3349 sayNO;
76e3520e 3350 nextchr = UCHARAT(++locinput);
a0d0e21e 3351 break;
d6a28714 3352 case DIGITL:
a0ed51b3
LW
3353 PL_reg_flags |= RF_tainted;
3354 /* FALL THROUGH */
d6a28714 3355 case DIGIT:
9442cb0e 3356 if (!nextchr)
a0ed51b3 3357 sayNO;
1aa99e6b 3358 if (do_utf8) {
1a4fad37 3359 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3360 if (!(OP(scan) == DIGIT
bb7a0f54 3361 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3362 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3363 {
a0ed51b3 3364 sayNO;
dfe13c55 3365 }
6f06b55f 3366 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3367 nextchr = UCHARAT(locinput);
3368 break;
3369 }
ffc61ed2 3370 if (!(OP(scan) == DIGIT
9442cb0e 3371 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3372 sayNO;
3373 nextchr = UCHARAT(++locinput);
3374 break;
d6a28714 3375 case NDIGITL:
b8c5462f
JH
3376 PL_reg_flags |= RF_tainted;
3377 /* FALL THROUGH */
d6a28714 3378 case NDIGIT:
9442cb0e 3379 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3380 sayNO;
1aa99e6b 3381 if (do_utf8) {
1a4fad37 3382 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3383 if (OP(scan) == NDIGIT
bb7a0f54 3384 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3385 : isDIGIT_LC_utf8((U8*)locinput))
3386 {
a0ed51b3 3387 sayNO;
9442cb0e 3388 }
6f06b55f 3389 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3390 nextchr = UCHARAT(locinput);
3391 break;
3392 }
ffc61ed2 3393 if (OP(scan) == NDIGIT
9442cb0e 3394 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3395 sayNO;
3396 nextchr = UCHARAT(++locinput);
3397 break;
3398 case CLUMP:
b7c83a7e 3399 if (locinput >= PL_regeol)
a0ed51b3 3400 sayNO;
b7c83a7e 3401 if (do_utf8) {
1a4fad37 3402 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3403 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3404 sayNO;
3405 locinput += PL_utf8skip[nextchr];
3406 while (locinput < PL_regeol &&
3407 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3408 locinput += UTF8SKIP(locinput);
3409 if (locinput > PL_regeol)
3410 sayNO;
eb08e2da
JH
3411 }
3412 else
3413 locinput++;
a0ed51b3
LW
3414 nextchr = UCHARAT(locinput);
3415 break;
c8756f30 3416 case REFFL:
3280af22 3417 PL_reg_flags |= RF_tainted;
c8756f30 3418 /* FALL THROUGH */
c277df42 3419 case REF:
95b24440
DM
3420 case REFF: {
3421 char *s;
c277df42 3422 n = ARG(scan); /* which paren pair */
5d9a96ca 3423 st->ln = PL_regstartp[n];
2c2d71f5 3424 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3425 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3426 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3427 if (st->ln == PL_regendp[n])
a0d0e21e 3428 break;
a0ed51b3 3429
5d9a96ca 3430 s = PL_bostr + st->ln;
1aa99e6b 3431 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3432 char *l = locinput;
a3b680e6 3433 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3434 /*
3435 * Note that we can't do the "other character" lookup trick as
3436 * in the 8-bit case (no pun intended) because in Unicode we
3437 * have to map both upper and title case to lower case.
3438 */
3439 if (OP(scan) == REFF) {
3440 while (s < e) {
a3b680e6
AL
3441 STRLEN ulen1, ulen2;
3442 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3443 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3444
a0ed51b3
LW
3445 if (l >= PL_regeol)
3446 sayNO;
a2a2844f
JH
3447 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3448 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3449 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3450 sayNO;
a2a2844f
JH
3451 s += ulen1;
3452 l += ulen2;
a0ed51b3
LW
3453 }
3454 }
3455 locinput = l;
3456 nextchr = UCHARAT(locinput);
3457 break;
3458 }
3459
a0d0e21e 3460 /* Inline the first character, for speed. */
76e3520e 3461 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3462 (OP(scan) == REF ||
3463 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3464 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3465 sayNO;
5d9a96ca
DM
3466 st->ln = PL_regendp[n] - st->ln;
3467 if (locinput + st->ln > PL_regeol)
4633a7c4 3468 sayNO;
5d9a96ca
DM
3469 if (st->ln > 1 && (OP(scan) == REF
3470 ? memNE(s, locinput, st->ln)
c8756f30 3471 : (OP(scan) == REFF
5d9a96ca
DM
3472 ? ibcmp(s, locinput, st->ln)
3473 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3474 sayNO;
5d9a96ca 3475 locinput += st->ln;
76e3520e 3476 nextchr = UCHARAT(locinput);
a0d0e21e 3477 break;
95b24440 3478 }
a0d0e21e
LW
3479
3480 case NOTHING:
c277df42 3481 case TAIL:
a0d0e21e
LW
3482 break;
3483 case BACK:
3484 break;
40a82448
DM
3485
3486#undef ST
3487#define ST st->u.eval
3488
3489 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3490 {
c277df42 3491 SV *ret;
8e5e9ebe 3492 {
4aabdb9b
DM
3493 /* execute the code in the {...} */
3494 dSP;
6136c704 3495 SV ** const before = SP;
4aabdb9b
DM
3496 OP_4tree * const oop = PL_op;
3497 COP * const ocurcop = PL_curcop;
3498 PAD *old_comppad;
4aabdb9b
DM
3499
3500 n = ARG(scan);
32fc9b6a 3501 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3502 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3503 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3504 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3505
8e5e9ebe
RGS
3506 CALLRUNOPS(aTHX); /* Scalar context. */
3507 SPAGAIN;
3508 if (SP == before)
075aa684 3509 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3510 else {
3511 ret = POPs;
3512 PUTBACK;
3513 }
4aabdb9b
DM
3514
3515 PL_op = oop;
3516 PAD_RESTORE_LOCAL(old_comppad);
3517 PL_curcop = ocurcop;
3518 if (!st->logical) {
3519 /* /(?{...})/ */
3520 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3521 break;
3522 }
8e5e9ebe 3523 }
4aabdb9b
DM
3524 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3525 regexp *re;
4aabdb9b 3526 {
4f639d21
DM
3527 /* extract RE object from returned value; compiling if
3528 * necessary */
3529
6136c704 3530 MAGIC *mg = NULL;
be8e71aa 3531 const SV *sv;
faf82a0b
AE
3532 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3533 mg = mg_find(sv, PERL_MAGIC_qr);
3534 else if (SvSMAGICAL(ret)) {
3535 if (SvGMAGICAL(ret))
3536 sv_unmagic(ret, PERL_MAGIC_qr);
3537 else
3538 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3539 }
faf82a0b 3540
0f5d15d6
IZ
3541 if (mg) {
3542 re = (regexp *)mg->mg_obj;
df0003d4 3543 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3544 }
3545 else {
3546 STRLEN len;
6136c704 3547 const char * const t = SvPV_const(ret, len);
0f5d15d6 3548 PMOP pm;
a3b680e6 3549 const I32 osize = PL_regsize;
0f5d15d6 3550
5fcd1c1b 3551 Zero(&pm, 1, PMOP);
4aabdb9b 3552 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3553 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3554 if (!(SvFLAGS(ret)
faf82a0b
AE
3555 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3556 | SVs_GMG)))
14befaf4
DM
3557 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3558 PERL_MAGIC_qr,0,0);
0f5d15d6 3559 PL_regsize = osize;
0f5d15d6 3560 }
4aabdb9b 3561 }
aa283a38
DM
3562
3563 /* run the pattern returned from (??{...}) */
3564
4aabdb9b
DM
3565 DEBUG_EXECUTE_r(
3566 PerlIO_printf(Perl_debug_log,
3567 "Entering embedded \"%s%.60s%s%s\"\n",
3568 PL_colors[0],
3569 re->precomp,
3570 PL_colors[1],
3571 (strlen(re->precomp) > 60 ? "..." : ""))
3572 );
2c2d71f5 3573
40a82448
DM
3574 ST.cp = regcppush(0); /* Save *all* the positions. */
3575 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3576 *PL_reglastparen = 0;
3577 *PL_reglastcloseparen = 0;
4aabdb9b 3578 PL_reginput = locinput;
4aabdb9b
DM
3579
3580 /* XXXX This is too dramatic a measure... */
3581 PL_reg_maxiter = 0;
3582
5d9a96ca 3583 st->logical = 0;
40a82448 3584 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3585 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3586 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3587 ST.prev_rex = rex;
aa283a38 3588 rex = re;
aa283a38 3589
40a82448 3590 ST.B = next;
aa283a38 3591 /* now continue from first node in postoned RE */
40a82448 3592 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3593 /* NOTREACHED */
a0ed51b3 3594 }
4aabdb9b
DM
3595 /* /(?(?{...})X|Y)/ */
3596 st->sw = SvTRUE(ret);
3597 st->logical = 0;
c277df42
IZ
3598 break;
3599 }
40a82448
DM
3600
3601 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3602 if (ST.toggleutf)
3603 PL_reg_flags ^= RF_utf8;
3604 ReREFCNT_dec(rex);
3605 rex = ST.prev_rex;
3606 /* XXXX This is too dramatic a measure... */
3607 PL_reg_maxiter = 0;
3608 /* Restore parens of the caller without popping the
3609 * savestack */
3610 {
3611 const I32 tmp = PL_savestack_ix;
3612 PL_savestack_ix = ST.lastcp;
3613 regcppop(rex);
3614 PL_savestack_ix = tmp;
3615 }
3616 PL_reginput = locinput;
3617 /* continue at the node following the (??{...}) */
3618 scan = ST.B;
3619 continue;
3620
3621 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3622 /* Restore state to the outer re then re-throw the failure */
3623 if (ST.toggleutf)
3624 PL_reg_flags ^= RF_utf8;
3625 ReREFCNT_dec(rex);
3626 rex = ST.prev_rex;
3627
3628 /* XXXX This is too dramatic a measure... */
3629 PL_reg_maxiter = 0;
3630
3631 PL_reginput = locinput;
3632 REGCP_UNWIND(ST.lastcp);
3633 regcppop(rex);
3634 sayNO_SILENT;
3635
3636#undef ST
3637
a0d0e21e 3638 case OPEN:
c277df42 3639 n = ARG(scan); /* which paren pair */
3280af22
NIS
3640 PL_reg_start_tmp[n] = locinput;
3641 if (n > PL_regsize)
3642 PL_regsize = n;
a0d0e21e
LW
3643 break;
3644 case CLOSE:
c277df42 3645 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3646 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3647 PL_regendp[n] = locinput - PL_bostr;
eb160463 3648 if (n > (I32)*PL_reglastparen)
3280af22 3649 *PL_reglastparen = n;
a01268b5 3650 *PL_reglastcloseparen = n;
a0d0e21e 3651 break;
c277df42
IZ
3652 case GROUPP:
3653 n = ARG(scan); /* which paren pair */
5d9a96ca 3654 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3655 break;
3656 case IFTHEN:
2c2d71f5 3657 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3658 if (st->sw)
c277df42
IZ
3659 next = NEXTOPER(NEXTOPER(scan));
3660 else {
3661 next = scan + ARG(scan);
3662 if (OP(next) == IFTHEN) /* Fake one. */
3663 next = NEXTOPER(NEXTOPER(next));
3664 }
3665 break;
3666 case LOGICAL:
5d9a96ca 3667 st->logical = scan->flags;
c277df42 3668 break;
2ab05381 3669/*******************************************************************
a0374537
DM
3670 cc points to the regmatch_state associated with the most recent CURLYX.
3671 This struct contains info about the innermost (...)* loop (an
3672 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3673
3674 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3675
95b24440 3676 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3677
a0374537 3678 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3679 with the starting point at WHILEM node;
2ab05381
IZ
3680
3681 3) Each hit of WHILEM node tries to match A and Z (in the order
3682 depending on the current iteration, min/max of {min,max} and
3683 greediness). The information about where are nodes for "A"
a0374537 3684 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3685 was already matched, and greediness.
3686
3687 4) After A matches, the same WHILEM node is hit again.
3688
95b24440 3689 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3690 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3691 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3692 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3693 of the external loop.
3694
a0374537 3695 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3696 and whatever it mentions via ->next, and additional attached trees
3697 corresponding to temporarily unset infoblocks as in "5" above.
3698
95b24440 3699 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3700 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3701 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3702 infoblocks are drawn below the "reset" infoblock.
3703
3704 In fact in the picture below we do not show failed matches for Z and T
3705 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3706 more obvious *why* one needs to *temporary* unset infoblocks.]
3707
3708 Matched REx position InfoBlocks Comment
3709 (Y(A)*?Z)*?T x
3710 Y(A)*?Z)*?T x <- O
3711 Y (A)*?Z)*?T x <- O
3712 Y A)*?Z)*?T x <- O <- I
3713 YA )*?Z)*?T x <- O <- I
3714 YA A)*?Z)*?T x <- O <- I
3715 YAA )*?Z)*?T x <- O <- I
3716 YAA Z)*?T x <- O # Temporary unset I
3717 I
3718
3719 YAAZ Y(A)*?Z)*?T x <- O
3720 I
3721
3722 YAAZY (A)*?Z)*?T x <- O
3723 I
3724
3725 YAAZY A)*?Z)*?T x <- O <- I
3726 I
3727
3728 YAAZYA )*?Z)*?T x <- O <- I
3729 I
3730
3731 YAAZYA Z)*?T x <- O # Temporary unset I
3732 I,I
3733
3734 YAAZYAZ )*?T x <- O
3735 I,I
3736
3737 YAAZYAZ T x # Temporary unset O
3738 O
3739 I,I
3740
3741 YAAZYAZT x
3742 O
3743 I,I
3744 *******************************************************************/
95b24440 3745
a0d0e21e 3746 case CURLYX: {
cb434fcc
IZ
3747 /* No need to save/restore up to this paren */
3748 I32 parenfloor = scan->flags;
c277df42 3749
c2b7afd3
NC
3750 /* Dave says:
3751
3752 CURLYX and WHILEM are always paired: they're the moral
3753 equivalent of pp_enteriter anbd pp_iter.
3754
3755 The only time next could be null is if the node tree is
3756 corrupt. This was mentioned on p5p a few days ago.
3757
3758 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3759 So we'll assert that this is true:
3760 */
3761 assert(next);
30b2893d 3762 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3763 next += ARG(next);
cb434fcc
IZ
3764 /* XXXX Probably it is better to teach regpush to support
3765 parenfloor > PL_regsize... */
eb160463 3766 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3767 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3768
d8319b27
DM
3769 st->u.curlyx.cp = PL_savestack_ix;
3770 st->u.curlyx.outercc = st->cc;
a0374537
DM
3771 st->cc = st;
3772 /* these fields contain the state of the current curly.
3773 * they are accessed by subsequent WHILEMs;
3774 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3775 st->u.curlyx.parenfloor = parenfloor;
3776 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3777 st->u.curlyx.min = ARG1(scan);
3778 st->u.curlyx.max = ARG2(scan);
3779 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3780 st->u.curlyx.lastloc = 0;
a0374537
DM
3781 /* st->next and st->minmod are also read by WHILEM */
3782
3280af22 3783 PL_reginput = locinput;
95b24440
DM
3784 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3785 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3786 regcpblow(st->u.curlyx.cp);
3787 st->cc = st->u.curlyx.outercc;
95b24440 3788 saySAME(result);
a0d0e21e 3789 }
5f66b61c 3790 /* NOTREACHED */
a0d0e21e
LW
3791 case WHILEM: {
3792 /*
3793 * This is really hard to understand, because after we match
3794 * what we're trying to match, we must make sure the rest of
2c2d71f5 3795 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3796 * to go back UP the parse tree by recursing ever deeper. And
3797 * if it fails, we have to reset our parent's current state
3798 * that we can try again after backing off.
3799 */
3800
c2b7afd3
NC
3801 /* Dave says:
3802
3803 st->cc gets initialised by CURLYX ready for use by WHILEM.
3804 So again, unless somethings been corrupted, st->cc cannot
3805 be null at that point in WHILEM.
3806
3807 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3808 So we'll assert that this is true:
3809 */
3810 assert(st->cc);
d8319b27
DM
3811 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3812 st->u.whilem.cache_offset = 0;
3813 st->u.whilem.cache_bit = 0;
c277df42 3814
d8319b27 3815 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3816 PL_reginput = locinput;
a0d0e21e 3817
a3621e74 3818 DEBUG_EXECUTE_r(
9041c2e3 3819 PerlIO_printf(Perl_debug_log,
91f3b821 3820 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3821 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3822 (long)n, (long)st->cc->u.curlyx.min,
3823 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3824 );
4633a7c4 3825
a0d0e21e
LW
3826 /* If degenerate scan matches "", assume scan done. */
3827
d8319b27
DM
3828 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3829 st->u.whilem.savecc = st->cc;
3830 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3831 if (st->cc)
d8319b27 3832 st->ln = st->cc->u.curlyx.cur;
a3621e74 3833 DEBUG_EXECUTE_r(
c3464db5
DD
3834 PerlIO_printf(Perl_debug_log,
3835 "%*s empty match detected, try continuation...\n",
3280af22 3836 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3837 );
d8319b27 3838 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3839 /*** all unsaved local vars undefined at this point */
d8319b27 3840 st->cc = st->u.whilem.savecc;
95b24440 3841 if (result)
4633a7c4 3842 sayYES;
d8319b27
DM
3843 if (st->cc->u.curlyx.outercc)
3844 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3845 sayNO;
a0d0e21e
LW
3846 }
3847
3848 /* First just match a string of min scans. */
3849
d8319b27
DM
3850 if (n < st->cc->u.curlyx.min) {
3851 st->cc->u.curlyx.cur = n;
3852 st->cc->u.curlyx.lastloc = locinput;
3853 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3854 /*** all unsaved local vars undefined at this point */
3855 if (result)
4633a7c4 3856 sayYES;
d8319b27
DM
3857 st->cc->u.curlyx.cur = n - 1;
3858 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3859 sayNO;
a0d0e21e
LW
3860 }
3861
2c2d71f5
JH
3862 if (scan->flags) {
3863 /* Check whether we already were at this position.
3864 Postpone detection until we know the match is not
3865 *that* much linear. */
3866 if (!PL_reg_maxiter) {
3867 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3868 /* possible overflow for long strings and many CURLYX's */
3869 if (PL_reg_maxiter < 0)
3870 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3871 PL_reg_leftiter = PL_reg_maxiter;
3872 }
3873 if (PL_reg_leftiter-- == 0) {
a3b680e6 3874 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3875 if (PL_reg_poscache) {
eb160463 3876 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3877 Renew(PL_reg_poscache, size, char);
3878 PL_reg_poscache_size = size;
3879 }
3880 Zero(PL_reg_poscache, size, char);
3881 }
3882 else {
3883 PL_reg_poscache_size = size;
a02a5408 3884 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3885 }
a3621e74 3886 DEBUG_EXECUTE_r(
2c2d71f5
JH
3887 PerlIO_printf(Perl_debug_log,
3888 "%sDetected a super-linear match, switching on caching%s...\n",
3889 PL_colors[4], PL_colors[5])
3890 );
3891 }
3892 if (PL_reg_leftiter < 0) {
d8319b27 3893 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3894
d8319b27
DM
3895 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3896 + st->u.whilem.cache_offset * (scan->flags>>4);
3897 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3898 st->u.whilem.cache_offset /= 8;
3899 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3900 DEBUG_EXECUTE_r(
2c2d71f5
JH
3901 PerlIO_printf(Perl_debug_log,
3902 "%*s already tried at this position...\n",
3903 REPORT_CODE_OFF+PL_regindent*2, "")
3904 );
3ab3c9b4
HS
3905 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3906 /* cache records success */
c2b0868c
HS
3907 sayYES;
3908 else
3ab3c9b4 3909 /* cache records failure */
c2b0868c 3910 sayNO_SILENT;
2c2d71f5 3911 }
2c2d71f5
JH
3912 }
3913 }
3914
a0d0e21e
LW
3915 /* Prefer next over scan for minimal matching. */
3916
5d9a96ca 3917 if (st->cc->minmod) {
d8319b27
DM
3918 st->u.whilem.savecc = st->cc;
3919 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3920 if (st->cc)
d8319b27
DM
3921 st->ln = st->cc->u.curlyx.cur;
3922 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3923 REGCP_SET(st->u.whilem.lastcp);
3924 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3925 /*** all unsaved local vars undefined at this point */
d8319b27 3926 st->cc = st->u.whilem.savecc;
95b24440 3927 if (result) {
d8319b27 3928 regcpblow(st->u.whilem.cp);
3ab3c9b4 3929 CACHEsayYES; /* All done. */
5f05dabc 3930 }
d8319b27 3931 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3932 regcppop(rex);
d8319b27
DM
3933 if (st->cc->u.curlyx.outercc)
3934 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3935
d8319b27 3936 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3937 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3938 && !(PL_reg_flags & RF_warned)) {
3939 PL_reg_flags |= RF_warned;
9014280d 3940 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3941 "Complex regular subexpression recursion",
3942 REG_INFTY - 1);
c277df42 3943 }
3ab3c9b4 3944 CACHEsayNO;
c277df42 3945 }
a687059c 3946
a3621e74 3947 DEBUG_EXECUTE_r(
c3464db5
DD
3948 PerlIO_printf(Perl_debug_log,
3949 "%*s trying longer...\n",
3280af22 3950 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3951 );
a0d0e21e 3952 /* Try scanning more and see if it helps. */
3280af22 3953 PL_reginput = locinput;
d8319b27
DM
3954 st->cc->u.curlyx.cur = n;
3955 st->cc->u.curlyx.lastloc = locinput;
3956 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3957 REGCP_SET(st->u.whilem.lastcp);
3958 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3959 /*** all unsaved local vars undefined at this point */
3960 if (result) {
d8319b27 3961 regcpblow(st->u.whilem.cp);
3ab3c9b4 3962 CACHEsayYES;
5f05dabc 3963 }
d8319b27 3964 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3965 regcppop(rex);
d8319b27
DM
3966 st->cc->u.curlyx.cur = n - 1;
3967 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3968 CACHEsayNO;
a0d0e21e
LW
3969 }
3970
3971 /* Prefer scan over next for maximal matching. */
3972
d8319b27
DM
3973 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
3974 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3975 st->cc->u.curlyx.cur = n;
3976 st->cc->u.curlyx.lastloc = locinput;
3977 REGCP_SET(st->u.whilem.lastcp);
3978 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
3979 /*** all unsaved local vars undefined at this point */
3980 if (result) {
d8319b27 3981 regcpblow(st->u.whilem.cp);
3ab3c9b4 3982 CACHEsayYES;
5f05dabc 3983 }
d8319b27 3984 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3985 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 3986 PL_reginput = locinput;
a3621e74 3987 DEBUG_EXECUTE_r(
c3464db5
DD
3988 PerlIO_printf(Perl_debug_log,
3989 "%*s failed, try continuation...\n",
3280af22 3990 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3991 );
3992 }
9041c2e3 3993 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3994 && !(PL_reg_flags & RF_warned)) {
3280af22 3995 PL_reg_flags |= RF_warned;
9014280d 3996 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3997 "Complex regular subexpression recursion",
3998 REG_INFTY - 1);
a0d0e21e
LW
3999 }
4000
4001 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4002 st->u.whilem.savecc = st->cc;
4003 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4004 if (st->cc)
d8319b27
DM
4005 st->ln = st->cc->u.curlyx.cur;
4006 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4007 /*** all unsaved local vars undefined at this point */
d8319b27 4008 st->cc = st->u.whilem.savecc;
95b24440 4009 if (result)
3ab3c9b4 4010 CACHEsayYES;
d8319b27
DM
4011 if (st->cc->u.curlyx.outercc)
4012 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4013 st->cc->u.curlyx.cur = n - 1;
4014 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4015 CACHEsayNO;
a0d0e21e 4016 }
5f66b61c 4017 /* NOTREACHED */
40a82448
DM
4018
4019#undef ST
4020#define ST st->u.branch
4021
4022 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4023 next = scan + ARG(scan);
4024 if (next == scan)
4025 next = NULL;
40a82448
DM
4026 scan = NEXTOPER(scan);
4027 /* FALL THROUGH */
c277df42 4028
40a82448
DM
4029 case BRANCH: /* /(...|A|...)/ */
4030 scan = NEXTOPER(scan); /* scan now points to inner node */
4031 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4032 /* last branch; skip state push and jump direct to node */
4033 continue;
4034 ST.lastparen = *PL_reglastparen;
4035 ST.next_branch = next;
4036 REGCP_SET(ST.cp);
4037 PL_reginput = locinput;
02db2b7b 4038
40a82448
DM
4039 /* Now go into the branch */
4040 PUSH_STATE_GOTO(BRANCH_next, scan);
4041 /* NOTREACHED */
4042
4043 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4044 REGCP_UNWIND(ST.cp);
4045 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4046 PL_regendp[n] = -1;
4047 *PL_reglastparen = n;
4048 scan = ST.next_branch;
4049 /* no more branches? */
4050 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4051 sayNO;
4052 continue; /* execute next BRANCH[J] op */
4053 /* NOTREACHED */
4054
a0d0e21e 4055 case MINMOD:
5d9a96ca 4056 st->minmod = 1;
a0d0e21e 4057 break;
40a82448
DM
4058
4059#undef ST
4060#define ST st->u.curlym
4061
4062 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4063
4064 /* This is an optimisation of CURLYX that enables us to push
4065 * only a single backtracking state, no matter now many matches
4066 * there are in {m,n}. It relies on the pattern being constant
4067 * length, with no parens to influence future backrefs
4068 */
4069
4070 ST.me = scan;
dc45a647 4071 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4072
4073 /* if paren positive, emulate an OPEN/CLOSE around A */
4074 if (ST.me->flags) {
4075 I32 paren = ST.me->flags;
4076 if (paren > PL_regsize)
4077 PL_regsize = paren;
4078 if (paren > (I32)*PL_reglastparen)
4079 *PL_reglastparen = paren;
c277df42 4080 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4081 }
40a82448
DM
4082 ST.A = scan;
4083 ST.B = next;
4084 ST.alen = 0;
4085 ST.count = 0;
4086 ST.minmod = st->minmod;
4087 st->minmod = 0;
4088 ST.c1 = CHRTEST_UNINIT;
4089 REGCP_SET(ST.cp);
6407bf3b 4090
40a82448
DM
4091 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4092 goto curlym_do_B;
4093
4094 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4095 PL_reginput = locinput;
40a82448
DM
4096 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4097 /* NOTREACHED */
5f80c4cf 4098
40a82448
DM
4099 case CURLYM_A: /* we've just matched an A */
4100 locinput = st->locinput;
4101 nextchr = UCHARAT(locinput);
4102
4103 ST.count++;
4104 /* after first match, determine A's length: u.curlym.alen */
4105 if (ST.count == 1) {
4106 if (PL_reg_match_utf8) {
4107 char *s = locinput;
4108 while (s < PL_reginput) {
4109 ST.alen++;
4110 s += UTF8SKIP(s);
4111 }
4112 }
4113 else {
4114 ST.alen = PL_reginput - locinput;
4115 }
4116 if (ST.alen == 0)
4117 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4118 }
0cadcf80
DM
4119 DEBUG_EXECUTE_r(
4120 PerlIO_printf(Perl_debug_log,
40a82448 4121 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 4122 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 4123 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4124 );
4125
40a82448
DM
4126 locinput = PL_reginput;
4127 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4128 goto curlym_do_A; /* try to match another A */
4129 goto curlym_do_B; /* try to match B */
5f80c4cf 4130
40a82448
DM
4131 case CURLYM_A_fail: /* just failed to match an A */
4132 REGCP_UNWIND(ST.cp);
4133 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4134 sayNO;
0cadcf80 4135
40a82448
DM
4136 curlym_do_B: /* execute the B in /A{m,n}B/ */
4137 PL_reginput = locinput;
4138 if (ST.c1 == CHRTEST_UNINIT) {
4139 /* calculate c1 and c2 for possible match of 1st char
4140 * following curly */
4141 ST.c1 = ST.c2 = CHRTEST_VOID;
4142 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4143 regnode *text_node = ST.B;
4144 if (! HAS_TEXT(text_node))
4145 FIND_NEXT_IMPT(text_node);
4146 if (HAS_TEXT(text_node)
4147 && PL_regkind[OP(text_node)] != REF)
4148 {
4149 ST.c1 = (U8)*STRING(text_node);
4150 ST.c2 =
4151 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4152 ? PL_fold[ST.c1]
4153 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4154 ? PL_fold_locale[ST.c1]
4155 : ST.c1;
c277df42 4156 }
c277df42 4157 }
40a82448
DM
4158 }
4159
4160 DEBUG_EXECUTE_r(
4161 PerlIO_printf(Perl_debug_log,
4162 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4163 (int)(REPORT_CODE_OFF+PL_regindent*2),
4164 "", (IV)ST.count)
4165 );
4166 if (ST.c1 != CHRTEST_VOID
4167 && UCHARAT(PL_reginput) != ST.c1
4168 && UCHARAT(PL_reginput) != ST.c2)
4169 {
4170 /* simulate B failing */
4171 state_num = CURLYM_B_fail;
4172 goto reenter_switch;
4173 }
4174
4175 if (ST.me->flags) {
4176 /* mark current A as captured */
4177 I32 paren = ST.me->flags;
4178 if (ST.count) {
4179 PL_regstartp[paren]
4180 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4181 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4182 }
40a82448
DM
4183 else
4184 PL_regendp[paren] = -1;
c277df42 4185 }
40a82448 4186 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4187 /* NOTREACHED */
40a82448
DM
4188
4189 case CURLYM_B_fail: /* just failed to match a B */
4190 REGCP_UNWIND(ST.cp);
4191 if (ST.minmod) {
4192 if (ST.count == ARG2(ST.me) /* max */)
4193 sayNO;
4194 goto curlym_do_A; /* try to match a further A */
4195 }
4196 /* backtrack one A */
4197 if (ST.count == ARG1(ST.me) /* min */)
4198 sayNO;
4199 ST.count--;
4200 locinput = HOPc(locinput, -ST.alen);
4201 goto curlym_do_B; /* try to match B */
4202
c255a977
DM
4203#undef ST
4204#define ST st->u.curly
40a82448 4205
c255a977
DM
4206#define CURLY_SETPAREN(paren, success) \
4207 if (paren) { \
4208 if (success) { \
4209 PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \
4210 PL_regendp[paren] = locinput - PL_bostr; \
4211 } \
4212 else \
4213 PL_regendp[paren] = -1; \
4214 }
4215
4216 case STAR: /* /A*B/ where A is width 1 */
4217 ST.paren = 0;
4218 ST.min = 0;
4219 ST.max = REG_INFTY;
a0d0e21e
LW
4220 scan = NEXTOPER(scan);
4221 goto repeat;
c255a977
DM
4222 case PLUS: /* /A+B/ where A is width 1 */
4223 ST.paren = 0;
4224 ST.min = 1;
4225 ST.max = REG_INFTY;
c277df42 4226 scan = NEXTOPER(scan);
c255a977
DM
4227 goto repeat;
4228 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4229 ST.paren = scan->flags; /* Which paren to set */
4230 if (ST.paren > PL_regsize)
4231 PL_regsize = ST.paren;
4232 if (ST.paren > (I32)*PL_reglastparen)
4233 *PL_reglastparen = ST.paren;
4234 ST.min = ARG1(scan); /* min to match */
4235 ST.max = ARG2(scan); /* max to match */
4236 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4237 goto repeat;
4238 case CURLY: /* /A{m,n}B/ where A is width 1 */
4239 ST.paren = 0;
4240 ST.min = ARG1(scan); /* min to match */
4241 ST.max = ARG2(scan); /* max to match */
4242 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4243 repeat:
a0d0e21e
LW
4244 /*
4245 * Lookahead to avoid useless match attempts
4246 * when we know what character comes next.
c255a977 4247 *
5f80c4cf
JP
4248 * Used to only do .*x and .*?x, but now it allows
4249 * for )'s, ('s and (?{ ... })'s to be in the way
4250 * of the quantifier and the EXACT-like node. -- japhy
4251 */
4252
c255a977
DM
4253 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4254 sayNO;
cca55fe3 4255 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4256 U8 *s;
4257 regnode *text_node = next;
4258
3dab1dad
YO
4259 if (! HAS_TEXT(text_node))
4260 FIND_NEXT_IMPT(text_node);
5f80c4cf 4261
9e137952 4262 if (! HAS_TEXT(text_node))
c255a977 4263 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4264 else {
3dab1dad 4265 if (PL_regkind[OP(text_node)] == REF) {
c255a977 4266 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4267 goto assume_ok_easy;
cca55fe3 4268 }
be8e71aa
YO
4269 else
4270 s = (U8*)STRING(text_node);
5f80c4cf
JP
4271
4272 if (!UTF) {
c255a977 4273 ST.c2 = ST.c1 = *s;
f65d3ee7 4274 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
c255a977 4275 ST.c2 = PL_fold[ST.c1];
f65d3ee7 4276 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
c255a977 4277 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4278 }
5f80c4cf 4279 else { /* UTF */
f65d3ee7 4280 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4281 STRLEN ulen1, ulen2;
89ebb4a3
JH
4282 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4283 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4284
4285 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4286 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4287
c255a977 4288 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4289 uniflags);
c255a977 4290 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4291 uniflags);
5f80c4cf
JP
4292 }
4293 else {
c255a977 4294 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4295 uniflags);
5f80c4cf 4296 }
1aa99e6b
IH
4297 }
4298 }
bbce6d69 4299 }
a0d0e21e 4300 else
c255a977 4301 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4302 assume_ok_easy:
c255a977
DM
4303
4304 ST.A = scan;
4305 ST.B = next;
3280af22 4306 PL_reginput = locinput;
5d9a96ca
DM
4307 if (st->minmod) {
4308 st->minmod = 0;
c255a977 4309 if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
4633a7c4 4310 sayNO;
c255a977 4311 ST.count = ST.min;
a0ed51b3 4312 locinput = PL_reginput;
c255a977
DM
4313 REGCP_SET(ST.cp);
4314 if (ST.c1 == CHRTEST_VOID)
4315 goto curly_try_B_min;
4316
4317 ST.oldloc = locinput;
4318
4319 /* set ST.maxpos to the furthest point along the
4320 * string that could possibly match */
4321 if (ST.max == REG_INFTY) {
4322 ST.maxpos = PL_regeol - 1;
4323 if (do_utf8)
4324 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4325 ST.maxpos--;
4326 }
4327 else if (do_utf8) {
4328 int m = ST.max - ST.min;
4329 for (ST.maxpos = locinput;
4330 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4331 ST.maxpos += UTF8SKIP(ST.maxpos);
4332 }
4333 else {
4334 ST.maxpos = locinput + ST.max - ST.min;
4335 if (ST.maxpos >= PL_regeol)
4336 ST.maxpos = PL_regeol - 1;
4337 }
4338 goto curly_try_B_min_known;
4339
4340 }
4341 else {
4342 ST.count = regrepeat(rex, ST.A, ST.max);
4343 locinput = PL_reginput;
4344 if (ST.count < ST.min)
4345 sayNO;
4346 if ((ST.count > ST.min)
4347 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4348 {
4349 /* A{m,n} must come at the end of the string, there's
4350 * no point in backing off ... */
4351 ST.min = ST.count;
4352 /* ...except that $ and \Z can match before *and* after
4353 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4354 We may back off by one in this case. */
4355 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4356 ST.min--;
4357 }
4358 REGCP_SET(ST.cp);
4359 goto curly_try_B_max;
4360 }
4361 /* NOTREACHED */
4362
4363
4364 case CURLY_B_min_known_fail:
4365 /* failed to find B in a non-greedy match where c1,c2 valid */
4366 if (ST.paren && ST.count)
4367 PL_regendp[ST.paren] = -1;
4368
4369 PL_reginput = locinput; /* Could be reset... */
4370 REGCP_UNWIND(ST.cp);
4371 /* Couldn't or didn't -- move forward. */
4372 ST.oldloc = locinput;
4373 if (do_utf8)
4374 locinput += UTF8SKIP(locinput);
4375 else
4376 locinput++;
4377 ST.count++;
4378 curly_try_B_min_known:
4379 /* find the next place where 'B' could work, then call B */
4380 {
4381 int n;
4382 if (do_utf8) {
4383 n = (ST.oldloc == locinput) ? 0 : 1;
4384 if (ST.c1 == ST.c2) {
4385 STRLEN len;
4386 /* set n to utf8_distance(oldloc, locinput) */
4387 while (locinput <= ST.maxpos &&
4388 utf8n_to_uvchr((U8*)locinput,
4389 UTF8_MAXBYTES, &len,
4390 uniflags) != (UV)ST.c1) {
4391 locinput += len;
4392 n++;
4393 }
1aa99e6b
IH
4394 }
4395 else {
c255a977
DM
4396 /* set n to utf8_distance(oldloc, locinput) */
4397 while (locinput <= ST.maxpos) {
4398 STRLEN len;
4399 const UV c = utf8n_to_uvchr((U8*)locinput,
4400 UTF8_MAXBYTES, &len,
4401 uniflags);
4402 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4403 break;
4404 locinput += len;
4405 n++;
1aa99e6b 4406 }
0fe9bf95
IZ
4407 }
4408 }
c255a977
DM
4409 else {
4410 if (ST.c1 == ST.c2) {
4411 while (locinput <= ST.maxpos &&
4412 UCHARAT(locinput) != ST.c1)
4413 locinput++;
bbce6d69 4414 }
c255a977
DM
4415 else {
4416 while (locinput <= ST.maxpos
4417 && UCHARAT(locinput) != ST.c1
4418 && UCHARAT(locinput) != ST.c2)
4419 locinput++;
a0ed51b3 4420 }
c255a977
DM
4421 n = locinput - ST.oldloc;
4422 }
4423 if (locinput > ST.maxpos)
4424 sayNO;
4425 /* PL_reginput == oldloc now */
4426 if (n) {
4427 ST.count += n;
4428 if (regrepeat(rex, ST.A, n) < n)
4633a7c4 4429 sayNO;
a0d0e21e 4430 }
c255a977
DM
4431 PL_reginput = locinput;
4432 CURLY_SETPAREN(ST.paren, ST.count);
4433 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 4434 }
c255a977
DM
4435 /* NOTREACHED */
4436
4437
4438 case CURLY_B_min_fail:
4439 /* failed to find B in a non-greedy match where c1,c2 invalid */
4440 if (ST.paren && ST.count)
4441 PL_regendp[ST.paren] = -1;
4442
4443 REGCP_UNWIND(ST.cp);
4444 /* failed -- move forward one */
4445 PL_reginput = locinput;
4446 if (regrepeat(rex, ST.A, 1)) {
4447 ST.count++;
a0ed51b3 4448 locinput = PL_reginput;
c255a977
DM
4449 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4450 ST.count > 0)) /* count overflow ? */
15272685 4451 {
c255a977
DM
4452 curly_try_B_min:
4453 CURLY_SETPAREN(ST.paren, ST.count);
4454 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
4455 }
4456 }
4633a7c4 4457 sayNO;
c255a977
DM
4458 /* NOTREACHED */
4459
4460
4461 curly_try_B_max:
4462 /* a successful greedy match: now try to match B */
4463 {
4464 UV c = 0;
4465 if (ST.c1 != CHRTEST_VOID)
4466 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4467 UTF8_MAXBYTES, 0, uniflags)
466787eb 4468 : (UV) UCHARAT(PL_reginput);
c255a977
DM
4469 /* If it could work, try it. */
4470 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4471 CURLY_SETPAREN(ST.paren, ST.count);
4472 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4473 /* NOTREACHED */
4474 }
4475 }
4476 /* FALL THROUGH */
4477 case CURLY_B_max_fail:
4478 /* failed to find B in a greedy match */
4479 if (ST.paren && ST.count)
4480 PL_regendp[ST.paren] = -1;
4481
4482 REGCP_UNWIND(ST.cp);
4483 /* back up. */
4484 if (--ST.count < ST.min)
4485 sayNO;
4486 PL_reginput = locinput = HOPc(locinput, -1);
4487 goto curly_try_B_max;
4488
4489#undef ST
4490
4491
a0d0e21e 4492 case END:
3b0527fe 4493 if (locinput < reginfo->till) {
a3621e74 4494 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4495 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4496 PL_colors[4],
4497 (long)(locinput - PL_reg_starttry),
3b0527fe 4498 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4499 PL_colors[5]));
4500 sayNO_FINAL; /* Cannot match: too short. */
4501 }
4502 PL_reginput = locinput; /* put where regtry can find it */
4503 sayYES_FINAL; /* Success! */
dad79028
DM
4504
4505 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4506 DEBUG_EXECUTE_r(
4507 PerlIO_printf(Perl_debug_log,
4508 "%*s %ssubpattern success...%s\n",
4509 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4510 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4511 sayYES_FINAL; /* Success! */
4512
40a82448
DM
4513#undef ST
4514#define ST st->u.ifmatch
4515
4516 case SUSPEND: /* (?>A) */
4517 ST.wanted = 1;
9fe1d20c 4518 PL_reginput = locinput;
9041c2e3 4519 goto do_ifmatch;
dad79028 4520
40a82448
DM
4521 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4522 ST.wanted = 0;
dad79028
DM
4523 goto ifmatch_trivial_fail_test;
4524
40a82448
DM
4525 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4526 ST.wanted = 1;
dad79028 4527 ifmatch_trivial_fail_test:
a0ed51b3 4528 if (scan->flags) {
52657f30 4529 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4530 if (!s) {
4531 /* trivial fail */
4532 if (st->logical) {
4533 st->logical = 0;
40a82448 4534 st->sw = 1 - (bool)ST.wanted;
dad79028 4535 }
40a82448 4536 else if (ST.wanted)
dad79028
DM
4537 sayNO;
4538 next = scan + ARG(scan);
4539 if (next == scan)
4540 next = NULL;
4541 break;
4542 }
efb30f32 4543 PL_reginput = s;
a0ed51b3
LW
4544 }
4545 else
4546 PL_reginput = locinput;
4547
c277df42 4548 do_ifmatch:
40a82448
DM
4549 ST.me = scan;
4550 /* execute body of (?...A) */
4551 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4552 /* NOTREACHED */
4553
4554 case IFMATCH_A_fail: /* body of (?...A) failed */
4555 ST.wanted = !ST.wanted;
4556 /* FALL THROUGH */
4557
4558 case IFMATCH_A: /* body of (?...A) succeeded */
4559 if (st->logical) {
4560 st->logical = 0;
4561 st->sw = (bool)ST.wanted;
4562 }
4563 else if (!ST.wanted)
4564 sayNO;
4565
4566 if (OP(ST.me) == SUSPEND)
4567 locinput = PL_reginput;
4568 else {
4569 locinput = PL_reginput = st->locinput;
4570 nextchr = UCHARAT(locinput);
4571 }
4572 scan = ST.me + ARG(ST.me);
4573 if (scan == ST.me)
4574 scan = NULL;
4575 continue; /* execute B */
4576
4577#undef ST
dad79028 4578
c277df42 4579 case LONGJMP:
c277df42
IZ
4580 next = scan + ARG(scan);
4581 if (next == scan)
4582 next = NULL;
a0d0e21e
LW
4583 break;
4584 default:
b900a521 4585 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4586 PTR2UV(scan), OP(scan));
cea2e8a9 4587 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4588 }
95b24440 4589
a0d0e21e 4590 scan = next;
95b24440
DM
4591 continue;
4592 /* NOTREACHED */
4593
40a82448
DM
4594 push_yes_state:
4595 /* push a state that backtracks on success */
4596 st->u.yes.prev_yes_state = yes_state;
4597 yes_state = st;
4598 /* FALL THROUGH */
4599 push_state:
4600 /* push a new regex state, then continue at scan */
4601 {
4602 regmatch_state *newst;
4603
4604 depth++;
4605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4606 "PUSH STATE(%d)\n", depth));
4607 st->locinput = locinput;
4608 newst = st+1;
4609 if (newst > SLAB_LAST(PL_regmatch_slab))
4610 newst = S_push_slab(aTHX);
4611 PL_regmatch_state = newst;
4612 newst->cc = st->cc;
4613 /* XXX probably don't need to initialise these */
4614 newst->minmod = 0;
4615 newst->sw = 0;
4616 newst->logical = 0;
4617
4618 locinput = PL_reginput;
4619 nextchr = UCHARAT(locinput);
4620 st = newst;
4621 continue;
4622 /* NOTREACHED */
4623 }
4624
95b24440
DM
4625 /* simulate recursively calling regmatch(), but without actually
4626 * recursing - ie save the current state on the heap rather than on
4627 * the stack, then re-enter the loop. This avoids complex regexes
4628 * blowing the processor stack */
4629
4630 start_recurse:
4631 {
5d9a96ca
DM
4632 /* push new state */
4633 regmatch_state *oldst = st;
4634
4635 depth++;
40a82448 4636 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4637
4638 /* grab the next free state slot */
4639 st++;
86545054 4640 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4641 st = S_push_slab(aTHX);
4642 PL_regmatch_state = st;
4643
4644 oldst->next = next;
4645 oldst->n = n;
4646 oldst->locinput = locinput;
5d9a96ca
DM
4647
4648 st->cc = oldst->cc;
95b24440
DM
4649 locinput = PL_reginput;
4650 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4651 st->minmod = 0;
4652 st->sw = 0;
4653 st->logical = 0;
95b24440
DM
4654#ifdef DEBUGGING
4655 PL_regindent++;
4656#endif
4657 }
a0d0e21e 4658 }
a687059c 4659
aa283a38
DM
4660
4661
a0d0e21e
LW
4662 /*
4663 * We get here only if there's trouble -- normally "case END" is
4664 * the terminating point.
4665 */
cea2e8a9 4666 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4667 /*NOTREACHED*/
4633a7c4
LW
4668 sayNO;
4669
7821416a 4670yes_final:
77cb431f
DM
4671
4672 if (yes_state) {
4673 /* we have successfully completed a subexpression, but we must now
4674 * pop to the state marked by yes_state and continue from there */
4675
4676 assert(st != yes_state);
4677 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4678 || yes_state > SLAB_LAST(PL_regmatch_slab))
4679 {
4680 /* not in this slab, pop slab */
4681 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4682 PL_regmatch_slab = PL_regmatch_slab->prev;
4683 st = SLAB_LAST(PL_regmatch_slab);
4684 }
4685 depth -= (st - yes_state);
40a82448
DM
4686 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4687 depth+1, depth+(st - yes_state)));
77cb431f
DM
4688 st = yes_state;
4689 yes_state = st->u.yes.prev_yes_state;
4690 PL_regmatch_state = st;
4691
4692 switch (st->resume_state) {
40a82448
DM
4693 case IFMATCH_A:
4694 case CURLYM_A:
40a82448
DM
4695 case EVAL_A:
4696 state_num = st->resume_state;
4697 goto reenter_switch;
4698
4699 case CURLYM_B:
166ba7cd
DM
4700 case BRANCH_next:
4701 case TRIE_next:
c255a977 4702 case CURLY_B_max:
77cb431f 4703 default:
40a82448 4704 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4705 }
4706 }
4707
a3621e74 4708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4709 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4710yes:
4711#ifdef DEBUGGING
3280af22 4712 PL_regindent--;
4633a7c4 4713#endif
02db2b7b 4714
95b24440 4715 result = 1;
aa283a38 4716 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4717 * will disappear when REGFMATCH goes */
aa283a38
DM
4718 if (depth) {
4719 /* restore previous state and re-enter */
40a82448
DM
4720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4721 depth--;
4722 st--;
4723 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4724 PL_regmatch_slab = PL_regmatch_slab->prev;
4725 st = SLAB_LAST(PL_regmatch_slab);
4726 }
4727 PL_regmatch_state = st;
4728 scan = st->scan;
4729 next = st->next;
4730 n = st->n;
4731 locinput= st->locinput;
4732 nextchr = UCHARAT(locinput);
aa283a38
DM
4733
4734 switch (st->resume_state) {
aa283a38
DM
4735 case resume_CURLYX:
4736 goto resume_point_CURLYX;
4737 case resume_WHILEM1:
4738 goto resume_point_WHILEM1;
4739 case resume_WHILEM2:
4740 goto resume_point_WHILEM2;
4741 case resume_WHILEM3:
4742 goto resume_point_WHILEM3;
4743 case resume_WHILEM4:
4744 goto resume_point_WHILEM4;
4745 case resume_WHILEM5:
4746 goto resume_point_WHILEM5;
4747 case resume_WHILEM6:
4748 goto resume_point_WHILEM6;
77cb431f 4749
166ba7cd 4750 case TRIE_next:
40a82448
DM
4751 case CURLYM_A:
4752 case CURLYM_B:
4753 case EVAL_A:
4754 case IFMATCH_A:
4755 case BRANCH_next:
c255a977
DM
4756 case CURLY_B_max:
4757 case CURLY_B_min:
4758 case CURLY_B_min_known:
40a82448
DM
4759 break;
4760
aa283a38
DM
4761 default:
4762 Perl_croak(aTHX_ "regexp resume memory corruption");
4763 }
4764 }
4765 goto final_exit;
4633a7c4
LW
4766
4767no:
a3621e74 4768 DEBUG_EXECUTE_r(
7821416a
IZ
4769 PerlIO_printf(Perl_debug_log,
4770 "%*s %sfailed...%s\n",
e4584336 4771 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4772 );
7821416a
IZ
4773no_final:
4774do_no:
aa283a38 4775
4633a7c4 4776#ifdef DEBUGGING
3280af22 4777 PL_regindent--;
4633a7c4 4778#endif
95b24440 4779 result = 0;
5d9a96ca 4780
aa283a38
DM
4781 if (depth) {
4782 /* there's a previous state to backtrack to */
40a82448
DM
4783 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4784 depth--;
4785 st--;
4786 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4787 PL_regmatch_slab = PL_regmatch_slab->prev;
4788 st = SLAB_LAST(PL_regmatch_slab);
4789 }
4790 PL_regmatch_state = st;
4791 scan = st->scan;
4792 next = st->next;
4793 n = st->n;
4794 locinput= st->locinput;
4795 nextchr = UCHARAT(locinput);
4796
5d9a96ca 4797 switch (st->resume_state) {
95b24440
DM
4798 case resume_CURLYX:
4799 goto resume_point_CURLYX;
4800 case resume_WHILEM1:
4801 goto resume_point_WHILEM1;
4802 case resume_WHILEM2:
4803 goto resume_point_WHILEM2;
4804 case resume_WHILEM3:
4805 goto resume_point_WHILEM3;
4806 case resume_WHILEM4:
4807 goto resume_point_WHILEM4;
4808 case resume_WHILEM5:
4809 goto resume_point_WHILEM5;
4810 case resume_WHILEM6:
4811 goto resume_point_WHILEM6;
dad79028 4812
166ba7cd 4813 case TRIE_next:
40a82448
DM
4814 case EVAL_A:
4815 case BRANCH_next:
4816 case CURLYM_A:
4817 case CURLYM_B:
4818 case IFMATCH_A:
c255a977
DM
4819 case CURLY_B_max:
4820 case CURLY_B_min:
4821 case CURLY_B_min_known:
40a82448
DM
4822 if (yes_state == st)
4823 yes_state = st->u.yes.prev_yes_state;
4824 state_num = st->resume_state + 1; /* failure = success + 1 */
4825 goto reenter_switch;
dad79028 4826
95b24440
DM
4827 default:
4828 Perl_croak(aTHX_ "regexp resume memory corruption");
4829 }
95b24440 4830 }
aa283a38
DM
4831
4832final_exit:
4833
5d9a96ca
DM
4834 /* restore original high-water mark */
4835 PL_regmatch_slab = orig_slab;
4836 PL_regmatch_state = orig_state;
4837
4838 /* free all slabs above current one */
4839 if (orig_slab->next) {
c4fd8992 4840 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4841 orig_slab->next = NULL;
4842 while (sl) {
c4fd8992 4843 regmatch_slab * const osl = sl;
5d9a96ca 4844 sl = sl->next;
ad65c075 4845 Safefree(osl);
5d9a96ca
DM
4846 }
4847 }
4848
95b24440
DM
4849 return result;
4850
a687059c
LW
4851}
4852
4853/*
4854 - regrepeat - repeatedly match something simple, report how many
4855 */
4856/*
4857 * [This routine now assumes that it will only match on things of length 1.
4858 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4859 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4860 */
76e3520e 4861STATIC I32
32fc9b6a 4862S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4863{
27da23d5 4864 dVAR;
a0d0e21e 4865 register char *scan;
a0d0e21e 4866 register I32 c;
3280af22 4867 register char *loceol = PL_regeol;
a0ed51b3 4868 register I32 hardcount = 0;
53c4c00c 4869 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4870
3280af22 4871 scan = PL_reginput;
faf11cac
HS
4872 if (max == REG_INFTY)
4873 max = I32_MAX;
4874 else if (max < loceol - scan)
7f596f4c 4875 loceol = scan + max;
a0d0e21e 4876 switch (OP(p)) {
22c35a8c 4877 case REG_ANY:
1aa99e6b 4878 if (do_utf8) {
ffc61ed2 4879 loceol = PL_regeol;
1aa99e6b 4880 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4881 scan += UTF8SKIP(scan);
4882 hardcount++;
4883 }
4884 } else {
4885 while (scan < loceol && *scan != '\n')
4886 scan++;
a0ed51b3
LW
4887 }
4888 break;
ffc61ed2 4889 case SANY:
def8e4ea
JH
4890 if (do_utf8) {
4891 loceol = PL_regeol;
a0804c9e 4892 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4893 scan += UTF8SKIP(scan);
4894 hardcount++;
4895 }
4896 }
4897 else
4898 scan = loceol;
a0ed51b3 4899 break;
f33976b4
DB
4900 case CANY:
4901 scan = loceol;
4902 break;
090f7165
JH
4903 case EXACT: /* length of string is 1 */
4904 c = (U8)*STRING(p);
4905 while (scan < loceol && UCHARAT(scan) == c)
4906 scan++;
bbce6d69 4907 break;
4908 case EXACTF: /* length of string is 1 */
cd439c50 4909 c = (U8)*STRING(p);
bbce6d69 4910 while (scan < loceol &&
22c35a8c 4911 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4912 scan++;
4913 break;
4914 case EXACTFL: /* length of string is 1 */
3280af22 4915 PL_reg_flags |= RF_tainted;
cd439c50 4916 c = (U8)*STRING(p);
bbce6d69 4917 while (scan < loceol &&
22c35a8c 4918 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4919 scan++;
4920 break;
4921 case ANYOF:
ffc61ed2
JH
4922 if (do_utf8) {
4923 loceol = PL_regeol;
cfc92286 4924 while (hardcount < max && scan < loceol &&
32fc9b6a 4925 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4926 scan += UTF8SKIP(scan);
4927 hardcount++;
4928 }
4929 } else {
32fc9b6a 4930 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4931 scan++;
4932 }
a0d0e21e
LW
4933 break;
4934 case ALNUM:
1aa99e6b 4935 if (do_utf8) {
ffc61ed2 4936 loceol = PL_regeol;
1a4fad37 4937 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4938 while (hardcount < max && scan < loceol &&
3568d838 4939 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4940 scan += UTF8SKIP(scan);
4941 hardcount++;
4942 }
4943 } else {
4944 while (scan < loceol && isALNUM(*scan))
4945 scan++;
a0ed51b3
LW
4946 }
4947 break;
bbce6d69 4948 case ALNUML:
3280af22 4949 PL_reg_flags |= RF_tainted;
1aa99e6b 4950 if (do_utf8) {
ffc61ed2 4951 loceol = PL_regeol;
1aa99e6b
IH
4952 while (hardcount < max && scan < loceol &&
4953 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4954 scan += UTF8SKIP(scan);
4955 hardcount++;
4956 }
4957 } else {
4958 while (scan < loceol && isALNUM_LC(*scan))
4959 scan++;
a0ed51b3
LW
4960 }
4961 break;
a0d0e21e 4962 case NALNUM:
1aa99e6b 4963 if (do_utf8) {
ffc61ed2 4964 loceol = PL_regeol;
1a4fad37 4965 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4966 while (hardcount < max && scan < loceol &&
3568d838 4967 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4968 scan += UTF8SKIP(scan);
4969 hardcount++;
4970 }
4971 } else {
4972 while (scan < loceol && !isALNUM(*scan))
4973 scan++;
a0ed51b3
LW
4974 }
4975 break;
bbce6d69 4976 case NALNUML:
3280af22 4977 PL_reg_flags |= RF_tainted;
1aa99e6b 4978 if (do_utf8) {
ffc61ed2 4979 loceol = PL_regeol;
1aa99e6b
IH
4980 while (hardcount < max && scan < loceol &&
4981 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4982 scan += UTF8SKIP(scan);
4983 hardcount++;
4984 }
4985 } else {
4986 while (scan < loceol && !isALNUM_LC(*scan))
4987 scan++;
a0ed51b3
LW
4988 }
4989 break;
a0d0e21e 4990 case SPACE:
1aa99e6b 4991 if (do_utf8) {
ffc61ed2 4992 loceol = PL_regeol;
1a4fad37 4993 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 4994 while (hardcount < max && scan < loceol &&
3568d838
JH
4995 (*scan == ' ' ||
4996 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4997 scan += UTF8SKIP(scan);
4998 hardcount++;
4999 }
5000 } else {
5001 while (scan < loceol && isSPACE(*scan))
5002 scan++;
a0ed51b3
LW
5003 }
5004 break;
bbce6d69 5005 case SPACEL:
3280af22 5006 PL_reg_flags |= RF_tainted;
1aa99e6b 5007 if (do_utf8) {
ffc61ed2 5008 loceol = PL_regeol;
1aa99e6b 5009 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5010 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5011 scan += UTF8SKIP(scan);
5012 hardcount++;
5013 }
5014 } else {
5015 while (scan < loceol && isSPACE_LC(*scan))
5016 scan++;
a0ed51b3
LW
5017 }
5018 break;
a0d0e21e 5019 case NSPACE:
1aa99e6b 5020 if (do_utf8) {
ffc61ed2 5021 loceol = PL_regeol;
1a4fad37 5022 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5023 while (hardcount < max && scan < loceol &&
3568d838
JH
5024 !(*scan == ' ' ||
5025 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5026 scan += UTF8SKIP(scan);
5027 hardcount++;
5028 }
5029 } else {
5030 while (scan < loceol && !isSPACE(*scan))
5031 scan++;
5032 break;
a0ed51b3 5033 }
bbce6d69 5034 case NSPACEL:
3280af22 5035 PL_reg_flags |= RF_tainted;
1aa99e6b 5036 if (do_utf8) {
ffc61ed2 5037 loceol = PL_regeol;
1aa99e6b 5038 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5039 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5040 scan += UTF8SKIP(scan);
5041 hardcount++;
5042 }
5043 } else {
5044 while (scan < loceol && !isSPACE_LC(*scan))
5045 scan++;
a0ed51b3
LW
5046 }
5047 break;
a0d0e21e 5048 case DIGIT:
1aa99e6b 5049 if (do_utf8) {
ffc61ed2 5050 loceol = PL_regeol;
1a4fad37 5051 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5052 while (hardcount < max && scan < loceol &&
3568d838 5053 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5054 scan += UTF8SKIP(scan);
5055 hardcount++;
5056 }
5057 } else {
5058 while (scan < loceol && isDIGIT(*scan))
5059 scan++;
a0ed51b3
LW
5060 }
5061 break;
a0d0e21e 5062 case NDIGIT:
1aa99e6b 5063 if (do_utf8) {
ffc61ed2 5064 loceol = PL_regeol;
1a4fad37 5065 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5066 while (hardcount < max && scan < loceol &&
3568d838 5067 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5068 scan += UTF8SKIP(scan);
5069 hardcount++;
5070 }
5071 } else {
5072 while (scan < loceol && !isDIGIT(*scan))
5073 scan++;
a0ed51b3
LW
5074 }
5075 break;
a0d0e21e
LW
5076 default: /* Called on something of 0 width. */
5077 break; /* So match right here or not at all. */
5078 }
a687059c 5079
a0ed51b3
LW
5080 if (hardcount)
5081 c = hardcount;
5082 else
5083 c = scan - PL_reginput;
3280af22 5084 PL_reginput = scan;
a687059c 5085
a3621e74 5086 DEBUG_r({
e68ec53f 5087 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5088 DEBUG_EXECUTE_r({
e68ec53f
YO
5089 SV * const prop = sv_newmortal();
5090 regprop(prog, prop, p);
5091 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5092 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5093 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5094 });
be8e71aa 5095 });
9041c2e3 5096
a0d0e21e 5097 return(c);
a687059c
LW
5098}
5099
c277df42 5100
be8e71aa 5101#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5102/*
ffc61ed2
JH
5103- regclass_swash - prepare the utf8 swash
5104*/
5105
5106SV *
32fc9b6a 5107Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5108{
97aff369 5109 dVAR;
9e55ce06
JH
5110 SV *sw = NULL;
5111 SV *si = NULL;
5112 SV *alt = NULL;
3dab1dad 5113 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5114
4f639d21 5115 if (data && data->count) {
a3b680e6 5116 const U32 n = ARG(node);
ffc61ed2 5117
4f639d21
DM
5118 if (data->what[n] == 's') {
5119 SV * const rv = (SV*)data->data[n];
890ce7af 5120 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5121 SV **const ary = AvARRAY(av);
9e55ce06 5122 SV **a, **b;
9041c2e3 5123
711a919c 5124 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5125 * documentation of these array elements. */
5126
b11f357e 5127 si = *ary;
8f7f7219 5128 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5129 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5130
ffc61ed2
JH
5131 if (a)
5132 sw = *a;
5133 else if (si && doinit) {
5134 sw = swash_init("utf8", "", si, 1, 0);
5135 (void)av_store(av, 1, sw);
5136 }
9e55ce06
JH
5137 if (b)
5138 alt = *b;
ffc61ed2
JH
5139 }
5140 }
5141
9e55ce06
JH
5142 if (listsvp)
5143 *listsvp = si;
5144 if (altsvp)
5145 *altsvp = alt;
ffc61ed2
JH
5146
5147 return sw;
5148}
76234dfb 5149#endif
ffc61ed2
JH
5150
5151/*
ba7b4546 5152 - reginclass - determine if a character falls into a character class
832705d4
JH
5153
5154 The n is the ANYOF regnode, the p is the target string, lenp
5155 is pointer to the maximum length of how far to go in the p
5156 (if the lenp is zero, UTF8SKIP(p) is used),
5157 do_utf8 tells whether the target string is in UTF-8.
5158
bbce6d69 5159 */
5160
76e3520e 5161STATIC bool
32fc9b6a 5162S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5163{
27da23d5 5164 dVAR;
a3b680e6 5165 const char flags = ANYOF_FLAGS(n);
bbce6d69 5166 bool match = FALSE;
cc07378b 5167 UV c = *p;
ae9ddab8 5168 STRLEN len = 0;
9e55ce06 5169 STRLEN plen;
1aa99e6b 5170
19f67299
TS
5171 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5172 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5173 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5174 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5175 if (len == (STRLEN)-1)
5176 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5177 }
bbce6d69 5178
0f0076b4 5179 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5180 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5181 if (lenp)
5182 *lenp = 0;
ffc61ed2 5183 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5184 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5185 match = TRUE;
bbce6d69 5186 }
3568d838 5187 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5188 match = TRUE;
ffc61ed2 5189 if (!match) {
9e55ce06 5190 AV *av;
32fc9b6a 5191 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5192
5193 if (sw) {
3568d838 5194 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5195 match = TRUE;
5196 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5197 if (!match && lenp && av) {
5198 I32 i;
9e55ce06 5199 for (i = 0; i <= av_len(av); i++) {
890ce7af 5200 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5201 STRLEN len;
890ce7af 5202 const char * const s = SvPV_const(sv, len);
9e55ce06 5203
061b10df 5204 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5205 *lenp = len;
5206 match = TRUE;
5207 break;
5208 }
5209 }
5210 }
5211 if (!match) {
89ebb4a3 5212 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5213 STRLEN tmplen;
5214
9e55ce06
JH
5215 to_utf8_fold(p, tmpbuf, &tmplen);
5216 if (swash_fetch(sw, tmpbuf, do_utf8))
5217 match = TRUE;
5218 }
ffc61ed2
JH
5219 }
5220 }
bbce6d69 5221 }
9e55ce06 5222 if (match && lenp && *lenp == 0)
0f0076b4 5223 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5224 }
1aa99e6b 5225 if (!match && c < 256) {
ffc61ed2
JH
5226 if (ANYOF_BITMAP_TEST(n, c))
5227 match = TRUE;
5228 else if (flags & ANYOF_FOLD) {
eb160463 5229 U8 f;
a0ed51b3 5230
ffc61ed2
JH
5231 if (flags & ANYOF_LOCALE) {
5232 PL_reg_flags |= RF_tainted;
5233 f = PL_fold_locale[c];
5234 }
5235 else
5236 f = PL_fold[c];
5237 if (f != c && ANYOF_BITMAP_TEST(n, f))
5238 match = TRUE;
5239 }
5240
5241 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5242 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5243 if (
5244 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5245 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5246 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5247 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5248 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5249 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5250 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5251 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5252 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5253 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5254 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5255 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5256 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5257 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5258 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5259 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5260 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5261 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5262 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5263 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5264 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5265 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5266 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5267 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5268 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5274 ) /* How's that for a conditional? */
5275 {
5276 match = TRUE;
5277 }
a0ed51b3 5278 }
a0ed51b3
LW
5279 }
5280
a0ed51b3
LW
5281 return (flags & ANYOF_INVERT) ? !match : match;
5282}
161b471a 5283
dfe13c55 5284STATIC U8 *
0ce71af7 5285S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5286{
97aff369 5287 dVAR;
a0ed51b3 5288 if (off >= 0) {
1aa99e6b 5289 while (off-- && s < lim) {
ffc61ed2 5290 /* XXX could check well-formedness here */
a0ed51b3 5291 s += UTF8SKIP(s);
ffc61ed2 5292 }
a0ed51b3
LW
5293 }
5294 else {
5295 while (off++) {
1aa99e6b 5296 if (s > lim) {
a0ed51b3 5297 s--;
ffc61ed2 5298 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5299 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5300 s--;
ffc61ed2
JH
5301 }
5302 /* XXX could check well-formedness here */
a0ed51b3
LW
5303 }
5304 }
5305 }
5306 return s;
5307}
161b471a 5308
dfe13c55 5309STATIC U8 *
0ce71af7 5310S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5311{
97aff369 5312 dVAR;
a0ed51b3 5313 if (off >= 0) {
1aa99e6b 5314 while (off-- && s < lim) {
ffc61ed2 5315 /* XXX could check well-formedness here */
a0ed51b3 5316 s += UTF8SKIP(s);
ffc61ed2 5317 }
a0ed51b3 5318 if (off >= 0)
3dab1dad 5319 return NULL;
a0ed51b3
LW
5320 }
5321 else {
5322 while (off++) {
1aa99e6b 5323 if (s > lim) {
a0ed51b3 5324 s--;
ffc61ed2 5325 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5326 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5327 s--;
ffc61ed2
JH
5328 }
5329 /* XXX could check well-formedness here */
a0ed51b3
LW
5330 }
5331 else
5332 break;
5333 }
5334 if (off <= 0)
3dab1dad 5335 return NULL;
a0ed51b3
LW
5336 }
5337 return s;
5338}
51371543 5339
51371543 5340static void
acfe0abc 5341restore_pos(pTHX_ void *arg)
51371543 5342{
97aff369 5343 dVAR;
097eb12c 5344 regexp * const rex = (regexp *)arg;
51371543
GS
5345 if (PL_reg_eval_set) {
5346 if (PL_reg_oldsaved) {
4f639d21
DM
5347 rex->subbeg = PL_reg_oldsaved;
5348 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5349#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5350 rex->saved_copy = PL_nrs;
ed252734 5351#endif
4f639d21 5352 RX_MATCH_COPIED_on(rex);
51371543
GS
5353 }
5354 PL_reg_magic->mg_len = PL_reg_oldpos;
5355 PL_reg_eval_set = 0;
5356 PL_curpm = PL_reg_oldcurpm;
5357 }
5358}
33b8afdf
JH
5359
5360STATIC void
5361S_to_utf8_substr(pTHX_ register regexp *prog)
5362{
33b8afdf 5363 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5364 SV* const sv = newSVsv(prog->float_substr);
5365 prog->float_utf8 = sv;
33b8afdf
JH
5366 sv_utf8_upgrade(sv);
5367 if (SvTAIL(prog->float_substr))
5368 SvTAIL_on(sv);
5369 if (prog->float_substr == prog->check_substr)
5370 prog->check_utf8 = sv;
5371 }
5372 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5373 SV* const sv = newSVsv(prog->anchored_substr);
5374 prog->anchored_utf8 = sv;
33b8afdf
JH
5375 sv_utf8_upgrade(sv);
5376 if (SvTAIL(prog->anchored_substr))
5377 SvTAIL_on(sv);
5378 if (prog->anchored_substr == prog->check_substr)
5379 prog->check_utf8 = sv;
5380 }
5381}
5382
5383STATIC void
5384S_to_byte_substr(pTHX_ register regexp *prog)
5385{
97aff369 5386 dVAR;
33b8afdf 5387 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5388 SV* sv = newSVsv(prog->float_utf8);
5389 prog->float_substr = sv;
33b8afdf
JH
5390 if (sv_utf8_downgrade(sv, TRUE)) {
5391 if (SvTAIL(prog->float_utf8))
5392 SvTAIL_on(sv);
5393 } else {
5394 SvREFCNT_dec(sv);
5395 prog->float_substr = sv = &PL_sv_undef;
5396 }
5397 if (prog->float_utf8 == prog->check_utf8)
5398 prog->check_substr = sv;
5399 }
5400 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5401 SV* sv = newSVsv(prog->anchored_utf8);
5402 prog->anchored_substr = sv;
33b8afdf
JH
5403 if (sv_utf8_downgrade(sv, TRUE)) {
5404 if (SvTAIL(prog->anchored_utf8))
5405 SvTAIL_on(sv);
5406 } else {
5407 SvREFCNT_dec(sv);
5408 prog->anchored_substr = sv = &PL_sv_undef;
5409 }
5410 if (prog->anchored_utf8 == prog->check_utf8)
5411 prog->check_substr = sv;
5412 }
5413}
66610fdd
RGS
5414
5415/*
5416 * Local variables:
5417 * c-indentation-style: bsd
5418 * c-basic-offset: 4
5419 * indent-tabs-mode: t
5420 * End:
5421 *
37442d52
RGS
5422 * ex: set ts=8 sts=4 sw=4 noet:
5423 */