This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Comply with the 0x80th commandment
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 163 int p;
40a82448 164 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 165
e49a9654
IH
166 if (paren_elems_to_push < 0)
167 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
168
a01268b5 169#define REGCP_OTHER_ELEMS 6
4b3c1a47 170 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 171 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 172/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
173 SSPUSHINT(PL_regendp[p]);
174 SSPUSHINT(PL_regstartp[p]);
3280af22 175 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 176 SSPUSHINT(p);
40a82448
DM
177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
178 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
179 (UV)p, (IV)PL_regstartp[p],
180 (IV)(PL_reg_start_tmp[p] - PL_bostr),
181 (IV)PL_regendp[p]
182 ));
a0d0e21e 183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
a3621e74 199# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
a3621e74 203# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
097eb12c 209S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 210{
97aff369 211 dVAR;
b1ce53c5 212 I32 i;
a0d0e21e 213 char *input;
b1ce53c5 214
a3621e74
YO
215 GET_RE_DEBUG_FLAGS_DECL;
216
b1ce53c5 217 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 218 i = SSPOPINT;
b1ce53c5
JH
219 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
220 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 221 input = (char *) SSPOPPTR;
a01268b5 222 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
223 *PL_reglastparen = SSPOPINT;
224 PL_regsize = SSPOPINT;
b1ce53c5
JH
225
226 /* Now restore the parentheses context. */
41123dfd
JH
227 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
228 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 229 I32 tmps;
097eb12c 230 U32 paren = (U32)SSPOPINT;
3280af22 231 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
232 PL_regstartp[paren] = SSPOPINT;
233 tmps = SSPOPINT;
3280af22
NIS
234 if (paren <= *PL_reglastparen)
235 PL_regendp[paren] = tmps;
a3621e74 236 DEBUG_EXECUTE_r(
c3464db5 237 PerlIO_printf(Perl_debug_log,
b900a521 238 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 239 (UV)paren, (IV)PL_regstartp[paren],
b900a521 240 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 241 (IV)PL_regendp[paren],
3280af22 242 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 243 );
a0d0e21e 244 }
a3621e74 245 DEBUG_EXECUTE_r(
bb7a0f54 246 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 247 PerlIO_printf(Perl_debug_log,
faccc32b 248 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 249 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
250 }
251 );
daf18116 252#if 1
dafc8851
JH
253 /* It would seem that the similar code in regtry()
254 * already takes care of this, and in fact it is in
255 * a better location to since this code can #if 0-ed out
256 * but the code in regtry() is needed or otherwise tests
257 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
258 * (as of patchlevel 7877) will fail. Then again,
259 * this code seems to be necessary or otherwise
260 * building DynaLoader will fail:
261 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
262 * --jhi */
bb7a0f54 263 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
264 if (i > PL_regsize)
265 PL_regstartp[i] = -1;
266 PL_regendp[i] = -1;
a0d0e21e 267 }
dafc8851 268#endif
a0d0e21e
LW
269 return input;
270}
271
02db2b7b 272#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 273
95b24440 274#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
275 if (paren) { \
276 if (n) { \
277 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
278 PL_regendp[paren] = input - PL_bostr; \
279 } \
280 else \
281 PL_regendp[paren] = -1; \
282 } \
95b24440
DM
283 REGMATCH(next, where); \
284 if (result) \
29d1e993
HS
285 sayYES; \
286 if (paren && n) \
287 PL_regendp[paren] = -1; \
288}
289
290
a687059c 291/*
e50aee73 292 * pregexec and friends
a687059c
LW
293 */
294
76234dfb 295#ifndef PERL_IN_XSUB_RE
a687059c 296/*
c277df42 297 - pregexec - match a regexp against a string
a687059c 298 */
c277df42 299I32
864dbfa3 300Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 301 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
302/* strend: pointer to null at end of string */
303/* strbeg: real beginning of string */
304/* minend: end of match must be >=minend after stringarg. */
305/* nosave: For optimizations. */
306{
307 return
9041c2e3 308 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
309 nosave ? 0 : REXEC_COPY_STR);
310}
76234dfb 311#endif
22e551b9 312
9041c2e3 313/*
cad2e5aa
JH
314 * Need to implement the following flags for reg_anch:
315 *
316 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
317 * USE_INTUIT_ML
318 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
319 * INTUIT_AUTORITATIVE_ML
320 * INTUIT_ONCE_NOML - Intuit can match in one location only.
321 * INTUIT_ONCE_ML
322 *
323 * Another flag for this function: SECOND_TIME (so that float substrs
324 * with giant delta may be not rechecked).
325 */
326
327/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
328
3f7c398e 329/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
330 Otherwise, only SvCUR(sv) is used to get strbeg. */
331
332/* XXXX We assume that strpos is strbeg unless sv. */
333
6eb5f6b9
JH
334/* XXXX Some places assume that there is a fixed substring.
335 An update may be needed if optimizer marks as "INTUITable"
336 RExen without fixed substrings. Similarly, it is assumed that
337 lengths of all the strings are no more than minlen, thus they
338 cannot come from lookahead.
339 (Or minlen should take into account lookahead.) */
340
2c2d71f5
JH
341/* A failure to find a constant substring means that there is no need to make
342 an expensive call to REx engine, thus we celebrate a failure. Similarly,
343 finding a substring too deep into the string means that less calls to
30944b6d
IZ
344 regtry() should be needed.
345
346 REx compiler's optimizer found 4 possible hints:
347 a) Anchored substring;
348 b) Fixed substring;
349 c) Whether we are anchored (beginning-of-line or \G);
350 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 351 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
352 string which does not contradict any of them.
353 */
2c2d71f5 354
6eb5f6b9
JH
355/* Most of decisions we do here should have been done at compile time.
356 The nodes of the REx which we used for the search should have been
357 deleted from the finite automaton. */
358
cad2e5aa
JH
359char *
360Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
361 char *strend, U32 flags, re_scream_pos_data *data)
362{
97aff369 363 dVAR;
b7953727 364 register I32 start_shift = 0;
cad2e5aa 365 /* Should be nonnegative! */
b7953727 366 register I32 end_shift = 0;
2c2d71f5
JH
367 register char *s;
368 register SV *check;
a1933d95 369 char *strbeg;
cad2e5aa 370 char *t;
a3b680e6 371 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 372 I32 ml_anch;
bd61b366
SS
373 register char *other_last = NULL; /* other substr checked before this */
374 char *check_at = NULL; /* check substr found at this pos */
1df70142 375 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 376#ifdef DEBUGGING
890ce7af
AL
377 const char * const i_strpos = strpos;
378 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 379#endif
a3621e74
YO
380
381 GET_RE_DEBUG_FLAGS_DECL;
382
a30b2f1f 383 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 384
b8d68ded 385 if (prog->reganch & ROPT_UTF8) {
a3621e74 386 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
387 "UTF-8 regex...\n"));
388 PL_reg_flags |= RF_utf8;
389 }
390
a3621e74 391 DEBUG_EXECUTE_r({
1df70142 392 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
393 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
394 strpos;
1df70142 395 const int len = PL_reg_match_utf8 ?
bb7a0f54 396 (int)strlen(s) : strend - strpos;
2a782b5b
JH
397 if (!PL_colorset)
398 reginitcolors();
b8d68ded 399 if (PL_reg_match_utf8)
a3621e74 400 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 401 "UTF-8 target...\n"));
2a782b5b 402 PerlIO_printf(Perl_debug_log,
a0288114 403 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 404 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
405 prog->precomp,
406 PL_colors[1],
407 (strlen(prog->precomp) > 60 ? "..." : ""),
408 PL_colors[0],
409 (int)(len > 60 ? 60 : len),
410 s, PL_colors[1],
411 (len > 60 ? "..." : "")
412 );
413 });
cad2e5aa 414
c344f387
JH
415 /* CHR_DIST() would be more correct here but it makes things slow. */
416 if (prog->minlen > strend - strpos) {
a3621e74 417 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 418 "String too short... [re_intuit_start]\n"));
cad2e5aa 419 goto fail;
2c2d71f5 420 }
a1933d95 421 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 422 PL_regeol = strend;
33b8afdf
JH
423 if (do_utf8) {
424 if (!prog->check_utf8 && prog->check_substr)
425 to_utf8_substr(prog);
426 check = prog->check_utf8;
427 } else {
428 if (!prog->check_substr && prog->check_utf8)
429 to_byte_substr(prog);
430 check = prog->check_substr;
431 }
432 if (check == &PL_sv_undef) {
a3621e74 433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
434 "Non-utf string cannot match utf check string\n"));
435 goto fail;
436 }
2c2d71f5 437 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
438 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
439 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 440 && !multiline ) ); /* Check after \n? */
cad2e5aa 441
7e25d62c
JH
442 if (!ml_anch) {
443 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
444 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 445 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
446 && sv && !SvROK(sv)
447 && (strpos != strbeg)) {
a3621e74 448 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
449 goto fail;
450 }
451 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 452 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 453 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
454 I32 slen;
455
1aa99e6b 456 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
457 if (SvTAIL(check)) {
458 slen = SvCUR(check); /* >= 1 */
cad2e5aa 459
9041c2e3 460 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 461 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 463 goto fail_finish;
cad2e5aa
JH
464 }
465 /* Now should match s[0..slen-2] */
466 slen--;
3f7c398e 467 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 468 || (slen > 1
3f7c398e 469 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 470 report_neq:
a3621e74 471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
472 goto fail_finish;
473 }
cad2e5aa 474 }
3f7c398e 475 else if (*SvPVX_const(check) != *s
653099ff 476 || ((slen = SvCUR(check)) > 1
3f7c398e 477 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 478 goto report_neq;
c315bfe8 479 check_at = s;
2c2d71f5 480 goto success_at_start;
7e25d62c 481 }
cad2e5aa 482 }
2c2d71f5 483 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 484 s = strpos;
2c2d71f5 485 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 486 end_shift = prog->minlen - start_shift -
653099ff 487 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 488 if (!ml_anch) {
a3b680e6 489 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 490 - (SvTAIL(check) != 0);
a3b680e6 491 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
492
493 if (end_shift < eshift)
494 end_shift = eshift;
495 }
cad2e5aa 496 }
2c2d71f5 497 else { /* Can match at random position */
cad2e5aa
JH
498 ml_anch = 0;
499 s = strpos;
2c2d71f5
JH
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 /* Should be nonnegative! */
502 end_shift = prog->minlen - start_shift -
653099ff 503 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
504 }
505
2c2d71f5 506#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 507 if (end_shift < 0)
6bbae5e6 508 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
509#endif
510
2c2d71f5
JH
511 restart:
512 /* Find a possible match in the region s..strend by looking for
513 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 514 if (flags & REXEC_SCREAM) {
cad2e5aa 515 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 516 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 517
2c2d71f5
JH
518 if (PL_screamfirst[BmRARE(check)] >= 0
519 || ( BmRARE(check) == '\n'
520 && (BmPREVIOUS(check) == SvCUR(check) - 1)
521 && SvTAIL(check) ))
9041c2e3 522 s = screaminstr(sv, check,
2c2d71f5 523 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 524 else
2c2d71f5 525 goto fail_finish;
4addbd3b
HS
526 /* we may be pointing at the wrong string */
527 if (s && RX_MATCH_COPIED(prog))
3f7c398e 528 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
529 if (data)
530 *data->scream_olds = s;
531 }
f33976b4 532 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
533 s = fbm_instr((U8*)(s + start_shift),
534 (U8*)(strend - end_shift),
7fba1cd6 535 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 536 else
1aa99e6b
IH
537 s = fbm_instr(HOP3(s, start_shift, strend),
538 HOP3(strend, -end_shift, strbeg),
7fba1cd6 539 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
540
541 /* Update the count-of-usability, remove useless subpatterns,
542 unshift s. */
2c2d71f5 543
a0288114 544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 545 (s ? "Found" : "Did not find"),
33b8afdf 546 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 547 PL_colors[0],
7b0972df 548 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 549 SvPVX_const(check),
2c2d71f5
JH
550 PL_colors[1], (SvTAIL(check) ? "$" : ""),
551 (s ? " at offset " : "...\n") ) );
552
553 if (!s)
554 goto fail_finish;
555
6eb5f6b9
JH
556 check_at = s;
557
2c2d71f5 558 /* Finish the diagnostic message */
a3621e74 559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
560
561 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
562 Start with the other substr.
563 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 564 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
565 *always* match. Probably should be marked during compile...
566 Probably it is right to do no SCREAM here...
567 */
568
33b8afdf 569 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 570 /* Take into account the "other" substring. */
2c2d71f5
JH
571 /* XXXX May be hopelessly wrong for UTF... */
572 if (!other_last)
6eb5f6b9 573 other_last = strpos;
33b8afdf 574 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
575 do_other_anchored:
576 {
890ce7af
AL
577 char * const last = HOP3c(s, -start_shift, strbeg);
578 char *last1, *last2;
be8e71aa 579 char * const saved_s = s;
33b8afdf 580 SV* must;
2c2d71f5 581
2c2d71f5
JH
582 t = s - prog->check_offset_max;
583 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 584 && (!do_utf8
0ce71af7 585 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 586 && t > strpos)))
6f207bd3 587 NOOP;
2c2d71f5
JH
588 else
589 t = strpos;
1aa99e6b 590 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
591 if (t < other_last) /* These positions already checked */
592 t = other_last;
1aa99e6b 593 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
594 if (last < last1)
595 last1 = last;
596 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
597 /* On end-of-str: see comment below. */
33b8afdf
JH
598 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
599 if (must == &PL_sv_undef) {
600 s = (char*)NULL;
a3621e74 601 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
602 }
603 else
604 s = fbm_instr(
605 (unsigned char*)t,
606 HOP3(HOP3(last1, prog->anchored_offset, strend)
607 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
608 must,
7fba1cd6 609 multiline ? FBMrf_MULTILINE : 0
33b8afdf 610 );
a3621e74 611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 612 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
613 (s ? "Found" : "Contradicts"),
614 PL_colors[0],
33b8afdf
JH
615 (int)(SvCUR(must)
616 - (SvTAIL(must)!=0)),
3f7c398e 617 SvPVX_const(must),
33b8afdf 618 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
619 if (!s) {
620 if (last1 >= last2) {
a3621e74 621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
622 ", giving up...\n"));
623 goto fail_finish;
624 }
a3621e74 625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 626 ", trying floating at offset %ld...\n",
be8e71aa 627 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
628 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
629 s = HOP3c(last, 1, strend);
2c2d71f5
JH
630 goto restart;
631 }
632 else {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 634 (long)(s - i_strpos)));
1aa99e6b
IH
635 t = HOP3c(s, -prog->anchored_offset, strbeg);
636 other_last = HOP3c(s, 1, strend);
be8e71aa 637 s = saved_s;
2c2d71f5
JH
638 if (t == strpos)
639 goto try_at_start;
2c2d71f5
JH
640 goto try_at_offset;
641 }
30944b6d 642 }
2c2d71f5
JH
643 }
644 else { /* Take into account the floating substring. */
33b8afdf 645 char *last, *last1;
be8e71aa 646 char * const saved_s = s;
33b8afdf
JH
647 SV* must;
648
649 t = HOP3c(s, -start_shift, strbeg);
650 last1 = last =
651 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
652 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
653 last = HOP3c(t, prog->float_max_offset, strend);
654 s = HOP3c(t, prog->float_min_offset, strend);
655 if (s < other_last)
656 s = other_last;
2c2d71f5 657 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
658 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
659 /* fbm_instr() takes into account exact value of end-of-str
660 if the check is SvTAIL(ed). Since false positives are OK,
661 and end-of-str is not later than strend we are OK. */
662 if (must == &PL_sv_undef) {
663 s = (char*)NULL;
a3621e74 664 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
665 }
666 else
2c2d71f5 667 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
668 (unsigned char*)last + SvCUR(must)
669 - (SvTAIL(must)!=0),
7fba1cd6 670 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
672 (s ? "Found" : "Contradicts"),
673 PL_colors[0],
674 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 675 SvPVX_const(must),
33b8afdf
JH
676 PL_colors[1], (SvTAIL(must) ? "$" : "")));
677 if (!s) {
678 if (last1 == last) {
a3621e74 679 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
680 ", giving up...\n"));
681 goto fail_finish;
2c2d71f5 682 }
a3621e74 683 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 684 ", trying anchored starting at offset %ld...\n",
be8e71aa 685 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
686 other_last = last;
687 s = HOP3c(t, 1, strend);
688 goto restart;
689 }
690 else {
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
692 (long)(s - i_strpos)));
693 other_last = s; /* Fix this later. --Hugo */
be8e71aa 694 s = saved_s;
33b8afdf
JH
695 if (t == strpos)
696 goto try_at_start;
697 goto try_at_offset;
698 }
2c2d71f5 699 }
cad2e5aa 700 }
2c2d71f5
JH
701
702 t = s - prog->check_offset_max;
2c2d71f5 703 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 704 && (!do_utf8
0ce71af7 705 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 706 && t > strpos))) {
2c2d71f5
JH
707 /* Fixed substring is found far enough so that the match
708 cannot start at strpos. */
709 try_at_offset:
cad2e5aa 710 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
711 /* Eventually fbm_*() should handle this, but often
712 anchored_offset is not 0, so this check will not be wasted. */
713 /* XXXX In the code below we prefer to look for "^" even in
714 presence of anchored substrings. And we search even
715 beyond the found float position. These pessimizations
716 are historical artefacts only. */
717 find_anchor:
2c2d71f5 718 while (t < strend - prog->minlen) {
cad2e5aa 719 if (*t == '\n') {
4ee3650e 720 if (t < check_at - prog->check_offset_min) {
33b8afdf 721 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
722 /* Since we moved from the found position,
723 we definitely contradict the found anchored
30944b6d
IZ
724 substr. Due to the above check we do not
725 contradict "check" substr.
726 Thus we can arrive here only if check substr
727 is float. Redo checking for "other"=="fixed".
728 */
9041c2e3 729 strpos = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
732 goto do_other_anchored;
733 }
4ee3650e
GS
734 /* We don't contradict the found floating substring. */
735 /* XXXX Why not check for STCLASS? */
cad2e5aa 736 s = t + 1;
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
739 goto set_useful;
740 }
4ee3650e
GS
741 /* Position contradicts check-string */
742 /* XXXX probably better to look for check-string
743 than for "\n", so one should lower the limit for t? */
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 745 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 746 other_last = strpos = s = t + 1;
cad2e5aa
JH
747 goto restart;
748 }
749 t++;
750 }
a3621e74 751 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 752 PL_colors[0], PL_colors[1]));
2c2d71f5 753 goto fail_finish;
cad2e5aa 754 }
f5952150 755 else {
a3621e74 756 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 757 PL_colors[0], PL_colors[1]));
f5952150 758 }
cad2e5aa
JH
759 s = t;
760 set_useful:
33b8afdf 761 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
762 }
763 else {
f5952150 764 /* The found string does not prohibit matching at strpos,
2c2d71f5 765 - no optimization of calling REx engine can be performed,
f5952150
GS
766 unless it was an MBOL and we are not after MBOL,
767 or a future STCLASS check will fail this. */
2c2d71f5
JH
768 try_at_start:
769 /* Even in this situation we may use MBOL flag if strpos is offset
770 wrt the start of the string. */
05b4157f 771 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 772 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
773 /* May be due to an implicit anchor of m{.*foo} */
774 && !(prog->reganch & ROPT_IMPLICIT))
775 {
cad2e5aa
JH
776 t = strpos;
777 goto find_anchor;
778 }
a3621e74 779 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 780 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 781 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 782 );
2c2d71f5 783 success_at_start:
30944b6d 784 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
785 && (do_utf8 ? (
786 prog->check_utf8 /* Could be deleted already */
787 && --BmUSEFUL(prog->check_utf8) < 0
788 && (prog->check_utf8 == prog->float_utf8)
789 ) : (
790 prog->check_substr /* Could be deleted already */
791 && --BmUSEFUL(prog->check_substr) < 0
792 && (prog->check_substr == prog->float_substr)
793 )))
66e933ab 794 {
cad2e5aa 795 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
797 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
798 if (do_utf8 ? prog->check_substr : prog->check_utf8)
799 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
800 prog->check_substr = prog->check_utf8 = NULL; /* disable */
801 prog->float_substr = prog->float_utf8 = NULL; /* clear */
802 check = NULL; /* abort */
cad2e5aa 803 s = strpos;
3cf5c195
IZ
804 /* XXXX This is a remnant of the old implementation. It
805 looks wasteful, since now INTUIT can use many
6eb5f6b9 806 other heuristics. */
cad2e5aa
JH
807 prog->reganch &= ~RE_USE_INTUIT;
808 }
809 else
810 s = strpos;
811 }
812
6eb5f6b9
JH
813 /* Last resort... */
814 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 815 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
816 /* minlen == 0 is possible if regstclass is \b or \B,
817 and the fixed substr is ''$.
818 Since minlen is already taken into account, s+1 is before strend;
819 accidentally, minlen >= 1 guaranties no false positives at s + 1
820 even for \b or \B. But (minlen? 1 : 0) below assumes that
821 regstclass does not come from lookahead... */
822 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
823 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 824 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 825 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 826 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 827 : 1);
07be1b83 828 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 829 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 830 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
831 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
832 cl_l, strend)
833 : strend);
07be1b83
YO
834 /*if (OP(prog->regstclass) == TRIE)
835 endpos++;*/
6eb5f6b9 836 t = s;
3b0527fe 837 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
838 if (!s) {
839#ifdef DEBUGGING
cbbf8932 840 const char *what = NULL;
6eb5f6b9
JH
841#endif
842 if (endpos == strend) {
a3621e74 843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
844 "Could not match STCLASS...\n") );
845 goto fail;
846 }
a3621e74 847 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 848 "This position contradicts STCLASS...\n") );
653099ff
GS
849 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
850 goto fail;
6eb5f6b9 851 /* Contradict one of substrings */
33b8afdf
JH
852 if (prog->anchored_substr || prog->anchored_utf8) {
853 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 854 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 855 hop_and_restart:
1aa99e6b 856 s = HOP3c(t, 1, strend);
66e933ab
GS
857 if (s + start_shift + end_shift > strend) {
858 /* XXXX Should be taken into account earlier? */
a3621e74 859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
860 "Could not match STCLASS...\n") );
861 goto fail;
862 }
5e39e1e5
HS
863 if (!check)
864 goto giveup;
a3621e74 865 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 866 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
867 what, (long)(s + start_shift - i_strpos)) );
868 goto restart;
869 }
66e933ab 870 /* Have both, check_string is floating */
6eb5f6b9
JH
871 if (t + start_shift >= check_at) /* Contradicts floating=check */
872 goto retry_floating_check;
873 /* Recheck anchored substring, but not floating... */
9041c2e3 874 s = check_at;
5e39e1e5
HS
875 if (!check)
876 goto giveup;
a3621e74 877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 878 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
879 (long)(other_last - i_strpos)) );
880 goto do_other_anchored;
881 }
60e71179
GS
882 /* Another way we could have checked stclass at the
883 current position only: */
884 if (ml_anch) {
885 s = t = t + 1;
5e39e1e5
HS
886 if (!check)
887 goto giveup;
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 889 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 890 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 891 goto try_at_offset;
66e933ab 892 }
33b8afdf 893 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 894 goto fail;
6eb5f6b9
JH
895 /* Check is floating subtring. */
896 retry_floating_check:
897 t = check_at - start_shift;
a3621e74 898 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
899 goto hop_and_restart;
900 }
b7953727 901 if (t != s) {
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 903 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
904 (long)(t - i_strpos), (long)(s - i_strpos))
905 );
906 }
907 else {
a3621e74 908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
909 "Does not contradict STCLASS...\n");
910 );
911 }
6eb5f6b9 912 }
5e39e1e5 913 giveup:
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
915 PL_colors[4], (check ? "Guessed" : "Giving up"),
916 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 917 return s;
2c2d71f5
JH
918
919 fail_finish: /* Substring not found */
33b8afdf
JH
920 if (prog->check_substr || prog->check_utf8) /* could be removed already */
921 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 922 fail:
a3621e74 923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 924 PL_colors[4], PL_colors[5]));
bd61b366 925 return NULL;
cad2e5aa 926}
9661b544 927
6eb5f6b9 928/* We know what class REx starts with. Try to find this position... */
3b0527fe 929/* if reginfo is NULL, its a dryrun */
07be1b83
YO
930/* annoyingly all the vars in this routine have different names from their counterparts
931 in regmatch. /grrr */
3b0527fe 932
3c3eec57 933STATIC char *
07be1b83
YO
934S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
935 const char *strend, const regmatch_info *reginfo)
a687059c 936{
27da23d5 937 dVAR;
1df70142 938 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 939 char *m;
d8093b23 940 STRLEN ln;
5dab1207 941 STRLEN lnc;
078c425b 942 register STRLEN uskip;
d8093b23
G
943 unsigned int c1;
944 unsigned int c2;
6eb5f6b9
JH
945 char *e;
946 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 947 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 948
6eb5f6b9
JH
949 /* We know what class it must start with. */
950 switch (OP(c)) {
6eb5f6b9 951 case ANYOF:
388cc4de 952 if (do_utf8) {
078c425b 953 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
954 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
955 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
956 reginclass(prog, c, (U8*)s, 0, do_utf8) :
957 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 958 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
959 goto got_it;
960 else
961 tmp = doevery;
962 }
963 else
964 tmp = 1;
078c425b 965 s += uskip;
388cc4de
HS
966 }
967 }
968 else {
969 while (s < strend) {
970 STRLEN skip = 1;
971
32fc9b6a 972 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
973 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
974 /* The assignment of 2 is intentional:
975 * for the folded sharp s, the skip is 2. */
976 (skip = SHARP_S_SKIP))) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
978 goto got_it;
979 else
980 tmp = doevery;
981 }
982 else
983 tmp = 1;
984 s += skip;
985 }
a0d0e21e 986 }
6eb5f6b9 987 break;
f33976b4
DB
988 case CANY:
989 while (s < strend) {
3b0527fe 990 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
991 goto got_it;
992 else
993 tmp = doevery;
994 s++;
995 }
996 break;
6eb5f6b9 997 case EXACTF:
5dab1207
NIS
998 m = STRING(c);
999 ln = STR_LEN(c); /* length to match in octets/bytes */
1000 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1001 if (UTF) {
a2a2844f 1002 STRLEN ulen1, ulen2;
5dab1207 1003 U8 *sm = (U8 *) m;
89ebb4a3
JH
1004 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1005 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1006 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1007
1008 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1009 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1010
89ebb4a3 1011 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1012 0, uniflags);
89ebb4a3 1013 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1014 0, uniflags);
5dab1207
NIS
1015 lnc = 0;
1016 while (sm < ((U8 *) m + ln)) {
1017 lnc++;
1018 sm += UTF8SKIP(sm);
1019 }
1aa99e6b
IH
1020 }
1021 else {
1022 c1 = *(U8*)m;
1023 c2 = PL_fold[c1];
1024 }
6eb5f6b9
JH
1025 goto do_exactf;
1026 case EXACTFL:
5dab1207
NIS
1027 m = STRING(c);
1028 ln = STR_LEN(c);
1029 lnc = (I32) ln;
d8093b23 1030 c1 = *(U8*)m;
6eb5f6b9
JH
1031 c2 = PL_fold_locale[c1];
1032 do_exactf:
db12adc6 1033 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1034
3b0527fe 1035 if (!reginfo && e < s)
6eb5f6b9 1036 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1037
60a8b682
JH
1038 /* The idea in the EXACTF* cases is to first find the
1039 * first character of the EXACTF* node and then, if
1040 * necessary, case-insensitively compare the full
1041 * text of the node. The c1 and c2 are the first
1042 * characters (though in Unicode it gets a bit
1043 * more complicated because there are more cases
7f16dd3d
JH
1044 * than just upper and lower: one needs to use
1045 * the so-called folding case for case-insensitive
1046 * matching (called "loose matching" in Unicode).
1047 * ibcmp_utf8() will do just that. */
60a8b682 1048
1aa99e6b 1049 if (do_utf8) {
575cac57 1050 UV c, f;
89ebb4a3 1051 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1052 STRLEN len, foldlen;
4ad0818d 1053 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1054 if (c1 == c2) {
5dab1207
NIS
1055 /* Upper and lower of 1st char are equal -
1056 * probably not a "letter". */
1aa99e6b 1057 while (s <= e) {
89ebb4a3 1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1059 uniflags);
80aecb99
JH
1060 if ( c == c1
1061 && (ln == len ||
3dab1dad
YO
1062 ibcmp_utf8(s, NULL, 0, do_utf8,
1063 m, NULL, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1065 goto got_it;
80aecb99 1066 else {
1df70142 1067 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254 1073 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1074 NULL, foldlen, do_utf8,
d07ddd77 1075 m,
3dab1dad 1076 NULL, ln, (bool)UTF))
3b0527fe 1077 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
89ebb4a3 1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1086 uniflags);
80aecb99 1087
60a8b682 1088 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1089 * Note that not all the possible combinations
1090 * are handled here: some of them are handled
1091 * by the standard folding rules, and some of
1092 * them (the character class or ANYOF cases)
1093 * are handled during compiletime in
1094 * regexec.c:S_regclass(). */
880bd946
JH
1095 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1096 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1097 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1098
1099 if ( (c == c1 || c == c2)
1100 && (ln == len ||
3dab1dad
YO
1101 ibcmp_utf8(s, NULL, 0, do_utf8,
1102 m, NULL, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1104 goto got_it;
80aecb99 1105 else {
1df70142 1106 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1113 NULL, foldlen, do_utf8,
d07ddd77 1114 m,
3dab1dad 1115 NULL, ln, (bool)UTF))
3b0527fe 1116 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
3b0527fe 1140 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
6136c704 1154 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1155 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1156 }
1157 tmp = ((OP(c) == BOUND ?
9041c2e3 1158 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1159 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1160 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1161 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1162 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1163 isALNUM_LC_utf8((U8*)s)))
1164 {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
078c425b 1169 s += uskip;
a687059c 1170 }
a0d0e21e 1171 }
667bb95a 1172 else {
12d33761 1173 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1174 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1175 while (s < strend) {
1176 if (tmp ==
1177 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1178 tmp = !tmp;
3b0527fe 1179 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1180 goto got_it;
1181 }
1182 s++;
a0ed51b3 1183 }
a0ed51b3 1184 }
3b0527fe 1185 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1186 goto got_it;
1187 break;
bbce6d69 1188 case NBOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case NBOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == NBOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1202 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1203 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1208 goto got_it;
078c425b 1209 s += uskip;
ffc61ed2 1210 }
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1214 tmp = ((OP(c) == NBOUND ?
1215 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1216 while (s < strend) {
1217 if (tmp ==
1218 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1219 tmp = !tmp;
3b0527fe 1220 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 s++;
1223 }
a0ed51b3 1224 }
3b0527fe 1225 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1226 goto got_it;
1227 break;
a0d0e21e 1228 case ALNUM:
ffc61ed2 1229 if (do_utf8) {
1a4fad37 1230 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1232 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
bbce6d69 1238 else
ffc61ed2 1239 tmp = 1;
078c425b 1240 s += uskip;
bbce6d69 1241 }
bbce6d69 1242 }
ffc61ed2
JH
1243 else {
1244 while (s < strend) {
1245 if (isALNUM(*s)) {
3b0527fe 1246 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1247 goto got_it;
1248 else
1249 tmp = doevery;
1250 }
a0ed51b3 1251 else
ffc61ed2
JH
1252 tmp = 1;
1253 s++;
a0ed51b3 1254 }
a0ed51b3
LW
1255 }
1256 break;
bbce6d69 1257 case ALNUML:
3280af22 1258 PL_reg_flags |= RF_tainted;
ffc61ed2 1259 if (do_utf8) {
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1261 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a687059c 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
a0d0e21e 1270 }
a0d0e21e 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM_LC(*s)) {
3b0527fe 1275 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
a0d0e21e 1286 case NALNUM:
ffc61ed2 1287 if (do_utf8) {
1a4fad37 1288 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1290 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
bbce6d69 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
bbce6d69 1299 }
bbce6d69 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (!isALNUM(*s)) {
3b0527fe 1304 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
bbce6d69 1315 case NALNUML:
3280af22 1316 PL_reg_flags |= RF_tainted;
ffc61ed2 1317 if (do_utf8) {
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1319 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a687059c 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
a687059c 1328 }
a0d0e21e 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM_LC(*s)) {
3b0527fe 1333 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
a0d0e21e 1344 case SPACE:
ffc61ed2 1345 if (do_utf8) {
1a4fad37 1346 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1348 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0d0e21e 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
2304df62 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (isSPACE(*s)) {
3b0527fe 1362 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
bbce6d69 1373 case SPACEL:
3280af22 1374 PL_reg_flags |= RF_tainted;
ffc61ed2 1375 if (do_utf8) {
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1377 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
bbce6d69 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
bbce6d69 1386 }
bbce6d69 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE_LC(*s)) {
3b0527fe 1391 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
a0d0e21e 1402 case NSPACE:
ffc61ed2 1403 if (do_utf8) {
1a4fad37 1404 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1406 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0d0e21e 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
a687059c 1415 }
a0d0e21e 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (!isSPACE(*s)) {
3b0527fe 1420 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
bbce6d69 1431 case NSPACEL:
3280af22 1432 PL_reg_flags |= RF_tainted;
ffc61ed2 1433 if (do_utf8) {
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1435 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
bbce6d69 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
bbce6d69 1444 }
bbce6d69 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE_LC(*s)) {
3b0527fe 1449 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
a0d0e21e 1460 case DIGIT:
ffc61ed2 1461 if (do_utf8) {
1a4fad37 1462 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1464 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0d0e21e 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
2b69d0c2 1473 }
a0d0e21e 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (isDIGIT(*s)) {
3b0527fe 1478 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
b8c5462f
JH
1489 case DIGITL:
1490 PL_reg_flags |= RF_tainted;
ffc61ed2 1491 if (do_utf8) {
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1493 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
b8c5462f 1502 }
b8c5462f 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT_LC(*s)) {
3b0527fe 1507 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
b8c5462f 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
b8c5462f 1515 }
b8c5462f
JH
1516 }
1517 break;
a0d0e21e 1518 case NDIGIT:
ffc61ed2 1519 if (do_utf8) {
1a4fad37 1520 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1522 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0d0e21e 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
a687059c 1531 }
a0d0e21e 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (!isDIGIT(*s)) {
3b0527fe 1536 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
a0ed51b3 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
a0ed51b3 1544 }
a0ed51b3
LW
1545 }
1546 break;
b8c5462f
JH
1547 case NDIGITL:
1548 PL_reg_flags |= RF_tainted;
ffc61ed2 1549 if (do_utf8) {
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1551 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
b8c5462f 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
b8c5462f 1560 }
a0ed51b3 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT_LC(*s)) {
3b0527fe 1565 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
cf93c79d 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
b8c5462f 1573 }
b8c5462f
JH
1574 }
1575 break;
07be1b83
YO
1576 case TRIE:
1577 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1578 {
1579 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1580 trie_type = do_utf8 ?
1581 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1582 : trie_plain;
1583 /* what trie are we using right now */
1584 reg_ac_data *aho
1585 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1586 reg_trie_data *trie=aho->trie;
1587
1588 const char *last_start = strend - trie->minlen;
1589 const char *real_start = s;
1590 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1591 SV *sv_points;
1592 U8 **points; /* map of where we were in the input string
1593 when reading a given string. For ASCII this
1594 is unnecessary overhead as the relationship
1595 is always 1:1, but for unicode, especially
1596 case folded unicode this is not true. */
07be1b83
YO
1597
1598 GET_RE_DEBUG_FLAGS_DECL;
1599
be8e71aa
YO
1600 /* We can't just allocate points here. We need to wrap it in
1601 * an SV so it gets freed properly if there is a croak while
1602 * running the match */
1603 ENTER;
1604 SAVETMPS;
1605 sv_points=newSV(maxlen * sizeof(U8 *));
1606 SvCUR_set(sv_points,
1607 maxlen * sizeof(U8 *));
1608 SvPOK_on(sv_points);
1609 sv_2mortal(sv_points);
1610 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1611
1612 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1613 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1614 s++;
1615 }
1616 }
1617
1618 while (s <= last_start) {
1619 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1620 U8 *uc = (U8*)s;
1621 U16 charid = 0;
1622 U32 base = 1;
1623 U32 state = 1;
1624 UV uvc = 0;
1625 STRLEN len = 0;
1626 STRLEN foldlen = 0;
1627 U8 *uscan = (U8*)NULL;
1628 U8 *leftmost = NULL;
1629
1630 U32 pointpos = 0;
1631
1632 while ( state && uc <= (U8*)strend ) {
1633 int failed=0;
1634 if (aho->states[ state ].wordnum) {
1635 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1636 if (!leftmost || lpos < leftmost)
1637 leftmost= lpos;
1638 if (base==0) break;
1639 }
1640 points[pointpos++ % maxlen]= uc;
1641 switch (trie_type) {
1642 case trie_utf8_fold:
1643 if ( foldlen>0 ) {
1644 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1645 foldlen -= len;
1646 uscan += len;
1647 len=0;
1648 } else {
1649 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1650 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1651 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1652 foldlen -= UNISKIP( uvc );
1653 uscan = foldbuf + UNISKIP( uvc );
1654 }
1655 break;
1656 case trie_utf8:
1657 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1658 &len, uniflags );
1659 break;
1660 case trie_plain:
1661 uvc = (UV)*uc;
1662 len = 1;
1663 }
1664
1665 if (uvc < 256) {
1666 charid = trie->charmap[ uvc ];
1667 }
1668 else {
1669 charid = 0;
1670 if (trie->widecharmap) {
1671 SV** const svpp = hv_fetch(trie->widecharmap,
1672 (char*)&uvc, sizeof(UV), 0);
1673 if (svpp)
1674 charid = (U16)SvIV(*svpp);
1675 }
1676 }
1677 DEBUG_TRIE_EXECUTE_r(
1678 PerlIO_printf(Perl_debug_log,
1679 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1680 (int)((const char*)uc - real_start), charid, uvc)
1681 );
1682 uc += len;
1683
1684 do {
1685 U32 word = aho->states[ state ].wordnum;
1686 base = aho->states[ state ].trans.base;
1687
1688 DEBUG_TRIE_EXECUTE_r(
1689 PerlIO_printf( Perl_debug_log,
1690 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1691 failed ? "Fail transition to " : "",
1692 state, base, uvc, word)
1693 );
1694 if ( base ) {
1695 U32 tmp;
1696 if (charid &&
1697 (base + charid > trie->uniquecharcount )
1698 && (base + charid - 1 - trie->uniquecharcount
1699 < trie->lasttrans)
1700 && trie->trans[base + charid - 1 -
1701 trie->uniquecharcount].check == state
1702 && (tmp=trie->trans[base + charid - 1 -
1703 trie->uniquecharcount ].next))
1704 {
1705 state = tmp;
1706 break;
1707 }
1708 else {
1709 failed++;
1710 if ( state == 1 )
1711 break;
1712 else
1713 state = aho->fail[state];
1714 }
1715 }
1716 else {
1717 /* we must be accepting here */
1718 failed++;
1719 break;
1720 }
1721 } while(state);
1722 if (failed) {
1723 if (leftmost)
1724 break;
1725 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1726 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1727 uc++;
1728 }
1729 }
1730 }
1731 }
1732 if ( aho->states[ state ].wordnum ) {
1733 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1734 if (!leftmost || lpos < leftmost)
1735 leftmost = lpos;
1736 }
1737 DEBUG_TRIE_EXECUTE_r(
1738 PerlIO_printf( Perl_debug_log,
1739 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1740 "All done: ",
1741 state, base, uvc)
1742 );
1743 if (leftmost) {
1744 s = (char*)leftmost;
be8e71aa
YO
1745 if (!reginfo || regtry(reginfo, s)) {
1746 FREETMPS;
1747 LEAVE;
07be1b83 1748 goto got_it;
be8e71aa 1749 }
07be1b83
YO
1750 s = HOPc(s,1);
1751 } else {
1752 break;
1753 }
1754 }
be8e71aa
YO
1755 FREETMPS;
1756 LEAVE;
07be1b83
YO
1757 }
1758 break;
b3c9acc1 1759 default:
3c3eec57
GS
1760 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1761 break;
d6a28714 1762 }
6eb5f6b9
JH
1763 return 0;
1764 got_it:
1765 return s;
1766}
1767
1768/*
1769 - regexec_flags - match a regexp against a string
1770 */
1771I32
1772Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1773 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1774/* strend: pointer to null at end of string */
1775/* strbeg: real beginning of string */
1776/* minend: end of match must be >=minend after stringarg. */
1777/* data: May be used for some additional optimizations. */
1778/* nosave: For optimizations. */
1779{
97aff369 1780 dVAR;
6eb5f6b9
JH
1781 register char *s;
1782 register regnode *c;
1783 register char *startpos = stringarg;
6eb5f6b9
JH
1784 I32 minlen; /* must match at least this many chars */
1785 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1786 I32 end_shift = 0; /* Same for the end. */ /* CC */
1787 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1788 char *scream_olds = NULL;
3dab1dad 1789 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1790 const bool do_utf8 = DO_UTF8(sv);
2757e526 1791 I32 multiline;
2a782b5b 1792#ifdef DEBUGGING
2757e526
JH
1793 SV* dsv0;
1794 SV* dsv1;
2a782b5b 1795#endif
3b0527fe 1796 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1797
1798 GET_RE_DEBUG_FLAGS_DECL;
1799
9d4ba2ae 1800 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1801
1802 /* Be paranoid... */
1803 if (prog == NULL || startpos == NULL) {
1804 Perl_croak(aTHX_ "NULL regexp parameter");
1805 return 0;
1806 }
1807
2757e526 1808 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1809 reginfo.prog = prog;
2757e526
JH
1810
1811#ifdef DEBUGGING
1812 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1813 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1814#endif
1815
bac06658
JH
1816 RX_MATCH_UTF8_set(prog, do_utf8);
1817
6eb5f6b9 1818 minlen = prog->minlen;
61a36c01 1819 if (strend - startpos < minlen) {
a3621e74 1820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1821 "String too short [regexec_flags]...\n"));
1822 goto phooey;
1aa99e6b 1823 }
6eb5f6b9 1824
6eb5f6b9
JH
1825 /* Check validity of program. */
1826 if (UCHARAT(prog->program) != REG_MAGIC) {
1827 Perl_croak(aTHX_ "corrupted regexp program");
1828 }
1829
1830 PL_reg_flags = 0;
1831 PL_reg_eval_set = 0;
1832 PL_reg_maxiter = 0;
1833
1834 if (prog->reganch & ROPT_UTF8)
1835 PL_reg_flags |= RF_utf8;
1836
1837 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1838 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1839 PL_bostr = strbeg;
3b0527fe 1840 reginfo.sv = sv;
6eb5f6b9
JH
1841
1842 /* Mark end of line for $ (and such) */
1843 PL_regeol = strend;
1844
1845 /* see how far we have to get to not match where we matched before */
3b0527fe 1846 reginfo.till = startpos+minend;
6eb5f6b9 1847
6eb5f6b9
JH
1848 /* If there is a "must appear" string, look for it. */
1849 s = startpos;
1850
3b0527fe 1851 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1852 MAGIC *mg;
1853
1854 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1855 reginfo.ganch = startpos;
6eb5f6b9
JH
1856 else if (sv && SvTYPE(sv) >= SVt_PVMG
1857 && SvMAGIC(sv)
14befaf4
DM
1858 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1859 && mg->mg_len >= 0) {
3b0527fe 1860 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1861 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1862 if (s > reginfo.ganch)
6eb5f6b9 1863 goto phooey;
3b0527fe 1864 s = reginfo.ganch;
6eb5f6b9
JH
1865 }
1866 }
1867 else /* pos() not defined */
3b0527fe 1868 reginfo.ganch = strbeg;
6eb5f6b9
JH
1869 }
1870
a0714e2c 1871 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1872 re_scream_pos_data d;
1873
1874 d.scream_olds = &scream_olds;
1875 d.scream_pos = &scream_pos;
1876 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1877 if (!s) {
a3621e74 1878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1879 goto phooey; /* not present */
3fa9c3d7 1880 }
6eb5f6b9
JH
1881 }
1882
a3621e74 1883 DEBUG_EXECUTE_r({
1df70142
AL
1884 const char * const s0 = UTF
1885 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1886 UNI_DISPLAY_REGEX)
1887 : prog->precomp;
bb7a0f54 1888 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1889 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1890 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1891 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1892 if (!PL_colorset)
1893 reginitcolors();
1894 PerlIO_printf(Perl_debug_log,
a0288114 1895 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1896 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1897 len0, len0, s0,
2a782b5b 1898 PL_colors[1],
9e55ce06 1899 len0 > 60 ? "..." : "",
2a782b5b 1900 PL_colors[0],
9e55ce06
JH
1901 (int)(len1 > 60 ? 60 : len1),
1902 s1, PL_colors[1],
1903 (len1 > 60 ? "..." : "")
2a782b5b
JH
1904 );
1905 });
6eb5f6b9
JH
1906
1907 /* Simplest case: anchored match need be tried only once. */
1908 /* [unless only anchor is BOL and multiline is set] */
1909 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1910 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1911 goto got_it;
7fba1cd6 1912 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1913 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1914 {
1915 char *end;
1916
1917 if (minlen)
1918 dontbother = minlen - 1;
1aa99e6b 1919 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1920 /* for multiline we only have to try after newlines */
33b8afdf 1921 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1922 if (s == startpos)
1923 goto after_try;
1924 while (1) {
3b0527fe 1925 if (regtry(&reginfo, s))
6eb5f6b9
JH
1926 goto got_it;
1927 after_try:
1928 if (s >= end)
1929 goto phooey;
1930 if (prog->reganch & RE_USE_INTUIT) {
1931 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1932 if (!s)
1933 goto phooey;
1934 }
1935 else
1936 s++;
1937 }
1938 } else {
1939 if (s > startpos)
1940 s--;
1941 while (s < end) {
1942 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1943 if (regtry(&reginfo, s))
6eb5f6b9
JH
1944 goto got_it;
1945 }
1946 }
1947 }
1948 }
1949 goto phooey;
1950 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1951 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1952 goto got_it;
1953 goto phooey;
1954 }
1955
1956 /* Messy cases: unanchored match. */
33b8afdf 1957 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1958 /* we have /x+whatever/ */
1959 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1960 char ch;
bf93d4cc
GS
1961#ifdef DEBUGGING
1962 int did_match = 0;
1963#endif
33b8afdf
JH
1964 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1966 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1967
1aa99e6b 1968 if (do_utf8) {
6eb5f6b9
JH
1969 while (s < strend) {
1970 if (*s == ch) {
a3621e74 1971 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1972 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1973 s += UTF8SKIP(s);
1974 while (s < strend && *s == ch)
1975 s += UTF8SKIP(s);
1976 }
1977 s += UTF8SKIP(s);
1978 }
1979 }
1980 else {
1981 while (s < strend) {
1982 if (*s == ch) {
a3621e74 1983 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1984 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1985 s++;
1986 while (s < strend && *s == ch)
1987 s++;
1988 }
1989 s++;
1990 }
1991 }
a3621e74 1992 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1993 PerlIO_printf(Perl_debug_log,
b7953727
JH
1994 "Did not find anchored character...\n")
1995 );
6eb5f6b9 1996 }
a0714e2c
SS
1997 else if (prog->anchored_substr != NULL
1998 || prog->anchored_utf8 != NULL
1999 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2000 && prog->float_max_offset < strend - s)) {
2001 SV *must;
2002 I32 back_max;
2003 I32 back_min;
2004 char *last;
6eb5f6b9 2005 char *last1; /* Last position checked before */
bf93d4cc
GS
2006#ifdef DEBUGGING
2007 int did_match = 0;
2008#endif
33b8afdf
JH
2009 if (prog->anchored_substr || prog->anchored_utf8) {
2010 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2011 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2012 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2013 back_max = back_min = prog->anchored_offset;
2014 } else {
2015 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2016 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2017 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2018 back_max = prog->float_max_offset;
2019 back_min = prog->float_min_offset;
2020 }
2021 if (must == &PL_sv_undef)
2022 /* could not downgrade utf8 check substring, so must fail */
2023 goto phooey;
2024
2025 last = HOP3c(strend, /* Cannot start after this */
2026 -(I32)(CHR_SVLEN(must)
2027 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2028
2029 if (s > PL_bostr)
2030 last1 = HOPc(s, -1);
2031 else
2032 last1 = s - 1; /* bogus */
2033
a0288114 2034 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2035 check_substr==must. */
2036 scream_pos = -1;
2037 dontbother = end_shift;
2038 strend = HOPc(strend, -dontbother);
2039 while ( (s <= last) &&
9041c2e3 2040 ((flags & REXEC_SCREAM)
1aa99e6b 2041 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2042 end_shift, &scream_pos, 0))
1aa99e6b 2043 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2044 (unsigned char*)strend, must,
7fba1cd6 2045 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2046 /* we may be pointing at the wrong string */
2047 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2048 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2049 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2050 if (HOPc(s, -back_max) > last1) {
2051 last1 = HOPc(s, -back_min);
2052 s = HOPc(s, -back_max);
2053 }
2054 else {
52657f30 2055 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2056
2057 last1 = HOPc(s, -back_min);
52657f30 2058 s = t;
6eb5f6b9 2059 }
1aa99e6b 2060 if (do_utf8) {
6eb5f6b9 2061 while (s <= last1) {
3b0527fe 2062 if (regtry(&reginfo, s))
6eb5f6b9
JH
2063 goto got_it;
2064 s += UTF8SKIP(s);
2065 }
2066 }
2067 else {
2068 while (s <= last1) {
3b0527fe 2069 if (regtry(&reginfo, s))
6eb5f6b9
JH
2070 goto got_it;
2071 s++;
2072 }
2073 }
2074 }
a3621e74 2075 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2076 PerlIO_printf(Perl_debug_log,
a0288114 2077 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2078 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2079 ? "anchored" : "floating"),
2080 PL_colors[0],
2081 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2082 SvPVX_const(must),
b7953727
JH
2083 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2084 );
6eb5f6b9
JH
2085 goto phooey;
2086 }
155aba94 2087 else if ((c = prog->regstclass)) {
f14c76ed 2088 if (minlen) {
be8e71aa 2089 const OPCODE op = OP(prog->regstclass);
66e933ab 2090 /* don't bother with what can't match */
07be1b83 2091 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2092 strend = HOPc(strend, -(minlen - 1));
2093 }
a3621e74 2094 DEBUG_EXECUTE_r({
be8e71aa 2095 SV * const prop = sv_newmortal();
cfd0369c
NC
2096 const char *s0;
2097 const char *s1;
9e55ce06
JH
2098 int len0;
2099 int len1;
2100
32fc9b6a 2101 regprop(prog, prop, c);
9e55ce06 2102 s0 = UTF ?
3f7c398e 2103 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2104 UNI_DISPLAY_REGEX) :
cfd0369c 2105 SvPVX_const(prop);
9e55ce06
JH
2106 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2107 s1 = UTF ?
c728cb41 2108 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2109 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2110 PerlIO_printf(Perl_debug_log,
07be1b83 2111 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2112 len0, len0, s0,
07be1b83 2113 len1, len1, s1, (int)(strend - s));
ffc61ed2 2114 });
3b0527fe 2115 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2116 goto got_it;
07be1b83 2117 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2118 }
2119 else {
2120 dontbother = 0;
a0714e2c 2121 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2122 /* Trim the end. */
d6a28714 2123 char *last;
33b8afdf
JH
2124 SV* float_real;
2125
2126 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2127 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2128 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2129
2130 if (flags & REXEC_SCREAM) {
33b8afdf 2131 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2132 end_shift, &scream_pos, 1); /* last one */
2133 if (!last)
ffc61ed2 2134 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2135 /* we may be pointing at the wrong string */
2136 else if (RX_MATCH_COPIED(prog))
3f7c398e 2137 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2138 }
d6a28714
JH
2139 else {
2140 STRLEN len;
cfd0369c 2141 const char * const little = SvPV_const(float_real, len);
d6a28714 2142
33b8afdf 2143 if (SvTAIL(float_real)) {
d6a28714
JH
2144 if (memEQ(strend - len + 1, little, len - 1))
2145 last = strend - len + 1;
7fba1cd6 2146 else if (!multiline)
9041c2e3 2147 last = memEQ(strend - len, little, len)
bd61b366 2148 ? strend - len : NULL;
b8c5462f 2149 else
d6a28714
JH
2150 goto find_last;
2151 } else {
2152 find_last:
9041c2e3 2153 if (len)
d6a28714 2154 last = rninstr(s, strend, little, little + len);
b8c5462f 2155 else
a0288114 2156 last = strend; /* matching "$" */
b8c5462f 2157 }
b8c5462f 2158 }
bf93d4cc 2159 if (last == NULL) {
a3621e74 2160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2161 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2162 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2163 goto phooey; /* Should not happen! */
2164 }
d6a28714
JH
2165 dontbother = strend - last + prog->float_min_offset;
2166 }
2167 if (minlen && (dontbother < minlen))
2168 dontbother = minlen - 1;
2169 strend -= dontbother; /* this one's always in bytes! */
2170 /* We don't know much -- general case. */
1aa99e6b 2171 if (do_utf8) {
d6a28714 2172 for (;;) {
3b0527fe 2173 if (regtry(&reginfo, s))
d6a28714
JH
2174 goto got_it;
2175 if (s >= strend)
2176 break;
b8c5462f 2177 s += UTF8SKIP(s);
d6a28714
JH
2178 };
2179 }
2180 else {
2181 do {
3b0527fe 2182 if (regtry(&reginfo, s))
d6a28714
JH
2183 goto got_it;
2184 } while (s++ < strend);
2185 }
2186 }
2187
2188 /* Failure. */
2189 goto phooey;
2190
2191got_it:
2192 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2193
2194 if (PL_reg_eval_set) {
2195 /* Preserve the current value of $^R */
2196 if (oreplsv != GvSV(PL_replgv))
2197 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2198 restored, the value remains
2199 the same. */
4f639d21 2200 restore_pos(aTHX_ prog);
d6a28714
JH
2201 }
2202
2203 /* make sure $`, $&, $', and $digit will work later */
2204 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2205 RX_MATCH_COPY_FREE(prog);
d6a28714 2206 if (flags & REXEC_COPY_STR) {
be8e71aa 2207 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2208#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2209 if ((SvIsCOW(sv)
2210 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2211 if (DEBUG_C_TEST) {
2212 PerlIO_printf(Perl_debug_log,
2213 "Copy on write: regexp capture, type %d\n",
2214 (int) SvTYPE(sv));
2215 }
2216 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2217 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2218 assert (SvPOKp(prog->saved_copy));
2219 } else
2220#endif
2221 {
2222 RX_MATCH_COPIED_on(prog);
2223 s = savepvn(strbeg, i);
2224 prog->subbeg = s;
2225 }
d6a28714 2226 prog->sublen = i;
d6a28714
JH
2227 }
2228 else {
2229 prog->subbeg = strbeg;
2230 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2231 }
2232 }
9041c2e3 2233
d6a28714
JH
2234 return 1;
2235
2236phooey:
a3621e74 2237 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2238 PL_colors[4], PL_colors[5]));
d6a28714 2239 if (PL_reg_eval_set)
4f639d21 2240 restore_pos(aTHX_ prog);
d6a28714
JH
2241 return 0;
2242}
2243
2244/*
2245 - regtry - try match at specific point
2246 */
2247STATIC I32 /* 0 failure, 1 success */
3b0527fe 2248S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2249{
97aff369 2250 dVAR;
d6a28714
JH
2251 register I32 *sp;
2252 register I32 *ep;
2253 CHECKPOINT lastcp;
3b0527fe 2254 regexp *prog = reginfo->prog;
a3621e74 2255 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2256
02db2b7b
IZ
2257#ifdef DEBUGGING
2258 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2259#endif
d6a28714
JH
2260 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2261 MAGIC *mg;
2262
2263 PL_reg_eval_set = RS_init;
a3621e74 2264 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2265 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2266 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2267 ));
e8347627 2268 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2269 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2270 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2271 SAVETMPS;
2272 /* Apparently this is not needed, judging by wantarray. */
e8347627 2273 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2274 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2275
3b0527fe 2276 if (reginfo->sv) {
d6a28714 2277 /* Make $_ available to executed code. */
3b0527fe 2278 if (reginfo->sv != DEFSV) {
59f00321 2279 SAVE_DEFSV;
3b0527fe 2280 DEFSV = reginfo->sv;
b8c5462f 2281 }
d6a28714 2282
3b0527fe
DM
2283 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2284 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2285 /* prepare for quick setting of pos */
d300d9fa
NC
2286#ifdef PERL_OLD_COPY_ON_WRITE
2287 if (SvIsCOW(sv))
2288 sv_force_normal_flags(sv, 0);
2289#endif
3dab1dad 2290 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2291 &PL_vtbl_mglob, NULL, 0);
d6a28714 2292 mg->mg_len = -1;
b8c5462f 2293 }
d6a28714
JH
2294 PL_reg_magic = mg;
2295 PL_reg_oldpos = mg->mg_len;
4f639d21 2296 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2297 }
09687e5a 2298 if (!PL_reg_curpm) {
a02a5408 2299 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2300#ifdef USE_ITHREADS
2301 {
be8e71aa 2302 SV* const repointer = newSViv(0);
577e12cc 2303 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2304 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2305 av_push(PL_regex_padav,repointer);
2306 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2307 PL_regex_pad = AvARRAY(PL_regex_padav);
2308 }
2309#endif
2310 }
aaa362c4 2311 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2312 PL_reg_oldcurpm = PL_curpm;
2313 PL_curpm = PL_reg_curpm;
2314 if (RX_MATCH_COPIED(prog)) {
2315 /* Here is a serious problem: we cannot rewrite subbeg,
2316 since it may be needed if this match fails. Thus
2317 $` inside (?{}) could fail... */
2318 PL_reg_oldsaved = prog->subbeg;
2319 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2320#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2321 PL_nrs = prog->saved_copy;
2322#endif
d6a28714
JH
2323 RX_MATCH_COPIED_off(prog);
2324 }
2325 else
bd61b366 2326 PL_reg_oldsaved = NULL;
d6a28714
JH
2327 prog->subbeg = PL_bostr;
2328 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2329 }
973dddac 2330 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2331 PL_reginput = startpos;
2332 PL_regstartp = prog->startp;
2333 PL_regendp = prog->endp;
2334 PL_reglastparen = &prog->lastparen;
a01268b5 2335 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2336 prog->lastparen = 0;
03994de8 2337 prog->lastcloseparen = 0;
d6a28714 2338 PL_regsize = 0;
a3621e74 2339 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2340 if (PL_reg_start_tmpl <= prog->nparens) {
2341 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2342 if(PL_reg_start_tmp)
2343 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2344 else
a02a5408 2345 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2346 }
2347
2348 /* XXXX What this code is doing here?!!! There should be no need
2349 to do this again and again, PL_reglastparen should take care of
3dd2943c 2350 this! --ilya*/
dafc8851
JH
2351
2352 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2353 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2354 * PL_reglastparen), is not needed at all by the test suite
2355 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2356 * enough, for building DynaLoader, or otherwise this
2357 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2358 * will happen. Meanwhile, this code *is* needed for the
2359 * above-mentioned test suite tests to succeed. The common theme
2360 * on those tests seems to be returning null fields from matches.
2361 * --jhi */
dafc8851 2362#if 1
d6a28714
JH
2363 sp = prog->startp;
2364 ep = prog->endp;
2365 if (prog->nparens) {
097eb12c 2366 register I32 i;
eb160463 2367 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2368 *++sp = -1;
2369 *++ep = -1;
2370 }
2371 }
dafc8851 2372#endif
02db2b7b 2373 REGCP_SET(lastcp);
3b0527fe 2374 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2375 prog->endp[0] = PL_reginput - PL_bostr;
2376 return 1;
2377 }
02db2b7b 2378 REGCP_UNWIND(lastcp);
d6a28714
JH
2379 return 0;
2380}
2381
02db2b7b 2382
8ba1375e
MJD
2383#define sayYES goto yes
2384#define sayNO goto no
e0f9d4a8 2385#define sayNO_ANYOF goto no_anyof
8ba1375e 2386#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2387#define sayNO_FINAL goto no_final
2388#define sayNO_SILENT goto do_no
2389#define saySAME(x) if (x) goto yes; else goto no
2390
3ab3c9b4
HS
2391#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2392#define POSCACHE_SEEN 1 /* we know what we're caching */
2393#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2394
3ab3c9b4 2395#define CACHEsayYES STMT_START { \
d8319b27 2396 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2397 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2398 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2399 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2400 } \
2401 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2403 } \
2404 else { \
3ab3c9b4
HS
2405 /* cache records failure, but this is success */ \
2406 DEBUG_r( \
2407 PerlIO_printf(Perl_debug_log, \
2408 "%*s (remove success from failure cache)\n", \
2409 REPORT_CODE_OFF+PL_regindent*2, "") \
2410 ); \
d8319b27 2411 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2412 } \
2413 } \
2414 sayYES; \
2415} STMT_END
7409bbd3 2416
3ab3c9b4 2417#define CACHEsayNO STMT_START { \
d8319b27 2418 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2419 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2420 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2421 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2422 } \
2423 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2425 } \
2426 else { \
3ab3c9b4
HS
2427 /* cache records success, but this is failure */ \
2428 DEBUG_r( \
2429 PerlIO_printf(Perl_debug_log, \
2430 "%*s (remove failure from success cache)\n", \
2431 REPORT_CODE_OFF+PL_regindent*2, "") \
2432 ); \
d8319b27 2433 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2434 } \
2435 } \
2436 sayNO; \
2437} STMT_END
2438
a3621e74
YO
2439/* this is used to determine how far from the left messages like
2440 'failed...' are printed. Currently 29 makes these messages line
2441 up with the opcode they refer to. Earlier perls used 25 which
2442 left these messages outdented making reviewing a debug output
2443 quite difficult.
2444*/
2445#define REPORT_CODE_OFF 29
2446
2447
2448/* Make sure there is a test for this +1 options in re_tests */
2449#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2450
40a82448
DM
2451#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2452#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2453
86545054
DM
2454#define SLAB_FIRST(s) (&(s)->states[0])
2455#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2456
5d9a96ca
DM
2457/* grab a new slab and return the first slot in it */
2458
2459STATIC regmatch_state *
2460S_push_slab(pTHX)
2461{
54df2634
NC
2462#if PERL_VERSION < 9
2463 dMY_CXT;
2464#endif
5d9a96ca
DM
2465 regmatch_slab *s = PL_regmatch_slab->next;
2466 if (!s) {
2467 Newx(s, 1, regmatch_slab);
2468 s->prev = PL_regmatch_slab;
2469 s->next = NULL;
2470 PL_regmatch_slab->next = s;
2471 }
2472 PL_regmatch_slab = s;
86545054 2473 return SLAB_FIRST(s);
5d9a96ca 2474}
5b47454d 2475
95b24440
DM
2476/* simulate a recursive call to regmatch */
2477
2478#define REGMATCH(ns, where) \
5d9a96ca
DM
2479 st->scan = scan; \
2480 scan = (ns); \
2481 st->resume_state = resume_##where; \
95b24440
DM
2482 goto start_recurse; \
2483 resume_point_##where:
2484
40a82448
DM
2485/* push a new state then goto it */
2486
2487#define PUSH_STATE_GOTO(state, node) \
2488 scan = node; \
2489 st->resume_state = state; \
2490 goto push_state;
2491
2492/* push a new state with success backtracking, then goto it */
2493
2494#define PUSH_YES_STATE_GOTO(state, node) \
2495 scan = node; \
2496 st->resume_state = state; \
2497 goto push_yes_state;
2498
aa283a38 2499
aa283a38 2500
d6a28714
JH
2501/*
2502 - regmatch - main matching routine
2503 *
2504 * Conceptually the strategy is simple: check to see whether the current
2505 * node matches, call self recursively to see whether the rest matches,
2506 * and then act accordingly. In practice we make some effort to avoid
2507 * recursion, in particular by going through "ordinary" nodes (that don't
2508 * need to know whether the rest of the match failed) by a loop instead of
2509 * by recursion.
2510 */
2511/* [lwall] I've hoisted the register declarations to the outer block in order to
2512 * maybe save a little bit of pushing and popping on the stack. It also takes
2513 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2514 *
2515 * This function used to be heavily recursive, but since this had the
2516 * effect of blowing the CPU stack on complex regexes, it has been
2517 * restructured to be iterative, and to save state onto the heap rather
2518 * than the stack. Essentially whereever regmatch() used to be called, it
2519 * pushes the current state, notes where to return, then jumps back into
2520 * the main loop.
2521 *
2522 * Originally the structure of this function used to look something like
2523
2524 S_regmatch() {
2525 int a = 1, b = 2;
2526 ...
2527 while (scan != NULL) {
5d9a96ca 2528 a++; // do stuff with a and b
95b24440
DM
2529 ...
2530 switch (OP(scan)) {
2531 case FOO: {
2532 int local = 3;
2533 ...
2534 if (regmatch(...)) // recurse
2535 goto yes;
2536 }
2537 ...
2538 }
2539 }
2540 yes:
2541 return 1;
2542 }
2543
2544 * Now it looks something like this:
2545
5d9a96ca 2546 typedef struct {
95b24440
DM
2547 int a, b, local;
2548 int resume_state;
5d9a96ca 2549 } regmatch_state;
95b24440
DM
2550
2551 S_regmatch() {
5d9a96ca
DM
2552 regmatch_state *st = new();
2553 int depth=0;
2554 st->a++; // do stuff with a and b
95b24440
DM
2555 ...
2556 while (scan != NULL) {
2557 ...
2558 switch (OP(scan)) {
2559 case FOO: {
5d9a96ca 2560 st->local = 3;
95b24440 2561 ...
5d9a96ca
DM
2562 st->scan = scan;
2563 scan = ...;
2564 st->resume_state = resume_FOO;
2565 goto start_recurse; // recurse
95b24440 2566
5d9a96ca
DM
2567 resume_point_FOO:
2568 if (result)
95b24440
DM
2569 goto yes;
2570 }
2571 ...
2572 }
5d9a96ca
DM
2573 start_recurse:
2574 st = new(); push a new state
2575 st->a = 1; st->b = 2;
2576 depth++;
95b24440 2577 }
5d9a96ca 2578 yes:
95b24440 2579 result = 1;
5d9a96ca
DM
2580 if (depth--) {
2581 st = pop();
95b24440
DM
2582 switch (resume_state) {
2583 case resume_FOO:
2584 goto resume_point_FOO;
2585 ...
2586 }
2587 }
2588 return result
2589 }
2590
2591 * WARNING: this means that any line in this function that contains a
2592 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2593 * regmatch() using gotos instead. Thus the values of any local variables
2594 * not saved in the regmatch_state structure will have been lost when
2595 * execution resumes on the next line .
5d9a96ca
DM
2596 *
2597 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2598 * PL_regmatch_state always points to the currently active state, and
2599 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2600 * The first time regmatch is called, the first slab is allocated, and is
2601 * never freed until interpreter desctruction. When the slab is full,
2602 * a new one is allocated chained to the end. At exit from regmatch, slabs
2603 * allocated since entry are freed.
d6a28714 2604 */
95b24440 2605
40a82448
DM
2606/* *** every FOO_fail should = FOO+1 */
2607#define resume_TRIE1 (REGNODE_MAX+1)
2608#define resume_TRIE2 (REGNODE_MAX+2)
2609#define EVAL_A (REGNODE_MAX+3)
2610#define EVAL_A_fail (REGNODE_MAX+4)
2611#define resume_CURLYX (REGNODE_MAX+5)
2612#define resume_WHILEM1 (REGNODE_MAX+6)
2613#define resume_WHILEM2 (REGNODE_MAX+7)
2614#define resume_WHILEM3 (REGNODE_MAX+8)
2615#define resume_WHILEM4 (REGNODE_MAX+9)
2616#define resume_WHILEM5 (REGNODE_MAX+10)
2617#define resume_WHILEM6 (REGNODE_MAX+11)
2618#define BRANCH_next (REGNODE_MAX+12)
2619#define BRANCH_next_fail (REGNODE_MAX+13)
2620#define CURLYM_A (REGNODE_MAX+14)
2621#define CURLYM_A_fail (REGNODE_MAX+15)
2622#define CURLYM_B (REGNODE_MAX+16)
2623#define CURLYM_B_fail (REGNODE_MAX+17)
2624#define IFMATCH_A (REGNODE_MAX+18)
2625#define IFMATCH_A_fail (REGNODE_MAX+19)
2626#define resume_PLUS1 (REGNODE_MAX+20)
2627#define resume_PLUS2 (REGNODE_MAX+21)
2628#define resume_PLUS3 (REGNODE_MAX+22)
2629#define resume_PLUS4 (REGNODE_MAX+23)
2630
2631
2632
3dab1dad 2633#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2634
07be1b83
YO
2635#ifdef DEBUGGING
2636STATIC void
2637S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2638{
2639 const int docolor = *PL_colors[0];
2640 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2641 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2642 /* The part of the string before starttry has one color
2643 (pref0_len chars), between starttry and current
2644 position another one (pref_len - pref0_len chars),
2645 after the current position the third one.
2646 We assume that pref0_len <= pref_len, otherwise we
2647 decrease pref0_len. */
2648 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2649 ? (5 + taill) - l : locinput - PL_bostr;
2650 int pref0_len;
2651
2652 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2653 pref_len++;
2654 pref0_len = pref_len - (locinput - PL_reg_starttry);
2655 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2656 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2657 ? (5 + taill) - pref_len : PL_regeol - locinput);
2658 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2659 l--;
2660 if (pref0_len < 0)
2661 pref0_len = 0;
2662 if (pref0_len > pref_len)
2663 pref0_len = pref_len;
2664 {
2665 const char * const s0 =
2666 do_utf8 && OP(scan) != CANY ?
2667 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2668 pref0_len, 60, UNI_DISPLAY_REGEX) :
2669 locinput - pref_len;
2670 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2671 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2672 pv_uni_display(PERL_DEBUG_PAD(1),
2673 (U8*)(locinput - pref_len + pref0_len),
2674 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2675 locinput - pref_len + pref0_len;
2676 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2677 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2678 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2679 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2680 locinput;
2681 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2682 PerlIO_printf(Perl_debug_log,
2683 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2684 (IV)(locinput - PL_bostr),
2685 PL_colors[4],
2686 len0, s0,
2687 PL_colors[5],
2688 PL_colors[2],
2689 len1, s1,
2690 PL_colors[3],
2691 (docolor ? "" : "> <"),
2692 PL_colors[0],
2693 len2, s2,
2694 PL_colors[1],
2695 15 - l - pref_len + 1,
2696 "");
2697 }
2698}
2699#endif
2700
d6a28714 2701STATIC I32 /* 0 failure, 1 success */
3b0527fe 2702S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2703{
54df2634
NC
2704#if PERL_VERSION < 9
2705 dMY_CXT;
2706#endif
27da23d5 2707 dVAR;
95b24440 2708 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2709 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2710
3b0527fe
DM
2711 regexp *rex = reginfo->prog;
2712
5d9a96ca
DM
2713 regmatch_slab *orig_slab;
2714 regmatch_state *orig_state;
a3621e74 2715
5d9a96ca
DM
2716 /* the current state. This is a cached copy of PL_regmatch_state */
2717 register regmatch_state *st;
95b24440 2718
5d9a96ca
DM
2719 /* cache heavy used fields of st in registers */
2720 register regnode *scan;
2721 register regnode *next;
2722 register I32 n = 0; /* initialize to shut up compiler warning */
2723 register char *locinput = PL_reginput;
95b24440 2724
5d9a96ca
DM
2725 /* these variables are NOT saved during a recusive RFEGMATCH: */
2726 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2727 bool result; /* return value of S_regmatch */
5d9a96ca 2728 int depth = 0; /* depth of recursion */
77cb431f
DM
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2730 subpattern */
40a82448 2731 U32 state_num;
95b24440
DM
2732
2733#ifdef DEBUGGING
e68ec53f 2734 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2735 PL_regindent++;
2736#endif
2737
5d9a96ca
DM
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
86545054 2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2744 }
2745
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2750
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
86545054 2753 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2755
2756 st->minmod = 0;
2757 st->sw = 0;
2758 st->logical = 0;
5d9a96ca 2759 st->cc = NULL;
d6a28714
JH
2760 /* Note that nextchr is a byte even in UTF */
2761 nextchr = UCHARAT(locinput);
2762 scan = prog;
2763 while (scan != NULL) {
8ba1375e 2764
a3621e74 2765 DEBUG_EXECUTE_r( {
6136c704 2766 SV * const prop = sv_newmortal();
07be1b83 2767 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2768 regprop(rex, prop, scan);
07be1b83
YO
2769
2770 PerlIO_printf(Perl_debug_log,
2771 "%3"IVdf":%*s%s(%"IVdf")\n",
2772 (IV)(scan - rex->program), PL_regindent*2, "",
2773 SvPVX_const(prop),
2774 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2775 });
d6a28714
JH
2776
2777 next = scan + NEXT_OFF(scan);
2778 if (next == scan)
2779 next = NULL;
40a82448 2780 state_num = OP(scan);
d6a28714 2781
40a82448
DM
2782 reenter_switch:
2783 switch (state_num) {
d6a28714 2784 case BOL:
7fba1cd6 2785 if (locinput == PL_bostr)
d6a28714 2786 {
3b0527fe 2787 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2788 break;
2789 }
d6a28714
JH
2790 sayNO;
2791 case MBOL:
12d33761
HS
2792 if (locinput == PL_bostr ||
2793 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2794 {
b8c5462f
JH
2795 break;
2796 }
d6a28714
JH
2797 sayNO;
2798 case SBOL:
c2a73568 2799 if (locinput == PL_bostr)
b8c5462f 2800 break;
d6a28714
JH
2801 sayNO;
2802 case GPOS:
3b0527fe 2803 if (locinput == reginfo->ganch)
d6a28714
JH
2804 break;
2805 sayNO;
2806 case EOL:
d6a28714
JH
2807 goto seol;
2808 case MEOL:
d6a28714 2809 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2810 sayNO;
b8c5462f 2811 break;
d6a28714
JH
2812 case SEOL:
2813 seol:
2814 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2815 sayNO;
d6a28714 2816 if (PL_regeol - locinput > 1)
b8c5462f 2817 sayNO;
b8c5462f 2818 break;
d6a28714
JH
2819 case EOS:
2820 if (PL_regeol != locinput)
b8c5462f 2821 sayNO;
d6a28714 2822 break;
ffc61ed2 2823 case SANY:
d6a28714 2824 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2825 sayNO;
f33976b4
DB
2826 if (do_utf8) {
2827 locinput += PL_utf8skip[nextchr];
2828 if (locinput > PL_regeol)
2829 sayNO;
2830 nextchr = UCHARAT(locinput);
2831 }
2832 else
2833 nextchr = UCHARAT(++locinput);
2834 break;
2835 case CANY:
2836 if (!nextchr && locinput >= PL_regeol)
2837 sayNO;
b8c5462f 2838 nextchr = UCHARAT(++locinput);
a0d0e21e 2839 break;
ffc61ed2 2840 case REG_ANY:
1aa99e6b
IH
2841 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2842 sayNO;
2843 if (do_utf8) {
b8c5462f 2844 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2845 if (locinput > PL_regeol)
2846 sayNO;
a0ed51b3 2847 nextchr = UCHARAT(locinput);
a0ed51b3 2848 }
1aa99e6b
IH
2849 else
2850 nextchr = UCHARAT(++locinput);
a0ed51b3 2851 break;
5b47454d 2852 case TRIE:
3dab1dad 2853 {
07be1b83 2854 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2855 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2856 trie_type = do_utf8 ?
2857 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2858 : trie_plain;
2859
2860 /* what trie are we using right now */
be8e71aa 2861 reg_trie_data * const trie
3dab1dad
YO
2862 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2863 U32 state = trie->startstate;
2864
2865 if (trie->bitmap && trie_type != trie_utf8_fold &&
2866 !TRIE_BITMAP_TEST(trie,*locinput)
2867 ) {
2868 if (trie->states[ state ].wordnum) {
2869 DEBUG_EXECUTE_r(
2870 PerlIO_printf(Perl_debug_log,
2871 "%*s %smatched empty string...%s\n",
2872 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2873 );
2874 break;
2875 } else {
2876 DEBUG_EXECUTE_r(
2877 PerlIO_printf(Perl_debug_log,
2878 "%*s %sfailed to match start class...%s\n",
2879 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2880 );
2881 sayNO_SILENT;
2882 }
2883 }
a3621e74 2884 {
07be1b83
YO
2885 /*
2886 traverse the TRIE keeping track of all accepting states
2887 we transition through until we get to a failing node.
2888 */
2889
a3621e74 2890 U8 *uc = ( U8* )locinput;
a3621e74
YO
2891 U16 charid = 0;
2892 U32 base = 0;
2893 UV uvc = 0;
2894 STRLEN len = 0;
2895 STRLEN foldlen = 0;
a3621e74
YO
2896 U8 *uscan = (U8*)NULL;
2897 STRLEN bufflen=0;
95b24440 2898 SV *sv_accept_buff = NULL;
5b47454d 2899
d8319b27 2900 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2901 result = 0;
a3621e74
YO
2902
2903 while ( state && uc <= (U8*)PL_regeol ) {
2904
5b47454d 2905 if (trie->states[ state ].wordnum) {
d8319b27 2906 if (!st->u.trie.accepted ) {
5b47454d
DM
2907 ENTER;
2908 SAVETMPS;
2909 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2910 sv_accept_buff=newSV(bufflen *
2911 sizeof(reg_trie_accepted) - 1);
2912 SvCUR_set(sv_accept_buff,
2913 sizeof(reg_trie_accepted));
2914 SvPOK_on(sv_accept_buff);
2915 sv_2mortal(sv_accept_buff);
d8319b27 2916 st->u.trie.accept_buff =
5b47454d
DM
2917 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2918 }
2919 else {
d8319b27 2920 if (st->u.trie.accepted >= bufflen) {
5b47454d 2921 bufflen *= 2;
d8319b27 2922 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2923 SvGROW(sv_accept_buff,
2924 bufflen * sizeof(reg_trie_accepted));
2925 }
2926 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2927 + sizeof(reg_trie_accepted));
2928 }
d8319b27
DM
2929 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2930 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2931 ++st->u.trie.accepted;
5b47454d 2932 }
a3621e74
YO
2933
2934 base = trie->states[ state ].trans.base;
2935
07be1b83
YO
2936 DEBUG_TRIE_EXECUTE_r({
2937 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2938 PerlIO_printf( Perl_debug_log,
e4584336 2939 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2940 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2941 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2942 });
a3621e74
YO
2943
2944 if ( base ) {
5b47454d 2945 switch (trie_type) {
3dab1dad 2946 case trie_utf8_fold:
a3621e74
YO
2947 if ( foldlen>0 ) {
2948 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2949 foldlen -= len;
2950 uscan += len;
2951 len=0;
2952 } else {
1df70142 2953 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2954 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2955 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2956 foldlen -= UNISKIP( uvc );
2957 uscan = foldbuf + UNISKIP( uvc );
2958 }
5b47454d
DM
2959 break;
2960 case trie_utf8:
2961 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2962 &len, uniflags );
2963 break;
2964 case trie_plain:
e4584336 2965 uvc = (UV)*uc;
a3621e74
YO
2966 len = 1;
2967 }
2968
5b47454d
DM
2969 if (uvc < 256) {
2970 charid = trie->charmap[ uvc ];
2971 }
2972 else {
2973 charid = 0;
2974 if (trie->widecharmap) {
3dab1dad 2975 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2976 (char*)&uvc, sizeof(UV), 0);
2977 if (svpp)
2978 charid = (U16)SvIV(*svpp);
2979 }
2980 }
a3621e74 2981
5b47454d
DM
2982 if (charid &&
2983 (base + charid > trie->uniquecharcount )
2984 && (base + charid - 1 - trie->uniquecharcount
2985 < trie->lasttrans)
2986 && trie->trans[base + charid - 1 -
2987 trie->uniquecharcount].check == state)
2988 {
2989 state = trie->trans[base + charid - 1 -
2990 trie->uniquecharcount ].next;
2991 }
2992 else {
2993 state = 0;
2994 }
2995 uc += len;
2996
2997 }
2998 else {
a3621e74
YO
2999 state = 0;
3000 }
3001 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3002 PerlIO_printf( Perl_debug_log,
3003 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3004 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3005 );
3006 }
d8319b27 3007 if (!st->u.trie.accepted )
a3621e74 3008 sayNO;
a3621e74
YO
3009
3010 /*
3011 There was at least one accepting state that we
3012 transitioned through. Presumably the number of accepting
3013 states is going to be low, typically one or two. So we
3014 simply scan through to find the one with lowest wordnum.
3015 Once we find it, we swap the last state into its place
3016 and decrement the size. We then try to match the rest of
3017 the pattern at the point where the word ends, if we
3018 succeed then we end the loop, otherwise the loop
3019 eventually terminates once all of the accepting states
3020 have been tried.
3021 */
a3621e74 3022
d8319b27 3023 if ( st->u.trie.accepted == 1 ) {
f2278c82 3024 DEBUG_EXECUTE_r({
be8e71aa 3025 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3026 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3027 : NULL;
3028 PerlIO_printf( Perl_debug_log,
3029 "%*s %sonly one match : #%d <%s>%s\n",
3030 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3031 st->u.trie.accept_buff[ 0 ].wordnum,
3032 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3033 PL_colors[5] );
3034 });
d8319b27 3035 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3036 /* in this case we free tmps/leave before we call regmatch
3037 as we wont be using accept_buff again. */
3038 FREETMPS;
3039 LEAVE;
07be1b83 3040 /* do we need this? why dont we just do a break? */
95b24440
DM
3041 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3042 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3043 } else {
3044 DEBUG_EXECUTE_r(
e4584336 3045 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3046 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3047 PL_colors[5] );
3048 );
d8319b27 3049 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3050 U32 best = 0;
3051 U32 cur;
d8319b27 3052 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3053 DEBUG_TRIE_EXECUTE_r(
3054 PerlIO_printf( Perl_debug_log,
3055 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3056 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3057 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3058 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3059 );
a3621e74 3060
d8319b27
DM
3061 if (st->u.trie.accept_buff[cur].wordnum <
3062 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3063 best = cur;
a3621e74 3064 }
f2278c82
YO
3065 DEBUG_EXECUTE_r({
3066 reg_trie_data * const trie = (reg_trie_data*)
3067 rex->data->data[ARG(scan)];
be8e71aa 3068 SV ** const tmp = RX_DEBUG(reginfo->prog)
f2278c82
YO
3069 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3070 : NULL;
3071 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3072 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3073 st->u.trie.accept_buff[best].wordnum,
3074 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3075 PL_colors[5] );
3076 });
d8319b27
DM
3077 if ( best<st->u.trie.accepted ) {
3078 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3079 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3080 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3081 best = st->u.trie.accepted;
a3621e74 3082 }
d8319b27 3083 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3084
3085 /*
3086 as far as I can tell we only need the SAVETMPS/FREETMPS
3087 for re's with EVAL in them but I'm leaving them in for
3088 all until I can be sure.
3089 */
3090 SAVETMPS;
95b24440
DM
3091 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3092 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3093 FREETMPS;
3094 }
3095 FREETMPS;
3096 LEAVE;
3097 }
3098
95b24440 3099 if (result) {
a3621e74
YO
3100 sayYES;
3101 } else {
3102 sayNO;
3103 }
3dab1dad 3104 }}
a3621e74 3105 /* unreached codepoint */
95b24440
DM
3106 case EXACT: {
3107 char *s = STRING(scan);
5d9a96ca 3108 st->ln = STR_LEN(scan);
eb160463 3109 if (do_utf8 != UTF) {
bc517b45 3110 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3111 char *l = locinput;
be8e71aa 3112 const char * const e = s + st->ln;
a72c7584 3113
5ff6fc6d
JH
3114 if (do_utf8) {
3115 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3116 while (s < e) {
a3b680e6 3117 STRLEN ulen;
1aa99e6b 3118 if (l >= PL_regeol)
5ff6fc6d
JH
3119 sayNO;
3120 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3121 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3122 uniflags))
5ff6fc6d 3123 sayNO;
bc517b45 3124 l += ulen;
5ff6fc6d 3125 s ++;
1aa99e6b 3126 }
5ff6fc6d
JH
3127 }
3128 else {
3129 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3130 while (s < e) {
a3b680e6 3131 STRLEN ulen;
1aa99e6b
IH
3132 if (l >= PL_regeol)
3133 sayNO;
5ff6fc6d 3134 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3135 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3136 uniflags))
1aa99e6b 3137 sayNO;
bc517b45 3138 s += ulen;
a72c7584 3139 l ++;
1aa99e6b 3140 }
5ff6fc6d 3141 }
1aa99e6b
IH
3142 locinput = l;
3143 nextchr = UCHARAT(locinput);
3144 break;
3145 }
bc517b45 3146 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3147 /* Inline the first character, for speed. */
3148 if (UCHARAT(s) != nextchr)
3149 sayNO;
5d9a96ca 3150 if (PL_regeol - locinput < st->ln)
d6a28714 3151 sayNO;
5d9a96ca 3152 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3153 sayNO;
5d9a96ca 3154 locinput += st->ln;
d6a28714
JH
3155 nextchr = UCHARAT(locinput);
3156 break;
95b24440 3157 }
d6a28714 3158 case EXACTFL:
b8c5462f
JH
3159 PL_reg_flags |= RF_tainted;
3160 /* FALL THROUGH */
95b24440 3161 case EXACTF: {
be8e71aa 3162 char * const s = STRING(scan);
5d9a96ca 3163 st->ln = STR_LEN(scan);
d6a28714 3164
d07ddd77
JH
3165 if (do_utf8 || UTF) {
3166 /* Either target or the pattern are utf8. */
be8e71aa 3167 const char * const l = locinput;
d07ddd77 3168 char *e = PL_regeol;
bc517b45 3169
5d9a96ca 3170 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3171 l, &e, 0, do_utf8)) {
5486206c
JH
3172 /* One more case for the sharp s:
3173 * pack("U0U*", 0xDF) =~ /ss/i,
3174 * the 0xC3 0x9F are the UTF-8
3175 * byte sequence for the U+00DF. */
3176 if (!(do_utf8 &&
3177 toLOWER(s[0]) == 's' &&
5d9a96ca 3178 st->ln >= 2 &&
5486206c
JH
3179 toLOWER(s[1]) == 's' &&
3180 (U8)l[0] == 0xC3 &&
3181 e - l >= 2 &&
3182 (U8)l[1] == 0x9F))
3183 sayNO;
3184 }
d07ddd77
JH
3185 locinput = e;
3186 nextchr = UCHARAT(locinput);
3187 break;
a0ed51b3 3188 }
d6a28714 3189
bc517b45
JH
3190 /* Neither the target and the pattern are utf8. */
3191
d6a28714
JH
3192 /* Inline the first character, for speed. */
3193 if (UCHARAT(s) != nextchr &&
3194 UCHARAT(s) != ((OP(scan) == EXACTF)
3195 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3196 sayNO;
5d9a96ca 3197 if (PL_regeol - locinput < st->ln)
b8c5462f 3198 sayNO;
5d9a96ca
DM
3199 if (st->ln > 1 && (OP(scan) == EXACTF
3200 ? ibcmp(s, locinput, st->ln)
3201 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3202 sayNO;
5d9a96ca 3203 locinput += st->ln;
d6a28714 3204 nextchr = UCHARAT(locinput);
a0d0e21e 3205 break;
95b24440 3206 }
d6a28714 3207 case ANYOF:
ffc61ed2 3208 if (do_utf8) {
9e55ce06
JH
3209 STRLEN inclasslen = PL_regeol - locinput;
3210
32fc9b6a 3211 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3212 sayNO_ANYOF;
ffc61ed2
JH
3213 if (locinput >= PL_regeol)
3214 sayNO;
0f0076b4 3215 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3216 nextchr = UCHARAT(locinput);
e0f9d4a8 3217 break;
ffc61ed2
JH
3218 }
3219 else {
3220 if (nextchr < 0)
3221 nextchr = UCHARAT(locinput);
32fc9b6a 3222 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3223 sayNO_ANYOF;
ffc61ed2
JH
3224 if (!nextchr && locinput >= PL_regeol)
3225 sayNO;
3226 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3227 break;
3228 }
3229 no_anyof:
3230 /* If we might have the case of the German sharp s
3231 * in a casefolding Unicode character class. */
3232
ebc501f0
JH
3233 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3234 locinput += SHARP_S_SKIP;
e0f9d4a8 3235 nextchr = UCHARAT(locinput);
ffc61ed2 3236 }
e0f9d4a8
JH
3237 else
3238 sayNO;
b8c5462f 3239 break;
d6a28714 3240 case ALNUML:
b8c5462f
JH
3241 PL_reg_flags |= RF_tainted;
3242 /* FALL THROUGH */
d6a28714 3243 case ALNUM:
b8c5462f 3244 if (!nextchr)
4633a7c4 3245 sayNO;
ffc61ed2 3246 if (do_utf8) {
1a4fad37 3247 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3248 if (!(OP(scan) == ALNUM
bb7a0f54 3249 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3250 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3251 {
3252 sayNO;
a0ed51b3 3253 }
b8c5462f 3254 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3255 nextchr = UCHARAT(locinput);
3256 break;
3257 }
ffc61ed2 3258 if (!(OP(scan) == ALNUM
d6a28714 3259 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3260 sayNO;
b8c5462f 3261 nextchr = UCHARAT(++locinput);
a0d0e21e 3262 break;
d6a28714 3263 case NALNUML:
b8c5462f
JH
3264 PL_reg_flags |= RF_tainted;
3265 /* FALL THROUGH */
d6a28714
JH
3266 case NALNUM:
3267 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3268 sayNO;
ffc61ed2 3269 if (do_utf8) {
1a4fad37 3270 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3271 if (OP(scan) == NALNUM
bb7a0f54 3272 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3273 : isALNUM_LC_utf8((U8*)locinput))
3274 {
b8c5462f 3275 sayNO;
d6a28714 3276 }
b8c5462f
JH
3277 locinput += PL_utf8skip[nextchr];
3278 nextchr = UCHARAT(locinput);
3279 break;
3280 }
ffc61ed2 3281 if (OP(scan) == NALNUM
d6a28714 3282 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3283 sayNO;
76e3520e 3284 nextchr = UCHARAT(++locinput);
a0d0e21e 3285 break;
d6a28714
JH
3286 case BOUNDL:
3287 case NBOUNDL:
3280af22 3288 PL_reg_flags |= RF_tainted;
bbce6d69 3289 /* FALL THROUGH */
d6a28714
JH
3290 case BOUND:
3291 case NBOUND:
3292 /* was last char in word? */
ffc61ed2 3293 if (do_utf8) {
12d33761 3294 if (locinput == PL_bostr)
5d9a96ca 3295 st->ln = '\n';
ffc61ed2 3296 else {
a3b680e6 3297 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3298
4ad0818d 3299 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3300 }
3301 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3302 st->ln = isALNUM_uni(st->ln);
1a4fad37 3303 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3304 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3305 }
3306 else {
5d9a96ca 3307 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3308 n = isALNUM_LC_utf8((U8*)locinput);
3309 }
a0ed51b3 3310 }
d6a28714 3311 else {
5d9a96ca 3312 st->ln = (locinput != PL_bostr) ?
12d33761 3313 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3314 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3315 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3316 n = isALNUM(nextchr);
3317 }
3318 else {
5d9a96ca 3319 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3320 n = isALNUM_LC(nextchr);
3321 }
d6a28714 3322 }
5d9a96ca 3323 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3324 OP(scan) == BOUNDL))
3325 sayNO;
a0ed51b3 3326 break;
d6a28714 3327 case SPACEL:
3280af22 3328 PL_reg_flags |= RF_tainted;
bbce6d69 3329 /* FALL THROUGH */
d6a28714 3330 case SPACE:
9442cb0e 3331 if (!nextchr)
4633a7c4 3332 sayNO;
1aa99e6b 3333 if (do_utf8) {
fd400ab9 3334 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3335 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3336 if (!(OP(scan) == SPACE
bb7a0f54 3337 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3338 : isSPACE_LC_utf8((U8*)locinput)))
3339 {
3340 sayNO;
3341 }
3342 locinput += PL_utf8skip[nextchr];
3343 nextchr = UCHARAT(locinput);
3344 break;
d6a28714 3345 }
ffc61ed2
JH
3346 if (!(OP(scan) == SPACE
3347 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3348 sayNO;
3349 nextchr = UCHARAT(++locinput);
3350 }
3351 else {
3352 if (!(OP(scan) == SPACE
3353 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3354 sayNO;
3355 nextchr = UCHARAT(++locinput);
a0ed51b3 3356 }
a0ed51b3 3357 break;
d6a28714 3358 case NSPACEL:
3280af22 3359 PL_reg_flags |= RF_tainted;
bbce6d69 3360 /* FALL THROUGH */
d6a28714 3361 case NSPACE:
9442cb0e 3362 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3363 sayNO;
1aa99e6b 3364 if (do_utf8) {
1a4fad37 3365 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3366 if (OP(scan) == NSPACE
bb7a0f54 3367 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3368 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3369 {
3370 sayNO;
3371 }
3372 locinput += PL_utf8skip[nextchr];
3373 nextchr = UCHARAT(locinput);
3374 break;
a0ed51b3 3375 }
ffc61ed2 3376 if (OP(scan) == NSPACE
d6a28714 3377 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3378 sayNO;
76e3520e 3379 nextchr = UCHARAT(++locinput);
a0d0e21e 3380 break;
d6a28714 3381 case DIGITL:
a0ed51b3
LW
3382 PL_reg_flags |= RF_tainted;
3383 /* FALL THROUGH */
d6a28714 3384 case DIGIT:
9442cb0e 3385 if (!nextchr)
a0ed51b3 3386 sayNO;
1aa99e6b 3387 if (do_utf8) {
1a4fad37 3388 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3389 if (!(OP(scan) == DIGIT
bb7a0f54 3390 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3391 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3392 {
a0ed51b3 3393 sayNO;
dfe13c55 3394 }
6f06b55f 3395 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3396 nextchr = UCHARAT(locinput);
3397 break;
3398 }
ffc61ed2 3399 if (!(OP(scan) == DIGIT
9442cb0e 3400 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3401 sayNO;
3402 nextchr = UCHARAT(++locinput);
3403 break;
d6a28714 3404 case NDIGITL:
b8c5462f
JH
3405 PL_reg_flags |= RF_tainted;
3406 /* FALL THROUGH */
d6a28714 3407 case NDIGIT:
9442cb0e 3408 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3409 sayNO;
1aa99e6b 3410 if (do_utf8) {
1a4fad37 3411 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3412 if (OP(scan) == NDIGIT
bb7a0f54 3413 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3414 : isDIGIT_LC_utf8((U8*)locinput))
3415 {
a0ed51b3 3416 sayNO;
9442cb0e 3417 }
6f06b55f 3418 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3419 nextchr = UCHARAT(locinput);
3420 break;
3421 }
ffc61ed2 3422 if (OP(scan) == NDIGIT
9442cb0e 3423 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3424 sayNO;
3425 nextchr = UCHARAT(++locinput);
3426 break;
3427 case CLUMP:
b7c83a7e 3428 if (locinput >= PL_regeol)
a0ed51b3 3429 sayNO;
b7c83a7e 3430 if (do_utf8) {
1a4fad37 3431 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3432 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3433 sayNO;
3434 locinput += PL_utf8skip[nextchr];
3435 while (locinput < PL_regeol &&
3436 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3437 locinput += UTF8SKIP(locinput);
3438 if (locinput > PL_regeol)
3439 sayNO;
eb08e2da
JH
3440 }
3441 else
3442 locinput++;
a0ed51b3
LW
3443 nextchr = UCHARAT(locinput);
3444 break;
c8756f30 3445 case REFFL:
3280af22 3446 PL_reg_flags |= RF_tainted;
c8756f30 3447 /* FALL THROUGH */
c277df42 3448 case REF:
95b24440
DM
3449 case REFF: {
3450 char *s;
c277df42 3451 n = ARG(scan); /* which paren pair */
5d9a96ca 3452 st->ln = PL_regstartp[n];
2c2d71f5 3453 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3454 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3455 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3456 if (st->ln == PL_regendp[n])
a0d0e21e 3457 break;
a0ed51b3 3458
5d9a96ca 3459 s = PL_bostr + st->ln;
1aa99e6b 3460 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3461 char *l = locinput;
a3b680e6 3462 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3463 /*
3464 * Note that we can't do the "other character" lookup trick as
3465 * in the 8-bit case (no pun intended) because in Unicode we
3466 * have to map both upper and title case to lower case.
3467 */
3468 if (OP(scan) == REFF) {
3469 while (s < e) {
a3b680e6
AL
3470 STRLEN ulen1, ulen2;
3471 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3472 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3473
a0ed51b3
LW
3474 if (l >= PL_regeol)
3475 sayNO;
a2a2844f
JH
3476 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3477 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3478 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3479 sayNO;
a2a2844f
JH
3480 s += ulen1;
3481 l += ulen2;
a0ed51b3
LW
3482 }
3483 }
3484 locinput = l;
3485 nextchr = UCHARAT(locinput);
3486 break;
3487 }
3488
a0d0e21e 3489 /* Inline the first character, for speed. */
76e3520e 3490 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3491 (OP(scan) == REF ||
3492 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3493 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3494 sayNO;
5d9a96ca
DM
3495 st->ln = PL_regendp[n] - st->ln;
3496 if (locinput + st->ln > PL_regeol)
4633a7c4 3497 sayNO;
5d9a96ca
DM
3498 if (st->ln > 1 && (OP(scan) == REF
3499 ? memNE(s, locinput, st->ln)
c8756f30 3500 : (OP(scan) == REFF
5d9a96ca
DM
3501 ? ibcmp(s, locinput, st->ln)
3502 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3503 sayNO;
5d9a96ca 3504 locinput += st->ln;
76e3520e 3505 nextchr = UCHARAT(locinput);
a0d0e21e 3506 break;
95b24440 3507 }
a0d0e21e
LW
3508
3509 case NOTHING:
c277df42 3510 case TAIL:
a0d0e21e
LW
3511 break;
3512 case BACK:
3513 break;
40a82448
DM
3514
3515#undef ST
3516#define ST st->u.eval
3517
3518 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3519 {
c277df42 3520 SV *ret;
8e5e9ebe 3521 {
4aabdb9b
DM
3522 /* execute the code in the {...} */
3523 dSP;
6136c704 3524 SV ** const before = SP;
4aabdb9b
DM
3525 OP_4tree * const oop = PL_op;
3526 COP * const ocurcop = PL_curcop;
3527 PAD *old_comppad;
4aabdb9b
DM
3528
3529 n = ARG(scan);
32fc9b6a 3530 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3531 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3532 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3533 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3534
8e5e9ebe
RGS
3535 CALLRUNOPS(aTHX); /* Scalar context. */
3536 SPAGAIN;
3537 if (SP == before)
075aa684 3538 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3539 else {
3540 ret = POPs;
3541 PUTBACK;
3542 }
4aabdb9b
DM
3543
3544 PL_op = oop;
3545 PAD_RESTORE_LOCAL(old_comppad);
3546 PL_curcop = ocurcop;
3547 if (!st->logical) {
3548 /* /(?{...})/ */
3549 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3550 break;
3551 }
8e5e9ebe 3552 }
4aabdb9b
DM
3553 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3554 regexp *re;
4aabdb9b 3555 {
4f639d21
DM
3556 /* extract RE object from returned value; compiling if
3557 * necessary */
3558
6136c704 3559 MAGIC *mg = NULL;
be8e71aa 3560 const SV *sv;
faf82a0b
AE
3561 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3562 mg = mg_find(sv, PERL_MAGIC_qr);
3563 else if (SvSMAGICAL(ret)) {
3564 if (SvGMAGICAL(ret))
3565 sv_unmagic(ret, PERL_MAGIC_qr);
3566 else
3567 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3568 }
faf82a0b 3569
0f5d15d6
IZ
3570 if (mg) {
3571 re = (regexp *)mg->mg_obj;
df0003d4 3572 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3573 }
3574 else {
3575 STRLEN len;
6136c704 3576 const char * const t = SvPV_const(ret, len);
0f5d15d6 3577 PMOP pm;
a3b680e6 3578 const I32 osize = PL_regsize;
0f5d15d6 3579
5fcd1c1b 3580 Zero(&pm, 1, PMOP);
4aabdb9b 3581 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3582 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3583 if (!(SvFLAGS(ret)
faf82a0b
AE
3584 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3585 | SVs_GMG)))
14befaf4
DM
3586 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3587 PERL_MAGIC_qr,0,0);
0f5d15d6 3588 PL_regsize = osize;
0f5d15d6 3589 }
4aabdb9b 3590 }
aa283a38
DM
3591
3592 /* run the pattern returned from (??{...}) */
3593
4aabdb9b
DM
3594 DEBUG_EXECUTE_r(
3595 PerlIO_printf(Perl_debug_log,
3596 "Entering embedded \"%s%.60s%s%s\"\n",
3597 PL_colors[0],
3598 re->precomp,
3599 PL_colors[1],
3600 (strlen(re->precomp) > 60 ? "..." : ""))
3601 );
2c2d71f5 3602
40a82448
DM
3603 ST.cp = regcppush(0); /* Save *all* the positions. */
3604 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3605 *PL_reglastparen = 0;
3606 *PL_reglastcloseparen = 0;
4aabdb9b 3607 PL_reginput = locinput;
4aabdb9b
DM
3608
3609 /* XXXX This is too dramatic a measure... */
3610 PL_reg_maxiter = 0;
3611
5d9a96ca 3612 st->logical = 0;
40a82448 3613 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3614 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3615 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3616 ST.prev_rex = rex;
aa283a38 3617 rex = re;
aa283a38 3618
40a82448 3619 ST.B = next;
aa283a38 3620 /* now continue from first node in postoned RE */
40a82448 3621 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3622 /* NOTREACHED */
a0ed51b3 3623 }
4aabdb9b
DM
3624 /* /(?(?{...})X|Y)/ */
3625 st->sw = SvTRUE(ret);
3626 st->logical = 0;
c277df42
IZ
3627 break;
3628 }
40a82448
DM
3629
3630 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3631 if (ST.toggleutf)
3632 PL_reg_flags ^= RF_utf8;
3633 ReREFCNT_dec(rex);
3634 rex = ST.prev_rex;
3635 /* XXXX This is too dramatic a measure... */
3636 PL_reg_maxiter = 0;
3637 /* Restore parens of the caller without popping the
3638 * savestack */
3639 {
3640 const I32 tmp = PL_savestack_ix;
3641 PL_savestack_ix = ST.lastcp;
3642 regcppop(rex);
3643 PL_savestack_ix = tmp;
3644 }
3645 PL_reginput = locinput;
3646 /* continue at the node following the (??{...}) */
3647 scan = ST.B;
3648 continue;
3649
3650 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3651 /* Restore state to the outer re then re-throw the failure */
3652 if (ST.toggleutf)
3653 PL_reg_flags ^= RF_utf8;
3654 ReREFCNT_dec(rex);
3655 rex = ST.prev_rex;
3656
3657 /* XXXX This is too dramatic a measure... */
3658 PL_reg_maxiter = 0;
3659
3660 PL_reginput = locinput;
3661 REGCP_UNWIND(ST.lastcp);
3662 regcppop(rex);
3663 sayNO_SILENT;
3664
3665#undef ST
3666
a0d0e21e 3667 case OPEN:
c277df42 3668 n = ARG(scan); /* which paren pair */
3280af22
NIS
3669 PL_reg_start_tmp[n] = locinput;
3670 if (n > PL_regsize)
3671 PL_regsize = n;
a0d0e21e
LW
3672 break;
3673 case CLOSE:
c277df42 3674 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3675 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3676 PL_regendp[n] = locinput - PL_bostr;
eb160463 3677 if (n > (I32)*PL_reglastparen)
3280af22 3678 *PL_reglastparen = n;
a01268b5 3679 *PL_reglastcloseparen = n;
a0d0e21e 3680 break;
c277df42
IZ
3681 case GROUPP:
3682 n = ARG(scan); /* which paren pair */
5d9a96ca 3683 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3684 break;
3685 case IFTHEN:
2c2d71f5 3686 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3687 if (st->sw)
c277df42
IZ
3688 next = NEXTOPER(NEXTOPER(scan));
3689 else {
3690 next = scan + ARG(scan);
3691 if (OP(next) == IFTHEN) /* Fake one. */
3692 next = NEXTOPER(NEXTOPER(next));
3693 }
3694 break;
3695 case LOGICAL:
5d9a96ca 3696 st->logical = scan->flags;
c277df42 3697 break;
2ab05381 3698/*******************************************************************
a0374537
DM
3699 cc points to the regmatch_state associated with the most recent CURLYX.
3700 This struct contains info about the innermost (...)* loop (an
3701 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3702
3703 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3704
95b24440 3705 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3706
a0374537 3707 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3708 with the starting point at WHILEM node;
2ab05381
IZ
3709
3710 3) Each hit of WHILEM node tries to match A and Z (in the order
3711 depending on the current iteration, min/max of {min,max} and
3712 greediness). The information about where are nodes for "A"
a0374537 3713 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3714 was already matched, and greediness.
3715
3716 4) After A matches, the same WHILEM node is hit again.
3717
95b24440 3718 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3719 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3720 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3721 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3722 of the external loop.
3723
a0374537 3724 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3725 and whatever it mentions via ->next, and additional attached trees
3726 corresponding to temporarily unset infoblocks as in "5" above.
3727
95b24440 3728 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3729 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3730 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3731 infoblocks are drawn below the "reset" infoblock.
3732
3733 In fact in the picture below we do not show failed matches for Z and T
3734 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3735 more obvious *why* one needs to *temporary* unset infoblocks.]
3736
3737 Matched REx position InfoBlocks Comment
3738 (Y(A)*?Z)*?T x
3739 Y(A)*?Z)*?T x <- O
3740 Y (A)*?Z)*?T x <- O
3741 Y A)*?Z)*?T x <- O <- I
3742 YA )*?Z)*?T x <- O <- I
3743 YA A)*?Z)*?T x <- O <- I
3744 YAA )*?Z)*?T x <- O <- I
3745 YAA Z)*?T x <- O # Temporary unset I
3746 I
3747
3748 YAAZ Y(A)*?Z)*?T x <- O
3749 I
3750
3751 YAAZY (A)*?Z)*?T x <- O
3752 I
3753
3754 YAAZY A)*?Z)*?T x <- O <- I
3755 I
3756
3757 YAAZYA )*?Z)*?T x <- O <- I
3758 I
3759
3760 YAAZYA Z)*?T x <- O # Temporary unset I
3761 I,I
3762
3763 YAAZYAZ )*?T x <- O
3764 I,I
3765
3766 YAAZYAZ T x # Temporary unset O
3767 O
3768 I,I
3769
3770 YAAZYAZT x
3771 O
3772 I,I
3773 *******************************************************************/
95b24440 3774
a0d0e21e 3775 case CURLYX: {
cb434fcc
IZ
3776 /* No need to save/restore up to this paren */
3777 I32 parenfloor = scan->flags;
c277df42 3778
c2b7afd3
NC
3779 /* Dave says:
3780
3781 CURLYX and WHILEM are always paired: they're the moral
3782 equivalent of pp_enteriter anbd pp_iter.
3783
3784 The only time next could be null is if the node tree is
3785 corrupt. This was mentioned on p5p a few days ago.
3786
3787 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3788 So we'll assert that this is true:
3789 */
3790 assert(next);
30b2893d 3791 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3792 next += ARG(next);
cb434fcc
IZ
3793 /* XXXX Probably it is better to teach regpush to support
3794 parenfloor > PL_regsize... */
eb160463 3795 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3796 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3797
d8319b27
DM
3798 st->u.curlyx.cp = PL_savestack_ix;
3799 st->u.curlyx.outercc = st->cc;
a0374537
DM
3800 st->cc = st;
3801 /* these fields contain the state of the current curly.
3802 * they are accessed by subsequent WHILEMs;
3803 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3804 st->u.curlyx.parenfloor = parenfloor;
3805 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3806 st->u.curlyx.min = ARG1(scan);
3807 st->u.curlyx.max = ARG2(scan);
3808 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3809 st->u.curlyx.lastloc = 0;
a0374537
DM
3810 /* st->next and st->minmod are also read by WHILEM */
3811
3280af22 3812 PL_reginput = locinput;
95b24440
DM
3813 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3814 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3815 regcpblow(st->u.curlyx.cp);
3816 st->cc = st->u.curlyx.outercc;
95b24440 3817 saySAME(result);
a0d0e21e 3818 }
5f66b61c 3819 /* NOTREACHED */
a0d0e21e
LW
3820 case WHILEM: {
3821 /*
3822 * This is really hard to understand, because after we match
3823 * what we're trying to match, we must make sure the rest of
2c2d71f5 3824 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3825 * to go back UP the parse tree by recursing ever deeper. And
3826 * if it fails, we have to reset our parent's current state
3827 * that we can try again after backing off.
3828 */
3829
c2b7afd3
NC
3830 /* Dave says:
3831
3832 st->cc gets initialised by CURLYX ready for use by WHILEM.
3833 So again, unless somethings been corrupted, st->cc cannot
3834 be null at that point in WHILEM.
3835
3836 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3837 So we'll assert that this is true:
3838 */
3839 assert(st->cc);
d8319b27
DM
3840 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3841 st->u.whilem.cache_offset = 0;
3842 st->u.whilem.cache_bit = 0;
c277df42 3843
d8319b27 3844 n = st->cc->u.curlyx.cur + 1; /* how many we know we matched */
3280af22 3845 PL_reginput = locinput;
a0d0e21e 3846
a3621e74 3847 DEBUG_EXECUTE_r(
9041c2e3 3848 PerlIO_printf(Perl_debug_log,
91f3b821 3849 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3850 REPORT_CODE_OFF+PL_regindent*2, "",
d8319b27
DM
3851 (long)n, (long)st->cc->u.curlyx.min,
3852 (long)st->cc->u.curlyx.max, PTR2UV(st->cc))
c277df42 3853 );
4633a7c4 3854
a0d0e21e
LW
3855 /* If degenerate scan matches "", assume scan done. */
3856
d8319b27
DM
3857 if (locinput == st->cc->u.curlyx.lastloc && n >= st->cc->u.curlyx.min) {
3858 st->u.whilem.savecc = st->cc;
3859 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3860 if (st->cc)
d8319b27 3861 st->ln = st->cc->u.curlyx.cur;
a3621e74 3862 DEBUG_EXECUTE_r(
c3464db5
DD
3863 PerlIO_printf(Perl_debug_log,
3864 "%*s empty match detected, try continuation...\n",
3280af22 3865 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3866 );
d8319b27 3867 REGMATCH(st->u.whilem.savecc->next, WHILEM1);
95b24440 3868 /*** all unsaved local vars undefined at this point */
d8319b27 3869 st->cc = st->u.whilem.savecc;
95b24440 3870 if (result)
4633a7c4 3871 sayYES;
d8319b27
DM
3872 if (st->cc->u.curlyx.outercc)
3873 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4633a7c4 3874 sayNO;
a0d0e21e
LW
3875 }
3876
3877 /* First just match a string of min scans. */
3878
d8319b27
DM
3879 if (n < st->cc->u.curlyx.min) {
3880 st->cc->u.curlyx.cur = n;
3881 st->cc->u.curlyx.lastloc = locinput;
3882 REGMATCH(st->cc->u.curlyx.scan, WHILEM2);
95b24440
DM
3883 /*** all unsaved local vars undefined at this point */
3884 if (result)
4633a7c4 3885 sayYES;
d8319b27
DM
3886 st->cc->u.curlyx.cur = n - 1;
3887 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
4633a7c4 3888 sayNO;
a0d0e21e
LW
3889 }
3890
2c2d71f5
JH
3891 if (scan->flags) {
3892 /* Check whether we already were at this position.
3893 Postpone detection until we know the match is not
3894 *that* much linear. */
3895 if (!PL_reg_maxiter) {
3896 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
3897 /* possible overflow for long strings and many CURLYX's */
3898 if (PL_reg_maxiter < 0)
3899 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
3900 PL_reg_leftiter = PL_reg_maxiter;
3901 }
3902 if (PL_reg_leftiter-- == 0) {
a3b680e6 3903 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3904 if (PL_reg_poscache) {
eb160463 3905 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3906 Renew(PL_reg_poscache, size, char);
3907 PL_reg_poscache_size = size;
3908 }
3909 Zero(PL_reg_poscache, size, char);
3910 }
3911 else {
3912 PL_reg_poscache_size = size;
a02a5408 3913 Newxz(PL_reg_poscache, size, char);
2c2d71f5 3914 }
a3621e74 3915 DEBUG_EXECUTE_r(
2c2d71f5
JH
3916 PerlIO_printf(Perl_debug_log,
3917 "%sDetected a super-linear match, switching on caching%s...\n",
3918 PL_colors[4], PL_colors[5])
3919 );
3920 }
3921 if (PL_reg_leftiter < 0) {
d8319b27 3922 st->u.whilem.cache_offset = locinput - PL_bostr;
2c2d71f5 3923
d8319b27
DM
3924 st->u.whilem.cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3925 + st->u.whilem.cache_offset * (scan->flags>>4);
3926 st->u.whilem.cache_bit = st->u.whilem.cache_offset % 8;
3927 st->u.whilem.cache_offset /= 8;
3928 if (PL_reg_poscache[st->u.whilem.cache_offset] & (1<<st->u.whilem.cache_bit)) {
a3621e74 3929 DEBUG_EXECUTE_r(
2c2d71f5
JH
3930 PerlIO_printf(Perl_debug_log,
3931 "%*s already tried at this position...\n",
3932 REPORT_CODE_OFF+PL_regindent*2, "")
3933 );
3ab3c9b4
HS
3934 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3935 /* cache records success */
c2b0868c
HS
3936 sayYES;
3937 else
3ab3c9b4 3938 /* cache records failure */
c2b0868c 3939 sayNO_SILENT;
2c2d71f5 3940 }
2c2d71f5
JH
3941 }
3942 }
3943
a0d0e21e
LW
3944 /* Prefer next over scan for minimal matching. */
3945
5d9a96ca 3946 if (st->cc->minmod) {
d8319b27
DM
3947 st->u.whilem.savecc = st->cc;
3948 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 3949 if (st->cc)
d8319b27
DM
3950 st->ln = st->cc->u.curlyx.cur;
3951 st->u.whilem.cp = regcppush(st->u.whilem.savecc->u.curlyx.parenfloor);
3952 REGCP_SET(st->u.whilem.lastcp);
3953 REGMATCH(st->u.whilem.savecc->next, WHILEM3);
95b24440 3954 /*** all unsaved local vars undefined at this point */
d8319b27 3955 st->cc = st->u.whilem.savecc;
95b24440 3956 if (result) {
d8319b27 3957 regcpblow(st->u.whilem.cp);
3ab3c9b4 3958 CACHEsayYES; /* All done. */
5f05dabc 3959 }
d8319b27 3960 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3961 regcppop(rex);
d8319b27
DM
3962 if (st->cc->u.curlyx.outercc)
3963 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
a0d0e21e 3964
d8319b27 3965 if (n >= st->cc->u.curlyx.max) { /* Maximum greed exceeded? */
9041c2e3 3966 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3967 && !(PL_reg_flags & RF_warned)) {
3968 PL_reg_flags |= RF_warned;
9014280d 3969 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3970 "Complex regular subexpression recursion",
3971 REG_INFTY - 1);
c277df42 3972 }
3ab3c9b4 3973 CACHEsayNO;
c277df42 3974 }
a687059c 3975
a3621e74 3976 DEBUG_EXECUTE_r(
c3464db5
DD
3977 PerlIO_printf(Perl_debug_log,
3978 "%*s trying longer...\n",
3280af22 3979 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3980 );
a0d0e21e 3981 /* Try scanning more and see if it helps. */
3280af22 3982 PL_reginput = locinput;
d8319b27
DM
3983 st->cc->u.curlyx.cur = n;
3984 st->cc->u.curlyx.lastloc = locinput;
3985 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
3986 REGCP_SET(st->u.whilem.lastcp);
3987 REGMATCH(st->cc->u.curlyx.scan, WHILEM4);
95b24440
DM
3988 /*** all unsaved local vars undefined at this point */
3989 if (result) {
d8319b27 3990 regcpblow(st->u.whilem.cp);
3ab3c9b4 3991 CACHEsayYES;
5f05dabc 3992 }
d8319b27 3993 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 3994 regcppop(rex);
d8319b27
DM
3995 st->cc->u.curlyx.cur = n - 1;
3996 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 3997 CACHEsayNO;
a0d0e21e
LW
3998 }
3999
4000 /* Prefer scan over next for maximal matching. */
4001
d8319b27
DM
4002 if (n < st->cc->u.curlyx.max) { /* More greed allowed? */
4003 st->u.whilem.cp = regcppush(st->cc->u.curlyx.parenfloor);
4004 st->cc->u.curlyx.cur = n;
4005 st->cc->u.curlyx.lastloc = locinput;
4006 REGCP_SET(st->u.whilem.lastcp);
4007 REGMATCH(st->cc->u.curlyx.scan, WHILEM5);
95b24440
DM
4008 /*** all unsaved local vars undefined at this point */
4009 if (result) {
d8319b27 4010 regcpblow(st->u.whilem.cp);
3ab3c9b4 4011 CACHEsayYES;
5f05dabc 4012 }
d8319b27 4013 REGCP_UNWIND(st->u.whilem.lastcp);
4f639d21 4014 regcppop(rex); /* Restore some previous $<digit>s? */
3280af22 4015 PL_reginput = locinput;
a3621e74 4016 DEBUG_EXECUTE_r(
c3464db5
DD
4017 PerlIO_printf(Perl_debug_log,
4018 "%*s failed, try continuation...\n",
3280af22 4019 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
4020 );
4021 }
9041c2e3 4022 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 4023 && !(PL_reg_flags & RF_warned)) {
3280af22 4024 PL_reg_flags |= RF_warned;
9014280d 4025 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
4026 "Complex regular subexpression recursion",
4027 REG_INFTY - 1);
a0d0e21e
LW
4028 }
4029
4030 /* Failed deeper matches of scan, so see if this one works. */
d8319b27
DM
4031 st->u.whilem.savecc = st->cc;
4032 st->cc = st->cc->u.curlyx.outercc;
5d9a96ca 4033 if (st->cc)
d8319b27
DM
4034 st->ln = st->cc->u.curlyx.cur;
4035 REGMATCH(st->u.whilem.savecc->next, WHILEM6);
95b24440 4036 /*** all unsaved local vars undefined at this point */
d8319b27 4037 st->cc = st->u.whilem.savecc;
95b24440 4038 if (result)
3ab3c9b4 4039 CACHEsayYES;
d8319b27
DM
4040 if (st->cc->u.curlyx.outercc)
4041 st->cc->u.curlyx.outercc->u.curlyx.cur = st->ln;
4042 st->cc->u.curlyx.cur = n - 1;
4043 st->cc->u.curlyx.lastloc = st->u.whilem.lastloc;
3ab3c9b4 4044 CACHEsayNO;
a0d0e21e 4045 }
5f66b61c 4046 /* NOTREACHED */
40a82448
DM
4047
4048#undef ST
4049#define ST st->u.branch
4050
4051 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4052 next = scan + ARG(scan);
4053 if (next == scan)
4054 next = NULL;
40a82448
DM
4055 scan = NEXTOPER(scan);
4056 /* FALL THROUGH */
c277df42 4057
40a82448
DM
4058 case BRANCH: /* /(...|A|...)/ */
4059 scan = NEXTOPER(scan); /* scan now points to inner node */
4060 if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ))
4061 /* last branch; skip state push and jump direct to node */
4062 continue;
4063 ST.lastparen = *PL_reglastparen;
4064 ST.next_branch = next;
4065 REGCP_SET(ST.cp);
4066 PL_reginput = locinput;
02db2b7b 4067
40a82448
DM
4068 /* Now go into the branch */
4069 PUSH_STATE_GOTO(BRANCH_next, scan);
4070 /* NOTREACHED */
4071
4072 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4073 REGCP_UNWIND(ST.cp);
4074 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4075 PL_regendp[n] = -1;
4076 *PL_reglastparen = n;
4077 scan = ST.next_branch;
4078 /* no more branches? */
4079 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ))
4080 sayNO;
4081 continue; /* execute next BRANCH[J] op */
4082 /* NOTREACHED */
4083
a0d0e21e 4084 case MINMOD:
5d9a96ca 4085 st->minmod = 1;
a0d0e21e 4086 break;
40a82448
DM
4087
4088#undef ST
4089#define ST st->u.curlym
4090
4091 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4092
4093 /* This is an optimisation of CURLYX that enables us to push
4094 * only a single backtracking state, no matter now many matches
4095 * there are in {m,n}. It relies on the pattern being constant
4096 * length, with no parens to influence future backrefs
4097 */
4098
4099 ST.me = scan;
dc45a647 4100 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4101
4102 /* if paren positive, emulate an OPEN/CLOSE around A */
4103 if (ST.me->flags) {
4104 I32 paren = ST.me->flags;
4105 if (paren > PL_regsize)
4106 PL_regsize = paren;
4107 if (paren > (I32)*PL_reglastparen)
4108 *PL_reglastparen = paren;
c277df42 4109 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4110 }
40a82448
DM
4111 ST.A = scan;
4112 ST.B = next;
4113 ST.alen = 0;
4114 ST.count = 0;
4115 ST.minmod = st->minmod;
4116 st->minmod = 0;
4117 ST.c1 = CHRTEST_UNINIT;
4118 REGCP_SET(ST.cp);
6407bf3b 4119
40a82448
DM
4120 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4121 goto curlym_do_B;
4122
4123 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4124 PL_reginput = locinput;
40a82448
DM
4125 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4126 /* NOTREACHED */
5f80c4cf 4127
40a82448
DM
4128 case CURLYM_A: /* we've just matched an A */
4129 locinput = st->locinput;
4130 nextchr = UCHARAT(locinput);
4131
4132 ST.count++;
4133 /* after first match, determine A's length: u.curlym.alen */
4134 if (ST.count == 1) {
4135 if (PL_reg_match_utf8) {
4136 char *s = locinput;
4137 while (s < PL_reginput) {
4138 ST.alen++;
4139 s += UTF8SKIP(s);
4140 }
4141 }
4142 else {
4143 ST.alen = PL_reginput - locinput;
4144 }
4145 if (ST.alen == 0)
4146 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4147 }
0cadcf80
DM
4148 DEBUG_EXECUTE_r(
4149 PerlIO_printf(Perl_debug_log,
40a82448 4150 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
0cadcf80 4151 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
40a82448 4152 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4153 );
4154
40a82448
DM
4155 locinput = PL_reginput;
4156 if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)))
4157 goto curlym_do_A; /* try to match another A */
4158 goto curlym_do_B; /* try to match B */
5f80c4cf 4159
40a82448
DM
4160 case CURLYM_A_fail: /* just failed to match an A */
4161 REGCP_UNWIND(ST.cp);
4162 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ )
4163 sayNO;
0cadcf80 4164
40a82448
DM
4165 curlym_do_B: /* execute the B in /A{m,n}B/ */
4166 PL_reginput = locinput;
4167 if (ST.c1 == CHRTEST_UNINIT) {
4168 /* calculate c1 and c2 for possible match of 1st char
4169 * following curly */
4170 ST.c1 = ST.c2 = CHRTEST_VOID;
4171 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4172 regnode *text_node = ST.B;
4173 if (! HAS_TEXT(text_node))
4174 FIND_NEXT_IMPT(text_node);
4175 if (HAS_TEXT(text_node)
4176 && PL_regkind[OP(text_node)] != REF)
4177 {
4178 ST.c1 = (U8)*STRING(text_node);
4179 ST.c2 =
4180 (OP(text_node) == EXACTF || OP(text_node) == REFF)
4181 ? PL_fold[ST.c1]
4182 : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
4183 ? PL_fold_locale[ST.c1]
4184 : ST.c1;
c277df42 4185 }
c277df42 4186 }
40a82448
DM
4187 }
4188
4189 DEBUG_EXECUTE_r(
4190 PerlIO_printf(Perl_debug_log,
4191 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4192 (int)(REPORT_CODE_OFF+PL_regindent*2),
4193 "", (IV)ST.count)
4194 );
4195 if (ST.c1 != CHRTEST_VOID
4196 && UCHARAT(PL_reginput) != ST.c1
4197 && UCHARAT(PL_reginput) != ST.c2)
4198 {
4199 /* simulate B failing */
4200 state_num = CURLYM_B_fail;
4201 goto reenter_switch;
4202 }
4203
4204 if (ST.me->flags) {
4205 /* mark current A as captured */
4206 I32 paren = ST.me->flags;
4207 if (ST.count) {
4208 PL_regstartp[paren]
4209 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4210 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 4211 }
40a82448
DM
4212 else
4213 PL_regendp[paren] = -1;
c277df42 4214 }
40a82448 4215 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4216 /* NOTREACHED */
40a82448
DM
4217
4218 case CURLYM_B_fail: /* just failed to match a B */
4219 REGCP_UNWIND(ST.cp);
4220 if (ST.minmod) {
4221 if (ST.count == ARG2(ST.me) /* max */)
4222 sayNO;
4223 goto curlym_do_A; /* try to match a further A */
4224 }
4225 /* backtrack one A */
4226 if (ST.count == ARG1(ST.me) /* min */)
4227 sayNO;
4228 ST.count--;
4229 locinput = HOPc(locinput, -ST.alen);
4230 goto curlym_do_B; /* try to match B */
4231
4232
c277df42 4233 case CURLYN:
d8319b27
DM
4234 st->u.plus.paren = scan->flags; /* Which paren to set */
4235 if (st->u.plus.paren > PL_regsize)
4236 PL_regsize = st->u.plus.paren;
4237 if (st->u.plus.paren > (I32)*PL_reglastparen)
4238 *PL_reglastparen = st->u.plus.paren;
5d9a96ca 4239 st->ln = ARG1(scan); /* min to match */
c277df42 4240 n = ARG2(scan); /* max to match */
dc45a647 4241 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 4242 goto repeat;
a0d0e21e 4243 case CURLY:
d8319b27 4244 st->u.plus.paren = 0;
5d9a96ca 4245 st->ln = ARG1(scan); /* min to match */
a0d0e21e 4246 n = ARG2(scan); /* max to match */
dc45a647 4247 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
4248 goto repeat;
4249 case STAR:
5d9a96ca 4250 st->ln = 0;
c277df42 4251 n = REG_INFTY;
a0d0e21e 4252 scan = NEXTOPER(scan);
d8319b27 4253 st->u.plus.paren = 0;
a0d0e21e
LW
4254 goto repeat;
4255 case PLUS:
5d9a96ca 4256 st->ln = 1;
c277df42
IZ
4257 n = REG_INFTY;
4258 scan = NEXTOPER(scan);
d8319b27 4259 st->u.plus.paren = 0;
c277df42 4260 repeat:
a0d0e21e
LW
4261 /*
4262 * Lookahead to avoid useless match attempts
4263 * when we know what character comes next.
4264 */
5f80c4cf
JP
4265
4266 /*
4267 * Used to only do .*x and .*?x, but now it allows
4268 * for )'s, ('s and (?{ ... })'s to be in the way
4269 * of the quantifier and the EXACT-like node. -- japhy
4270 */
4271
cca55fe3 4272 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4273 U8 *s;
4274 regnode *text_node = next;
4275
3dab1dad
YO
4276 if (! HAS_TEXT(text_node))
4277 FIND_NEXT_IMPT(text_node);
5f80c4cf 4278
9e137952
DM
4279 if (! HAS_TEXT(text_node))
4280 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
5f80c4cf 4281 else {
3dab1dad 4282 if (PL_regkind[OP(text_node)] == REF) {
9e137952 4283 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
44a68960 4284 goto assume_ok_easy;
cca55fe3 4285 }
be8e71aa
YO
4286 else
4287 s = (U8*)STRING(text_node);
5f80c4cf
JP
4288
4289 if (!UTF) {
d8319b27 4290 st->u.plus.c2 = st->u.plus.c1 = *s;
f65d3ee7 4291 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
d8319b27 4292 st->u.plus.c2 = PL_fold[st->u.plus.c1];
f65d3ee7 4293 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
d8319b27 4294 st->u.plus.c2 = PL_fold_locale[st->u.plus.c1];
1aa99e6b 4295 }
5f80c4cf 4296 else { /* UTF */
f65d3ee7 4297 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 4298 STRLEN ulen1, ulen2;
89ebb4a3
JH
4299 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4300 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4301
4302 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4303 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4304
d8319b27 4305 st->u.plus.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
041457d9 4306 uniflags);
d8319b27 4307 st->u.plus.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
041457d9 4308 uniflags);
5f80c4cf
JP
4309 }
4310 else {
d8319b27 4311 st->u.plus.c2 = st->u.plus.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4312 uniflags);
5f80c4cf 4313 }
1aa99e6b
IH
4314 }
4315 }
bbce6d69 4316 }
a0d0e21e 4317 else
9e137952 4318 st->u.plus.c1 = st->u.plus.c2 = CHRTEST_VOID;
cca55fe3 4319 assume_ok_easy:
3280af22 4320 PL_reginput = locinput;
5d9a96ca
DM
4321 if (st->minmod) {
4322 st->minmod = 0;
32fc9b6a 4323 if (st->ln && regrepeat(rex, scan, st->ln) < st->ln)
4633a7c4 4324 sayNO;
a0ed51b3 4325 locinput = PL_reginput;
d8319b27 4326 REGCP_SET(st->u.plus.lastcp);
9e137952 4327 if (st->u.plus.c1 != CHRTEST_VOID) {
d8319b27
DM
4328 st->u.plus.old = locinput;
4329 st->u.plus.count = 0;
0fe9bf95 4330
1aa99e6b 4331 if (n == REG_INFTY) {
d8319b27 4332 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4333 if (do_utf8)
d8319b27
DM
4334 while (UTF8_IS_CONTINUATION(*(U8*)st->u.plus.e))
4335 st->u.plus.e--;
1aa99e6b
IH
4336 }
4337 else if (do_utf8) {
5d9a96ca 4338 int m = n - st->ln;
d8319b27
DM
4339 for (st->u.plus.e = locinput;
4340 m >0 && st->u.plus.e + UTF8SKIP(st->u.plus.e) <= PL_regeol; m--)
4341 st->u.plus.e += UTF8SKIP(st->u.plus.e);
1aa99e6b
IH
4342 }
4343 else {
d8319b27
DM
4344 st->u.plus.e = locinput + n - st->ln;
4345 if (st->u.plus.e >= PL_regeol)
4346 st->u.plus.e = PL_regeol - 1;
1aa99e6b 4347 }
0fe9bf95
IZ
4348 while (1) {
4349 /* Find place 'next' could work */
1aa99e6b 4350 if (!do_utf8) {
d8319b27
DM
4351 if (st->u.plus.c1 == st->u.plus.c2) {
4352 while (locinput <= st->u.plus.e &&
4353 UCHARAT(locinput) != st->u.plus.c1)
1aa99e6b
IH
4354 locinput++;
4355 } else {
d8319b27
DM
4356 while (locinput <= st->u.plus.e
4357 && UCHARAT(locinput) != st->u.plus.c1
4358 && UCHARAT(locinput) != st->u.plus.c2)
1aa99e6b
IH
4359 locinput++;
4360 }
d8319b27 4361 st->u.plus.count = locinput - st->u.plus.old;
1aa99e6b
IH
4362 }
4363 else {
d8319b27 4364 if (st->u.plus.c1 == st->u.plus.c2) {
a3b680e6 4365 STRLEN len;
872c91ae
JH
4366 /* count initialised to
4367 * utf8_distance(old, locinput) */
d8319b27 4368 while (locinput <= st->u.plus.e &&
872c91ae 4369 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4370 UTF8_MAXBYTES, &len,
d8319b27 4371 uniflags) != (UV)st->u.plus.c1) {
1aa99e6b 4372 locinput += len;
d8319b27 4373 st->u.plus.count++;
b2f2f093 4374 }
1aa99e6b 4375 } else {
872c91ae
JH
4376 /* count initialised to
4377 * utf8_distance(old, locinput) */
d8319b27 4378 while (locinput <= st->u.plus.e) {
c4fd8992
AL
4379 STRLEN len;
4380 const UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4381 UTF8_MAXBYTES, &len,
041457d9 4382 uniflags);
d8319b27 4383 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2)
1aa99e6b 4384 break;
b2f2f093 4385 locinput += len;
d8319b27 4386 st->u.plus.count++;
1aa99e6b
IH
4387 }
4388 }
0fe9bf95 4389 }
d8319b27 4390 if (locinput > st->u.plus.e)
0fe9bf95
IZ
4391 sayNO;
4392 /* PL_reginput == old now */
d8319b27 4393 if (locinput != st->u.plus.old) {
5d9a96ca 4394 st->ln = 1; /* Did some */
32fc9b6a 4395 if (regrepeat(rex, scan, st->u.plus.count) < st->u.plus.count)
0fe9bf95
IZ
4396 sayNO;
4397 }
4398 /* PL_reginput == locinput now */
d8319b27 4399 TRYPAREN(st->u.plus.paren, st->ln, locinput, PLUS1);
95b24440 4400 /*** all unsaved local vars undefined at this point */
0fe9bf95 4401 PL_reginput = locinput; /* Could be reset... */
d8319b27 4402 REGCP_UNWIND(st->u.plus.lastcp);
0fe9bf95 4403 /* Couldn't or didn't -- move forward. */
d8319b27 4404 st->u.plus.old = locinput;
1aa99e6b
IH
4405 if (do_utf8)
4406 locinput += UTF8SKIP(locinput);
4407 else
4408 locinput++;
d8319b27 4409 st->u.plus.count = 1;
0fe9bf95
IZ
4410 }
4411 }
4412 else
5d9a96ca 4413 while (n >= st->ln || (n == REG_INFTY && st->ln > 0)) { /* ln overflow ? */
1aa99e6b 4414 UV c;
9e137952 4415 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4416 if (do_utf8)
872c91ae 4417 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4418 UTF8_MAXBYTES, 0,
041457d9 4419 uniflags);
1aa99e6b 4420 else
9041c2e3 4421 c = UCHARAT(PL_reginput);
2390ecbc 4422 /* If it could work, try it. */
be8e71aa 4423 if (c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
d8319b27 4424 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS2);
95b24440 4425 /*** all unsaved local vars undefined at this point */
d8319b27 4426 REGCP_UNWIND(st->u.plus.lastcp);
2390ecbc 4427 }
1aa99e6b 4428 }
a0d0e21e 4429 /* If it could work, try it. */
be8e71aa 4430 else if (st->u.plus.c1 == CHRTEST_VOID) {
d8319b27 4431 TRYPAREN(st->u.plus.paren, st->ln, PL_reginput, PLUS3);
95b24440 4432 /*** all unsaved local vars undefined at this point */
d8319b27 4433 REGCP_UNWIND(st->u.plus.lastcp);
bbce6d69 4434 }
c277df42 4435 /* Couldn't or didn't -- move forward. */
a0ed51b3 4436 PL_reginput = locinput;
32fc9b6a 4437 if (regrepeat(rex, scan, 1)) {
5d9a96ca 4438 st->ln++;
a0ed51b3
LW
4439 locinput = PL_reginput;
4440 }
4441 else
4633a7c4 4442 sayNO;
a0d0e21e
LW
4443 }
4444 }
4445 else {
32fc9b6a 4446 n = regrepeat(rex, scan, n);
a0ed51b3 4447 locinput = PL_reginput;
3dab1dad
YO
4448 if ((st->ln < n) && (PL_regkind[OP(next)] == EOL) &&
4449 (OP(next) != MEOL || OP(next) == SEOL || OP(next) == EOS))
15272685 4450 {
5d9a96ca 4451 st->ln = n; /* why back off? */
1aeab75a
GS
4452 /* ...because $ and \Z can match before *and* after
4453 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4454 We should back off by one in this case. */
4455 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
5d9a96ca 4456 st->ln--;
1aeab75a 4457 }
d8319b27 4458 REGCP_SET(st->u.plus.lastcp);
1d5c262f 4459 {
5d9a96ca 4460 while (n >= st->ln) {
be8e71aa 4461 UV c = 0;
9e137952 4462 if (st->u.plus.c1 != CHRTEST_VOID) {
1aa99e6b 4463 if (do_utf8)
872c91ae 4464 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4465 UTF8_MAXBYTES, 0,
041457d9 4466 uniflags);
1aa99e6b 4467 else
9041c2e3 4468 c = UCHARAT(PL_reginput);
1aa99e6b 4469 }
c277df42 4470 /* If it could work, try it. */
be8e71aa
YO
4471 if (st->u.plus.c1 == CHRTEST_VOID || c == (UV)st->u.plus.c1 || c == (UV)st->u.plus.c2) {
4472 TRYPAREN(st->u.plus.paren, n, PL_reginput, PLUS4);
4473 /*** all unsaved local vars undefined at this point */
4474 REGCP_UNWIND(st->u.plus.lastcp);
4475 }
c277df42
IZ
4476 /* Couldn't or didn't -- back up. */
4477 n--;
dfe13c55 4478 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4479 }
a0d0e21e
LW
4480 }
4481 }
4633a7c4 4482 sayNO;
c277df42 4483 break;
a0d0e21e 4484 case END:
3b0527fe 4485 if (locinput < reginfo->till) {
a3621e74 4486 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4487 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4488 PL_colors[4],
4489 (long)(locinput - PL_reg_starttry),
3b0527fe 4490 (long)(reginfo->till - PL_reg_starttry),
7821416a
IZ
4491 PL_colors[5]));
4492 sayNO_FINAL; /* Cannot match: too short. */
4493 }
4494 PL_reginput = locinput; /* put where regtry can find it */
4495 sayYES_FINAL; /* Success! */
dad79028
DM
4496
4497 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4498 DEBUG_EXECUTE_r(
4499 PerlIO_printf(Perl_debug_log,
4500 "%*s %ssubpattern success...%s\n",
4501 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5]));
3280af22 4502 PL_reginput = locinput; /* put where regtry can find it */
dad79028
DM
4503 sayYES_FINAL; /* Success! */
4504
40a82448
DM
4505#undef ST
4506#define ST st->u.ifmatch
4507
4508 case SUSPEND: /* (?>A) */
4509 ST.wanted = 1;
9fe1d20c 4510 PL_reginput = locinput;
9041c2e3 4511 goto do_ifmatch;
dad79028 4512
40a82448
DM
4513 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4514 ST.wanted = 0;
dad79028
DM
4515 goto ifmatch_trivial_fail_test;
4516
40a82448
DM
4517 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4518 ST.wanted = 1;
dad79028 4519 ifmatch_trivial_fail_test:
a0ed51b3 4520 if (scan->flags) {
52657f30 4521 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
4522 if (!s) {
4523 /* trivial fail */
4524 if (st->logical) {
4525 st->logical = 0;
40a82448 4526 st->sw = 1 - (bool)ST.wanted;
dad79028 4527 }
40a82448 4528 else if (ST.wanted)
dad79028
DM
4529 sayNO;
4530 next = scan + ARG(scan);
4531 if (next == scan)
4532 next = NULL;
4533 break;
4534 }
efb30f32 4535 PL_reginput = s;
a0ed51b3
LW
4536 }
4537 else
4538 PL_reginput = locinput;
4539
c277df42 4540 do_ifmatch:
40a82448
DM
4541 ST.me = scan;
4542 /* execute body of (?...A) */
4543 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4544 /* NOTREACHED */
4545
4546 case IFMATCH_A_fail: /* body of (?...A) failed */
4547 ST.wanted = !ST.wanted;
4548 /* FALL THROUGH */
4549
4550 case IFMATCH_A: /* body of (?...A) succeeded */
4551 if (st->logical) {
4552 st->logical = 0;
4553 st->sw = (bool)ST.wanted;
4554 }
4555 else if (!ST.wanted)
4556 sayNO;
4557
4558 if (OP(ST.me) == SUSPEND)
4559 locinput = PL_reginput;
4560 else {
4561 locinput = PL_reginput = st->locinput;
4562 nextchr = UCHARAT(locinput);
4563 }
4564 scan = ST.me + ARG(ST.me);
4565 if (scan == ST.me)
4566 scan = NULL;
4567 continue; /* execute B */
4568
4569#undef ST
dad79028 4570
c277df42 4571 case LONGJMP:
c277df42
IZ
4572 next = scan + ARG(scan);
4573 if (next == scan)
4574 next = NULL;
a0d0e21e
LW
4575 break;
4576 default:
b900a521 4577 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4578 PTR2UV(scan), OP(scan));
cea2e8a9 4579 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4580 }
95b24440 4581
a0d0e21e 4582 scan = next;
95b24440
DM
4583 continue;
4584 /* NOTREACHED */
4585
40a82448
DM
4586 push_yes_state:
4587 /* push a state that backtracks on success */
4588 st->u.yes.prev_yes_state = yes_state;
4589 yes_state = st;
4590 /* FALL THROUGH */
4591 push_state:
4592 /* push a new regex state, then continue at scan */
4593 {
4594 regmatch_state *newst;
4595
4596 depth++;
4597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4598 "PUSH STATE(%d)\n", depth));
4599 st->locinput = locinput;
4600 newst = st+1;
4601 if (newst > SLAB_LAST(PL_regmatch_slab))
4602 newst = S_push_slab(aTHX);
4603 PL_regmatch_state = newst;
4604 newst->cc = st->cc;
4605 /* XXX probably don't need to initialise these */
4606 newst->minmod = 0;
4607 newst->sw = 0;
4608 newst->logical = 0;
4609
4610 locinput = PL_reginput;
4611 nextchr = UCHARAT(locinput);
4612 st = newst;
4613 continue;
4614 /* NOTREACHED */
4615 }
4616
95b24440
DM
4617 /* simulate recursively calling regmatch(), but without actually
4618 * recursing - ie save the current state on the heap rather than on
4619 * the stack, then re-enter the loop. This avoids complex regexes
4620 * blowing the processor stack */
4621
4622 start_recurse:
4623 {
5d9a96ca
DM
4624 /* push new state */
4625 regmatch_state *oldst = st;
4626
4627 depth++;
40a82448 4628 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH RECURSE STATE(%d)\n", depth));
5d9a96ca
DM
4629
4630 /* grab the next free state slot */
4631 st++;
86545054 4632 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
4633 st = S_push_slab(aTHX);
4634 PL_regmatch_state = st;
4635
4636 oldst->next = next;
4637 oldst->n = n;
4638 oldst->locinput = locinput;
5d9a96ca
DM
4639
4640 st->cc = oldst->cc;
95b24440
DM
4641 locinput = PL_reginput;
4642 nextchr = UCHARAT(locinput);
5d9a96ca
DM
4643 st->minmod = 0;
4644 st->sw = 0;
4645 st->logical = 0;
95b24440
DM
4646#ifdef DEBUGGING
4647 PL_regindent++;
4648#endif
4649 }
a0d0e21e 4650 }
a687059c 4651
aa283a38
DM
4652
4653
a0d0e21e
LW
4654 /*
4655 * We get here only if there's trouble -- normally "case END" is
4656 * the terminating point.
4657 */
cea2e8a9 4658 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4659 /*NOTREACHED*/
4633a7c4
LW
4660 sayNO;
4661
7821416a 4662yes_final:
77cb431f
DM
4663
4664 if (yes_state) {
4665 /* we have successfully completed a subexpression, but we must now
4666 * pop to the state marked by yes_state and continue from there */
4667
4668 assert(st != yes_state);
4669 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
4670 || yes_state > SLAB_LAST(PL_regmatch_slab))
4671 {
4672 /* not in this slab, pop slab */
4673 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
4674 PL_regmatch_slab = PL_regmatch_slab->prev;
4675 st = SLAB_LAST(PL_regmatch_slab);
4676 }
4677 depth -= (st - yes_state);
40a82448
DM
4678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATES (%d..%d)\n",
4679 depth+1, depth+(st - yes_state)));
77cb431f
DM
4680 st = yes_state;
4681 yes_state = st->u.yes.prev_yes_state;
4682 PL_regmatch_state = st;
4683
4684 switch (st->resume_state) {
40a82448
DM
4685 case IFMATCH_A:
4686 case CURLYM_A:
4687 case BRANCH_next:
4688 case EVAL_A:
4689 state_num = st->resume_state;
4690 goto reenter_switch;
4691
4692 case CURLYM_B:
77cb431f 4693 default:
40a82448 4694 Perl_croak(aTHX_ "unexpected yes resume state");
77cb431f
DM
4695 }
4696 }
4697
a3621e74 4698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 4699 PL_colors[4], PL_colors[5]));
4633a7c4
LW
4700yes:
4701#ifdef DEBUGGING
3280af22 4702 PL_regindent--;
4633a7c4 4703#endif
02db2b7b 4704
95b24440 4705 result = 1;
aa283a38 4706 /* XXX this is duplicate(ish) code to that in the do_no section.
40a82448 4707 * will disappear when REGFMATCH goes */
aa283a38
DM
4708 if (depth) {
4709 /* restore previous state and re-enter */
40a82448
DM
4710 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4711 depth--;
4712 st--;
4713 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4714 PL_regmatch_slab = PL_regmatch_slab->prev;
4715 st = SLAB_LAST(PL_regmatch_slab);
4716 }
4717 PL_regmatch_state = st;
4718 scan = st->scan;
4719 next = st->next;
4720 n = st->n;
4721 locinput= st->locinput;
4722 nextchr = UCHARAT(locinput);
aa283a38
DM
4723
4724 switch (st->resume_state) {
4725 case resume_TRIE1:
4726 goto resume_point_TRIE1;
4727 case resume_TRIE2:
4728 goto resume_point_TRIE2;
aa283a38
DM
4729 case resume_CURLYX:
4730 goto resume_point_CURLYX;
4731 case resume_WHILEM1:
4732 goto resume_point_WHILEM1;
4733 case resume_WHILEM2:
4734 goto resume_point_WHILEM2;
4735 case resume_WHILEM3:
4736 goto resume_point_WHILEM3;
4737 case resume_WHILEM4:
4738 goto resume_point_WHILEM4;
4739 case resume_WHILEM5:
4740 goto resume_point_WHILEM5;
4741 case resume_WHILEM6:
4742 goto resume_point_WHILEM6;
aa283a38
DM
4743 case resume_PLUS1:
4744 goto resume_point_PLUS1;
4745 case resume_PLUS2:
4746 goto resume_point_PLUS2;
4747 case resume_PLUS3:
4748 goto resume_point_PLUS3;
4749 case resume_PLUS4:
4750 goto resume_point_PLUS4;
77cb431f 4751
40a82448
DM
4752 case CURLYM_A:
4753 case CURLYM_B:
4754 case EVAL_A:
4755 case IFMATCH_A:
4756 case BRANCH_next:
4757 break;
4758
aa283a38
DM
4759 default:
4760 Perl_croak(aTHX_ "regexp resume memory corruption");
4761 }
4762 }
4763 goto final_exit;
4633a7c4
LW
4764
4765no:
a3621e74 4766 DEBUG_EXECUTE_r(
7821416a
IZ
4767 PerlIO_printf(Perl_debug_log,
4768 "%*s %sfailed...%s\n",
e4584336 4769 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
7821416a 4770 );
7821416a
IZ
4771no_final:
4772do_no:
aa283a38 4773
4633a7c4 4774#ifdef DEBUGGING
3280af22 4775 PL_regindent--;
4633a7c4 4776#endif
95b24440 4777 result = 0;
5d9a96ca 4778
aa283a38
DM
4779 if (depth) {
4780 /* there's a previous state to backtrack to */
40a82448
DM
4781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth));
4782 depth--;
4783 st--;
4784 if (st < SLAB_FIRST(PL_regmatch_slab)) {
4785 PL_regmatch_slab = PL_regmatch_slab->prev;
4786 st = SLAB_LAST(PL_regmatch_slab);
4787 }
4788 PL_regmatch_state = st;
4789 scan = st->scan;
4790 next = st->next;
4791 n = st->n;
4792 locinput= st->locinput;
4793 nextchr = UCHARAT(locinput);
4794
5d9a96ca 4795 switch (st->resume_state) {
95b24440
DM
4796 case resume_TRIE1:
4797 goto resume_point_TRIE1;
4798 case resume_TRIE2:
4799 goto resume_point_TRIE2;
aa283a38 4800
95b24440
DM
4801 case resume_CURLYX:
4802 goto resume_point_CURLYX;
4803 case resume_WHILEM1:
4804 goto resume_point_WHILEM1;
4805 case resume_WHILEM2:
4806 goto resume_point_WHILEM2;
4807 case resume_WHILEM3:
4808 goto resume_point_WHILEM3;
4809 case resume_WHILEM4:
4810 goto resume_point_WHILEM4;
4811 case resume_WHILEM5:
4812 goto resume_point_WHILEM5;
4813 case resume_WHILEM6:
4814 goto resume_point_WHILEM6;
dad79028 4815
40a82448
DM
4816 case EVAL_A:
4817 case BRANCH_next:
4818 case CURLYM_A:
4819 case CURLYM_B:
4820 case IFMATCH_A:
4821 if (yes_state == st)
4822 yes_state = st->u.yes.prev_yes_state;
4823 state_num = st->resume_state + 1; /* failure = success + 1 */
4824 goto reenter_switch;
dad79028 4825
95b24440
DM
4826 case resume_PLUS1:
4827 goto resume_point_PLUS1;
4828 case resume_PLUS2:
4829 goto resume_point_PLUS2;
4830 case resume_PLUS3:
4831 goto resume_point_PLUS3;
4832 case resume_PLUS4:
4833 goto resume_point_PLUS4;
95b24440
DM
4834 default:
4835 Perl_croak(aTHX_ "regexp resume memory corruption");
4836 }
95b24440 4837 }
aa283a38
DM
4838
4839final_exit:
4840
5d9a96ca
DM
4841 /* restore original high-water mark */
4842 PL_regmatch_slab = orig_slab;
4843 PL_regmatch_state = orig_state;
4844
4845 /* free all slabs above current one */
4846 if (orig_slab->next) {
c4fd8992 4847 regmatch_slab *sl = orig_slab->next;
5d9a96ca
DM
4848 orig_slab->next = NULL;
4849 while (sl) {
c4fd8992 4850 regmatch_slab * const osl = sl;
5d9a96ca 4851 sl = sl->next;
ad65c075 4852 Safefree(osl);
5d9a96ca
DM
4853 }
4854 }
4855
95b24440
DM
4856 return result;
4857
a687059c
LW
4858}
4859
4860/*
4861 - regrepeat - repeatedly match something simple, report how many
4862 */
4863/*
4864 * [This routine now assumes that it will only match on things of length 1.
4865 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4866 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4867 */
76e3520e 4868STATIC I32
32fc9b6a 4869S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
a687059c 4870{
27da23d5 4871 dVAR;
a0d0e21e 4872 register char *scan;
a0d0e21e 4873 register I32 c;
3280af22 4874 register char *loceol = PL_regeol;
a0ed51b3 4875 register I32 hardcount = 0;
53c4c00c 4876 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4877
3280af22 4878 scan = PL_reginput;
faf11cac
HS
4879 if (max == REG_INFTY)
4880 max = I32_MAX;
4881 else if (max < loceol - scan)
7f596f4c 4882 loceol = scan + max;
a0d0e21e 4883 switch (OP(p)) {
22c35a8c 4884 case REG_ANY:
1aa99e6b 4885 if (do_utf8) {
ffc61ed2 4886 loceol = PL_regeol;
1aa99e6b 4887 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4888 scan += UTF8SKIP(scan);
4889 hardcount++;
4890 }
4891 } else {
4892 while (scan < loceol && *scan != '\n')
4893 scan++;
a0ed51b3
LW
4894 }
4895 break;
ffc61ed2 4896 case SANY:
def8e4ea
JH
4897 if (do_utf8) {
4898 loceol = PL_regeol;
a0804c9e 4899 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4900 scan += UTF8SKIP(scan);
4901 hardcount++;
4902 }
4903 }
4904 else
4905 scan = loceol;
a0ed51b3 4906 break;
f33976b4
DB
4907 case CANY:
4908 scan = loceol;
4909 break;
090f7165
JH
4910 case EXACT: /* length of string is 1 */
4911 c = (U8)*STRING(p);
4912 while (scan < loceol && UCHARAT(scan) == c)
4913 scan++;
bbce6d69 4914 break;
4915 case EXACTF: /* length of string is 1 */
cd439c50 4916 c = (U8)*STRING(p);
bbce6d69 4917 while (scan < loceol &&
22c35a8c 4918 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4919 scan++;
4920 break;
4921 case EXACTFL: /* length of string is 1 */
3280af22 4922 PL_reg_flags |= RF_tainted;
cd439c50 4923 c = (U8)*STRING(p);
bbce6d69 4924 while (scan < loceol &&
22c35a8c 4925 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4926 scan++;
4927 break;
4928 case ANYOF:
ffc61ed2
JH
4929 if (do_utf8) {
4930 loceol = PL_regeol;
cfc92286 4931 while (hardcount < max && scan < loceol &&
32fc9b6a 4932 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4933 scan += UTF8SKIP(scan);
4934 hardcount++;
4935 }
4936 } else {
32fc9b6a 4937 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
4938 scan++;
4939 }
a0d0e21e
LW
4940 break;
4941 case ALNUM:
1aa99e6b 4942 if (do_utf8) {
ffc61ed2 4943 loceol = PL_regeol;
1a4fad37 4944 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4945 while (hardcount < max && scan < loceol &&
3568d838 4946 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4947 scan += UTF8SKIP(scan);
4948 hardcount++;
4949 }
4950 } else {
4951 while (scan < loceol && isALNUM(*scan))
4952 scan++;
a0ed51b3
LW
4953 }
4954 break;
bbce6d69 4955 case ALNUML:
3280af22 4956 PL_reg_flags |= RF_tainted;
1aa99e6b 4957 if (do_utf8) {
ffc61ed2 4958 loceol = PL_regeol;
1aa99e6b
IH
4959 while (hardcount < max && scan < loceol &&
4960 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4961 scan += UTF8SKIP(scan);
4962 hardcount++;
4963 }
4964 } else {
4965 while (scan < loceol && isALNUM_LC(*scan))
4966 scan++;
a0ed51b3
LW
4967 }
4968 break;
a0d0e21e 4969 case NALNUM:
1aa99e6b 4970 if (do_utf8) {
ffc61ed2 4971 loceol = PL_regeol;
1a4fad37 4972 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 4973 while (hardcount < max && scan < loceol &&
3568d838 4974 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4975 scan += UTF8SKIP(scan);
4976 hardcount++;
4977 }
4978 } else {
4979 while (scan < loceol && !isALNUM(*scan))
4980 scan++;
a0ed51b3
LW
4981 }
4982 break;
bbce6d69 4983 case NALNUML:
3280af22 4984 PL_reg_flags |= RF_tainted;
1aa99e6b 4985 if (do_utf8) {
ffc61ed2 4986 loceol = PL_regeol;
1aa99e6b
IH
4987 while (hardcount < max && scan < loceol &&
4988 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4989 scan += UTF8SKIP(scan);
4990 hardcount++;
4991 }
4992 } else {
4993 while (scan < loceol && !isALNUM_LC(*scan))
4994 scan++;
a0ed51b3
LW
4995 }
4996 break;
a0d0e21e 4997 case SPACE:
1aa99e6b 4998 if (do_utf8) {
ffc61ed2 4999 loceol = PL_regeol;
1a4fad37 5000 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5001 while (hardcount < max && scan < loceol &&
3568d838
JH
5002 (*scan == ' ' ||
5003 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5004 scan += UTF8SKIP(scan);
5005 hardcount++;
5006 }
5007 } else {
5008 while (scan < loceol && isSPACE(*scan))
5009 scan++;
a0ed51b3
LW
5010 }
5011 break;
bbce6d69 5012 case SPACEL:
3280af22 5013 PL_reg_flags |= RF_tainted;
1aa99e6b 5014 if (do_utf8) {
ffc61ed2 5015 loceol = PL_regeol;
1aa99e6b 5016 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5017 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5018 scan += UTF8SKIP(scan);
5019 hardcount++;
5020 }
5021 } else {
5022 while (scan < loceol && isSPACE_LC(*scan))
5023 scan++;
a0ed51b3
LW
5024 }
5025 break;
a0d0e21e 5026 case NSPACE:
1aa99e6b 5027 if (do_utf8) {
ffc61ed2 5028 loceol = PL_regeol;
1a4fad37 5029 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5030 while (hardcount < max && scan < loceol &&
3568d838
JH
5031 !(*scan == ' ' ||
5032 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5033 scan += UTF8SKIP(scan);
5034 hardcount++;
5035 }
5036 } else {
5037 while (scan < loceol && !isSPACE(*scan))
5038 scan++;
5039 break;
a0ed51b3 5040 }
bbce6d69 5041 case NSPACEL:
3280af22 5042 PL_reg_flags |= RF_tainted;
1aa99e6b 5043 if (do_utf8) {
ffc61ed2 5044 loceol = PL_regeol;
1aa99e6b 5045 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5046 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5047 scan += UTF8SKIP(scan);
5048 hardcount++;
5049 }
5050 } else {
5051 while (scan < loceol && !isSPACE_LC(*scan))
5052 scan++;
a0ed51b3
LW
5053 }
5054 break;
a0d0e21e 5055 case DIGIT:
1aa99e6b 5056 if (do_utf8) {
ffc61ed2 5057 loceol = PL_regeol;
1a4fad37 5058 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5059 while (hardcount < max && scan < loceol &&
3568d838 5060 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5061 scan += UTF8SKIP(scan);
5062 hardcount++;
5063 }
5064 } else {
5065 while (scan < loceol && isDIGIT(*scan))
5066 scan++;
a0ed51b3
LW
5067 }
5068 break;
a0d0e21e 5069 case NDIGIT:
1aa99e6b 5070 if (do_utf8) {
ffc61ed2 5071 loceol = PL_regeol;
1a4fad37 5072 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5073 while (hardcount < max && scan < loceol &&
3568d838 5074 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5075 scan += UTF8SKIP(scan);
5076 hardcount++;
5077 }
5078 } else {
5079 while (scan < loceol && !isDIGIT(*scan))
5080 scan++;
a0ed51b3
LW
5081 }
5082 break;
a0d0e21e
LW
5083 default: /* Called on something of 0 width. */
5084 break; /* So match right here or not at all. */
5085 }
a687059c 5086
a0ed51b3
LW
5087 if (hardcount)
5088 c = hardcount;
5089 else
5090 c = scan - PL_reginput;
3280af22 5091 PL_reginput = scan;
a687059c 5092
a3621e74 5093 DEBUG_r({
e68ec53f 5094 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5095 DEBUG_EXECUTE_r({
e68ec53f
YO
5096 SV * const prop = sv_newmortal();
5097 regprop(prog, prop, p);
5098 PerlIO_printf(Perl_debug_log,
be8e71aa
YO
5099 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5100 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5101 });
be8e71aa 5102 });
9041c2e3 5103
a0d0e21e 5104 return(c);
a687059c
LW
5105}
5106
c277df42 5107
be8e71aa 5108#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5109/*
ffc61ed2
JH
5110- regclass_swash - prepare the utf8 swash
5111*/
5112
5113SV *
32fc9b6a 5114Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5115{
97aff369 5116 dVAR;
9e55ce06
JH
5117 SV *sw = NULL;
5118 SV *si = NULL;
5119 SV *alt = NULL;
3dab1dad 5120 const struct reg_data * const data = prog ? prog->data : NULL;
ffc61ed2 5121
4f639d21 5122 if (data && data->count) {
a3b680e6 5123 const U32 n = ARG(node);
ffc61ed2 5124
4f639d21
DM
5125 if (data->what[n] == 's') {
5126 SV * const rv = (SV*)data->data[n];
890ce7af 5127 AV * const av = (AV*)SvRV((SV*)rv);
2d03de9c 5128 SV **const ary = AvARRAY(av);
9e55ce06 5129 SV **a, **b;
9041c2e3 5130
711a919c 5131 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5132 * documentation of these array elements. */
5133
b11f357e 5134 si = *ary;
8f7f7219 5135 a = SvROK(ary[1]) ? &ary[1] : 0;
b11f357e
JH
5136 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
5137
ffc61ed2
JH
5138 if (a)
5139 sw = *a;
5140 else if (si && doinit) {
5141 sw = swash_init("utf8", "", si, 1, 0);
5142 (void)av_store(av, 1, sw);
5143 }
9e55ce06
JH
5144 if (b)
5145 alt = *b;
ffc61ed2
JH
5146 }
5147 }
5148
9e55ce06
JH
5149 if (listsvp)
5150 *listsvp = si;
5151 if (altsvp)
5152 *altsvp = alt;
ffc61ed2
JH
5153
5154 return sw;
5155}
76234dfb 5156#endif
ffc61ed2
JH
5157
5158/*
ba7b4546 5159 - reginclass - determine if a character falls into a character class
832705d4
JH
5160
5161 The n is the ANYOF regnode, the p is the target string, lenp
5162 is pointer to the maximum length of how far to go in the p
5163 (if the lenp is zero, UTF8SKIP(p) is used),
5164 do_utf8 tells whether the target string is in UTF-8.
5165
bbce6d69 5166 */
5167
76e3520e 5168STATIC bool
32fc9b6a 5169S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5170{
27da23d5 5171 dVAR;
a3b680e6 5172 const char flags = ANYOF_FLAGS(n);
bbce6d69 5173 bool match = FALSE;
cc07378b 5174 UV c = *p;
ae9ddab8 5175 STRLEN len = 0;
9e55ce06 5176 STRLEN plen;
1aa99e6b 5177
19f67299
TS
5178 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5179 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4ad0818d
DM
5180 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5181 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
19f67299
TS
5182 if (len == (STRLEN)-1)
5183 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5184 }
bbce6d69 5185
0f0076b4 5186 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5187 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5188 if (lenp)
5189 *lenp = 0;
ffc61ed2 5190 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5191 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5192 match = TRUE;
bbce6d69 5193 }
3568d838 5194 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5195 match = TRUE;
ffc61ed2 5196 if (!match) {
9e55ce06 5197 AV *av;
32fc9b6a 5198 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5199
5200 if (sw) {
3568d838 5201 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
5202 match = TRUE;
5203 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5204 if (!match && lenp && av) {
5205 I32 i;
9e55ce06 5206 for (i = 0; i <= av_len(av); i++) {
890ce7af 5207 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5208 STRLEN len;
890ce7af 5209 const char * const s = SvPV_const(sv, len);
9e55ce06 5210
061b10df 5211 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
5212 *lenp = len;
5213 match = TRUE;
5214 break;
5215 }
5216 }
5217 }
5218 if (!match) {
89ebb4a3 5219 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
5220 STRLEN tmplen;
5221
9e55ce06
JH
5222 to_utf8_fold(p, tmpbuf, &tmplen);
5223 if (swash_fetch(sw, tmpbuf, do_utf8))
5224 match = TRUE;
5225 }
ffc61ed2
JH
5226 }
5227 }
bbce6d69 5228 }
9e55ce06 5229 if (match && lenp && *lenp == 0)
0f0076b4 5230 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 5231 }
1aa99e6b 5232 if (!match && c < 256) {
ffc61ed2
JH
5233 if (ANYOF_BITMAP_TEST(n, c))
5234 match = TRUE;
5235 else if (flags & ANYOF_FOLD) {
eb160463 5236 U8 f;
a0ed51b3 5237
ffc61ed2
JH
5238 if (flags & ANYOF_LOCALE) {
5239 PL_reg_flags |= RF_tainted;
5240 f = PL_fold_locale[c];
5241 }
5242 else
5243 f = PL_fold[c];
5244 if (f != c && ANYOF_BITMAP_TEST(n, f))
5245 match = TRUE;
5246 }
5247
5248 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 5249 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
5250 if (
5251 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5252 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5253 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5254 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5255 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5256 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5257 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5258 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5259 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5260 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5261 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5262 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5263 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5264 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5265 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5266 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5267 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5268 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5269 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5270 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5271 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5272 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5273 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5274 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5275 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5276 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5277 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5278 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5279 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5280 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5281 ) /* How's that for a conditional? */
5282 {
5283 match = TRUE;
5284 }
a0ed51b3 5285 }
a0ed51b3
LW
5286 }
5287
a0ed51b3
LW
5288 return (flags & ANYOF_INVERT) ? !match : match;
5289}
161b471a 5290
dfe13c55 5291STATIC U8 *
0ce71af7 5292S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 5293{
97aff369 5294 dVAR;
a0ed51b3 5295 if (off >= 0) {
1aa99e6b 5296 while (off-- && s < lim) {
ffc61ed2 5297 /* XXX could check well-formedness here */
a0ed51b3 5298 s += UTF8SKIP(s);
ffc61ed2 5299 }
a0ed51b3
LW
5300 }
5301 else {
5302 while (off++) {
1aa99e6b 5303 if (s > lim) {
a0ed51b3 5304 s--;
ffc61ed2 5305 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5306 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5307 s--;
ffc61ed2
JH
5308 }
5309 /* XXX could check well-formedness here */
a0ed51b3
LW
5310 }
5311 }
5312 }
5313 return s;
5314}
161b471a 5315
dfe13c55 5316STATIC U8 *
0ce71af7 5317S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 5318{
97aff369 5319 dVAR;
a0ed51b3 5320 if (off >= 0) {
1aa99e6b 5321 while (off-- && s < lim) {
ffc61ed2 5322 /* XXX could check well-formedness here */
a0ed51b3 5323 s += UTF8SKIP(s);
ffc61ed2 5324 }
a0ed51b3 5325 if (off >= 0)
3dab1dad 5326 return NULL;
a0ed51b3
LW
5327 }
5328 else {
5329 while (off++) {
1aa99e6b 5330 if (s > lim) {
a0ed51b3 5331 s--;
ffc61ed2 5332 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 5333 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 5334 s--;
ffc61ed2
JH
5335 }
5336 /* XXX could check well-formedness here */
a0ed51b3
LW
5337 }
5338 else
5339 break;
5340 }
5341 if (off <= 0)
3dab1dad 5342 return NULL;
a0ed51b3
LW
5343 }
5344 return s;
5345}
51371543 5346
51371543 5347static void
acfe0abc 5348restore_pos(pTHX_ void *arg)
51371543 5349{
97aff369 5350 dVAR;
097eb12c 5351 regexp * const rex = (regexp *)arg;
51371543
GS
5352 if (PL_reg_eval_set) {
5353 if (PL_reg_oldsaved) {
4f639d21
DM
5354 rex->subbeg = PL_reg_oldsaved;
5355 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 5356#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 5357 rex->saved_copy = PL_nrs;
ed252734 5358#endif
4f639d21 5359 RX_MATCH_COPIED_on(rex);
51371543
GS
5360 }
5361 PL_reg_magic->mg_len = PL_reg_oldpos;
5362 PL_reg_eval_set = 0;
5363 PL_curpm = PL_reg_oldcurpm;
5364 }
5365}
33b8afdf
JH
5366
5367STATIC void
5368S_to_utf8_substr(pTHX_ register regexp *prog)
5369{
33b8afdf 5370 if (prog->float_substr && !prog->float_utf8) {
097eb12c
AL
5371 SV* const sv = newSVsv(prog->float_substr);
5372 prog->float_utf8 = sv;
33b8afdf
JH
5373 sv_utf8_upgrade(sv);
5374 if (SvTAIL(prog->float_substr))
5375 SvTAIL_on(sv);
5376 if (prog->float_substr == prog->check_substr)
5377 prog->check_utf8 = sv;
5378 }
5379 if (prog->anchored_substr && !prog->anchored_utf8) {
097eb12c
AL
5380 SV* const sv = newSVsv(prog->anchored_substr);
5381 prog->anchored_utf8 = sv;
33b8afdf
JH
5382 sv_utf8_upgrade(sv);
5383 if (SvTAIL(prog->anchored_substr))
5384 SvTAIL_on(sv);
5385 if (prog->anchored_substr == prog->check_substr)
5386 prog->check_utf8 = sv;
5387 }
5388}
5389
5390STATIC void
5391S_to_byte_substr(pTHX_ register regexp *prog)
5392{
97aff369 5393 dVAR;
33b8afdf 5394 if (prog->float_utf8 && !prog->float_substr) {
097eb12c
AL
5395 SV* sv = newSVsv(prog->float_utf8);
5396 prog->float_substr = sv;
33b8afdf
JH
5397 if (sv_utf8_downgrade(sv, TRUE)) {
5398 if (SvTAIL(prog->float_utf8))
5399 SvTAIL_on(sv);
5400 } else {
5401 SvREFCNT_dec(sv);
5402 prog->float_substr = sv = &PL_sv_undef;
5403 }
5404 if (prog->float_utf8 == prog->check_utf8)
5405 prog->check_substr = sv;
5406 }
5407 if (prog->anchored_utf8 && !prog->anchored_substr) {
097eb12c
AL
5408 SV* sv = newSVsv(prog->anchored_utf8);
5409 prog->anchored_substr = sv;
33b8afdf
JH
5410 if (sv_utf8_downgrade(sv, TRUE)) {
5411 if (SvTAIL(prog->anchored_utf8))
5412 SvTAIL_on(sv);
5413 } else {
5414 SvREFCNT_dec(sv);
5415 prog->anchored_substr = sv = &PL_sv_undef;
5416 }
5417 if (prog->anchored_utf8 == prog->check_utf8)
5418 prog->check_substr = sv;
5419 }
5420}
66610fdd
RGS
5421
5422/*
5423 * Local variables:
5424 * c-indentation-style: bsd
5425 * c-basic-offset: 4
5426 * indent-tabs-mode: t
5427 * End:
5428 *
37442d52
RGS
5429 * ex: set ts=8 sts=4 sw=4 noet:
5430 */