This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
remove a compiler warning by making HOPBACKc only hop *back*
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
52657f30
AL
102#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
104 : (U8*)(pos + off)))
105#define HOPBACKc(pos, off) ((char*) \
106 ((PL_reg_match_utf8) \
b9ea4ed6 107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
efb30f32
HS
108 : (pos - off >= PL_bostr) \
109 ? (U8*)(pos - off) \
52657f30 110 : (U8*)NULL) \
efb30f32 111)
efb30f32 112
1aa99e6b 113#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 114#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 115#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 116
1a4fad37
AL
117#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
118 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
119#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
120#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
121#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
122#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
129 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3
JP
132#define HAS_TEXT(rn) ( \
133 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 141 while (JUMPABLE(rn)) \
a84d97b6 142 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 143 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
144 else if (OP(rn) == PLUS) \
145 rn = NEXTOPER(rn); \
a84d97b6
HS
146 else if (OP(rn) == IFMATCH) \
147 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 148 else rn += NEXT_OFF(rn); \
5f80c4cf 149} STMT_END
74750237 150
acfe0abc 151static void restore_pos(pTHX_ void *arg);
51371543 152
76e3520e 153STATIC CHECKPOINT
cea2e8a9 154S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 155{
97aff369 156 dVAR;
a3b680e6 157 const int retval = PL_savestack_ix;
b1ce53c5 158#define REGCP_PAREN_ELEMS 4
a3b680e6 159 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
160 int p;
161
e49a9654
IH
162 if (paren_elems_to_push < 0)
163 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
164
a01268b5 165#define REGCP_OTHER_ELEMS 6
4b3c1a47 166 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 167 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 168/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
169 SSPUSHINT(PL_regendp[p]);
170 SSPUSHINT(PL_regstartp[p]);
3280af22 171 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
172 SSPUSHINT(p);
173 }
b1ce53c5 174/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
175 SSPUSHINT(PL_regsize);
176 SSPUSHINT(*PL_reglastparen);
a01268b5 177 SSPUSHINT(*PL_reglastcloseparen);
3280af22 178 SSPUSHPTR(PL_reginput);
41123dfd
JH
179#define REGCP_FRAME_ELEMS 2
180/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
181 * are needed for the regexp context stack bookkeeping. */
182 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 183 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 184
a0d0e21e
LW
185 return retval;
186}
187
c277df42 188/* These are needed since we do not localize EVAL nodes: */
a3621e74 189# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 190 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 191 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 192
a3621e74 193# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 194 PerlIO_printf(Perl_debug_log, \
faccc32b 195 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 196 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 197
76e3520e 198STATIC char *
097eb12c 199S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 200{
97aff369 201 dVAR;
b1ce53c5 202 I32 i;
a0d0e21e 203 char *input;
b1ce53c5 204
a3621e74
YO
205 GET_RE_DEBUG_FLAGS_DECL;
206
b1ce53c5 207 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 208 i = SSPOPINT;
b1ce53c5
JH
209 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
210 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 211 input = (char *) SSPOPPTR;
a01268b5 212 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
213 *PL_reglastparen = SSPOPINT;
214 PL_regsize = SSPOPINT;
b1ce53c5
JH
215
216 /* Now restore the parentheses context. */
41123dfd
JH
217 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
218 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 219 I32 tmps;
097eb12c 220 U32 paren = (U32)SSPOPINT;
3280af22 221 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
222 PL_regstartp[paren] = SSPOPINT;
223 tmps = SSPOPINT;
3280af22
NIS
224 if (paren <= *PL_reglastparen)
225 PL_regendp[paren] = tmps;
a3621e74 226 DEBUG_EXECUTE_r(
c3464db5 227 PerlIO_printf(Perl_debug_log,
b900a521 228 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 229 (UV)paren, (IV)PL_regstartp[paren],
b900a521 230 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 231 (IV)PL_regendp[paren],
3280af22 232 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 233 );
a0d0e21e 234 }
a3621e74 235 DEBUG_EXECUTE_r(
bb7a0f54 236 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 237 PerlIO_printf(Perl_debug_log,
faccc32b 238 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 239 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
240 }
241 );
daf18116 242#if 1
dafc8851
JH
243 /* It would seem that the similar code in regtry()
244 * already takes care of this, and in fact it is in
245 * a better location to since this code can #if 0-ed out
246 * but the code in regtry() is needed or otherwise tests
247 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
248 * (as of patchlevel 7877) will fail. Then again,
249 * this code seems to be necessary or otherwise
250 * building DynaLoader will fail:
251 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
252 * --jhi */
bb7a0f54 253 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
254 if (i > PL_regsize)
255 PL_regstartp[i] = -1;
256 PL_regendp[i] = -1;
a0d0e21e 257 }
dafc8851 258#endif
a0d0e21e
LW
259 return input;
260}
261
02db2b7b 262#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 263
95b24440 264#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
265 if (paren) { \
266 if (n) { \
267 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
268 PL_regendp[paren] = input - PL_bostr; \
269 } \
270 else \
271 PL_regendp[paren] = -1; \
272 } \
95b24440
DM
273 REGMATCH(next, where); \
274 if (result) \
29d1e993
HS
275 sayYES; \
276 if (paren && n) \
277 PL_regendp[paren] = -1; \
278}
279
280
a687059c 281/*
e50aee73 282 * pregexec and friends
a687059c
LW
283 */
284
76234dfb 285#ifndef PERL_IN_XSUB_RE
a687059c 286/*
c277df42 287 - pregexec - match a regexp against a string
a687059c 288 */
c277df42 289I32
864dbfa3 290Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 291 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
292/* strend: pointer to null at end of string */
293/* strbeg: real beginning of string */
294/* minend: end of match must be >=minend after stringarg. */
295/* nosave: For optimizations. */
296{
297 return
9041c2e3 298 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
299 nosave ? 0 : REXEC_COPY_STR);
300}
76234dfb 301#endif
22e551b9 302
9041c2e3 303/*
cad2e5aa
JH
304 * Need to implement the following flags for reg_anch:
305 *
306 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
307 * USE_INTUIT_ML
308 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
309 * INTUIT_AUTORITATIVE_ML
310 * INTUIT_ONCE_NOML - Intuit can match in one location only.
311 * INTUIT_ONCE_ML
312 *
313 * Another flag for this function: SECOND_TIME (so that float substrs
314 * with giant delta may be not rechecked).
315 */
316
317/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
318
3f7c398e 319/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
320 Otherwise, only SvCUR(sv) is used to get strbeg. */
321
322/* XXXX We assume that strpos is strbeg unless sv. */
323
6eb5f6b9
JH
324/* XXXX Some places assume that there is a fixed substring.
325 An update may be needed if optimizer marks as "INTUITable"
326 RExen without fixed substrings. Similarly, it is assumed that
327 lengths of all the strings are no more than minlen, thus they
328 cannot come from lookahead.
329 (Or minlen should take into account lookahead.) */
330
2c2d71f5
JH
331/* A failure to find a constant substring means that there is no need to make
332 an expensive call to REx engine, thus we celebrate a failure. Similarly,
333 finding a substring too deep into the string means that less calls to
30944b6d
IZ
334 regtry() should be needed.
335
336 REx compiler's optimizer found 4 possible hints:
337 a) Anchored substring;
338 b) Fixed substring;
339 c) Whether we are anchored (beginning-of-line or \G);
340 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 341 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
342 string which does not contradict any of them.
343 */
2c2d71f5 344
6eb5f6b9
JH
345/* Most of decisions we do here should have been done at compile time.
346 The nodes of the REx which we used for the search should have been
347 deleted from the finite automaton. */
348
cad2e5aa
JH
349char *
350Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
351 char *strend, U32 flags, re_scream_pos_data *data)
352{
97aff369 353 dVAR;
b7953727 354 register I32 start_shift = 0;
cad2e5aa 355 /* Should be nonnegative! */
b7953727 356 register I32 end_shift = 0;
2c2d71f5
JH
357 register char *s;
358 register SV *check;
a1933d95 359 char *strbeg;
cad2e5aa 360 char *t;
a3b680e6 361 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 362 I32 ml_anch;
bd61b366
SS
363 register char *other_last = NULL; /* other substr checked before this */
364 char *check_at = NULL; /* check substr found at this pos */
1df70142 365 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 366#ifdef DEBUGGING
890ce7af
AL
367 const char * const i_strpos = strpos;
368 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 369#endif
a3621e74
YO
370
371 GET_RE_DEBUG_FLAGS_DECL;
372
a30b2f1f 373 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 374
b8d68ded 375 if (prog->reganch & ROPT_UTF8) {
a3621e74 376 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
377 "UTF-8 regex...\n"));
378 PL_reg_flags |= RF_utf8;
379 }
380
a3621e74 381 DEBUG_EXECUTE_r({
1df70142 382 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
383 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
384 strpos;
1df70142 385 const int len = PL_reg_match_utf8 ?
bb7a0f54 386 (int)strlen(s) : strend - strpos;
2a782b5b
JH
387 if (!PL_colorset)
388 reginitcolors();
b8d68ded 389 if (PL_reg_match_utf8)
a3621e74 390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 391 "UTF-8 target...\n"));
2a782b5b 392 PerlIO_printf(Perl_debug_log,
a0288114 393 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 394 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
395 prog->precomp,
396 PL_colors[1],
397 (strlen(prog->precomp) > 60 ? "..." : ""),
398 PL_colors[0],
399 (int)(len > 60 ? 60 : len),
400 s, PL_colors[1],
401 (len > 60 ? "..." : "")
402 );
403 });
cad2e5aa 404
c344f387
JH
405 /* CHR_DIST() would be more correct here but it makes things slow. */
406 if (prog->minlen > strend - strpos) {
a3621e74 407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 408 "String too short... [re_intuit_start]\n"));
cad2e5aa 409 goto fail;
2c2d71f5 410 }
a1933d95 411 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 412 PL_regeol = strend;
33b8afdf
JH
413 if (do_utf8) {
414 if (!prog->check_utf8 && prog->check_substr)
415 to_utf8_substr(prog);
416 check = prog->check_utf8;
417 } else {
418 if (!prog->check_substr && prog->check_utf8)
419 to_byte_substr(prog);
420 check = prog->check_substr;
421 }
422 if (check == &PL_sv_undef) {
a3621e74 423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
424 "Non-utf string cannot match utf check string\n"));
425 goto fail;
426 }
2c2d71f5 427 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
428 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
429 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 430 && !multiline ) ); /* Check after \n? */
cad2e5aa 431
7e25d62c
JH
432 if (!ml_anch) {
433 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
434 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 435 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
436 && sv && !SvROK(sv)
437 && (strpos != strbeg)) {
a3621e74 438 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
439 goto fail;
440 }
441 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 442 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 443 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
444 I32 slen;
445
1aa99e6b 446 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
447 if (SvTAIL(check)) {
448 slen = SvCUR(check); /* >= 1 */
cad2e5aa 449
9041c2e3 450 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 451 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 453 goto fail_finish;
cad2e5aa
JH
454 }
455 /* Now should match s[0..slen-2] */
456 slen--;
3f7c398e 457 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 458 || (slen > 1
3f7c398e 459 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 460 report_neq:
a3621e74 461 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
462 goto fail_finish;
463 }
cad2e5aa 464 }
3f7c398e 465 else if (*SvPVX_const(check) != *s
653099ff 466 || ((slen = SvCUR(check)) > 1
3f7c398e 467 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 468 goto report_neq;
c315bfe8 469 check_at = s;
2c2d71f5 470 goto success_at_start;
7e25d62c 471 }
cad2e5aa 472 }
2c2d71f5 473 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 474 s = strpos;
2c2d71f5 475 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 476 end_shift = prog->minlen - start_shift -
653099ff 477 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 478 if (!ml_anch) {
a3b680e6 479 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 480 - (SvTAIL(check) != 0);
a3b680e6 481 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
482
483 if (end_shift < eshift)
484 end_shift = eshift;
485 }
cad2e5aa 486 }
2c2d71f5 487 else { /* Can match at random position */
cad2e5aa
JH
488 ml_anch = 0;
489 s = strpos;
2c2d71f5
JH
490 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
491 /* Should be nonnegative! */
492 end_shift = prog->minlen - start_shift -
653099ff 493 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
494 }
495
2c2d71f5 496#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 497 if (end_shift < 0)
6bbae5e6 498 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
499#endif
500
2c2d71f5
JH
501 restart:
502 /* Find a possible match in the region s..strend by looking for
503 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 504 if (flags & REXEC_SCREAM) {
cad2e5aa 505 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 506 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 507
2c2d71f5
JH
508 if (PL_screamfirst[BmRARE(check)] >= 0
509 || ( BmRARE(check) == '\n'
510 && (BmPREVIOUS(check) == SvCUR(check) - 1)
511 && SvTAIL(check) ))
9041c2e3 512 s = screaminstr(sv, check,
2c2d71f5 513 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 514 else
2c2d71f5 515 goto fail_finish;
4addbd3b
HS
516 /* we may be pointing at the wrong string */
517 if (s && RX_MATCH_COPIED(prog))
3f7c398e 518 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
519 if (data)
520 *data->scream_olds = s;
521 }
f33976b4 522 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
523 s = fbm_instr((U8*)(s + start_shift),
524 (U8*)(strend - end_shift),
7fba1cd6 525 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 526 else
1aa99e6b
IH
527 s = fbm_instr(HOP3(s, start_shift, strend),
528 HOP3(strend, -end_shift, strbeg),
7fba1cd6 529 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
530
531 /* Update the count-of-usability, remove useless subpatterns,
532 unshift s. */
2c2d71f5 533
a0288114 534 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 535 (s ? "Found" : "Did not find"),
33b8afdf 536 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 537 PL_colors[0],
7b0972df 538 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 539 SvPVX_const(check),
2c2d71f5
JH
540 PL_colors[1], (SvTAIL(check) ? "$" : ""),
541 (s ? " at offset " : "...\n") ) );
542
543 if (!s)
544 goto fail_finish;
545
6eb5f6b9
JH
546 check_at = s;
547
2c2d71f5 548 /* Finish the diagnostic message */
a3621e74 549 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
550
551 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
552 Start with the other substr.
553 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 554 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
555 *always* match. Probably should be marked during compile...
556 Probably it is right to do no SCREAM here...
557 */
558
33b8afdf 559 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 560 /* Take into account the "other" substring. */
2c2d71f5
JH
561 /* XXXX May be hopelessly wrong for UTF... */
562 if (!other_last)
6eb5f6b9 563 other_last = strpos;
33b8afdf 564 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
565 do_other_anchored:
566 {
890ce7af
AL
567 char * const last = HOP3c(s, -start_shift, strbeg);
568 char *last1, *last2;
2c2d71f5 569 char *s1 = s;
33b8afdf 570 SV* must;
2c2d71f5 571
2c2d71f5
JH
572 t = s - prog->check_offset_max;
573 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 574 && (!do_utf8
1aa99e6b 575 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 576 && t > strpos)))
6f207bd3 577 NOOP;
2c2d71f5
JH
578 else
579 t = strpos;
1aa99e6b 580 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
581 if (t < other_last) /* These positions already checked */
582 t = other_last;
1aa99e6b 583 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
584 if (last < last1)
585 last1 = last;
586 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
587 /* On end-of-str: see comment below. */
33b8afdf
JH
588 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
589 if (must == &PL_sv_undef) {
590 s = (char*)NULL;
a3621e74 591 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
592 }
593 else
594 s = fbm_instr(
595 (unsigned char*)t,
596 HOP3(HOP3(last1, prog->anchored_offset, strend)
597 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
598 must,
7fba1cd6 599 multiline ? FBMrf_MULTILINE : 0
33b8afdf 600 );
a3621e74 601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 602 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
603 (s ? "Found" : "Contradicts"),
604 PL_colors[0],
33b8afdf
JH
605 (int)(SvCUR(must)
606 - (SvTAIL(must)!=0)),
3f7c398e 607 SvPVX_const(must),
33b8afdf 608 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
609 if (!s) {
610 if (last1 >= last2) {
a3621e74 611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
612 ", giving up...\n"));
613 goto fail_finish;
614 }
a3621e74 615 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 616 ", trying floating at offset %ld...\n",
1aa99e6b
IH
617 (long)(HOP3c(s1, 1, strend) - i_strpos)));
618 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
619 s = HOP3c(last, 1, strend);
2c2d71f5
JH
620 goto restart;
621 }
622 else {
a3621e74 623 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 624 (long)(s - i_strpos)));
1aa99e6b
IH
625 t = HOP3c(s, -prog->anchored_offset, strbeg);
626 other_last = HOP3c(s, 1, strend);
30944b6d 627 s = s1;
2c2d71f5
JH
628 if (t == strpos)
629 goto try_at_start;
2c2d71f5
JH
630 goto try_at_offset;
631 }
30944b6d 632 }
2c2d71f5
JH
633 }
634 else { /* Take into account the floating substring. */
33b8afdf
JH
635 char *last, *last1;
636 char *s1 = s;
637 SV* must;
638
639 t = HOP3c(s, -start_shift, strbeg);
640 last1 = last =
641 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
642 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
643 last = HOP3c(t, prog->float_max_offset, strend);
644 s = HOP3c(t, prog->float_min_offset, strend);
645 if (s < other_last)
646 s = other_last;
2c2d71f5 647 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
648 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
649 /* fbm_instr() takes into account exact value of end-of-str
650 if the check is SvTAIL(ed). Since false positives are OK,
651 and end-of-str is not later than strend we are OK. */
652 if (must == &PL_sv_undef) {
653 s = (char*)NULL;
a3621e74 654 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
655 }
656 else
2c2d71f5 657 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
658 (unsigned char*)last + SvCUR(must)
659 - (SvTAIL(must)!=0),
7fba1cd6 660 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
662 (s ? "Found" : "Contradicts"),
663 PL_colors[0],
664 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 665 SvPVX_const(must),
33b8afdf
JH
666 PL_colors[1], (SvTAIL(must) ? "$" : "")));
667 if (!s) {
668 if (last1 == last) {
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
670 ", giving up...\n"));
671 goto fail_finish;
2c2d71f5 672 }
a3621e74 673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
674 ", trying anchored starting at offset %ld...\n",
675 (long)(s1 + 1 - i_strpos)));
676 other_last = last;
677 s = HOP3c(t, 1, strend);
678 goto restart;
679 }
680 else {
a3621e74 681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
682 (long)(s - i_strpos)));
683 other_last = s; /* Fix this later. --Hugo */
684 s = s1;
685 if (t == strpos)
686 goto try_at_start;
687 goto try_at_offset;
688 }
2c2d71f5 689 }
cad2e5aa 690 }
2c2d71f5
JH
691
692 t = s - prog->check_offset_max;
2c2d71f5 693 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 694 && (!do_utf8
1aa99e6b
IH
695 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
696 && t > strpos))) {
2c2d71f5
JH
697 /* Fixed substring is found far enough so that the match
698 cannot start at strpos. */
699 try_at_offset:
cad2e5aa 700 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
701 /* Eventually fbm_*() should handle this, but often
702 anchored_offset is not 0, so this check will not be wasted. */
703 /* XXXX In the code below we prefer to look for "^" even in
704 presence of anchored substrings. And we search even
705 beyond the found float position. These pessimizations
706 are historical artefacts only. */
707 find_anchor:
2c2d71f5 708 while (t < strend - prog->minlen) {
cad2e5aa 709 if (*t == '\n') {
4ee3650e 710 if (t < check_at - prog->check_offset_min) {
33b8afdf 711 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
712 /* Since we moved from the found position,
713 we definitely contradict the found anchored
30944b6d
IZ
714 substr. Due to the above check we do not
715 contradict "check" substr.
716 Thus we can arrive here only if check substr
717 is float. Redo checking for "other"=="fixed".
718 */
9041c2e3 719 strpos = t + 1;
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 721 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
722 goto do_other_anchored;
723 }
4ee3650e
GS
724 /* We don't contradict the found floating substring. */
725 /* XXXX Why not check for STCLASS? */
cad2e5aa 726 s = t + 1;
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 728 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
729 goto set_useful;
730 }
4ee3650e
GS
731 /* Position contradicts check-string */
732 /* XXXX probably better to look for check-string
733 than for "\n", so one should lower the limit for t? */
a3621e74 734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 735 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 736 other_last = strpos = s = t + 1;
cad2e5aa
JH
737 goto restart;
738 }
739 t++;
740 }
a3621e74 741 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 742 PL_colors[0], PL_colors[1]));
2c2d71f5 743 goto fail_finish;
cad2e5aa 744 }
f5952150 745 else {
a3621e74 746 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 747 PL_colors[0], PL_colors[1]));
f5952150 748 }
cad2e5aa
JH
749 s = t;
750 set_useful:
33b8afdf 751 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
752 }
753 else {
f5952150 754 /* The found string does not prohibit matching at strpos,
2c2d71f5 755 - no optimization of calling REx engine can be performed,
f5952150
GS
756 unless it was an MBOL and we are not after MBOL,
757 or a future STCLASS check will fail this. */
2c2d71f5
JH
758 try_at_start:
759 /* Even in this situation we may use MBOL flag if strpos is offset
760 wrt the start of the string. */
05b4157f 761 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 762 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
763 /* May be due to an implicit anchor of m{.*foo} */
764 && !(prog->reganch & ROPT_IMPLICIT))
765 {
cad2e5aa
JH
766 t = strpos;
767 goto find_anchor;
768 }
a3621e74 769 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 770 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 771 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 772 );
2c2d71f5 773 success_at_start:
30944b6d 774 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
775 && (do_utf8 ? (
776 prog->check_utf8 /* Could be deleted already */
777 && --BmUSEFUL(prog->check_utf8) < 0
778 && (prog->check_utf8 == prog->float_utf8)
779 ) : (
780 prog->check_substr /* Could be deleted already */
781 && --BmUSEFUL(prog->check_substr) < 0
782 && (prog->check_substr == prog->float_substr)
783 )))
66e933ab 784 {
cad2e5aa 785 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
787 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
788 if (do_utf8 ? prog->check_substr : prog->check_utf8)
789 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
790 prog->check_substr = prog->check_utf8 = NULL; /* disable */
791 prog->float_substr = prog->float_utf8 = NULL; /* clear */
792 check = NULL; /* abort */
cad2e5aa 793 s = strpos;
3cf5c195
IZ
794 /* XXXX This is a remnant of the old implementation. It
795 looks wasteful, since now INTUIT can use many
6eb5f6b9 796 other heuristics. */
cad2e5aa
JH
797 prog->reganch &= ~RE_USE_INTUIT;
798 }
799 else
800 s = strpos;
801 }
802
6eb5f6b9
JH
803 /* Last resort... */
804 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
805 if (prog->regstclass) {
806 /* minlen == 0 is possible if regstclass is \b or \B,
807 and the fixed substr is ''$.
808 Since minlen is already taken into account, s+1 is before strend;
809 accidentally, minlen >= 1 guaranties no false positives at s + 1
810 even for \b or \B. But (minlen? 1 : 0) below assumes that
811 regstclass does not come from lookahead... */
812 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
813 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 814 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 815 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 816 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 817 : 1);
a3b680e6 818 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 819 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 820 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
821 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
822 cl_l, strend)
823 : strend);
6eb5f6b9
JH
824
825 t = s;
3b0527fe 826 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
827 if (!s) {
828#ifdef DEBUGGING
cbbf8932 829 const char *what = NULL;
6eb5f6b9
JH
830#endif
831 if (endpos == strend) {
a3621e74 832 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
833 "Could not match STCLASS...\n") );
834 goto fail;
835 }
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 837 "This position contradicts STCLASS...\n") );
653099ff
GS
838 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
839 goto fail;
6eb5f6b9 840 /* Contradict one of substrings */
33b8afdf
JH
841 if (prog->anchored_substr || prog->anchored_utf8) {
842 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 843 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 844 hop_and_restart:
1aa99e6b 845 s = HOP3c(t, 1, strend);
66e933ab
GS
846 if (s + start_shift + end_shift > strend) {
847 /* XXXX Should be taken into account earlier? */
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
849 "Could not match STCLASS...\n") );
850 goto fail;
851 }
5e39e1e5
HS
852 if (!check)
853 goto giveup;
a3621e74 854 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 855 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
856 what, (long)(s + start_shift - i_strpos)) );
857 goto restart;
858 }
66e933ab 859 /* Have both, check_string is floating */
6eb5f6b9
JH
860 if (t + start_shift >= check_at) /* Contradicts floating=check */
861 goto retry_floating_check;
862 /* Recheck anchored substring, but not floating... */
9041c2e3 863 s = check_at;
5e39e1e5
HS
864 if (!check)
865 goto giveup;
a3621e74 866 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 867 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
868 (long)(other_last - i_strpos)) );
869 goto do_other_anchored;
870 }
60e71179
GS
871 /* Another way we could have checked stclass at the
872 current position only: */
873 if (ml_anch) {
874 s = t = t + 1;
5e39e1e5
HS
875 if (!check)
876 goto giveup;
a3621e74 877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 878 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 879 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 880 goto try_at_offset;
66e933ab 881 }
33b8afdf 882 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 883 goto fail;
6eb5f6b9
JH
884 /* Check is floating subtring. */
885 retry_floating_check:
886 t = check_at - start_shift;
a3621e74 887 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
888 goto hop_and_restart;
889 }
b7953727 890 if (t != s) {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 892 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
893 (long)(t - i_strpos), (long)(s - i_strpos))
894 );
895 }
896 else {
a3621e74 897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
898 "Does not contradict STCLASS...\n");
899 );
900 }
6eb5f6b9 901 }
5e39e1e5 902 giveup:
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
904 PL_colors[4], (check ? "Guessed" : "Giving up"),
905 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 906 return s;
2c2d71f5
JH
907
908 fail_finish: /* Substring not found */
33b8afdf
JH
909 if (prog->check_substr || prog->check_utf8) /* could be removed already */
910 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 911 fail:
a3621e74 912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 913 PL_colors[4], PL_colors[5]));
bd61b366 914 return NULL;
cad2e5aa 915}
9661b544 916
6eb5f6b9 917/* We know what class REx starts with. Try to find this position... */
3b0527fe
DM
918/* if reginfo is NULL, its a dryrun */
919
3c3eec57 920STATIC char *
3b0527fe
DM
921S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
922*strend, const regmatch_info *reginfo)
a687059c 923{
27da23d5 924 dVAR;
1df70142 925 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 926 char *m;
d8093b23 927 STRLEN ln;
5dab1207 928 STRLEN lnc;
078c425b 929 register STRLEN uskip;
d8093b23
G
930 unsigned int c1;
931 unsigned int c2;
6eb5f6b9
JH
932 char *e;
933 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 934 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 935
6eb5f6b9
JH
936 /* We know what class it must start with. */
937 switch (OP(c)) {
6eb5f6b9 938 case ANYOF:
388cc4de 939 if (do_utf8) {
078c425b 940 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
941 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
942 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
943 reginclass(prog, c, (U8*)s, 0, do_utf8) :
944 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 945 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
946 goto got_it;
947 else
948 tmp = doevery;
949 }
950 else
951 tmp = 1;
078c425b 952 s += uskip;
388cc4de
HS
953 }
954 }
955 else {
956 while (s < strend) {
957 STRLEN skip = 1;
958
32fc9b6a 959 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
960 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
961 /* The assignment of 2 is intentional:
962 * for the folded sharp s, the skip is 2. */
963 (skip = SHARP_S_SKIP))) {
3b0527fe 964 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
965 goto got_it;
966 else
967 tmp = doevery;
968 }
969 else
970 tmp = 1;
971 s += skip;
972 }
a0d0e21e 973 }
6eb5f6b9 974 break;
f33976b4
DB
975 case CANY:
976 while (s < strend) {
3b0527fe 977 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
978 goto got_it;
979 else
980 tmp = doevery;
981 s++;
982 }
983 break;
6eb5f6b9 984 case EXACTF:
5dab1207
NIS
985 m = STRING(c);
986 ln = STR_LEN(c); /* length to match in octets/bytes */
987 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 988 if (UTF) {
a2a2844f 989 STRLEN ulen1, ulen2;
5dab1207 990 U8 *sm = (U8 *) m;
89ebb4a3
JH
991 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
992 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 993 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
994
995 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
996 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
997
89ebb4a3 998 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 999 0, uniflags);
89ebb4a3 1000 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1001 0, uniflags);
5dab1207
NIS
1002 lnc = 0;
1003 while (sm < ((U8 *) m + ln)) {
1004 lnc++;
1005 sm += UTF8SKIP(sm);
1006 }
1aa99e6b
IH
1007 }
1008 else {
1009 c1 = *(U8*)m;
1010 c2 = PL_fold[c1];
1011 }
6eb5f6b9
JH
1012 goto do_exactf;
1013 case EXACTFL:
5dab1207
NIS
1014 m = STRING(c);
1015 ln = STR_LEN(c);
1016 lnc = (I32) ln;
d8093b23 1017 c1 = *(U8*)m;
6eb5f6b9
JH
1018 c2 = PL_fold_locale[c1];
1019 do_exactf:
db12adc6 1020 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1021
3b0527fe 1022 if (!reginfo && e < s)
6eb5f6b9 1023 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1024
60a8b682
JH
1025 /* The idea in the EXACTF* cases is to first find the
1026 * first character of the EXACTF* node and then, if
1027 * necessary, case-insensitively compare the full
1028 * text of the node. The c1 and c2 are the first
1029 * characters (though in Unicode it gets a bit
1030 * more complicated because there are more cases
7f16dd3d
JH
1031 * than just upper and lower: one needs to use
1032 * the so-called folding case for case-insensitive
1033 * matching (called "loose matching" in Unicode).
1034 * ibcmp_utf8() will do just that. */
60a8b682 1035
1aa99e6b 1036 if (do_utf8) {
575cac57 1037 UV c, f;
89ebb4a3 1038 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1039 STRLEN len, foldlen;
4ad0818d 1040 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1041 if (c1 == c2) {
5dab1207
NIS
1042 /* Upper and lower of 1st char are equal -
1043 * probably not a "letter". */
1aa99e6b 1044 while (s <= e) {
89ebb4a3 1045 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1046 uniflags);
80aecb99
JH
1047 if ( c == c1
1048 && (ln == len ||
66423254 1049 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1050 m, (char **)0, ln, (bool)UTF))
3b0527fe 1051 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1052 goto got_it;
80aecb99 1053 else {
1df70142 1054 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1055 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1056 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1057 if ( f != c
1058 && (f == c1 || f == c2)
1059 && (ln == foldlen ||
66423254
JH
1060 !ibcmp_utf8((char *) foldbuf,
1061 (char **)0, foldlen, do_utf8,
d07ddd77 1062 m,
eb160463 1063 (char **)0, ln, (bool)UTF))
3b0527fe 1064 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1065 goto got_it;
1066 }
1aa99e6b
IH
1067 s += len;
1068 }
09091399
JH
1069 }
1070 else {
1aa99e6b 1071 while (s <= e) {
89ebb4a3 1072 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1073 uniflags);
80aecb99 1074
60a8b682 1075 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1076 * Note that not all the possible combinations
1077 * are handled here: some of them are handled
1078 * by the standard folding rules, and some of
1079 * them (the character class or ANYOF cases)
1080 * are handled during compiletime in
1081 * regexec.c:S_regclass(). */
880bd946
JH
1082 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1083 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1084 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1085
1086 if ( (c == c1 || c == c2)
1087 && (ln == len ||
66423254 1088 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1089 m, (char **)0, ln, (bool)UTF))
3b0527fe 1090 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1091 goto got_it;
80aecb99 1092 else {
1df70142 1093 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1094 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1095 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1096 if ( f != c
1097 && (f == c1 || f == c2)
1098 && (ln == foldlen ||
a6872d42 1099 !ibcmp_utf8((char *) foldbuf,
66423254 1100 (char **)0, foldlen, do_utf8,
d07ddd77 1101 m,
eb160463 1102 (char **)0, ln, (bool)UTF))
3b0527fe 1103 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1104 goto got_it;
1105 }
1aa99e6b
IH
1106 s += len;
1107 }
09091399 1108 }
1aa99e6b
IH
1109 }
1110 else {
1111 if (c1 == c2)
1112 while (s <= e) {
1113 if ( *(U8*)s == c1
1114 && (ln == 1 || !(OP(c) == EXACTF
1115 ? ibcmp(s, m, ln)
1116 : ibcmp_locale(s, m, ln)))
3b0527fe 1117 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1118 goto got_it;
1119 s++;
1120 }
1121 else
1122 while (s <= e) {
1123 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1124 && (ln == 1 || !(OP(c) == EXACTF
1125 ? ibcmp(s, m, ln)
1126 : ibcmp_locale(s, m, ln)))
3b0527fe 1127 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1128 goto got_it;
1129 s++;
1130 }
b3c9acc1
IZ
1131 }
1132 break;
bbce6d69 1133 case BOUNDL:
3280af22 1134 PL_reg_flags |= RF_tainted;
bbce6d69 1135 /* FALL THROUGH */
a0d0e21e 1136 case BOUND:
ffc61ed2 1137 if (do_utf8) {
12d33761 1138 if (s == PL_bostr)
ffc61ed2
JH
1139 tmp = '\n';
1140 else {
6136c704 1141 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1142 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1143 }
1144 tmp = ((OP(c) == BOUND ?
9041c2e3 1145 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1146 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1147 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1148 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1149 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1150 isALNUM_LC_utf8((U8*)s)))
1151 {
1152 tmp = !tmp;
3b0527fe 1153 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1154 goto got_it;
1155 }
078c425b 1156 s += uskip;
a687059c 1157 }
a0d0e21e 1158 }
667bb95a 1159 else {
12d33761 1160 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1161 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1162 while (s < strend) {
1163 if (tmp ==
1164 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1165 tmp = !tmp;
3b0527fe 1166 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1167 goto got_it;
1168 }
1169 s++;
a0ed51b3 1170 }
a0ed51b3 1171 }
3b0527fe 1172 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1173 goto got_it;
1174 break;
bbce6d69 1175 case NBOUNDL:
3280af22 1176 PL_reg_flags |= RF_tainted;
bbce6d69 1177 /* FALL THROUGH */
a0d0e21e 1178 case NBOUND:
ffc61ed2 1179 if (do_utf8) {
12d33761 1180 if (s == PL_bostr)
ffc61ed2
JH
1181 tmp = '\n';
1182 else {
6136c704 1183 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1185 }
1186 tmp = ((OP(c) == NBOUND ?
9041c2e3 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1188 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1189 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1190 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1191 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1192 isALNUM_LC_utf8((U8*)s)))
1193 tmp = !tmp;
3b0527fe 1194 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1195 goto got_it;
078c425b 1196 s += uskip;
ffc61ed2 1197 }
a0d0e21e 1198 }
667bb95a 1199 else {
12d33761 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 tmp = !tmp;
3b0527fe 1207 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1208 goto got_it;
1209 s++;
1210 }
a0ed51b3 1211 }
3b0527fe 1212 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1213 goto got_it;
1214 break;
a0d0e21e 1215 case ALNUM:
ffc61ed2 1216 if (do_utf8) {
1a4fad37 1217 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1218 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1220 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
bbce6d69 1225 else
ffc61ed2 1226 tmp = 1;
078c425b 1227 s += uskip;
bbce6d69 1228 }
bbce6d69 1229 }
ffc61ed2
JH
1230 else {
1231 while (s < strend) {
1232 if (isALNUM(*s)) {
3b0527fe 1233 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
a0ed51b3 1238 else
ffc61ed2
JH
1239 tmp = 1;
1240 s++;
a0ed51b3 1241 }
a0ed51b3
LW
1242 }
1243 break;
bbce6d69 1244 case ALNUML:
3280af22 1245 PL_reg_flags |= RF_tainted;
ffc61ed2 1246 if (do_utf8) {
078c425b 1247 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1248 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1249 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a687059c 1254 else
ffc61ed2 1255 tmp = 1;
078c425b 1256 s += uskip;
a0d0e21e 1257 }
a0d0e21e 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
3b0527fe 1262 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
a0d0e21e 1273 case NALNUM:
ffc61ed2 1274 if (do_utf8) {
1a4fad37 1275 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1276 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1278 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
bbce6d69 1283 else
ffc61ed2 1284 tmp = 1;
078c425b 1285 s += uskip;
bbce6d69 1286 }
bbce6d69 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (!isALNUM(*s)) {
3b0527fe 1291 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
bbce6d69 1302 case NALNUML:
3280af22 1303 PL_reg_flags |= RF_tainted;
ffc61ed2 1304 if (do_utf8) {
078c425b 1305 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1306 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1307 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a687059c 1312 else
ffc61ed2 1313 tmp = 1;
078c425b 1314 s += uskip;
a687059c 1315 }
a0d0e21e 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
3b0527fe 1320 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
a0d0e21e 1331 case SPACE:
ffc61ed2 1332 if (do_utf8) {
1a4fad37 1333 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1334 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1336 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0d0e21e 1341 else
ffc61ed2 1342 tmp = 1;
078c425b 1343 s += uskip;
2304df62 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (isSPACE(*s)) {
3b0527fe 1349 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
bbce6d69 1360 case SPACEL:
3280af22 1361 PL_reg_flags |= RF_tainted;
ffc61ed2 1362 if (do_utf8) {
078c425b 1363 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1365 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
bbce6d69 1370 else
ffc61ed2 1371 tmp = 1;
078c425b 1372 s += uskip;
bbce6d69 1373 }
bbce6d69 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
3b0527fe 1378 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
a0d0e21e 1389 case NSPACE:
ffc61ed2 1390 if (do_utf8) {
1a4fad37 1391 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1392 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1394 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0d0e21e 1399 else
ffc61ed2 1400 tmp = 1;
078c425b 1401 s += uskip;
a687059c 1402 }
a0d0e21e 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (!isSPACE(*s)) {
3b0527fe 1407 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
bbce6d69 1418 case NSPACEL:
3280af22 1419 PL_reg_flags |= RF_tainted;
ffc61ed2 1420 if (do_utf8) {
078c425b 1421 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1423 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
bbce6d69 1428 else
ffc61ed2 1429 tmp = 1;
078c425b 1430 s += uskip;
bbce6d69 1431 }
bbce6d69 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
3b0527fe 1436 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
a0d0e21e 1447 case DIGIT:
ffc61ed2 1448 if (do_utf8) {
1a4fad37 1449 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1450 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1452 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0d0e21e 1457 else
ffc61ed2 1458 tmp = 1;
078c425b 1459 s += uskip;
2b69d0c2 1460 }
a0d0e21e 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (isDIGIT(*s)) {
3b0527fe 1465 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
b8c5462f
JH
1476 case DIGITL:
1477 PL_reg_flags |= RF_tainted;
ffc61ed2 1478 if (do_utf8) {
078c425b 1479 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1480 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1481 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
b8c5462f 1486 else
ffc61ed2 1487 tmp = 1;
078c425b 1488 s += uskip;
b8c5462f 1489 }
b8c5462f 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
3b0527fe 1494 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
b8c5462f 1502 }
b8c5462f
JH
1503 }
1504 break;
a0d0e21e 1505 case NDIGIT:
ffc61ed2 1506 if (do_utf8) {
1a4fad37 1507 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1508 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1510 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0d0e21e 1515 else
ffc61ed2 1516 tmp = 1;
078c425b 1517 s += uskip;
a687059c 1518 }
a0d0e21e 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (!isDIGIT(*s)) {
3b0527fe 1523 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0ed51b3 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
a0ed51b3 1531 }
a0ed51b3
LW
1532 }
1533 break;
b8c5462f
JH
1534 case NDIGITL:
1535 PL_reg_flags |= RF_tainted;
ffc61ed2 1536 if (do_utf8) {
078c425b 1537 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1538 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1539 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2 1545 tmp = 1;
078c425b 1546 s += uskip;
b8c5462f 1547 }
a0ed51b3 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
3b0527fe 1552 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
cf93c79d 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
b8c5462f 1560 }
b8c5462f
JH
1561 }
1562 break;
b3c9acc1 1563 default:
3c3eec57
GS
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1565 break;
d6a28714 1566 }
6eb5f6b9
JH
1567 return 0;
1568 got_it:
1569 return s;
1570}
1571
1572/*
1573 - regexec_flags - match a regexp against a string
1574 */
1575I32
1576Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578/* strend: pointer to null at end of string */
1579/* strbeg: real beginning of string */
1580/* minend: end of match must be >=minend after stringarg. */
1581/* data: May be used for some additional optimizations. */
1582/* nosave: For optimizations. */
1583{
97aff369 1584 dVAR;
6eb5f6b9
JH
1585 register char *s;
1586 register regnode *c;
1587 register char *startpos = stringarg;
6eb5f6b9
JH
1588 I32 minlen; /* must match at least this many chars */
1589 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1590 I32 end_shift = 0; /* Same for the end. */ /* CC */
1591 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1592 char *scream_olds = NULL;
6eb5f6b9 1593 SV* oreplsv = GvSV(PL_replgv);
1df70142 1594 const bool do_utf8 = DO_UTF8(sv);
2757e526 1595 I32 multiline;
2a782b5b 1596#ifdef DEBUGGING
2757e526
JH
1597 SV* dsv0;
1598 SV* dsv1;
2a782b5b 1599#endif
3b0527fe 1600 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1601
1602 GET_RE_DEBUG_FLAGS_DECL;
1603
9d4ba2ae 1604 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1605
1606 /* Be paranoid... */
1607 if (prog == NULL || startpos == NULL) {
1608 Perl_croak(aTHX_ "NULL regexp parameter");
1609 return 0;
1610 }
1611
2757e526 1612 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1613 reginfo.prog = prog;
2757e526
JH
1614
1615#ifdef DEBUGGING
1616 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1617 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1618#endif
1619
bac06658
JH
1620 RX_MATCH_UTF8_set(prog, do_utf8);
1621
6eb5f6b9 1622 minlen = prog->minlen;
61a36c01 1623 if (strend - startpos < minlen) {
a3621e74 1624 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1625 "String too short [regexec_flags]...\n"));
1626 goto phooey;
1aa99e6b 1627 }
6eb5f6b9 1628
6eb5f6b9
JH
1629 /* Check validity of program. */
1630 if (UCHARAT(prog->program) != REG_MAGIC) {
1631 Perl_croak(aTHX_ "corrupted regexp program");
1632 }
1633
1634 PL_reg_flags = 0;
1635 PL_reg_eval_set = 0;
1636 PL_reg_maxiter = 0;
1637
1638 if (prog->reganch & ROPT_UTF8)
1639 PL_reg_flags |= RF_utf8;
1640
1641 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1642 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1643 PL_bostr = strbeg;
3b0527fe 1644 reginfo.sv = sv;
6eb5f6b9
JH
1645
1646 /* Mark end of line for $ (and such) */
1647 PL_regeol = strend;
1648
1649 /* see how far we have to get to not match where we matched before */
3b0527fe 1650 reginfo.till = startpos+minend;
6eb5f6b9 1651
6eb5f6b9
JH
1652 /* If there is a "must appear" string, look for it. */
1653 s = startpos;
1654
3b0527fe 1655 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1656 MAGIC *mg;
1657
1658 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1659 reginfo.ganch = startpos;
6eb5f6b9
JH
1660 else if (sv && SvTYPE(sv) >= SVt_PVMG
1661 && SvMAGIC(sv)
14befaf4
DM
1662 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1663 && mg->mg_len >= 0) {
3b0527fe 1664 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1665 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1666 if (s > reginfo.ganch)
6eb5f6b9 1667 goto phooey;
3b0527fe 1668 s = reginfo.ganch;
6eb5f6b9
JH
1669 }
1670 }
1671 else /* pos() not defined */
3b0527fe 1672 reginfo.ganch = strbeg;
6eb5f6b9
JH
1673 }
1674
a0714e2c 1675 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1676 re_scream_pos_data d;
1677
1678 d.scream_olds = &scream_olds;
1679 d.scream_pos = &scream_pos;
1680 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1681 if (!s) {
a3621e74 1682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1683 goto phooey; /* not present */
3fa9c3d7 1684 }
6eb5f6b9
JH
1685 }
1686
a3621e74 1687 DEBUG_EXECUTE_r({
1df70142
AL
1688 const char * const s0 = UTF
1689 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1690 UNI_DISPLAY_REGEX)
1691 : prog->precomp;
bb7a0f54 1692 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1693 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1694 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1695 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1696 if (!PL_colorset)
1697 reginitcolors();
1698 PerlIO_printf(Perl_debug_log,
a0288114 1699 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1700 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1701 len0, len0, s0,
2a782b5b 1702 PL_colors[1],
9e55ce06 1703 len0 > 60 ? "..." : "",
2a782b5b 1704 PL_colors[0],
9e55ce06
JH
1705 (int)(len1 > 60 ? 60 : len1),
1706 s1, PL_colors[1],
1707 (len1 > 60 ? "..." : "")
2a782b5b
JH
1708 );
1709 });
6eb5f6b9
JH
1710
1711 /* Simplest case: anchored match need be tried only once. */
1712 /* [unless only anchor is BOL and multiline is set] */
1713 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1714 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1715 goto got_it;
7fba1cd6 1716 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1717 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1718 {
1719 char *end;
1720
1721 if (minlen)
1722 dontbother = minlen - 1;
1aa99e6b 1723 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1724 /* for multiline we only have to try after newlines */
33b8afdf 1725 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1726 if (s == startpos)
1727 goto after_try;
1728 while (1) {
3b0527fe 1729 if (regtry(&reginfo, s))
6eb5f6b9
JH
1730 goto got_it;
1731 after_try:
1732 if (s >= end)
1733 goto phooey;
1734 if (prog->reganch & RE_USE_INTUIT) {
1735 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1736 if (!s)
1737 goto phooey;
1738 }
1739 else
1740 s++;
1741 }
1742 } else {
1743 if (s > startpos)
1744 s--;
1745 while (s < end) {
1746 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1747 if (regtry(&reginfo, s))
6eb5f6b9
JH
1748 goto got_it;
1749 }
1750 }
1751 }
1752 }
1753 goto phooey;
1754 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1755 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1756 goto got_it;
1757 goto phooey;
1758 }
1759
1760 /* Messy cases: unanchored match. */
33b8afdf 1761 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1762 /* we have /x+whatever/ */
1763 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1764 char ch;
bf93d4cc
GS
1765#ifdef DEBUGGING
1766 int did_match = 0;
1767#endif
33b8afdf
JH
1768 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1769 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1770 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1771
1aa99e6b 1772 if (do_utf8) {
6eb5f6b9
JH
1773 while (s < strend) {
1774 if (*s == ch) {
a3621e74 1775 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1776 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1777 s += UTF8SKIP(s);
1778 while (s < strend && *s == ch)
1779 s += UTF8SKIP(s);
1780 }
1781 s += UTF8SKIP(s);
1782 }
1783 }
1784 else {
1785 while (s < strend) {
1786 if (*s == ch) {
a3621e74 1787 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1788 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1789 s++;
1790 while (s < strend && *s == ch)
1791 s++;
1792 }
1793 s++;
1794 }
1795 }
a3621e74 1796 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1797 PerlIO_printf(Perl_debug_log,
b7953727
JH
1798 "Did not find anchored character...\n")
1799 );
6eb5f6b9 1800 }
a0714e2c
SS
1801 else if (prog->anchored_substr != NULL
1802 || prog->anchored_utf8 != NULL
1803 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1804 && prog->float_max_offset < strend - s)) {
1805 SV *must;
1806 I32 back_max;
1807 I32 back_min;
1808 char *last;
6eb5f6b9 1809 char *last1; /* Last position checked before */
bf93d4cc
GS
1810#ifdef DEBUGGING
1811 int did_match = 0;
1812#endif
33b8afdf
JH
1813 if (prog->anchored_substr || prog->anchored_utf8) {
1814 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1815 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1816 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1817 back_max = back_min = prog->anchored_offset;
1818 } else {
1819 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1821 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1822 back_max = prog->float_max_offset;
1823 back_min = prog->float_min_offset;
1824 }
1825 if (must == &PL_sv_undef)
1826 /* could not downgrade utf8 check substring, so must fail */
1827 goto phooey;
1828
1829 last = HOP3c(strend, /* Cannot start after this */
1830 -(I32)(CHR_SVLEN(must)
1831 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1832
1833 if (s > PL_bostr)
1834 last1 = HOPc(s, -1);
1835 else
1836 last1 = s - 1; /* bogus */
1837
a0288114 1838 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1839 check_substr==must. */
1840 scream_pos = -1;
1841 dontbother = end_shift;
1842 strend = HOPc(strend, -dontbother);
1843 while ( (s <= last) &&
9041c2e3 1844 ((flags & REXEC_SCREAM)
1aa99e6b 1845 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1846 end_shift, &scream_pos, 0))
1aa99e6b 1847 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1848 (unsigned char*)strend, must,
7fba1cd6 1849 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1850 /* we may be pointing at the wrong string */
1851 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1852 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1853 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1854 if (HOPc(s, -back_max) > last1) {
1855 last1 = HOPc(s, -back_min);
1856 s = HOPc(s, -back_max);
1857 }
1858 else {
52657f30 1859 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1860
1861 last1 = HOPc(s, -back_min);
52657f30 1862 s = t;
6eb5f6b9 1863 }
1aa99e6b 1864 if (do_utf8) {
6eb5f6b9 1865 while (s <= last1) {
3b0527fe 1866 if (regtry(&reginfo, s))
6eb5f6b9
JH
1867 goto got_it;
1868 s += UTF8SKIP(s);
1869 }
1870 }
1871 else {
1872 while (s <= last1) {
3b0527fe 1873 if (regtry(&reginfo, s))
6eb5f6b9
JH
1874 goto got_it;
1875 s++;
1876 }
1877 }
1878 }
a3621e74 1879 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1880 PerlIO_printf(Perl_debug_log,
a0288114 1881 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1882 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1883 ? "anchored" : "floating"),
1884 PL_colors[0],
1885 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1886 SvPVX_const(must),
b7953727
JH
1887 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1888 );
6eb5f6b9
JH
1889 goto phooey;
1890 }
155aba94 1891 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1892 if (minlen) {
1893 I32 op = (U8)OP(prog->regstclass);
66e933ab 1894 /* don't bother with what can't match */
f14c76ed
RGS
1895 if (PL_regkind[op] != EXACT && op != CANY)
1896 strend = HOPc(strend, -(minlen - 1));
1897 }
a3621e74 1898 DEBUG_EXECUTE_r({
ffc61ed2 1899 SV *prop = sv_newmortal();
cfd0369c
NC
1900 const char *s0;
1901 const char *s1;
9e55ce06
JH
1902 int len0;
1903 int len1;
1904
32fc9b6a 1905 regprop(prog, prop, c);
9e55ce06 1906 s0 = UTF ?
3f7c398e 1907 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1908 UNI_DISPLAY_REGEX) :
cfd0369c 1909 SvPVX_const(prop);
9e55ce06
JH
1910 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1911 s1 = UTF ?
c728cb41 1912 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 1913 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 1914 PerlIO_printf(Perl_debug_log,
a0288114 1915 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1916 len0, len0, s0,
1917 len1, len1, s1);
ffc61ed2 1918 });
3b0527fe 1919 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1920 goto got_it;
a3621e74 1921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1922 }
1923 else {
1924 dontbother = 0;
a0714e2c 1925 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1926 /* Trim the end. */
d6a28714 1927 char *last;
33b8afdf
JH
1928 SV* float_real;
1929
1930 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1931 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1932 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1933
1934 if (flags & REXEC_SCREAM) {
33b8afdf 1935 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1936 end_shift, &scream_pos, 1); /* last one */
1937 if (!last)
ffc61ed2 1938 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1939 /* we may be pointing at the wrong string */
1940 else if (RX_MATCH_COPIED(prog))
3f7c398e 1941 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1942 }
d6a28714
JH
1943 else {
1944 STRLEN len;
cfd0369c 1945 const char * const little = SvPV_const(float_real, len);
d6a28714 1946
33b8afdf 1947 if (SvTAIL(float_real)) {
d6a28714
JH
1948 if (memEQ(strend - len + 1, little, len - 1))
1949 last = strend - len + 1;
7fba1cd6 1950 else if (!multiline)
9041c2e3 1951 last = memEQ(strend - len, little, len)
bd61b366 1952 ? strend - len : NULL;
b8c5462f 1953 else
d6a28714
JH
1954 goto find_last;
1955 } else {
1956 find_last:
9041c2e3 1957 if (len)
d6a28714 1958 last = rninstr(s, strend, little, little + len);
b8c5462f 1959 else
a0288114 1960 last = strend; /* matching "$" */
b8c5462f 1961 }
b8c5462f 1962 }
bf93d4cc 1963 if (last == NULL) {
a3621e74 1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1965 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1966 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1967 goto phooey; /* Should not happen! */
1968 }
d6a28714
JH
1969 dontbother = strend - last + prog->float_min_offset;
1970 }
1971 if (minlen && (dontbother < minlen))
1972 dontbother = minlen - 1;
1973 strend -= dontbother; /* this one's always in bytes! */
1974 /* We don't know much -- general case. */
1aa99e6b 1975 if (do_utf8) {
d6a28714 1976 for (;;) {
3b0527fe 1977 if (regtry(&reginfo, s))
d6a28714
JH
1978 goto got_it;
1979 if (s >= strend)
1980 break;
b8c5462f 1981 s += UTF8SKIP(s);
d6a28714
JH
1982 };
1983 }
1984 else {
1985 do {
3b0527fe 1986 if (regtry(&reginfo, s))
d6a28714
JH
1987 goto got_it;
1988 } while (s++ < strend);
1989 }
1990 }
1991
1992 /* Failure. */
1993 goto phooey;
1994
1995got_it:
1996 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1997
1998 if (PL_reg_eval_set) {
1999 /* Preserve the current value of $^R */
2000 if (oreplsv != GvSV(PL_replgv))
2001 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2002 restored, the value remains
2003 the same. */
4f639d21 2004 restore_pos(aTHX_ prog);
d6a28714
JH
2005 }
2006
2007 /* make sure $`, $&, $', and $digit will work later */
2008 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2009 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2010 if (flags & REXEC_COPY_STR) {
2011 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2012#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2013 if ((SvIsCOW(sv)
2014 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2015 if (DEBUG_C_TEST) {
2016 PerlIO_printf(Perl_debug_log,
2017 "Copy on write: regexp capture, type %d\n",
2018 (int) SvTYPE(sv));
2019 }
2020 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2021 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2022 assert (SvPOKp(prog->saved_copy));
2023 } else
2024#endif
2025 {
2026 RX_MATCH_COPIED_on(prog);
2027 s = savepvn(strbeg, i);
2028 prog->subbeg = s;
2029 }
d6a28714 2030 prog->sublen = i;
d6a28714
JH
2031 }
2032 else {
2033 prog->subbeg = strbeg;
2034 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2035 }
2036 }
9041c2e3 2037
d6a28714
JH
2038 return 1;
2039
2040phooey:
a3621e74 2041 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2042 PL_colors[4], PL_colors[5]));
d6a28714 2043 if (PL_reg_eval_set)
4f639d21 2044 restore_pos(aTHX_ prog);
d6a28714
JH
2045 return 0;
2046}
2047
2048/*
2049 - regtry - try match at specific point
2050 */
2051STATIC I32 /* 0 failure, 1 success */
3b0527fe 2052S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2053{
97aff369 2054 dVAR;
d6a28714
JH
2055 register I32 *sp;
2056 register I32 *ep;
2057 CHECKPOINT lastcp;
3b0527fe 2058 regexp *prog = reginfo->prog;
a3621e74 2059 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2060
02db2b7b
IZ
2061#ifdef DEBUGGING
2062 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2063#endif
d6a28714
JH
2064 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2065 MAGIC *mg;
2066
2067 PL_reg_eval_set = RS_init;
a3621e74 2068 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2069 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2070 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2071 ));
e8347627 2072 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2073 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2074 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2075 SAVETMPS;
2076 /* Apparently this is not needed, judging by wantarray. */
e8347627 2077 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2078 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2079
3b0527fe 2080 if (reginfo->sv) {
d6a28714 2081 /* Make $_ available to executed code. */
3b0527fe 2082 if (reginfo->sv != DEFSV) {
59f00321 2083 SAVE_DEFSV;
3b0527fe 2084 DEFSV = reginfo->sv;
b8c5462f 2085 }
d6a28714 2086
3b0527fe
DM
2087 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2088 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2089 /* prepare for quick setting of pos */
d300d9fa
NC
2090#ifdef PERL_OLD_COPY_ON_WRITE
2091 if (SvIsCOW(sv))
2092 sv_force_normal_flags(sv, 0);
2093#endif
3b0527fe 2094 mg = sv_magicext(reginfo->sv, (SV*)0, PERL_MAGIC_regex_global,
d300d9fa 2095 &PL_vtbl_mglob, NULL, 0);
d6a28714 2096 mg->mg_len = -1;
b8c5462f 2097 }
d6a28714
JH
2098 PL_reg_magic = mg;
2099 PL_reg_oldpos = mg->mg_len;
4f639d21 2100 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2101 }
09687e5a 2102 if (!PL_reg_curpm) {
a02a5408 2103 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2104#ifdef USE_ITHREADS
2105 {
2106 SV* repointer = newSViv(0);
577e12cc 2107 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2108 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2109 av_push(PL_regex_padav,repointer);
2110 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2111 PL_regex_pad = AvARRAY(PL_regex_padav);
2112 }
2113#endif
2114 }
aaa362c4 2115 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2116 PL_reg_oldcurpm = PL_curpm;
2117 PL_curpm = PL_reg_curpm;
2118 if (RX_MATCH_COPIED(prog)) {
2119 /* Here is a serious problem: we cannot rewrite subbeg,
2120 since it may be needed if this match fails. Thus
2121 $` inside (?{}) could fail... */
2122 PL_reg_oldsaved = prog->subbeg;
2123 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2124#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2125 PL_nrs = prog->saved_copy;
2126#endif
d6a28714
JH
2127 RX_MATCH_COPIED_off(prog);
2128 }
2129 else
bd61b366 2130 PL_reg_oldsaved = NULL;
d6a28714
JH
2131 prog->subbeg = PL_bostr;
2132 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2133 }
973dddac 2134 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2135 PL_reginput = startpos;
2136 PL_regstartp = prog->startp;
2137 PL_regendp = prog->endp;
2138 PL_reglastparen = &prog->lastparen;
a01268b5 2139 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2140 prog->lastparen = 0;
03994de8 2141 prog->lastcloseparen = 0;
d6a28714 2142 PL_regsize = 0;
a3621e74 2143 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2144 if (PL_reg_start_tmpl <= prog->nparens) {
2145 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2146 if(PL_reg_start_tmp)
2147 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2148 else
a02a5408 2149 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2150 }
2151
2152 /* XXXX What this code is doing here?!!! There should be no need
2153 to do this again and again, PL_reglastparen should take care of
3dd2943c 2154 this! --ilya*/
dafc8851
JH
2155
2156 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2157 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2158 * PL_reglastparen), is not needed at all by the test suite
2159 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2160 * enough, for building DynaLoader, or otherwise this
2161 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2162 * will happen. Meanwhile, this code *is* needed for the
2163 * above-mentioned test suite tests to succeed. The common theme
2164 * on those tests seems to be returning null fields from matches.
2165 * --jhi */
dafc8851 2166#if 1
d6a28714
JH
2167 sp = prog->startp;
2168 ep = prog->endp;
2169 if (prog->nparens) {
097eb12c 2170 register I32 i;
eb160463 2171 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2172 *++sp = -1;
2173 *++ep = -1;
2174 }
2175 }
dafc8851 2176#endif
02db2b7b 2177 REGCP_SET(lastcp);
3b0527fe 2178 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2179 prog->endp[0] = PL_reginput - PL_bostr;
2180 return 1;
2181 }
02db2b7b 2182 REGCP_UNWIND(lastcp);
d6a28714
JH
2183 return 0;
2184}
2185
02db2b7b
IZ
2186#define RE_UNWIND_BRANCH 1
2187#define RE_UNWIND_BRANCHJ 2
2188
2189union re_unwind_t;
2190
2191typedef struct { /* XX: makes sense to enlarge it... */
2192 I32 type;
2193 I32 prev;
2194 CHECKPOINT lastcp;
2195} re_unwind_generic_t;
2196
2197typedef struct {
2198 I32 type;
2199 I32 prev;
2200 CHECKPOINT lastcp;
2201 I32 lastparen;
2202 regnode *next;
2203 char *locinput;
2204 I32 nextchr;
3a2830be 2205 int minmod;
02db2b7b
IZ
2206#ifdef DEBUGGING
2207 int regindent;
2208#endif
2209} re_unwind_branch_t;
2210
2211typedef union re_unwind_t {
2212 I32 type;
2213 re_unwind_generic_t generic;
2214 re_unwind_branch_t branch;
2215} re_unwind_t;
2216
8ba1375e
MJD
2217#define sayYES goto yes
2218#define sayNO goto no
e0f9d4a8 2219#define sayNO_ANYOF goto no_anyof
8ba1375e 2220#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2221#define sayNO_FINAL goto no_final
2222#define sayNO_SILENT goto do_no
2223#define saySAME(x) if (x) goto yes; else goto no
2224
3ab3c9b4
HS
2225#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2226#define POSCACHE_SEEN 1 /* we know what we're caching */
2227#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2228
3ab3c9b4 2229#define CACHEsayYES STMT_START { \
d8319b27 2230 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2231 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2232 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2233 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2234 } \
2235 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2236 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2237 } \
2238 else { \
3ab3c9b4
HS
2239 /* cache records failure, but this is success */ \
2240 DEBUG_r( \
2241 PerlIO_printf(Perl_debug_log, \
2242 "%*s (remove success from failure cache)\n", \
2243 REPORT_CODE_OFF+PL_regindent*2, "") \
2244 ); \
d8319b27 2245 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2246 } \
2247 } \
2248 sayYES; \
2249} STMT_END
7409bbd3 2250
3ab3c9b4 2251#define CACHEsayNO STMT_START { \
d8319b27 2252 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2253 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2254 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2255 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2256 } \
2257 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2258 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2259 } \
2260 else { \
3ab3c9b4
HS
2261 /* cache records success, but this is failure */ \
2262 DEBUG_r( \
2263 PerlIO_printf(Perl_debug_log, \
2264 "%*s (remove failure from success cache)\n", \
2265 REPORT_CODE_OFF+PL_regindent*2, "") \
2266 ); \
d8319b27 2267 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2268 } \
2269 } \
2270 sayNO; \
2271} STMT_END
2272
a3621e74
YO
2273/* this is used to determine how far from the left messages like
2274 'failed...' are printed. Currently 29 makes these messages line
2275 up with the opcode they refer to. Earlier perls used 25 which
2276 left these messages outdented making reviewing a debug output
2277 quite difficult.
2278*/
2279#define REPORT_CODE_OFF 29
2280
2281
2282/* Make sure there is a test for this +1 options in re_tests */
2283#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2284
9e137952
DM
2285/* this value indiciates that the c1/c2 "next char" test should be skipped */
2286#define CHRTEST_VOID -1000
2287
86545054
DM
2288#define SLAB_FIRST(s) (&(s)->states[0])
2289#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2290
5d9a96ca
DM
2291/* grab a new slab and return the first slot in it */
2292
2293STATIC regmatch_state *
2294S_push_slab(pTHX)
2295{
54df2634
NC
2296#if PERL_VERSION < 9
2297 dMY_CXT;
2298#endif
5d9a96ca
DM
2299 regmatch_slab *s = PL_regmatch_slab->next;
2300 if (!s) {
2301 Newx(s, 1, regmatch_slab);
2302 s->prev = PL_regmatch_slab;
2303 s->next = NULL;
2304 PL_regmatch_slab->next = s;
2305 }
2306 PL_regmatch_slab = s;
86545054 2307 return SLAB_FIRST(s);
5d9a96ca 2308}
5b47454d 2309
95b24440
DM
2310/* simulate a recursive call to regmatch */
2311
2312#define REGMATCH(ns, where) \
5d9a96ca
DM
2313 st->scan = scan; \
2314 scan = (ns); \
2315 st->resume_state = resume_##where; \
95b24440
DM
2316 goto start_recurse; \
2317 resume_point_##where:
2318
aa283a38
DM
2319
2320/* push a new regex state. Set newst to point to it */
2321
2322#define PUSH_STATE(newst, resume) \
2323 depth++; \
2324 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2325 st->scan = scan; \
2326 st->next = next; \
2327 st->n = n; \
2328 st->locinput = locinput; \
2329 st->resume_state = resume; \
2330 newst = st+1; \
86545054 2331 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2332 newst = S_push_slab(aTHX); \
2333 PL_regmatch_state = newst; \
2334 newst->cc = 0; \
2335 newst->minmod = 0; \
2336 newst->sw = 0; \
2337 newst->logical = 0; \
2338 newst->unwind = 0; \
2339 locinput = PL_reginput; \
2340 nextchr = UCHARAT(locinput);
2341
2342#define POP_STATE \
2343 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2344 depth--; \
2345 st--; \
86545054 2346 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2347 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2348 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2349 } \
2350 PL_regmatch_state = st; \
2351 scan = st->scan; \
2352 next = st->next; \
2353 n = st->n; \
2354 locinput = st->locinput; \
2355 nextchr = UCHARAT(locinput);
2356
d6a28714
JH
2357/*
2358 - regmatch - main matching routine
2359 *
2360 * Conceptually the strategy is simple: check to see whether the current
2361 * node matches, call self recursively to see whether the rest matches,
2362 * and then act accordingly. In practice we make some effort to avoid
2363 * recursion, in particular by going through "ordinary" nodes (that don't
2364 * need to know whether the rest of the match failed) by a loop instead of
2365 * by recursion.
2366 */
2367/* [lwall] I've hoisted the register declarations to the outer block in order to
2368 * maybe save a little bit of pushing and popping on the stack. It also takes
2369 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2370 *
2371 * This function used to be heavily recursive, but since this had the
2372 * effect of blowing the CPU stack on complex regexes, it has been
2373 * restructured to be iterative, and to save state onto the heap rather
2374 * than the stack. Essentially whereever regmatch() used to be called, it
2375 * pushes the current state, notes where to return, then jumps back into
2376 * the main loop.
2377 *
2378 * Originally the structure of this function used to look something like
2379
2380 S_regmatch() {
2381 int a = 1, b = 2;
2382 ...
2383 while (scan != NULL) {
5d9a96ca 2384 a++; // do stuff with a and b
95b24440
DM
2385 ...
2386 switch (OP(scan)) {
2387 case FOO: {
2388 int local = 3;
2389 ...
2390 if (regmatch(...)) // recurse
2391 goto yes;
2392 }
2393 ...
2394 }
2395 }
2396 yes:
2397 return 1;
2398 }
2399
2400 * Now it looks something like this:
2401
5d9a96ca 2402 typedef struct {
95b24440
DM
2403 int a, b, local;
2404 int resume_state;
5d9a96ca 2405 } regmatch_state;
95b24440
DM
2406
2407 S_regmatch() {
5d9a96ca
DM
2408 regmatch_state *st = new();
2409 int depth=0;
2410 st->a++; // do stuff with a and b
95b24440
DM
2411 ...
2412 while (scan != NULL) {
2413 ...
2414 switch (OP(scan)) {
2415 case FOO: {
5d9a96ca 2416 st->local = 3;
95b24440 2417 ...
5d9a96ca
DM
2418 st->scan = scan;
2419 scan = ...;
2420 st->resume_state = resume_FOO;
2421 goto start_recurse; // recurse
95b24440 2422
5d9a96ca
DM
2423 resume_point_FOO:
2424 if (result)
95b24440
DM
2425 goto yes;
2426 }
2427 ...
2428 }
5d9a96ca
DM
2429 start_recurse:
2430 st = new(); push a new state
2431 st->a = 1; st->b = 2;
2432 depth++;
95b24440 2433 }
5d9a96ca 2434 yes:
95b24440 2435 result = 1;
5d9a96ca
DM
2436 if (depth--) {
2437 st = pop();
95b24440
DM
2438 switch (resume_state) {
2439 case resume_FOO:
2440 goto resume_point_FOO;
2441 ...
2442 }
2443 }
2444 return result
2445 }
2446
2447 * WARNING: this means that any line in this function that contains a
2448 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2449 * regmatch() using gotos instead. Thus the values of any local variables
2450 * not saved in the regmatch_state structure will have been lost when
2451 * execution resumes on the next line .
5d9a96ca
DM
2452 *
2453 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2454 * PL_regmatch_state always points to the currently active state, and
2455 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2456 * The first time regmatch is called, the first slab is allocated, and is
2457 * never freed until interpreter desctruction. When the slab is full,
2458 * a new one is allocated chained to the end. At exit from regmatch, slabs
2459 * allocated since entry are freed.
d6a28714 2460 */
95b24440
DM
2461
2462
d6a28714 2463STATIC I32 /* 0 failure, 1 success */
3b0527fe 2464S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2465{
54df2634
NC
2466#if PERL_VERSION < 9
2467 dMY_CXT;
2468#endif
27da23d5 2469 dVAR;
95b24440 2470 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2471 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2472
3b0527fe
DM
2473 regexp *rex = reginfo->prog;
2474
5d9a96ca
DM
2475 regmatch_slab *orig_slab;
2476 regmatch_state *orig_state;
a3621e74 2477
5d9a96ca
DM
2478 /* the current state. This is a cached copy of PL_regmatch_state */
2479 register regmatch_state *st;
95b24440 2480
5d9a96ca
DM
2481 /* cache heavy used fields of st in registers */
2482 register regnode *scan;
2483 register regnode *next;
2484 register I32 n = 0; /* initialize to shut up compiler warning */
2485 register char *locinput = PL_reginput;
95b24440 2486
5d9a96ca
DM
2487 /* these variables are NOT saved during a recusive RFEGMATCH: */
2488 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2489 bool result; /* return value of S_regmatch */
2490 regnode *inner; /* Next node in internal branch. */
2491 int depth = 0; /* depth of recursion */
aa283a38 2492 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2493 regmatch_state *yes_state = NULL; /* state to pop to on success of
2494 subpattern */
95b24440
DM
2495
2496#ifdef DEBUGGING
ab74612d 2497 SV *re_debug_flags = NULL;
a3621e74 2498 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2499 PL_regindent++;
2500#endif
2501
5d9a96ca
DM
2502 /* on first ever call to regmatch, allocate first slab */
2503 if (!PL_regmatch_slab) {
2504 Newx(PL_regmatch_slab, 1, regmatch_slab);
2505 PL_regmatch_slab->prev = NULL;
2506 PL_regmatch_slab->next = NULL;
86545054 2507 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2508 }
2509
2510 /* remember current high-water mark for exit */
2511 /* XXX this should be done with SAVE* instead */
2512 orig_slab = PL_regmatch_slab;
2513 orig_state = PL_regmatch_state;
2514
2515 /* grab next free state slot */
2516 st = ++PL_regmatch_state;
86545054 2517 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2518 st = PL_regmatch_state = S_push_slab(aTHX);
2519
2520 st->minmod = 0;
2521 st->sw = 0;
2522 st->logical = 0;
2523 st->unwind = 0;
2524 st->cc = NULL;
d6a28714
JH
2525 /* Note that nextchr is a byte even in UTF */
2526 nextchr = UCHARAT(locinput);
2527 scan = prog;
2528 while (scan != NULL) {
8ba1375e 2529
a3621e74 2530 DEBUG_EXECUTE_r( {
6136c704 2531 SV * const prop = sv_newmortal();
1df70142
AL
2532 const int docolor = *PL_colors[0];
2533 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2534 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2535 /* The part of the string before starttry has one color
2536 (pref0_len chars), between starttry and current
2537 position another one (pref_len - pref0_len chars),
2538 after the current position the third one.
2539 We assume that pref0_len <= pref_len, otherwise we
2540 decrease pref0_len. */
9041c2e3 2541 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2542 ? (5 + taill) - l : locinput - PL_bostr;
2543 int pref0_len;
d6a28714 2544
df1ffd02 2545 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2546 pref_len++;
2547 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2548 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2549 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2550 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2551 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2552 l--;
d6a28714
JH
2553 if (pref0_len < 0)
2554 pref0_len = 0;
2555 if (pref0_len > pref_len)
2556 pref0_len = pref_len;
32fc9b6a 2557 regprop(rex, prop, scan);
2a782b5b 2558 {
1df70142 2559 const char * const s0 =
f14c76ed 2560 do_utf8 && OP(scan) != CANY ?
95b24440 2561 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2562 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2563 locinput - pref_len;
bb7a0f54 2564 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
1df70142 2565 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2566 pv_uni_display(PERL_DEBUG_PAD(1),
2567 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2568 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2569 locinput - pref_len + pref0_len;
bb7a0f54 2570 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
1df70142 2571 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2572 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2573 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2574 locinput;
bb7a0f54 2575 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2a782b5b
JH
2576 PerlIO_printf(Perl_debug_log,
2577 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2578 (IV)(locinput - PL_bostr),
2579 PL_colors[4],
2580 len0, s0,
2581 PL_colors[5],
2582 PL_colors[2],
2583 len1, s1,
2584 PL_colors[3],
2585 (docolor ? "" : "> <"),
2586 PL_colors[0],
2587 len2, s2,
2588 PL_colors[1],
2589 15 - l - pref_len + 1,
2590 "",
4f639d21 2591 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2592 SvPVX_const(prop));
2a782b5b
JH
2593 }
2594 });
d6a28714
JH
2595
2596 next = scan + NEXT_OFF(scan);
2597 if (next == scan)
2598 next = NULL;
2599
2600 switch (OP(scan)) {
2601 case BOL:
7fba1cd6 2602 if (locinput == PL_bostr)
d6a28714 2603 {
3b0527fe 2604 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2605 break;
2606 }
d6a28714
JH
2607 sayNO;
2608 case MBOL:
12d33761
HS
2609 if (locinput == PL_bostr ||
2610 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2611 {
b8c5462f
JH
2612 break;
2613 }
d6a28714
JH
2614 sayNO;
2615 case SBOL:
c2a73568 2616 if (locinput == PL_bostr)
b8c5462f 2617 break;
d6a28714
JH
2618 sayNO;
2619 case GPOS:
3b0527fe 2620 if (locinput == reginfo->ganch)
d6a28714
JH
2621 break;
2622 sayNO;
2623 case EOL:
d6a28714
JH
2624 goto seol;
2625 case MEOL:
d6a28714 2626 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2627 sayNO;
b8c5462f 2628 break;
d6a28714
JH
2629 case SEOL:
2630 seol:
2631 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2632 sayNO;
d6a28714 2633 if (PL_regeol - locinput > 1)
b8c5462f 2634 sayNO;
b8c5462f 2635 break;
d6a28714
JH
2636 case EOS:
2637 if (PL_regeol != locinput)
b8c5462f 2638 sayNO;
d6a28714 2639 break;
ffc61ed2 2640 case SANY:
d6a28714 2641 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2642 sayNO;
f33976b4
DB
2643 if (do_utf8) {
2644 locinput += PL_utf8skip[nextchr];
2645 if (locinput > PL_regeol)
2646 sayNO;
2647 nextchr = UCHARAT(locinput);
2648 }
2649 else
2650 nextchr = UCHARAT(++locinput);
2651 break;
2652 case CANY:
2653 if (!nextchr && locinput >= PL_regeol)
2654 sayNO;
b8c5462f 2655 nextchr = UCHARAT(++locinput);
a0d0e21e 2656 break;
ffc61ed2 2657 case REG_ANY:
1aa99e6b
IH
2658 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2659 sayNO;
2660 if (do_utf8) {
b8c5462f 2661 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2662 if (locinput > PL_regeol)
2663 sayNO;
a0ed51b3 2664 nextchr = UCHARAT(locinput);
a0ed51b3 2665 }
1aa99e6b
IH
2666 else
2667 nextchr = UCHARAT(++locinput);
a0ed51b3 2668 break;
a3621e74
YO
2669
2670
2671
2672 /*
2673 traverse the TRIE keeping track of all accepting states
2674 we transition through until we get to a failing node.
2675
a3621e74
YO
2676
2677 */
5b47454d 2678 case TRIE:
a3621e74
YO
2679 case TRIEF:
2680 case TRIEFL:
2681 {
a3621e74
YO
2682 U8 *uc = ( U8* )locinput;
2683 U32 state = 1;
2684 U16 charid = 0;
2685 U32 base = 0;
2686 UV uvc = 0;
2687 STRLEN len = 0;
2688 STRLEN foldlen = 0;
a3621e74
YO
2689 U8 *uscan = (U8*)NULL;
2690 STRLEN bufflen=0;
95b24440 2691 SV *sv_accept_buff = NULL;
5b47454d
DM
2692 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2693 trie_type = do_utf8 ?
2694 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2695 : trie_plain;
2696
7087a21c
NC
2697 /* what trie are we using right now */
2698 reg_trie_data *trie
32fc9b6a 2699 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
d8319b27 2700 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2701 result = 0;
a3621e74
YO
2702
2703 while ( state && uc <= (U8*)PL_regeol ) {
2704
5b47454d 2705 if (trie->states[ state ].wordnum) {
d8319b27 2706 if (!st->u.trie.accepted ) {
5b47454d
DM
2707 ENTER;
2708 SAVETMPS;
2709 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2710 sv_accept_buff=newSV(bufflen *
2711 sizeof(reg_trie_accepted) - 1);
2712 SvCUR_set(sv_accept_buff,
2713 sizeof(reg_trie_accepted));
2714 SvPOK_on(sv_accept_buff);
2715 sv_2mortal(sv_accept_buff);
d8319b27 2716 st->u.trie.accept_buff =
5b47454d
DM
2717 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2718 }
2719 else {
d8319b27 2720 if (st->u.trie.accepted >= bufflen) {
5b47454d 2721 bufflen *= 2;
d8319b27 2722 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2723 SvGROW(sv_accept_buff,
2724 bufflen * sizeof(reg_trie_accepted));
2725 }
2726 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2727 + sizeof(reg_trie_accepted));
2728 }
d8319b27
DM
2729 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2730 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2731 ++st->u.trie.accepted;
5b47454d 2732 }
a3621e74
YO
2733
2734 base = trie->states[ state ].trans.base;
2735
2736 DEBUG_TRIE_EXECUTE_r(
2737 PerlIO_printf( Perl_debug_log,
e4584336 2738 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2739 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2740 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2741 );
2742
2743 if ( base ) {
5b47454d
DM
2744 switch (trie_type) {
2745 case trie_uft8_fold:
a3621e74
YO
2746 if ( foldlen>0 ) {
2747 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2748 foldlen -= len;
2749 uscan += len;
2750 len=0;
2751 } else {
1df70142 2752 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2753 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2754 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2755 foldlen -= UNISKIP( uvc );
2756 uscan = foldbuf + UNISKIP( uvc );
2757 }
5b47454d
DM
2758 break;
2759 case trie_utf8:
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2761 &len, uniflags );
2762 break;
2763 case trie_plain:
e4584336 2764 uvc = (UV)*uc;
a3621e74
YO
2765 len = 1;
2766 }
2767
5b47454d
DM
2768 if (uvc < 256) {
2769 charid = trie->charmap[ uvc ];
2770 }
2771 else {
2772 charid = 0;
2773 if (trie->widecharmap) {
2774 SV** svpp = (SV**)NULL;
2775 svpp = hv_fetch(trie->widecharmap,
2776 (char*)&uvc, sizeof(UV), 0);
2777 if (svpp)
2778 charid = (U16)SvIV(*svpp);
2779 }
2780 }
a3621e74 2781
5b47454d
DM
2782 if (charid &&
2783 (base + charid > trie->uniquecharcount )
2784 && (base + charid - 1 - trie->uniquecharcount
2785 < trie->lasttrans)
2786 && trie->trans[base + charid - 1 -
2787 trie->uniquecharcount].check == state)
2788 {
2789 state = trie->trans[base + charid - 1 -
2790 trie->uniquecharcount ].next;
2791 }
2792 else {
2793 state = 0;
2794 }
2795 uc += len;
2796
2797 }
2798 else {
a3621e74
YO
2799 state = 0;
2800 }
2801 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2802 PerlIO_printf( Perl_debug_log,
2803 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2804 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2805 );
2806 }
d8319b27 2807 if (!st->u.trie.accepted )
a3621e74 2808 sayNO;
a3621e74
YO
2809
2810 /*
2811 There was at least one accepting state that we
2812 transitioned through. Presumably the number of accepting
2813 states is going to be low, typically one or two. So we
2814 simply scan through to find the one with lowest wordnum.
2815 Once we find it, we swap the last state into its place
2816 and decrement the size. We then try to match the rest of
2817 the pattern at the point where the word ends, if we
2818 succeed then we end the loop, otherwise the loop
2819 eventually terminates once all of the accepting states
2820 have been tried.
2821 */
a3621e74 2822
d8319b27 2823 if ( st->u.trie.accepted == 1 ) {
a3621e74 2824 DEBUG_EXECUTE_r({
097eb12c 2825 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2826 PerlIO_printf( Perl_debug_log,
2827 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2828 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2829 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2830 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2831 PL_colors[5] );
2832 });
d8319b27 2833 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2834 /* in this case we free tmps/leave before we call regmatch
2835 as we wont be using accept_buff again. */
2836 FREETMPS;
2837 LEAVE;
95b24440
DM
2838 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2839 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2840 } else {
2841 DEBUG_EXECUTE_r(
e4584336 2842 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2843 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2844 PL_colors[5] );
2845 );
d8319b27 2846 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2847 U32 best = 0;
2848 U32 cur;
d8319b27 2849 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2850 DEBUG_TRIE_EXECUTE_r(
2851 PerlIO_printf( Perl_debug_log,
2852 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2853 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2854 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2855 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2856 );
a3621e74 2857
d8319b27
DM
2858 if (st->u.trie.accept_buff[cur].wordnum <
2859 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2860 best = cur;
a3621e74
YO
2861 }
2862 DEBUG_EXECUTE_r({
87830502 2863 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2864 rex->data->data[ARG(scan)];
d8319b27 2865 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2866 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2867 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2868 st->u.trie.accept_buff[best].wordnum,
ca0270c4 2869 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", (void*)scan,
a3621e74
YO
2870 PL_colors[5] );
2871 });
d8319b27
DM
2872 if ( best<st->u.trie.accepted ) {
2873 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2874 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2875 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2876 best = st->u.trie.accepted;
a3621e74 2877 }
d8319b27 2878 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2879
2880 /*
2881 as far as I can tell we only need the SAVETMPS/FREETMPS
2882 for re's with EVAL in them but I'm leaving them in for
2883 all until I can be sure.
2884 */
2885 SAVETMPS;
95b24440
DM
2886 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2887 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2888 FREETMPS;
2889 }
2890 FREETMPS;
2891 LEAVE;
2892 }
2893
95b24440 2894 if (result) {
a3621e74
YO
2895 sayYES;
2896 } else {
2897 sayNO;
2898 }
2899 }
2900 /* unreached codepoint */
95b24440
DM
2901 case EXACT: {
2902 char *s = STRING(scan);
5d9a96ca 2903 st->ln = STR_LEN(scan);
eb160463 2904 if (do_utf8 != UTF) {
bc517b45 2905 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2906 char *l = locinput;
5d9a96ca 2907 const char *e = s + st->ln;
a72c7584 2908
5ff6fc6d
JH
2909 if (do_utf8) {
2910 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2911 while (s < e) {
a3b680e6 2912 STRLEN ulen;
1aa99e6b 2913 if (l >= PL_regeol)
5ff6fc6d
JH
2914 sayNO;
2915 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2916 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2917 uniflags))
5ff6fc6d 2918 sayNO;
bc517b45 2919 l += ulen;
5ff6fc6d 2920 s ++;
1aa99e6b 2921 }
5ff6fc6d
JH
2922 }
2923 else {
2924 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2925 while (s < e) {
a3b680e6 2926 STRLEN ulen;
1aa99e6b
IH
2927 if (l >= PL_regeol)
2928 sayNO;
5ff6fc6d 2929 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2930 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2931 uniflags))
1aa99e6b 2932 sayNO;
bc517b45 2933 s += ulen;
a72c7584 2934 l ++;
1aa99e6b 2935 }
5ff6fc6d 2936 }
1aa99e6b
IH
2937 locinput = l;
2938 nextchr = UCHARAT(locinput);
2939 break;
2940 }
bc517b45 2941 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2942 /* Inline the first character, for speed. */
2943 if (UCHARAT(s) != nextchr)
2944 sayNO;
5d9a96ca 2945 if (PL_regeol - locinput < st->ln)
d6a28714 2946 sayNO;
5d9a96ca 2947 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2948 sayNO;
5d9a96ca 2949 locinput += st->ln;
d6a28714
JH
2950 nextchr = UCHARAT(locinput);
2951 break;
95b24440 2952 }
d6a28714 2953 case EXACTFL:
b8c5462f
JH
2954 PL_reg_flags |= RF_tainted;
2955 /* FALL THROUGH */
95b24440
DM
2956 case EXACTF: {
2957 char *s = STRING(scan);
5d9a96ca 2958 st->ln = STR_LEN(scan);
d6a28714 2959
d07ddd77
JH
2960 if (do_utf8 || UTF) {
2961 /* Either target or the pattern are utf8. */
d6a28714 2962 char *l = locinput;
d07ddd77 2963 char *e = PL_regeol;
bc517b45 2964
5d9a96ca 2965 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2966 l, &e, 0, do_utf8)) {
5486206c
JH
2967 /* One more case for the sharp s:
2968 * pack("U0U*", 0xDF) =~ /ss/i,
2969 * the 0xC3 0x9F are the UTF-8
2970 * byte sequence for the U+00DF. */
2971 if (!(do_utf8 &&
2972 toLOWER(s[0]) == 's' &&
5d9a96ca 2973 st->ln >= 2 &&
5486206c
JH
2974 toLOWER(s[1]) == 's' &&
2975 (U8)l[0] == 0xC3 &&
2976 e - l >= 2 &&
2977 (U8)l[1] == 0x9F))
2978 sayNO;
2979 }
d07ddd77
JH
2980 locinput = e;
2981 nextchr = UCHARAT(locinput);
2982 break;
a0ed51b3 2983 }
d6a28714 2984
bc517b45
JH
2985 /* Neither the target and the pattern are utf8. */
2986
d6a28714
JH
2987 /* Inline the first character, for speed. */
2988 if (UCHARAT(s) != nextchr &&
2989 UCHARAT(s) != ((OP(scan) == EXACTF)
2990 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2991 sayNO;
5d9a96ca 2992 if (PL_regeol - locinput < st->ln)
b8c5462f 2993 sayNO;
5d9a96ca
DM
2994 if (st->ln > 1 && (OP(scan) == EXACTF
2995 ? ibcmp(s, locinput, st->ln)
2996 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2997 sayNO;
5d9a96ca 2998 locinput += st->ln;
d6a28714 2999 nextchr = UCHARAT(locinput);
a0d0e21e 3000 break;
95b24440 3001 }
d6a28714 3002 case ANYOF:
ffc61ed2 3003 if (do_utf8) {
9e55ce06
JH
3004 STRLEN inclasslen = PL_regeol - locinput;
3005
32fc9b6a 3006 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3007 sayNO_ANYOF;
ffc61ed2
JH
3008 if (locinput >= PL_regeol)
3009 sayNO;
0f0076b4 3010 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3011 nextchr = UCHARAT(locinput);
e0f9d4a8 3012 break;
ffc61ed2
JH
3013 }
3014 else {
3015 if (nextchr < 0)
3016 nextchr = UCHARAT(locinput);
32fc9b6a 3017 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3018 sayNO_ANYOF;
ffc61ed2
JH
3019 if (!nextchr && locinput >= PL_regeol)
3020 sayNO;
3021 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3022 break;
3023 }
3024 no_anyof:
3025 /* If we might have the case of the German sharp s
3026 * in a casefolding Unicode character class. */
3027
ebc501f0
JH
3028 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3029 locinput += SHARP_S_SKIP;
e0f9d4a8 3030 nextchr = UCHARAT(locinput);
ffc61ed2 3031 }
e0f9d4a8
JH
3032 else
3033 sayNO;
b8c5462f 3034 break;
d6a28714 3035 case ALNUML:
b8c5462f
JH
3036 PL_reg_flags |= RF_tainted;
3037 /* FALL THROUGH */
d6a28714 3038 case ALNUM:
b8c5462f 3039 if (!nextchr)
4633a7c4 3040 sayNO;
ffc61ed2 3041 if (do_utf8) {
1a4fad37 3042 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3043 if (!(OP(scan) == ALNUM
bb7a0f54 3044 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3045 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3046 {
3047 sayNO;
a0ed51b3 3048 }
b8c5462f 3049 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3050 nextchr = UCHARAT(locinput);
3051 break;
3052 }
ffc61ed2 3053 if (!(OP(scan) == ALNUM
d6a28714 3054 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3055 sayNO;
b8c5462f 3056 nextchr = UCHARAT(++locinput);
a0d0e21e 3057 break;
d6a28714 3058 case NALNUML:
b8c5462f
JH
3059 PL_reg_flags |= RF_tainted;
3060 /* FALL THROUGH */
d6a28714
JH
3061 case NALNUM:
3062 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3063 sayNO;
ffc61ed2 3064 if (do_utf8) {
1a4fad37 3065 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3066 if (OP(scan) == NALNUM
bb7a0f54 3067 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3068 : isALNUM_LC_utf8((U8*)locinput))
3069 {
b8c5462f 3070 sayNO;
d6a28714 3071 }
b8c5462f
JH
3072 locinput += PL_utf8skip[nextchr];
3073 nextchr = UCHARAT(locinput);
3074 break;
3075 }
ffc61ed2 3076 if (OP(scan) == NALNUM
d6a28714 3077 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3078 sayNO;
76e3520e 3079 nextchr = UCHARAT(++locinput);
a0d0e21e 3080 break;
d6a28714
JH
3081 case BOUNDL:
3082 case NBOUNDL:
3280af22 3083 PL_reg_flags |= RF_tainted;
bbce6d69 3084 /* FALL THROUGH */
d6a28714
JH
3085 case BOUND:
3086 case NBOUND:
3087 /* was last char in word? */
ffc61ed2 3088 if (do_utf8) {
12d33761 3089 if (locinput == PL_bostr)
5d9a96ca 3090 st->ln = '\n';
ffc61ed2 3091 else {
a3b680e6 3092 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3093
4ad0818d 3094 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3095 }
3096 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3097 st->ln = isALNUM_uni(st->ln);
1a4fad37 3098 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3099 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3100 }
3101 else {
5d9a96ca 3102 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3103 n = isALNUM_LC_utf8((U8*)locinput);
3104 }
a0ed51b3 3105 }
d6a28714 3106 else {
5d9a96ca 3107 st->ln = (locinput != PL_bostr) ?
12d33761 3108 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3109 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3110 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3111 n = isALNUM(nextchr);
3112 }
3113 else {
5d9a96ca 3114 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3115 n = isALNUM_LC(nextchr);
3116 }
d6a28714 3117 }
5d9a96ca 3118 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3119 OP(scan) == BOUNDL))
3120 sayNO;
a0ed51b3 3121 break;
d6a28714 3122 case SPACEL:
3280af22 3123 PL_reg_flags |= RF_tainted;
bbce6d69 3124 /* FALL THROUGH */
d6a28714 3125 case SPACE:
9442cb0e 3126 if (!nextchr)
4633a7c4 3127 sayNO;
1aa99e6b 3128 if (do_utf8) {
fd400ab9 3129 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3130 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3131 if (!(OP(scan) == SPACE
bb7a0f54 3132 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3133 : isSPACE_LC_utf8((U8*)locinput)))
3134 {
3135 sayNO;
3136 }
3137 locinput += PL_utf8skip[nextchr];
3138 nextchr = UCHARAT(locinput);
3139 break;
d6a28714 3140 }
ffc61ed2
JH
3141 if (!(OP(scan) == SPACE
3142 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3143 sayNO;
3144 nextchr = UCHARAT(++locinput);
3145 }
3146 else {
3147 if (!(OP(scan) == SPACE
3148 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3149 sayNO;
3150 nextchr = UCHARAT(++locinput);
a0ed51b3 3151 }
a0ed51b3 3152 break;
d6a28714 3153 case NSPACEL:
3280af22 3154 PL_reg_flags |= RF_tainted;
bbce6d69 3155 /* FALL THROUGH */
d6a28714 3156 case NSPACE:
9442cb0e 3157 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3158 sayNO;
1aa99e6b 3159 if (do_utf8) {
1a4fad37 3160 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3161 if (OP(scan) == NSPACE
bb7a0f54 3162 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3163 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3164 {
3165 sayNO;
3166 }
3167 locinput += PL_utf8skip[nextchr];
3168 nextchr = UCHARAT(locinput);
3169 break;
a0ed51b3 3170 }
ffc61ed2 3171 if (OP(scan) == NSPACE
d6a28714 3172 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3173 sayNO;
76e3520e 3174 nextchr = UCHARAT(++locinput);
a0d0e21e 3175 break;
d6a28714 3176 case DIGITL:
a0ed51b3
LW
3177 PL_reg_flags |= RF_tainted;
3178 /* FALL THROUGH */
d6a28714 3179 case DIGIT:
9442cb0e 3180 if (!nextchr)
a0ed51b3 3181 sayNO;
1aa99e6b 3182 if (do_utf8) {
1a4fad37 3183 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3184 if (!(OP(scan) == DIGIT
bb7a0f54 3185 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3186 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3187 {
a0ed51b3 3188 sayNO;
dfe13c55 3189 }
6f06b55f 3190 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3191 nextchr = UCHARAT(locinput);
3192 break;
3193 }
ffc61ed2 3194 if (!(OP(scan) == DIGIT
9442cb0e 3195 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3196 sayNO;
3197 nextchr = UCHARAT(++locinput);
3198 break;
d6a28714 3199 case NDIGITL:
b8c5462f
JH
3200 PL_reg_flags |= RF_tainted;
3201 /* FALL THROUGH */
d6a28714 3202 case NDIGIT:
9442cb0e 3203 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3204 sayNO;
1aa99e6b 3205 if (do_utf8) {
1a4fad37 3206 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3207 if (OP(scan) == NDIGIT
bb7a0f54 3208 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3209 : isDIGIT_LC_utf8((U8*)locinput))
3210 {
a0ed51b3 3211 sayNO;
9442cb0e 3212 }
6f06b55f 3213 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3214 nextchr = UCHARAT(locinput);
3215 break;
3216 }
ffc61ed2 3217 if (OP(scan) == NDIGIT
9442cb0e 3218 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3219 sayNO;
3220 nextchr = UCHARAT(++locinput);
3221 break;
3222 case CLUMP:
b7c83a7e 3223 if (locinput >= PL_regeol)
a0ed51b3 3224 sayNO;
b7c83a7e 3225 if (do_utf8) {
1a4fad37 3226 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3227 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3228 sayNO;
3229 locinput += PL_utf8skip[nextchr];
3230 while (locinput < PL_regeol &&
3231 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3232 locinput += UTF8SKIP(locinput);
3233 if (locinput > PL_regeol)
3234 sayNO;
eb08e2da
JH
3235 }
3236 else
3237 locinput++;
a0ed51b3
LW
3238 nextchr = UCHARAT(locinput);
3239 break;
c8756f30 3240 case REFFL:
3280af22 3241 PL_reg_flags |= RF_tainted;
c8756f30 3242 /* FALL THROUGH */
c277df42 3243 case REF:
95b24440
DM
3244 case REFF: {
3245 char *s;
c277df42 3246 n = ARG(scan); /* which paren pair */
5d9a96ca 3247 st->ln = PL_regstartp[n];
2c2d71f5 3248 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3249 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3250 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3251 if (st->ln == PL_regendp[n])
a0d0e21e 3252 break;
a0ed51b3 3253
5d9a96ca 3254 s = PL_bostr + st->ln;
1aa99e6b 3255 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3256 char *l = locinput;
a3b680e6 3257 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3258 /*
3259 * Note that we can't do the "other character" lookup trick as
3260 * in the 8-bit case (no pun intended) because in Unicode we
3261 * have to map both upper and title case to lower case.
3262 */
3263 if (OP(scan) == REFF) {
3264 while (s < e) {
a3b680e6
AL
3265 STRLEN ulen1, ulen2;
3266 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3267 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3268
a0ed51b3
LW
3269 if (l >= PL_regeol)
3270 sayNO;
a2a2844f
JH
3271 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3272 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3273 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3274 sayNO;
a2a2844f
JH
3275 s += ulen1;
3276 l += ulen2;
a0ed51b3
LW
3277 }
3278 }
3279 locinput = l;
3280 nextchr = UCHARAT(locinput);
3281 break;
3282 }
3283
a0d0e21e 3284 /* Inline the first character, for speed. */
76e3520e 3285 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3286 (OP(scan) == REF ||
3287 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3288 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3289 sayNO;
5d9a96ca
DM
3290 st->ln = PL_regendp[n] - st->ln;
3291 if (locinput + st->ln > PL_regeol)
4633a7c4 3292 sayNO;
5d9a96ca
DM
3293 if (st->ln > 1 && (OP(scan) == REF
3294 ? memNE(s, locinput, st->ln)
c8756f30 3295 : (OP(scan) == REFF
5d9a96ca
DM
3296 ? ibcmp(s, locinput, st->ln)
3297 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3298 sayNO;
5d9a96ca 3299 locinput += st->ln;
76e3520e 3300 nextchr = UCHARAT(locinput);
a0d0e21e 3301 break;
95b24440 3302 }
a0d0e21e
LW
3303
3304 case NOTHING:
c277df42 3305 case TAIL:
a0d0e21e
LW
3306 break;
3307 case BACK:
3308 break;
c277df42
IZ
3309 case EVAL:
3310 {
c277df42 3311 SV *ret;
8e5e9ebe 3312 {
4aabdb9b
DM
3313 /* execute the code in the {...} */
3314 dSP;
6136c704 3315 SV ** const before = SP;
4aabdb9b
DM
3316 OP_4tree * const oop = PL_op;
3317 COP * const ocurcop = PL_curcop;
3318 PAD *old_comppad;
4aabdb9b
DM
3319
3320 n = ARG(scan);
32fc9b6a 3321 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3322 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3323 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3324 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3325
8e5e9ebe
RGS
3326 CALLRUNOPS(aTHX); /* Scalar context. */
3327 SPAGAIN;
3328 if (SP == before)
075aa684 3329 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3330 else {
3331 ret = POPs;
3332 PUTBACK;
3333 }
4aabdb9b
DM
3334
3335 PL_op = oop;
3336 PAD_RESTORE_LOCAL(old_comppad);
3337 PL_curcop = ocurcop;
3338 if (!st->logical) {
3339 /* /(?{...})/ */
3340 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3341 break;
3342 }
8e5e9ebe 3343 }
4aabdb9b
DM
3344 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3345 regexp *re;
4aabdb9b 3346 {
4f639d21
DM
3347 /* extract RE object from returned value; compiling if
3348 * necessary */
3349
6136c704 3350 MAGIC *mg = NULL;
4aabdb9b 3351 SV *sv;
faf82a0b
AE
3352 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3353 mg = mg_find(sv, PERL_MAGIC_qr);
3354 else if (SvSMAGICAL(ret)) {
3355 if (SvGMAGICAL(ret))
3356 sv_unmagic(ret, PERL_MAGIC_qr);
3357 else
3358 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3359 }
faf82a0b 3360
0f5d15d6
IZ
3361 if (mg) {
3362 re = (regexp *)mg->mg_obj;
df0003d4 3363 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3364 }
3365 else {
3366 STRLEN len;
6136c704 3367 const char * const t = SvPV_const(ret, len);
0f5d15d6 3368 PMOP pm;
a3b680e6 3369 const I32 osize = PL_regsize;
0f5d15d6 3370
5fcd1c1b 3371 Zero(&pm, 1, PMOP);
4aabdb9b 3372 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3373 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3374 if (!(SvFLAGS(ret)
faf82a0b
AE
3375 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY