This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Regen headers following change 28325. Also, make it compile
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
8e11feef 107 (char*)(PL_reg_match_utf8 \
b9ea4ed6 108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
8e11feef
RGS
109 : (pos - off >= PL_bostr) \
110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
2c2d71f5 572 char *s1 = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
1aa99e6b
IH
620 (long)(HOP3c(s1, 1, strend) - i_strpos)));
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
30944b6d 630 s = s1;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf
JH
638 char *last, *last1;
639 char *s1 = s;
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
677 ", trying anchored starting at offset %ld...\n",
678 (long)(s1 + 1 - i_strpos)));
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
687 s = s1;
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
808 if (prog->regstclass) {
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
a3b680e6 821 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
6eb5f6b9
JH
827
828 t = s;
3b0527fe 829 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
830 if (!s) {
831#ifdef DEBUGGING
cbbf8932 832 const char *what = NULL;
6eb5f6b9
JH
833#endif
834 if (endpos == strend) {
a3621e74 835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
836 "Could not match STCLASS...\n") );
837 goto fail;
838 }
a3621e74 839 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 840 "This position contradicts STCLASS...\n") );
653099ff
GS
841 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
842 goto fail;
6eb5f6b9 843 /* Contradict one of substrings */
33b8afdf
JH
844 if (prog->anchored_substr || prog->anchored_utf8) {
845 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 846 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 847 hop_and_restart:
1aa99e6b 848 s = HOP3c(t, 1, strend);
66e933ab
GS
849 if (s + start_shift + end_shift > strend) {
850 /* XXXX Should be taken into account earlier? */
a3621e74 851 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
852 "Could not match STCLASS...\n") );
853 goto fail;
854 }
5e39e1e5
HS
855 if (!check)
856 goto giveup;
a3621e74 857 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 858 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
859 what, (long)(s + start_shift - i_strpos)) );
860 goto restart;
861 }
66e933ab 862 /* Have both, check_string is floating */
6eb5f6b9
JH
863 if (t + start_shift >= check_at) /* Contradicts floating=check */
864 goto retry_floating_check;
865 /* Recheck anchored substring, but not floating... */
9041c2e3 866 s = check_at;
5e39e1e5
HS
867 if (!check)
868 goto giveup;
a3621e74 869 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 870 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
871 (long)(other_last - i_strpos)) );
872 goto do_other_anchored;
873 }
60e71179
GS
874 /* Another way we could have checked stclass at the
875 current position only: */
876 if (ml_anch) {
877 s = t = t + 1;
5e39e1e5
HS
878 if (!check)
879 goto giveup;
a3621e74 880 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 881 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 882 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 883 goto try_at_offset;
66e933ab 884 }
33b8afdf 885 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 886 goto fail;
6eb5f6b9
JH
887 /* Check is floating subtring. */
888 retry_floating_check:
889 t = check_at - start_shift;
a3621e74 890 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
891 goto hop_and_restart;
892 }
b7953727 893 if (t != s) {
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 895 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
896 (long)(t - i_strpos), (long)(s - i_strpos))
897 );
898 }
899 else {
a3621e74 900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
901 "Does not contradict STCLASS...\n");
902 );
903 }
6eb5f6b9 904 }
5e39e1e5 905 giveup:
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
907 PL_colors[4], (check ? "Guessed" : "Giving up"),
908 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 909 return s;
2c2d71f5
JH
910
911 fail_finish: /* Substring not found */
33b8afdf
JH
912 if (prog->check_substr || prog->check_utf8) /* could be removed already */
913 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 914 fail:
a3621e74 915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 916 PL_colors[4], PL_colors[5]));
bd61b366 917 return NULL;
cad2e5aa 918}
9661b544 919
6eb5f6b9 920/* We know what class REx starts with. Try to find this position... */
3b0527fe
DM
921/* if reginfo is NULL, its a dryrun */
922
3c3eec57 923STATIC char *
3b0527fe
DM
924S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char
925*strend, const regmatch_info *reginfo)
a687059c 926{
27da23d5 927 dVAR;
1df70142 928 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 929 char *m;
d8093b23 930 STRLEN ln;
5dab1207 931 STRLEN lnc;
078c425b 932 register STRLEN uskip;
d8093b23 933 unsigned int c1;
934 unsigned int c2;
6eb5f6b9
JH
935 char *e;
936 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 937 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 938
6eb5f6b9
JH
939 /* We know what class it must start with. */
940 switch (OP(c)) {
6eb5f6b9 941 case ANYOF:
388cc4de 942 if (do_utf8) {
078c425b 943 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
944 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
945 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
946 reginclass(prog, c, (U8*)s, 0, do_utf8) :
947 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 948 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
949 goto got_it;
950 else
951 tmp = doevery;
952 }
953 else
954 tmp = 1;
078c425b 955 s += uskip;
388cc4de
HS
956 }
957 }
958 else {
959 while (s < strend) {
960 STRLEN skip = 1;
961
32fc9b6a 962 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
963 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
964 /* The assignment of 2 is intentional:
965 * for the folded sharp s, the skip is 2. */
966 (skip = SHARP_S_SKIP))) {
3b0527fe 967 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
968 goto got_it;
969 else
970 tmp = doevery;
971 }
972 else
973 tmp = 1;
974 s += skip;
975 }
a0d0e21e 976 }
6eb5f6b9 977 break;
f33976b4
DB
978 case CANY:
979 while (s < strend) {
3b0527fe 980 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
981 goto got_it;
982 else
983 tmp = doevery;
984 s++;
985 }
986 break;
6eb5f6b9 987 case EXACTF:
5dab1207
NIS
988 m = STRING(c);
989 ln = STR_LEN(c); /* length to match in octets/bytes */
990 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 991 if (UTF) {
a2a2844f 992 STRLEN ulen1, ulen2;
5dab1207 993 U8 *sm = (U8 *) m;
89ebb4a3
JH
994 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
995 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 996 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
997
998 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
999 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1000
89ebb4a3 1001 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1002 0, uniflags);
89ebb4a3 1003 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1004 0, uniflags);
5dab1207
NIS
1005 lnc = 0;
1006 while (sm < ((U8 *) m + ln)) {
1007 lnc++;
1008 sm += UTF8SKIP(sm);
1009 }
1aa99e6b
IH
1010 }
1011 else {
1012 c1 = *(U8*)m;
1013 c2 = PL_fold[c1];
1014 }
6eb5f6b9
JH
1015 goto do_exactf;
1016 case EXACTFL:
5dab1207
NIS
1017 m = STRING(c);
1018 ln = STR_LEN(c);
1019 lnc = (I32) ln;
d8093b23 1020 c1 = *(U8*)m;
6eb5f6b9
JH
1021 c2 = PL_fold_locale[c1];
1022 do_exactf:
db12adc6 1023 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1024
3b0527fe 1025 if (!reginfo && e < s)
6eb5f6b9 1026 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1027
60a8b682
JH
1028 /* The idea in the EXACTF* cases is to first find the
1029 * first character of the EXACTF* node and then, if
1030 * necessary, case-insensitively compare the full
1031 * text of the node. The c1 and c2 are the first
1032 * characters (though in Unicode it gets a bit
1033 * more complicated because there are more cases
7f16dd3d
JH
1034 * than just upper and lower: one needs to use
1035 * the so-called folding case for case-insensitive
1036 * matching (called "loose matching" in Unicode).
1037 * ibcmp_utf8() will do just that. */
60a8b682 1038
1aa99e6b 1039 if (do_utf8) {
575cac57 1040 UV c, f;
89ebb4a3 1041 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1042 STRLEN len, foldlen;
4ad0818d 1043 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1044 if (c1 == c2) {
5dab1207
NIS
1045 /* Upper and lower of 1st char are equal -
1046 * probably not a "letter". */
1aa99e6b 1047 while (s <= e) {
89ebb4a3 1048 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1049 uniflags);
80aecb99
JH
1050 if ( c == c1
1051 && (ln == len ||
3dab1dad
YO
1052 ibcmp_utf8(s, NULL, 0, do_utf8,
1053 m, NULL, ln, (bool)UTF))
3b0527fe 1054 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1055 goto got_it;
80aecb99 1056 else {
1df70142 1057 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1058 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1059 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1060 if ( f != c
1061 && (f == c1 || f == c2)
1062 && (ln == foldlen ||
66423254 1063 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1064 NULL, foldlen, do_utf8,
d07ddd77 1065 m,
3dab1dad 1066 NULL, ln, (bool)UTF))
3b0527fe 1067 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1068 goto got_it;
1069 }
1aa99e6b
IH
1070 s += len;
1071 }
09091399
JH
1072 }
1073 else {
1aa99e6b 1074 while (s <= e) {
89ebb4a3 1075 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1076 uniflags);
80aecb99 1077
60a8b682 1078 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1079 * Note that not all the possible combinations
1080 * are handled here: some of them are handled
1081 * by the standard folding rules, and some of
1082 * them (the character class or ANYOF cases)
1083 * are handled during compiletime in
1084 * regexec.c:S_regclass(). */
880bd946
JH
1085 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1086 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1087 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1088
1089 if ( (c == c1 || c == c2)
1090 && (ln == len ||
3dab1dad
YO
1091 ibcmp_utf8(s, NULL, 0, do_utf8,
1092 m, NULL, ln, (bool)UTF))
3b0527fe 1093 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1094 goto got_it;
80aecb99 1095 else {
1df70142 1096 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1097 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1098 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1099 if ( f != c
1100 && (f == c1 || f == c2)
1101 && (ln == foldlen ||
a6872d42 1102 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1103 NULL, foldlen, do_utf8,
d07ddd77 1104 m,
3dab1dad 1105 NULL, ln, (bool)UTF))
3b0527fe 1106 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1107 goto got_it;
1108 }
1aa99e6b
IH
1109 s += len;
1110 }
09091399 1111 }
1aa99e6b
IH
1112 }
1113 else {
1114 if (c1 == c2)
1115 while (s <= e) {
1116 if ( *(U8*)s == c1
1117 && (ln == 1 || !(OP(c) == EXACTF
1118 ? ibcmp(s, m, ln)
1119 : ibcmp_locale(s, m, ln)))
3b0527fe 1120 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1121 goto got_it;
1122 s++;
1123 }
1124 else
1125 while (s <= e) {
1126 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
3b0527fe 1130 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1131 goto got_it;
1132 s++;
1133 }
b3c9acc1
IZ
1134 }
1135 break;
bbce6d69 1136 case BOUNDL:
3280af22 1137 PL_reg_flags |= RF_tainted;
bbce6d69 1138 /* FALL THROUGH */
a0d0e21e 1139 case BOUND:
ffc61ed2 1140 if (do_utf8) {
12d33761 1141 if (s == PL_bostr)
ffc61ed2
JH
1142 tmp = '\n';
1143 else {
6136c704 1144 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1145 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1146 }
1147 tmp = ((OP(c) == BOUND ?
9041c2e3 1148 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1149 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1150 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1151 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1152 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1153 isALNUM_LC_utf8((U8*)s)))
1154 {
1155 tmp = !tmp;
3b0527fe 1156 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1157 goto got_it;
1158 }
078c425b 1159 s += uskip;
a687059c 1160 }
a0d0e21e 1161 }
667bb95a 1162 else {
12d33761 1163 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1164 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1165 while (s < strend) {
1166 if (tmp ==
1167 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1168 tmp = !tmp;
3b0527fe 1169 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1170 goto got_it;
1171 }
1172 s++;
a0ed51b3 1173 }
a0ed51b3 1174 }
3b0527fe 1175 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1176 goto got_it;
1177 break;
bbce6d69 1178 case NBOUNDL:
3280af22 1179 PL_reg_flags |= RF_tainted;
bbce6d69 1180 /* FALL THROUGH */
a0d0e21e 1181 case NBOUND:
ffc61ed2 1182 if (do_utf8) {
12d33761 1183 if (s == PL_bostr)
ffc61ed2
JH
1184 tmp = '\n';
1185 else {
6136c704 1186 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1187 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1188 }
1189 tmp = ((OP(c) == NBOUND ?
9041c2e3 1190 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1191 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1192 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1193 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1194 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1195 isALNUM_LC_utf8((U8*)s)))
1196 tmp = !tmp;
3b0527fe 1197 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1198 goto got_it;
078c425b 1199 s += uskip;
ffc61ed2 1200 }
a0d0e21e 1201 }
667bb95a 1202 else {
12d33761 1203 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1204 tmp = ((OP(c) == NBOUND ?
1205 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1206 while (s < strend) {
1207 if (tmp ==
1208 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1209 tmp = !tmp;
3b0527fe 1210 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1211 goto got_it;
1212 s++;
1213 }
a0ed51b3 1214 }
3b0527fe 1215 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1216 goto got_it;
1217 break;
a0d0e21e 1218 case ALNUM:
ffc61ed2 1219 if (do_utf8) {
1a4fad37 1220 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1221 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1222 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1223 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1224 goto got_it;
1225 else
1226 tmp = doevery;
1227 }
bbce6d69 1228 else
ffc61ed2 1229 tmp = 1;
078c425b 1230 s += uskip;
bbce6d69 1231 }
bbce6d69 1232 }
ffc61ed2
JH
1233 else {
1234 while (s < strend) {
1235 if (isALNUM(*s)) {
3b0527fe 1236 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1237 goto got_it;
1238 else
1239 tmp = doevery;
1240 }
a0ed51b3 1241 else
ffc61ed2
JH
1242 tmp = 1;
1243 s++;
a0ed51b3 1244 }
a0ed51b3
LW
1245 }
1246 break;
bbce6d69 1247 case ALNUML:
3280af22 1248 PL_reg_flags |= RF_tainted;
ffc61ed2 1249 if (do_utf8) {
078c425b 1250 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1251 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1252 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1253 goto got_it;
1254 else
1255 tmp = doevery;
1256 }
a687059c 1257 else
ffc61ed2 1258 tmp = 1;
078c425b 1259 s += uskip;
a0d0e21e 1260 }
a0d0e21e 1261 }
ffc61ed2
JH
1262 else {
1263 while (s < strend) {
1264 if (isALNUM_LC(*s)) {
3b0527fe 1265 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1266 goto got_it;
1267 else
1268 tmp = doevery;
1269 }
a0ed51b3 1270 else
ffc61ed2
JH
1271 tmp = 1;
1272 s++;
a0ed51b3 1273 }
a0ed51b3
LW
1274 }
1275 break;
a0d0e21e 1276 case NALNUM:
ffc61ed2 1277 if (do_utf8) {
1a4fad37 1278 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1279 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1280 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1281 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1282 goto got_it;
1283 else
1284 tmp = doevery;
1285 }
bbce6d69 1286 else
ffc61ed2 1287 tmp = 1;
078c425b 1288 s += uskip;
bbce6d69 1289 }
bbce6d69 1290 }
ffc61ed2
JH
1291 else {
1292 while (s < strend) {
1293 if (!isALNUM(*s)) {
3b0527fe 1294 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1295 goto got_it;
1296 else
1297 tmp = doevery;
1298 }
a0ed51b3 1299 else
ffc61ed2
JH
1300 tmp = 1;
1301 s++;
a0ed51b3 1302 }
a0ed51b3
LW
1303 }
1304 break;
bbce6d69 1305 case NALNUML:
3280af22 1306 PL_reg_flags |= RF_tainted;
ffc61ed2 1307 if (do_utf8) {
078c425b 1308 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1309 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1310 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1311 goto got_it;
1312 else
1313 tmp = doevery;
1314 }
a687059c 1315 else
ffc61ed2 1316 tmp = 1;
078c425b 1317 s += uskip;
a687059c 1318 }
a0d0e21e 1319 }
ffc61ed2
JH
1320 else {
1321 while (s < strend) {
1322 if (!isALNUM_LC(*s)) {
3b0527fe 1323 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1324 goto got_it;
1325 else
1326 tmp = doevery;
1327 }
a0ed51b3 1328 else
ffc61ed2
JH
1329 tmp = 1;
1330 s++;
a0ed51b3 1331 }
a0ed51b3
LW
1332 }
1333 break;
a0d0e21e 1334 case SPACE:
ffc61ed2 1335 if (do_utf8) {
1a4fad37 1336 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1337 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1338 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1339 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1340 goto got_it;
1341 else
1342 tmp = doevery;
1343 }
a0d0e21e 1344 else
ffc61ed2 1345 tmp = 1;
078c425b 1346 s += uskip;
2304df62 1347 }
a0d0e21e 1348 }
ffc61ed2
JH
1349 else {
1350 while (s < strend) {
1351 if (isSPACE(*s)) {
3b0527fe 1352 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1353 goto got_it;
1354 else
1355 tmp = doevery;
1356 }
a0ed51b3 1357 else
ffc61ed2
JH
1358 tmp = 1;
1359 s++;
a0ed51b3 1360 }
a0ed51b3
LW
1361 }
1362 break;
bbce6d69 1363 case SPACEL:
3280af22 1364 PL_reg_flags |= RF_tainted;
ffc61ed2 1365 if (do_utf8) {
078c425b 1366 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1367 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1368 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1369 goto got_it;
1370 else
1371 tmp = doevery;
1372 }
bbce6d69 1373 else
ffc61ed2 1374 tmp = 1;
078c425b 1375 s += uskip;
bbce6d69 1376 }
bbce6d69 1377 }
ffc61ed2
JH
1378 else {
1379 while (s < strend) {
1380 if (isSPACE_LC(*s)) {
3b0527fe 1381 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1382 goto got_it;
1383 else
1384 tmp = doevery;
1385 }
a0ed51b3 1386 else
ffc61ed2
JH
1387 tmp = 1;
1388 s++;
a0ed51b3 1389 }
a0ed51b3
LW
1390 }
1391 break;
a0d0e21e 1392 case NSPACE:
ffc61ed2 1393 if (do_utf8) {
1a4fad37 1394 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1395 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1396 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1397 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1398 goto got_it;
1399 else
1400 tmp = doevery;
1401 }
a0d0e21e 1402 else
ffc61ed2 1403 tmp = 1;
078c425b 1404 s += uskip;
a687059c 1405 }
a0d0e21e 1406 }
ffc61ed2
JH
1407 else {
1408 while (s < strend) {
1409 if (!isSPACE(*s)) {
3b0527fe 1410 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1411 goto got_it;
1412 else
1413 tmp = doevery;
1414 }
a0ed51b3 1415 else
ffc61ed2
JH
1416 tmp = 1;
1417 s++;
a0ed51b3 1418 }
a0ed51b3
LW
1419 }
1420 break;
bbce6d69 1421 case NSPACEL:
3280af22 1422 PL_reg_flags |= RF_tainted;
ffc61ed2 1423 if (do_utf8) {
078c425b 1424 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1425 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1426 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1427 goto got_it;
1428 else
1429 tmp = doevery;
1430 }
bbce6d69 1431 else
ffc61ed2 1432 tmp = 1;
078c425b 1433 s += uskip;
bbce6d69 1434 }
bbce6d69 1435 }
ffc61ed2
JH
1436 else {
1437 while (s < strend) {
1438 if (!isSPACE_LC(*s)) {
3b0527fe 1439 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1440 goto got_it;
1441 else
1442 tmp = doevery;
1443 }
a0ed51b3 1444 else
ffc61ed2
JH
1445 tmp = 1;
1446 s++;
a0ed51b3 1447 }
a0ed51b3
LW
1448 }
1449 break;
a0d0e21e 1450 case DIGIT:
ffc61ed2 1451 if (do_utf8) {
1a4fad37 1452 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1453 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1454 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1455 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1456 goto got_it;
1457 else
1458 tmp = doevery;
1459 }
a0d0e21e 1460 else
ffc61ed2 1461 tmp = 1;
078c425b 1462 s += uskip;
2b69d0c2 1463 }
a0d0e21e 1464 }
ffc61ed2
JH
1465 else {
1466 while (s < strend) {
1467 if (isDIGIT(*s)) {
3b0527fe 1468 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1469 goto got_it;
1470 else
1471 tmp = doevery;
1472 }
a0ed51b3 1473 else
ffc61ed2
JH
1474 tmp = 1;
1475 s++;
a0ed51b3 1476 }
a0ed51b3
LW
1477 }
1478 break;
b8c5462f
JH
1479 case DIGITL:
1480 PL_reg_flags |= RF_tainted;
ffc61ed2 1481 if (do_utf8) {
078c425b 1482 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1483 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1484 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1485 goto got_it;
1486 else
1487 tmp = doevery;
1488 }
b8c5462f 1489 else
ffc61ed2 1490 tmp = 1;
078c425b 1491 s += uskip;
b8c5462f 1492 }
b8c5462f 1493 }
ffc61ed2
JH
1494 else {
1495 while (s < strend) {
1496 if (isDIGIT_LC(*s)) {
3b0527fe 1497 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1498 goto got_it;
1499 else
1500 tmp = doevery;
1501 }
b8c5462f 1502 else
ffc61ed2
JH
1503 tmp = 1;
1504 s++;
b8c5462f 1505 }
b8c5462f
JH
1506 }
1507 break;
a0d0e21e 1508 case NDIGIT:
ffc61ed2 1509 if (do_utf8) {
1a4fad37 1510 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1511 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1512 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1513 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1514 goto got_it;
1515 else
1516 tmp = doevery;
1517 }
a0d0e21e 1518 else
ffc61ed2 1519 tmp = 1;
078c425b 1520 s += uskip;
a687059c 1521 }
a0d0e21e 1522 }
ffc61ed2
JH
1523 else {
1524 while (s < strend) {
1525 if (!isDIGIT(*s)) {
3b0527fe 1526 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1527 goto got_it;
1528 else
1529 tmp = doevery;
1530 }
a0ed51b3 1531 else
ffc61ed2
JH
1532 tmp = 1;
1533 s++;
a0ed51b3 1534 }
a0ed51b3
LW
1535 }
1536 break;
b8c5462f
JH
1537 case NDIGITL:
1538 PL_reg_flags |= RF_tainted;
ffc61ed2 1539 if (do_utf8) {
078c425b 1540 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1541 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1542 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1543 goto got_it;
1544 else
1545 tmp = doevery;
1546 }
b8c5462f 1547 else
ffc61ed2 1548 tmp = 1;
078c425b 1549 s += uskip;
b8c5462f 1550 }
a0ed51b3 1551 }
ffc61ed2
JH
1552 else {
1553 while (s < strend) {
1554 if (!isDIGIT_LC(*s)) {
3b0527fe 1555 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1556 goto got_it;
1557 else
1558 tmp = doevery;
1559 }
cf93c79d 1560 else
ffc61ed2
JH
1561 tmp = 1;
1562 s++;
b8c5462f 1563 }
b8c5462f
JH
1564 }
1565 break;
b3c9acc1 1566 default:
3c3eec57
GS
1567 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1568 break;
d6a28714 1569 }
6eb5f6b9
JH
1570 return 0;
1571 got_it:
1572 return s;
1573}
1574
1575/*
1576 - regexec_flags - match a regexp against a string
1577 */
1578I32
1579Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1580 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1581/* strend: pointer to null at end of string */
1582/* strbeg: real beginning of string */
1583/* minend: end of match must be >=minend after stringarg. */
1584/* data: May be used for some additional optimizations. */
1585/* nosave: For optimizations. */
1586{
97aff369 1587 dVAR;
6eb5f6b9
JH
1588 register char *s;
1589 register regnode *c;
1590 register char *startpos = stringarg;
6eb5f6b9
JH
1591 I32 minlen; /* must match at least this many chars */
1592 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1593 I32 end_shift = 0; /* Same for the end. */ /* CC */
1594 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1595 char *scream_olds = NULL;
3dab1dad 1596 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1597 const bool do_utf8 = DO_UTF8(sv);
2757e526 1598 I32 multiline;
2a782b5b 1599#ifdef DEBUGGING
2757e526
JH
1600 SV* dsv0;
1601 SV* dsv1;
2a782b5b 1602#endif
3b0527fe 1603 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1604
1605 GET_RE_DEBUG_FLAGS_DECL;
1606
9d4ba2ae 1607 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1608
1609 /* Be paranoid... */
1610 if (prog == NULL || startpos == NULL) {
1611 Perl_croak(aTHX_ "NULL regexp parameter");
1612 return 0;
1613 }
1614
2757e526 1615 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1616 reginfo.prog = prog;
2757e526
JH
1617
1618#ifdef DEBUGGING
1619 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1620 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1621#endif
1622
bac06658
JH
1623 RX_MATCH_UTF8_set(prog, do_utf8);
1624
6eb5f6b9 1625 minlen = prog->minlen;
61a36c01 1626 if (strend - startpos < minlen) {
a3621e74 1627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1628 "String too short [regexec_flags]...\n"));
1629 goto phooey;
1aa99e6b 1630 }
6eb5f6b9 1631
6eb5f6b9
JH
1632 /* Check validity of program. */
1633 if (UCHARAT(prog->program) != REG_MAGIC) {
1634 Perl_croak(aTHX_ "corrupted regexp program");
1635 }
1636
1637 PL_reg_flags = 0;
1638 PL_reg_eval_set = 0;
1639 PL_reg_maxiter = 0;
1640
1641 if (prog->reganch & ROPT_UTF8)
1642 PL_reg_flags |= RF_utf8;
1643
1644 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1645 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1646 PL_bostr = strbeg;
3b0527fe 1647 reginfo.sv = sv;
6eb5f6b9
JH
1648
1649 /* Mark end of line for $ (and such) */
1650 PL_regeol = strend;
1651
1652 /* see how far we have to get to not match where we matched before */
3b0527fe 1653 reginfo.till = startpos+minend;
6eb5f6b9 1654
6eb5f6b9
JH
1655 /* If there is a "must appear" string, look for it. */
1656 s = startpos;
1657
3b0527fe 1658 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1659 MAGIC *mg;
1660
1661 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1662 reginfo.ganch = startpos;
6eb5f6b9
JH
1663 else if (sv && SvTYPE(sv) >= SVt_PVMG
1664 && SvMAGIC(sv)
14befaf4
DM
1665 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1666 && mg->mg_len >= 0) {
3b0527fe 1667 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1668 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1669 if (s > reginfo.ganch)
6eb5f6b9 1670 goto phooey;
3b0527fe 1671 s = reginfo.ganch;
6eb5f6b9
JH
1672 }
1673 }
1674 else /* pos() not defined */
3b0527fe 1675 reginfo.ganch = strbeg;
6eb5f6b9
JH
1676 }
1677
a0714e2c 1678 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1679 re_scream_pos_data d;
1680
1681 d.scream_olds = &scream_olds;
1682 d.scream_pos = &scream_pos;
1683 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1684 if (!s) {
a3621e74 1685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1686 goto phooey; /* not present */
3fa9c3d7 1687 }
6eb5f6b9
JH
1688 }
1689
a3621e74 1690 DEBUG_EXECUTE_r({
1df70142
AL
1691 const char * const s0 = UTF
1692 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1693 UNI_DISPLAY_REGEX)
1694 : prog->precomp;
bb7a0f54 1695 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1696 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1697 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1698 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1699 if (!PL_colorset)
1700 reginitcolors();
1701 PerlIO_printf(Perl_debug_log,
a0288114 1702 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1703 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1704 len0, len0, s0,
2a782b5b 1705 PL_colors[1],
9e55ce06 1706 len0 > 60 ? "..." : "",
2a782b5b 1707 PL_colors[0],
9e55ce06
JH
1708 (int)(len1 > 60 ? 60 : len1),
1709 s1, PL_colors[1],
1710 (len1 > 60 ? "..." : "")
2a782b5b
JH
1711 );
1712 });
6eb5f6b9
JH
1713
1714 /* Simplest case: anchored match need be tried only once. */
1715 /* [unless only anchor is BOL and multiline is set] */
1716 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1717 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1718 goto got_it;
7fba1cd6 1719 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1720 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1721 {
1722 char *end;
1723
1724 if (minlen)
1725 dontbother = minlen - 1;
1aa99e6b 1726 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1727 /* for multiline we only have to try after newlines */
33b8afdf 1728 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1729 if (s == startpos)
1730 goto after_try;
1731 while (1) {
3b0527fe 1732 if (regtry(&reginfo, s))
6eb5f6b9
JH
1733 goto got_it;
1734 after_try:
1735 if (s >= end)
1736 goto phooey;
1737 if (prog->reganch & RE_USE_INTUIT) {
1738 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1739 if (!s)
1740 goto phooey;
1741 }
1742 else
1743 s++;
1744 }
1745 } else {
1746 if (s > startpos)
1747 s--;
1748 while (s < end) {
1749 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1750 if (regtry(&reginfo, s))
6eb5f6b9
JH
1751 goto got_it;
1752 }
1753 }
1754 }
1755 }
1756 goto phooey;
1757 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1758 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1759 goto got_it;
1760 goto phooey;
1761 }
1762
1763 /* Messy cases: unanchored match. */
33b8afdf 1764 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1765 /* we have /x+whatever/ */
1766 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1767 char ch;
bf93d4cc
GS
1768#ifdef DEBUGGING
1769 int did_match = 0;
1770#endif
33b8afdf
JH
1771 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1772 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1773 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1774
1aa99e6b 1775 if (do_utf8) {
6eb5f6b9
JH
1776 while (s < strend) {
1777 if (*s == ch) {
a3621e74 1778 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1779 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1780 s += UTF8SKIP(s);
1781 while (s < strend && *s == ch)
1782 s += UTF8SKIP(s);
1783 }
1784 s += UTF8SKIP(s);
1785 }
1786 }
1787 else {
1788 while (s < strend) {
1789 if (*s == ch) {
a3621e74 1790 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1791 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1792 s++;
1793 while (s < strend && *s == ch)
1794 s++;
1795 }
1796 s++;
1797 }
1798 }
a3621e74 1799 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1800 PerlIO_printf(Perl_debug_log,
b7953727
JH
1801 "Did not find anchored character...\n")
1802 );
6eb5f6b9 1803 }
a0714e2c
SS
1804 else if (prog->anchored_substr != NULL
1805 || prog->anchored_utf8 != NULL
1806 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1807 && prog->float_max_offset < strend - s)) {
1808 SV *must;
1809 I32 back_max;
1810 I32 back_min;
1811 char *last;
6eb5f6b9 1812 char *last1; /* Last position checked before */
bf93d4cc
GS
1813#ifdef DEBUGGING
1814 int did_match = 0;
1815#endif
33b8afdf
JH
1816 if (prog->anchored_substr || prog->anchored_utf8) {
1817 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1818 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1819 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1820 back_max = back_min = prog->anchored_offset;
1821 } else {
1822 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1823 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1824 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1825 back_max = prog->float_max_offset;
1826 back_min = prog->float_min_offset;
1827 }
1828 if (must == &PL_sv_undef)
1829 /* could not downgrade utf8 check substring, so must fail */
1830 goto phooey;
1831
1832 last = HOP3c(strend, /* Cannot start after this */
1833 -(I32)(CHR_SVLEN(must)
1834 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1835
1836 if (s > PL_bostr)
1837 last1 = HOPc(s, -1);
1838 else
1839 last1 = s - 1; /* bogus */
1840
a0288114 1841 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1842 check_substr==must. */
1843 scream_pos = -1;
1844 dontbother = end_shift;
1845 strend = HOPc(strend, -dontbother);
1846 while ( (s <= last) &&
9041c2e3 1847 ((flags & REXEC_SCREAM)
1aa99e6b 1848 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1849 end_shift, &scream_pos, 0))
1aa99e6b 1850 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1851 (unsigned char*)strend, must,
7fba1cd6 1852 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1853 /* we may be pointing at the wrong string */
1854 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1855 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1856 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1857 if (HOPc(s, -back_max) > last1) {
1858 last1 = HOPc(s, -back_min);
1859 s = HOPc(s, -back_max);
1860 }
1861 else {
52657f30 1862 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1863
1864 last1 = HOPc(s, -back_min);
52657f30 1865 s = t;
6eb5f6b9 1866 }
1aa99e6b 1867 if (do_utf8) {
6eb5f6b9 1868 while (s <= last1) {
3b0527fe 1869 if (regtry(&reginfo, s))
6eb5f6b9
JH
1870 goto got_it;
1871 s += UTF8SKIP(s);
1872 }
1873 }
1874 else {
1875 while (s <= last1) {
3b0527fe 1876 if (regtry(&reginfo, s))
6eb5f6b9
JH
1877 goto got_it;
1878 s++;
1879 }
1880 }
1881 }
a3621e74 1882 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1883 PerlIO_printf(Perl_debug_log,
a0288114 1884 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1885 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1886 ? "anchored" : "floating"),
1887 PL_colors[0],
1888 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1889 SvPVX_const(must),
b7953727
JH
1890 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1891 );
6eb5f6b9
JH
1892 goto phooey;
1893 }
155aba94 1894 else if ((c = prog->regstclass)) {
f14c76ed 1895 if (minlen) {
3dab1dad 1896 I32 op = OP(prog->regstclass);
66e933ab 1897 /* don't bother with what can't match */
f14c76ed
RGS
1898 if (PL_regkind[op] != EXACT && op != CANY)
1899 strend = HOPc(strend, -(minlen - 1));
1900 }
a3621e74 1901 DEBUG_EXECUTE_r({
ffc61ed2 1902 SV *prop = sv_newmortal();
cfd0369c
NC
1903 const char *s0;
1904 const char *s1;
9e55ce06
JH
1905 int len0;
1906 int len1;
1907
32fc9b6a 1908 regprop(prog, prop, c);
9e55ce06 1909 s0 = UTF ?
3f7c398e 1910 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1911 UNI_DISPLAY_REGEX) :
cfd0369c 1912 SvPVX_const(prop);
9e55ce06
JH
1913 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1914 s1 = UTF ?
c728cb41 1915 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 1916 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 1917 PerlIO_printf(Perl_debug_log,
a0288114 1918 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1919 len0, len0, s0,
1920 len1, len1, s1);
ffc61ed2 1921 });
3b0527fe 1922 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1923 goto got_it;
a3621e74 1924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1925 }
1926 else {
1927 dontbother = 0;
a0714e2c 1928 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1929 /* Trim the end. */
d6a28714 1930 char *last;
33b8afdf
JH
1931 SV* float_real;
1932
1933 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1934 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1935 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1936
1937 if (flags & REXEC_SCREAM) {
33b8afdf 1938 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1939 end_shift, &scream_pos, 1); /* last one */
1940 if (!last)
ffc61ed2 1941 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1942 /* we may be pointing at the wrong string */
1943 else if (RX_MATCH_COPIED(prog))
3f7c398e 1944 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1945 }
d6a28714
JH
1946 else {
1947 STRLEN len;
cfd0369c 1948 const char * const little = SvPV_const(float_real, len);
d6a28714 1949
33b8afdf 1950 if (SvTAIL(float_real)) {
d6a28714
JH
1951 if (memEQ(strend - len + 1, little, len - 1))
1952 last = strend - len + 1;
7fba1cd6 1953 else if (!multiline)
9041c2e3 1954 last = memEQ(strend - len, little, len)
bd61b366 1955 ? strend - len : NULL;
b8c5462f 1956 else
d6a28714
JH
1957 goto find_last;
1958 } else {
1959 find_last:
9041c2e3 1960 if (len)
d6a28714 1961 last = rninstr(s, strend, little, little + len);
b8c5462f 1962 else
a0288114 1963 last = strend; /* matching "$" */
b8c5462f 1964 }
b8c5462f 1965 }
bf93d4cc 1966 if (last == NULL) {
a3621e74 1967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1968 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1969 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1970 goto phooey; /* Should not happen! */
1971 }
d6a28714
JH
1972 dontbother = strend - last + prog->float_min_offset;
1973 }
1974 if (minlen && (dontbother < minlen))
1975 dontbother = minlen - 1;
1976 strend -= dontbother; /* this one's always in bytes! */
1977 /* We don't know much -- general case. */
1aa99e6b 1978 if (do_utf8) {
d6a28714 1979 for (;;) {
3b0527fe 1980 if (regtry(&reginfo, s))
d6a28714
JH
1981 goto got_it;
1982 if (s >= strend)
1983 break;
b8c5462f 1984 s += UTF8SKIP(s);
d6a28714
JH
1985 };
1986 }
1987 else {
1988 do {
3b0527fe 1989 if (regtry(&reginfo, s))
d6a28714
JH
1990 goto got_it;
1991 } while (s++ < strend);
1992 }
1993 }
1994
1995 /* Failure. */
1996 goto phooey;
1997
1998got_it:
1999 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2000
2001 if (PL_reg_eval_set) {
2002 /* Preserve the current value of $^R */
2003 if (oreplsv != GvSV(PL_replgv))
2004 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2005 restored, the value remains
2006 the same. */
4f639d21 2007 restore_pos(aTHX_ prog);
d6a28714
JH
2008 }
2009
2010 /* make sure $`, $&, $', and $digit will work later */
2011 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2012 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2013 if (flags & REXEC_COPY_STR) {
2014 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2015#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2016 if ((SvIsCOW(sv)
2017 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2018 if (DEBUG_C_TEST) {
2019 PerlIO_printf(Perl_debug_log,
2020 "Copy on write: regexp capture, type %d\n",
2021 (int) SvTYPE(sv));
2022 }
2023 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2024 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2025 assert (SvPOKp(prog->saved_copy));
2026 } else
2027#endif
2028 {
2029 RX_MATCH_COPIED_on(prog);
2030 s = savepvn(strbeg, i);
2031 prog->subbeg = s;
2032 }
d6a28714 2033 prog->sublen = i;
d6a28714
JH
2034 }
2035 else {
2036 prog->subbeg = strbeg;
2037 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2038 }
2039 }
9041c2e3 2040
d6a28714
JH
2041 return 1;
2042
2043phooey:
a3621e74 2044 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2045 PL_colors[4], PL_colors[5]));
d6a28714 2046 if (PL_reg_eval_set)
4f639d21 2047 restore_pos(aTHX_ prog);
d6a28714
JH
2048 return 0;
2049}
2050
2051/*
2052 - regtry - try match at specific point
2053 */
2054STATIC I32 /* 0 failure, 1 success */
3b0527fe 2055S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2056{
97aff369 2057 dVAR;
d6a28714
JH
2058 register I32 *sp;
2059 register I32 *ep;
2060 CHECKPOINT lastcp;
3b0527fe 2061 regexp *prog = reginfo->prog;
a3621e74 2062 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2063
02db2b7b
IZ
2064#ifdef DEBUGGING
2065 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2066#endif
d6a28714
JH
2067 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2068 MAGIC *mg;
2069
2070 PL_reg_eval_set = RS_init;
a3621e74 2071 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2072 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2073 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2074 ));
e8347627 2075 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2076 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2077 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2078 SAVETMPS;
2079 /* Apparently this is not needed, judging by wantarray. */
e8347627 2080 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2081 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2082
3b0527fe 2083 if (reginfo->sv) {
d6a28714 2084 /* Make $_ available to executed code. */
3b0527fe 2085 if (reginfo->sv != DEFSV) {
59f00321 2086 SAVE_DEFSV;
3b0527fe 2087 DEFSV = reginfo->sv;
b8c5462f 2088 }
d6a28714 2089
3b0527fe
DM
2090 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2091 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2092 /* prepare for quick setting of pos */
d300d9fa
NC
2093#ifdef PERL_OLD_COPY_ON_WRITE
2094 if (SvIsCOW(sv))
2095 sv_force_normal_flags(sv, 0);
2096#endif
3dab1dad 2097 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2098 &PL_vtbl_mglob, NULL, 0);
d6a28714 2099 mg->mg_len = -1;
b8c5462f 2100 }
d6a28714
JH
2101 PL_reg_magic = mg;
2102 PL_reg_oldpos = mg->mg_len;
4f639d21 2103 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2104 }
09687e5a 2105 if (!PL_reg_curpm) {
a02a5408 2106 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2107#ifdef USE_ITHREADS
2108 {
2109 SV* repointer = newSViv(0);
577e12cc 2110 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2111 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2112 av_push(PL_regex_padav,repointer);
2113 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2114 PL_regex_pad = AvARRAY(PL_regex_padav);
2115 }
2116#endif
2117 }
aaa362c4 2118 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2119 PL_reg_oldcurpm = PL_curpm;
2120 PL_curpm = PL_reg_curpm;
2121 if (RX_MATCH_COPIED(prog)) {
2122 /* Here is a serious problem: we cannot rewrite subbeg,
2123 since it may be needed if this match fails. Thus
2124 $` inside (?{}) could fail... */
2125 PL_reg_oldsaved = prog->subbeg;
2126 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2127#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2128 PL_nrs = prog->saved_copy;
2129#endif
d6a28714
JH
2130 RX_MATCH_COPIED_off(prog);
2131 }
2132 else
bd61b366 2133 PL_reg_oldsaved = NULL;
d6a28714
JH
2134 prog->subbeg = PL_bostr;
2135 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2136 }
973dddac 2137 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2138 PL_reginput = startpos;
2139 PL_regstartp = prog->startp;
2140 PL_regendp = prog->endp;
2141 PL_reglastparen = &prog->lastparen;
a01268b5 2142 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2143 prog->lastparen = 0;
03994de8 2144 prog->lastcloseparen = 0;
d6a28714 2145 PL_regsize = 0;
a3621e74 2146 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2147 if (PL_reg_start_tmpl <= prog->nparens) {
2148 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2149 if(PL_reg_start_tmp)
2150 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2151 else
a02a5408 2152 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2153 }
2154
2155 /* XXXX What this code is doing here?!!! There should be no need
2156 to do this again and again, PL_reglastparen should take care of
3dd2943c 2157 this! --ilya*/
dafc8851
JH
2158
2159 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2160 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2161 * PL_reglastparen), is not needed at all by the test suite
2162 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2163 * enough, for building DynaLoader, or otherwise this
2164 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2165 * will happen. Meanwhile, this code *is* needed for the
2166 * above-mentioned test suite tests to succeed. The common theme
2167 * on those tests seems to be returning null fields from matches.
2168 * --jhi */
dafc8851 2169#if 1
d6a28714
JH
2170 sp = prog->startp;
2171 ep = prog->endp;
2172 if (prog->nparens) {
097eb12c 2173 register I32 i;
eb160463 2174 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2175 *++sp = -1;
2176 *++ep = -1;
2177 }
2178 }
dafc8851 2179#endif
02db2b7b 2180 REGCP_SET(lastcp);
3b0527fe 2181 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2182 prog->endp[0] = PL_reginput - PL_bostr;
2183 return 1;
2184 }
02db2b7b 2185 REGCP_UNWIND(lastcp);
d6a28714
JH
2186 return 0;
2187}
2188
02db2b7b
IZ
2189#define RE_UNWIND_BRANCH 1
2190#define RE_UNWIND_BRANCHJ 2
2191
2192union re_unwind_t;
2193
2194typedef struct { /* XX: makes sense to enlarge it... */
2195 I32 type;
2196 I32 prev;
2197 CHECKPOINT lastcp;
2198} re_unwind_generic_t;
2199
2200typedef struct {
2201 I32 type;
2202 I32 prev;
2203 CHECKPOINT lastcp;
2204 I32 lastparen;
2205 regnode *next;
2206 char *locinput;
2207 I32 nextchr;
3a2830be 2208 int minmod;
02db2b7b
IZ
2209#ifdef DEBUGGING
2210 int regindent;
2211#endif
2212} re_unwind_branch_t;
2213
2214typedef union re_unwind_t {
2215 I32 type;
2216 re_unwind_generic_t generic;
2217 re_unwind_branch_t branch;
2218} re_unwind_t;
2219
8ba1375e
MJD
2220#define sayYES goto yes
2221#define sayNO goto no
e0f9d4a8 2222#define sayNO_ANYOF goto no_anyof
8ba1375e 2223#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2224#define sayNO_FINAL goto no_final
2225#define sayNO_SILENT goto do_no
2226#define saySAME(x) if (x) goto yes; else goto no
2227
3ab3c9b4
HS
2228#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2229#define POSCACHE_SEEN 1 /* we know what we're caching */
2230#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2231
3ab3c9b4 2232#define CACHEsayYES STMT_START { \
d8319b27 2233 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2234 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2235 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2236 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2237 } \
2238 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2239 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2240 } \
2241 else { \
3ab3c9b4
HS
2242 /* cache records failure, but this is success */ \
2243 DEBUG_r( \
2244 PerlIO_printf(Perl_debug_log, \
2245 "%*s (remove success from failure cache)\n", \
2246 REPORT_CODE_OFF+PL_regindent*2, "") \
2247 ); \
d8319b27 2248 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2249 } \
2250 } \
2251 sayYES; \
2252} STMT_END
7409bbd3 2253
3ab3c9b4 2254#define CACHEsayNO STMT_START { \
d8319b27 2255 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2256 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2257 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2258 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2259 } \
2260 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2261 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2262 } \
2263 else { \
3ab3c9b4
HS
2264 /* cache records success, but this is failure */ \
2265 DEBUG_r( \
2266 PerlIO_printf(Perl_debug_log, \
2267 "%*s (remove failure from success cache)\n", \
2268 REPORT_CODE_OFF+PL_regindent*2, "") \
2269 ); \
d8319b27 2270 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2271 } \
2272 } \
2273 sayNO; \
2274} STMT_END
2275
a3621e74
YO
2276/* this is used to determine how far from the left messages like
2277 'failed...' are printed. Currently 29 makes these messages line
2278 up with the opcode they refer to. Earlier perls used 25 which
2279 left these messages outdented making reviewing a debug output
2280 quite difficult.
2281*/
2282#define REPORT_CODE_OFF 29
2283
2284
2285/* Make sure there is a test for this +1 options in re_tests */
2286#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2287
9e137952
DM
2288/* this value indiciates that the c1/c2 "next char" test should be skipped */
2289#define CHRTEST_VOID -1000
2290
86545054
DM
2291#define SLAB_FIRST(s) (&(s)->states[0])
2292#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2293
5d9a96ca
DM
2294/* grab a new slab and return the first slot in it */
2295
2296STATIC regmatch_state *
2297S_push_slab(pTHX)
2298{
54df2634
NC
2299#if PERL_VERSION < 9
2300 dMY_CXT;
2301#endif
5d9a96ca
DM
2302 regmatch_slab *s = PL_regmatch_slab->next;
2303 if (!s) {
2304 Newx(s, 1, regmatch_slab);
2305 s->prev = PL_regmatch_slab;
2306 s->next = NULL;
2307 PL_regmatch_slab->next = s;
2308 }
2309 PL_regmatch_slab = s;
86545054 2310 return SLAB_FIRST(s);
5d9a96ca 2311}
5b47454d 2312
95b24440
DM
2313/* simulate a recursive call to regmatch */
2314
2315#define REGMATCH(ns, where) \
5d9a96ca
DM
2316 st->scan = scan; \
2317 scan = (ns); \
2318 st->resume_state = resume_##where; \
95b24440
DM
2319 goto start_recurse; \
2320 resume_point_##where:
2321
aa283a38
DM
2322
2323/* push a new regex state. Set newst to point to it */
2324
2325#define PUSH_STATE(newst, resume) \
2326 depth++; \
2327 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2328 st->scan = scan; \
2329 st->next = next; \
2330 st->n = n; \
2331 st->locinput = locinput; \
2332 st->resume_state = resume; \
2333 newst = st+1; \
86545054 2334 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2335 newst = S_push_slab(aTHX); \
2336 PL_regmatch_state = newst; \
2337 newst->cc = 0; \
2338 newst->minmod = 0; \
2339 newst->sw = 0; \
2340 newst->logical = 0; \
2341 newst->unwind = 0; \
2342 locinput = PL_reginput; \
2343 nextchr = UCHARAT(locinput);
2344
2345#define POP_STATE \
2346 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2347 depth--; \
2348 st--; \
86545054 2349 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2350 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2351 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2352 } \
2353 PL_regmatch_state = st; \
2354 scan = st->scan; \
2355 next = st->next; \
2356 n = st->n; \
2357 locinput = st->locinput; \
2358 nextchr = UCHARAT(locinput);
2359
d6a28714
JH
2360/*
2361 - regmatch - main matching routine
2362 *
2363 * Conceptually the strategy is simple: check to see whether the current
2364 * node matches, call self recursively to see whether the rest matches,
2365 * and then act accordingly. In practice we make some effort to avoid
2366 * recursion, in particular by going through "ordinary" nodes (that don't
2367 * need to know whether the rest of the match failed) by a loop instead of
2368 * by recursion.
2369 */
2370/* [lwall] I've hoisted the register declarations to the outer block in order to
2371 * maybe save a little bit of pushing and popping on the stack. It also takes
2372 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2373 *
2374 * This function used to be heavily recursive, but since this had the
2375 * effect of blowing the CPU stack on complex regexes, it has been
2376 * restructured to be iterative, and to save state onto the heap rather
2377 * than the stack. Essentially whereever regmatch() used to be called, it
2378 * pushes the current state, notes where to return, then jumps back into
2379 * the main loop.
2380 *
2381 * Originally the structure of this function used to look something like
2382
2383 S_regmatch() {
2384 int a = 1, b = 2;
2385 ...
2386 while (scan != NULL) {
5d9a96ca 2387 a++; // do stuff with a and b
95b24440
DM
2388 ...
2389 switch (OP(scan)) {
2390 case FOO: {
2391 int local = 3;
2392 ...
2393 if (regmatch(...)) // recurse
2394 goto yes;
2395 }
2396 ...
2397 }
2398 }
2399 yes:
2400 return 1;
2401 }
2402
2403 * Now it looks something like this:
2404
5d9a96ca 2405 typedef struct {
95b24440
DM
2406 int a, b, local;
2407 int resume_state;
5d9a96ca 2408 } regmatch_state;
95b24440
DM
2409
2410 S_regmatch() {
5d9a96ca
DM
2411 regmatch_state *st = new();
2412 int depth=0;
2413 st->a++; // do stuff with a and b
95b24440
DM
2414 ...
2415 while (scan != NULL) {
2416 ...
2417 switch (OP(scan)) {
2418 case FOO: {
5d9a96ca 2419 st->local = 3;
95b24440 2420 ...
5d9a96ca
DM
2421 st->scan = scan;
2422 scan = ...;
2423 st->resume_state = resume_FOO;
2424 goto start_recurse; // recurse
95b24440 2425
5d9a96ca
DM
2426 resume_point_FOO:
2427 if (result)
95b24440
DM
2428 goto yes;
2429 }
2430 ...
2431 }
5d9a96ca
DM
2432 start_recurse:
2433 st = new(); push a new state
2434 st->a = 1; st->b = 2;
2435 depth++;
95b24440 2436 }
5d9a96ca 2437 yes:
95b24440 2438 result = 1;
5d9a96ca
DM
2439 if (depth--) {
2440 st = pop();
95b24440
DM
2441 switch (resume_state) {
2442 case resume_FOO:
2443 goto resume_point_FOO;
2444 ...
2445 }
2446 }
2447 return result
2448 }
2449
2450 * WARNING: this means that any line in this function that contains a
2451 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2452 * regmatch() using gotos instead. Thus the values of any local variables
2453 * not saved in the regmatch_state structure will have been lost when
2454 * execution resumes on the next line .
5d9a96ca
DM
2455 *
2456 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2457 * PL_regmatch_state always points to the currently active state, and
2458 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2459 * The first time regmatch is called, the first slab is allocated, and is
2460 * never freed until interpreter desctruction. When the slab is full,
2461 * a new one is allocated chained to the end. At exit from regmatch, slabs
2462 * allocated since entry are freed.
d6a28714 2463 */
95b24440 2464
3dab1dad 2465#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2466
d6a28714 2467STATIC I32 /* 0 failure, 1 success */
3b0527fe 2468S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2469{
54df2634
NC
2470#if PERL_VERSION < 9
2471 dMY_CXT;
2472#endif
27da23d5 2473 dVAR;
95b24440 2474 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2475 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2476
3b0527fe
DM
2477 regexp *rex = reginfo->prog;
2478
5d9a96ca
DM
2479 regmatch_slab *orig_slab;
2480 regmatch_state *orig_state;
a3621e74 2481
5d9a96ca
DM
2482 /* the current state. This is a cached copy of PL_regmatch_state */
2483 register regmatch_state *st;
95b24440 2484
5d9a96ca
DM
2485 /* cache heavy used fields of st in registers */
2486 register regnode *scan;
2487 register regnode *next;
2488 register I32 n = 0; /* initialize to shut up compiler warning */
2489 register char *locinput = PL_reginput;
95b24440 2490
5d9a96ca
DM
2491 /* these variables are NOT saved during a recusive RFEGMATCH: */
2492 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2493 bool result; /* return value of S_regmatch */
2494 regnode *inner; /* Next node in internal branch. */
2495 int depth = 0; /* depth of recursion */
aa283a38 2496 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2497 regmatch_state *yes_state = NULL; /* state to pop to on success of
2498 subpattern */
95b24440
DM
2499
2500#ifdef DEBUGGING
ab74612d 2501 SV *re_debug_flags = NULL;
a3621e74 2502 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2503 PL_regindent++;
2504#endif
2505
5d9a96ca
DM
2506 /* on first ever call to regmatch, allocate first slab */
2507 if (!PL_regmatch_slab) {
2508 Newx(PL_regmatch_slab, 1, regmatch_slab);
2509 PL_regmatch_slab->prev = NULL;
2510 PL_regmatch_slab->next = NULL;
86545054 2511 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2512 }
2513
2514 /* remember current high-water mark for exit */
2515 /* XXX this should be done with SAVE* instead */
2516 orig_slab = PL_regmatch_slab;
2517 orig_state = PL_regmatch_state;
2518
2519 /* grab next free state slot */
2520 st = ++PL_regmatch_state;
86545054 2521 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2522 st = PL_regmatch_state = S_push_slab(aTHX);
2523
2524 st->minmod = 0;
2525 st->sw = 0;
2526 st->logical = 0;
2527 st->unwind = 0;
2528 st->cc = NULL;
d6a28714
JH
2529 /* Note that nextchr is a byte even in UTF */
2530 nextchr = UCHARAT(locinput);
2531 scan = prog;
2532 while (scan != NULL) {
8ba1375e 2533
a3621e74 2534 DEBUG_EXECUTE_r( {
6136c704 2535 SV * const prop = sv_newmortal();
1df70142
AL
2536 const int docolor = *PL_colors[0];
2537 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2538 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2539 /* The part of the string before starttry has one color
2540 (pref0_len chars), between starttry and current
2541 position another one (pref_len - pref0_len chars),
2542 after the current position the third one.
2543 We assume that pref0_len <= pref_len, otherwise we
2544 decrease pref0_len. */
9041c2e3 2545 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2546 ? (5 + taill) - l : locinput - PL_bostr;
2547 int pref0_len;
d6a28714 2548
df1ffd02 2549 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2550 pref_len++;
2551 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2552 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2553 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2554 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2555 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2556 l--;
d6a28714
JH
2557 if (pref0_len < 0)
2558 pref0_len = 0;
2559 if (pref0_len > pref_len)
2560 pref0_len = pref_len;
32fc9b6a 2561 regprop(rex, prop, scan);
2a782b5b 2562 {
1df70142 2563 const char * const s0 =
f14c76ed 2564 do_utf8 && OP(scan) != CANY ?
95b24440 2565 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2566 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2567 locinput - pref_len;
bb7a0f54 2568 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
1df70142 2569 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2570 pv_uni_display(PERL_DEBUG_PAD(1),
2571 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2572 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2573 locinput - pref_len + pref0_len;
bb7a0f54 2574 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
1df70142 2575 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2576 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2577 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2578 locinput;
bb7a0f54 2579 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2a782b5b
JH
2580 PerlIO_printf(Perl_debug_log,
2581 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2582 (IV)(locinput - PL_bostr),
2583 PL_colors[4],
2584 len0, s0,
2585 PL_colors[5],
2586 PL_colors[2],
2587 len1, s1,
2588 PL_colors[3],
2589 (docolor ? "" : "> <"),
2590 PL_colors[0],
2591 len2, s2,
2592 PL_colors[1],
2593 15 - l - pref_len + 1,
2594 "",
4f639d21 2595 (IV)(scan - rex->program), PL_regindent*2, "",
3f7c398e 2596 SvPVX_const(prop));
2a782b5b
JH
2597 }
2598 });
d6a28714
JH
2599
2600 next = scan + NEXT_OFF(scan);
2601 if (next == scan)
2602 next = NULL;
2603
2604 switch (OP(scan)) {
2605 case BOL:
7fba1cd6 2606 if (locinput == PL_bostr)
d6a28714 2607 {
3b0527fe 2608 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2609 break;
2610 }
d6a28714
JH
2611 sayNO;
2612 case MBOL:
12d33761
HS
2613 if (locinput == PL_bostr ||
2614 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2615 {
b8c5462f
JH
2616 break;
2617 }
d6a28714
JH
2618 sayNO;
2619 case SBOL:
c2a73568 2620 if (locinput == PL_bostr)
b8c5462f 2621 break;
d6a28714
JH
2622 sayNO;
2623 case GPOS:
3b0527fe 2624 if (locinput == reginfo->ganch)
d6a28714
JH
2625 break;
2626 sayNO;
2627 case EOL:
d6a28714
JH
2628 goto seol;
2629 case MEOL:
d6a28714 2630 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2631 sayNO;
b8c5462f 2632 break;
d6a28714
JH
2633 case SEOL:
2634 seol:
2635 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2636 sayNO;
d6a28714 2637 if (PL_regeol - locinput > 1)
b8c5462f 2638 sayNO;
b8c5462f 2639 break;
d6a28714
JH
2640 case EOS:
2641 if (PL_regeol != locinput)
b8c5462f 2642 sayNO;
d6a28714 2643 break;
ffc61ed2 2644 case SANY:
d6a28714 2645 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2646 sayNO;
f33976b4
DB
2647 if (do_utf8) {
2648 locinput += PL_utf8skip[nextchr];
2649 if (locinput > PL_regeol)
2650 sayNO;
2651 nextchr = UCHARAT(locinput);
2652 }
2653 else
2654 nextchr = UCHARAT(++locinput);
2655 break;
2656 case CANY:
2657 if (!nextchr && locinput >= PL_regeol)
2658 sayNO;
b8c5462f 2659 nextchr = UCHARAT(++locinput);
a0d0e21e 2660 break;
ffc61ed2 2661 case REG_ANY:
1aa99e6b
IH
2662 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2663 sayNO;
2664 if (do_utf8) {
b8c5462f 2665 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2666 if (locinput > PL_regeol)
2667 sayNO;
a0ed51b3 2668 nextchr = UCHARAT(locinput);
a0ed51b3 2669 }
1aa99e6b
IH
2670 else
2671 nextchr = UCHARAT(++locinput);
a0ed51b3 2672 break;
a3621e74
YO
2673
2674
2675
2676 /*
2677 traverse the TRIE keeping track of all accepting states
2678 we transition through until we get to a failing node.
a3621e74 2679 */
5b47454d 2680 case TRIE:
3dab1dad
YO
2681 {
2682 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2683 trie_type = do_utf8 ?
2684 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2685 : trie_plain;
2686
2687 /* what trie are we using right now */
2688 reg_trie_data *trie
2689 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2690 U32 state = trie->startstate;
2691
2692 if (trie->bitmap && trie_type != trie_utf8_fold &&
2693 !TRIE_BITMAP_TEST(trie,*locinput)
2694 ) {
2695 if (trie->states[ state ].wordnum) {
2696 DEBUG_EXECUTE_r(
2697 PerlIO_printf(Perl_debug_log,
2698 "%*s %smatched empty string...%s\n",
2699 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2700 );
2701 break;
2702 } else {
2703 DEBUG_EXECUTE_r(
2704 PerlIO_printf(Perl_debug_log,
2705 "%*s %sfailed to match start class...%s\n",
2706 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2707 );
2708 sayNO_SILENT;
2709 }
2710 }
a3621e74 2711 {
a3621e74 2712 U8 *uc = ( U8* )locinput;
a3621e74
YO
2713 U16 charid = 0;
2714 U32 base = 0;
2715 UV uvc = 0;
2716 STRLEN len = 0;
2717 STRLEN foldlen = 0;
a3621e74
YO
2718 U8 *uscan = (U8*)NULL;
2719 STRLEN bufflen=0;
95b24440 2720 SV *sv_accept_buff = NULL;
5b47454d 2721
d8319b27 2722 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2723 result = 0;
a3621e74
YO
2724
2725 while ( state && uc <= (U8*)PL_regeol ) {
2726
5b47454d 2727 if (trie->states[ state ].wordnum) {
d8319b27 2728 if (!st->u.trie.accepted ) {
5b47454d
DM
2729 ENTER;
2730 SAVETMPS;
2731 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2732 sv_accept_buff=newSV(bufflen *
2733 sizeof(reg_trie_accepted) - 1);
2734 SvCUR_set(sv_accept_buff,
2735 sizeof(reg_trie_accepted));
2736 SvPOK_on(sv_accept_buff);
2737 sv_2mortal(sv_accept_buff);
d8319b27 2738 st->u.trie.accept_buff =
5b47454d
DM
2739 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2740 }
2741 else {
d8319b27 2742 if (st->u.trie.accepted >= bufflen) {
5b47454d 2743 bufflen *= 2;
d8319b27 2744 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2745 SvGROW(sv_accept_buff,
2746 bufflen * sizeof(reg_trie_accepted));
2747 }
2748 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2749 + sizeof(reg_trie_accepted));
2750 }
d8319b27
DM
2751 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2752 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2753 ++st->u.trie.accepted;
5b47454d 2754 }
a3621e74
YO
2755
2756 base = trie->states[ state ].trans.base;
2757
2758 DEBUG_TRIE_EXECUTE_r(
2759 PerlIO_printf( Perl_debug_log,
e4584336 2760 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2761 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27 2762 (UV)state, (UV)base, (UV)st->u.trie.accepted );
a3621e74
YO
2763 );
2764
2765 if ( base ) {
5b47454d 2766 switch (trie_type) {
3dab1dad 2767 case trie_utf8_fold:
a3621e74
YO
2768 if ( foldlen>0 ) {
2769 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2770 foldlen -= len;
2771 uscan += len;
2772 len=0;
2773 } else {
1df70142 2774 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2775 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2776 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2777 foldlen -= UNISKIP( uvc );
2778 uscan = foldbuf + UNISKIP( uvc );
2779 }
5b47454d
DM
2780 break;
2781 case trie_utf8:
2782 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2783 &len, uniflags );
2784 break;
2785 case trie_plain:
e4584336 2786 uvc = (UV)*uc;
a3621e74
YO
2787 len = 1;
2788 }
2789
5b47454d
DM
2790 if (uvc < 256) {
2791 charid = trie->charmap[ uvc ];
2792 }
2793 else {
2794 charid = 0;
2795 if (trie->widecharmap) {
3dab1dad 2796 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2797 (char*)&uvc, sizeof(UV), 0);
2798 if (svpp)
2799 charid = (U16)SvIV(*svpp);
2800 }
2801 }
a3621e74 2802
5b47454d
DM
2803 if (charid &&
2804 (base + charid > trie->uniquecharcount )
2805 && (base + charid - 1 - trie->uniquecharcount
2806 < trie->lasttrans)
2807 && trie->trans[base + charid - 1 -
2808 trie->uniquecharcount].check == state)
2809 {
2810 state = trie->trans[base + charid - 1 -
2811 trie->uniquecharcount ].next;
2812 }
2813 else {
2814 state = 0;
2815 }
2816 uc += len;
2817
2818 }
2819 else {
a3621e74
YO
2820 state = 0;
2821 }
2822 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2823 PerlIO_printf( Perl_debug_log,
2824 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2825 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2826 );
2827 }
d8319b27 2828 if (!st->u.trie.accepted )
a3621e74 2829 sayNO;
a3621e74
YO
2830
2831 /*
2832 There was at least one accepting state that we
2833 transitioned through. Presumably the number of accepting
2834 states is going to be low, typically one or two. So we
2835 simply scan through to find the one with lowest wordnum.
2836 Once we find it, we swap the last state into its place
2837 and decrement the size. We then try to match the rest of
2838 the pattern at the point where the word ends, if we
2839 succeed then we end the loop, otherwise the loop
2840 eventually terminates once all of the accepting states
2841 have been tried.
2842 */
a3621e74 2843
d8319b27 2844 if ( st->u.trie.accepted == 1 ) {
a3621e74 2845 DEBUG_EXECUTE_r({
097eb12c 2846 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 );
a3621e74
YO
2847 PerlIO_printf( Perl_debug_log,
2848 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2849 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2850 st->u.trie.accept_buff[ 0 ].wordnum,
cfd0369c 2851 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2852 PL_colors[5] );
2853 });
d8319b27 2854 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
2855 /* in this case we free tmps/leave before we call regmatch
2856 as we wont be using accept_buff again. */
2857 FREETMPS;
2858 LEAVE;
95b24440
DM
2859 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2860 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2861 } else {
2862 DEBUG_EXECUTE_r(
e4584336 2863 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 2864 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
2865 PL_colors[5] );
2866 );
d8319b27 2867 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
2868 U32 best = 0;
2869 U32 cur;
d8319b27 2870 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
2871 DEBUG_TRIE_EXECUTE_r(
2872 PerlIO_printf( Perl_debug_log,
2873 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2874 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
2875 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
2876 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 2877 );
a3621e74 2878
d8319b27
DM
2879 if (st->u.trie.accept_buff[cur].wordnum <
2880 st->u.trie.accept_buff[best].wordnum)
e822a8b4 2881 best = cur;
a3621e74
YO
2882 }
2883 DEBUG_EXECUTE_r({
87830502 2884 reg_trie_data * const trie = (reg_trie_data*)
32fc9b6a 2885 rex->data->data[ARG(scan)];
d8319b27 2886 SV ** const tmp = av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 );
3dab1dad 2887 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
e4584336 2888 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
d8319b27 2889 st->u.trie.accept_buff[best].wordnum,
3dab1dad 2890 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
a3621e74
YO
2891 PL_colors[5] );
2892 });
d8319b27
DM
2893 if ( best<st->u.trie.accepted ) {
2894 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
2895 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
2896 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
2897 best = st->u.trie.accepted;
a3621e74 2898 }
d8319b27 2899 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
2900
2901 /*
2902 as far as I can tell we only need the SAVETMPS/FREETMPS
2903 for re's with EVAL in them but I'm leaving them in for
2904 all until I can be sure.
2905 */
2906 SAVETMPS;
95b24440
DM
2907 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2908 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2909 FREETMPS;
2910 }
2911 FREETMPS;
2912 LEAVE;
2913 }
2914
95b24440 2915 if (result) {
a3621e74
YO
2916 sayYES;
2917 } else {
2918 sayNO;
2919 }
3dab1dad 2920 }}
a3621e74 2921 /* unreached codepoint */
95b24440
DM
2922 case EXACT: {
2923 char *s = STRING(scan);
5d9a96ca 2924 st->ln = STR_LEN(scan);
eb160463 2925 if (do_utf8 != UTF) {
bc517b45 2926 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2927 char *l = locinput;
5d9a96ca 2928 const char *e = s + st->ln;
a72c7584 2929
5ff6fc6d
JH
2930 if (do_utf8) {
2931 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2932 while (s < e) {
a3b680e6 2933 STRLEN ulen;
1aa99e6b 2934 if (l >= PL_regeol)
5ff6fc6d
JH
2935 sayNO;
2936 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2937 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2938 uniflags))
5ff6fc6d 2939 sayNO;
bc517b45 2940 l += ulen;
5ff6fc6d 2941 s ++;
1aa99e6b 2942 }
5ff6fc6d
JH
2943 }
2944 else {
2945 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2946 while (s < e) {
a3b680e6 2947 STRLEN ulen;
1aa99e6b
IH
2948 if (l >= PL_regeol)
2949 sayNO;
5ff6fc6d 2950 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2951 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2952 uniflags))
1aa99e6b 2953 sayNO;
bc517b45 2954 s += ulen;
a72c7584 2955 l ++;
1aa99e6b 2956 }
5ff6fc6d 2957 }
1aa99e6b
IH
2958 locinput = l;
2959 nextchr = UCHARAT(locinput);
2960 break;
2961 }
bc517b45 2962 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2963 /* Inline the first character, for speed. */
2964 if (UCHARAT(s) != nextchr)
2965 sayNO;
5d9a96ca 2966 if (PL_regeol - locinput < st->ln)
d6a28714 2967 sayNO;
5d9a96ca 2968 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2969 sayNO;
5d9a96ca 2970 locinput += st->ln;
d6a28714
JH
2971 nextchr = UCHARAT(locinput);
2972 break;
95b24440 2973 }
d6a28714 2974 case EXACTFL:
b8c5462f
JH
2975 PL_reg_flags |= RF_tainted;
2976 /* FALL THROUGH */
95b24440
DM
2977 case EXACTF: {
2978 char *s = STRING(scan);
5d9a96ca 2979 st->ln = STR_LEN(scan);
d6a28714 2980
d07ddd77
JH
2981 if (do_utf8 || UTF) {
2982 /* Either target or the pattern are utf8. */
d6a28714 2983 char *l = locinput;
d07ddd77 2984 char *e = PL_regeol;
bc517b45 2985
5d9a96ca 2986 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2987 l, &e, 0, do_utf8)) {
5486206c
JH
2988 /* One more case for the sharp s:
2989 * pack("U0U*", 0xDF) =~ /ss/i,
2990 * the 0xC3 0x9F are the UTF-8
2991 * byte sequence for the U+00DF. */
2992 if (!(do_utf8 &&
2993 toLOWER(s[0]) == 's' &&
5d9a96ca 2994 st->ln >= 2 &&
5486206c
JH
2995 toLOWER(s[1]) == 's' &&
2996 (U8)l[0] == 0xC3 &&
2997 e - l >= 2 &&
2998 (U8)l[1] == 0x9F))
2999 sayNO;
3000 }
d07ddd77
JH
3001 locinput = e;
3002 nextchr = UCHARAT(locinput);
3003 break;
a0ed51b3 3004 }
d6a28714 3005
bc517b45
JH
3006 /* Neither the target and the pattern are utf8. */
3007
d6a28714
JH
3008 /* Inline the first character, for speed. */
3009 if (UCHARAT(s) != nextchr &&
3010 UCHARAT(s) != ((OP(scan) == EXACTF)
3011 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3012 sayNO;
5d9a96ca 3013 if (PL_regeol - locinput < st->ln)
b8c5462f 3014 sayNO;
5d9a96ca
DM
3015 if (st->ln > 1 && (OP(scan) == EXACTF
3016 ? ibcmp(s, locinput, st->ln)
3017 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3018 sayNO;
5d9a96ca 3019 locinput += st->ln;
d6a28714 3020 nextchr = UCHARAT(locinput);
a0d0e21e 3021 break;
95b24440 3022 }
d6a28714 3023 case ANYOF:
ffc61ed2 3024 if (do_utf8) {
9e55ce06
JH
3025 STRLEN inclasslen = PL_regeol - locinput;
3026
32fc9b6a 3027 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3028 sayNO_ANYOF;
ffc61ed2
JH
3029 if (locinput >= PL_regeol)
3030 sayNO;
0f0076b4 3031 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3032 nextchr = UCHARAT(locinput);
e0f9d4a8 3033 break;
ffc61ed2
JH
3034 }
3035 else {
3036 if (nextchr < 0)
3037 nextchr = UCHARAT(locinput);
32fc9b6a 3038 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3039 sayNO_ANYOF;
ffc61ed2
JH
3040 if (!nextchr && locinput >= PL_regeol)
3041 sayNO;
3042 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3043 break;
3044 }
3045 no_anyof:
3046 /* If we might have the case of the German sharp s
3047 * in a casefolding Unicode character class. */
3048
ebc501f0
JH
3049 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3050 locinput += SHARP_S_SKIP;
e0f9d4a8 3051 nextchr = UCHARAT(locinput);
ffc61ed2 3052 }
e0f9d4a8
JH
3053 else
3054 sayNO;
b8c5462f 3055 break;
d6a28714 3056 case ALNUML:
b8c5462f
JH
3057 PL_reg_flags |= RF_tainted;
3058 /* FALL THROUGH */
d6a28714 3059 case ALNUM:
b8c5462f 3060 if (!nextchr)
4633a7c4 3061 sayNO;
ffc61ed2 3062 if (do_utf8) {
1a4fad37 3063 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3064 if (!(OP(scan) == ALNUM
bb7a0f54 3065 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3066 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3067 {
3068 sayNO;
a0ed51b3 3069 }
b8c5462f 3070 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3071 nextchr = UCHARAT(locinput);
3072 break;
3073 }
ffc61ed2 3074 if (!(OP(scan) == ALNUM
d6a28714 3075 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3076 sayNO;
b8c5462f 3077 nextchr = UCHARAT(++locinput);
a0d0e21e 3078 break;
d6a28714 3079 case NALNUML:
b8c5462f
JH
3080 PL_reg_flags |= RF_tainted;
3081 /* FALL THROUGH */
d6a28714
JH
3082 case NALNUM:
3083 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3084 sayNO;
ffc61ed2 3085 if (do_utf8) {
1a4fad37 3086 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3087 if (OP(scan) == NALNUM
bb7a0f54 3088 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3089 : isALNUM_LC_utf8((U8*)locinput))
3090 {
b8c5462f 3091 sayNO;
d6a28714 3092 }
b8c5462f
JH
3093 locinput += PL_utf8skip[nextchr];
3094 nextchr = UCHARAT(locinput);
3095 break;
3096 }
ffc61ed2 3097 if (OP(scan) == NALNUM
d6a28714 3098 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3099 sayNO;
76e3520e 3100 nextchr = UCHARAT(++locinput);
a0d0e21e 3101 break;
d6a28714
JH
3102 case BOUNDL:
3103 case NBOUNDL:
3280af22 3104 PL_reg_flags |= RF_tainted;
bbce6d69 3105 /* FALL THROUGH */
d6a28714
JH
3106 case BOUND:
3107 case NBOUND:
3108 /* was last char in word? */
ffc61ed2 3109 if (do_utf8) {
12d33761 3110 if (locinput == PL_bostr)
5d9a96ca 3111 st->ln = '\n';
ffc61ed2 3112 else {
a3b680e6 3113 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3114
4ad0818d 3115 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3116 }
3117 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3118 st->ln = isALNUM_uni(st->ln);
1a4fad37 3119 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3120 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3121 }
3122 else {
5d9a96ca 3123 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3124 n = isALNUM_LC_utf8((U8*)locinput);
3125 }
a0ed51b3 3126 }
d6a28714 3127 else {
5d9a96ca 3128 st->ln = (locinput != PL_bostr) ?
12d33761 3129 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3130 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3131 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3132 n = isALNUM(nextchr);
3133 }
3134 else {
5d9a96ca 3135 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3136 n = isALNUM_LC(nextchr);
3137 }
d6a28714 3138 }
5d9a96ca 3139 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3140 OP(scan) == BOUNDL))
3141 sayNO;
a0ed51b3 3142 break;
d6a28714 3143 case SPACEL:
3280af22 3144 PL_reg_flags |= RF_tainted;
bbce6d69 3145 /* FALL THROUGH */
d6a28714 3146 case SPACE:
9442cb0e 3147 if (!nextchr)
4633a7c4 3148 sayNO;
1aa99e6b 3149 if (do_utf8) {
fd400ab9 3150 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3151 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3152 if (!(OP(scan) == SPACE
bb7a0f54 3153 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3154 : isSPACE_LC_utf8((U8*)locinput)))
3155 {
3156 sayNO;
3157 }
3158 locinput += PL_utf8skip[nextchr];
3159 nextchr = UCHARAT(locinput);
3160 break;
d6a28714 3161 }
ffc61ed2
JH
3162 if (!(OP(scan) == SPACE
3163 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3164 sayNO;
3165 nextchr = UCHARAT(++locinput);
3166 }
3167 else {
3168 if (!(OP(scan) == SPACE
3169 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3170 sayNO;
3171 nextchr = UCHARAT(++locinput);
a0ed51b3 3172 }
a0ed51b3 3173 break;
d6a28714 3174 case NSPACEL:
3280af22 3175 PL_reg_flags |= RF_tainted;
bbce6d69 3176 /* FALL THROUGH */
d6a28714 3177 case NSPACE:
9442cb0e 3178 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3179 sayNO;
1aa99e6b 3180 if (do_utf8) {
1a4fad37 3181 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3182 if (OP(scan) == NSPACE
bb7a0f54 3183 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3184 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3185 {
3186 sayNO;
3187 }
3188 locinput += PL_utf8skip[nextchr];
3189 nextchr = UCHARAT(locinput);
3190 break;
a0ed51b3 3191 }
ffc61ed2 3192 if (OP(scan) == NSPACE
d6a28714 3193 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3194 sayNO;
76e3520e 3195 nextchr = UCHARAT(++locinput);
a0d0e21e 3196 break;
d6a28714 3197 case DIGITL:
a0ed51b3
LW
3198 PL_reg_flags |= RF_tainted;
3199 /* FALL THROUGH */
d6a28714 3200 case DIGIT:
9442cb0e 3201 if (!nextchr)
a0ed51b3 3202 sayNO;
1aa99e6b 3203 if (do_utf8) {
1a4fad37 3204 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3205 if (!(OP(scan) == DIGIT
bb7a0f54 3206 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3207 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3208 {
a0ed51b3 3209 sayNO;
dfe13c55 3210 }
6f06b55f 3211 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3212 nextchr = UCHARAT(locinput);
3213 break;
3214 }
ffc61ed2 3215 if (!(OP(scan) == DIGIT
9442cb0e 3216 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3217 sayNO;
3218 nextchr = UCHARAT(++locinput);
3219 break;
d6a28714 3220 case NDIGITL:
b8c5462f
JH
3221 PL_reg_flags |= RF_tainted;
3222 /* FALL THROUGH */
d6a28714 3223 case NDIGIT:
9442cb0e 3224 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3225 sayNO;
1aa99e6b 3226 if (do_utf8) {
1a4fad37 3227 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3228 if (OP(scan) == NDIGIT
bb7a0f54 3229 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3230 : isDIGIT_LC_utf8((U8*)locinput))
3231 {
a0ed51b3 3232 sayNO;
9442cb0e 3233 }
6f06b55f 3234 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3235 nextchr = UCHARAT(locinput);
3236 break;
3237 }
ffc61ed2 3238 if (OP(scan) == NDIGIT
9442cb0e 3239 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3240 sayNO;
3241 nextchr = UCHARAT(++locinput);
3242 break;
3243 case CLUMP:
b7c83a7e 3244 if (locinput >= PL_regeol)
a0ed51b3 3245 sayNO;
b7c83a7e 3246 if (do_utf8) {
1a4fad37 3247 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3248 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3249 sayNO;
3250 locinput += PL_utf8skip[nextchr];
3251 while (locinput < PL_regeol &&
3252 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3253 locinput += UTF8SKIP(locinput);
3254 if (locinput > PL_regeol)
3255 sayNO;
eb08e2da
JH
3256 }
3257 else
3258 locinput++;
a0ed51b3
LW
3259 nextchr = UCHARAT(locinput);
3260 break;
c8756f30 3261 case REFFL:
3280af22 3262 PL_reg_flags |= RF_tainted;
c8756f30 3263 /* FALL THROUGH */
c277df42 3264 case REF:
95b24440
DM
3265 case REFF: {
3266 char *s;
c277df42 3267 n = ARG(scan); /* which paren pair */
5d9a96ca 3268 st->ln = PL_regstartp[n];
2c2d71f5 3269 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3270 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3271 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3272 if (st->ln == PL_regendp[n])
a0d0e21e 3273 break;
a0ed51b3 3274
5d9a96ca 3275 s = PL_bostr + st->ln;
1aa99e6b 3276 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3277 char *l = locinput;
a3b680e6 3278 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3279 /*
3280 * Note that we can't do the "other character" lookup trick as
3281 * in the 8-bit case (no pun intended) because in Unicode we
3282 * have to map both upper and title case to lower case.
3283 */
3284 if (OP(scan) == REFF) {
3285 while (s < e) {
a3b680e6
AL
3286 STRLEN ulen1, ulen2;
3287 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3288 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3289
a0ed51b3
LW
3290 if (l >= PL_regeol)
3291 sayNO;
a2a2844f
JH
3292 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3293 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3294 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3295 sayNO;
a2a2844f
JH
3296 s += ulen1;
3297 l += ulen2;
a0ed51b3
LW
3298 }
3299 }
3300 locinput = l;
3301 nextchr = UCHARAT(locinput);
3302 break;
3303 }
3304
a0d0e21e 3305 /* Inline the first character, for speed. */
76e3520e 3306 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3307 (OP(scan) == REF ||
3308 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3309 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3310 sayNO;
5d9a96ca
DM
3311 st->ln = PL_regendp[n] - st->ln;
3312 if (locinput + st->ln > PL_regeol)
4633a7c4 3313 sayNO;
5d9a96ca
DM
3314 if (st->ln > 1 && (OP(scan) == REF
3315 ? memNE(s, locinput, st->ln)
c8756f30 3316 : (OP(scan) == REFF
5d9a96ca
DM
3317 ? ibcmp(s, locinput, st->ln)
3318 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3319 sayNO;
5d9a96ca 3320 locinput += st->ln;
76e3520e 3321 nextchr = UCHARAT(locinput);
a0d0e21e 3322 break;
95b24440 3323 }
a0d0e21e
LW
3324
3325 case NOTHING:
c277df42 3326 case TAIL:
a0d0e21e
LW
3327 break;
3328 case BACK:
3329 break;
c277df42
IZ
3330 case EVAL:
3331 {
c277df42 3332 SV *ret;
8e5e9ebe 3333 {
4aabdb9b
DM
3334 /* execute the code in the {...} */
3335 dSP;
6136c704 3336 SV ** const before = SP;
4aabdb9b
DM
3337 OP_4tree * const oop = PL_op;
3338 COP * const ocurcop = PL_curcop;
3339 PAD *old_comppad;
4aabdb9b
DM
3340
3341 n = ARG(scan);
32fc9b6a 3342 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3343 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3344 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3345 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3346
8e5e9ebe
RGS
3347 CALLRUNOPS(aTHX); /* Scalar context. */
3348 SPAGAIN;
3349 if (SP == before)
075aa684 3350 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3351 else {
3352 ret = POPs;
3353 PUTBACK;
3354 }
4aabdb9b
DM
3355
3356 PL_op = oop;
3357 PAD_RESTORE_LOCAL(old_comppad);
3358 PL_curcop = ocurcop;
3359 if (!st->logical) {
3360 /* /(?{...})/ */
3361 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3362 break;
3363 }
8e5e9ebe 3364 }
4aabdb9b
DM
3365 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3366 regexp *re;
4aabdb9b 3367 {
4f639d21
DM
3368 /* extract RE object from returned value; compiling if
3369 * necessary */
3370
6136c704 3371 MAGIC *mg = NULL;
4aabdb9b 3372 SV *sv;
faf82a0b
AE
3373 if(SvROK(ret) &&am