This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove a redundant test on mg->mg_ptr. (It had to be non-NULL to get
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
97aff369 181 dVAR;
a3b680e6 182 const int retval = PL_savestack_ix;
b1ce53c5 183#define REGCP_PAREN_ELEMS 4
a3b680e6 184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
185 int p;
186
e49a9654
IH
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189
a01268b5 190#define REGCP_OTHER_ELEMS 6
4b3c1a47 191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 192 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 193/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
3280af22 196 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
197 SSPUSHINT(p);
198 }
b1ce53c5 199/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
a01268b5 202 SSPUSHINT(*PL_reglastcloseparen);
3280af22 203 SSPUSHPTR(PL_reginput);
41123dfd
JH
204#define REGCP_FRAME_ELEMS 2
205/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 209
a0d0e21e
LW
210 return retval;
211}
212
c277df42 213/* These are needed since we do not localize EVAL nodes: */
a3621e74 214# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 217
a3621e74 218# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 219 PerlIO_printf(Perl_debug_log, \
faccc32b 220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 222
76e3520e 223STATIC char *
cea2e8a9 224S_regcppop(pTHX)
a0d0e21e 225{
97aff369 226 dVAR;
b1ce53c5 227 I32 i;
a0d0e21e
LW
228 U32 paren = 0;
229 char *input;
b1ce53c5 230
a3621e74
YO
231 GET_RE_DEBUG_FLAGS_DECL;
232
b1ce53c5 233 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 234 i = SSPOPINT;
b1ce53c5
JH
235 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
236 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 237 input = (char *) SSPOPPTR;
a01268b5 238 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
239 *PL_reglastparen = SSPOPINT;
240 PL_regsize = SSPOPINT;
b1ce53c5
JH
241
242 /* Now restore the parentheses context. */
41123dfd
JH
243 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
244 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 245 I32 tmps;
a0d0e21e 246 paren = (U32)SSPOPINT;
3280af22 247 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
248 PL_regstartp[paren] = SSPOPINT;
249 tmps = SSPOPINT;
3280af22
NIS
250 if (paren <= *PL_reglastparen)
251 PL_regendp[paren] = tmps;
a3621e74 252 DEBUG_EXECUTE_r(
c3464db5 253 PerlIO_printf(Perl_debug_log,
b900a521 254 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 255 (UV)paren, (IV)PL_regstartp[paren],
b900a521 256 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 257 (IV)PL_regendp[paren],
3280af22 258 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 259 );
a0d0e21e 260 }
a3621e74 261 DEBUG_EXECUTE_r(
eb160463 262 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 263 PerlIO_printf(Perl_debug_log,
faccc32b
JH
264 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
265 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
266 }
267 );
daf18116 268#if 1
dafc8851
JH
269 /* It would seem that the similar code in regtry()
270 * already takes care of this, and in fact it is in
271 * a better location to since this code can #if 0-ed out
272 * but the code in regtry() is needed or otherwise tests
273 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
274 * (as of patchlevel 7877) will fail. Then again,
275 * this code seems to be necessary or otherwise
276 * building DynaLoader will fail:
277 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
278 * --jhi */
eb160463
GS
279 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
280 if ((I32)paren > PL_regsize)
cf93c79d
IZ
281 PL_regstartp[paren] = -1;
282 PL_regendp[paren] = -1;
a0d0e21e 283 }
dafc8851 284#endif
a0d0e21e
LW
285 return input;
286}
287
0f5d15d6
IZ
288typedef struct re_cc_state
289{
290 I32 ss;
291 regnode *node;
292 struct re_cc_state *prev;
293 CURCUR *cc;
294 regexp *re;
295} re_cc_state;
296
02db2b7b 297#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 298
29d1e993
HS
299#define TRYPAREN(paren, n, input) { \
300 if (paren) { \
301 if (n) { \
302 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
303 PL_regendp[paren] = input - PL_bostr; \
304 } \
305 else \
306 PL_regendp[paren] = -1; \
307 } \
308 if (regmatch(next)) \
309 sayYES; \
310 if (paren && n) \
311 PL_regendp[paren] = -1; \
312}
313
314
a687059c 315/*
e50aee73 316 * pregexec and friends
a687059c
LW
317 */
318
319/*
c277df42 320 - pregexec - match a regexp against a string
a687059c 321 */
c277df42 322I32
864dbfa3 323Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 324 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
325/* strend: pointer to null at end of string */
326/* strbeg: real beginning of string */
327/* minend: end of match must be >=minend after stringarg. */
328/* nosave: For optimizations. */
329{
330 return
9041c2e3 331 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
332 nosave ? 0 : REXEC_COPY_STR);
333}
0f5d15d6
IZ
334
335STATIC void
cea2e8a9 336S_cache_re(pTHX_ regexp *prog)
0f5d15d6 337{
97aff369 338 dVAR;
0f5d15d6
IZ
339 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
340#ifdef DEBUGGING
341 PL_regprogram = prog->program;
342#endif
343 PL_regnpar = prog->nparens;
9041c2e3
NIS
344 PL_regdata = prog->data;
345 PL_reg_re = prog;
0f5d15d6 346}
22e551b9 347
9041c2e3 348/*
cad2e5aa
JH
349 * Need to implement the following flags for reg_anch:
350 *
351 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
352 * USE_INTUIT_ML
353 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
354 * INTUIT_AUTORITATIVE_ML
355 * INTUIT_ONCE_NOML - Intuit can match in one location only.
356 * INTUIT_ONCE_ML
357 *
358 * Another flag for this function: SECOND_TIME (so that float substrs
359 * with giant delta may be not rechecked).
360 */
361
362/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
363
3f7c398e 364/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
365 Otherwise, only SvCUR(sv) is used to get strbeg. */
366
367/* XXXX We assume that strpos is strbeg unless sv. */
368
6eb5f6b9
JH
369/* XXXX Some places assume that there is a fixed substring.
370 An update may be needed if optimizer marks as "INTUITable"
371 RExen without fixed substrings. Similarly, it is assumed that
372 lengths of all the strings are no more than minlen, thus they
373 cannot come from lookahead.
374 (Or minlen should take into account lookahead.) */
375
2c2d71f5
JH
376/* A failure to find a constant substring means that there is no need to make
377 an expensive call to REx engine, thus we celebrate a failure. Similarly,
378 finding a substring too deep into the string means that less calls to
30944b6d
IZ
379 regtry() should be needed.
380
381 REx compiler's optimizer found 4 possible hints:
382 a) Anchored substring;
383 b) Fixed substring;
384 c) Whether we are anchored (beginning-of-line or \G);
385 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 386 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
387 string which does not contradict any of them.
388 */
2c2d71f5 389
6eb5f6b9
JH
390/* Most of decisions we do here should have been done at compile time.
391 The nodes of the REx which we used for the search should have been
392 deleted from the finite automaton. */
393
cad2e5aa
JH
394char *
395Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
396 char *strend, U32 flags, re_scream_pos_data *data)
397{
97aff369 398 dVAR;
b7953727 399 register I32 start_shift = 0;
cad2e5aa 400 /* Should be nonnegative! */
b7953727 401 register I32 end_shift = 0;
2c2d71f5
JH
402 register char *s;
403 register SV *check;
a1933d95 404 char *strbeg;
cad2e5aa 405 char *t;
a3b680e6 406 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 407 I32 ml_anch;
bd61b366
SS
408 register char *other_last = NULL; /* other substr checked before this */
409 char *check_at = NULL; /* check substr found at this pos */
1df70142 410 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 411#ifdef DEBUGGING
890ce7af
AL
412 const char * const i_strpos = strpos;
413 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 414#endif
a3621e74
YO
415
416 GET_RE_DEBUG_FLAGS_DECL;
417
a30b2f1f 418 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 419
b8d68ded 420 if (prog->reganch & ROPT_UTF8) {
a3621e74 421 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
422 "UTF-8 regex...\n"));
423 PL_reg_flags |= RF_utf8;
424 }
425
a3621e74 426 DEBUG_EXECUTE_r({
1df70142 427 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
428 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
429 strpos;
1df70142 430 const int len = PL_reg_match_utf8 ?
b8d68ded 431 strlen(s) : strend - strpos;
2a782b5b
JH
432 if (!PL_colorset)
433 reginitcolors();
b8d68ded 434 if (PL_reg_match_utf8)
a3621e74 435 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 436 "UTF-8 target...\n"));
2a782b5b 437 PerlIO_printf(Perl_debug_log,
a0288114 438 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 439 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
440 prog->precomp,
441 PL_colors[1],
442 (strlen(prog->precomp) > 60 ? "..." : ""),
443 PL_colors[0],
444 (int)(len > 60 ? 60 : len),
445 s, PL_colors[1],
446 (len > 60 ? "..." : "")
447 );
448 });
cad2e5aa 449
c344f387
JH
450 /* CHR_DIST() would be more correct here but it makes things slow. */
451 if (prog->minlen > strend - strpos) {
a3621e74 452 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 453 "String too short... [re_intuit_start]\n"));
cad2e5aa 454 goto fail;
2c2d71f5 455 }
a1933d95 456 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 457 PL_regeol = strend;
33b8afdf
JH
458 if (do_utf8) {
459 if (!prog->check_utf8 && prog->check_substr)
460 to_utf8_substr(prog);
461 check = prog->check_utf8;
462 } else {
463 if (!prog->check_substr && prog->check_utf8)
464 to_byte_substr(prog);
465 check = prog->check_substr;
466 }
467 if (check == &PL_sv_undef) {
a3621e74 468 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
469 "Non-utf string cannot match utf check string\n"));
470 goto fail;
471 }
2c2d71f5 472 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
473 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
474 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 475 && !multiline ) ); /* Check after \n? */
cad2e5aa 476
7e25d62c
JH
477 if (!ml_anch) {
478 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
479 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 480 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
481 && sv && !SvROK(sv)
482 && (strpos != strbeg)) {
a3621e74 483 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
484 goto fail;
485 }
486 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 487 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 488 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
489 I32 slen;
490
1aa99e6b 491 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
492 if (SvTAIL(check)) {
493 slen = SvCUR(check); /* >= 1 */
cad2e5aa 494
9041c2e3 495 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 496 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 497 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 498 goto fail_finish;
cad2e5aa
JH
499 }
500 /* Now should match s[0..slen-2] */
501 slen--;
3f7c398e 502 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 503 || (slen > 1
3f7c398e 504 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 505 report_neq:
a3621e74 506 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
507 goto fail_finish;
508 }
cad2e5aa 509 }
3f7c398e 510 else if (*SvPVX_const(check) != *s
653099ff 511 || ((slen = SvCUR(check)) > 1
3f7c398e 512 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 513 goto report_neq;
c315bfe8 514 check_at = s;
2c2d71f5 515 goto success_at_start;
7e25d62c 516 }
cad2e5aa 517 }
2c2d71f5 518 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 519 s = strpos;
2c2d71f5 520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 523 if (!ml_anch) {
a3b680e6 524 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 525 - (SvTAIL(check) != 0);
a3b680e6 526 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
527
528 if (end_shift < eshift)
529 end_shift = eshift;
530 }
cad2e5aa 531 }
2c2d71f5 532 else { /* Can match at random position */
cad2e5aa
JH
533 ml_anch = 0;
534 s = strpos;
2c2d71f5
JH
535 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
536 /* Should be nonnegative! */
537 end_shift = prog->minlen - start_shift -
653099ff 538 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
539 }
540
2c2d71f5 541#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 542 if (end_shift < 0)
6bbae5e6 543 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
544#endif
545
2c2d71f5
JH
546 restart:
547 /* Find a possible match in the region s..strend by looking for
548 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 549 if (flags & REXEC_SCREAM) {
cad2e5aa 550 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 551 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 552
2c2d71f5
JH
553 if (PL_screamfirst[BmRARE(check)] >= 0
554 || ( BmRARE(check) == '\n'
555 && (BmPREVIOUS(check) == SvCUR(check) - 1)
556 && SvTAIL(check) ))
9041c2e3 557 s = screaminstr(sv, check,
2c2d71f5 558 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 559 else
2c2d71f5 560 goto fail_finish;
4addbd3b
HS
561 /* we may be pointing at the wrong string */
562 if (s && RX_MATCH_COPIED(prog))
3f7c398e 563 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
564 if (data)
565 *data->scream_olds = s;
566 }
f33976b4 567 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
568 s = fbm_instr((U8*)(s + start_shift),
569 (U8*)(strend - end_shift),
7fba1cd6 570 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 571 else
1aa99e6b
IH
572 s = fbm_instr(HOP3(s, start_shift, strend),
573 HOP3(strend, -end_shift, strbeg),
7fba1cd6 574 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
575
576 /* Update the count-of-usability, remove useless subpatterns,
577 unshift s. */
2c2d71f5 578
a0288114 579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 580 (s ? "Found" : "Did not find"),
33b8afdf 581 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 582 PL_colors[0],
7b0972df 583 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 584 SvPVX_const(check),
2c2d71f5
JH
585 PL_colors[1], (SvTAIL(check) ? "$" : ""),
586 (s ? " at offset " : "...\n") ) );
587
588 if (!s)
589 goto fail_finish;
590
6eb5f6b9
JH
591 check_at = s;
592
2c2d71f5 593 /* Finish the diagnostic message */
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
595
596 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
597 Start with the other substr.
598 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 599 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
600 *always* match. Probably should be marked during compile...
601 Probably it is right to do no SCREAM here...
602 */
603
33b8afdf 604 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 605 /* Take into account the "other" substring. */
2c2d71f5
JH
606 /* XXXX May be hopelessly wrong for UTF... */
607 if (!other_last)
6eb5f6b9 608 other_last = strpos;
33b8afdf 609 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
610 do_other_anchored:
611 {
890ce7af
AL
612 char * const last = HOP3c(s, -start_shift, strbeg);
613 char *last1, *last2;
2c2d71f5 614 char *s1 = s;
33b8afdf 615 SV* must;
2c2d71f5 616
2c2d71f5
JH
617 t = s - prog->check_offset_max;
618 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 619 && (!do_utf8
1aa99e6b 620 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 621 && t > strpos)))
30944b6d 622 /* EMPTY */;
2c2d71f5
JH
623 else
624 t = strpos;
1aa99e6b 625 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
626 if (t < other_last) /* These positions already checked */
627 t = other_last;
1aa99e6b 628 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
629 if (last < last1)
630 last1 = last;
631 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
632 /* On end-of-str: see comment below. */
33b8afdf
JH
633 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
634 if (must == &PL_sv_undef) {
635 s = (char*)NULL;
a3621e74 636 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
637 }
638 else
639 s = fbm_instr(
640 (unsigned char*)t,
641 HOP3(HOP3(last1, prog->anchored_offset, strend)
642 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
643 must,
7fba1cd6 644 multiline ? FBMrf_MULTILINE : 0
33b8afdf 645 );
a3621e74 646 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 647 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
648 (s ? "Found" : "Contradicts"),
649 PL_colors[0],
33b8afdf
JH
650 (int)(SvCUR(must)
651 - (SvTAIL(must)!=0)),
3f7c398e 652 SvPVX_const(must),
33b8afdf 653 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
654 if (!s) {
655 if (last1 >= last2) {
a3621e74 656 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
657 ", giving up...\n"));
658 goto fail_finish;
659 }
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 661 ", trying floating at offset %ld...\n",
1aa99e6b
IH
662 (long)(HOP3c(s1, 1, strend) - i_strpos)));
663 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
664 s = HOP3c(last, 1, strend);
2c2d71f5
JH
665 goto restart;
666 }
667 else {
a3621e74 668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 669 (long)(s - i_strpos)));
1aa99e6b
IH
670 t = HOP3c(s, -prog->anchored_offset, strbeg);
671 other_last = HOP3c(s, 1, strend);
30944b6d 672 s = s1;
2c2d71f5
JH
673 if (t == strpos)
674 goto try_at_start;
2c2d71f5
JH
675 goto try_at_offset;
676 }
30944b6d 677 }
2c2d71f5
JH
678 }
679 else { /* Take into account the floating substring. */
33b8afdf
JH
680 char *last, *last1;
681 char *s1 = s;
682 SV* must;
683
684 t = HOP3c(s, -start_shift, strbeg);
685 last1 = last =
686 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
687 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
688 last = HOP3c(t, prog->float_max_offset, strend);
689 s = HOP3c(t, prog->float_min_offset, strend);
690 if (s < other_last)
691 s = other_last;
2c2d71f5 692 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
693 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
694 /* fbm_instr() takes into account exact value of end-of-str
695 if the check is SvTAIL(ed). Since false positives are OK,
696 and end-of-str is not later than strend we are OK. */
697 if (must == &PL_sv_undef) {
698 s = (char*)NULL;
a3621e74 699 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
700 }
701 else
2c2d71f5 702 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
703 (unsigned char*)last + SvCUR(must)
704 - (SvTAIL(must)!=0),
7fba1cd6 705 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 706 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
707 (s ? "Found" : "Contradicts"),
708 PL_colors[0],
709 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 710 SvPVX_const(must),
33b8afdf
JH
711 PL_colors[1], (SvTAIL(must) ? "$" : "")));
712 if (!s) {
713 if (last1 == last) {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
715 ", giving up...\n"));
716 goto fail_finish;
2c2d71f5 717 }
a3621e74 718 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
719 ", trying anchored starting at offset %ld...\n",
720 (long)(s1 + 1 - i_strpos)));
721 other_last = last;
722 s = HOP3c(t, 1, strend);
723 goto restart;
724 }
725 else {
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
727 (long)(s - i_strpos)));
728 other_last = s; /* Fix this later. --Hugo */
729 s = s1;
730 if (t == strpos)
731 goto try_at_start;
732 goto try_at_offset;
733 }
2c2d71f5 734 }
cad2e5aa 735 }
2c2d71f5
JH
736
737 t = s - prog->check_offset_max;
2c2d71f5 738 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 739 && (!do_utf8
1aa99e6b
IH
740 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
741 && t > strpos))) {
2c2d71f5
JH
742 /* Fixed substring is found far enough so that the match
743 cannot start at strpos. */
744 try_at_offset:
cad2e5aa 745 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
746 /* Eventually fbm_*() should handle this, but often
747 anchored_offset is not 0, so this check will not be wasted. */
748 /* XXXX In the code below we prefer to look for "^" even in
749 presence of anchored substrings. And we search even
750 beyond the found float position. These pessimizations
751 are historical artefacts only. */
752 find_anchor:
2c2d71f5 753 while (t < strend - prog->minlen) {
cad2e5aa 754 if (*t == '\n') {
4ee3650e 755 if (t < check_at - prog->check_offset_min) {
33b8afdf 756 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
757 /* Since we moved from the found position,
758 we definitely contradict the found anchored
30944b6d
IZ
759 substr. Due to the above check we do not
760 contradict "check" substr.
761 Thus we can arrive here only if check substr
762 is float. Redo checking for "other"=="fixed".
763 */
9041c2e3 764 strpos = t + 1;
a3621e74 765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 766 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
767 goto do_other_anchored;
768 }
4ee3650e
GS
769 /* We don't contradict the found floating substring. */
770 /* XXXX Why not check for STCLASS? */
cad2e5aa 771 s = t + 1;
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
774 goto set_useful;
775 }
4ee3650e
GS
776 /* Position contradicts check-string */
777 /* XXXX probably better to look for check-string
778 than for "\n", so one should lower the limit for t? */
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 780 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 781 other_last = strpos = s = t + 1;
cad2e5aa
JH
782 goto restart;
783 }
784 t++;
785 }
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 787 PL_colors[0], PL_colors[1]));
2c2d71f5 788 goto fail_finish;
cad2e5aa 789 }
f5952150 790 else {
a3621e74 791 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 792 PL_colors[0], PL_colors[1]));
f5952150 793 }
cad2e5aa
JH
794 s = t;
795 set_useful:
33b8afdf 796 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
797 }
798 else {
f5952150 799 /* The found string does not prohibit matching at strpos,
2c2d71f5 800 - no optimization of calling REx engine can be performed,
f5952150
GS
801 unless it was an MBOL and we are not after MBOL,
802 or a future STCLASS check will fail this. */
2c2d71f5
JH
803 try_at_start:
804 /* Even in this situation we may use MBOL flag if strpos is offset
805 wrt the start of the string. */
05b4157f 806 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 807 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
808 /* May be due to an implicit anchor of m{.*foo} */
809 && !(prog->reganch & ROPT_IMPLICIT))
810 {
cad2e5aa
JH
811 t = strpos;
812 goto find_anchor;
813 }
a3621e74 814 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 815 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 816 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 817 );
2c2d71f5 818 success_at_start:
30944b6d 819 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
820 && (do_utf8 ? (
821 prog->check_utf8 /* Could be deleted already */
822 && --BmUSEFUL(prog->check_utf8) < 0
823 && (prog->check_utf8 == prog->float_utf8)
824 ) : (
825 prog->check_substr /* Could be deleted already */
826 && --BmUSEFUL(prog->check_substr) < 0
827 && (prog->check_substr == prog->float_substr)
828 )))
66e933ab 829 {
cad2e5aa 830 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
832 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
833 if (do_utf8 ? prog->check_substr : prog->check_utf8)
834 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
835 prog->check_substr = prog->check_utf8 = NULL; /* disable */
836 prog->float_substr = prog->float_utf8 = NULL; /* clear */
837 check = NULL; /* abort */
cad2e5aa 838 s = strpos;
3cf5c195
IZ
839 /* XXXX This is a remnant of the old implementation. It
840 looks wasteful, since now INTUIT can use many
6eb5f6b9 841 other heuristics. */
cad2e5aa
JH
842 prog->reganch &= ~RE_USE_INTUIT;
843 }
844 else
845 s = strpos;
846 }
847
6eb5f6b9
JH
848 /* Last resort... */
849 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
850 if (prog->regstclass) {
851 /* minlen == 0 is possible if regstclass is \b or \B,
852 and the fixed substr is ''$.
853 Since minlen is already taken into account, s+1 is before strend;
854 accidentally, minlen >= 1 guaranties no false positives at s + 1
855 even for \b or \B. But (minlen? 1 : 0) below assumes that
856 regstclass does not come from lookahead... */
857 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
858 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 859 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 860 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 861 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 862 : 1);
a3b680e6 863 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 864 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 865 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
866 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
867 cl_l, strend)
868 : strend);
6eb5f6b9
JH
869
870 t = s;
9926ca43 871 cache_re(prog);
06b5626a 872 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
873 if (!s) {
874#ifdef DEBUGGING
cbbf8932 875 const char *what = NULL;
6eb5f6b9
JH
876#endif
877 if (endpos == strend) {
a3621e74 878 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
879 "Could not match STCLASS...\n") );
880 goto fail;
881 }
a3621e74 882 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 883 "This position contradicts STCLASS...\n") );
653099ff
GS
884 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
885 goto fail;
6eb5f6b9 886 /* Contradict one of substrings */
33b8afdf
JH
887 if (prog->anchored_substr || prog->anchored_utf8) {
888 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 889 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 890 hop_and_restart:
1aa99e6b 891 s = HOP3c(t, 1, strend);
66e933ab
GS
892 if (s + start_shift + end_shift > strend) {
893 /* XXXX Should be taken into account earlier? */
a3621e74 894 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
895 "Could not match STCLASS...\n") );
896 goto fail;
897 }
5e39e1e5
HS
898 if (!check)
899 goto giveup;
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 901 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
902 what, (long)(s + start_shift - i_strpos)) );
903 goto restart;
904 }
66e933ab 905 /* Have both, check_string is floating */
6eb5f6b9
JH
906 if (t + start_shift >= check_at) /* Contradicts floating=check */
907 goto retry_floating_check;
908 /* Recheck anchored substring, but not floating... */
9041c2e3 909 s = check_at;
5e39e1e5
HS
910 if (!check)
911 goto giveup;
a3621e74 912 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 913 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
914 (long)(other_last - i_strpos)) );
915 goto do_other_anchored;
916 }
60e71179
GS
917 /* Another way we could have checked stclass at the
918 current position only: */
919 if (ml_anch) {
920 s = t = t + 1;
5e39e1e5
HS
921 if (!check)
922 goto giveup;
a3621e74 923 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 924 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 926 goto try_at_offset;
66e933ab 927 }
33b8afdf 928 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 929 goto fail;
6eb5f6b9
JH
930 /* Check is floating subtring. */
931 retry_floating_check:
932 t = check_at - start_shift;
a3621e74 933 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
934 goto hop_and_restart;
935 }
b7953727 936 if (t != s) {
a3621e74 937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 938 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
939 (long)(t - i_strpos), (long)(s - i_strpos))
940 );
941 }
942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
944 "Does not contradict STCLASS...\n");
945 );
946 }
6eb5f6b9 947 }
5e39e1e5 948 giveup:
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
950 PL_colors[4], (check ? "Guessed" : "Giving up"),
951 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 952 return s;
2c2d71f5
JH
953
954 fail_finish: /* Substring not found */
33b8afdf
JH
955 if (prog->check_substr || prog->check_utf8) /* could be removed already */
956 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 957 fail:
a3621e74 958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 959 PL_colors[4], PL_colors[5]));
bd61b366 960 return NULL;
cad2e5aa 961}
9661b544 962
6eb5f6b9 963/* We know what class REx starts with. Try to find this position... */
3c3eec57 964STATIC char *
a3b680e6 965S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 966{
27da23d5 967 dVAR;
1df70142 968 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23
G
973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 977 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1036 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1037
1038 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1039 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1040
89ebb4a3 1041 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1042 0, uniflags);
89ebb4a3 1043 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1044 0, uniflags);
5dab1207
NIS
1045 lnc = 0;
1046 while (sm < ((U8 *) m + ln)) {
1047 lnc++;
1048 sm += UTF8SKIP(sm);
1049 }
1aa99e6b
IH
1050 }
1051 else {
1052 c1 = *(U8*)m;
1053 c2 = PL_fold[c1];
1054 }
6eb5f6b9
JH
1055 goto do_exactf;
1056 case EXACTFL:
5dab1207
NIS
1057 m = STRING(c);
1058 ln = STR_LEN(c);
1059 lnc = (I32) ln;
d8093b23 1060 c1 = *(U8*)m;
6eb5f6b9
JH
1061 c2 = PL_fold_locale[c1];
1062 do_exactf:
db12adc6 1063 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1064
6eb5f6b9
JH
1065 if (norun && e < s)
1066 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1067
60a8b682
JH
1068 /* The idea in the EXACTF* cases is to first find the
1069 * first character of the EXACTF* node and then, if
1070 * necessary, case-insensitively compare the full
1071 * text of the node. The c1 and c2 are the first
1072 * characters (though in Unicode it gets a bit
1073 * more complicated because there are more cases
7f16dd3d
JH
1074 * than just upper and lower: one needs to use
1075 * the so-called folding case for case-insensitive
1076 * matching (called "loose matching" in Unicode).
1077 * ibcmp_utf8() will do just that. */
60a8b682 1078
1aa99e6b 1079 if (do_utf8) {
575cac57 1080 UV c, f;
89ebb4a3 1081 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1082 STRLEN len, foldlen;
041457d9 1083 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1089 uniflags);
80aecb99
JH
1090 if ( c == c1
1091 && (ln == len ||
66423254 1092 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1093 m, (char **)0, ln, (bool)UTF))
55da9344 1094 && (norun || regtry(prog, s)) )
1aa99e6b 1095 goto got_it;
80aecb99 1096 else {
1df70142 1097 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1116 uniflags);
80aecb99 1117
60a8b682 1118 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1119 * Note that not all the possible combinations
1120 * are handled here: some of them are handled
1121 * by the standard folding rules, and some of
1122 * them (the character class or ANYOF cases)
1123 * are handled during compiletime in
1124 * regexec.c:S_regclass(). */
880bd946
JH
1125 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1126 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1127 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1128
1129 if ( (c == c1 || c == c2)
1130 && (ln == len ||
66423254 1131 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1132 m, (char **)0, ln, (bool)UTF))
55da9344 1133 && (norun || regtry(prog, s)) )
1aa99e6b 1134 goto got_it;
80aecb99 1135 else {
1df70142 1136 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
6136c704 1184 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1185 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1186 }
1187 tmp = ((OP(c) == BOUND ?
9041c2e3 1188 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1189 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1190 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1191 if (tmp == !(OP(c) == BOUND ?
3568d838 1192 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1193 isALNUM_LC_utf8((U8*)s)))
1194 {
1195 tmp = !tmp;
1196 if ((norun || regtry(prog, s)))
1197 goto got_it;
1198 }
078c425b 1199 s += uskip;
a687059c 1200 }
a0d0e21e 1201 }
667bb95a 1202 else {
12d33761 1203 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1204 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1205 while (s < strend) {
1206 if (tmp ==
1207 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1208 tmp = !tmp;
1209 if ((norun || regtry(prog, s)))
1210 goto got_it;
1211 }
1212 s++;
a0ed51b3 1213 }
a0ed51b3 1214 }
6eb5f6b9 1215 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1216 goto got_it;
1217 break;
bbce6d69 1218 case NBOUNDL:
3280af22 1219 PL_reg_flags |= RF_tainted;
bbce6d69 1220 /* FALL THROUGH */
a0d0e21e 1221 case NBOUND:
ffc61ed2 1222 if (do_utf8) {
12d33761 1223 if (s == PL_bostr)
ffc61ed2
JH
1224 tmp = '\n';
1225 else {
6136c704 1226 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1227 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1228 }
1229 tmp = ((OP(c) == NBOUND ?
9041c2e3 1230 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1231 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1232 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1233 if (tmp == !(OP(c) == NBOUND ?
3568d838 1234 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1235 isALNUM_LC_utf8((U8*)s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
078c425b 1239 s += uskip;
ffc61ed2 1240 }
a0d0e21e 1241 }
667bb95a 1242 else {
12d33761 1243 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1244 tmp = ((OP(c) == NBOUND ?
1245 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1246 while (s < strend) {
1247 if (tmp ==
1248 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1249 tmp = !tmp;
1250 else if ((norun || regtry(prog, s)))
1251 goto got_it;
1252 s++;
1253 }
a0ed51b3 1254 }
6eb5f6b9 1255 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1256 goto got_it;
1257 break;
a0d0e21e 1258 case ALNUM:
ffc61ed2 1259 if (do_utf8) {
1a4fad37 1260 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1261 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1262 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
bbce6d69 1268 else
ffc61ed2 1269 tmp = 1;
078c425b 1270 s += uskip;
bbce6d69 1271 }
bbce6d69 1272 }
ffc61ed2
JH
1273 else {
1274 while (s < strend) {
1275 if (isALNUM(*s)) {
1276 if (tmp && (norun || regtry(prog, s)))
1277 goto got_it;
1278 else
1279 tmp = doevery;
1280 }
a0ed51b3 1281 else
ffc61ed2
JH
1282 tmp = 1;
1283 s++;
a0ed51b3 1284 }
a0ed51b3
LW
1285 }
1286 break;
bbce6d69 1287 case ALNUML:
3280af22 1288 PL_reg_flags |= RF_tainted;
ffc61ed2 1289 if (do_utf8) {
078c425b 1290 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1291 if (isALNUM_LC_utf8((U8*)s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
a687059c 1297 else
ffc61ed2 1298 tmp = 1;
078c425b 1299 s += uskip;
a0d0e21e 1300 }
a0d0e21e 1301 }
ffc61ed2
JH
1302 else {
1303 while (s < strend) {
1304 if (isALNUM_LC(*s)) {
1305 if (tmp && (norun || regtry(prog, s)))
1306 goto got_it;
1307 else
1308 tmp = doevery;
1309 }
a0ed51b3 1310 else
ffc61ed2
JH
1311 tmp = 1;
1312 s++;
a0ed51b3 1313 }
a0ed51b3
LW
1314 }
1315 break;
a0d0e21e 1316 case NALNUM:
ffc61ed2 1317 if (do_utf8) {
1a4fad37 1318 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1319 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1320 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
bbce6d69 1326 else
ffc61ed2 1327 tmp = 1;
078c425b 1328 s += uskip;
bbce6d69 1329 }
bbce6d69 1330 }
ffc61ed2
JH
1331 else {
1332 while (s < strend) {
1333 if (!isALNUM(*s)) {
1334 if (tmp && (norun || regtry(prog, s)))
1335 goto got_it;
1336 else
1337 tmp = doevery;
1338 }
a0ed51b3 1339 else
ffc61ed2
JH
1340 tmp = 1;
1341 s++;
a0ed51b3 1342 }
a0ed51b3
LW
1343 }
1344 break;
bbce6d69 1345 case NALNUML:
3280af22 1346 PL_reg_flags |= RF_tainted;
ffc61ed2 1347 if (do_utf8) {
078c425b 1348 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1349 if (!isALNUM_LC_utf8((U8*)s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
a687059c 1355 else
ffc61ed2 1356 tmp = 1;
078c425b 1357 s += uskip;
a687059c 1358 }
a0d0e21e 1359 }
ffc61ed2
JH
1360 else {
1361 while (s < strend) {
1362 if (!isALNUM_LC(*s)) {
1363 if (tmp && (norun || regtry(prog, s)))
1364 goto got_it;
1365 else
1366 tmp = doevery;
1367 }
a0ed51b3 1368 else
ffc61ed2
JH
1369 tmp = 1;
1370 s++;
a0ed51b3 1371 }
a0ed51b3
LW
1372 }
1373 break;
a0d0e21e 1374 case SPACE:
ffc61ed2 1375 if (do_utf8) {
1a4fad37 1376 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1377 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1378 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
a0d0e21e 1384 else
ffc61ed2 1385 tmp = 1;
078c425b 1386 s += uskip;
2304df62 1387 }
a0d0e21e 1388 }
ffc61ed2
JH
1389 else {
1390 while (s < strend) {
1391 if (isSPACE(*s)) {
1392 if (tmp && (norun || regtry(prog, s)))
1393 goto got_it;
1394 else
1395 tmp = doevery;
1396 }
a0ed51b3 1397 else
ffc61ed2
JH
1398 tmp = 1;
1399 s++;
a0ed51b3 1400 }
a0ed51b3
LW
1401 }
1402 break;
bbce6d69 1403 case SPACEL:
3280af22 1404 PL_reg_flags |= RF_tainted;
ffc61ed2 1405 if (do_utf8) {
078c425b 1406 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1407 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
bbce6d69 1413 else
ffc61ed2 1414 tmp = 1;
078c425b 1415 s += uskip;
bbce6d69 1416 }
bbce6d69 1417 }
ffc61ed2
JH
1418 else {
1419 while (s < strend) {
1420 if (isSPACE_LC(*s)) {
1421 if (tmp && (norun || regtry(prog, s)))
1422 goto got_it;
1423 else
1424 tmp = doevery;
1425 }
a0ed51b3 1426 else
ffc61ed2
JH
1427 tmp = 1;
1428 s++;
a0ed51b3 1429 }
a0ed51b3
LW
1430 }
1431 break;
a0d0e21e 1432 case NSPACE:
ffc61ed2 1433 if (do_utf8) {
1a4fad37 1434 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1435 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1436 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
a0d0e21e 1442 else
ffc61ed2 1443 tmp = 1;
078c425b 1444 s += uskip;
a687059c 1445 }
a0d0e21e 1446 }
ffc61ed2
JH
1447 else {
1448 while (s < strend) {
1449 if (!isSPACE(*s)) {
1450 if (tmp && (norun || regtry(prog, s)))
1451 goto got_it;
1452 else
1453 tmp = doevery;
1454 }
a0ed51b3 1455 else
ffc61ed2
JH
1456 tmp = 1;
1457 s++;
a0ed51b3 1458 }
a0ed51b3
LW
1459 }
1460 break;
bbce6d69 1461 case NSPACEL:
3280af22 1462 PL_reg_flags |= RF_tainted;
ffc61ed2 1463 if (do_utf8) {
078c425b 1464 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1465 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
bbce6d69 1471 else
ffc61ed2 1472 tmp = 1;
078c425b 1473 s += uskip;
bbce6d69 1474 }
bbce6d69 1475 }
ffc61ed2
JH
1476 else {
1477 while (s < strend) {
1478 if (!isSPACE_LC(*s)) {
1479 if (tmp && (norun || regtry(prog, s)))
1480 goto got_it;
1481 else
1482 tmp = doevery;
1483 }
a0ed51b3 1484 else
ffc61ed2
JH
1485 tmp = 1;
1486 s++;
a0ed51b3 1487 }
a0ed51b3
LW
1488 }
1489 break;
a0d0e21e 1490 case DIGIT:
ffc61ed2 1491 if (do_utf8) {
1a4fad37 1492 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1493 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1494 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
a0d0e21e 1500 else
ffc61ed2 1501 tmp = 1;
078c425b 1502 s += uskip;
2b69d0c2 1503 }
a0d0e21e 1504 }
ffc61ed2
JH
1505 else {
1506 while (s < strend) {
1507 if (isDIGIT(*s)) {
1508 if (tmp && (norun || regtry(prog, s)))
1509 goto got_it;
1510 else
1511 tmp = doevery;
1512 }
a0ed51b3 1513 else
ffc61ed2
JH
1514 tmp = 1;
1515 s++;
a0ed51b3 1516 }
a0ed51b3
LW
1517 }
1518 break;
b8c5462f
JH
1519 case DIGITL:
1520 PL_reg_flags |= RF_tainted;
ffc61ed2 1521 if (do_utf8) {
078c425b 1522 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1523 if (isDIGIT_LC_utf8((U8*)s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
b8c5462f 1529 else
ffc61ed2 1530 tmp = 1;
078c425b 1531 s += uskip;
b8c5462f 1532 }
b8c5462f 1533 }
ffc61ed2
JH
1534 else {
1535 while (s < strend) {
1536 if (isDIGIT_LC(*s)) {
1537 if (tmp && (norun || regtry(prog, s)))
1538 goto got_it;
1539 else
1540 tmp = doevery;
1541 }
b8c5462f 1542 else
ffc61ed2
JH
1543 tmp = 1;
1544 s++;
b8c5462f 1545 }
b8c5462f
JH
1546 }
1547 break;
a0d0e21e 1548 case NDIGIT:
ffc61ed2 1549 if (do_utf8) {
1a4fad37 1550 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1551 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1552 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
a0d0e21e 1558 else
ffc61ed2 1559 tmp = 1;
078c425b 1560 s += uskip;
a687059c 1561 }
a0d0e21e 1562 }
ffc61ed2
JH
1563 else {
1564 while (s < strend) {
1565 if (!isDIGIT(*s)) {
1566 if (tmp && (norun || regtry(prog, s)))
1567 goto got_it;
1568 else
1569 tmp = doevery;
1570 }
a0ed51b3 1571 else
ffc61ed2
JH
1572 tmp = 1;
1573 s++;
a0ed51b3 1574 }
a0ed51b3
LW
1575 }
1576 break;
b8c5462f
JH
1577 case NDIGITL:
1578 PL_reg_flags |= RF_tainted;
ffc61ed2 1579 if (do_utf8) {
078c425b 1580 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1581 if (!isDIGIT_LC_utf8((U8*)s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
b8c5462f 1587 else
ffc61ed2 1588 tmp = 1;
078c425b 1589 s += uskip;
b8c5462f 1590 }
a0ed51b3 1591 }
ffc61ed2
JH
1592 else {
1593 while (s < strend) {
1594 if (!isDIGIT_LC(*s)) {
1595 if (tmp && (norun || regtry(prog, s)))
1596 goto got_it;
1597 else
1598 tmp = doevery;
1599 }
cf93c79d 1600 else
ffc61ed2
JH
1601 tmp = 1;
1602 s++;
b8c5462f 1603 }
b8c5462f
JH
1604 }
1605 break;
b3c9acc1 1606 default:
3c3eec57
GS
1607 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1608 break;
d6a28714 1609 }
6eb5f6b9
JH
1610 return 0;
1611 got_it:
1612 return s;
1613}
1614
1615/*
1616 - regexec_flags - match a regexp against a string
1617 */
1618I32
1619Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1620 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1621/* strend: pointer to null at end of string */
1622/* strbeg: real beginning of string */
1623/* minend: end of match must be >=minend after stringarg. */
1624/* data: May be used for some additional optimizations. */
1625/* nosave: For optimizations. */
1626{
97aff369 1627 dVAR;
6eb5f6b9
JH
1628 register char *s;
1629 register regnode *c;
1630 register char *startpos = stringarg;
6eb5f6b9
JH
1631 I32 minlen; /* must match at least this many chars */
1632 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1633 I32 end_shift = 0; /* Same for the end. */ /* CC */
1634 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1635 char *scream_olds = NULL;
6eb5f6b9 1636 SV* oreplsv = GvSV(PL_replgv);
1df70142 1637 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1638 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1639#ifdef DEBUGGING
6136c704
AL
1640 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1641 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1642#endif
a3621e74
YO
1643
1644 GET_RE_DEBUG_FLAGS_DECL;
1645
9d4ba2ae 1646 PERL_UNUSED_ARG(data);
a30b2f1f 1647 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1648
1649 PL_regcc = 0;
1650
1651 cache_re(prog);
1652#ifdef DEBUGGING
aea4f609 1653 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1654#endif
1655
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1659 return 0;
1660 }
1661
1662 minlen = prog->minlen;
61a36c01 1663 if (strend - startpos < minlen) {
a3621e74 1664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1665 "String too short [regexec_flags]...\n"));
1666 goto phooey;
1aa99e6b 1667 }
6eb5f6b9 1668
6eb5f6b9
JH
1669 /* Check validity of program. */
1670 if (UCHARAT(prog->program) != REG_MAGIC) {
1671 Perl_croak(aTHX_ "corrupted regexp program");
1672 }
1673
1674 PL_reg_flags = 0;
1675 PL_reg_eval_set = 0;
1676 PL_reg_maxiter = 0;
1677
1678 if (prog->reganch & ROPT_UTF8)
1679 PL_reg_flags |= RF_utf8;
1680
1681 /* Mark beginning of line for ^ and lookbehind. */
1682 PL_regbol = startpos;
1683 PL_bostr = strbeg;
1684 PL_reg_sv = sv;
1685
1686 /* Mark end of line for $ (and such) */
1687 PL_regeol = strend;
1688
1689 /* see how far we have to get to not match where we matched before */
1690 PL_regtill = startpos+minend;
1691
1692 /* We start without call_cc context. */
1693 PL_reg_call_cc = 0;
1694
1695 /* If there is a "must appear" string, look for it. */
1696 s = startpos;
1697
1698 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1699 MAGIC *mg;
1700
1701 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1702 PL_reg_ganch = startpos;
1703 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && SvMAGIC(sv)
14befaf4
DM
1705 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1706 && mg->mg_len >= 0) {
6eb5f6b9
JH
1707 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1708 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1709 if (s > PL_reg_ganch)
6eb5f6b9
JH
1710 goto phooey;
1711 s = PL_reg_ganch;
1712 }
1713 }
1714 else /* pos() not defined */
1715 PL_reg_ganch = strbeg;
1716 }
1717
a0714e2c 1718 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1719 re_scream_pos_data d;
1720
1721 d.scream_olds = &scream_olds;
1722 d.scream_pos = &scream_pos;
1723 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1724 if (!s) {
a3621e74 1725 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1726 goto phooey; /* not present */
3fa9c3d7 1727 }
6eb5f6b9
JH
1728 }
1729
a3621e74 1730 DEBUG_EXECUTE_r({
1df70142
AL
1731 const char * const s0 = UTF
1732 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1733 UNI_DISPLAY_REGEX)
1734 : prog->precomp;
1735 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1736 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1737 UNI_DISPLAY_REGEX) : startpos;
1df70142 1738 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1739 if (!PL_colorset)
1740 reginitcolors();
1741 PerlIO_printf(Perl_debug_log,
a0288114 1742 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1743 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1744 len0, len0, s0,
2a782b5b 1745 PL_colors[1],
9e55ce06 1746 len0 > 60 ? "..." : "",
2a782b5b 1747 PL_colors[0],
9e55ce06
JH
1748 (int)(len1 > 60 ? 60 : len1),
1749 s1, PL_colors[1],
1750 (len1 > 60 ? "..." : "")
2a782b5b
JH
1751 );
1752 });
6eb5f6b9
JH
1753
1754 /* Simplest case: anchored match need be tried only once. */
1755 /* [unless only anchor is BOL and multiline is set] */
1756 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1757 if (s == startpos && regtry(prog, startpos))
1758 goto got_it;
7fba1cd6 1759 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1760 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1761 {
1762 char *end;
1763
1764 if (minlen)
1765 dontbother = minlen - 1;
1aa99e6b 1766 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1767 /* for multiline we only have to try after newlines */
33b8afdf 1768 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1769 if (s == startpos)
1770 goto after_try;
1771 while (1) {
1772 if (regtry(prog, s))
1773 goto got_it;
1774 after_try:
1775 if (s >= end)
1776 goto phooey;
1777 if (prog->reganch & RE_USE_INTUIT) {
1778 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1779 if (!s)
1780 goto phooey;
1781 }
1782 else
1783 s++;
1784 }
1785 } else {
1786 if (s > startpos)
1787 s--;
1788 while (s < end) {
1789 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1790 if (regtry(prog, s))
1791 goto got_it;
1792 }
1793 }
1794 }
1795 }
1796 goto phooey;
1797 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1798 if (regtry(prog, PL_reg_ganch))
1799 goto got_it;
1800 goto phooey;
1801 }
1802
1803 /* Messy cases: unanchored match. */
33b8afdf 1804 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1805 /* we have /x+whatever/ */
1806 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1807 char ch;
bf93d4cc
GS
1808#ifdef DEBUGGING
1809 int did_match = 0;
1810#endif
33b8afdf
JH
1811 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1812 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1813 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1814
1aa99e6b 1815 if (do_utf8) {
6eb5f6b9
JH
1816 while (s < strend) {
1817 if (*s == ch) {
a3621e74 1818 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1819 if (regtry(prog, s)) goto got_it;
1820 s += UTF8SKIP(s);
1821 while (s < strend && *s == ch)
1822 s += UTF8SKIP(s);
1823 }
1824 s += UTF8SKIP(s);
1825 }
1826 }
1827 else {
1828 while (s < strend) {
1829 if (*s == ch) {
a3621e74 1830 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1831 if (regtry(prog, s)) goto got_it;
1832 s++;
1833 while (s < strend && *s == ch)
1834 s++;
1835 }
1836 s++;
1837 }
1838 }
a3621e74 1839 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1840 PerlIO_printf(Perl_debug_log,
b7953727
JH
1841 "Did not find anchored character...\n")
1842 );
6eb5f6b9 1843 }
a0714e2c
SS
1844 else if (prog->anchored_substr != NULL
1845 || prog->anchored_utf8 != NULL
1846 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1847 && prog->float_max_offset < strend - s)) {
1848 SV *must;
1849 I32 back_max;
1850 I32 back_min;
1851 char *last;
6eb5f6b9 1852 char *last1; /* Last position checked before */
bf93d4cc
GS
1853#ifdef DEBUGGING
1854 int did_match = 0;
1855#endif
33b8afdf
JH
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1861 } else {
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1867 }
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1870 goto phooey;
1871
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1875
1876 if (s > PL_bostr)
1877 last1 = HOPc(s, -1);
1878 else
1879 last1 = s - 1; /* bogus */
1880
a0288114 1881 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1882 check_substr==must. */
1883 scream_pos = -1;
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
9041c2e3 1887 ((flags & REXEC_SCREAM)
1aa99e6b 1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1889 end_shift, &scream_pos, 0))
1aa99e6b 1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1891 (unsigned char*)strend, must,
7fba1cd6 1892 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1895 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1896 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1900 }
1901 else {
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1903
1904 last1 = HOPc(s, -back_min);
1905 s = t;
1906 }
1aa99e6b 1907 if (do_utf8) {
6eb5f6b9
JH
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1910 goto got_it;
1911 s += UTF8SKIP(s);
1912 }
1913 }
1914 else {
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s++;
1919 }
1920 }
1921 }
a3621e74 1922 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1923 PerlIO_printf(Perl_debug_log,
a0288114 1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1926 ? "anchored" : "floating"),
1927 PL_colors[0],
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1929 SvPVX_const(must),
b7953727
JH
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1931 );
6eb5f6b9
JH
1932 goto phooey;
1933 }
155aba94 1934 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1935 if (minlen) {
1936 I32 op = (U8)OP(prog->regstclass);
66e933ab 1937 /* don't bother with what can't match */
f14c76ed
RGS
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1940 }
a3621e74 1941 DEBUG_EXECUTE_r({
ffc61ed2 1942 SV *prop = sv_newmortal();
cfd0369c
NC
1943 const char *s0;
1944 const char *s1;
9e55ce06
JH
1945 int len0;
1946 int len1;
1947
ffc61ed2 1948 regprop(prop, c);
9e55ce06 1949 s0 = UTF ?
3f7c398e 1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1951 UNI_DISPLAY_REGEX) :
cfd0369c 1952 SvPVX_const(prop);
9e55ce06
JH
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1954 s1 = UTF ?
c728cb41 1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
a0288114 1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1959 len0, len0, s0,
1960 len1, len1, s1);
ffc61ed2 1961 });
06b5626a 1962 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1963 goto got_it;
a3621e74 1964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1965 }
1966 else {
1967 dontbother = 0;
a0714e2c 1968 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1969 /* Trim the end. */
d6a28714 1970 char *last;
33b8afdf
JH
1971 SV* float_real;
1972
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1976
1977 if (flags & REXEC_SCREAM) {
33b8afdf 1978 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1979 end_shift, &scream_pos, 1); /* last one */
1980 if (!last)
ffc61ed2 1981 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
3f7c398e 1984 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1985 }
d6a28714
JH
1986 else {
1987 STRLEN len;
cfd0369c 1988 const char * const little = SvPV_const(float_real, len);
d6a28714 1989
33b8afdf 1990 if (SvTAIL(float_real)) {
d6a28714
JH
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
7fba1cd6 1993 else if (!multiline)
9041c2e3 1994 last = memEQ(strend - len, little, len)
bd61b366 1995 ? strend - len : NULL;
b8c5462f 1996 else
d6a28714
JH
1997 goto find_last;
1998 } else {
1999 find_last:
9041c2e3 2000 if (len)
d6a28714 2001 last = rninstr(s, strend, little, little + len);
b8c5462f 2002 else
a0288114 2003 last = strend; /* matching "$" */
b8c5462f 2004 }
b8c5462f 2005 }
bf93d4cc 2006 if (last == NULL) {
a3621e74 2007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2009 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2010 goto phooey; /* Should not happen! */
2011 }
d6a28714
JH
2012 dontbother = strend - last + prog->float_min_offset;
2013 }
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
1aa99e6b 2018 if (do_utf8) {
d6a28714
JH
2019 for (;;) {
2020 if (regtry(prog, s))
2021 goto got_it;
2022 if (s >= strend)
2023 break;
b8c5462f 2024 s += UTF8SKIP(s);
d6a28714
JH
2025 };
2026 }
2027 else {
2028 do {
2029 if (regtry(prog, s))
2030 goto got_it;
2031 } while (s++ < strend);
2032 }
2033 }
2034
2035 /* Failure. */
2036 goto phooey;
2037
2038got_it:
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2040
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2046 the same. */
acfe0abc 2047 restore_pos(aTHX_ 0);
d6a28714
JH
2048 }
2049
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2052 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2053 if (flags & REXEC_COPY_STR) {
2054 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2055#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2056 if ((SvIsCOW(sv)
2057 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2058 if (DEBUG_C_TEST) {
2059 PerlIO_printf(Perl_debug_log,
2060 "Copy on write: regexp capture, type %d\n",
2061 (int) SvTYPE(sv));
2062 }
2063 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2064 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2065 assert (SvPOKp(prog->saved_copy));
2066 } else
2067#endif
2068 {
2069 RX_MATCH_COPIED_on(prog);
2070 s = savepvn(strbeg, i);
2071 prog->subbeg = s;
2072 }
d6a28714 2073 prog->sublen = i;
d6a28714
JH
2074 }
2075 else {
2076 prog->subbeg = strbeg;
2077 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2078 }
2079 }
9041c2e3 2080
d6a28714
JH
2081 return 1;
2082
2083phooey:
a3621e74 2084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2085 PL_colors[4], PL_colors[5]));
d6a28714 2086 if (PL_reg_eval_set)
acfe0abc 2087 restore_pos(aTHX_ 0);
d6a28714
JH
2088 return 0;
2089}
2090
2091/*
2092 - regtry - try match at specific point
2093 */
2094STATIC I32 /* 0 failure, 1 success */
2095S_regtry(pTHX_ regexp *prog, char *startpos)
2096{
97aff369 2097 dVAR;
d6a28714
JH
2098 register I32 i;
2099 register I32 *sp;
2100 register I32 *ep;
2101 CHECKPOINT lastcp;
a3621e74 2102 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2103
02db2b7b
IZ
2104#ifdef DEBUGGING
2105 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2106#endif
d6a28714
JH
2107 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2108 MAGIC *mg;
2109
2110 PL_reg_eval_set = RS_init;
a3621e74 2111 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2112 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2113 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2114 ));
e8347627 2115 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2116 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2117 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2118 SAVETMPS;
2119 /* Apparently this is not needed, judging by wantarray. */
e8347627 2120 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2121 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2122
2123 if (PL_reg_sv) {
2124 /* Make $_ available to executed code. */
2125 if (PL_reg_sv != DEFSV) {
59f00321 2126 SAVE_DEFSV;
d6a28714 2127 DEFSV = PL_reg_sv;
b8c5462f 2128 }
d6a28714 2129
9041c2e3 2130 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2131 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2132 /* prepare for quick setting of pos */
d300d9fa
NC
2133#ifdef PERL_OLD_COPY_ON_WRITE
2134 if (SvIsCOW(sv))
2135 sv_force_normal_flags(sv, 0);
2136#endif
2137 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2138 &PL_vtbl_mglob, NULL, 0);
d6a28714 2139 mg->mg_len = -1;
b8c5462f 2140 }
d6a28714
JH
2141 PL_reg_magic = mg;
2142 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2143 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2144 }
09687e5a 2145 if (!PL_reg_curpm) {
a02a5408 2146 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2147#ifdef USE_ITHREADS
2148 {
2149 SV* repointer = newSViv(0);
577e12cc 2150 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2151 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2152 av_push(PL_regex_padav,repointer);
2153 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2154 PL_regex_pad = AvARRAY(PL_regex_padav);
2155 }
2156#endif
2157 }
aaa362c4 2158 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2159 PL_reg_oldcurpm = PL_curpm;
2160 PL_curpm = PL_reg_curpm;
2161 if (RX_MATCH_COPIED(prog)) {
2162 /* Here is a serious problem: we cannot rewrite subbeg,
2163 since it may be needed if this match fails. Thus
2164 $` inside (?{}) could fail... */
2165 PL_reg_oldsaved = prog->subbeg;
2166 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2167#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2168 PL_nrs = prog->saved_copy;
2169#endif
d6a28714
JH
2170 RX_MATCH_COPIED_off(prog);
2171 }
2172 else
bd61b366 2173 PL_reg_oldsaved = NULL;
d6a28714
JH
2174 prog->subbeg = PL_bostr;
2175 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2176 }
973dddac 2177 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2178 PL_reginput = startpos;
2179 PL_regstartp = prog->startp;
2180 PL_regendp = prog->endp;
2181 PL_reglastparen = &prog->lastparen;
a01268b5 2182 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2183 prog->lastparen = 0;
03994de8 2184 prog->lastcloseparen = 0;
d6a28714 2185 PL_regsize = 0;
a3621e74 2186 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2187 if (PL_reg_start_tmpl <= prog->nparens) {
2188 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2189 if(PL_reg_start_tmp)
2190 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2191 else
a02a5408 2192 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2193 }
2194
2195 /* XXXX What this code is doing here?!!! There should be no need
2196 to do this again and again, PL_reglastparen should take care of
3dd2943c 2197 this! --ilya*/
dafc8851
JH
2198
2199 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2200 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2201 * PL_reglastparen), is not needed at all by the test suite
2202 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2203 * enough, for building DynaLoader, or otherwise this
2204 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2205 * will happen. Meanwhile, this code *is* needed for the
2206 * above-mentioned test suite tests to succeed. The common theme
2207 * on those tests seems to be returning null fields from matches.
2208 * --jhi */
dafc8851 2209#if 1
d6a28714
JH
2210 sp = prog->startp;
2211 ep = prog->endp;
2212 if (prog->nparens) {
eb160463 2213 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2214 *++sp = -1;
2215 *++ep = -1;
2216 }
2217 }
dafc8851 2218#endif
02db2b7b 2219 REGCP_SET(lastcp);
d6a28714
JH
2220 if (regmatch(prog->program + 1)) {
2221 prog->endp[0] = PL_reginput - PL_bostr;
2222 return 1;
2223 }
02db2b7b 2224 REGCP_UNWIND(lastcp);
d6a28714
JH
2225 return 0;
2226}
2227
02db2b7b
IZ
2228#define RE_UNWIND_BRANCH 1
2229#define RE_UNWIND_BRANCHJ 2
2230
2231union re_unwind_t;
2232
2233typedef struct { /* XX: makes sense to enlarge it... */
2234 I32 type;
2235 I32 prev;
2236 CHECKPOINT lastcp;
2237} re_unwind_generic_t;
2238
2239typedef struct {
2240 I32 type;
2241 I32 prev;
2242 CHECKPOINT lastcp;
2243 I32 lastparen;
2244 regnode *next;
2245 char *locinput;
2246 I32 nextchr;
2247#ifdef DEBUGGING
2248 int regindent;
2249#endif
2250} re_unwind_branch_t;
2251
2252typedef union re_unwind_t {
2253 I32 type;
2254 re_unwind_generic_t generic;
2255 re_unwind_branch_t branch;
2256} re_unwind_t;
2257
8ba1375e
MJD
2258#define sayYES goto yes
2259#define sayNO goto no
e0f9d4a8 2260#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2261#define sayYES_FINAL goto yes_final
2262#define sayYES_LOUD goto yes_loud
2263#define sayNO_FINAL goto no_final
2264#define sayNO_SILENT goto do_no
2265#define saySAME(x) if (x) goto yes; else goto no
2266
3ab3c9b4
HS
2267#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2268#define POSCACHE_SEEN 1 /* we know what we're caching */
2269#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2270#define CACHEsayYES STMT_START { \
2271 if (cache_offset | cache_bit) { \
2272 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2273 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2274 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2275 /* cache records failure, but this is success */ \
2276 DEBUG_r( \
2277 PerlIO_printf(Perl_debug_log, \
2278 "%*s (remove success from failure cache)\n", \
2279 REPORT_CODE_OFF+PL_regindent*2, "") \
2280 ); \
2281 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2282 } \
2283 } \
2284 sayYES; \
2285} STMT_END
2286#define CACHEsayNO STMT_START { \
2287 if (cache_offset | cache_bit) { \
2288 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2289 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2290 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2291 /* cache records success, but this is failure */ \
2292 DEBUG_r( \
2293 PerlIO_printf(Perl_debug_log, \
2294 "%*s (remove failure from success cache)\n", \
2295 REPORT_CODE_OFF+PL_regindent*2, "") \
2296 ); \
2297 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2298 } \
2299 } \
2300 sayNO; \
2301} STMT_END
2302
a3621e74
YO
2303/* this is used to determine how far from the left messages like
2304 'failed...' are printed. Currently 29 makes these messages line
2305 up with the opcode they refer to. Earlier perls used 25 which
2306 left these messages outdented making reviewing a debug output
2307 quite difficult.
2308*/
2309#define REPORT_CODE_OFF 29
2310
2311
2312/* Make sure there is a test for this +1 options in re_tests */
2313#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2314
2315#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2316 if ( trie->states[ state ].wordnum ) { \
2317 if ( !accepted ) { \
2318 ENTER; \
2319 SAVETMPS; \
2320 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
561b68a9 2321 sv_accept_buff=newSV(bufflen * sizeof(reg_trie_accepted) - 1 );\
a3621e74
YO
2322 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2323 SvPOK_on( sv_accept_buff ); \
2324 sv_2mortal( sv_accept_buff ); \
2325 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2326 } else { \
2327 if ( accepted >= bufflen ) { \
2328 bufflen *= 2; \
2329 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2330 bufflen * sizeof(reg_trie_accepted) ); \
2331 } \
2332 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2333 + sizeof( reg_trie_accepted ) ); \
2334 } \
2335 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2336 accept_buff[ accepted ].endpos = uc; \
2337 ++accepted; \
2338 } } STMT_END
2339
2340#define TRIE_HANDLE_CHAR STMT_START { \
2341 if ( uvc < 256 ) { \
2342 charid = trie->charmap[ uvc ]; \
2343 } else { \
2344 charid = 0; \
2345 if( trie->widecharmap ) { \
2346 SV** svpp = (SV**)NULL; \
2347 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2348 sizeof( UV ), 0 ); \
2349 if ( svpp ) { \
2350 charid = (U16)SvIV( *svpp ); \
2351 } \
2352 } \
2353 } \
2354 if ( charid && \
cc601c31
YO
2355 ( base + charid > trie->uniquecharcount ) && \
2356 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2357 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2358 { \
2359 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2360 } else { \
2361 state = 0; \
2362 } \
2363 uc += len; \
2364 } STMT_END
8ba1375e 2365
d6a28714
JH
2366/*
2367 - regmatch - main matching routine
2368 *
2369 * Conceptually the strategy is simple: check to see whether the current
2370 * node matches, call self recursively to see whether the rest matches,
2371 * and then act accordingly. In practice we make some effort to avoid
2372 * recursion, in particular by going through "ordinary" nodes (that don't
2373 * need to know whether the rest of the match failed) by a loop instead of
2374 * by recursion.
2375 */
2376/* [lwall] I've hoisted the register declarations to the outer block in order to
2377 * maybe save a little bit of pushing and popping on the stack. It also takes
2378 * advantage of machines that use a register save mask on subroutine entry.
2379 */
2380STATIC I32 /* 0 failure, 1 success */
2381S_regmatch(pTHX_ regnode *prog)
2382{
27da23d5 2383 dVAR;
d6a28714
JH
2384 register regnode *scan; /* Current node. */
2385 regnode *next; /* Next node. */
2386 regnode *inner; /* Next node in internal branch. */
2387 register I32 nextchr; /* renamed nextchr - nextchar colides with
2388 function of same name */
2389 register I32 n; /* no or next */
b7953727 2390 register I32 ln = 0; /* len or last */
bd61b366 2391 register char *s = NULL; /* operand or save */
d6a28714 2392 register char *locinput = PL_reginput;
b7953727 2393 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2394 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2395 I32 unwind = 0;
a3621e74
YO
2396
2397 /* used by the trie code */
cbbf8932
AL
2398 SV *sv_accept_buff = NULL; /* accepting states we have traversed */
2399 reg_trie_accepted *accept_buff = NULL; /* "" */
ab74612d
NC
2400 reg_trie_data *trie; /* what trie are we using right now */
2401 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2402
b7953727 2403#if 0
02db2b7b 2404 I32 firstcp = PL_savestack_ix;
b7953727 2405#endif
0d46e09a 2406 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2407#ifdef DEBUGGING
6136c704
AL
2408 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
2409 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2410 SV * const dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2411
ab74612d 2412 SV *re_debug_flags = NULL;
2a782b5b 2413#endif
041457d9 2414 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2415
a3621e74
YO
2416 GET_RE_DEBUG_FLAGS;
2417
d6a28714
JH
2418#ifdef DEBUGGING
2419 PL_regindent++;
2420#endif
2421
a3621e74 2422
d6a28714
JH
2423 /* Note that nextchr is a byte even in UTF */
2424 nextchr = UCHARAT(locinput);
2425 scan = prog;
2426 while (scan != NULL) {
8ba1375e 2427
a3621e74 2428 DEBUG_EXECUTE_r( {
6136c704 2429 SV * const prop = sv_newmortal();
1df70142
AL
2430 const int docolor = *PL_colors[0];
2431 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2432 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2433 /* The part of the string before starttry has one color
2434 (pref0_len chars), between starttry and current
2435 position another one (pref_len - pref0_len chars),
2436 after the current position the third one.
2437 We assume that pref0_len <= pref_len, otherwise we
2438 decrease pref0_len. */
9041c2e3 2439 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2440 ? (5 + taill) - l : locinput - PL_bostr;
2441 int pref0_len;
d6a28714 2442
df1ffd02 2443 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2444 pref_len++;
2445 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2446 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2447 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2448 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2449 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2450 l--;
d6a28714
JH
2451 if (pref0_len < 0)
2452 pref0_len = 0;
2453 if (pref0_len > pref_len)
2454 pref0_len = pref_len;
2455 regprop(prop, scan);
2a782b5b 2456 {
1df70142 2457 const char * const s0 =
f14c76ed 2458 do_utf8 && OP(scan) != CANY ?
2a782b5b 2459 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2460 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2461 locinput - pref_len;
1df70142
AL
2462 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2463 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2464 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2465 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2466 locinput - pref_len + pref0_len;
1df70142
AL
2467 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2468 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2469 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2470 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2471 locinput;
1df70142 2472 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2473 PerlIO_printf(Perl_debug_log,
2474 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2475 (IV)(locinput - PL_bostr),
2476 PL_colors[4],
2477 len0, s0,
2478 PL_colors[5],
2479 PL_colors[2],
2480 len1, s1,
2481 PL_colors[3],
2482 (docolor ? "" : "> <"),
2483 PL_colors[0],
2484 len2, s2,
2485 PL_colors[1],
2486 15 - l - pref_len + 1,
2487 "",
2488 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2489 SvPVX_const(prop));
2a782b5b
JH
2490 }
2491 });
d6a28714
JH
2492
2493 next = scan + NEXT_OFF(scan);
2494 if (next == scan)
2495 next = NULL;
2496
2497 switch (OP(scan)) {
2498 case BOL:
7fba1cd6 2499 if (locinput == PL_bostr)
d6a28714
JH
2500 {
2501 /* regtill = regbol; */
b8c5462f
JH
2502 break;
2503 }
d6a28714
JH
2504 sayNO;
2505 case MBOL:
12d33761
HS
2506 if (locinput == PL_bostr ||
2507 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2508 {
b8c5462f
JH
2509 break;
2510 }
d6a28714
JH
2511 sayNO;
2512 case SBOL:
c2a73568 2513 if (locinput == PL_bostr)
b8c5462f 2514 break;
d6a28714
JH
2515 sayNO;
2516 case GPOS:
2517 if (locinput == PL_reg_ganch)
2518 break;
2519 sayNO;
2520 case EOL:
d6a28714
JH
2521 goto seol;
2522 case MEOL:
d6a28714 2523 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2524 sayNO;
b8c5462f 2525 break;
d6a28714
JH
2526 case SEOL:
2527 seol:
2528 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2529 sayNO;
d6a28714 2530 if (PL_regeol - locinput > 1)
b8c5462f 2531 sayNO;
b8c5462f 2532 break;
d6a28714
JH
2533 case EOS:
2534 if (PL_regeol != locinput)
b8c5462f 2535 sayNO;
d6a28714 2536 break;
ffc61ed2 2537 case SANY:
d6a28714 2538 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2539 sayNO;
f33976b4
DB
2540 if (do_utf8) {
2541 locinput += PL_utf8skip[nextchr];
2542 if (locinput > PL_regeol)
2543 sayNO;
2544 nextchr = UCHARAT(locinput);
2545 }
2546 else
2547 nextchr = UCHARAT(++locinput);
2548 break;
2549 case CANY:
2550 if (!nextchr && locinput >= PL_regeol)
2551 sayNO;
b8c5462f 2552 nextchr = UCHARAT(++locinput);
a0d0e21e 2553 break;
ffc61ed2 2554 case REG_ANY:
1aa99e6b
IH
2555 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2556 sayNO;
2557 if (do_utf8) {
b8c5462f 2558 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2559 if (locinput > PL_regeol)
2560 sayNO;
a0ed51b3 2561 nextchr = UCHARAT(locinput);
a0ed51b3 2562 }
1aa99e6b
IH
2563 else
2564 nextchr = UCHARAT(++locinput);
a0ed51b3 2565 break;
a3621e74
YO
2566
2567
2568
2569 /*
2570 traverse the TRIE keeping track of all accepting states
2571 we transition through until we get to a failing node.
2572
2573 we use two slightly different pieces of code to handle
2574 the traversal depending on whether its case sensitive or
2575 not. we reuse the accept code however. (this should probably
2576 be turned into a macro.)
2577
2578 */
2579 case TRIEF:
2580 case TRIEFL:
2581 {
a3621e74
YO
2582 U8 *uc = ( U8* )locinput;
2583 U32 state = 1;
2584 U16 charid = 0;
2585 U32 base = 0;
2586 UV uvc = 0;
2587 STRLEN len = 0;
2588 STRLEN foldlen = 0;
a3621e74
YO
2589 U8 *uscan = (U8*)NULL;
2590 STRLEN bufflen=0;
2591 accepted = 0;
2592
2593 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2594
2595 while ( state && uc <= (U8*)PL_regeol ) {
2596
2597 TRIE_CHECK_STATE_IS_ACCEPTING;
2598
2599 base = trie->states[ state ].trans.base;
2600
2601 DEBUG_TRIE_EXECUTE_r(
2602 PerlIO_printf( Perl_debug_log,
e4584336 2603 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2604 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2605 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2606 );
2607
2608 if ( base ) {
2609
108bb1ad 2610 if ( do_utf8 ) {
a3621e74
YO
2611 if ( foldlen>0 ) {
2612 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2613 foldlen -= len;
2614 uscan += len;
2615 len=0;
2616 } else {
1df70142 2617 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2618 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2619 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2620 foldlen -= UNISKIP( uvc );
2621 uscan = foldbuf + UNISKIP( uvc );
2622 }
2623 } else {
e4584336 2624 uvc = (UV)*uc;
a3621e74
YO
2625 len = 1;
2626 }
2627
2628 TRIE_HANDLE_CHAR;
2629
2630 } else {
2631 state = 0;
2632 }
2633 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2634 PerlIO_printf( Perl_debug_log,
2635 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2636 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2637 );
2638 }
2639 if ( !accepted ) {
2640 sayNO;
2641 } else {
2642 goto TrieAccept;
2643 }
2644 }
2645 /* unreached codepoint: we jump into the middle of the next case
2646 from previous if blocks */
2647 case TRIE:
2648 {
a3621e74
YO
2649 U8 *uc = (U8*)locinput;
2650 U32 state = 1;
2651 U16 charid = 0;
2652 U32 base = 0;
2653 UV uvc = 0;
2654 STRLEN len = 0;
2655 STRLEN bufflen = 0;
2656 accepted = 0;
2657
2658 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2659
2660 while ( state && uc <= (U8*)PL_regeol ) {
2661
2662 TRIE_CHECK_STATE_IS_ACCEPTING;
2663
2664 base = trie->states[ state ].trans.base;
2665
2666 DEBUG_TRIE_EXECUTE_r(
2667 PerlIO_printf( Perl_debug_log,
e4584336 2668 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2669 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2670 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2671 );
2672
2673 if ( base ) {
2674
108bb1ad 2675 if ( do_utf8 ) {
a3621e74
YO
2676 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2677 } else {
2678 uvc = (U32)*uc;
2679 len = 1;
2680 }
2681
2682 TRIE_HANDLE_CHAR;
2683
2684 } else {
2685 state = 0;
2686 }
2687 DEBUG_TRIE_EXECUTE_r(
2688 PerlIO_printf( Perl_debug_log,
e4584336
RB
2689 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2690 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2691 );
2692 }
2693 if ( !accepted ) {
2694 sayNO;
2695 }
2696 }
2697
2698
2699 /*
2700 There was at least one accepting state that we
2701 transitioned through. Presumably the number of accepting
2702 states is going to be low, typically one or two. So we
2703 simply scan through to find the one with lowest wordnum.
2704 Once we find it, we swap the last state into its place
2705 and decrement the size. We then try to match the rest of
2706 the pattern at the point where the word ends, if we
2707 succeed then we end the loop, otherwise the loop
2708 eventually terminates once all of the accepting states
2709 have been tried.
2710 */
2711 TrieAccept:
2712 {
2713 int gotit = 0;
2714
2715 if ( accepted == 1 ) {
2716 DEBUG_EXECUTE_r({
2717 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2718 PerlIO_printf( Perl_debug_log,
2719 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2720 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2721 accept_buff[ 0 ].wordnum,
cfd0369c 2722 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2723 PL_colors[5] );
2724 });
cc601c31 2725 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2726 /* in this case we free tmps/leave before we call regmatch
2727 as we wont be using accept_buff again. */
2728 FREETMPS;
2729 LEAVE;
2730 gotit = regmatch( scan + NEXT_OFF( scan ) );
2731 } else {
2732 DEBUG_EXECUTE_r(
e4584336
RB
2733 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2734 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2735 PL_colors[5] );
2736 );
2737 while ( !gotit && accepted-- ) {
2738 U32 best = 0;
2739 U32 cur;
2740 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2741 DEBUG_TRIE_EXECUTE_r(
2742 PerlIO_printf( Perl_debug_log,
2743 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2744 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2745 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2746 accept_buff[ cur ].wordnum, PL_colors[5] );
2747 );
a3621e74
YO
2748
2749 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2750 best = cur;
2751 }
2752 DEBUG_EXECUTE_r({
6136c704 2753 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2754 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2755 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2756 accept_buff[best].wordnum,
cfd0369c 2757 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2758 PL_colors[5] );
2759 });
2760 if ( best<accepted ) {
2761 reg_trie_accepted tmp = accept_buff[ best ];
2762 accept_buff[ best ] = accept_buff[ accepted ];
2763 accept_buff[ accepted ] = tmp;
2764 best = accepted;
2765 }
cc601c31 2766 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2767
2768 /*
2769 as far as I can tell we only need the SAVETMPS/FREETMPS
2770 for re's with EVAL in them but I'm leaving them in for
2771 all until I can be sure.
2772 */
2773 SAVETMPS;
2774 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2775 FREETMPS;
2776 }
2777 FREETMPS;
2778 LEAVE;
2779 }
2780
2781 if ( gotit ) {
2782 sayYES;
2783 } else {
2784 sayNO;
2785 }
2786 }
2787 /* unreached codepoint */
d6a28714 2788 case EXACT:
cd439c50
IZ
2789 s = STRING(scan);
2790 ln = STR_LEN(scan);
eb160463 2791 if (do_utf8 != UTF) {
bc517b45 2792 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2793 char *l = locinput;
a3b680e6 2794 const char *e = s + ln;
a72c7584 2795
5ff6fc6d
JH
2796 if (do_utf8) {
2797 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2798 while (s < e) {
a3b680e6 2799 STRLEN ulen;
1aa99e6b 2800 if (l >= PL_regeol)
5ff6fc6d
JH
2801 sayNO;
2802 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2803 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2804 uniflags))
5ff6fc6d 2805 sayNO;
bc517b45 2806 l += ulen;
5ff6fc6d 2807 s ++;
1aa99e6b 2808 }
5ff6fc6d
JH
2809 }
2810 else {
2811 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2812 while (s < e) {
a3b680e6 2813 STRLEN ulen;
1aa99e6b
IH
2814 if (l >= PL_regeol)
2815 sayNO;
5ff6fc6d 2816 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2817 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2818 uniflags))
1aa99e6b 2819 sayNO;
bc517b45 2820 s += ulen;
a72c7584 2821 l ++;
1aa99e6b 2822 }
5ff6fc6d 2823 }
1aa99e6b
IH
2824 locinput = l;
2825 nextchr = UCHARAT(locinput);
2826 break;
2827 }
bc517b45 2828 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2829 /* Inline the first character, for speed. */
2830 if (UCHARAT(s) != nextchr)
2831 sayNO;
2832 if (PL_regeol - locinput < ln)
2833 sayNO;
2834 if (ln > 1 && memNE(s, locinput, ln))
2835 sayNO;
2836 locinput += ln;
2837 nextchr = UCHARAT(locinput);
2838 break;
2839 case EXACTFL:
b8c5462f
JH
2840 PL_reg_flags |= RF_tainted;
2841 /* FALL THROUGH */
d6a28714 2842 case EXACTF:
cd439c50
IZ
2843 s = STRING(scan);
2844 ln = STR_LEN(scan);
d6a28714 2845
d07ddd77
JH
2846 if (do_utf8 || UTF) {
2847 /* Either target or the pattern are utf8. */
d6a28714 2848 char *l = locinput;
d07ddd77 2849 char *e = PL_regeol;
bc517b45 2850
eb160463 2851 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2852 l, &e, 0, do_utf8)) {
5486206c
JH
2853 /* One more case for the sharp s:
2854 * pack("U0U*", 0xDF) =~ /ss/i,
2855 * the 0xC3 0x9F are the UTF-8
2856 * byte sequence for the U+00DF. */
2857 if (!(do_utf8 &&
2858 toLOWER(s[0]) == 's' &&
2859 ln >= 2 &&
2860 toLOWER(s[1]) == 's' &&
2861 (U8)l[0] == 0xC3 &&
2862 e - l >= 2 &&
2863 (U8)l[1] == 0x9F))
2864 sayNO;
2865 }
d07ddd77
JH
2866 locinput = e;
2867 nextchr = UCHARAT(locinput);
2868 break;
a0ed51b3 2869 }
d6a28714 2870
bc517b45
JH
2871 /* Neither the target and the pattern are utf8. */
2872
d6a28714
JH
2873 /* Inline the first character, for speed. */
2874 if (UCHARAT(s) != nextchr &&
2875 UCHARAT(s) != ((OP(scan) == EXACTF)
2876 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2877 sayNO;
d6a28714 2878 if (PL_regeol - locinput < ln)
b8c5462f 2879 sayNO;
d6a28714
JH
2880 if (ln > 1 && (OP(scan) == EXACTF
2881 ? ibcmp(s, locinput, ln)
2882 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2883 sayNO;
d6a28714
JH
2884 locinput += ln;
2885 nextchr = UCHARAT(locinput);
a0d0e21e 2886 break;
d6a28714 2887 case ANYOF:
ffc61ed2 2888 if (do_utf8) {
9e55ce06
JH
2889 STRLEN inclasslen = PL_regeol - locinput;
2890
ba7b4546 2891 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2892 sayNO_ANYOF;
ffc61ed2
JH
2893 if (locinput >= PL_regeol)
2894 sayNO;
0f0076b4 2895 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2896 nextchr = UCHARAT(locinput);
e0f9d4a8 2897 break;
ffc61ed2
JH
2898 }
2899 else {
2900 if (nextchr < 0)
2901 nextchr = UCHARAT(locinput);
7d3e948e 2902 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2903 sayNO_ANYOF;
ffc61ed2
JH
2904 if (!nextchr && locinput >= PL_regeol)
2905 sayNO;
2906 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2907 break;
2908 }
2909 no_anyof:
2910 /* If we might have the case of the German sharp s
2911 * in a casefolding Unicode character class. */
2912
ebc501f0
JH
2913 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2914 locinput += SHARP_S_SKIP;
e0f9d4a8 2915 nextchr = UCHARAT(locinput);
ffc61ed2 2916 }
e0f9d4a8
JH
2917 else
2918 sayNO;
b8c5462f 2919 break;
d6a28714 2920 case ALNUML:
b8c5462f
JH
2921 PL_reg_flags |= RF_tainted;
2922 /* FALL THROUGH */
d6a28714 2923 case ALNUM:
b8c5462f 2924 if (!nextchr)
4633a7c4 2925 sayNO;
ffc61ed2 2926 if (do_utf8) {
1a4fad37 2927 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2928 if (!(OP(scan) == ALNUM
3568d838 2929 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2930 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2931 {
2932 sayNO;
a0ed51b3 2933 }
b8c5462f 2934 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2935 nextchr = UCHARAT(locinput);
2936 break;
2937 }
ffc61ed2 2938 if (!(OP(scan) == ALNUM
d6a28714 2939 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2940 sayNO;
b8c5462f 2941 nextchr = UCHARAT(++locinput);
a0d0e21e 2942 break;
d6a28714 2943 case NALNUML:
b8c5462f
JH
2944 PL_reg_flags |= RF_tainted;
2945 /* FALL THROUGH */
d6a28714
JH
2946 case NALNUM:
2947 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2948 sayNO;
ffc61ed2 2949 if (do_utf8) {
1a4fad37 2950 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2951 if (OP(scan) == NALNUM
3568d838 2952 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2953 : isALNUM_LC_utf8((U8*)locinput))
2954 {
b8c5462f 2955 sayNO;
d6a28714 2956 }
b8c5462f
JH
2957 locinput += PL_utf8skip[nextchr];
2958 nextchr = UCHARAT(locinput);
2959 break;
2960 }
ffc61ed2 2961 if (OP(scan) == NALNUM
d6a28714 2962 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2963 sayNO;
76e3520e 2964 nextchr = UCHARAT(++locinput);
a0d0e21e 2965 break;
d6a28714
JH
2966 case BOUNDL:
2967 case NBOUNDL:
3280af22 2968 PL_reg_flags |= RF_tainted;
bbce6d69 2969 /* FALL THROUGH */
d6a28714
JH
2970 case BOUND:
2971 case NBOUND:
2972 /* was last char in word? */
ffc61ed2 2973 if (do_utf8) {
12d33761
HS
2974 if (locinput == PL_bostr)
2975 ln = '\n';
ffc61ed2 2976 else {
a3b680e6 2977 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2978
b4f7163a 2979 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2980 }
2981 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2982 ln = isALNUM_uni(ln);
1a4fad37 2983 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2984 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2985 }
2986 else {
9041c2e3 2987 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2988 n = isALNUM_LC_utf8((U8*)locinput);
2989 }
a0ed51b3 2990 }
d6a28714 2991 else {
12d33761
HS
2992 ln = (locinput != PL_bostr) ?
2993 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2994 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2995 ln = isALNUM(ln);
2996 n = isALNUM(nextchr);
2997 }
2998 else {
2999 ln = isALNUM_LC(ln);
3000 n = isALNUM_LC(nextchr);
3001 }
d6a28714 3002 }
ffc61ed2
JH
3003 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3004 OP(scan) == BOUNDL))
3005 sayNO;
a0ed51b3 3006 break;
d6a28714 3007 case SPACEL:
3280af22 3008 PL_reg_flags |= RF_tainted;
bbce6d69 3009 /* FALL THROUGH */
d6a28714 3010 case SPACE:
9442cb0e 3011 if (!nextchr)
4633a7c4 3012 sayNO;
1aa99e6b 3013 if (do_utf8) {
fd400ab9 3014 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3015 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3016 if (!(OP(scan) == SPACE
3568d838 3017 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3018 : isSPACE_LC_utf8((U8*)locinput)))
3019 {
3020 sayNO;
3021 }
3022 locinput += PL_utf8skip[nextchr];
3023 nextchr = UCHARAT(locinput);
3024 break;
d6a28714 3025 }
ffc61ed2
JH
3026 if (!(OP(scan) == SPACE
3027 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3028 sayNO;
3029 nextchr = UCHARAT(++locinput);
3030 }
3031 else {
3032 if (!(OP(scan) == SPACE
3033 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3034 sayNO;
3035 nextchr = UCHARAT(++locinput);
a0ed51b3 3036 }
a0ed51b3 3037 break;
d6a28714 3038 case NSPACEL:
3280af22 3039 PL_reg_flags |= RF_tainted;
bbce6d69 3040 /* FALL THROUGH */
d6a28714 3041 case NSPACE:
9442cb0e 3042 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3043 sayNO;
1aa99e6b 3044 if (do_utf8) {
1a4fad37 3045 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3046 if (OP(scan) == NSPACE
3568d838 3047 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3048 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3049 {
3050 sayNO;
3051 }
3052 locinput += PL_utf8skip[nextchr];
3053 nextchr = UCHARAT(locinput);
3054 break;
a0ed51b3 3055 }
ffc61ed2 3056 if (OP(scan) == NSPACE
d6a28714 3057 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3058 sayNO;
76e3520e 3059 nextchr = UCHARAT(++locinput);
a0d0e21e 3060 break;
d6a28714 3061 case DIGITL:
a0ed51b3
LW
3062 PL_reg_flags |= RF_tainted;
3063 /* FALL THROUGH */
d6a28714 3064 case DIGIT:
9442cb0e 3065 if (!nextchr)
a0ed51b3 3066 sayNO;
1aa99e6b 3067 if (do_utf8) {
1a4fad37 3068 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3069 if (!(OP(scan) == DIGIT
3568d838 3070 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3071 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3072 {
a0ed51b3 3073 sayNO;
dfe13c55 3074 }
6f06b55f 3075 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3076 nextchr = UCHARAT(locinput);
3077 break;
3078 }
ffc61ed2 3079 if (!(OP(scan) == DIGIT
9442cb0e 3080 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3081 sayNO;
3082 nextchr = UCHARAT(++locinput);
3083 break;
d6a28714 3084 case NDIGITL:
b8c5462f
JH
3085 PL_reg_flags |= RF_tainted;
3086 /* FALL THROUGH */
d6a28714 3087 case NDIGIT:
9442cb0e 3088 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3089 sayNO;
1aa99e6b 3090 if (do_utf8) {
1a4fad37 3091 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3092 if (OP(scan) == NDIGIT
3568d838 3093 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3094 : isDIGIT_LC_utf8((U8*)locinput))
3095 {
a0ed51b3 3096 sayNO;
9442cb0e 3097 }
6f06b55f 3098 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3099 nextchr = UCHARAT(locinput);
3100 break;
3101 }
ffc61ed2 3102 if (OP(scan) == NDIGIT
9442cb0e 3103 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3104 sayNO;
3105 nextchr = UCHARAT(++locinput);
3106 break;
3107 case CLUMP:
b7c83a7e 3108 if (locinput >= PL_regeol)
a0ed51b3 3109 sayNO;
b7c83a7e 3110 if (do_utf8) {
1a4fad37 3111 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3112 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3113 sayNO;
3114 locinput += PL_utf8skip[nextchr];
3115 while (locinput < PL_regeol &&
3116 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3117 locinput += UTF8SKIP(locinput);
3118 if (locinput > PL_regeol)
3119 sayNO;
eb08e2da
JH
3120 }
3121 else
3122 locinput++;
a0ed51b3
LW
3123 nextchr = UCHARAT(locinput);
3124 break;
c8756f30 3125 case REFFL:
3280af22 3126 PL_reg_flags |= RF_tainted;
c8756f30 3127 /* FALL THROUGH */
c277df42 3128 case REF:
c8756f30 3129 case REFF:
c277df42 3130 n = ARG(scan); /* which paren pair */
cf93c79d 3131 ln = PL_regstartp[n];
2c2d71f5 3132 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3133 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3134 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3135 if (ln == PL_regendp[n])
a0d0e21e 3136 break;
a0ed51b3 3137
cf93c79d 3138 s = PL_bostr + ln;
1aa99e6b 3139 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3140 char *l = locinput;
a3b680e6 3141 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3142 /*
3143 * Note that we can't do the "other character" lookup trick as
3144 * in the 8-bit case (no pun intended) because in Unicode we
3145 * have to map both upper and title case to lower case.
3146 */
3147 if (OP(scan) == REFF) {
3148 while (s < e) {
a3b680e6
AL
3149 STRLEN ulen1, ulen2;
3150 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3151 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3152
a0ed51b3
LW
3153 if (l >= PL_regeol)
3154 sayNO;
a2a2844f
JH
3155 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3156 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3157 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3158 sayNO;
a2a2844f
JH
3159 s += ulen1;
3160 l += ulen2;
a0ed51b3
LW
3161 }
3162 }
3163 locinput = l;
3164 nextchr = UCHARAT(locinput);
3165 break;
3166 }
3167
a0d0e21e 3168 /* Inline the first character, for speed. */
76e3520e 3169 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3170 (OP(scan) == REF ||
3171 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3172 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3173 sayNO;
cf93c79d 3174 ln = PL_regendp[n] - ln;
3280af22 3175 if (locinput + ln > PL_regeol)
4633a7c4 3176 sayNO;
c8756f30
AK
3177 if (ln > 1 && (OP(scan) == REF
3178 ? memNE(s, locinput, ln)
3179 : (OP(scan) == REFF
3180 ? ibcmp(s, locinput, ln)
3181 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3182 sayNO;
a0d0e21e 3183 locinput += ln;
76e3520e 3184 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3185 break;
3186
3187 case NOTHING:
c277df42 3188 case TAIL:
a0d0e21e
LW
3189 break;
3190 case BACK:
3191 break;
c277df42
IZ
3192 case EVAL:
3193 {
3194 dSP;
6136c704
AL
3195 OP_4tree * const oop = PL_op;
3196 COP * const ocurcop = PL_curcop;
f3548bdc 3197 PAD *old_comppad;
c277df42 3198 SV *ret;
6136c704 3199 struct regexp * const oreg = PL_reg_re;
9041c2e3 3200
c277df42 3201 n = ARG(scan);
533c011a 3202 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3203 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3204 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3205 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3206
8e5e9ebe 3207 {
6136c704 3208 SV ** const before = SP;
8e5e9ebe
RGS
3209 CALLRUNOPS(aTHX); /* Scalar context. */
3210 SPAGAIN;
3211 if (SP == before)
075aa684 3212 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3213 else {
3214 ret = POPs;
3215 PUTBACK;
3216 }
3217 }
3218
0f5d15d6 3219 PL_op = oop;
f3548bdc 3220 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3221 PL_curcop = ocurcop;
c277df42 3222 if (logical) {
0f5d15d6
IZ
3223 if (logical == 2) { /* Postponed subexpression. */
3224 regexp *re;
6136c704 3225 MAGIC *mg = NULL;
0f5d15d6 3226 re_cc_state state;
0f5d15d6 3227 CHECKPOINT cp, lastcp;
cb50f42d 3228 int toggleutf;
faf82a0b 3229 register SV *sv;
0f5d15d6 3230
faf82a0b
AE
3231 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3232 mg = mg_find(sv, PERL_MAGIC_qr);
3233 else if (SvSMAGICAL(ret)) {
3234 if (SvGMAGICAL(ret))
3235 sv_unmagic(ret, PERL_MAGIC_qr);
3236 else
3237 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3238 }
faf82a0b 3239
0f5d15d6
IZ
3240 if (mg) {
3241 re = (regexp *)mg->mg_obj;
df0003d4 3242 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3243 }
3244 else {
3245 STRLEN len;
6136c704 3246 const char * const t = SvPV_const(ret, len);
0f5d15d6 3247 PMOP pm;
a3b680e6
AL
3248 char * const oprecomp = PL_regprecomp;
3249 const I32 osize = PL_regsize;
3250 const I32 onpar = PL_regnpar;
0f5d15d6 3251
5fcd1c1b 3252 Zero(&pm, 1, PMOP);
cb50f42d 3253 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3254 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3255 if (!(SvFLAGS(ret)
faf82a0b
AE
3256 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3257 | SVs_GMG)))
14befaf4
DM
3258 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3259 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3260 PL_regprecomp = oprecomp;
3261 PL_regsize = osize;
3262 PL_regnpar = onpar;
3263 }
a3621e74 3264 DEBUG_EXECUTE_r(
9041c2e3 3265 PerlIO_printf(Perl_debug_log,
a0288114 3266 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3267 PL_colors[0],
3268 re->precomp,
3269 PL_colors[1],
3270 (strlen(re->precomp) > 60 ? "..." : ""))
3271 );
3272 state.node = next;
3273 state.prev = PL_reg_call_cc;
3274 state.cc = PL_regcc;
3275 state.re = PL_reg_re;
3276
2ab05381 3277 PL_regcc = 0;
9041c2e3 3278
0f5d15d6 3279 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3280 REGCP_SET(lastcp);
0f5d15d6
IZ
3281 cache_re(re);
3282 state.ss = PL_savestack_ix;
3283 *PL_reglastparen = 0;
a01268b5 3284 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3285 PL_reg_call_cc = &state;
3286 PL_reginput = locinput;
cb50f42d
YST
3287 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3288 ((re->reganch & ROPT_UTF8) != 0);
3289 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3290
3291 /* XXXX This is too dramatic a measure... */
3292 PL_reg_maxiter = 0;
3293
0f5d15d6 3294 if (regmatch(re->program + 1)) {
2c914db6
IZ
3295 /* Even though we succeeded, we need to restore
3296 global variables, since we may be wrapped inside
3297 SUSPEND, thus the match may be not finished yet. */
3298
3299 /* XXXX Do this only if SUSPENDed? */
3300 PL_reg_call_cc = state.prev;
3301 PL_regcc = state.cc;
3302 PL_reg_re = state.re;
3303 cache_re(PL_reg_re);
cb50f42d 3304 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3305
3306 /* XXXX This is too dramatic a measure... */
3307 PL_reg_maxiter = 0;
3308
3309 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3310 ReREFCNT_dec(re);
3311 regcpblow(cp);
3312 sayYES;
3313 }
0f5d15d6 3314 ReREFCNT_dec(re);
02db2b7b 3315 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3316 regcppop();
3317 PL_reg_call_cc = state.prev;
3318 PL_regcc = state.cc;
3319 PL_reg_re = state.re;
d3790889 3320 cache_re(PL_reg_re);
cb50f42d 3321 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3322
3323 /* XXXX This is too dramatic a measure... */
3324 PL_reg_maxiter = 0;
3325
8e514ae6 3326 logical = 0;
0f5d15d6
IZ
3327 sayNO;
3328 }
c277df42 3329 sw = SvTRUE(ret);
0f5d15d6 3330 logical = 0;
a0ed51b3 3331 }
080c2dec 3332 else {
3280af22 3333 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3334 cache_re(oreg);
3335 }
c277df42
IZ
3336 break;
3337 }
a0d0e21e 3338 case OPEN:
c277df42 3339 n = ARG(scan); /* which paren pair */
3280af22
NIS
3340 PL_reg_start_tmp[n] = locinput;
3341 if (n > PL_regsize)
3342 PL_regsize = n;
a0d0e21e
LW
3343 break;
3344 case CLOSE:
c277df42 3345 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3346 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3347 PL_regendp[n] = locinput - PL_bostr;
eb160463 3348 if (n > (I32)*PL_reglastparen)
3280af22 3349 *PL_reglastparen = n;
a01268b5 3350 *PL_reglastcloseparen = n;
a0d0e21e 3351 break;
c277df42
IZ
3352 case GROUPP:
3353 n = ARG(scan); /* which paren pair */
eb160463 3354 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3355 break;
3356 case IFTHEN:
2c2d71f5 3357 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3358 if (sw)
3359 next = NEXTOPER(NEXTOPER(scan));
3360 else {
3361 next = scan + ARG(scan);
3362 if (OP(next) == IFTHEN) /* Fake one. */
3363 next = NEXTOPER(NEXTOPER(next));
3364 }
3365 break;
3366 case LOGICAL:
0f5d15d6 3367 logical = scan->flags;
c277df42 3368 break;
2ab05381
IZ
3369/*******************************************************************
3370 PL_regcc contains infoblock about the innermost (...)* loop, and
3371 a pointer to the next outer infoblock.
3372
3373 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3374
3375 1) After matching X, regnode for CURLYX is processed;
3376
9041c2e3 3377 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3378 regmatch() recursively with the starting point at WHILEM node;
3379
3380 3) Each hit of WHILEM node tries to match A and Z (in the order
3381 depending on the current iteration, min/max of {min,max} and
3382 greediness). The information about where are nodes for "A"
3383 and "Z" is read from the infoblock, as is info on how many times "A"
3384 was already matched, and greediness.
3385
3386 4) After A matches, the same WHILEM node is hit again.
3387
3388 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3389 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3390 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3391 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3392 of the external loop.
3393
3394 Currently present infoblocks form a tree with a stem formed by PL_curcc
3395 and whatever it mentions via ->next, and additional attached trees
3396 corresponding to temporarily unset infoblocks as in "5" above.
3397
9041c2e3 3398 In the following picture infoblocks for outer loop of
2ab05381
IZ
3399 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3400 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3401 infoblocks are drawn below the "reset" infoblock.
3402
3403 In fact in the picture below we do not show failed matches for Z and T
3404 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3405 more obvious *why* one needs to *temporary* unset infoblocks.]
3406
3407 Matched REx position InfoBlocks Comment
3408 (Y(A)*?Z)*?T x
3409 Y(A)*?Z)*?T x <- O
3410 Y (A)*?Z)*?T x <- O
3411 Y A)*?Z)*?T x <- O <- I
3412 YA )*?Z)*?T x <- O <- I
3413 YA A)*?Z)*?T x <- O <- I
3414 YAA )*?Z)*?T x <- O <- I
3415 YAA Z)*?T x <- O # Temporary unset I
3416 I
3417
3418 YAAZ Y(A)*?Z)*?T x <- O
3419 I
3420
3421 YAAZY (A)*?Z)*?T x <- O
3422 I
3423
3424 YAAZY A)*?Z)*?T x <- O <- I
3425 I
3426
3427 YAAZYA )*?Z)*?T x <- O <- I
3428 I
3429
3430 YAAZYA Z)*?T x <- O # Temporary unset I
3431 I,I
3432
3433 YAAZYAZ )*?T x <- O
3434 I,I
3435
3436 YAAZYAZ T x # Temporary unset O
3437 O
3438 I,I
3439
3440 YAAZYAZT x
3441 O
3442 I,I
3443 *******************************************************************/
a0d0e21e
LW
3444 case CURLYX: {
3445 CURCUR cc;
3280af22 3446 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3447 /* No need to save/restore up to this paren */
3448 I32 parenfloor = scan->flags;
c277df42
IZ
3449
3450 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3451 next += ARG(next);
3280af22
NIS
3452 cc.oldcc = PL_regcc;
3453 PL_regcc = &cc;
cb434fcc
IZ
3454 /* XXXX Probably it is better to teach regpush to support
3455 parenfloor > PL_regsize... */
eb160463 3456 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3457 parenfloor = *PL_reglastparen; /* Pessimization... */
3458 cc.parenfloor = parenfloor;
a0d0e21e
LW
3459 cc.cur = -1;
3460 cc.min = ARG1(scan);
3461 cc.max = ARG2(scan);
c277df42 3462 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3463 cc.next = next;
3464 cc.minmod = minmod;
3465 cc.lastloc = 0;
3280af22 3466 PL_reginput = locinput;
a0d0e21e
LW
3467 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3468 regcpblow(cp);
3280af22 3469 PL_regcc = cc.oldcc;
4633a7c4 3470 saySAME(n);
a0d0e21e 3471 }
5f66b61c 3472 /* NOTREACHED */
a0d0e21e
LW
3473 case WHILEM: {
3474 /*
3475 * This is really hard to understand, because after we match
3476 * what we're trying to match, we must make sure the rest of
2c2d71f5 3477 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3478 * to go back UP the parse tree by recursing ever deeper. And
3479 * if it fails, we have to reset our parent's current state
3480 * that we can try again after backing off.
3481 */
3482
c277df42 3483 CHECKPOINT cp, lastcp;
3280af22 3484 CURCUR* cc = PL_regcc;
6136c704 3485 char * const lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3486 I32 cache_offset = 0, cache_bit = 0;
c277df42 3487
4633a7c4 3488 n = cc->cur + 1; /* how many we know we matched */
3280af22 3489 PL_reginput = locinput;
a0d0e21e 3490