This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #37836] Simple Regex causes SEGV when run on specific data
[perl5.git] / regexec.c
<
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
a3b680e6 181 const int retval = PL_savestack_ix;
b1ce53c5 182#define REGCP_PAREN_ELEMS 4
a3b680e6 183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
184 int p;
185
e49a9654
IH
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
188
a01268b5 189#define REGCP_OTHER_ELEMS 6
4b3c1a47 190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
196 SSPUSHINT(p);
197 }
b1ce53c5 198/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
a01268b5 201 SSPUSHINT(*PL_reglastcloseparen);
3280af22 202 SSPUSHPTR(PL_reginput);
41123dfd
JH
203#define REGCP_FRAME_ELEMS 2
204/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 208
a0d0e21e
LW
209 return retval;
210}
211
c277df42 212/* These are needed since we do not localize EVAL nodes: */
a3621e74 213# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 216
a3621e74 217# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 218 PerlIO_printf(Perl_debug_log, \
faccc32b 219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 221
76e3520e 222STATIC char *
cea2e8a9 223S_regcppop(pTHX)
a0d0e21e 224{
b1ce53c5 225 I32 i;
a0d0e21e
LW
226 U32 paren = 0;
227 char *input;
b1ce53c5 228
a3621e74
YO
229 GET_RE_DEBUG_FLAGS_DECL;
230
b1ce53c5 231 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 232 i = SSPOPINT;
b1ce53c5
JH
233 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
234 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 235 input = (char *) SSPOPPTR;
a01268b5 236 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
237 *PL_reglastparen = SSPOPINT;
238 PL_regsize = SSPOPINT;
b1ce53c5
JH
239
240 /* Now restore the parentheses context. */
41123dfd
JH
241 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
242 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 243 I32 tmps;
a0d0e21e 244 paren = (U32)SSPOPINT;
3280af22 245 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
246 PL_regstartp[paren] = SSPOPINT;
247 tmps = SSPOPINT;
3280af22
NIS
248 if (paren <= *PL_reglastparen)
249 PL_regendp[paren] = tmps;
a3621e74 250 DEBUG_EXECUTE_r(
c3464db5 251 PerlIO_printf(Perl_debug_log,
b900a521 252 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 253 (UV)paren, (IV)PL_regstartp[paren],
b900a521 254 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 255 (IV)PL_regendp[paren],
3280af22 256 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 257 );
a0d0e21e 258 }
a3621e74 259 DEBUG_EXECUTE_r(
eb160463 260 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 261 PerlIO_printf(Perl_debug_log,
faccc32b
JH
262 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
263 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
264 }
265 );
daf18116 266#if 1
dafc8851
JH
267 /* It would seem that the similar code in regtry()
268 * already takes care of this, and in fact it is in
269 * a better location to since this code can #if 0-ed out
270 * but the code in regtry() is needed or otherwise tests
271 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
272 * (as of patchlevel 7877) will fail. Then again,
273 * this code seems to be necessary or otherwise
274 * building DynaLoader will fail:
275 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
276 * --jhi */
eb160463
GS
277 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
278 if ((I32)paren > PL_regsize)
cf93c79d
IZ
279 PL_regstartp[paren] = -1;
280 PL_regendp[paren] = -1;
a0d0e21e 281 }
dafc8851 282#endif
a0d0e21e
LW
283 return input;
284}
285
0f5d15d6 286STATIC char *
cea2e8a9 287S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 288{
a3b680e6 289 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
290
291 PL_savestack_ix = ss;
292 regcppop();
293 PL_savestack_ix = tmp;
942e002e 294 return Nullch;
0f5d15d6
IZ
295}
296
297typedef struct re_cc_state
298{
299 I32 ss;
300 regnode *node;
301 struct re_cc_state *prev;
302 CURCUR *cc;
303 regexp *re;
304} re_cc_state;
305
02db2b7b 306#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 307
29d1e993
HS
308#define TRYPAREN(paren, n, input) { \
309 if (paren) { \
310 if (n) { \
311 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
312 PL_regendp[paren] = input - PL_bostr; \
313 } \
314 else \
315 PL_regendp[paren] = -1; \
316 } \
317 if (regmatch(next)) \
318 sayYES; \
319 if (paren && n) \
320 PL_regendp[paren] = -1; \
321}
322
323
a687059c 324/*
e50aee73 325 * pregexec and friends
a687059c
LW
326 */
327
328/*
c277df42 329 - pregexec - match a regexp against a string
a687059c 330 */
c277df42 331I32
864dbfa3 332Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 333 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
334/* strend: pointer to null at end of string */
335/* strbeg: real beginning of string */
336/* minend: end of match must be >=minend after stringarg. */
337/* nosave: For optimizations. */
338{
339 return
9041c2e3 340 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
341 nosave ? 0 : REXEC_COPY_STR);
342}
0f5d15d6
IZ
343
344STATIC void
cea2e8a9 345S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
346{
347 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
348#ifdef DEBUGGING
349 PL_regprogram = prog->program;
350#endif
351 PL_regnpar = prog->nparens;
9041c2e3
NIS
352 PL_regdata = prog->data;
353 PL_reg_re = prog;
0f5d15d6 354}
22e551b9 355
9041c2e3 356/*
cad2e5aa
JH
357 * Need to implement the following flags for reg_anch:
358 *
359 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
360 * USE_INTUIT_ML
361 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
362 * INTUIT_AUTORITATIVE_ML
363 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * INTUIT_ONCE_ML
365 *
366 * Another flag for this function: SECOND_TIME (so that float substrs
367 * with giant delta may be not rechecked).
368 */
369
370/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
371
3f7c398e 372/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
373 Otherwise, only SvCUR(sv) is used to get strbeg. */
374
375/* XXXX We assume that strpos is strbeg unless sv. */
376
6eb5f6b9
JH
377/* XXXX Some places assume that there is a fixed substring.
378 An update may be needed if optimizer marks as "INTUITable"
379 RExen without fixed substrings. Similarly, it is assumed that
380 lengths of all the strings are no more than minlen, thus they
381 cannot come from lookahead.
382 (Or minlen should take into account lookahead.) */
383
2c2d71f5
JH
384/* A failure to find a constant substring means that there is no need to make
385 an expensive call to REx engine, thus we celebrate a failure. Similarly,
386 finding a substring too deep into the string means that less calls to
30944b6d
IZ
387 regtry() should be needed.
388
389 REx compiler's optimizer found 4 possible hints:
390 a) Anchored substring;
391 b) Fixed substring;
392 c) Whether we are anchored (beginning-of-line or \G);
393 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 394 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
395 string which does not contradict any of them.
396 */
2c2d71f5 397
6eb5f6b9
JH
398/* Most of decisions we do here should have been done at compile time.
399 The nodes of the REx which we used for the search should have been
400 deleted from the finite automaton. */
401
cad2e5aa
JH
402char *
403Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
404 char *strend, U32 flags, re_scream_pos_data *data)
405{
b7953727 406 register I32 start_shift = 0;
cad2e5aa 407 /* Should be nonnegative! */
b7953727 408 register I32 end_shift = 0;
2c2d71f5
JH
409 register char *s;
410 register SV *check;
a1933d95 411 char *strbeg;
cad2e5aa 412 char *t;
a3b680e6 413 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 414 I32 ml_anch;
6eb5f6b9 415 register char *other_last = Nullch; /* other substr checked before this */
b7953727 416 char *check_at = Nullch; /* check substr found at this pos */
1df70142 417 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 418#ifdef DEBUGGING
890ce7af
AL
419 const char * const i_strpos = strpos;
420 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 421#endif
a3621e74
YO
422
423 GET_RE_DEBUG_FLAGS_DECL;
424
a30b2f1f 425 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 426
b8d68ded 427 if (prog->reganch & ROPT_UTF8) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
429 "UTF-8 regex...\n"));
430 PL_reg_flags |= RF_utf8;
431 }
432
a3621e74 433 DEBUG_EXECUTE_r({
1df70142 434 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
435 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
436 strpos;
1df70142 437 const int len = PL_reg_match_utf8 ?
b8d68ded 438 strlen(s) : strend - strpos;
2a782b5b
JH
439 if (!PL_colorset)
440 reginitcolors();
b8d68ded 441 if (PL_reg_match_utf8)
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 443 "UTF-8 target...\n"));
2a782b5b 444 PerlIO_printf(Perl_debug_log,
a0288114 445 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 446 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
447 prog->precomp,
448 PL_colors[1],
449 (strlen(prog->precomp) > 60 ? "..." : ""),
450 PL_colors[0],
451 (int)(len > 60 ? 60 : len),
452 s, PL_colors[1],
453 (len > 60 ? "..." : "")
454 );
455 });
cad2e5aa 456
c344f387
JH
457 /* CHR_DIST() would be more correct here but it makes things slow. */
458 if (prog->minlen > strend - strpos) {
a3621e74 459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 460 "String too short... [re_intuit_start]\n"));
cad2e5aa 461 goto fail;
2c2d71f5 462 }
a1933d95 463 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 464 PL_regeol = strend;
33b8afdf
JH
465 if (do_utf8) {
466 if (!prog->check_utf8 && prog->check_substr)
467 to_utf8_substr(prog);
468 check = prog->check_utf8;
469 } else {
470 if (!prog->check_substr && prog->check_utf8)
471 to_byte_substr(prog);
472 check = prog->check_substr;
473 }
474 if (check == &PL_sv_undef) {
a3621e74 475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
476 "Non-utf string cannot match utf check string\n"));
477 goto fail;
478 }
2c2d71f5 479 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
480 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
481 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 482 && !multiline ) ); /* Check after \n? */
cad2e5aa 483
7e25d62c
JH
484 if (!ml_anch) {
485 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
486 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 487 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
488 && sv && !SvROK(sv)
489 && (strpos != strbeg)) {
a3621e74 490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
491 goto fail;
492 }
493 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 494 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 495 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
496 I32 slen;
497
1aa99e6b 498 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
499 if (SvTAIL(check)) {
500 slen = SvCUR(check); /* >= 1 */
cad2e5aa 501
9041c2e3 502 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 503 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 504 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 505 goto fail_finish;
cad2e5aa
JH
506 }
507 /* Now should match s[0..slen-2] */
508 slen--;
3f7c398e 509 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 510 || (slen > 1
3f7c398e 511 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 512 report_neq:
a3621e74 513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
514 goto fail_finish;
515 }
cad2e5aa 516 }
3f7c398e 517 else if (*SvPVX_const(check) != *s
653099ff 518 || ((slen = SvCUR(check)) > 1
3f7c398e 519 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 520 goto report_neq;
c315bfe8 521 check_at = s;
2c2d71f5 522 goto success_at_start;
7e25d62c 523 }
cad2e5aa 524 }
2c2d71f5 525 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 526 s = strpos;
2c2d71f5 527 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 528 end_shift = prog->minlen - start_shift -
653099ff 529 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 530 if (!ml_anch) {
a3b680e6 531 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 532 - (SvTAIL(check) != 0);
a3b680e6 533 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
534
535 if (end_shift < eshift)
536 end_shift = eshift;
537 }
cad2e5aa 538 }
2c2d71f5 539 else { /* Can match at random position */
cad2e5aa
JH
540 ml_anch = 0;
541 s = strpos;
2c2d71f5
JH
542 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
543 /* Should be nonnegative! */
544 end_shift = prog->minlen - start_shift -
653099ff 545 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
546 }
547
2c2d71f5 548#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 549 if (end_shift < 0)
6bbae5e6 550 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
551#endif
552
2c2d71f5
JH
553 restart:
554 /* Find a possible match in the region s..strend by looking for
555 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 556 if (flags & REXEC_SCREAM) {
cad2e5aa 557 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 558 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 559
2c2d71f5
JH
560 if (PL_screamfirst[BmRARE(check)] >= 0
561 || ( BmRARE(check) == '\n'
562 && (BmPREVIOUS(check) == SvCUR(check) - 1)
563 && SvTAIL(check) ))
9041c2e3 564 s = screaminstr(sv, check,
2c2d71f5 565 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 566 else
2c2d71f5 567 goto fail_finish;
4addbd3b
HS
568 /* we may be pointing at the wrong string */
569 if (s && RX_MATCH_COPIED(prog))
3f7c398e 570 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
571 if (data)
572 *data->scream_olds = s;
573 }
f33976b4 574 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
575 s = fbm_instr((U8*)(s + start_shift),
576 (U8*)(strend - end_shift),
7fba1cd6 577 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 578 else
1aa99e6b
IH
579 s = fbm_instr(HOP3(s, start_shift, strend),
580 HOP3(strend, -end_shift, strbeg),
7fba1cd6 581 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
582
583 /* Update the count-of-usability, remove useless subpatterns,
584 unshift s. */
2c2d71f5 585
a0288114 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 587 (s ? "Found" : "Did not find"),
33b8afdf 588 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 589 PL_colors[0],
7b0972df 590 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 591 SvPVX_const(check),
2c2d71f5
JH
592 PL_colors[1], (SvTAIL(check) ? "$" : ""),
593 (s ? " at offset " : "...\n") ) );
594
595 if (!s)
596 goto fail_finish;
597
6eb5f6b9
JH
598 check_at = s;
599
2c2d71f5 600 /* Finish the diagnostic message */
a3621e74 601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
602
603 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
604 Start with the other substr.
605 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 606 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
607 *always* match. Probably should be marked during compile...
608 Probably it is right to do no SCREAM here...
609 */
610
33b8afdf 611 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 612 /* Take into account the "other" substring. */
2c2d71f5
JH
613 /* XXXX May be hopelessly wrong for UTF... */
614 if (!other_last)
6eb5f6b9 615 other_last = strpos;
33b8afdf 616 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
617 do_other_anchored:
618 {
890ce7af
AL
619 char * const last = HOP3c(s, -start_shift, strbeg);
620 char *last1, *last2;
2c2d71f5 621 char *s1 = s;
33b8afdf 622 SV* must;
2c2d71f5 623
2c2d71f5
JH
624 t = s - prog->check_offset_max;
625 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 626 && (!do_utf8
1aa99e6b 627 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 628 && t > strpos)))
30944b6d 629 /* EMPTY */;
2c2d71f5
JH
630 else
631 t = strpos;
1aa99e6b 632 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
633 if (t < other_last) /* These positions already checked */
634 t = other_last;
1aa99e6b 635 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
636 if (last < last1)
637 last1 = last;
638 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
639 /* On end-of-str: see comment below. */
33b8afdf
JH
640 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
641 if (must == &PL_sv_undef) {
642 s = (char*)NULL;
a3621e74 643 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
644 }
645 else
646 s = fbm_instr(
647 (unsigned char*)t,
648 HOP3(HOP3(last1, prog->anchored_offset, strend)
649 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
650 must,
7fba1cd6 651 multiline ? FBMrf_MULTILINE : 0
33b8afdf 652 );
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 654 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
655 (s ? "Found" : "Contradicts"),
656 PL_colors[0],
33b8afdf
JH
657 (int)(SvCUR(must)
658 - (SvTAIL(must)!=0)),
3f7c398e 659 SvPVX_const(must),
33b8afdf 660 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
661 if (!s) {
662 if (last1 >= last2) {
a3621e74 663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
664 ", giving up...\n"));
665 goto fail_finish;
666 }
a3621e74 667 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 668 ", trying floating at offset %ld...\n",
1aa99e6b
IH
669 (long)(HOP3c(s1, 1, strend) - i_strpos)));
670 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
671 s = HOP3c(last, 1, strend);
2c2d71f5
JH
672 goto restart;
673 }
674 else {
a3621e74 675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 676 (long)(s - i_strpos)));
1aa99e6b
IH
677 t = HOP3c(s, -prog->anchored_offset, strbeg);
678 other_last = HOP3c(s, 1, strend);
30944b6d 679 s = s1;
2c2d71f5
JH
680 if (t == strpos)
681 goto try_at_start;
2c2d71f5
JH
682 goto try_at_offset;
683 }
30944b6d 684 }
2c2d71f5
JH
685 }
686 else { /* Take into account the floating substring. */
33b8afdf
JH
687 char *last, *last1;
688 char *s1 = s;
689 SV* must;
690
691 t = HOP3c(s, -start_shift, strbeg);
692 last1 = last =
693 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
694 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
695 last = HOP3c(t, prog->float_max_offset, strend);
696 s = HOP3c(t, prog->float_min_offset, strend);
697 if (s < other_last)
698 s = other_last;
2c2d71f5 699 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
700 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
701 /* fbm_instr() takes into account exact value of end-of-str
702 if the check is SvTAIL(ed). Since false positives are OK,
703 and end-of-str is not later than strend we are OK. */
704 if (must == &PL_sv_undef) {
705 s = (char*)NULL;
a3621e74 706 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
707 }
708 else
2c2d71f5 709 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
710 (unsigned char*)last + SvCUR(must)
711 - (SvTAIL(must)!=0),
7fba1cd6 712 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
714 (s ? "Found" : "Contradicts"),
715 PL_colors[0],
716 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 717 SvPVX_const(must),
33b8afdf
JH
718 PL_colors[1], (SvTAIL(must) ? "$" : "")));
719 if (!s) {
720 if (last1 == last) {
a3621e74 721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
722 ", giving up...\n"));
723 goto fail_finish;
2c2d71f5 724 }
a3621e74 725 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
726 ", trying anchored starting at offset %ld...\n",
727 (long)(s1 + 1 - i_strpos)));
728 other_last = last;
729 s = HOP3c(t, 1, strend);
730 goto restart;
731 }
732 else {
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
734 (long)(s - i_strpos)));
735 other_last = s; /* Fix this later. --Hugo */
736 s = s1;
737 if (t == strpos)
738 goto try_at_start;
739 goto try_at_offset;
740 }
2c2d71f5 741 }
cad2e5aa 742 }
2c2d71f5
JH
743
744 t = s - prog->check_offset_max;
2c2d71f5 745 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 746 && (!do_utf8
1aa99e6b
IH
747 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
748 && t > strpos))) {
2c2d71f5
JH
749 /* Fixed substring is found far enough so that the match
750 cannot start at strpos. */
751 try_at_offset:
cad2e5aa 752 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
753 /* Eventually fbm_*() should handle this, but often
754 anchored_offset is not 0, so this check will not be wasted. */
755 /* XXXX In the code below we prefer to look for "^" even in
756 presence of anchored substrings. And we search even
757 beyond the found float position. These pessimizations
758 are historical artefacts only. */
759 find_anchor:
2c2d71f5 760 while (t < strend - prog->minlen) {
cad2e5aa 761 if (*t == '\n') {
4ee3650e 762 if (t < check_at - prog->check_offset_min) {
33b8afdf 763 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
764 /* Since we moved from the found position,
765 we definitely contradict the found anchored
30944b6d
IZ
766 substr. Due to the above check we do not
767 contradict "check" substr.
768 Thus we can arrive here only if check substr
769 is float. Redo checking for "other"=="fixed".
770 */
9041c2e3 771 strpos = t + 1;
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
774 goto do_other_anchored;
775 }
4ee3650e
GS
776 /* We don't contradict the found floating substring. */
777 /* XXXX Why not check for STCLASS? */
cad2e5aa 778 s = t + 1;
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 780 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
781 goto set_useful;
782 }
4ee3650e
GS
783 /* Position contradicts check-string */
784 /* XXXX probably better to look for check-string
785 than for "\n", so one should lower the limit for t? */
a3621e74 786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 787 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 788 other_last = strpos = s = t + 1;
cad2e5aa
JH
789 goto restart;
790 }
791 t++;
792 }
a3621e74 793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 794 PL_colors[0], PL_colors[1]));
2c2d71f5 795 goto fail_finish;
cad2e5aa 796 }
f5952150 797 else {
a3621e74 798 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 799 PL_colors[0], PL_colors[1]));
f5952150 800 }
cad2e5aa
JH
801 s = t;
802 set_useful:
33b8afdf 803 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
804 }
805 else {
f5952150 806 /* The found string does not prohibit matching at strpos,
2c2d71f5 807 - no optimization of calling REx engine can be performed,
f5952150
GS
808 unless it was an MBOL and we are not after MBOL,
809 or a future STCLASS check will fail this. */
2c2d71f5
JH
810 try_at_start:
811 /* Even in this situation we may use MBOL flag if strpos is offset
812 wrt the start of the string. */
05b4157f 813 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 814 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
815 /* May be due to an implicit anchor of m{.*foo} */
816 && !(prog->reganch & ROPT_IMPLICIT))
817 {
cad2e5aa
JH
818 t = strpos;
819 goto find_anchor;
820 }
a3621e74 821 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 822 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 823 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 824 );
2c2d71f5 825 success_at_start:
30944b6d 826 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
827 && (do_utf8 ? (
828 prog->check_utf8 /* Could be deleted already */
829 && --BmUSEFUL(prog->check_utf8) < 0
830 && (prog->check_utf8 == prog->float_utf8)
831 ) : (
832 prog->check_substr /* Could be deleted already */
833 && --BmUSEFUL(prog->check_substr) < 0
834 && (prog->check_substr == prog->float_substr)
835 )))
66e933ab 836 {
cad2e5aa 837 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 838 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
839 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
840 if (do_utf8 ? prog->check_substr : prog->check_utf8)
841 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
842 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
843 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 844 check = Nullsv; /* abort */
cad2e5aa 845 s = strpos;
3cf5c195
IZ
846 /* XXXX This is a remnant of the old implementation. It
847 looks wasteful, since now INTUIT can use many
6eb5f6b9 848 other heuristics. */
cad2e5aa
JH
849 prog->reganch &= ~RE_USE_INTUIT;
850 }
851 else
852 s = strpos;
853 }
854
6eb5f6b9
JH
855 /* Last resort... */
856 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
857 if (prog->regstclass) {
858 /* minlen == 0 is possible if regstclass is \b or \B,
859 and the fixed substr is ''$.
860 Since minlen is already taken into account, s+1 is before strend;
861 accidentally, minlen >= 1 guaranties no false positives at s + 1
862 even for \b or \B. But (minlen? 1 : 0) below assumes that
863 regstclass does not come from lookahead... */
864 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
865 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 866 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 867 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 868 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 869 : 1);
a3b680e6 870 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 871 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 872 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
873 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
874 cl_l, strend)
875 : strend);
6eb5f6b9
JH
876
877 t = s;
9926ca43 878 cache_re(prog);
06b5626a 879 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
880 if (!s) {
881#ifdef DEBUGGING
e1ec3a88 882 const char *what = 0;
6eb5f6b9
JH
883#endif
884 if (endpos == strend) {
a3621e74 885 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
886 "Could not match STCLASS...\n") );
887 goto fail;
888 }
a3621e74 889 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 890 "This position contradicts STCLASS...\n") );
653099ff
GS
891 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
892 goto fail;
6eb5f6b9 893 /* Contradict one of substrings */
33b8afdf
JH
894 if (prog->anchored_substr || prog->anchored_utf8) {
895 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 896 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 897 hop_and_restart:
1aa99e6b 898 s = HOP3c(t, 1, strend);
66e933ab
GS
899 if (s + start_shift + end_shift > strend) {
900 /* XXXX Should be taken into account earlier? */
a3621e74 901 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
902 "Could not match STCLASS...\n") );
903 goto fail;
904 }
5e39e1e5
HS
905 if (!check)
906 goto giveup;
a3621e74 907 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 908 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
909 what, (long)(s + start_shift - i_strpos)) );
910 goto restart;
911 }
66e933ab 912 /* Have both, check_string is floating */
6eb5f6b9
JH
913 if (t + start_shift >= check_at) /* Contradicts floating=check */
914 goto retry_floating_check;
915 /* Recheck anchored substring, but not floating... */
9041c2e3 916 s = check_at;
5e39e1e5
HS
917 if (!check)
918 goto giveup;
a3621e74 919 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 920 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
921 (long)(other_last - i_strpos)) );
922 goto do_other_anchored;
923 }
60e71179
GS
924 /* Another way we could have checked stclass at the
925 current position only: */
926 if (ml_anch) {
927 s = t = t + 1;
5e39e1e5
HS
928 if (!check)
929 goto giveup;
a3621e74 930 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 931 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 932 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 933 goto try_at_offset;
66e933ab 934 }
33b8afdf 935 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 936 goto fail;
6eb5f6b9
JH
937 /* Check is floating subtring. */
938 retry_floating_check:
939 t = check_at - start_shift;
a3621e74 940 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
941 goto hop_and_restart;
942 }
b7953727 943 if (t != s) {
a3621e74 944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 945 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
946 (long)(t - i_strpos), (long)(s - i_strpos))
947 );
948 }
949 else {
a3621e74 950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
951 "Does not contradict STCLASS...\n");
952 );
953 }
6eb5f6b9 954 }
5e39e1e5 955 giveup:
a3621e74 956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
957 PL_colors[4], (check ? "Guessed" : "Giving up"),
958 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 959 return s;
2c2d71f5
JH
960
961 fail_finish: /* Substring not found */
33b8afdf
JH
962 if (prog->check_substr || prog->check_utf8) /* could be removed already */
963 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 964 fail:
a3621e74 965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 966 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
967 return Nullch;
968}
9661b544 969
6eb5f6b9 970/* We know what class REx starts with. Try to find this position... */
3c3eec57 971STATIC char *
a3b680e6 972S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 973{
27da23d5 974 dVAR;
1df70142 975 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 976 char *m;
d8093b23 977 STRLEN ln;
5dab1207 978 STRLEN lnc;
078c425b 979 register STRLEN uskip;
d8093b23 980 unsigned int c1;
981 unsigned int c2;
6eb5f6b9
JH
982 char *e;
983 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 984 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 985
6eb5f6b9
JH
986 /* We know what class it must start with. */
987 switch (OP(c)) {
6eb5f6b9 988 case ANYOF:
388cc4de 989 if (do_utf8) {
078c425b 990 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
991 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
992 !UTF8_IS_INVARIANT((U8)s[0]) ?
993 reginclass(c, (U8*)s, 0, do_utf8) :
994 REGINCLASS(c, (U8*)s)) {
995 if (tmp && (norun || regtry(prog, s)))
996 goto got_it;
997 else
998 tmp = doevery;
999 }
1000 else
1001 tmp = 1;
078c425b 1002 s += uskip;
388cc4de
HS
1003 }
1004 }
1005 else {
1006 while (s < strend) {
1007 STRLEN skip = 1;
1008
1009 if (REGINCLASS(c, (U8*)s) ||
1010 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1011 /* The assignment of 2 is intentional:
1012 * for the folded sharp s, the skip is 2. */
1013 (skip = SHARP_S_SKIP))) {
1014 if (tmp && (norun || regtry(prog, s)))
1015 goto got_it;
1016 else
1017 tmp = doevery;
1018 }
1019 else
1020 tmp = 1;
1021 s += skip;
1022 }
a0d0e21e 1023 }
6eb5f6b9 1024 break;
f33976b4
DB
1025 case CANY:
1026 while (s < strend) {
1027 if (tmp && (norun || regtry(prog, s)))
1028 goto got_it;
1029 else
1030 tmp = doevery;
1031 s++;
1032 }
1033 break;
6eb5f6b9 1034 case EXACTF:
5dab1207
NIS
1035 m = STRING(c);
1036 ln = STR_LEN(c); /* length to match in octets/bytes */
1037 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1038 if (UTF) {
a2a2844f 1039 STRLEN ulen1, ulen2;
5dab1207 1040 U8 *sm = (U8 *) m;
89ebb4a3
JH
1041 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1042 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1043 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1044
1045 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1046 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1047
89ebb4a3 1048 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1049 0, uniflags);
89ebb4a3 1050 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1051 0, uniflags);
5dab1207
NIS
1052 lnc = 0;
1053 while (sm < ((U8 *) m + ln)) {
1054 lnc++;
1055 sm += UTF8SKIP(sm);
1056 }
1aa99e6b
IH
1057 }
1058 else {
1059 c1 = *(U8*)m;
1060 c2 = PL_fold[c1];
1061 }
6eb5f6b9
JH
1062 goto do_exactf;
1063 case EXACTFL:
5dab1207
NIS
1064 m = STRING(c);
1065 ln = STR_LEN(c);
1066 lnc = (I32) ln;
d8093b23 1067 c1 = *(U8*)m;
6eb5f6b9
JH
1068 c2 = PL_fold_locale[c1];
1069 do_exactf:
db12adc6 1070 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1071
6eb5f6b9
JH
1072 if (norun && e < s)
1073 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1074
60a8b682
JH
1075 /* The idea in the EXACTF* cases is to first find the
1076 * first character of the EXACTF* node and then, if
1077 * necessary, case-insensitively compare the full
1078 * text of the node. The c1 and c2 are the first
1079 * characters (though in Unicode it gets a bit
1080 * more complicated because there are more cases
7f16dd3d
JH
1081 * than just upper and lower: one needs to use
1082 * the so-called folding case for case-insensitive
1083 * matching (called "loose matching" in Unicode).
1084 * ibcmp_utf8() will do just that. */
60a8b682 1085
1aa99e6b 1086 if (do_utf8) {
575cac57 1087 UV c, f;
89ebb4a3 1088 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1089 STRLEN len, foldlen;
041457d9 1090 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1091 if (c1 == c2) {
5dab1207
NIS
1092 /* Upper and lower of 1st char are equal -
1093 * probably not a "letter". */
1aa99e6b 1094 while (s <= e) {
89ebb4a3 1095 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1096 uniflags);
80aecb99
JH
1097 if ( c == c1
1098 && (ln == len ||
66423254 1099 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1100 m, (char **)0, ln, (bool)UTF))
55da9344 1101 && (norun || regtry(prog, s)) )
1aa99e6b 1102 goto got_it;
80aecb99 1103 else {
1df70142 1104 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1105 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1106 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1107 if ( f != c
1108 && (f == c1 || f == c2)
1109 && (ln == foldlen ||
66423254
JH
1110 !ibcmp_utf8((char *) foldbuf,
1111 (char **)0, foldlen, do_utf8,
d07ddd77 1112 m,
eb160463 1113 (char **)0, ln, (bool)UTF))
80aecb99
JH
1114 && (norun || regtry(prog, s)) )
1115 goto got_it;
1116 }
1aa99e6b
IH
1117 s += len;
1118 }
09091399
JH
1119 }
1120 else {
1aa99e6b 1121 while (s <= e) {
89ebb4a3 1122 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1123 uniflags);
80aecb99 1124
60a8b682 1125 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1126 * Note that not all the possible combinations
1127 * are handled here: some of them are handled
1128 * by the standard folding rules, and some of
1129 * them (the character class or ANYOF cases)
1130 * are handled during compiletime in
1131 * regexec.c:S_regclass(). */
880bd946
JH
1132 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1133 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1134 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1135
1136 if ( (c == c1 || c == c2)
1137 && (ln == len ||
66423254 1138 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1139 m, (char **)0, ln, (bool)UTF))
55da9344 1140 && (norun || regtry(prog, s)) )
1aa99e6b 1141 goto got_it;
80aecb99 1142 else {
1df70142 1143 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1144 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1145 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1146 if ( f != c
1147 && (f == c1 || f == c2)
1148 && (ln == foldlen ||
a6872d42 1149 !ibcmp_utf8((char *) foldbuf,
66423254 1150 (char **)0, foldlen, do_utf8,
d07ddd77 1151 m,
eb160463 1152 (char **)0, ln, (bool)UTF))
80aecb99
JH
1153 && (norun || regtry(prog, s)) )
1154 goto got_it;
1155 }
1aa99e6b
IH
1156 s += len;
1157 }
09091399 1158 }
1aa99e6b
IH
1159 }
1160 else {
1161 if (c1 == c2)
1162 while (s <= e) {
1163 if ( *(U8*)s == c1
1164 && (ln == 1 || !(OP(c) == EXACTF
1165 ? ibcmp(s, m, ln)
1166 : ibcmp_locale(s, m, ln)))
1167 && (norun || regtry(prog, s)) )
1168 goto got_it;
1169 s++;
1170 }
1171 else
1172 while (s <= e) {
1173 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1174 && (ln == 1 || !(OP(c) == EXACTF
1175 ? ibcmp(s, m, ln)
1176 : ibcmp_locale(s, m, ln)))
1177 && (norun || regtry(prog, s)) )
1178 goto got_it;
1179 s++;
1180 }
b3c9acc1
IZ
1181 }
1182 break;
bbce6d69 1183 case BOUNDL:
3280af22 1184 PL_reg_flags |= RF_tainted;
bbce6d69 1185 /* FALL THROUGH */
a0d0e21e 1186 case BOUND:
ffc61ed2 1187 if (do_utf8) {
12d33761 1188 if (s == PL_bostr)
ffc61ed2
JH
1189 tmp = '\n';
1190 else {
b4f7163a 1191 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1192
b4f7163a 1193 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1194 }
1195 tmp = ((OP(c) == BOUND ?
9041c2e3 1196 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1197 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1198 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1199 if (tmp == !(OP(c) == BOUND ?
3568d838 1200 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1201 isALNUM_LC_utf8((U8*)s)))
1202 {
1203 tmp = !tmp;
1204 if ((norun || regtry(prog, s)))
1205 goto got_it;
1206 }
078c425b 1207 s += uskip;
a687059c 1208 }
a0d0e21e 1209 }
667bb95a 1210 else {
12d33761 1211 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1212 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1213 while (s < strend) {
1214 if (tmp ==
1215 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1216 tmp = !tmp;
1217 if ((norun || regtry(prog, s)))
1218 goto got_it;
1219 }
1220 s++;
a0ed51b3 1221 }
a0ed51b3 1222 }
6eb5f6b9 1223 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1224 goto got_it;
1225 break;
bbce6d69 1226 case NBOUNDL:
3280af22 1227 PL_reg_flags |= RF_tainted;
bbce6d69 1228 /* FALL THROUGH */
a0d0e21e 1229 case NBOUND:
ffc61ed2 1230 if (do_utf8) {
12d33761 1231 if (s == PL_bostr)
ffc61ed2
JH
1232 tmp = '\n';
1233 else {
b4f7163a 1234 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1235
b4f7163a 1236 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1237 }
1238 tmp = ((OP(c) == NBOUND ?
9041c2e3 1239 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1240 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1241 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1242 if (tmp == !(OP(c) == NBOUND ?
3568d838 1243 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1244 isALNUM_LC_utf8((U8*)s)))
1245 tmp = !tmp;
1246 else if ((norun || regtry(prog, s)))
1247 goto got_it;
078c425b 1248 s += uskip;
ffc61ed2 1249 }
a0d0e21e 1250 }
667bb95a 1251 else {
12d33761 1252 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1253 tmp = ((OP(c) == NBOUND ?
1254 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1255 while (s < strend) {
1256 if (tmp ==
1257 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1258 tmp = !tmp;
1259 else if ((norun || regtry(prog, s)))
1260 goto got_it;
1261 s++;
1262 }
a0ed51b3 1263 }
6eb5f6b9 1264 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1265 goto got_it;
1266 break;
a0d0e21e 1267 case ALNUM:
ffc61ed2 1268 if (do_utf8) {
1a4fad37 1269 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1270 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1271 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1272 if (tmp && (norun || regtry(prog, s)))
1273 goto got_it;
1274 else
1275 tmp = doevery;
1276 }
bbce6d69 1277 else
ffc61ed2 1278 tmp = 1;
078c425b 1279 s += uskip;
bbce6d69 1280 }
bbce6d69 1281 }
ffc61ed2
JH
1282 else {
1283 while (s < strend) {
1284 if (isALNUM(*s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1286 goto got_it;
1287 else
1288 tmp = doevery;
1289 }
a0ed51b3 1290 else
ffc61ed2
JH
1291 tmp = 1;
1292 s++;
a0ed51b3 1293 }
a0ed51b3
LW
1294 }
1295 break;
bbce6d69 1296 case ALNUML:
3280af22 1297 PL_reg_flags |= RF_tainted;
ffc61ed2 1298 if (do_utf8) {
078c425b 1299 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1300 if (isALNUM_LC_utf8((U8*)s)) {
1301 if (tmp && (norun || regtry(prog, s)))
1302 goto got_it;
1303 else
1304 tmp = doevery;
1305 }
a687059c 1306 else
ffc61ed2 1307 tmp = 1;
078c425b 1308 s += uskip;
a0d0e21e 1309 }
a0d0e21e 1310 }
ffc61ed2
JH
1311 else {
1312 while (s < strend) {
1313 if (isALNUM_LC(*s)) {
1314 if (tmp && (norun || regtry(prog, s)))
1315 goto got_it;
1316 else
1317 tmp = doevery;
1318 }
a0ed51b3 1319 else
ffc61ed2
JH
1320 tmp = 1;
1321 s++;
a0ed51b3 1322 }
a0ed51b3
LW
1323 }
1324 break;
a0d0e21e 1325 case NALNUM:
ffc61ed2 1326 if (do_utf8) {
1a4fad37 1327 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1328 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1329 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1330 if (tmp && (norun || regtry(prog, s)))
1331 goto got_it;
1332 else
1333 tmp = doevery;
1334 }
bbce6d69 1335 else
ffc61ed2 1336 tmp = 1;
078c425b 1337 s += uskip;
bbce6d69 1338 }
bbce6d69 1339 }
ffc61ed2
JH
1340 else {
1341 while (s < strend) {
1342 if (!isALNUM(*s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1344 goto got_it;
1345 else
1346 tmp = doevery;
1347 }
a0ed51b3 1348 else
ffc61ed2
JH
1349 tmp = 1;
1350 s++;
a0ed51b3 1351 }
a0ed51b3
LW
1352 }
1353 break;
bbce6d69 1354 case NALNUML:
3280af22 1355 PL_reg_flags |= RF_tainted;
ffc61ed2 1356 if (do_utf8) {
078c425b 1357 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1358 if (!isALNUM_LC_utf8((U8*)s)) {
1359 if (tmp && (norun || regtry(prog, s)))
1360 goto got_it;
1361 else
1362 tmp = doevery;
1363 }
a687059c 1364 else
ffc61ed2 1365 tmp = 1;
078c425b 1366 s += uskip;
a687059c 1367 }
a0d0e21e 1368 }
ffc61ed2
JH
1369 else {
1370 while (s < strend) {
1371 if (!isALNUM_LC(*s)) {
1372 if (tmp && (norun || regtry(prog, s)))
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
a0ed51b3 1377 else
ffc61ed2
JH
1378 tmp = 1;
1379 s++;
a0ed51b3 1380 }
a0ed51b3
LW
1381 }
1382 break;
a0d0e21e 1383 case SPACE:
ffc61ed2 1384 if (do_utf8) {
1a4fad37 1385 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1386 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1387 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1388 if (tmp && (norun || regtry(prog, s)))
1389 goto got_it;
1390 else
1391 tmp = doevery;
1392 }
a0d0e21e 1393 else
ffc61ed2 1394 tmp = 1;
078c425b 1395 s += uskip;
2304df62 1396 }
a0d0e21e 1397 }
ffc61ed2
JH
1398 else {
1399 while (s < strend) {
1400 if (isSPACE(*s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1402 goto got_it;
1403 else
1404 tmp = doevery;
1405 }
a0ed51b3 1406 else
ffc61ed2
JH
1407 tmp = 1;
1408 s++;
a0ed51b3 1409 }
a0ed51b3
LW
1410 }
1411 break;
bbce6d69 1412 case SPACEL:
3280af22 1413 PL_reg_flags |= RF_tainted;
ffc61ed2 1414 if (do_utf8) {
078c425b 1415 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1416 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1417 if (tmp && (norun || regtry(prog, s)))
1418 goto got_it;
1419 else
1420 tmp = doevery;
1421 }
bbce6d69 1422 else
ffc61ed2 1423 tmp = 1;
078c425b 1424 s += uskip;
bbce6d69 1425 }
bbce6d69 1426 }
ffc61ed2
JH
1427 else {
1428 while (s < strend) {
1429 if (isSPACE_LC(*s)) {
1430 if (tmp && (norun || regtry(prog, s)))
1431 goto got_it;
1432 else
1433 tmp = doevery;
1434 }
a0ed51b3 1435 else
ffc61ed2
JH
1436 tmp = 1;
1437 s++;
a0ed51b3 1438 }
a0ed51b3
LW
1439 }
1440 break;
a0d0e21e 1441 case NSPACE:
ffc61ed2 1442 if (do_utf8) {
1a4fad37 1443 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1444 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1445 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1446 if (tmp && (norun || regtry(prog, s)))
1447 goto got_it;
1448 else
1449 tmp = doevery;
1450 }
a0d0e21e 1451 else
ffc61ed2 1452 tmp = 1;
078c425b 1453 s += uskip;
a687059c 1454 }
a0d0e21e 1455 }
ffc61ed2
JH
1456 else {
1457 while (s < strend) {
1458 if (!isSPACE(*s)) {
1459 if (tmp && (norun || regtry(prog, s)))
1460 goto got_it;
1461 else
1462 tmp = doevery;
1463 }
a0ed51b3 1464 else
ffc61ed2
JH
1465 tmp = 1;
1466 s++;
a0ed51b3 1467 }
a0ed51b3
LW
1468 }
1469 break;
bbce6d69 1470 case NSPACEL:
3280af22 1471 PL_reg_flags |= RF_tainted;
ffc61ed2 1472 if (do_utf8) {
078c425b 1473 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1474 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1475 if (tmp && (norun || regtry(prog, s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 }
bbce6d69 1480 else
ffc61ed2 1481 tmp = 1;
078c425b 1482 s += uskip;
bbce6d69 1483 }
bbce6d69 1484 }
ffc61ed2
JH
1485 else {
1486 while (s < strend) {
1487 if (!isSPACE_LC(*s)) {
1488 if (tmp && (norun || regtry(prog, s)))
1489 goto got_it;
1490 else
1491 tmp = doevery;
1492 }
a0ed51b3 1493 else
ffc61ed2
JH
1494 tmp = 1;
1495 s++;
a0ed51b3 1496 }
a0ed51b3
LW
1497 }
1498 break;
a0d0e21e 1499 case DIGIT:
ffc61ed2 1500 if (do_utf8) {
1a4fad37 1501 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1502 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1503 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1504 if (tmp && (norun || regtry(prog, s)))
1505 goto got_it;
1506 else
1507 tmp = doevery;
1508 }
a0d0e21e 1509 else
ffc61ed2 1510 tmp = 1;
078c425b 1511 s += uskip;
2b69d0c2 1512 }
a0d0e21e 1513 }
ffc61ed2
JH
1514 else {
1515 while (s < strend) {
1516 if (isDIGIT(*s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1518 goto got_it;
1519 else
1520 tmp = doevery;
1521 }
a0ed51b3 1522 else
ffc61ed2
JH
1523 tmp = 1;
1524 s++;
a0ed51b3 1525 }
a0ed51b3
LW
1526 }
1527 break;
b8c5462f
JH
1528 case DIGITL:
1529 PL_reg_flags |= RF_tainted;
ffc61ed2 1530 if (do_utf8) {
078c425b 1531 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1532 if (isDIGIT_LC_utf8((U8*)s)) {
1533 if (tmp && (norun || regtry(prog, s)))
1534 goto got_it;
1535 else
1536 tmp = doevery;
1537 }
b8c5462f 1538 else
ffc61ed2 1539 tmp = 1;
078c425b 1540 s += uskip;
b8c5462f 1541 }
b8c5462f 1542 }
ffc61ed2
JH
1543 else {
1544 while (s < strend) {
1545 if (isDIGIT_LC(*s)) {
1546 if (tmp && (norun || regtry(prog, s)))
1547 goto got_it;
1548 else
1549 tmp = doevery;
1550 }
b8c5462f 1551 else
ffc61ed2
JH
1552 tmp = 1;
1553 s++;
b8c5462f 1554 }
b8c5462f
JH
1555 }
1556 break;
a0d0e21e 1557 case NDIGIT:
ffc61ed2 1558 if (do_utf8) {
1a4fad37 1559 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1560 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1561 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1562 if (tmp && (norun || regtry(prog, s)))
1563 goto got_it;
1564 else
1565 tmp = doevery;
1566 }
a0d0e21e 1567 else
ffc61ed2 1568 tmp = 1;
078c425b 1569 s += uskip;
a687059c 1570 }
a0d0e21e 1571 }
ffc61ed2
JH
1572 else {
1573 while (s < strend) {
1574 if (!isDIGIT(*s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1576 goto got_it;
1577 else
1578 tmp = doevery;
1579 }
a0ed51b3 1580 else
ffc61ed2
JH
1581 tmp = 1;
1582 s++;
a0ed51b3 1583 }
a0ed51b3
LW
1584 }
1585 break;
b8c5462f
JH
1586 case NDIGITL:
1587 PL_reg_flags |= RF_tainted;
ffc61ed2 1588 if (do_utf8) {
078c425b 1589 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1590 if (!isDIGIT_LC_utf8((U8*)s)) {
1591 if (tmp && (norun || regtry(prog, s)))
1592 goto got_it;
1593 else
1594 tmp = doevery;
1595 }
b8c5462f 1596 else
ffc61ed2 1597 tmp = 1;
078c425b 1598 s += uskip;
b8c5462f 1599 }
a0ed51b3 1600 }
ffc61ed2
JH
1601 else {
1602 while (s < strend) {
1603 if (!isDIGIT_LC(*s)) {
1604 if (tmp && (norun || regtry(prog, s)))
1605 goto got_it;
1606 else
1607 tmp = doevery;
1608 }
cf93c79d 1609 else
ffc61ed2
JH
1610 tmp = 1;
1611 s++;
b8c5462f 1612 }
b8c5462f
JH
1613 }
1614 break;
b3c9acc1 1615 default:
3c3eec57
GS
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1617 break;
d6a28714 1618 }
6eb5f6b9
JH
1619 return 0;
1620 got_it:
1621 return s;
1622}
1623
1624/*
1625 - regexec_flags - match a regexp against a string
1626 */
1627I32
1628Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630/* strend: pointer to null at end of string */
1631/* strbeg: real beginning of string */
1632/* minend: end of match must be >=minend after stringarg. */
1633/* data: May be used for some additional optimizations. */
1634/* nosave: For optimizations. */
1635{
6eb5f6b9
JH
1636 register char *s;
1637 register regnode *c;
1638 register char *startpos = stringarg;
6eb5f6b9
JH
1639 I32 minlen; /* must match at least this many chars */
1640 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1641 I32 end_shift = 0; /* Same for the end. */ /* CC */
1642 I32 scream_pos = -1; /* Internal iterator of scream. */
1643 char *scream_olds;
1644 SV* oreplsv = GvSV(PL_replgv);
1df70142 1645 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1646 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1647#ifdef DEBUGGING
9e55ce06
JH
1648 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1649 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1650#endif
a3621e74
YO
1651
1652 GET_RE_DEBUG_FLAGS_DECL;
1653
9d4ba2ae 1654 PERL_UNUSED_ARG(data);
a30b2f1f 1655 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1656
1657 PL_regcc = 0;
1658
1659 cache_re(prog);
1660#ifdef DEBUGGING
aea4f609 1661 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1662#endif
1663
1664 /* Be paranoid... */
1665 if (prog == NULL || startpos == NULL) {
1666 Perl_croak(aTHX_ "NULL regexp parameter");
1667 return 0;
1668 }
1669
1670 minlen = prog->minlen;
61a36c01 1671 if (strend - startpos < minlen) {
a3621e74 1672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1673 "String too short [regexec_flags]...\n"));
1674 goto phooey;
1aa99e6b 1675 }
6eb5f6b9 1676
6eb5f6b9
JH
1677 /* Check validity of program. */
1678 if (UCHARAT(prog->program) != REG_MAGIC) {
1679 Perl_croak(aTHX_ "corrupted regexp program");
1680 }
1681
1682 PL_reg_flags = 0;
1683 PL_reg_eval_set = 0;
1684 PL_reg_maxiter = 0;
1685
1686 if (prog->reganch & ROPT_UTF8)
1687 PL_reg_flags |= RF_utf8;
1688
1689 /* Mark beginning of line for ^ and lookbehind. */
1690 PL_regbol = startpos;
1691 PL_bostr = strbeg;
1692 PL_reg_sv = sv;
1693
1694 /* Mark end of line for $ (and such) */
1695 PL_regeol = strend;
1696
1697 /* see how far we have to get to not match where we matched before */
1698 PL_regtill = startpos+minend;
1699
1700 /* We start without call_cc context. */
1701 PL_reg_call_cc = 0;
1702
1703 /* If there is a "must appear" string, look for it. */
1704 s = startpos;
1705
1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1707 MAGIC *mg;
1708
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1710 PL_reg_ganch = startpos;
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && SvMAGIC(sv)
14befaf4
DM
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
6eb5f6b9
JH
1715 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1716 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1717 if (s > PL_reg_ganch)
6eb5f6b9
JH
1718 goto phooey;
1719 s = PL_reg_ganch;
1720 }
1721 }
1722 else /* pos() not defined */
1723 PL_reg_ganch = strbeg;
1724 }
1725
33b8afdf 1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1727 re_scream_pos_data d;
1728
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1732 if (!s) {
a3621e74 1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1734 goto phooey; /* not present */
3fa9c3d7 1735 }
6eb5f6b9
JH
1736 }
1737
a3621e74 1738 DEBUG_EXECUTE_r({
1df70142
AL
1739 const char * const s0 = UTF
1740 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1741 UNI_DISPLAY_REGEX)
1742 : prog->precomp;
1743 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1744 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1745 UNI_DISPLAY_REGEX) : startpos;
1df70142 1746 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1747 if (!PL_colorset)
1748 reginitcolors();
1749 PerlIO_printf(Perl_debug_log,
a0288114 1750 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1751 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1752 len0, len0, s0,
2a782b5b 1753 PL_colors[1],
9e55ce06 1754 len0 > 60 ? "..." : "",
2a782b5b 1755 PL_colors[0],
9e55ce06
JH
1756 (int)(len1 > 60 ? 60 : len1),
1757 s1, PL_colors[1],
1758 (len1 > 60 ? "..." : "")
2a782b5b
JH
1759 );
1760 });
6eb5f6b9
JH
1761
1762 /* Simplest case: anchored match need be tried only once. */
1763 /* [unless only anchor is BOL and multiline is set] */
1764 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1765 if (s == startpos && regtry(prog, startpos))
1766 goto got_it;
7fba1cd6 1767 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1768 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1769 {
1770 char *end;
1771
1772 if (minlen)
1773 dontbother = minlen - 1;
1aa99e6b 1774 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1775 /* for multiline we only have to try after newlines */
33b8afdf 1776 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1777 if (s == startpos)
1778 goto after_try;
1779 while (1) {
1780 if (regtry(prog, s))
1781 goto got_it;
1782 after_try:
1783 if (s >= end)
1784 goto phooey;
1785 if (prog->reganch & RE_USE_INTUIT) {
1786 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1787 if (!s)
1788 goto phooey;
1789 }
1790 else
1791 s++;
1792 }
1793 } else {
1794 if (s > startpos)
1795 s--;
1796 while (s < end) {
1797 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1798 if (regtry(prog, s))
1799 goto got_it;
1800 }
1801 }
1802 }
1803 }
1804 goto phooey;
1805 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1806 if (regtry(prog, PL_reg_ganch))
1807 goto got_it;
1808 goto phooey;
1809 }
1810
1811 /* Messy cases: unanchored match. */
33b8afdf 1812 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1813 /* we have /x+whatever/ */
1814 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1815 char ch;
bf93d4cc
GS
1816#ifdef DEBUGGING
1817 int did_match = 0;
1818#endif
33b8afdf
JH
1819 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1821 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1822
1aa99e6b 1823 if (do_utf8) {
6eb5f6b9
JH
1824 while (s < strend) {
1825 if (*s == ch) {
a3621e74 1826 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1827 if (regtry(prog, s)) goto got_it;
1828 s += UTF8SKIP(s);
1829 while (s < strend && *s == ch)
1830 s += UTF8SKIP(s);
1831 }
1832 s += UTF8SKIP(s);
1833 }
1834 }
1835 else {
1836 while (s < strend) {
1837 if (*s == ch) {
a3621e74 1838 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1839 if (regtry(prog, s)) goto got_it;
1840 s++;
1841 while (s < strend && *s == ch)
1842 s++;
1843 }
1844 s++;
1845 }
1846 }
a3621e74 1847 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1848 PerlIO_printf(Perl_debug_log,
b7953727
JH
1849 "Did not find anchored character...\n")
1850 );
6eb5f6b9 1851 }
33b8afdf
JH
1852 else if (prog->anchored_substr != Nullsv
1853 || prog->anchored_utf8 != Nullsv
1854 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1855 && prog->float_max_offset < strend - s)) {
1856 SV *must;
1857 I32 back_max;
1858 I32 back_min;
1859 char *last;
6eb5f6b9 1860 char *last1; /* Last position checked before */
bf93d4cc
GS
1861#ifdef DEBUGGING
1862 int did_match = 0;
1863#endif
33b8afdf
JH
1864 if (prog->anchored_substr || prog->anchored_utf8) {
1865 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1868 back_max = back_min = prog->anchored_offset;
1869 } else {
1870 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1871 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1872 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1873 back_max = prog->float_max_offset;
1874 back_min = prog->float_min_offset;
1875 }
1876 if (must == &PL_sv_undef)
1877 /* could not downgrade utf8 check substring, so must fail */
1878 goto phooey;
1879
1880 last = HOP3c(strend, /* Cannot start after this */
1881 -(I32)(CHR_SVLEN(must)
1882 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1883
1884 if (s > PL_bostr)
1885 last1 = HOPc(s, -1);
1886 else
1887 last1 = s - 1; /* bogus */
1888
a0288114 1889 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1890 check_substr==must. */
1891 scream_pos = -1;
1892 dontbother = end_shift;
1893 strend = HOPc(strend, -dontbother);
1894 while ( (s <= last) &&
9041c2e3 1895 ((flags & REXEC_SCREAM)
1aa99e6b 1896 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1897 end_shift, &scream_pos, 0))
1aa99e6b 1898 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1899 (unsigned char*)strend, must,
7fba1cd6 1900 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1901 /* we may be pointing at the wrong string */
1902 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1903 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1904 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1905 if (HOPc(s, -back_max) > last1) {
1906 last1 = HOPc(s, -back_min);
1907 s = HOPc(s, -back_max);
1908 }
1909 else {
1910 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1911
1912 last1 = HOPc(s, -back_min);
1913 s = t;
1914 }
1aa99e6b 1915 if (do_utf8) {
6eb5f6b9
JH
1916 while (s <= last1) {
1917 if (regtry(prog, s))
1918 goto got_it;
1919 s += UTF8SKIP(s);
1920 }
1921 }
1922 else {
1923 while (s <= last1) {
1924 if (regtry(prog, s))
1925 goto got_it;
1926 s++;
1927 }
1928 }
1929 }
a3621e74 1930 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1931 PerlIO_printf(Perl_debug_log,
a0288114 1932 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1933 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1934 ? "anchored" : "floating"),
1935 PL_colors[0],
1936 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1937 SvPVX_const(must),
b7953727
JH
1938 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1939 );
6eb5f6b9
JH
1940 goto phooey;
1941 }
155aba94 1942 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1943 if (minlen) {
1944 I32 op = (U8)OP(prog->regstclass);
66e933ab 1945 /* don't bother with what can't match */
f14c76ed
RGS
1946 if (PL_regkind[op] != EXACT && op != CANY)
1947 strend = HOPc(strend, -(minlen - 1));
1948 }
a3621e74 1949 DEBUG_EXECUTE_r({
ffc61ed2 1950 SV *prop = sv_newmortal();
cfd0369c
NC
1951 const char *s0;
1952 const char *s1;
9e55ce06
JH
1953 int len0;
1954 int len1;
1955
ffc61ed2 1956 regprop(prop, c);
9e55ce06 1957 s0 = UTF ?
3f7c398e 1958 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1959 UNI_DISPLAY_REGEX) :
cfd0369c 1960 SvPVX_const(prop);
9e55ce06
JH
1961 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1962 s1 = UTF ?
c728cb41 1963 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1964 len1 = UTF ? SvCUR(dsv1) : strend - s;
1965 PerlIO_printf(Perl_debug_log,
a0288114 1966 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1967 len0, len0, s0,
1968 len1, len1, s1);
ffc61ed2 1969 });
06b5626a 1970 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1971 goto got_it;
a3621e74 1972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1973 }
1974 else {
1975 dontbother = 0;
33b8afdf
JH
1976 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1977 /* Trim the end. */
d6a28714 1978 char *last;
33b8afdf
JH
1979 SV* float_real;
1980
1981 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1982 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1983 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1984
1985 if (flags & REXEC_SCREAM) {
33b8afdf 1986 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1987 end_shift, &scream_pos, 1); /* last one */
1988 if (!last)
ffc61ed2 1989 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1990 /* we may be pointing at the wrong string */
1991 else if (RX_MATCH_COPIED(prog))
3f7c398e 1992 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1993 }
d6a28714
JH
1994 else {
1995 STRLEN len;
cfd0369c 1996 const char * const little = SvPV_const(float_real, len);
d6a28714 1997
33b8afdf 1998 if (SvTAIL(float_real)) {
d6a28714
JH
1999 if (memEQ(strend - len + 1, little, len - 1))
2000 last = strend - len + 1;
7fba1cd6 2001 else if (!multiline)
9041c2e3 2002 last = memEQ(strend - len, little, len)
d6a28714 2003 ? strend - len : Nullch;
b8c5462f 2004 else
d6a28714
JH
2005 goto find_last;
2006 } else {
2007 find_last:
9041c2e3 2008 if (len)
d6a28714 2009 last = rninstr(s, strend, little, little + len);
b8c5462f 2010 else
a0288114 2011 last = strend; /* matching "$" */
b8c5462f 2012 }
b8c5462f 2013 }
bf93d4cc 2014 if (last == NULL) {
a3621e74 2015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2016 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2017 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2018 goto phooey; /* Should not happen! */
2019 }
d6a28714
JH
2020 dontbother = strend - last + prog->float_min_offset;
2021 }
2022 if (minlen && (dontbother < minlen))
2023 dontbother = minlen - 1;
2024 strend -= dontbother; /* this one's always in bytes! */
2025 /* We don't know much -- general case. */
1aa99e6b 2026 if (do_utf8) {
d6a28714
JH
2027 for (;;) {
2028 if (regtry(prog, s))
2029 goto got_it;
2030 if (s >= strend)
2031 break;
b8c5462f 2032 s += UTF8SKIP(s);
d6a28714
JH
2033 };
2034 }
2035 else {
2036 do {
2037 if (regtry(prog, s))
2038 goto got_it;
2039 } while (s++ < strend);
2040 }
2041 }
2042
2043 /* Failure. */
2044 goto phooey;
2045
2046got_it:
2047 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2048
2049 if (PL_reg_eval_set) {
2050 /* Preserve the current value of $^R */
2051 if (oreplsv != GvSV(PL_replgv))
2052 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2053 restored, the value remains
2054 the same. */
acfe0abc 2055 restore_pos(aTHX_ 0);
d6a28714
JH
2056 }
2057
2058 /* make sure $`, $&, $', and $digit will work later */
2059 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2060 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2061 if (flags & REXEC_COPY_STR) {
2062 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2063#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2064 if ((SvIsCOW(sv)
2065 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2066 if (DEBUG_C_TEST) {
2067 PerlIO_printf(Perl_debug_log,
2068 "Copy on write: regexp capture, type %d\n",
2069 (int) SvTYPE(sv));
2070 }
2071 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2072 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2073 assert (SvPOKp(prog->saved_copy));
2074 } else
2075#endif
2076 {
2077 RX_MATCH_COPIED_on(prog);
2078 s = savepvn(strbeg, i);
2079 prog->subbeg = s;
2080 }
d6a28714 2081 prog->sublen = i;
d6a28714
JH
2082 }
2083 else {
2084 prog->subbeg = strbeg;
2085 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2086 }
2087 }
9041c2e3 2088
d6a28714
JH
2089 return 1;
2090
2091phooey:
a3621e74 2092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2093 PL_colors[4], PL_colors[5]));
d6a28714 2094 if (PL_reg_eval_set)
acfe0abc 2095 restore_pos(aTHX_ 0);
d6a28714
JH
2096 return 0;
2097}
2098
2099/*
2100 - regtry - try match at specific point
2101 */
2102STATIC I32 /* 0 failure, 1 success */
2103S_regtry(pTHX_ regexp *prog, char *startpos)
2104{
d6a28714
JH
2105 register I32 i;
2106 register I32 *sp;
2107 register I32 *ep;
2108 CHECKPOINT lastcp;
a3621e74 2109 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2110
02db2b7b
IZ
2111#ifdef DEBUGGING
2112 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2113#endif
d6a28714
JH
2114 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2115 MAGIC *mg;
2116
2117 PL_reg_eval_set = RS_init;
a3621e74 2118 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2119 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2120 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2121 ));
e8347627 2122 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2123 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2124 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2125 SAVETMPS;
2126 /* Apparently this is not needed, judging by wantarray. */
e8347627 2127 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2128 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2129
2130 if (PL_reg_sv) {
2131 /* Make $_ available to executed code. */
2132 if (PL_reg_sv != DEFSV) {
59f00321 2133 SAVE_DEFSV;
d6a28714 2134 DEFSV = PL_reg_sv;
b8c5462f 2135 }
d6a28714 2136
9041c2e3 2137 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2138 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2139 /* prepare for quick setting of pos */
14befaf4
DM
2140 sv_magic(PL_reg_sv, (SV*)0,
2141 PERL_MAGIC_regex_global, Nullch, 0);
2142 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2143 mg->mg_len = -1;
b8c5462f 2144 }
d6a28714
JH
2145 PL_reg_magic = mg;
2146 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2147 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2148 }
09687e5a 2149 if (!PL_reg_curpm) {
a02a5408 2150 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2151#ifdef USE_ITHREADS
2152 {
2153 SV* repointer = newSViv(0);
577e12cc 2154 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2155 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2156 av_push(PL_regex_padav,repointer);
2157 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2158 PL_regex_pad = AvARRAY(PL_regex_padav);
2159 }
2160#endif
2161 }
aaa362c4 2162 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2163 PL_reg_oldcurpm = PL_curpm;
2164 PL_curpm = PL_reg_curpm;
2165 if (RX_MATCH_COPIED(prog)) {
2166 /* Here is a serious problem: we cannot rewrite subbeg,
2167 since it may be needed if this match fails. Thus
2168 $` inside (?{}) could fail... */
2169 PL_reg_oldsaved = prog->subbeg;
2170 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2171#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2172 PL_nrs = prog->saved_copy;
2173#endif
d6a28714
JH
2174 RX_MATCH_COPIED_off(prog);
2175 }
2176 else
2177 PL_reg_oldsaved = Nullch;
2178 prog->subbeg = PL_bostr;
2179 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2180 }
973dddac 2181 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2182 PL_reginput = startpos;
2183 PL_regstartp = prog->startp;
2184 PL_regendp = prog->endp;
2185 PL_reglastparen = &prog->lastparen;
a01268b5 2186 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2187 prog->lastparen = 0;
03994de8 2188 prog->lastcloseparen = 0;
d6a28714 2189 PL_regsize = 0;
a3621e74 2190 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2191 if (PL_reg_start_tmpl <= prog->nparens) {
2192 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2193 if(PL_reg_start_tmp)
2194 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2195 else
a02a5408 2196 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2197 }
2198
2199 /* XXXX What this code is doing here?!!! There should be no need
2200 to do this again and again, PL_reglastparen should take care of
3dd2943c 2201 this! --ilya*/
dafc8851
JH
2202
2203 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2204 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2205 * PL_reglastparen), is not needed at all by the test suite
2206 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2207 * enough, for building DynaLoader, or otherwise this
2208 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2209 * will happen. Meanwhile, this code *is* needed for the
2210 * above-mentioned test suite tests to succeed. The common theme
2211 * on those tests seems to be returning null fields from matches.
2212 * --jhi */
dafc8851 2213#if 1
d6a28714
JH
2214 sp = prog->startp;
2215 ep = prog->endp;
2216 if (prog->nparens) {
eb160463 2217 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2218 *++sp = -1;
2219 *++ep = -1;
2220 }
2221 }
dafc8851 2222#endif
02db2b7b 2223 REGCP_SET(lastcp);
d6a28714
JH
2224 if (regmatch(prog->program + 1)) {
2225 prog->endp[0] = PL_reginput - PL_bostr;
2226 return 1;
2227 }
02db2b7b 2228 REGCP_UNWIND(lastcp);
d6a28714
JH
2229 return 0;
2230}
2231
02db2b7b
IZ
2232#define RE_UNWIND_BRANCH 1
2233#define RE_UNWIND_BRANCHJ 2
2234
2235union re_unwind_t;
2236
2237typedef struct { /* XX: makes sense to enlarge it... */
2238 I32 type;
2239 I32 prev;
2240 CHECKPOINT lastcp;
2241} re_unwind_generic_t;
2242
2243typedef struct {
2244 I32 type;
2245 I32 prev;
2246 CHECKPOINT lastcp;
2247 I32 lastparen;
2248 regnode *next;
2249 char *locinput;
2250 I32 nextchr;
2251#ifdef DEBUGGING
2252 int regindent;
2253#endif
2254} re_unwind_branch_t;
2255
2256typedef union re_unwind_t {
2257 I32 type;
2258 re_unwind_generic_t generic;
2259 re_unwind_branch_t branch;
2260} re_unwind_t;
2261
8ba1375e
MJD
2262#define sayYES goto yes
2263#define sayNO goto no
e0f9d4a8 2264#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2265#define sayYES_FINAL goto yes_final
2266#define sayYES_LOUD goto yes_loud
2267#define sayNO_FINAL goto no_final
2268#define sayNO_SILENT goto do_no
2269#define saySAME(x) if (x) goto yes; else goto no
2270
3ab3c9b4
HS
2271#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2272#define POSCACHE_SEEN 1 /* we know what we're caching */
2273#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2274#define CACHEsayYES STMT_START { \
2275 if (cache_offset | cache_bit) { \
2276 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2277 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2278 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2279 /* cache records failure, but this is success */ \
2280 DEBUG_r( \
2281 PerlIO_printf(Perl_debug_log, \
2282 "%*s (remove success from failure cache)\n", \
2283 REPORT_CODE_OFF+PL_regindent*2, "") \
2284 ); \
2285 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2286 } \
2287 } \
2288 sayYES; \
2289} STMT_END
2290#define CACHEsayNO STMT_START { \
2291 if (cache_offset | cache_bit) { \
2292 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2293 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2294 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2295 /* cache records success, but this is failure */ \
2296 DEBUG_r( \
2297 PerlIO_printf(Perl_debug_log, \
2298 "%*s (remove failure from success cache)\n", \
2299 REPORT_CODE_OFF+PL_regindent*2, "") \
2300 ); \
2301 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2302 } \
2303 } \
2304 sayNO; \
2305} STMT_END
2306
a3621e74
YO
2307/* this is used to determine how far from the left messages like
2308 'failed...' are printed. Currently 29 makes these messages line
2309 up with the opcode they refer to. Earlier perls used 25 which
2310 left these messages outdented making reviewing a debug output
2311 quite difficult.
2312*/
2313#define REPORT_CODE_OFF 29
2314
2315
2316/* Make sure there is a test for this +1 options in re_tests */
2317#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2318
2319#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2320 if ( trie->states[ state ].wordnum ) { \
2321 if ( !accepted ) { \
2322 ENTER; \
2323 SAVETMPS; \
2324 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2325 sv_accept_buff=NEWSV( 1234, \
2326 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2327 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2328 SvPOK_on( sv_accept_buff ); \
2329 sv_2mortal( sv_accept_buff ); \
2330 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2331 } else { \
2332 if ( accepted >= bufflen ) { \
2333 bufflen *= 2; \
2334 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2335 bufflen * sizeof(reg_trie_accepted) ); \
2336 } \
2337 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2338 + sizeof( reg_trie_accepted ) ); \
2339 } \
2340 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2341 accept_buff[ accepted ].endpos = uc; \
2342 ++accepted; \
2343 } } STMT_END
2344
2345#define TRIE_HANDLE_CHAR STMT_START { \
2346 if ( uvc < 256 ) { \
2347 charid = trie->charmap[ uvc ]; \
2348 } else { \
2349 charid = 0; \
2350 if( trie->widecharmap ) { \
2351 SV** svpp = (SV**)NULL; \
2352 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2353 sizeof( UV ), 0 ); \
2354 if ( svpp ) { \
2355 charid = (U16)SvIV( *svpp ); \
2356 } \
2357 } \
2358 } \
2359 if ( charid && \
cc601c31
YO
2360 ( base + charid > trie->uniquecharcount ) && \
2361 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2362 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2363 { \
2364 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2365 } else { \
2366 state = 0; \
2367 } \
2368 uc += len; \
2369 } STMT_END
8ba1375e 2370
d6a28714
JH
2371/*
2372 - regmatch - main matching routine
2373 *
2374 * Conceptually the strategy is simple: check to see whether the current
2375 * node matches, call self recursively to see whether the rest matches,
2376 * and then act accordingly. In practice we make some effort to avoid
2377 * recursion, in particular by going through "ordinary" nodes (that don't
2378 * need to know whether the rest of the match failed) by a loop instead of
2379 * by recursion.
2380 */
2381/* [lwall] I've hoisted the register declarations to the outer block in order to
2382 * maybe save a little bit of pushing and popping on the stack. It also takes
2383 * advantage of machines that use a register save mask on subroutine entry.
2384 */
2385STATIC I32 /* 0 failure, 1 success */
2386S_regmatch(pTHX_ regnode *prog)
2387{
27da23d5 2388 dVAR;
d6a28714
JH
2389 register regnode *scan; /* Current node. */
2390 regnode *next; /* Next node. */
2391 regnode *inner; /* Next node in internal branch. */
2392 register I32 nextchr; /* renamed nextchr - nextchar colides with
2393 function of same name */
2394 register I32 n; /* no or next */
b7953727
JH
2395 register I32 ln = 0; /* len or last */
2396 register char *s = Nullch; /* operand or save */
d6a28714 2397 register char *locinput = PL_reginput;
b7953727 2398 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2399 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2400 I32 unwind = 0;
a3621e74
YO
2401
2402 /* used by the trie code */
ab74612d
NC
2403 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2404 reg_trie_accepted *accept_buff = 0; /* "" */
2405 reg_trie_data *trie; /* what trie are we using right now */
2406 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2407
b7953727 2408#if 0
02db2b7b 2409 I32 firstcp = PL_savestack_ix;
b7953727 2410#endif
0d46e09a 2411 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2412#ifdef DEBUGGING
ce333219
JH
2413 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2414 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2415 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2416
ab74612d 2417 SV *re_debug_flags = NULL;
2a782b5b 2418#endif
041457d9 2419 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2420
a3621e74
YO
2421 GET_RE_DEBUG_FLAGS;
2422
d6a28714
JH
2423#ifdef DEBUGGING
2424 PL_regindent++;
2425#endif
2426
a3621e74 2427
d6a28714
JH
2428 /* Note that nextchr is a byte even in UTF */
2429 nextchr = UCHARAT(locinput);
2430 scan = prog;
2431 while (scan != NULL) {
8ba1375e 2432
a3621e74 2433 DEBUG_EXECUTE_r( {
d6a28714 2434 SV *prop = sv_newmortal();
1df70142
AL
2435 const int docolor = *PL_colors[0];
2436 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2437 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2438 /* The part of the string before starttry has one color
2439 (pref0_len chars), between starttry and current
2440 position another one (pref_len - pref0_len chars),
2441 after the current position the third one.
2442 We assume that pref0_len <= pref_len, otherwise we
2443 decrease pref0_len. */
9041c2e3 2444 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2445 ? (5 + taill) - l : locinput - PL_bostr;
2446 int pref0_len;
d6a28714 2447
df1ffd02 2448 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2449 pref_len++;
2450 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2451 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2452 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2453 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2454 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2455 l--;
d6a28714
JH
2456 if (pref0_len < 0)
2457 pref0_len = 0;
2458 if (pref0_len > pref_len)
2459 pref0_len = pref_len;
2460 regprop(prop, scan);
2a782b5b 2461 {
1df70142 2462 const char * const s0 =
f14c76ed 2463 do_utf8 && OP(scan) != CANY ?
2a782b5b 2464 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2465 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2466 locinput - pref_len;
1df70142
AL
2467 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2468 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2469 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2470 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2471 locinput - pref_len + pref0_len;
1df70142
AL
2472 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2473 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2474 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2475 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2476 locinput;
1df70142 2477 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2478 PerlIO_printf(Perl_debug_log,
2479 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2480 (IV)(locinput - PL_bostr),
2481 PL_colors[4],
2482 len0, s0,
2483 PL_colors[5],
2484 PL_colors[2],
2485 len1, s1,
2486 PL_colors[3],
2487 (docolor ? "" : "> <"),
2488 PL_colors[0],
2489 len2, s2,
2490 PL_colors[1],
2491 15 - l - pref_len + 1,
2492 "",
2493 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2494 SvPVX_const(prop));
2a782b5b
JH
2495 }
2496 });
d6a28714
JH
2497
2498 next = scan + NEXT_OFF(scan);
2499 if (next == scan)
2500 next = NULL;
2501
2502 switch (OP(scan)) {
2503 case BOL:
7fba1cd6 2504 if (locinput == PL_bostr)
d6a28714
JH
2505 {
2506 /* regtill = regbol; */
b8c5462f
JH
2507 break;
2508 }
d6a28714
JH
2509 sayNO;
2510 case MBOL:
12d33761
HS
2511 if (locinput == PL_bostr ||
2512 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2513 {
b8c5462f
JH
2514 break;
2515 }
d6a28714
JH
2516 sayNO;
2517 case SBOL:
c2a73568 2518 if (locinput == PL_bostr)
b8c5462f 2519 break;
d6a28714
JH
2520 sayNO;
2521 case GPOS:
2522 if (locinput == PL_reg_ganch)
2523 break;
2524 sayNO;
2525 case EOL:
d6a28714
JH
2526 goto seol;
2527 case MEOL:
d6a28714 2528 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2529 sayNO;
b8c5462f 2530 break;
d6a28714
JH
2531 case SEOL:
2532 seol:
2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2534 sayNO;
d6a28714 2535 if (PL_regeol - locinput > 1)
b8c5462f 2536 sayNO;
b8c5462f 2537 break;
d6a28714
JH
2538 case EOS:
2539 if (PL_regeol != locinput)
b8c5462f 2540 sayNO;
d6a28714 2541 break;
ffc61ed2 2542 case SANY:
d6a28714 2543 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2544 sayNO;
f33976b4
DB
2545 if (do_utf8) {
2546 locinput += PL_utf8skip[nextchr];
2547 if (locinput > PL_regeol)
2548 sayNO;
2549 nextchr = UCHARAT(locinput);
2550 }
2551 else
2552 nextchr = UCHARAT(++locinput);
2553 break;
2554 case CANY:
2555 if (!nextchr && locinput >= PL_regeol)
2556 sayNO;
b8c5462f 2557 nextchr = UCHARAT(++locinput);
a0d0e21e 2558 break;
ffc61ed2 2559 case REG_ANY:
1aa99e6b
IH
2560 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2561 sayNO;
2562 if (do_utf8) {
b8c5462f 2563 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2564 if (locinput > PL_regeol)
2565 sayNO;
a0ed51b3 2566 nextchr = UCHARAT(locinput);
a0ed51b3 2567 }
1aa99e6b
IH
2568 else
2569 nextchr = UCHARAT(++locinput);
a0ed51b3 2570 break;
a3621e74
YO
2571
2572
2573
2574 /*
2575 traverse the TRIE keeping track of all accepting states
2576 we transition through until we get to a failing node.
2577
2578 we use two slightly different pieces of code to handle
2579 the traversal depending on whether its case sensitive or
2580 not. we reuse the accept code however. (this should probably
2581 be turned into a macro.)
2582
2583 */
2584 case TRIEF:
2585 case TRIEFL:
2586 {
a3621e74
YO
2587 U8 *uc = ( U8* )locinput;
2588 U32 state = 1;
2589 U16 charid = 0;
2590 U32 base = 0;
2591 UV uvc = 0;
2592 STRLEN len = 0;
2593 STRLEN foldlen = 0;
a3621e74
YO
2594 U8 *uscan = (U8*)NULL;
2595 STRLEN bufflen=0;
2596 accepted = 0;
2597
2598 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2599
2600 while ( state && uc <= (U8*)PL_regeol ) {
2601
2602 TRIE_CHECK_STATE_IS_ACCEPTING;
2603
2604 base = trie->states[ state ].trans.base;
2605
2606 DEBUG_TRIE_EXECUTE_r(
2607 PerlIO_printf( Perl_debug_log,
e4584336 2608 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2609 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2610 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2611 );
2612
2613 if ( base ) {
2614
108bb1ad 2615 if ( do_utf8 ) {
a3621e74
YO
2616 if ( foldlen>0 ) {
2617 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2618 foldlen -= len;
2619 uscan += len;
2620 len=0;
2621 } else {
1df70142 2622 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2623 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2624 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2625 foldlen -= UNISKIP( uvc );
2626 uscan = foldbuf + UNISKIP( uvc );
2627 }
2628 } else {
e4584336 2629 uvc = (UV)*uc;
a3621e74
YO
2630 len = 1;
2631 }
2632
2633 TRIE_HANDLE_CHAR;
2634
2635 } else {
2636 state = 0;
2637 }
2638 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2639 PerlIO_printf( Perl_debug_log,
2640 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2641 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2642 );
2643 }
2644 if ( !accepted ) {
2645 sayNO;
2646 } else {
2647 goto TrieAccept;
2648 }
2649 }
2650 /* unreached codepoint: we jump into the middle of the next case
2651 from previous if blocks */
2652 case TRIE:
2653 {
a3621e74
YO
2654 U8 *uc = (U8*)locinput;
2655 U32 state = 1;
2656 U16 charid = 0;
2657 U32 base = 0;
2658 UV uvc = 0;
2659 STRLEN len = 0;
2660 STRLEN bufflen = 0;
2661 accepted = 0;
2662
2663 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2664
2665 while ( state && uc <= (U8*)PL_regeol ) {
2666
2667 TRIE_CHECK_STATE_IS_ACCEPTING;
2668
2669 base = trie->states[ state ].trans.base;
2670
2671 DEBUG_TRIE_EXECUTE_r(
2672 PerlIO_printf( Perl_debug_log,
e4584336 2673 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2674 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2675 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2676 );
2677
2678 if ( base ) {
2679
108bb1ad 2680 if ( do_utf8 ) {
a3621e74
YO
2681 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2682 } else {
2683 uvc = (U32)*uc;
2684 len = 1;
2685 }
2686
2687 TRIE_HANDLE_CHAR;
2688
2689 } else {
2690 state = 0;
2691 }
2692 DEBUG_TRIE_EXECUTE_r(
2693 PerlIO_printf( Perl_debug_log,
e4584336
RB
2694 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2695 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2696 );
2697 }
2698 if ( !accepted ) {
2699 sayNO;
2700 }
2701 }
2702
2703
2704 /*
2705 There was at least one accepting state that we
2706 transitioned through. Presumably the number of accepting
2707 states is going to be low, typically one or two. So we
2708 simply scan through to find the one with lowest wordnum.
2709 Once we find it, we swap the last state into its place
2710 and decrement the size. We then try to match the rest of
2711 the pattern at the point where the word ends, if we
2712 succeed then we end the loop, otherwise the loop
2713 eventually terminates once all of the accepting states
2714 have been tried.
2715 */
2716 TrieAccept:
2717 {
2718 int gotit = 0;
2719
2720 if ( accepted == 1 ) {
2721 DEBUG_EXECUTE_r({
2722 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2723 PerlIO_printf( Perl_debug_log,
2724 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2725 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2726 accept_buff[ 0 ].wordnum,
cfd0369c 2727 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2728 PL_colors[5] );
2729 });
cc601c31 2730 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2731 /* in this case we free tmps/leave before we call regmatch
2732 as we wont be using accept_buff again. */
2733 FREETMPS;
2734 LEAVE;
2735 gotit = regmatch( scan + NEXT_OFF( scan ) );
2736 } else {
2737 DEBUG_EXECUTE_r(
e4584336
RB
2738 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2739 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2740 PL_colors[5] );
2741 );
2742 while ( !gotit && accepted-- ) {
2743 U32 best = 0;
2744 U32 cur;
2745 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2746 DEBUG_TRIE_EXECUTE_r(
2747 PerlIO_printf( Perl_debug_log,
2748 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2749 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2750 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2751 accept_buff[ cur ].wordnum, PL_colors[5] );
2752 );
a3621e74
YO
2753
2754 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2755 best = cur;
2756 }
2757 DEBUG_EXECUTE_r({
2758 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2759 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2760 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2761 accept_buff[best].wordnum,
cfd0369c 2762 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2763 PL_colors[5] );
2764 });
2765 if ( best<accepted ) {
2766 reg_trie_accepted tmp = accept_buff[ best ];
2767 accept_buff[ best ] = accept_buff[ accepted ];
2768 accept_buff[ accepted ] = tmp;
2769 best = accepted;
2770 }
cc601c31 2771 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2772
2773 /*
2774 as far as I can tell we only need the SAVETMPS/FREETMPS
2775 for re's with EVAL in them but I'm leaving them in for
2776 all until I can be sure.
2777 */
2778 SAVETMPS;
2779 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2780 FREETMPS;
2781 }
2782 FREETMPS;
2783 LEAVE;
2784 }
2785
2786 if ( gotit ) {
2787 sayYES;
2788 } else {
2789 sayNO;
2790 }
2791 }
2792 /* unreached codepoint */
d6a28714 2793 case EXACT:
cd439c50
IZ
2794 s = STRING(scan);
2795 ln = STR_LEN(scan);
eb160463 2796 if (do_utf8 != UTF) {
bc517b45 2797 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2798 char *l = locinput;
a3b680e6 2799 const char *e = s + ln;
a72c7584 2800
5ff6fc6d
JH
2801 if (do_utf8) {
2802 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2803 while (s < e) {
a3b680e6 2804 STRLEN ulen;
1aa99e6b 2805 if (l >= PL_regeol)
5ff6fc6d
JH
2806 sayNO;
2807 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2808 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2809 uniflags))
5ff6fc6d 2810 sayNO;
bc517b45 2811 l += ulen;
5ff6fc6d 2812 s ++;
1aa99e6b 2813 }
5ff6fc6d
JH
2814 }
2815 else {
2816 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2817 while (s < e) {
a3b680e6 2818 STRLEN ulen;
1aa99e6b
IH
2819 if (l >= PL_regeol)
2820 sayNO;
5ff6fc6d 2821 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2822 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2823 uniflags))
1aa99e6b 2824 sayNO;
bc517b45 2825 s += ulen;
a72c7584 2826 l ++;
1aa99e6b 2827 }
5ff6fc6d 2828 }
1aa99e6b
IH
2829 locinput = l;
2830 nextchr = UCHARAT(locinput);
2831 break;
2832 }
bc517b45 2833 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2834 /* Inline the first character, for speed. */
2835 if (UCHARAT(s) != nextchr)
2836 sayNO;
2837 if (PL_regeol - locinput < ln)
2838 sayNO;
2839 if (ln > 1 && memNE(s, locinput, ln))
2840 sayNO;
2841 locinput += ln;
2842 nextchr = UCHARAT(locinput);
2843 break;
2844 case EXACTFL:
b8c5462f
JH
2845 PL_reg_flags |= RF_tainted;
2846 /* FALL THROUGH */
d6a28714 2847 case EXACTF:
cd439c50
IZ
2848 s = STRING(scan);
2849 ln = STR_LEN(scan);
d6a28714 2850
d07ddd77
JH
2851 if (do_utf8 || UTF) {
2852 /* Either target or the pattern are utf8. */
d6a28714 2853 char *l = locinput;
d07ddd77 2854 char *e = PL_regeol;
bc517b45 2855
eb160463 2856 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2857 l, &e, 0, do_utf8)) {
5486206c
JH
2858 /* One more case for the sharp s:
2859 * pack("U0U*", 0xDF) =~ /ss/i,
2860 * the 0xC3 0x9F are the UTF-8
2861 * byte sequence for the U+00DF. */
2862 if (!(do_utf8 &&
2863 toLOWER(s[0]) == 's' &&
2864 ln >= 2 &&
2865 toLOWER(s[1]) == 's' &&
2866 (U8)l[0] == 0xC3 &&
2867 e - l >= 2 &&
2868 (U8)l[1] == 0x9F))
2869 sayNO;
2870 }
d07ddd77
JH
2871 locinput = e;
2872 nextchr = UCHARAT(locinput);
2873 break;
a0ed51b3 2874 }
d6a28714 2875
bc517b45
JH
2876 /* Neither the target and the pattern are utf8. */
2877
d6a28714
JH
2878 /* Inline the first character, for speed. */
2879 if (UCHARAT(s) != nextchr &&
2880 UCHARAT(s) != ((OP(scan) == EXACTF)
2881 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2882 sayNO;
d6a28714 2883 if (PL_regeol - locinput < ln)
b8c5462f 2884 sayNO;
d6a28714
JH
2885 if (ln > 1 && (OP(scan) == EXACTF
2886 ? ibcmp(s, locinput, ln)
2887 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2888 sayNO;
d6a28714
JH
2889 locinput += ln;
2890 nextchr = UCHARAT(locinput);
a0d0e21e 2891 break;
d6a28714 2892 case ANYOF:
ffc61ed2 2893 if (do_utf8) {
9e55ce06
JH
2894 STRLEN inclasslen = PL_regeol - locinput;
2895
ba7b4546 2896 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2897 sayNO_ANYOF;
ffc61ed2
JH
2898 if (locinput >= PL_regeol)
2899 sayNO;
0f0076b4 2900 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2901 nextchr = UCHARAT(locinput);
e0f9d4a8 2902 break;
ffc61ed2
JH
2903 }
2904 else {
2905 if (nextchr < 0)
2906 nextchr = UCHARAT(locinput);
7d3e948e 2907 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2908 sayNO_ANYOF;
ffc61ed2
JH
2909 if (!nextchr && locinput >= PL_regeol)
2910 sayNO;
2911 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2912 break;
2913 }
2914 no_anyof:
2915 /* If we might have the case of the German sharp s
2916 * in a casefolding Unicode character class. */
2917
ebc501f0
JH
2918 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2919 locinput += SHARP_S_SKIP;
e0f9d4a8 2920 nextchr = UCHARAT(locinput);
ffc61ed2 2921 }
e0f9d4a8
JH
2922 else
2923 sayNO;
b8c5462f 2924 break;
d6a28714 2925 case ALNUML:
b8c5462f
JH
2926 PL_reg_flags |= RF_tainted;
2927 /* FALL THROUGH */
d6a28714 2928 case ALNUM:
b8c5462f 2929 if (!nextchr)
4633a7c4 2930 sayNO;
ffc61ed2 2931 if (do_utf8) {
1a4fad37 2932 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2933 if (!(OP(scan) == ALNUM
3568d838 2934 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2935 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2936 {
2937 sayNO;
a0ed51b3 2938 }
b8c5462f 2939 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2940 nextchr = UCHARAT(locinput);
2941 break;
2942 }
ffc61ed2 2943 if (!(OP(scan) == ALNUM
d6a28714 2944 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2945 sayNO;
b8c5462f 2946 nextchr = UCHARAT(++locinput);
a0d0e21e 2947 break;
d6a28714 2948 case NALNUML:
b8c5462f
JH
2949 PL_reg_flags |= RF_tainted;
2950 /* FALL THROUGH */
d6a28714
JH
2951 case NALNUM:
2952 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2953 sayNO;
ffc61ed2 2954 if (do_utf8) {
1a4fad37 2955 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2956 if (OP(scan) == NALNUM
3568d838 2957 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2958 : isALNUM_LC_utf8((U8*)locinput))
2959 {
b8c5462f 2960 sayNO;
d6a28714 2961 }
b8c5462f
JH
2962 locinput += PL_utf8skip[nextchr];
2963 nextchr = UCHARAT(locinput);
2964 break;
2965 }
ffc61ed2 2966 if (OP(scan) == NALNUM
d6a28714 2967 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2968 sayNO;
76e3520e 2969 nextchr = UCHARAT(++locinput);
a0d0e21e 2970 break;
d6a28714
JH
2971 case BOUNDL:
2972 case NBOUNDL:
3280af22 2973 PL_reg_flags |= RF_tainted;
bbce6d69 2974 /* FALL THROUGH */
d6a28714
JH
2975 case BOUND:
2976 case NBOUND:
2977 /* was last char in word? */
ffc61ed2 2978 if (do_utf8) {
12d33761
HS
2979 if (locinput == PL_bostr)
2980 ln = '\n';
ffc61ed2 2981 else {
a3b680e6 2982 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2983
b4f7163a 2984 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2985 }
2986 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2987 ln = isALNUM_uni(ln);
1a4fad37 2988 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2989 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2990 }
2991 else {
9041c2e3 2992 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2993 n = isALNUM_LC_utf8((U8*)locinput);
2994 }
a0ed51b3 2995 }
d6a28714 2996 else {
12d33761
HS
2997 ln = (locinput != PL_bostr) ?
2998 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2999 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3000 ln = isALNUM(ln);
3001 n = isALNUM(nextchr);
3002 }
3003 else {
3004 ln = isALNUM_LC(ln);
3005 n = isALNUM_LC(nextchr);
3006 }
d6a28714 3007 }
ffc61ed2
JH
3008 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3009 OP(scan) == BOUNDL))
3010 sayNO;
a0ed51b3 3011 break;
d6a28714 3012 case SPACEL:
3280af22 3013 PL_reg_flags |= RF_tainted;
bbce6d69 3014 /* FALL THROUGH */
d6a28714 3015 case SPACE:
9442cb0e 3016 if (!nextchr)
4633a7c4 3017 sayNO;
1aa99e6b 3018 if (do_utf8) {
fd400ab9 3019 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3020 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3021 if (!(OP(scan) == SPACE
3568d838 3022 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3023 : isSPACE_LC_utf8((U8*)locinput)))
3024 {
3025 sayNO;
3026 }
3027 locinput += PL_utf8skip[nextchr];
3028 nextchr = UCHARAT(locinput);
3029 break;
d6a28714 3030 }
ffc61ed2
JH
3031 if (!(OP(scan) == SPACE
3032 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3033 sayNO;
3034 nextchr = UCHARAT(++locinput);
3035 }
3036 else {
3037 if (!(OP(scan) == SPACE
3038 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3039 sayNO;
3040 nextchr = UCHARAT(++locinput);
a0ed51b3 3041 }
a0ed51b3 3042 break;
d6a28714 3043 case NSPACEL:
3280af22 3044 PL_reg_flags |= RF_tainted;
bbce6d69 3045 /* FALL THROUGH */
d6a28714 3046 case NSPACE:
9442cb0e 3047 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3048 sayNO;
1aa99e6b 3049 if (do_utf8) {
1a4fad37 3050 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3051 if (OP(scan) == NSPACE
3568d838 3052 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3053 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3054 {
3055 sayNO;
3056 }
3057 locinput += PL_utf8skip[nextchr];
3058 nextchr = UCHARAT(locinput);
3059 break;
a0ed51b3 3060 }
ffc61ed2 3061 if (OP(scan) == NSPACE
d6a28714 3062 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3063 sayNO;
76e3520e 3064 nextchr = UCHARAT(++locinput);
a0d0e21e 3065 break;
d6a28714 3066 case DIGITL:
a0ed51b3
LW
3067 PL_reg_flags |= RF_tainted;
3068 /* FALL THROUGH */
d6a28714 3069 case DIGIT:
9442cb0e 3070 if (!nextchr)
a0ed51b3 3071 sayNO;
1aa99e6b 3072 if (do_utf8) {
1a4fad37 3073 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3074 if (!(OP(scan) == DIGIT
3568d838 3075 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3076 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3077 {
a0ed51b3 3078 sayNO;
dfe13c55 3079 }
6f06b55f 3080 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3081 nextchr = UCHARAT(locinput);
3082 break;
3083 }
ffc61ed2 3084 if (!(OP(scan) == DIGIT
9442cb0e 3085 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3086 sayNO;
3087 nextchr = UCHARAT(++locinput);
3088 break;
d6a28714 3089 case NDIGITL:
b8c5462f
JH
3090 PL_reg_flags |= RF_tainted;
3091 /* FALL THROUGH */
d6a28714 3092 case NDIGIT:
9442cb0e 3093 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3094 sayNO;
1aa99e6b 3095 if (do_utf8) {
1a4fad37 3096 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3097 if (OP(scan) == NDIGIT
3568d838 3098 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3099 : isDIGIT_LC_utf8((U8*)locinput))
3100 {
a0ed51b3 3101 sayNO;
9442cb0e 3102 }
6f06b55f 3103 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3104 nextchr = UCHARAT(locinput);
3105 break;
3106 }
ffc61ed2 3107 if (OP(scan) == NDIGIT
9442cb0e 3108 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3109 sayNO;
3110 nextchr = UCHARAT(++locinput);
3111 break;
3112 case CLUMP:
b7c83a7e 3113 if (locinput >= PL_regeol)
a0ed51b3 3114 sayNO;
b7c83a7e 3115 if (do_utf8) {
1a4fad37 3116 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3117 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3118 sayNO;
3119 locinput += PL_utf8skip[nextchr];
3120 while (locinput < PL_regeol &&
3121 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3122 locinput += UTF8SKIP(locinput);
3123 if (locinput > PL_regeol)
3124 sayNO;
eb08e2da
JH
3125 }
3126 else
3127 locinput++;
a0ed51b3
LW
3128 nextchr = UCHARAT(locinput);
3129 break;
c8756f30 3130 case REFFL:
3280af22 3131 PL_reg_flags |= RF_tainted;
c8756f30 3132 /* FALL THROUGH */
c277df42 3133 case REF:
c8756f30 3134 case REFF:
c277df42 3135 n = ARG(scan); /* which paren pair */
cf93c79d 3136 ln = PL_regstartp[n];
2c2d71f5 3137 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3138 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3139 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3140 if (ln == PL_regendp[n])
a0d0e21e 3141 break;
a0ed51b3 3142
cf93c79d 3143 s = PL_bostr + ln;
1aa99e6b 3144 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3145 char *l = locinput;
a3b680e6 3146 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3147 /*
3148 * Note that we can't do the "other character" lookup trick as
3149 * in the 8-bit case (no pun intended) because in Unicode we
3150 * have to map both upper and title case to lower case.
3151 */
3152 if (OP(scan) == REFF) {
3153 while (s < e) {
a3b680e6
AL
3154 STRLEN ulen1, ulen2;
3155 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3156 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3157
a0ed51b3
LW
3158 if (l >= PL_regeol)
3159 sayNO;
a2a2844f
JH
3160 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3161 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3162 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3163 sayNO;
a2a2844f
JH
3164 s += ulen1;
3165 l += ulen2;
a0ed51b3
LW
3166 }
3167 }
3168 locinput = l;
3169 nextchr = UCHARAT(locinput);
3170 break;
3171 }
3172
a0d0e21e 3173 /* Inline the first character, for speed. */
76e3520e 3174 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3175 (OP(scan) == REF ||
3176 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3177 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3178 sayNO;
cf93c79d 3179 ln = PL_regendp[n] - ln;
3280af22 3180 if (locinput + ln > PL_regeol)
4633a7c4 3181 sayNO;
c8756f30
AK
3182 if (ln > 1 && (OP(scan) == REF
3183 ? memNE(s, locinput, ln)
3184 : (OP(scan) == REFF
3185 ? ibcmp(s, locinput, ln)
3186 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3187 sayNO;
a0d0e21e 3188 locinput += ln;
76e3520e 3189 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3190 break;
3191
3192 case NOTHING:
c277df42 3193 case TAIL:
a0d0e21e
LW
3194 break;
3195 case BACK:
3196 break;
c277df42
IZ
3197 case EVAL:
3198 {
3199 dSP;
533c011a 3200 OP_4tree *oop = PL_op;
3280af22 3201 COP *ocurcop = PL_curcop;
f3548bdc 3202 PAD *old_comppad;
c277df42 3203 SV *ret;
080c2dec 3204 struct regexp *oreg = PL_reg_re;
9041c2e3 3205
c277df42 3206 n = ARG(scan);
533c011a 3207 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3208 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3209 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3210 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3211
8e5e9ebe
RGS
3212 {
3213 SV **before = SP;
3214 CALLRUNOPS(aTHX); /* Scalar context. */
3215 SPAGAIN;
3216 if (SP == before)
075aa684 3217 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3218 else {
3219 ret = POPs;
3220 PUTBACK;
3221 }
3222 }
3223
0f5d15d6 3224 PL_op = oop;
f3548bdc 3225 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3226 PL_curcop = ocurcop;
c277df42 3227 if (logical) {
0f5d15d6
IZ
3228 if (logical == 2) { /* Postponed subexpression. */
3229 regexp *re;
22c35a8c 3230 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3231 re_cc_state state;
0f5d15d6 3232 CHECKPOINT cp, lastcp;
cb50f42d 3233 int toggleutf;
faf82a0b 3234 register SV *sv;
0f5d15d6 3235
faf82a0b
AE
3236 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3237 mg = mg_find(sv, PERL_MAGIC_qr);
3238 else if (SvSMAGICAL(ret)) {
3239 if (SvGMAGICAL(ret))
3240 sv_unmagic(ret, PERL_MAGIC_qr);
3241 else
3242 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3243 }
faf82a0b 3244
0f5d15d6
IZ
3245 if (mg) {
3246 re = (regexp *)mg->mg_obj;
df0003d4 3247 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3248 }
3249 else {
3250 STRLEN len;
83003860 3251 const char *t = SvPV_const(ret, len);
0f5d15d6 3252 PMOP pm;
a3b680e6
AL
3253 char * const oprecomp = PL_regprecomp;
3254 const I32 osize = PL_regsize;
3255 const I32 onpar = PL_regnpar;
0f5d15d6 3256
5fcd1c1b 3257 Zero(&pm, 1, PMOP);
cb50f42d 3258 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3259 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3260 if (!(SvFLAGS(ret)
faf82a0b
AE
3261 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3262 | SVs_GMG)))
14befaf4
DM
3263 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3264 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3265 PL_regprecomp = oprecomp;
3266 PL_regsize = osize;
3267 PL_regnpar = onpar;
3268 }
a3621e74 3269 DEBUG_EXECUTE_r(
9041c2e3 3270 PerlIO_printf(Perl_debug_log,
a0288114 3271 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3272 PL_colors[0],
3273 re->precomp,
3274 PL_colors[1],
3275 (strlen(re->precomp) > 60 ? "..." : ""))
3276 );
3277 state.node = next;
3278 state.prev = PL_reg_call_cc;
3279 state.cc = PL_regcc;
3280 state.re = PL_reg_re;
3281
2ab05381 3282 PL_regcc = 0;
9041c2e3 3283
0f5d15d6 3284 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3285 REGCP_SET(lastcp);
0f5d15d6
IZ
3286 cache_re(re);
3287 state.ss = PL_savestack_ix;
3288 *PL_reglastparen = 0;
a01268b5 3289 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3290 PL_reg_call_cc = &state;
3291 PL_reginput = locinput;
cb50f42d
YST
3292 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3293 ((re->reganch & ROPT_UTF8) != 0);
3294 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3295
3296 /* XXXX This is too dramatic a measure... */
3297 PL_reg_maxiter = 0;
3298
0f5d15d6 3299 if (regmatch(re->program + 1)) {
2c914db6
IZ
3300 /* Even though we succeeded, we need to restore
3301 global variables, since we may be wrapped inside
3302 SUSPEND, thus the match may be not finished yet. */
3303
3304 /* XXXX Do this only if SUSPENDed? */
3305 PL_reg_call_cc = state.prev;
3306 PL_regcc = state.cc;
3307 PL_reg_re = state.re;
3308 cache_re(PL_reg_re);
cb50f42d 3309 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3310
3311 /* XXXX This is too dramatic a measure... */
3312 PL_reg_maxiter = 0;
3313
3314 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3315 ReREFCNT_dec(re);
3316 regcpblow(cp);
3317 sayYES;
3318 }
0f5d15d6 3319 ReREFCNT_dec(re);
02db2b7b 3320 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3321 regcppop();
3322 PL_reg_call_cc = state.prev;
3323 PL_regcc = state.cc;
3324 PL_reg_re = state.re;
d3790889 3325 cache_re(PL_reg_re);
cb50f42d 3326 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3327
3328 /* XXXX This is too dramatic a measure... */
3329 PL_reg_maxiter = 0;
3330
8e514ae6 3331 logical = 0;
0f5d15d6
IZ
3332 sayNO;
3333 }
c277df42 3334 sw = SvTRUE(ret);
0f5d15d6 3335 logical = 0;
a0ed51b3 3336 }
080c2dec 3337 else {
3280af22 3338 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3339 cache_re(oreg);
3340 }
c277df42
IZ
3341 break;
3342 }
a0d0e21e 3343 case OPEN:
c277df42 3344 n = ARG(scan); /* which paren pair */
3280af22
NIS
3345 PL_reg_start_tmp[n] = locinput;
3346 if (n > PL_regsize)
3347 PL_regsize = n;
a0d0e21e
LW
3348 break;
3349 case CLOSE:
c277df42 3350 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3351 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3352 PL_regendp[n] = locinput - PL_bostr;
eb160463 3353 if (n > (I32)*PL_reglastparen)
3280af22 3354 *PL_reglastparen = n;
a01268b5 3355 *PL_reglastcloseparen = n;
a0d0e21e 3356 break;
c277df42
IZ
3357 case GROUPP:
3358 n = ARG(scan); /* which paren pair */
eb160463 3359 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3360 break;
3361 case IFTHEN:
2c2d71f5 3362 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3363 if (sw)
3364 next = NEXTOPER(NEXTOPER(scan));
3365 else {
3366 next = scan + ARG(scan);
3367 if (OP(next) == IFTHEN) /* Fake one. */
3368 next = NEXTOPER(NEXTOPER(next));
3369 }
3370 break;
3371 case LOGICAL:
0f5d15d6 3372 logical = scan->flags;
c277df42 3373 break;
2ab05381
IZ
3374/*******************************************************************
3375 PL_regcc contains infoblock about the innermost (...)* loop, and
3376 a pointer to the next outer infoblock.
3377
3378 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3379
3380 1) After matching X, regnode for CURLYX is processed;
3381
9041c2e3 3382 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3383 regmatch() recursively with the starting point at WHILEM node;
3384
3385 3) Each hit of WHILEM node tries to match A and Z (in the order
3386 depending on the current iteration, min/max of {min,max} and
3387 greediness). The information about where are nodes for "A"
3388 and "Z" is read from the infoblock, as is info on how many times "A"
3389 was already matched, and greediness.
3390
3391 4) After A matches, the same WHILEM node is hit again.
3392
3393 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3394 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3395 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3396 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3397 of the external loop.
3398
3399 Currently present infoblocks form a tree with a stem formed by PL_curcc
3400 and whatever it mentions via ->next, and additional attached trees
3401 corresponding to temporarily unset infoblocks as in "5" above.
3402
9041c2e3 3403 In the following picture infoblocks for outer loop of
2ab05381
IZ
3404 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3405 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3406 infoblocks are drawn below the "reset" infoblock.
3407
3408 In fact in the picture below we do not show failed matches for Z and T
3409 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3410 more obvious *why* one needs to *temporary* unset infoblocks.]
3411
3412 Matched REx position InfoBlocks Comment
3413 (Y(A)*?Z)*?T x
3414 Y(A)*?Z)*?T x <- O
3415 Y (A)*?Z)*?T x <- O
3416 Y A)*?Z)*?T x <- O <- I
3417 YA )*?Z)*?T x <- O <- I
3418 YA A)*?Z)*?T x <- O <- I
3419 YAA )*?Z)*?T x <- O <- I
3420 YAA Z)*?T x <- O # Temporary unset I
3421 I
3422
3423 YAAZ Y(A)*?Z)*?T x <- O
3424 I
3425
3426 YAAZY (A)*?Z)*?T x <- O
3427 I
3428
3429 YAAZY A)*?Z)*?T x <- O <- I
3430 I
3431
3432 YAAZYA )*?Z)*?T x <- O <- I
3433 I
3434
3435 YAAZYA Z)*?T x <- O # Temporary unset I
3436 I,I
3437
3438 YAAZYAZ )*?T x <- O
3439 I,I
3440
3441 YAAZYAZ T x # Temporary unset O
3442 O
3443 I,I
3444
3445 YAAZYAZT x
3446 O
3447 I,I
3448 *******************************************************************/
a0d0e21e
LW
3449 case CURLYX: {
3450 CURCUR cc;
3280af22 3451 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3452 /* No need to save/restore up to this paren */
3453 I32 parenfloor = scan->flags;
c277df42
IZ
3454
3455 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3456 next += ARG(next);
3280af22
NIS
3457 cc.oldcc = PL_regcc;
3458 PL_regcc = &cc;
cb434fcc
IZ
3459 /* XXXX Probably it is better to teach regpush to support
3460 parenfloor > PL_regsize... */
eb160463 3461 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3462 parenfloor = *PL_reglastparen; /* Pessimization... */
3463 cc.parenfloor = parenfloor;
a0d0e21e
LW
3464 cc.cur = -1;
3465 cc.min = ARG1(scan);
3466 cc.max = ARG2(scan);
c277df42 3467 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3468 cc.next = next;
3469 cc.minmod = minmod;
3470 cc.lastloc = 0;
3280af22 3471 PL_reginput = locinput;
a0d0e21e
LW
3472 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3473 regcpblow(cp);
3280af22 3474 PL_regcc = cc.oldcc;
4633a7c4 3475 saySAME(n);
a0d0e21e
LW
3476 }
3477 /* NOT REACHED */
3478 case WHILEM: {
3479 /*
3480 * This is really hard to understand, because after we match
3481 * what we're trying to match, we must make sure the rest of
2c2d71f5 3482 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3483 * to go back UP the parse tree by recursing ever deeper. And
3484 * if it fails, we have to reset our parent's current state
3485 * that we can try again after backing off.
3486 */
3487
c277df42 3488 CHECKPOINT cp, lastcp;
3280af22 3489 CURCUR* cc = PL_regcc;
c277df42 3490 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3491 I32 cache_offset = 0, cache_bit = 0;
c277df42 3492
4633a7c4 3493 n = cc->cur + 1; /* how many we know we matched */
3280af22 3494 PL_reginput = locinput;
a0d0e21e 3495
a3621e74 3496 DEBUG_EXECUTE_r(
9041c2e3 3497 PerlIO_printf(Perl_debug_log,
91f3b821 3498 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",