This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
s/PERL_COPY_ON_WRITE/PERL_OLD_COPY_ON_WRITE/g
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
a687059c
LW
86 *
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
90 */
91#include "EXTERN.h"
864dbfa3 92#define PERL_IN_REGEXEC_C
a687059c 93#include "perl.h"
0f5d15d6 94
a687059c
LW
95#include "regcomp.h"
96
c277df42
IZ
97#define RF_tainted 1 /* tainted information used? */
98#define RF_warned 2 /* warned about big count? */
ce862d02 99#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
100#define RF_utf8 8 /* String contains multibyte chars? */
101
eb160463 102#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
103
104#define RS_init 1 /* eval environment created */
105#define RS_set 2 /* replsv value is set */
c277df42 106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
ba7b4546 111#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 112
c277df42
IZ
113/*
114 * Forwards.
115 */
116
33b8afdf 117#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 118#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 119
dfe13c55
GS
120#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
122#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
124#define HOPc(pos,off) ((char*)HOP(pos,off))
125#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 126
efb30f32 127#define HOPBACK(pos, off) ( \
e54858b0 128 (PL_reg_match_utf8) \
efb30f32
HS
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
131 ? (U8*)(pos - off) \
132 : (U8*)NULL \
133)
134#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135
1aa99e6b
IH
136#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
138#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
140#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
144 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
145#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
146#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
147#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
148#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 149
5f80c4cf 150/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
151#define JUMPABLE(rn) ( \
152 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
153 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
154 OP(rn) == PLUS || OP(rn) == MINMOD || \
155 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
156)
157
cca55fe3
JP
158#define HAS_TEXT(rn) ( \
159 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
160)
e2d8ce26 161
a84d97b6
HS
162/*
163 Search for mandatory following text node; for lookahead, the text must
164 follow but for lookbehind (rn->flags != 0) we skip to the next step.
165*/
cca55fe3 166#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 167 while (JUMPABLE(rn)) \
a84d97b6 168 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 169 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
170 else if (OP(rn) == PLUS) \
171 rn = NEXTOPER(rn); \
a84d97b6
HS
172 else if (OP(rn) == IFMATCH) \
173 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 174 else rn += NEXT_OFF(rn); \
5f80c4cf 175} STMT_END
74750237 176
acfe0abc 177static void restore_pos(pTHX_ void *arg);
51371543 178
76e3520e 179STATIC CHECKPOINT
cea2e8a9 180S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 181{
a3b680e6 182 const int retval = PL_savestack_ix;
b1ce53c5 183#define REGCP_PAREN_ELEMS 4
a3b680e6 184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
185 int p;
186
e49a9654
IH
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189
a01268b5 190#define REGCP_OTHER_ELEMS 6
4b3c1a47 191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 192 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 193/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
3280af22 196 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
197 SSPUSHINT(p);
198 }
b1ce53c5 199/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
a01268b5 202 SSPUSHINT(*PL_reglastcloseparen);
3280af22 203 SSPUSHPTR(PL_reginput);
41123dfd
JH
204#define REGCP_FRAME_ELEMS 2
205/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 209
a0d0e21e
LW
210 return retval;
211}
212
c277df42 213/* These are needed since we do not localize EVAL nodes: */
a3621e74 214# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 217
a3621e74 218# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 219 PerlIO_printf(Perl_debug_log, \
faccc32b 220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 222
76e3520e 223STATIC char *
cea2e8a9 224S_regcppop(pTHX)
a0d0e21e 225{
b1ce53c5 226 I32 i;
a0d0e21e
LW
227 U32 paren = 0;
228 char *input;
b1ce53c5 229
a3621e74
YO
230 GET_RE_DEBUG_FLAGS_DECL;
231
b1ce53c5 232 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 233 i = SSPOPINT;
b1ce53c5
JH
234 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
235 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 236 input = (char *) SSPOPPTR;
a01268b5 237 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
238 *PL_reglastparen = SSPOPINT;
239 PL_regsize = SSPOPINT;
b1ce53c5
JH
240
241 /* Now restore the parentheses context. */
41123dfd
JH
242 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
243 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 244 I32 tmps;
a0d0e21e 245 paren = (U32)SSPOPINT;
3280af22 246 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
247 PL_regstartp[paren] = SSPOPINT;
248 tmps = SSPOPINT;
3280af22
NIS
249 if (paren <= *PL_reglastparen)
250 PL_regendp[paren] = tmps;
a3621e74 251 DEBUG_EXECUTE_r(
c3464db5 252 PerlIO_printf(Perl_debug_log,
b900a521 253 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 254 (UV)paren, (IV)PL_regstartp[paren],
b900a521 255 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 256 (IV)PL_regendp[paren],
3280af22 257 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 258 );
a0d0e21e 259 }
a3621e74 260 DEBUG_EXECUTE_r(
eb160463 261 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 262 PerlIO_printf(Perl_debug_log,
faccc32b
JH
263 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
264 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
265 }
266 );
daf18116 267#if 1
dafc8851
JH
268 /* It would seem that the similar code in regtry()
269 * already takes care of this, and in fact it is in
270 * a better location to since this code can #if 0-ed out
271 * but the code in regtry() is needed or otherwise tests
272 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
273 * (as of patchlevel 7877) will fail. Then again,
274 * this code seems to be necessary or otherwise
275 * building DynaLoader will fail:
276 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
277 * --jhi */
eb160463
GS
278 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
279 if ((I32)paren > PL_regsize)
cf93c79d
IZ
280 PL_regstartp[paren] = -1;
281 PL_regendp[paren] = -1;
a0d0e21e 282 }
dafc8851 283#endif
a0d0e21e
LW
284 return input;
285}
286
0f5d15d6 287STATIC char *
cea2e8a9 288S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 289{
a3b680e6 290 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
291
292 PL_savestack_ix = ss;
293 regcppop();
294 PL_savestack_ix = tmp;
942e002e 295 return Nullch;
0f5d15d6
IZ
296}
297
298typedef struct re_cc_state
299{
300 I32 ss;
301 regnode *node;
302 struct re_cc_state *prev;
303 CURCUR *cc;
304 regexp *re;
305} re_cc_state;
306
02db2b7b 307#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 308
29d1e993
HS
309#define TRYPAREN(paren, n, input) { \
310 if (paren) { \
311 if (n) { \
312 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
313 PL_regendp[paren] = input - PL_bostr; \
314 } \
315 else \
316 PL_regendp[paren] = -1; \
317 } \
318 if (regmatch(next)) \
319 sayYES; \
320 if (paren && n) \
321 PL_regendp[paren] = -1; \
322}
323
324
a687059c 325/*
e50aee73 326 * pregexec and friends
a687059c
LW
327 */
328
329/*
c277df42 330 - pregexec - match a regexp against a string
a687059c 331 */
c277df42 332I32
864dbfa3 333Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 334 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
335/* strend: pointer to null at end of string */
336/* strbeg: real beginning of string */
337/* minend: end of match must be >=minend after stringarg. */
338/* nosave: For optimizations. */
339{
340 return
9041c2e3 341 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
342 nosave ? 0 : REXEC_COPY_STR);
343}
0f5d15d6
IZ
344
345STATIC void
cea2e8a9 346S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
347{
348 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
349#ifdef DEBUGGING
350 PL_regprogram = prog->program;
351#endif
352 PL_regnpar = prog->nparens;
9041c2e3
NIS
353 PL_regdata = prog->data;
354 PL_reg_re = prog;
0f5d15d6 355}
22e551b9 356
9041c2e3 357/*
cad2e5aa
JH
358 * Need to implement the following flags for reg_anch:
359 *
360 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
361 * USE_INTUIT_ML
362 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
363 * INTUIT_AUTORITATIVE_ML
364 * INTUIT_ONCE_NOML - Intuit can match in one location only.
365 * INTUIT_ONCE_ML
366 *
367 * Another flag for this function: SECOND_TIME (so that float substrs
368 * with giant delta may be not rechecked).
369 */
370
371/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
372
3f7c398e 373/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
374 Otherwise, only SvCUR(sv) is used to get strbeg. */
375
376/* XXXX We assume that strpos is strbeg unless sv. */
377
6eb5f6b9
JH
378/* XXXX Some places assume that there is a fixed substring.
379 An update may be needed if optimizer marks as "INTUITable"
380 RExen without fixed substrings. Similarly, it is assumed that
381 lengths of all the strings are no more than minlen, thus they
382 cannot come from lookahead.
383 (Or minlen should take into account lookahead.) */
384
2c2d71f5
JH
385/* A failure to find a constant substring means that there is no need to make
386 an expensive call to REx engine, thus we celebrate a failure. Similarly,
387 finding a substring too deep into the string means that less calls to
30944b6d
IZ
388 regtry() should be needed.
389
390 REx compiler's optimizer found 4 possible hints:
391 a) Anchored substring;
392 b) Fixed substring;
393 c) Whether we are anchored (beginning-of-line or \G);
394 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 395 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
396 string which does not contradict any of them.
397 */
2c2d71f5 398
6eb5f6b9
JH
399/* Most of decisions we do here should have been done at compile time.
400 The nodes of the REx which we used for the search should have been
401 deleted from the finite automaton. */
402
cad2e5aa
JH
403char *
404Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
405 char *strend, U32 flags, re_scream_pos_data *data)
406{
b7953727 407 register I32 start_shift = 0;
cad2e5aa 408 /* Should be nonnegative! */
b7953727 409 register I32 end_shift = 0;
2c2d71f5
JH
410 register char *s;
411 register SV *check;
a1933d95 412 char *strbeg;
cad2e5aa 413 char *t;
a3b680e6 414 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 415 I32 ml_anch;
6eb5f6b9 416 register char *other_last = Nullch; /* other substr checked before this */
b7953727 417 char *check_at = Nullch; /* check substr found at this pos */
1df70142 418 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
419#ifdef DEBUGGING
420 char *i_strpos = strpos;
ce333219 421 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 422#endif
a3621e74
YO
423
424 GET_RE_DEBUG_FLAGS_DECL;
425
a30b2f1f 426 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 427
b8d68ded 428 if (prog->reganch & ROPT_UTF8) {
a3621e74 429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
430 "UTF-8 regex...\n"));
431 PL_reg_flags |= RF_utf8;
432 }
433
a3621e74 434 DEBUG_EXECUTE_r({
1df70142 435 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
436 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
437 strpos;
1df70142 438 const int len = PL_reg_match_utf8 ?
b8d68ded 439 strlen(s) : strend - strpos;
2a782b5b
JH
440 if (!PL_colorset)
441 reginitcolors();
b8d68ded 442 if (PL_reg_match_utf8)
a3621e74 443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 444 "UTF-8 target...\n"));
2a782b5b 445 PerlIO_printf(Perl_debug_log,
a0288114 446 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 447 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
448 prog->precomp,
449 PL_colors[1],
450 (strlen(prog->precomp) > 60 ? "..." : ""),
451 PL_colors[0],
452 (int)(len > 60 ? 60 : len),
453 s, PL_colors[1],
454 (len > 60 ? "..." : "")
455 );
456 });
cad2e5aa 457
c344f387
JH
458 /* CHR_DIST() would be more correct here but it makes things slow. */
459 if (prog->minlen > strend - strpos) {
a3621e74 460 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 461 "String too short... [re_intuit_start]\n"));
cad2e5aa 462 goto fail;
2c2d71f5 463 }
a1933d95 464 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 465 PL_regeol = strend;
33b8afdf
JH
466 if (do_utf8) {
467 if (!prog->check_utf8 && prog->check_substr)
468 to_utf8_substr(prog);
469 check = prog->check_utf8;
470 } else {
471 if (!prog->check_substr && prog->check_utf8)
472 to_byte_substr(prog);
473 check = prog->check_substr;
474 }
475 if (check == &PL_sv_undef) {
a3621e74 476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
477 "Non-utf string cannot match utf check string\n"));
478 goto fail;
479 }
2c2d71f5 480 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
481 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
482 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 483 && !multiline ) ); /* Check after \n? */
cad2e5aa 484
7e25d62c
JH
485 if (!ml_anch) {
486 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
487 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 488 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
489 && sv && !SvROK(sv)
490 && (strpos != strbeg)) {
a3621e74 491 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
492 goto fail;
493 }
494 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 495 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 496 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
497 I32 slen;
498
1aa99e6b 499 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
500 if (SvTAIL(check)) {
501 slen = SvCUR(check); /* >= 1 */
cad2e5aa 502
9041c2e3 503 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 504 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 505 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 506 goto fail_finish;
cad2e5aa
JH
507 }
508 /* Now should match s[0..slen-2] */
509 slen--;
3f7c398e 510 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 511 || (slen > 1
3f7c398e 512 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 513 report_neq:
a3621e74 514 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
515 goto fail_finish;
516 }
cad2e5aa 517 }
3f7c398e 518 else if (*SvPVX_const(check) != *s
653099ff 519 || ((slen = SvCUR(check)) > 1
3f7c398e 520 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5
JH
521 goto report_neq;
522 goto success_at_start;
7e25d62c 523 }
cad2e5aa 524 }
2c2d71f5 525 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 526 s = strpos;
2c2d71f5 527 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 528 end_shift = prog->minlen - start_shift -
653099ff 529 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 530 if (!ml_anch) {
a3b680e6 531 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 532 - (SvTAIL(check) != 0);
a3b680e6 533 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
534
535 if (end_shift < eshift)
536 end_shift = eshift;
537 }
cad2e5aa 538 }
2c2d71f5 539 else { /* Can match at random position */
cad2e5aa
JH
540 ml_anch = 0;
541 s = strpos;
2c2d71f5
JH
542 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
543 /* Should be nonnegative! */
544 end_shift = prog->minlen - start_shift -
653099ff 545 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
546 }
547
2c2d71f5 548#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 549 if (end_shift < 0)
6bbae5e6 550 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
551#endif
552
2c2d71f5
JH
553 restart:
554 /* Find a possible match in the region s..strend by looking for
555 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 556 if (flags & REXEC_SCREAM) {
cad2e5aa 557 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 558 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 559
2c2d71f5
JH
560 if (PL_screamfirst[BmRARE(check)] >= 0
561 || ( BmRARE(check) == '\n'
562 && (BmPREVIOUS(check) == SvCUR(check) - 1)
563 && SvTAIL(check) ))
9041c2e3 564 s = screaminstr(sv, check,
2c2d71f5 565 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 566 else
2c2d71f5 567 goto fail_finish;
4addbd3b
HS
568 /* we may be pointing at the wrong string */
569 if (s && RX_MATCH_COPIED(prog))
3f7c398e 570 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
571 if (data)
572 *data->scream_olds = s;
573 }
f33976b4 574 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
575 s = fbm_instr((U8*)(s + start_shift),
576 (U8*)(strend - end_shift),
7fba1cd6 577 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 578 else
1aa99e6b
IH
579 s = fbm_instr(HOP3(s, start_shift, strend),
580 HOP3(strend, -end_shift, strbeg),
7fba1cd6 581 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
582
583 /* Update the count-of-usability, remove useless subpatterns,
584 unshift s. */
2c2d71f5 585
a0288114 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 587 (s ? "Found" : "Did not find"),
33b8afdf 588 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 589 PL_colors[0],
7b0972df 590 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 591 SvPVX_const(check),
2c2d71f5
JH
592 PL_colors[1], (SvTAIL(check) ? "$" : ""),
593 (s ? " at offset " : "...\n") ) );
594
595 if (!s)
596 goto fail_finish;
597
6eb5f6b9
JH
598 check_at = s;
599
2c2d71f5 600 /* Finish the diagnostic message */
a3621e74 601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
602
603 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
604 Start with the other substr.
605 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 606 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
607 *always* match. Probably should be marked during compile...
608 Probably it is right to do no SCREAM here...
609 */
610
33b8afdf 611 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 612 /* Take into account the "other" substring. */
2c2d71f5
JH
613 /* XXXX May be hopelessly wrong for UTF... */
614 if (!other_last)
6eb5f6b9 615 other_last = strpos;
33b8afdf 616 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
617 do_other_anchored:
618 {
1aa99e6b 619 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 620 char *s1 = s;
33b8afdf 621 SV* must;
2c2d71f5 622
2c2d71f5
JH
623 t = s - prog->check_offset_max;
624 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 625 && (!do_utf8
1aa99e6b 626 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 627 && t > strpos)))
30944b6d 628 /* EMPTY */;
2c2d71f5
JH
629 else
630 t = strpos;
1aa99e6b 631 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
632 if (t < other_last) /* These positions already checked */
633 t = other_last;
1aa99e6b 634 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
635 if (last < last1)
636 last1 = last;
637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
638 /* On end-of-str: see comment below. */
33b8afdf
JH
639 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
640 if (must == &PL_sv_undef) {
641 s = (char*)NULL;
a3621e74 642 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
643 }
644 else
645 s = fbm_instr(
646 (unsigned char*)t,
647 HOP3(HOP3(last1, prog->anchored_offset, strend)
648 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
649 must,
7fba1cd6 650 multiline ? FBMrf_MULTILINE : 0
33b8afdf 651 );
a3621e74 652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 653 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
654 (s ? "Found" : "Contradicts"),
655 PL_colors[0],
33b8afdf
JH
656 (int)(SvCUR(must)
657 - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf 659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
660 if (!s) {
661 if (last1 >= last2) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
663 ", giving up...\n"));
664 goto fail_finish;
665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 667 ", trying floating at offset %ld...\n",
1aa99e6b
IH
668 (long)(HOP3c(s1, 1, strend) - i_strpos)));
669 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
670 s = HOP3c(last, 1, strend);
2c2d71f5
JH
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 675 (long)(s - i_strpos)));
1aa99e6b
IH
676 t = HOP3c(s, -prog->anchored_offset, strbeg);
677 other_last = HOP3c(s, 1, strend);
30944b6d 678 s = s1;
2c2d71f5
JH
679 if (t == strpos)
680 goto try_at_start;
2c2d71f5
JH
681 goto try_at_offset;
682 }
30944b6d 683 }
2c2d71f5
JH
684 }
685 else { /* Take into account the floating substring. */
33b8afdf
JH
686 char *last, *last1;
687 char *s1 = s;
688 SV* must;
689
690 t = HOP3c(s, -start_shift, strbeg);
691 last1 = last =
692 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
693 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
694 last = HOP3c(t, prog->float_max_offset, strend);
695 s = HOP3c(t, prog->float_min_offset, strend);
696 if (s < other_last)
697 s = other_last;
2c2d71f5 698 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
699 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
700 /* fbm_instr() takes into account exact value of end-of-str
701 if the check is SvTAIL(ed). Since false positives are OK,
702 and end-of-str is not later than strend we are OK. */
703 if (must == &PL_sv_undef) {
704 s = (char*)NULL;
a3621e74 705 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
706 }
707 else
2c2d71f5 708 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
709 (unsigned char*)last + SvCUR(must)
710 - (SvTAIL(must)!=0),
7fba1cd6 711 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
713 (s ? "Found" : "Contradicts"),
714 PL_colors[0],
715 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 716 SvPVX_const(must),
33b8afdf
JH
717 PL_colors[1], (SvTAIL(must) ? "$" : "")));
718 if (!s) {
719 if (last1 == last) {
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
721 ", giving up...\n"));
722 goto fail_finish;
2c2d71f5 723 }
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
725 ", trying anchored starting at offset %ld...\n",
726 (long)(s1 + 1 - i_strpos)));
727 other_last = last;
728 s = HOP3c(t, 1, strend);
729 goto restart;
730 }
731 else {
a3621e74 732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
733 (long)(s - i_strpos)));
734 other_last = s; /* Fix this later. --Hugo */
735 s = s1;
736 if (t == strpos)
737 goto try_at_start;
738 goto try_at_offset;
739 }
2c2d71f5 740 }
cad2e5aa 741 }
2c2d71f5
JH
742
743 t = s - prog->check_offset_max;
2c2d71f5 744 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 745 && (!do_utf8
1aa99e6b
IH
746 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
747 && t > strpos))) {
2c2d71f5
JH
748 /* Fixed substring is found far enough so that the match
749 cannot start at strpos. */
750 try_at_offset:
cad2e5aa 751 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
752 /* Eventually fbm_*() should handle this, but often
753 anchored_offset is not 0, so this check will not be wasted. */
754 /* XXXX In the code below we prefer to look for "^" even in
755 presence of anchored substrings. And we search even
756 beyond the found float position. These pessimizations
757 are historical artefacts only. */
758 find_anchor:
2c2d71f5 759 while (t < strend - prog->minlen) {
cad2e5aa 760 if (*t == '\n') {
4ee3650e 761 if (t < check_at - prog->check_offset_min) {
33b8afdf 762 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
763 /* Since we moved from the found position,
764 we definitely contradict the found anchored
30944b6d
IZ
765 substr. Due to the above check we do not
766 contradict "check" substr.
767 Thus we can arrive here only if check substr
768 is float. Redo checking for "other"=="fixed".
769 */
9041c2e3 770 strpos = t + 1;
a3621e74 771 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 772 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
773 goto do_other_anchored;
774 }
4ee3650e
GS
775 /* We don't contradict the found floating substring. */
776 /* XXXX Why not check for STCLASS? */
cad2e5aa 777 s = t + 1;
a3621e74 778 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 779 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
780 goto set_useful;
781 }
4ee3650e
GS
782 /* Position contradicts check-string */
783 /* XXXX probably better to look for check-string
784 than for "\n", so one should lower the limit for t? */
a3621e74 785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 786 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 787 other_last = strpos = s = t + 1;
cad2e5aa
JH
788 goto restart;
789 }
790 t++;
791 }
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 793 PL_colors[0], PL_colors[1]));
2c2d71f5 794 goto fail_finish;
cad2e5aa 795 }
f5952150 796 else {
a3621e74 797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 798 PL_colors[0], PL_colors[1]));
f5952150 799 }
cad2e5aa
JH
800 s = t;
801 set_useful:
33b8afdf 802 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
803 }
804 else {
f5952150 805 /* The found string does not prohibit matching at strpos,
2c2d71f5 806 - no optimization of calling REx engine can be performed,
f5952150
GS
807 unless it was an MBOL and we are not after MBOL,
808 or a future STCLASS check will fail this. */
2c2d71f5
JH
809 try_at_start:
810 /* Even in this situation we may use MBOL flag if strpos is offset
811 wrt the start of the string. */
05b4157f 812 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 813 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
814 /* May be due to an implicit anchor of m{.*foo} */
815 && !(prog->reganch & ROPT_IMPLICIT))
816 {
cad2e5aa
JH
817 t = strpos;
818 goto find_anchor;
819 }
a3621e74 820 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 821 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 822 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 823 );
2c2d71f5 824 success_at_start:
30944b6d 825 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
826 && (do_utf8 ? (
827 prog->check_utf8 /* Could be deleted already */
828 && --BmUSEFUL(prog->check_utf8) < 0
829 && (prog->check_utf8 == prog->float_utf8)
830 ) : (
831 prog->check_substr /* Could be deleted already */
832 && --BmUSEFUL(prog->check_substr) < 0
833 && (prog->check_substr == prog->float_substr)
834 )))
66e933ab 835 {
cad2e5aa 836 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
838 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
839 if (do_utf8 ? prog->check_substr : prog->check_utf8)
840 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
841 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
842 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 843 check = Nullsv; /* abort */
cad2e5aa 844 s = strpos;
3cf5c195
IZ
845 /* XXXX This is a remnant of the old implementation. It
846 looks wasteful, since now INTUIT can use many
6eb5f6b9 847 other heuristics. */
cad2e5aa
JH
848 prog->reganch &= ~RE_USE_INTUIT;
849 }
850 else
851 s = strpos;
852 }
853
6eb5f6b9
JH
854 /* Last resort... */
855 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
856 if (prog->regstclass) {
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
06b5626a
AL
865 const U8* str = (U8*)STRING(prog->regstclass);
866 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 868 : 1);
a3b680e6 869 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 870 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 871 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
872 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
873 cl_l, strend)
874 : strend);
6eb5f6b9
JH
875
876 t = s;
9926ca43 877 cache_re(prog);
06b5626a 878 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
879 if (!s) {
880#ifdef DEBUGGING
e1ec3a88 881 const char *what = 0;
6eb5f6b9
JH
882#endif
883 if (endpos == strend) {
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
885 "Could not match STCLASS...\n") );
886 goto fail;
887 }
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 889 "This position contradicts STCLASS...\n") );
653099ff
GS
890 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
891 goto fail;
6eb5f6b9 892 /* Contradict one of substrings */
33b8afdf
JH
893 if (prog->anchored_substr || prog->anchored_utf8) {
894 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 895 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 896 hop_and_restart:
1aa99e6b 897 s = HOP3c(t, 1, strend);
66e933ab
GS
898 if (s + start_shift + end_shift > strend) {
899 /* XXXX Should be taken into account earlier? */
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
901 "Could not match STCLASS...\n") );
902 goto fail;
903 }
5e39e1e5
HS
904 if (!check)
905 goto giveup;
a3621e74 906 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 907 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
908 what, (long)(s + start_shift - i_strpos)) );
909 goto restart;
910 }
66e933ab 911 /* Have both, check_string is floating */
6eb5f6b9
JH
912 if (t + start_shift >= check_at) /* Contradicts floating=check */
913 goto retry_floating_check;
914 /* Recheck anchored substring, but not floating... */
9041c2e3 915 s = check_at;
5e39e1e5
HS
916 if (!check)
917 goto giveup;
a3621e74 918 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 919 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
920 (long)(other_last - i_strpos)) );
921 goto do_other_anchored;
922 }
60e71179
GS
923 /* Another way we could have checked stclass at the
924 current position only: */
925 if (ml_anch) {
926 s = t = t + 1;
5e39e1e5
HS
927 if (!check)
928 goto giveup;
a3621e74 929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 930 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 931 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 932 goto try_at_offset;
66e933ab 933 }
33b8afdf 934 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 935 goto fail;
6eb5f6b9
JH
936 /* Check is floating subtring. */
937 retry_floating_check:
938 t = check_at - start_shift;
a3621e74 939 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
940 goto hop_and_restart;
941 }
b7953727 942 if (t != s) {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 944 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
945 (long)(t - i_strpos), (long)(s - i_strpos))
946 );
947 }
948 else {
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
950 "Does not contradict STCLASS...\n");
951 );
952 }
6eb5f6b9 953 }
5e39e1e5 954 giveup:
a3621e74 955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
956 PL_colors[4], (check ? "Guessed" : "Giving up"),
957 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 958 return s;
2c2d71f5
JH
959
960 fail_finish: /* Substring not found */
33b8afdf
JH
961 if (prog->check_substr || prog->check_utf8) /* could be removed already */
962 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 963 fail:
a3621e74 964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 965 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
966 return Nullch;
967}
9661b544 968
6eb5f6b9 969/* We know what class REx starts with. Try to find this position... */
3c3eec57 970STATIC char *
a3b680e6 971S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 972{
27da23d5 973 dVAR;
1df70142 974 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 975 char *m;
d8093b23 976 STRLEN ln;
5dab1207 977 STRLEN lnc;
078c425b 978 register STRLEN uskip;
d8093b23 979 unsigned int c1;
980 unsigned int c2;
6eb5f6b9
JH
981 char *e;
982 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 983 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 984
6eb5f6b9
JH
985 /* We know what class it must start with. */
986 switch (OP(c)) {
6eb5f6b9 987 case ANYOF:
388cc4de 988 if (do_utf8) {
078c425b 989 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
990 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
991 !UTF8_IS_INVARIANT((U8)s[0]) ?
992 reginclass(c, (U8*)s, 0, do_utf8) :
993 REGINCLASS(c, (U8*)s)) {
994 if (tmp && (norun || regtry(prog, s)))
995 goto got_it;
996 else
997 tmp = doevery;
998 }
999 else
1000 tmp = 1;
078c425b 1001 s += uskip;
388cc4de
HS
1002 }
1003 }
1004 else {
1005 while (s < strend) {
1006 STRLEN skip = 1;
1007
1008 if (REGINCLASS(c, (U8*)s) ||
1009 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1010 /* The assignment of 2 is intentional:
1011 * for the folded sharp s, the skip is 2. */
1012 (skip = SHARP_S_SKIP))) {
1013 if (tmp && (norun || regtry(prog, s)))
1014 goto got_it;
1015 else
1016 tmp = doevery;
1017 }
1018 else
1019 tmp = 1;
1020 s += skip;
1021 }
a0d0e21e 1022 }
6eb5f6b9 1023 break;
f33976b4
DB
1024 case CANY:
1025 while (s < strend) {
1026 if (tmp && (norun || regtry(prog, s)))
1027 goto got_it;
1028 else
1029 tmp = doevery;
1030 s++;
1031 }
1032 break;
6eb5f6b9 1033 case EXACTF:
5dab1207
NIS
1034 m = STRING(c);
1035 ln = STR_LEN(c); /* length to match in octets/bytes */
1036 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1037 if (UTF) {
a2a2844f 1038 STRLEN ulen1, ulen2;
5dab1207 1039 U8 *sm = (U8 *) m;
89ebb4a3
JH
1040 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1041 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1042
1043 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1044 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1045
89ebb4a3 1046 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1047 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1048 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1049 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1050 lnc = 0;
1051 while (sm < ((U8 *) m + ln)) {
1052 lnc++;
1053 sm += UTF8SKIP(sm);
1054 }
1aa99e6b
IH
1055 }
1056 else {
1057 c1 = *(U8*)m;
1058 c2 = PL_fold[c1];
1059 }
6eb5f6b9
JH
1060 goto do_exactf;
1061 case EXACTFL:
5dab1207
NIS
1062 m = STRING(c);
1063 ln = STR_LEN(c);
1064 lnc = (I32) ln;
d8093b23 1065 c1 = *(U8*)m;
6eb5f6b9
JH
1066 c2 = PL_fold_locale[c1];
1067 do_exactf:
db12adc6 1068 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1069
6eb5f6b9
JH
1070 if (norun && e < s)
1071 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1072
60a8b682
JH
1073 /* The idea in the EXACTF* cases is to first find the
1074 * first character of the EXACTF* node and then, if
1075 * necessary, case-insensitively compare the full
1076 * text of the node. The c1 and c2 are the first
1077 * characters (though in Unicode it gets a bit
1078 * more complicated because there are more cases
7f16dd3d
JH
1079 * than just upper and lower: one needs to use
1080 * the so-called folding case for case-insensitive
1081 * matching (called "loose matching" in Unicode).
1082 * ibcmp_utf8() will do just that. */
60a8b682 1083
1aa99e6b 1084 if (do_utf8) {
575cac57 1085 UV c, f;
89ebb4a3 1086 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1087 STRLEN len, foldlen;
d7f013c8 1088
09091399 1089 if (c1 == c2) {
5dab1207
NIS
1090 /* Upper and lower of 1st char are equal -
1091 * probably not a "letter". */
1aa99e6b 1092 while (s <= e) {
89ebb4a3 1093 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1094 ckWARN(WARN_UTF8) ?
1095 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1096 if ( c == c1
1097 && (ln == len ||
66423254 1098 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1099 m, (char **)0, ln, (bool)UTF))
55da9344 1100 && (norun || regtry(prog, s)) )
1aa99e6b 1101 goto got_it;
80aecb99 1102 else {
1df70142 1103 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1104 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1105 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1106 if ( f != c
1107 && (f == c1 || f == c2)
1108 && (ln == foldlen ||
66423254
JH
1109 !ibcmp_utf8((char *) foldbuf,
1110 (char **)0, foldlen, do_utf8,
d07ddd77 1111 m,
eb160463 1112 (char **)0, ln, (bool)UTF))
80aecb99
JH
1113 && (norun || regtry(prog, s)) )
1114 goto got_it;
1115 }
1aa99e6b
IH
1116 s += len;
1117 }
09091399
JH
1118 }
1119 else {
1aa99e6b 1120 while (s <= e) {
89ebb4a3 1121 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1122 ckWARN(WARN_UTF8) ?
1123 0 : UTF8_ALLOW_ANY);
80aecb99 1124
60a8b682 1125 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1126 * Note that not all the possible combinations
1127 * are handled here: some of them are handled
1128 * by the standard folding rules, and some of
1129 * them (the character class or ANYOF cases)
1130 * are handled during compiletime in
1131 * regexec.c:S_regclass(). */
880bd946
JH
1132 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1133 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1134 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1135
1136 if ( (c == c1 || c == c2)
1137 && (ln == len ||
66423254 1138 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1139 m, (char **)0, ln, (bool)UTF))
55da9344 1140 && (norun || regtry(prog, s)) )
1aa99e6b 1141 goto got_it;
80aecb99 1142 else {
1df70142 1143 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1144 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1145 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1146 if ( f != c
1147 && (f == c1 || f == c2)
1148 && (ln == foldlen ||
a6872d42 1149 !ibcmp_utf8((char *) foldbuf,
66423254 1150 (char **)0, foldlen, do_utf8,
d07ddd77 1151 m,
eb160463 1152 (char **)0, ln, (bool)UTF))
80aecb99
JH
1153 && (norun || regtry(prog, s)) )
1154 goto got_it;
1155 }
1aa99e6b
IH
1156 s += len;
1157 }
09091399 1158 }
1aa99e6b
IH
1159 }
1160 else {
1161 if (c1 == c2)
1162 while (s <= e) {
1163 if ( *(U8*)s == c1
1164 && (ln == 1 || !(OP(c) == EXACTF
1165 ? ibcmp(s, m, ln)
1166 : ibcmp_locale(s, m, ln)))
1167 && (norun || regtry(prog, s)) )
1168 goto got_it;
1169 s++;
1170 }
1171 else
1172 while (s <= e) {
1173 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1174 && (ln == 1 || !(OP(c) == EXACTF
1175 ? ibcmp(s, m, ln)
1176 : ibcmp_locale(s, m, ln)))
1177 && (norun || regtry(prog, s)) )
1178 goto got_it;
1179 s++;
1180 }
b3c9acc1
IZ
1181 }
1182 break;
bbce6d69 1183 case BOUNDL:
3280af22 1184 PL_reg_flags |= RF_tainted;
bbce6d69 1185 /* FALL THROUGH */
a0d0e21e 1186 case BOUND:
ffc61ed2 1187 if (do_utf8) {
12d33761 1188 if (s == PL_bostr)
ffc61ed2
JH
1189 tmp = '\n';
1190 else {
b4f7163a 1191 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1192
b4f7163a 1193 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1194 }
1195 tmp = ((OP(c) == BOUND ?
9041c2e3 1196 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1197 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1198 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1199 if (tmp == !(OP(c) == BOUND ?
3568d838 1200 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1201 isALNUM_LC_utf8((U8*)s)))
1202 {
1203 tmp = !tmp;
1204 if ((norun || regtry(prog, s)))
1205 goto got_it;
1206 }
078c425b 1207 s += uskip;
a687059c 1208 }
a0d0e21e 1209 }
667bb95a 1210 else {
12d33761 1211 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1212 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1213 while (s < strend) {
1214 if (tmp ==
1215 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1216 tmp = !tmp;
1217 if ((norun || regtry(prog, s)))
1218 goto got_it;
1219 }
1220 s++;
a0ed51b3 1221 }
a0ed51b3 1222 }
6eb5f6b9 1223 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1224 goto got_it;
1225 break;
bbce6d69 1226 case NBOUNDL:
3280af22 1227 PL_reg_flags |= RF_tainted;
bbce6d69 1228 /* FALL THROUGH */
a0d0e21e 1229 case NBOUND:
ffc61ed2 1230 if (do_utf8) {
12d33761 1231 if (s == PL_bostr)
ffc61ed2
JH
1232 tmp = '\n';
1233 else {
b4f7163a 1234 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1235
b4f7163a 1236 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1237 }
1238 tmp = ((OP(c) == NBOUND ?
9041c2e3 1239 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1240 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1241 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1242 if (tmp == !(OP(c) == NBOUND ?
3568d838 1243 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1244 isALNUM_LC_utf8((U8*)s)))
1245 tmp = !tmp;
1246 else if ((norun || regtry(prog, s)))
1247 goto got_it;
078c425b 1248 s += uskip;
ffc61ed2 1249 }
a0d0e21e 1250 }
667bb95a 1251 else {
12d33761 1252 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1253 tmp = ((OP(c) == NBOUND ?
1254 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1255 while (s < strend) {
1256 if (tmp ==
1257 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1258 tmp = !tmp;
1259 else if ((norun || regtry(prog, s)))
1260 goto got_it;
1261 s++;
1262 }
a0ed51b3 1263 }
6eb5f6b9 1264 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1265 goto got_it;
1266 break;
a0d0e21e 1267 case ALNUM:
ffc61ed2 1268 if (do_utf8) {
1a4fad37 1269 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1270 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1271 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1272 if (tmp && (norun || regtry(prog, s)))
1273 goto got_it;
1274 else
1275 tmp = doevery;
1276 }
bbce6d69 1277 else
ffc61ed2 1278 tmp = 1;
078c425b 1279 s += uskip;
bbce6d69 1280 }
bbce6d69 1281 }
ffc61ed2
JH
1282 else {
1283 while (s < strend) {
1284 if (isALNUM(*s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1286 goto got_it;
1287 else
1288 tmp = doevery;
1289 }
a0ed51b3 1290 else
ffc61ed2
JH
1291 tmp = 1;
1292 s++;
a0ed51b3 1293 }
a0ed51b3
LW
1294 }
1295 break;
bbce6d69 1296 case ALNUML:
3280af22 1297 PL_reg_flags |= RF_tainted;
ffc61ed2 1298 if (do_utf8) {
078c425b 1299 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1300 if (isALNUM_LC_utf8((U8*)s)) {
1301 if (tmp && (norun || regtry(prog, s)))
1302 goto got_it;
1303 else
1304 tmp = doevery;
1305 }
a687059c 1306 else
ffc61ed2 1307 tmp = 1;
078c425b 1308 s += uskip;
a0d0e21e 1309 }
a0d0e21e 1310 }
ffc61ed2
JH
1311 else {
1312 while (s < strend) {
1313 if (isALNUM_LC(*s)) {
1314 if (tmp && (norun || regtry(prog, s)))
1315 goto got_it;
1316 else
1317 tmp = doevery;
1318 }
a0ed51b3 1319 else
ffc61ed2
JH
1320 tmp = 1;
1321 s++;
a0ed51b3 1322 }
a0ed51b3
LW
1323 }
1324 break;
a0d0e21e 1325 case NALNUM:
ffc61ed2 1326 if (do_utf8) {
1a4fad37 1327 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1328 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1329 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1330 if (tmp && (norun || regtry(prog, s)))
1331 goto got_it;
1332 else
1333 tmp = doevery;
1334 }
bbce6d69 1335 else
ffc61ed2 1336 tmp = 1;
078c425b 1337 s += uskip;
bbce6d69 1338 }
bbce6d69 1339 }
ffc61ed2
JH
1340 else {
1341 while (s < strend) {
1342 if (!isALNUM(*s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1344 goto got_it;
1345 else
1346 tmp = doevery;
1347 }
a0ed51b3 1348 else
ffc61ed2
JH
1349 tmp = 1;
1350 s++;
a0ed51b3 1351 }
a0ed51b3
LW
1352 }
1353 break;
bbce6d69 1354 case NALNUML:
3280af22 1355 PL_reg_flags |= RF_tainted;
ffc61ed2 1356 if (do_utf8) {
078c425b 1357 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1358 if (!isALNUM_LC_utf8((U8*)s)) {
1359 if (tmp && (norun || regtry(prog, s)))
1360 goto got_it;
1361 else
1362 tmp = doevery;
1363 }
a687059c 1364 else
ffc61ed2 1365 tmp = 1;
078c425b 1366 s += uskip;
a687059c 1367 }
a0d0e21e 1368 }
ffc61ed2
JH
1369 else {
1370 while (s < strend) {
1371 if (!isALNUM_LC(*s)) {
1372 if (tmp && (norun || regtry(prog, s)))
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
a0ed51b3 1377 else
ffc61ed2
JH
1378 tmp = 1;
1379 s++;
a0ed51b3 1380 }
a0ed51b3
LW
1381 }
1382 break;
a0d0e21e 1383 case SPACE:
ffc61ed2 1384 if (do_utf8) {
1a4fad37 1385 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1386 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1387 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1388 if (tmp && (norun || regtry(prog, s)))
1389 goto got_it;
1390 else
1391 tmp = doevery;
1392 }
a0d0e21e 1393 else
ffc61ed2 1394 tmp = 1;
078c425b 1395 s += uskip;
2304df62 1396 }
a0d0e21e 1397 }
ffc61ed2
JH
1398 else {
1399 while (s < strend) {
1400 if (isSPACE(*s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1402 goto got_it;
1403 else
1404 tmp = doevery;
1405 }
a0ed51b3 1406 else
ffc61ed2
JH
1407 tmp = 1;
1408 s++;
a0ed51b3 1409 }
a0ed51b3
LW
1410 }
1411 break;
bbce6d69 1412 case SPACEL:
3280af22 1413 PL_reg_flags |= RF_tainted;
ffc61ed2 1414 if (do_utf8) {
078c425b 1415 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1416 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1417 if (tmp && (norun || regtry(prog, s)))
1418 goto got_it;
1419 else
1420 tmp = doevery;
1421 }
bbce6d69 1422 else
ffc61ed2 1423 tmp = 1;
078c425b 1424 s += uskip;
bbce6d69 1425 }
bbce6d69 1426 }
ffc61ed2
JH
1427 else {
1428 while (s < strend) {
1429 if (isSPACE_LC(*s)) {
1430 if (tmp && (norun || regtry(prog, s)))
1431 goto got_it;
1432 else
1433 tmp = doevery;
1434 }
a0ed51b3 1435 else
ffc61ed2
JH
1436 tmp = 1;
1437 s++;
a0ed51b3 1438 }
a0ed51b3
LW
1439 }
1440 break;
a0d0e21e 1441 case NSPACE:
ffc61ed2 1442 if (do_utf8) {
1a4fad37 1443 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1444 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1445 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1446 if (tmp && (norun || regtry(prog, s)))
1447 goto got_it;
1448 else
1449 tmp = doevery;
1450 }
a0d0e21e 1451 else
ffc61ed2 1452 tmp = 1;
078c425b 1453 s += uskip;
a687059c 1454 }
a0d0e21e 1455 }
ffc61ed2
JH
1456 else {
1457 while (s < strend) {
1458 if (!isSPACE(*s)) {
1459 if (tmp && (norun || regtry(prog, s)))
1460 goto got_it;
1461 else
1462 tmp = doevery;
1463 }
a0ed51b3 1464 else
ffc61ed2
JH
1465 tmp = 1;
1466 s++;
a0ed51b3 1467 }
a0ed51b3
LW
1468 }
1469 break;
bbce6d69 1470 case NSPACEL:
3280af22 1471 PL_reg_flags |= RF_tainted;
ffc61ed2 1472 if (do_utf8) {
078c425b 1473 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1474 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1475 if (tmp && (norun || regtry(prog, s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 }
bbce6d69 1480 else
ffc61ed2 1481 tmp = 1;
078c425b 1482 s += uskip;
bbce6d69 1483 }
bbce6d69 1484 }
ffc61ed2
JH
1485 else {
1486 while (s < strend) {
1487 if (!isSPACE_LC(*s)) {
1488 if (tmp && (norun || regtry(prog, s)))
1489 goto got_it;
1490 else
1491 tmp = doevery;
1492 }
a0ed51b3 1493 else
ffc61ed2
JH
1494 tmp = 1;
1495 s++;
a0ed51b3 1496 }
a0ed51b3
LW
1497 }
1498 break;
a0d0e21e 1499 case DIGIT:
ffc61ed2 1500 if (do_utf8) {
1a4fad37 1501 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1502 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1503 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1504 if (tmp && (norun || regtry(prog, s)))
1505 goto got_it;
1506 else
1507 tmp = doevery;
1508 }
a0d0e21e 1509 else
ffc61ed2 1510 tmp = 1;
078c425b 1511 s += uskip;
2b69d0c2 1512 }
a0d0e21e 1513 }
ffc61ed2
JH
1514 else {
1515 while (s < strend) {
1516 if (isDIGIT(*s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1518 goto got_it;
1519 else
1520 tmp = doevery;
1521 }
a0ed51b3 1522 else
ffc61ed2
JH
1523 tmp = 1;
1524 s++;
a0ed51b3 1525 }
a0ed51b3
LW
1526 }
1527 break;
b8c5462f
JH
1528 case DIGITL:
1529 PL_reg_flags |= RF_tainted;
ffc61ed2 1530 if (do_utf8) {
078c425b 1531 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1532 if (isDIGIT_LC_utf8((U8*)s)) {
1533 if (tmp && (norun || regtry(prog, s)))
1534 goto got_it;
1535 else
1536 tmp = doevery;
1537 }
b8c5462f 1538 else
ffc61ed2 1539 tmp = 1;
078c425b 1540 s += uskip;
b8c5462f 1541 }
b8c5462f 1542 }
ffc61ed2
JH
1543 else {
1544 while (s < strend) {
1545 if (isDIGIT_LC(*s)) {
1546 if (tmp && (norun || regtry(prog, s)))
1547 goto got_it;
1548 else
1549 tmp = doevery;
1550 }
b8c5462f 1551 else
ffc61ed2
JH
1552 tmp = 1;
1553 s++;
b8c5462f 1554 }
b8c5462f
JH
1555 }
1556 break;
a0d0e21e 1557 case NDIGIT:
ffc61ed2 1558 if (do_utf8) {
1a4fad37 1559 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1560 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1561 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1562 if (tmp && (norun || regtry(prog, s)))
1563 goto got_it;
1564 else
1565 tmp = doevery;
1566 }
a0d0e21e 1567 else
ffc61ed2 1568 tmp = 1;
078c425b 1569 s += uskip;
a687059c 1570 }
a0d0e21e 1571 }
ffc61ed2
JH
1572 else {
1573 while (s < strend) {
1574 if (!isDIGIT(*s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1576 goto got_it;
1577 else
1578 tmp = doevery;
1579 }
a0ed51b3 1580 else
ffc61ed2
JH
1581 tmp = 1;
1582 s++;
a0ed51b3 1583 }
a0ed51b3
LW
1584 }
1585 break;
b8c5462f
JH
1586 case NDIGITL:
1587 PL_reg_flags |= RF_tainted;
ffc61ed2 1588 if (do_utf8) {
078c425b 1589 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1590 if (!isDIGIT_LC_utf8((U8*)s)) {
1591 if (tmp && (norun || regtry(prog, s)))
1592 goto got_it;
1593 else
1594 tmp = doevery;
1595 }
b8c5462f 1596 else
ffc61ed2 1597 tmp = 1;
078c425b 1598 s += uskip;
b8c5462f 1599 }
a0ed51b3 1600 }
ffc61ed2
JH
1601 else {
1602 while (s < strend) {
1603 if (!isDIGIT_LC(*s)) {
1604 if (tmp && (norun || regtry(prog, s)))
1605 goto got_it;
1606 else
1607 tmp = doevery;
1608 }
cf93c79d 1609 else
ffc61ed2
JH
1610 tmp = 1;
1611 s++;
b8c5462f 1612 }
b8c5462f
JH
1613 }
1614 break;
b3c9acc1 1615 default:
3c3eec57
GS
1616 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1617 break;
d6a28714 1618 }
6eb5f6b9
JH
1619 return 0;
1620 got_it:
1621 return s;
1622}
1623
1624/*
1625 - regexec_flags - match a regexp against a string
1626 */
1627I32
1628Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1629 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1630/* strend: pointer to null at end of string */
1631/* strbeg: real beginning of string */
1632/* minend: end of match must be >=minend after stringarg. */
1633/* data: May be used for some additional optimizations. */
1634/* nosave: For optimizations. */
1635{
6eb5f6b9
JH
1636 register char *s;
1637 register regnode *c;
1638 register char *startpos = stringarg;
6eb5f6b9
JH
1639 I32 minlen; /* must match at least this many chars */
1640 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1641 I32 end_shift = 0; /* Same for the end. */ /* CC */
1642 I32 scream_pos = -1; /* Internal iterator of scream. */
1643 char *scream_olds;
1644 SV* oreplsv = GvSV(PL_replgv);
1df70142 1645 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1646 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1647#ifdef DEBUGGING
9e55ce06
JH
1648 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1649 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1650#endif
a3621e74
YO
1651
1652 GET_RE_DEBUG_FLAGS_DECL;
1653
1df70142 1654 (void)data; /* Currently unused */
a30b2f1f 1655 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1656
1657 PL_regcc = 0;
1658
1659 cache_re(prog);
1660#ifdef DEBUGGING
aea4f609 1661 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1662#endif
1663
1664 /* Be paranoid... */
1665 if (prog == NULL || startpos == NULL) {
1666 Perl_croak(aTHX_ "NULL regexp parameter");
1667 return 0;
1668 }
1669
1670 minlen = prog->minlen;
61a36c01 1671 if (strend - startpos < minlen) {
a3621e74 1672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1673 "String too short [regexec_flags]...\n"));
1674 goto phooey;
1aa99e6b 1675 }
6eb5f6b9 1676
6eb5f6b9
JH
1677 /* Check validity of program. */
1678 if (UCHARAT(prog->program) != REG_MAGIC) {
1679 Perl_croak(aTHX_ "corrupted regexp program");
1680 }
1681
1682 PL_reg_flags = 0;
1683 PL_reg_eval_set = 0;
1684 PL_reg_maxiter = 0;
1685
1686 if (prog->reganch & ROPT_UTF8)
1687 PL_reg_flags |= RF_utf8;
1688
1689 /* Mark beginning of line for ^ and lookbehind. */
1690 PL_regbol = startpos;
1691 PL_bostr = strbeg;
1692 PL_reg_sv = sv;
1693
1694 /* Mark end of line for $ (and such) */
1695 PL_regeol = strend;
1696
1697 /* see how far we have to get to not match where we matched before */
1698 PL_regtill = startpos+minend;
1699
1700 /* We start without call_cc context. */
1701 PL_reg_call_cc = 0;
1702
1703 /* If there is a "must appear" string, look for it. */
1704 s = startpos;
1705
1706 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1707 MAGIC *mg;
1708
1709 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1710 PL_reg_ganch = startpos;
1711 else if (sv && SvTYPE(sv) >= SVt_PVMG
1712 && SvMAGIC(sv)
14befaf4
DM
1713 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1714 && mg->mg_len >= 0) {
6eb5f6b9
JH
1715 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1716 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1717 if (s > PL_reg_ganch)
6eb5f6b9
JH
1718 goto phooey;
1719 s = PL_reg_ganch;
1720 }
1721 }
1722 else /* pos() not defined */
1723 PL_reg_ganch = strbeg;
1724 }
1725
33b8afdf 1726 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1727 re_scream_pos_data d;
1728
1729 d.scream_olds = &scream_olds;
1730 d.scream_pos = &scream_pos;
1731 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1732 if (!s) {
a3621e74 1733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1734 goto phooey; /* not present */
3fa9c3d7 1735 }
6eb5f6b9
JH
1736 }
1737
a3621e74 1738 DEBUG_EXECUTE_r({
1df70142
AL
1739 const char * const s0 = UTF
1740 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1741 UNI_DISPLAY_REGEX)
1742 : prog->precomp;
1743 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1744 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1745 UNI_DISPLAY_REGEX) : startpos;
1df70142 1746 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1747 if (!PL_colorset)
1748 reginitcolors();
1749 PerlIO_printf(Perl_debug_log,
a0288114 1750 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1751 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1752 len0, len0, s0,
2a782b5b 1753 PL_colors[1],
9e55ce06 1754 len0 > 60 ? "..." : "",
2a782b5b 1755 PL_colors[0],
9e55ce06
JH
1756 (int)(len1 > 60 ? 60 : len1),
1757 s1, PL_colors[1],
1758 (len1 > 60 ? "..." : "")
2a782b5b
JH
1759 );
1760 });
6eb5f6b9
JH
1761
1762 /* Simplest case: anchored match need be tried only once. */
1763 /* [unless only anchor is BOL and multiline is set] */
1764 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1765 if (s == startpos && regtry(prog, startpos))
1766 goto got_it;
7fba1cd6 1767 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1768 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1769 {
1770 char *end;
1771
1772 if (minlen)
1773 dontbother = minlen - 1;
1aa99e6b 1774 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1775 /* for multiline we only have to try after newlines */
33b8afdf 1776 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1777 if (s == startpos)
1778 goto after_try;
1779 while (1) {
1780 if (regtry(prog, s))
1781 goto got_it;
1782 after_try:
1783 if (s >= end)
1784 goto phooey;
1785 if (prog->reganch & RE_USE_INTUIT) {
1786 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1787 if (!s)
1788 goto phooey;
1789 }
1790 else
1791 s++;
1792 }
1793 } else {
1794 if (s > startpos)
1795 s--;
1796 while (s < end) {
1797 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1798 if (regtry(prog, s))
1799 goto got_it;
1800 }
1801 }
1802 }
1803 }
1804 goto phooey;
1805 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1806 if (regtry(prog, PL_reg_ganch))
1807 goto got_it;
1808 goto phooey;
1809 }
1810
1811 /* Messy cases: unanchored match. */
33b8afdf 1812 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1813 /* we have /x+whatever/ */
1814 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1815 char ch;
bf93d4cc
GS
1816#ifdef DEBUGGING
1817 int did_match = 0;
1818#endif
33b8afdf
JH
1819 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1820 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1821 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1822
1aa99e6b 1823 if (do_utf8) {
6eb5f6b9
JH
1824 while (s < strend) {
1825 if (*s == ch) {
a3621e74 1826 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1827 if (regtry(prog, s)) goto got_it;
1828 s += UTF8SKIP(s);
1829 while (s < strend && *s == ch)
1830 s += UTF8SKIP(s);
1831 }
1832 s += UTF8SKIP(s);
1833 }
1834 }
1835 else {
1836 while (s < strend) {
1837 if (*s == ch) {
a3621e74 1838 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1839 if (regtry(prog, s)) goto got_it;
1840 s++;
1841 while (s < strend && *s == ch)
1842 s++;
1843 }
1844 s++;
1845 }
1846 }
a3621e74 1847 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1848 PerlIO_printf(Perl_debug_log,
b7953727
JH
1849 "Did not find anchored character...\n")
1850 );
6eb5f6b9
JH
1851 }
1852 /*SUPPRESS 560*/
33b8afdf
JH
1853 else if (prog->anchored_substr != Nullsv
1854 || prog->anchored_utf8 != Nullsv
1855 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1856 && prog->float_max_offset < strend - s)) {
1857 SV *must;
1858 I32 back_max;
1859 I32 back_min;
1860 char *last;
6eb5f6b9 1861 char *last1; /* Last position checked before */
bf93d4cc
GS
1862#ifdef DEBUGGING
1863 int did_match = 0;
1864#endif
33b8afdf
JH
1865 if (prog->anchored_substr || prog->anchored_utf8) {
1866 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1867 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1868 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1869 back_max = back_min = prog->anchored_offset;
1870 } else {
1871 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1872 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1873 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1874 back_max = prog->float_max_offset;
1875 back_min = prog->float_min_offset;
1876 }
1877 if (must == &PL_sv_undef)
1878 /* could not downgrade utf8 check substring, so must fail */
1879 goto phooey;
1880
1881 last = HOP3c(strend, /* Cannot start after this */
1882 -(I32)(CHR_SVLEN(must)
1883 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1884
1885 if (s > PL_bostr)
1886 last1 = HOPc(s, -1);
1887 else
1888 last1 = s - 1; /* bogus */
1889
a0288114 1890 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1891 check_substr==must. */
1892 scream_pos = -1;
1893 dontbother = end_shift;
1894 strend = HOPc(strend, -dontbother);
1895 while ( (s <= last) &&
9041c2e3 1896 ((flags & REXEC_SCREAM)
1aa99e6b 1897 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1898 end_shift, &scream_pos, 0))
1aa99e6b 1899 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1900 (unsigned char*)strend, must,
7fba1cd6 1901 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1902 /* we may be pointing at the wrong string */
1903 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1904 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1905 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1906 if (HOPc(s, -back_max) > last1) {
1907 last1 = HOPc(s, -back_min);
1908 s = HOPc(s, -back_max);
1909 }
1910 else {
1911 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1912
1913 last1 = HOPc(s, -back_min);
1914 s = t;
1915 }
1aa99e6b 1916 if (do_utf8) {
6eb5f6b9
JH
1917 while (s <= last1) {
1918 if (regtry(prog, s))
1919 goto got_it;
1920 s += UTF8SKIP(s);
1921 }
1922 }
1923 else {
1924 while (s <= last1) {
1925 if (regtry(prog, s))
1926 goto got_it;
1927 s++;
1928 }
1929 }
1930 }
a3621e74 1931 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1932 PerlIO_printf(Perl_debug_log,
a0288114 1933 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1934 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1935 ? "anchored" : "floating"),
1936 PL_colors[0],
1937 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1938 SvPVX_const(must),
b7953727
JH
1939 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1940 );
6eb5f6b9
JH
1941 goto phooey;
1942 }
155aba94 1943 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1944 if (minlen) {
1945 I32 op = (U8)OP(prog->regstclass);
66e933ab 1946 /* don't bother with what can't match */
f14c76ed
RGS
1947 if (PL_regkind[op] != EXACT && op != CANY)
1948 strend = HOPc(strend, -(minlen - 1));
1949 }
a3621e74 1950 DEBUG_EXECUTE_r({
ffc61ed2 1951 SV *prop = sv_newmortal();
9e55ce06
JH
1952 char *s0;
1953 char *s1;
1954 int len0;
1955 int len1;
1956
ffc61ed2 1957 regprop(prop, c);
9e55ce06 1958 s0 = UTF ?
3f7c398e 1959 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1960 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1961 SvPVX(prop);
1962 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1963 s1 = UTF ?
c728cb41 1964 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1965 len1 = UTF ? SvCUR(dsv1) : strend - s;
1966 PerlIO_printf(Perl_debug_log,
a0288114 1967 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1968 len0, len0, s0,
1969 len1, len1, s1);
ffc61ed2 1970 });
06b5626a 1971 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1972 goto got_it;
a3621e74 1973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1974 }
1975 else {
1976 dontbother = 0;
33b8afdf
JH
1977 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1978 /* Trim the end. */
d6a28714 1979 char *last;
33b8afdf
JH
1980 SV* float_real;
1981
1982 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1983 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1984 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1985
1986 if (flags & REXEC_SCREAM) {
33b8afdf 1987 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1988 end_shift, &scream_pos, 1); /* last one */
1989 if (!last)
ffc61ed2 1990 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1991 /* we may be pointing at the wrong string */
1992 else if (RX_MATCH_COPIED(prog))
3f7c398e 1993 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1994 }
d6a28714
JH
1995 else {
1996 STRLEN len;
06b5626a 1997 const char * const little = SvPV(float_real, len);
d6a28714 1998
33b8afdf 1999 if (SvTAIL(float_real)) {
d6a28714
JH
2000 if (memEQ(strend - len + 1, little, len - 1))
2001 last = strend - len + 1;
7fba1cd6 2002 else if (!multiline)
9041c2e3 2003 last = memEQ(strend - len, little, len)
d6a28714 2004 ? strend - len : Nullch;
b8c5462f 2005 else
d6a28714
JH
2006 goto find_last;
2007 } else {
2008 find_last:
9041c2e3 2009 if (len)
d6a28714 2010 last = rninstr(s, strend, little, little + len);
b8c5462f 2011 else
a0288114 2012 last = strend; /* matching "$" */
b8c5462f 2013 }
b8c5462f 2014 }
bf93d4cc 2015 if (last == NULL) {
a3621e74 2016 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2017 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2018 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2019 goto phooey; /* Should not happen! */
2020 }
d6a28714
JH
2021 dontbother = strend - last + prog->float_min_offset;
2022 }
2023 if (minlen && (dontbother < minlen))
2024 dontbother = minlen - 1;
2025 strend -= dontbother; /* this one's always in bytes! */
2026 /* We don't know much -- general case. */
1aa99e6b 2027 if (do_utf8) {
d6a28714
JH
2028 for (;;) {
2029 if (regtry(prog, s))
2030 goto got_it;
2031 if (s >= strend)
2032 break;
b8c5462f 2033 s += UTF8SKIP(s);
d6a28714
JH
2034 };
2035 }
2036 else {
2037 do {
2038 if (regtry(prog, s))
2039 goto got_it;
2040 } while (s++ < strend);
2041 }
2042 }
2043
2044 /* Failure. */
2045 goto phooey;
2046
2047got_it:
2048 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2049
2050 if (PL_reg_eval_set) {
2051 /* Preserve the current value of $^R */
2052 if (oreplsv != GvSV(PL_replgv))
2053 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2054 restored, the value remains
2055 the same. */
acfe0abc 2056 restore_pos(aTHX_ 0);
d6a28714
JH
2057 }
2058
2059 /* make sure $`, $&, $', and $digit will work later */
2060 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2061 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2062 if (flags & REXEC_COPY_STR) {
2063 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2064#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2065 if ((SvIsCOW(sv)
2066 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2067 if (DEBUG_C_TEST) {
2068 PerlIO_printf(Perl_debug_log,
2069 "Copy on write: regexp capture, type %d\n",
2070 (int) SvTYPE(sv));
2071 }
2072 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2073 prog->subbeg = SvPVX(prog->saved_copy);
2074 assert (SvPOKp(prog->saved_copy));
2075 } else
2076#endif
2077 {
2078 RX_MATCH_COPIED_on(prog);
2079 s = savepvn(strbeg, i);
2080 prog->subbeg = s;
2081 }
d6a28714 2082 prog->sublen = i;
d6a28714
JH
2083 }
2084 else {
2085 prog->subbeg = strbeg;
2086 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2087 }
2088 }
9041c2e3 2089
d6a28714
JH
2090 return 1;
2091
2092phooey:
a3621e74 2093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2094 PL_colors[4], PL_colors[5]));
d6a28714 2095 if (PL_reg_eval_set)
acfe0abc 2096 restore_pos(aTHX_ 0);
d6a28714
JH
2097 return 0;
2098}
2099
2100/*
2101 - regtry - try match at specific point
2102 */
2103STATIC I32 /* 0 failure, 1 success */
2104S_regtry(pTHX_ regexp *prog, char *startpos)
2105{
d6a28714
JH
2106 register I32 i;
2107 register I32 *sp;
2108 register I32 *ep;
2109 CHECKPOINT lastcp;
a3621e74 2110 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2111
02db2b7b
IZ
2112#ifdef DEBUGGING
2113 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2114#endif
d6a28714
JH
2115 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2116 MAGIC *mg;
2117
2118 PL_reg_eval_set = RS_init;
a3621e74 2119 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2120 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2121 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2122 ));
e8347627 2123 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2124 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2125 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2126 SAVETMPS;
2127 /* Apparently this is not needed, judging by wantarray. */
e8347627 2128 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2129 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2130
2131 if (PL_reg_sv) {
2132 /* Make $_ available to executed code. */
2133 if (PL_reg_sv != DEFSV) {
59f00321 2134 SAVE_DEFSV;
d6a28714 2135 DEFSV = PL_reg_sv;
b8c5462f 2136 }
d6a28714 2137
9041c2e3 2138 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2139 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2140 /* prepare for quick setting of pos */
14befaf4
DM
2141 sv_magic(PL_reg_sv, (SV*)0,
2142 PERL_MAGIC_regex_global, Nullch, 0);
2143 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2144 mg->mg_len = -1;
b8c5462f 2145 }
d6a28714
JH
2146 PL_reg_magic = mg;
2147 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2148 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2149 }
09687e5a 2150 if (!PL_reg_curpm) {
e4584336 2151 Newz(22, PL_reg_curpm, 1, PMOP);
09687e5a
AB
2152#ifdef USE_ITHREADS
2153 {
2154 SV* repointer = newSViv(0);
577e12cc 2155 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2156 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2157 av_push(PL_regex_padav,repointer);
2158 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2159 PL_regex_pad = AvARRAY(PL_regex_padav);
2160 }
2161#endif
2162 }
aaa362c4 2163 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2164 PL_reg_oldcurpm = PL_curpm;
2165 PL_curpm = PL_reg_curpm;
2166 if (RX_MATCH_COPIED(prog)) {
2167 /* Here is a serious problem: we cannot rewrite subbeg,
2168 since it may be needed if this match fails. Thus
2169 $` inside (?{}) could fail... */
2170 PL_reg_oldsaved = prog->subbeg;
2171 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2172#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2173 PL_nrs = prog->saved_copy;
2174#endif
d6a28714
JH
2175 RX_MATCH_COPIED_off(prog);
2176 }
2177 else
2178 PL_reg_oldsaved = Nullch;
2179 prog->subbeg = PL_bostr;
2180 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2181 }
2182 prog->startp[0] = startpos - PL_bostr;
2183 PL_reginput = startpos;
2184 PL_regstartp = prog->startp;
2185 PL_regendp = prog->endp;
2186 PL_reglastparen = &prog->lastparen;
a01268b5 2187 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2188 prog->lastparen = 0;
03994de8 2189 prog->lastcloseparen = 0;
d6a28714 2190 PL_regsize = 0;
a3621e74 2191 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2192 if (PL_reg_start_tmpl <= prog->nparens) {
2193 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2194 if(PL_reg_start_tmp)
2195 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2196 else
e4584336 2197 New(22, PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2198 }
2199
2200 /* XXXX What this code is doing here?!!! There should be no need
2201 to do this again and again, PL_reglastparen should take care of
3dd2943c 2202 this! --ilya*/
dafc8851
JH
2203
2204 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2205 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2206 * PL_reglastparen), is not needed at all by the test suite
2207 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2208 * enough, for building DynaLoader, or otherwise this
2209 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2210 * will happen. Meanwhile, this code *is* needed for the
2211 * above-mentioned test suite tests to succeed. The common theme
2212 * on those tests seems to be returning null fields from matches.
2213 * --jhi */
dafc8851 2214#if 1
d6a28714
JH
2215 sp = prog->startp;
2216 ep = prog->endp;
2217 if (prog->nparens) {
eb160463 2218 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2219 *++sp = -1;
2220 *++ep = -1;
2221 }
2222 }
dafc8851 2223#endif
02db2b7b 2224 REGCP_SET(lastcp);
d6a28714
JH
2225 if (regmatch(prog->program + 1)) {
2226 prog->endp[0] = PL_reginput - PL_bostr;
2227 return 1;
2228 }
02db2b7b 2229 REGCP_UNWIND(lastcp);
d6a28714
JH
2230 return 0;
2231}
2232
02db2b7b
IZ
2233#define RE_UNWIND_BRANCH 1
2234#define RE_UNWIND_BRANCHJ 2
2235
2236union re_unwind_t;
2237
2238typedef struct { /* XX: makes sense to enlarge it... */
2239 I32 type;
2240 I32 prev;
2241 CHECKPOINT lastcp;
2242} re_unwind_generic_t;
2243
2244typedef struct {
2245 I32 type;
2246 I32 prev;
2247 CHECKPOINT lastcp;
2248 I32 lastparen;
2249 regnode *next;
2250 char *locinput;
2251 I32 nextchr;
2252#ifdef DEBUGGING
2253 int regindent;
2254#endif
2255} re_unwind_branch_t;
2256
2257typedef union re_unwind_t {
2258 I32 type;
2259 re_unwind_generic_t generic;
2260 re_unwind_branch_t branch;
2261} re_unwind_t;
2262
8ba1375e
MJD
2263#define sayYES goto yes
2264#define sayNO goto no
e0f9d4a8 2265#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2266#define sayYES_FINAL goto yes_final
2267#define sayYES_LOUD goto yes_loud
2268#define sayNO_FINAL goto no_final
2269#define sayNO_SILENT goto do_no
2270#define saySAME(x) if (x) goto yes; else goto no
2271
3ab3c9b4
HS
2272#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2273#define POSCACHE_SEEN 1 /* we know what we're caching */
2274#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2275#define CACHEsayYES STMT_START { \
2276 if (cache_offset | cache_bit) { \
2277 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2278 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2279 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2280 /* cache records failure, but this is success */ \
2281 DEBUG_r( \
2282 PerlIO_printf(Perl_debug_log, \
2283 "%*s (remove success from failure cache)\n", \
2284 REPORT_CODE_OFF+PL_regindent*2, "") \
2285 ); \
2286 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2287 } \
2288 } \
2289 sayYES; \
2290} STMT_END
2291#define CACHEsayNO STMT_START { \
2292 if (cache_offset | cache_bit) { \
2293 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2294 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2295 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2296 /* cache records success, but this is failure */ \
2297 DEBUG_r( \
2298 PerlIO_printf(Perl_debug_log, \
2299 "%*s (remove failure from success cache)\n", \
2300 REPORT_CODE_OFF+PL_regindent*2, "") \
2301 ); \
2302 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2303 } \
2304 } \
2305 sayNO; \
2306} STMT_END
2307
a3621e74
YO
2308/* this is used to determine how far from the left messages like
2309 'failed...' are printed. Currently 29 makes these messages line
2310 up with the opcode they refer to. Earlier perls used 25 which
2311 left these messages outdented making reviewing a debug output
2312 quite difficult.
2313*/
2314#define REPORT_CODE_OFF 29
2315
2316
2317/* Make sure there is a test for this +1 options in re_tests */
2318#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2319
2320#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2321 if ( trie->states[ state ].wordnum ) { \
2322 if ( !accepted ) { \
2323 ENTER; \
2324 SAVETMPS; \
2325 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2326 sv_accept_buff=NEWSV( 1234, \
2327 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2328 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2329 SvPOK_on( sv_accept_buff ); \
2330 sv_2mortal( sv_accept_buff ); \
2331 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2332 } else { \
2333 if ( accepted >= bufflen ) { \
2334 bufflen *= 2; \
2335 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2336 bufflen * sizeof(reg_trie_accepted) ); \
2337 } \
2338 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2339 + sizeof( reg_trie_accepted ) ); \
2340 } \
2341 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2342 accept_buff[ accepted ].endpos = uc; \
2343 ++accepted; \
2344 } } STMT_END
2345
2346#define TRIE_HANDLE_CHAR STMT_START { \
2347 if ( uvc < 256 ) { \
2348 charid = trie->charmap[ uvc ]; \
2349 } else { \
2350 charid = 0; \
2351 if( trie->widecharmap ) { \
2352 SV** svpp = (SV**)NULL; \
2353 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2354 sizeof( UV ), 0 ); \
2355 if ( svpp ) { \
2356 charid = (U16)SvIV( *svpp ); \
2357 } \
2358 } \
2359 } \
2360 if ( charid && \
cc601c31
YO
2361 ( base + charid > trie->uniquecharcount ) && \
2362 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2363 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2364 { \
2365 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2366 } else { \
2367 state = 0; \
2368 } \
2369 uc += len; \
2370 } STMT_END
8ba1375e 2371
d6a28714
JH
2372/*
2373 - regmatch - main matching routine
2374 *
2375 * Conceptually the strategy is simple: check to see whether the current
2376 * node matches, call self recursively to see whether the rest matches,
2377 * and then act accordingly. In practice we make some effort to avoid
2378 * recursion, in particular by going through "ordinary" nodes (that don't
2379 * need to know whether the rest of the match failed) by a loop instead of
2380 * by recursion.
2381 */
2382/* [lwall] I've hoisted the register declarations to the outer block in order to
2383 * maybe save a little bit of pushing and popping on the stack. It also takes
2384 * advantage of machines that use a register save mask on subroutine entry.
2385 */
2386STATIC I32 /* 0 failure, 1 success */
2387S_regmatch(pTHX_ regnode *prog)
2388{
27da23d5 2389 dVAR;
d6a28714
JH
2390 register regnode *scan; /* Current node. */
2391 regnode *next; /* Next node. */
2392 regnode *inner; /* Next node in internal branch. */
2393 register I32 nextchr; /* renamed nextchr - nextchar colides with
2394 function of same name */
2395 register I32 n; /* no or next */
b7953727
JH
2396 register I32 ln = 0; /* len or last */
2397 register char *s = Nullch; /* operand or save */
d6a28714 2398 register char *locinput = PL_reginput;
b7953727 2399 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2400 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2401 I32 unwind = 0;
a3621e74
YO
2402
2403 /* used by the trie code */
ab74612d
NC
2404 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2405 reg_trie_accepted *accept_buff = 0; /* "" */
2406 reg_trie_data *trie; /* what trie are we using right now */
2407 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2408
b7953727 2409#if 0
02db2b7b 2410 I32 firstcp = PL_savestack_ix;
b7953727 2411#endif
a3b680e6 2412 const register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2413#ifdef DEBUGGING
ce333219
JH
2414 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2415 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2416 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2417
ab74612d 2418 SV *re_debug_flags = NULL;
2a782b5b 2419#endif
02db2b7b 2420
a3621e74
YO
2421 GET_RE_DEBUG_FLAGS;
2422
d6a28714
JH
2423#ifdef DEBUGGING
2424 PL_regindent++;
2425#endif
2426
a3621e74 2427
d6a28714
JH
2428 /* Note that nextchr is a byte even in UTF */
2429 nextchr = UCHARAT(locinput);
2430 scan = prog;
2431 while (scan != NULL) {
8ba1375e 2432
a3621e74 2433 DEBUG_EXECUTE_r( {
d6a28714 2434 SV *prop = sv_newmortal();
1df70142
AL
2435 const int docolor = *PL_colors[0];
2436 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2437 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2438 /* The part of the string before starttry has one color
2439 (pref0_len chars), between starttry and current
2440 position another one (pref_len - pref0_len chars),
2441 after the current position the third one.
2442 We assume that pref0_len <= pref_len, otherwise we
2443 decrease pref0_len. */
9041c2e3 2444 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2445 ? (5 + taill) - l : locinput - PL_bostr;
2446 int pref0_len;
d6a28714 2447
df1ffd02 2448 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2449 pref_len++;
2450 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2451 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2452 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2453 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2454 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2455 l--;
d6a28714
JH
2456 if (pref0_len < 0)
2457 pref0_len = 0;
2458 if (pref0_len > pref_len)
2459 pref0_len = pref_len;
2460 regprop(prop, scan);
2a782b5b 2461 {
1df70142 2462 const char * const s0 =
f14c76ed 2463 do_utf8 && OP(scan) != CANY ?
2a782b5b 2464 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2465 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2466 locinput - pref_len;
1df70142
AL
2467 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2468 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2469 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2470 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2471 locinput - pref_len + pref0_len;
1df70142
AL
2472 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2473 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2474 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2475 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2476 locinput;
1df70142 2477 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2478 PerlIO_printf(Perl_debug_log,
2479 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2480 (IV)(locinput - PL_bostr),
2481 PL_colors[4],
2482 len0, s0,
2483 PL_colors[5],
2484 PL_colors[2],
2485 len1, s1,
2486 PL_colors[3],
2487 (docolor ? "" : "> <"),
2488 PL_colors[0],
2489 len2, s2,
2490 PL_colors[1],
2491 15 - l - pref_len + 1,
2492 "",
2493 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2494 SvPVX_const(prop));
2a782b5b
JH
2495 }
2496 });
d6a28714
JH
2497
2498 next = scan + NEXT_OFF(scan);
2499 if (next == scan)
2500 next = NULL;
2501
2502 switch (OP(scan)) {
2503 case BOL:
7fba1cd6 2504 if (locinput == PL_bostr)
d6a28714
JH
2505 {
2506 /* regtill = regbol; */
b8c5462f
JH
2507 break;
2508 }
d6a28714
JH
2509 sayNO;
2510 case MBOL:
12d33761
HS
2511 if (locinput == PL_bostr ||
2512 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2513 {
b8c5462f
JH
2514 break;
2515 }
d6a28714
JH
2516 sayNO;
2517 case SBOL:
c2a73568 2518 if (locinput == PL_bostr)
b8c5462f 2519 break;
d6a28714
JH
2520 sayNO;
2521 case GPOS:
2522 if (locinput == PL_reg_ganch)
2523 break;
2524 sayNO;
2525 case EOL:
d6a28714
JH
2526 goto seol;
2527 case MEOL:
d6a28714 2528 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2529 sayNO;
b8c5462f 2530 break;
d6a28714
JH
2531 case SEOL:
2532 seol:
2533 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2534 sayNO;
d6a28714 2535 if (PL_regeol - locinput > 1)
b8c5462f 2536 sayNO;
b8c5462f 2537 break;
d6a28714
JH
2538 case EOS:
2539 if (PL_regeol != locinput)
b8c5462f 2540 sayNO;
d6a28714 2541 break;
ffc61ed2 2542 case SANY:
d6a28714 2543 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2544 sayNO;
f33976b4
DB
2545 if (do_utf8) {
2546 locinput += PL_utf8skip[nextchr];
2547 if (locinput > PL_regeol)
2548 sayNO;
2549 nextchr = UCHARAT(locinput);
2550 }
2551 else
2552 nextchr = UCHARAT(++locinput);
2553 break;
2554 case CANY:
2555 if (!nextchr && locinput >= PL_regeol)
2556 sayNO;
b8c5462f 2557 nextchr = UCHARAT(++locinput);
a0d0e21e 2558 break;
ffc61ed2 2559 case REG_ANY:
1aa99e6b
IH
2560 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2561 sayNO;
2562 if (do_utf8) {
b8c5462f 2563 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2564 if (locinput > PL_regeol)
2565 sayNO;
a0ed51b3 2566 nextchr = UCHARAT(locinput);
a0ed51b3 2567 }
1aa99e6b
IH
2568 else
2569 nextchr = UCHARAT(++locinput);
a0ed51b3 2570 break;
a3621e74
YO
2571
2572
2573
2574 /*
2575 traverse the TRIE keeping track of all accepting states
2576 we transition through until we get to a failing node.
2577
2578 we use two slightly different pieces of code to handle
2579 the traversal depending on whether its case sensitive or
2580 not. we reuse the accept code however. (this should probably
2581 be turned into a macro.)
2582
2583 */
2584 case TRIEF:
2585 case TRIEFL:
2586 {
2587
a3b680e6 2588 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2589 U8 *uc = ( U8* )locinput;
2590 U32 state = 1;
2591 U16 charid = 0;
2592 U32 base = 0;
2593 UV uvc = 0;
2594 STRLEN len = 0;
2595 STRLEN foldlen = 0;
a3621e74
YO
2596 U8 *uscan = (U8*)NULL;
2597 STRLEN bufflen=0;
2598 accepted = 0;
2599
2600 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2601
2602 while ( state && uc <= (U8*)PL_regeol ) {
2603
2604 TRIE_CHECK_STATE_IS_ACCEPTING;
2605
2606 base = trie->states[ state ].trans.base;
2607
2608 DEBUG_TRIE_EXECUTE_r(
2609 PerlIO_printf( Perl_debug_log,
e4584336 2610 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2611 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2612 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2613 );
2614
2615 if ( base ) {
2616
2617 if ( do_utf8 || UTF ) {
2618 if ( foldlen>0 ) {
2619 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2620 foldlen -= len;
2621 uscan += len;
2622 len=0;
2623 } else {
1df70142 2624 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2625 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2626 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2627 foldlen -= UNISKIP( uvc );
2628 uscan = foldbuf + UNISKIP( uvc );
2629 }
2630 } else {
e4584336 2631 uvc = (UV)*uc;
a3621e74
YO
2632 len = 1;
2633 }
2634
2635 TRIE_HANDLE_CHAR;
2636
2637 } else {
2638 state = 0;
2639 }
2640 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2641 PerlIO_printf( Perl_debug_log,
2642 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2643 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2644 );
2645 }
2646 if ( !accepted ) {
2647 sayNO;
2648 } else {
2649 goto TrieAccept;
2650 }
2651 }
2652 /* unreached codepoint: we jump into the middle of the next case
2653 from previous if blocks */
2654 case TRIE:
2655 {
a3b680e6 2656 const U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
2657 U8 *uc = (U8*)locinput;
2658 U32 state = 1;
2659 U16 charid = 0;
2660 U32 base = 0;
2661 UV uvc = 0;
2662 STRLEN len = 0;
2663 STRLEN bufflen = 0;
2664 accepted = 0;
2665
2666 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2667
2668 while ( state && uc <= (U8*)PL_regeol ) {
2669
2670 TRIE_CHECK_STATE_IS_ACCEPTING;
2671
2672 base = trie->states[ state ].trans.base;
2673
2674 DEBUG_TRIE_EXECUTE_r(
2675 PerlIO_printf( Perl_debug_log,
e4584336 2676 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2677 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2678 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2679 );
2680
2681 if ( base ) {
2682
2683 if ( do_utf8 || UTF ) {
2684 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2685 } else {
2686 uvc = (U32)*uc;
2687 len = 1;
2688 }
2689
2690 TRIE_HANDLE_CHAR;
2691
2692 } else {
2693 state = 0;
2694 }
2695 DEBUG_TRIE_EXECUTE_r(
2696 PerlIO_printf( Perl_debug_log,
e4584336
RB
2697 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2698 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2699 );
2700 }
2701 if ( !accepted ) {
2702 sayNO;
2703 }
2704 }
2705
2706
2707 /*
2708 There was at least one accepting state that we
2709 transitioned through. Presumably the number of accepting
2710 states is going to be low, typically one or two. So we
2711 simply scan through to find the one with lowest wordnum.
2712 Once we find it, we swap the last state into its place
2713 and decrement the size. We then try to match the rest of
2714 the pattern at the point where the word ends, if we
2715 succeed then we end the loop, otherwise the loop
2716 eventually terminates once all of the accepting states
2717 have been tried.
2718 */
2719 TrieAccept:
2720 {
2721 int gotit = 0;
2722
2723 if ( accepted == 1 ) {
2724 DEBUG_EXECUTE_r({
2725 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2726 PerlIO_printf( Perl_debug_log,
2727 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2728 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74
YO
2729 accept_buff[ 0 ].wordnum,
2730 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2731 PL_colors[5] );
2732 });
cc601c31 2733 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2734 /* in this case we free tmps/leave before we call regmatch
2735 as we wont be using accept_buff again. */
2736 FREETMPS;
2737 LEAVE;
2738 gotit = regmatch( scan + NEXT_OFF( scan ) );
2739 } else {
2740 DEBUG_EXECUTE_r(
e4584336
RB
2741 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2742 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2743 PL_colors[5] );
2744 );
2745 while ( !gotit && accepted-- ) {
2746 U32 best = 0;
2747 U32 cur;
2748 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2749 DEBUG_TRIE_EXECUTE_r(
2750 PerlIO_printf( Perl_debug_log,
2751 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2752 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2753 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2754 accept_buff[ cur ].wordnum, PL_colors[5] );
2755 );
a3621e74
YO
2756
2757 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2758 best = cur;
2759 }
2760 DEBUG_EXECUTE_r({
2761 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2762 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2763 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74
YO
2764 accept_buff[best].wordnum,
2765 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2766 PL_colors[5] );
2767 });
2768 if ( best<accepted ) {
2769 reg_trie_accepted tmp = accept_buff[ best ];
2770 accept_buff[ best ] = accept_buff[ accepted ];
2771 accept_buff[ accepted ] = tmp;
2772 best = accepted;
2773 }
cc601c31 2774 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2775
2776 /*
2777 as far as I can tell we only need the SAVETMPS/FREETMPS
2778 for re's with EVAL in them but I'm leaving them in for
2779 all until I can be sure.
2780 */
2781 SAVETMPS;
2782 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2783 FREETMPS;
2784 }
2785 FREETMPS;
2786 LEAVE;
2787 }
2788
2789 if ( gotit ) {
2790 sayYES;
2791 } else {
2792 sayNO;
2793 }
2794 }
2795 /* unreached codepoint */
d6a28714 2796 case EXACT:
cd439c50
IZ
2797 s = STRING(scan);
2798 ln = STR_LEN(scan);
eb160463 2799 if (do_utf8 != UTF) {
bc517b45 2800 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2801 char *l = locinput;
a3b680e6 2802 const char *e = s + ln;
a72c7584 2803
5ff6fc6d
JH
2804 if (do_utf8) {
2805 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2806 while (s < e) {
a3b680e6 2807 STRLEN ulen;
1aa99e6b 2808 if (l >= PL_regeol)
5ff6fc6d
JH
2809 sayNO;
2810 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2811 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2812 ckWARN(WARN_UTF8) ?
2813 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2814 sayNO;
bc517b45 2815 l += ulen;
5ff6fc6d 2816 s ++;
1aa99e6b 2817 }
5ff6fc6d
JH
2818 }
2819 else {
2820 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2821 while (s < e) {
a3b680e6 2822 STRLEN ulen;
1aa99e6b
IH
2823 if (l >= PL_regeol)
2824 sayNO;
5ff6fc6d 2825 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2826 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2827 ckWARN(WARN_UTF8) ?
2828 0 : UTF8_ALLOW_ANY))
1aa99e6b 2829 sayNO;
bc517b45 2830 s += ulen;
a72c7584 2831 l ++;
1aa99e6b 2832 }
5ff6fc6d 2833 }
1aa99e6b
IH
2834 locinput = l;
2835 nextchr = UCHARAT(locinput);
2836 break;
2837 }
bc517b45 2838 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2839 /* Inline the first character, for speed. */
2840 if (UCHARAT(s) != nextchr)
2841 sayNO;
2842 if (PL_regeol - locinput < ln)
2843 sayNO;
2844 if (ln > 1 && memNE(s, locinput, ln))
2845 sayNO;
2846 locinput += ln;
2847 nextchr = UCHARAT(locinput);
2848 break;
2849 case EXACTFL:
b8c5462f
JH
2850 PL_reg_flags |= RF_tainted;
2851 /* FALL THROUGH */
d6a28714 2852 case EXACTF:
cd439c50
IZ
2853 s = STRING(scan);
2854 ln = STR_LEN(scan);
d6a28714 2855
d07ddd77
JH
2856 if (do_utf8 || UTF) {
2857 /* Either target or the pattern are utf8. */
d6a28714 2858 char *l = locinput;
d07ddd77 2859 char *e = PL_regeol;
bc517b45 2860
eb160463 2861 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2862 l, &e, 0, do_utf8)) {
5486206c
JH
2863 /* One more case for the sharp s:
2864 * pack("U0U*", 0xDF) =~ /ss/i,
2865 * the 0xC3 0x9F are the UTF-8
2866 * byte sequence for the U+00DF. */
2867 if (!(do_utf8 &&
2868 toLOWER(s[0]) == 's' &&
2869 ln >= 2 &&
2870 toLOWER(s[1]) == 's' &&
2871 (U8)l[0] == 0xC3 &&
2872 e - l >= 2 &&
2873 (U8)l[1] == 0x9F))
2874 sayNO;
2875 }
d07ddd77
JH
2876 locinput = e;
2877 nextchr = UCHARAT(locinput);
2878 break;
a0ed51b3 2879 }
d6a28714 2880
bc517b45
JH
2881 /* Neither the target and the pattern are utf8. */
2882
d6a28714
JH
2883 /* Inline the first character, for speed. */
2884 if (UCHARAT(s) != nextchr &&
2885 UCHARAT(s) != ((OP(scan) == EXACTF)
2886 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2887 sayNO;
d6a28714 2888 if (PL_regeol - locinput < ln)
b8c5462f 2889 sayNO;
d6a28714
JH
2890 if (ln > 1 && (OP(scan) == EXACTF
2891 ? ibcmp(s, locinput, ln)
2892 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2893 sayNO;
d6a28714
JH
2894 locinput += ln;
2895 nextchr = UCHARAT(locinput);
a0d0e21e 2896 break;
d6a28714 2897 case ANYOF:
ffc61ed2 2898 if (do_utf8) {
9e55ce06
JH
2899 STRLEN inclasslen = PL_regeol - locinput;
2900
ba7b4546 2901 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2902 sayNO_ANYOF;
ffc61ed2
JH
2903 if (locinput >= PL_regeol)
2904 sayNO;
0f0076b4 2905 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2906 nextchr = UCHARAT(locinput);
e0f9d4a8 2907 break;
ffc61ed2
JH
2908 }
2909 else {
2910 if (nextchr < 0)
2911 nextchr = UCHARAT(locinput);
7d3e948e 2912 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2913 sayNO_ANYOF;
ffc61ed2
JH
2914 if (!nextchr && locinput >= PL_regeol)
2915 sayNO;
2916 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2917 break;
2918 }
2919 no_anyof:
2920 /* If we might have the case of the German sharp s
2921 * in a casefolding Unicode character class. */
2922
ebc501f0
JH
2923 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2924 locinput += SHARP_S_SKIP;
e0f9d4a8 2925 nextchr = UCHARAT(locinput);
ffc61ed2 2926 }
e0f9d4a8
JH
2927 else
2928 sayNO;
b8c5462f 2929 break;
d6a28714 2930 case ALNUML:
b8c5462f
JH
2931 PL_reg_flags |= RF_tainted;
2932 /* FALL THROUGH */
d6a28714 2933 case ALNUM:
b8c5462f 2934 if (!nextchr)
4633a7c4 2935 sayNO;
ffc61ed2 2936 if (do_utf8) {
1a4fad37 2937 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2938 if (!(OP(scan) == ALNUM
3568d838 2939 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2940 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2941 {
2942 sayNO;
a0ed51b3 2943 }
b8c5462f 2944 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2945 nextchr = UCHARAT(locinput);
2946 break;
2947 }
ffc61ed2 2948 if (!(OP(scan) == ALNUM
d6a28714 2949 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2950 sayNO;
b8c5462f 2951 nextchr = UCHARAT(++locinput);
a0d0e21e 2952 break;
d6a28714 2953 case NALNUML:
b8c5462f
JH
2954 PL_reg_flags |= RF_tainted;
2955 /* FALL THROUGH */
d6a28714
JH
2956 case NALNUM:
2957 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2958 sayNO;
ffc61ed2 2959 if (do_utf8) {
1a4fad37 2960 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2961 if (OP(scan) == NALNUM
3568d838 2962 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2963 : isALNUM_LC_utf8((U8*)locinput))
2964 {
b8c5462f 2965 sayNO;
d6a28714 2966 }
b8c5462f
JH
2967 locinput += PL_utf8skip[nextchr];
2968 nextchr = UCHARAT(locinput);
2969 break;
2970 }
ffc61ed2 2971 if (OP(scan) == NALNUM
d6a28714 2972 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2973 sayNO;
76e3520e 2974 nextchr = UCHARAT(++locinput);
a0d0e21e 2975 break;
d6a28714
JH
2976 case BOUNDL:
2977 case NBOUNDL:
3280af22 2978 PL_reg_flags |= RF_tainted;
bbce6d69 2979 /* FALL THROUGH */
d6a28714
JH
2980 case BOUND:
2981 case NBOUND:
2982 /* was last char in word? */
ffc61ed2 2983 if (do_utf8) {
12d33761
HS
2984 if (locinput == PL_bostr)
2985 ln = '\n';
ffc61ed2 2986 else {
a3b680e6 2987 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2988
b4f7163a 2989 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2990 }
2991 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2992 ln = isALNUM_uni(ln);
1a4fad37 2993 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2994 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2995 }
2996 else {
9041c2e3 2997 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2998 n = isALNUM_LC_utf8((U8*)locinput);
2999 }
a0ed51b3 3000 }
d6a28714 3001 else {
12d33761
HS
3002 ln = (locinput != PL_bostr) ?
3003 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
3004 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3005 ln = isALNUM(ln);
3006 n = isALNUM(nextchr);
3007 }
3008 else {
3009 ln = isALNUM_LC(ln);
3010 n = isALNUM_LC(nextchr);
3011 }
d6a28714 3012 }
ffc61ed2
JH
3013 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3014 OP(scan) == BOUNDL))
3015 sayNO;
a0ed51b3 3016 break;
d6a28714 3017 case SPACEL:
3280af22 3018 PL_reg_flags |= RF_tainted;
bbce6d69 3019 /* FALL THROUGH */
d6a28714 3020 case SPACE:
9442cb0e 3021 if (!nextchr)
4633a7c4 3022 sayNO;
1aa99e6b 3023 if (do_utf8) {
fd400ab9 3024 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3025 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3026 if (!(OP(scan) == SPACE
3568d838 3027 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3028 : isSPACE_LC_utf8((U8*)locinput)))
3029 {
3030 sayNO;
3031 }
3032 locinput += PL_utf8skip[nextchr];
3033 nextchr = UCHARAT(locinput);
3034 break;
d6a28714 3035 }
ffc61ed2
JH
3036 if (!(OP(scan) == SPACE
3037 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3038 sayNO;
3039 nextchr = UCHARAT(++locinput);
3040 }
3041 else {
3042 if (!(OP(scan) == SPACE
3043 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3044 sayNO;
3045 nextchr = UCHARAT(++locinput);
a0ed51b3 3046 }
a0ed51b3 3047 break;
d6a28714 3048 case NSPACEL:
3280af22 3049 PL_reg_flags |= RF_tainted;
bbce6d69 3050 /* FALL THROUGH */
d6a28714 3051 case NSPACE:
9442cb0e 3052 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3053 sayNO;
1aa99e6b 3054 if (do_utf8) {
1a4fad37 3055 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3056 if (OP(scan) == NSPACE
3568d838 3057 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3058 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3059 {
3060 sayNO;
3061 }
3062 locinput += PL_utf8skip[nextchr];
3063 nextchr = UCHARAT(locinput);
3064 break;
a0ed51b3 3065 }
ffc61ed2 3066 if (OP(scan) == NSPACE
d6a28714 3067 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3068 sayNO;
76e3520e 3069 nextchr = UCHARAT(++locinput);
a0d0e21e 3070 break;
d6a28714 3071 case DIGITL:
a0ed51b3
LW
3072 PL_reg_flags |= RF_tainted;
3073 /* FALL THROUGH */
d6a28714 3074 case DIGIT:
9442cb0e 3075 if (!nextchr)
a0ed51b3 3076 sayNO;
1aa99e6b 3077 if (do_utf8) {
1a4fad37 3078 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3079 if (!(OP(scan) == DIGIT
3568d838 3080 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3081 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3082 {
a0ed51b3 3083 sayNO;
dfe13c55 3084 }
6f06b55f 3085 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3086 nextchr = UCHARAT(locinput);
3087 break;
3088 }
ffc61ed2 3089 if (!(OP(scan) == DIGIT
9442cb0e 3090 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3091 sayNO;
3092 nextchr = UCHARAT(++locinput);
3093 break;
d6a28714 3094 case NDIGITL:
b8c5462f
JH
3095 PL_reg_flags |= RF_tainted;
3096 /* FALL THROUGH */
d6a28714 3097 case NDIGIT:
9442cb0e 3098 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3099 sayNO;
1aa99e6b 3100 if (do_utf8) {
1a4fad37 3101 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3102 if (OP(scan) == NDIGIT
3568d838 3103 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3104 : isDIGIT_LC_utf8((U8*)locinput))
3105 {
a0ed51b3 3106 sayNO;
9442cb0e 3107 }
6f06b55f 3108 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3109 nextchr = UCHARAT(locinput);
3110 break;
3111 }
ffc61ed2 3112 if (OP(scan) == NDIGIT
9442cb0e 3113 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3114 sayNO;
3115 nextchr = UCHARAT(++locinput);
3116 break;
3117 case CLUMP:
b7c83a7e 3118 if (locinput >= PL_regeol)
a0ed51b3 3119 sayNO;
b7c83a7e 3120 if (do_utf8) {
1a4fad37 3121 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3122 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3123 sayNO;
3124 locinput += PL_utf8skip[nextchr];
3125 while (locinput < PL_regeol &&
3126 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3127 locinput += UTF8SKIP(locinput);
3128 if (locinput > PL_regeol)
3129 sayNO;
eb08e2da
JH
3130 }
3131 else
3132 locinput++;
a0ed51b3
LW
3133 nextchr = UCHARAT(locinput);
3134 break;
c8756f30 3135 case REFFL:
3280af22 3136 PL_reg_flags |= RF_tainted;
c8756f30 3137 /* FALL THROUGH */
c277df42 3138 case REF:
c8756f30 3139 case REFF:
c277df42 3140 n = ARG(scan); /* which paren pair */
cf93c79d 3141 ln = PL_regstartp[n];
2c2d71f5 3142 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3143 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3144 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3145 if (ln == PL_regendp[n])
a0d0e21e 3146 break;
a0ed51b3 3147
cf93c79d 3148 s = PL_bostr + ln;
1aa99e6b 3149 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3150 char *l = locinput;
a3b680e6 3151 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3152 /*
3153 * Note that we can't do the "other character" lookup trick as
3154 * in the 8-bit case (no pun intended) because in Unicode we
3155 * have to map both upper and title case to lower case.
3156 */
3157 if (OP(scan) == REFF) {
3158 while (s < e) {
a3b680e6
AL
3159 STRLEN ulen1, ulen2;
3160 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3161 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3162
a0ed51b3
LW
3163 if (l >= PL_regeol)
3164 sayNO;
a2a2844f
JH
3165 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3166 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3167 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3168 sayNO;
a2a2844f
JH
3169 s += ulen1;
3170 l += ulen2;
a0ed51b3
LW
3171 }
3172 }
3173 locinput = l;
3174 nextchr = UCHARAT(locinput);
3175 break;
3176 }
3177
a0d0e21e 3178 /* Inline the first character, for speed. */
76e3520e 3179 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3180 (OP(scan) == REF ||
3181 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3182 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3183 sayNO;
cf93c79d 3184 ln = PL_regendp[n] - ln;
3280af22 3185 if (locinput + ln > PL_regeol)
4633a7c4 3186 sayNO;
c8756f30
AK
3187 if (ln > 1 && (OP(scan) == REF
3188 ? memNE(s, locinput, ln)
3189 : (OP(scan) == REFF
3190 ? ibcmp(s, locinput, ln)
3191 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3192 sayNO;
a0d0e21e 3193 locinput += ln;
76e3520e 3194 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3195 break;
3196
3197 case NOTHING:
c277df42 3198 case TAIL:
a0d0e21e
LW
3199 break;
3200 case BACK:
3201 break;
c277df42
IZ
3202 case EVAL:
3203 {
3204 dSP;
533c011a 3205 OP_4tree *oop = PL_op;
3280af22 3206 COP *ocurcop = PL_curcop;
f3548bdc 3207 PAD *old_comppad;
c277df42 3208 SV *ret;
080c2dec 3209 struct regexp *oreg = PL_reg_re;
9041c2e3 3210
c277df42 3211 n = ARG(scan);
533c011a 3212 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3213 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3214 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3215 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3216
8e5e9ebe
RGS
3217 {
3218 SV **before = SP;
3219 CALLRUNOPS(aTHX); /* Scalar context. */
3220 SPAGAIN;
3221 if (SP == before)
075aa684 3222 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3223 else {
3224 ret = POPs;
3225 PUTBACK;
3226 }
3227 }
3228
0f5d15d6 3229 PL_op = oop;
f3548bdc 3230 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3231 PL_curcop = ocurcop;
c277df42 3232 if (logical) {
0f5d15d6
IZ
3233 if (logical == 2) { /* Postponed subexpression. */
3234 regexp *re;
22c35a8c 3235 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3236 re_cc_state state;
0f5d15d6 3237 CHECKPOINT cp, lastcp;
cb50f42d 3238 int toggleutf;
faf82a0b 3239 register SV *sv;
0f5d15d6 3240
faf82a0b
AE
3241 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3242 mg = mg_find(sv, PERL_MAGIC_qr);
3243 else if (SvSMAGICAL(ret)) {
3244 if (SvGMAGICAL(ret))
3245 sv_unmagic(ret, PERL_MAGIC_qr);
3246 else
3247 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3248 }
faf82a0b 3249
0f5d15d6
IZ
3250 if (mg) {
3251 re = (regexp *)mg->mg_obj;
df0003d4 3252 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3253 }
3254 else {
3255 STRLEN len;
83003860 3256 const char *t = SvPV_const(ret, len);
0f5d15d6 3257 PMOP pm;
a3b680e6
AL
3258 char * const oprecomp = PL_regprecomp;
3259 const I32 osize = PL_regsize;
3260 const I32 onpar = PL_regnpar;
0f5d15d6 3261
5fcd1c1b 3262 Zero(&pm, 1, PMOP);
cb50f42d 3263 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3264 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3265 if (!(SvFLAGS(ret)
faf82a0b
AE
3266 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3267 | SVs_GMG)))
14befaf4
DM
3268 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3269 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3270 PL_regprecomp = oprecomp;
3271 PL_regsize = osize;
3272 PL_regnpar = onpar;
3273 }
a3621e74 3274 DEBUG_EXECUTE_r(
9041c2e3 3275 PerlIO_printf(Perl_debug_log,
a0288114 3276 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3277 PL_colors[0],
3278 re->precomp,
3279 PL_colors[1],
3280 (strlen(re->precomp) > 60 ? "..." : ""))
3281 );
3282 state.node = next;
3283 state.prev = PL_reg_call_cc;
3284 state.cc = PL_regcc;
3285 state.re = PL_reg_re;
3286
2ab05381 3287 PL_regcc = 0;
9041c2e3 3288
0f5d15d6 3289 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3290 REGCP_SET(lastcp);
0f5d15d6
IZ
3291 cache_re(re);
3292 state.ss = PL_savestack_ix;
3293 *PL_reglastparen = 0;
a01268b5 3294 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3295 PL_reg_call_cc = &state;
3296 PL_reginput = locinput;
cb50f42d
YST
3297 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3298 ((re->reganch & ROPT_UTF8) != 0);
3299 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3300
3301 /* XXXX This is too dramatic a measure... */
3302 PL_reg_maxiter = 0;
3303
0f5d15d6 3304 if (regmatch(re->program + 1)) {
2c914db6
IZ
3305 /* Even though we succeeded, we need to restore
3306 global variables, since we may be wrapped inside
3307 SUSPEND, thus the match may be not finished yet. */
3308
3309 /* XXXX Do this only if SUSPENDed? */
3310 PL_reg_call_cc = state.prev;
3311 PL_regcc = state.cc;
3312 PL_reg_re = state.re;
3313 cache_re(PL_reg_re);
cb50f42d 3314 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3315
3316 /* XXXX This is too dramatic a measure... */
3317 PL_reg_maxiter = 0;
3318
3319 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3320 ReREFCNT_dec(re);
3321 regcpblow(cp);
3322 sayYES;
3323 }
0f5d15d6 3324 ReREFCNT_dec(re);
02db2b7b 3325 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3326 regcppop();
3327 PL_reg_call_cc = state.prev;
3328 PL_regcc = state.cc;
3329 PL_reg_re = state.re;
d3790889 3330 cache_re(PL_reg_re);
cb50f42d 3331 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3332
3333 /* XXXX This is too dramatic a measure... */
3334 PL_reg_maxiter = 0;
3335
8e514ae6 3336 logical = 0;
0f5d15d6
IZ
3337 sayNO;
3338 }
c277df42 3339 sw = SvTRUE(ret);
0f5d15d6 3340 logical = 0;
a0ed51b3 3341 }
080c2dec 3342 else {
3280af22 3343 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3344 cache_re(oreg);
3345 }
c277df42
IZ
3346 break;
3347 }
a0d0e21e 3348 case OPEN:
c277df42 3349 n = ARG(scan); /* which paren pair */
3280af22
NIS
3350 PL_reg_start_tmp[n] = locinput;
3351 if (n > PL_regsize)
3352 PL_regsize = n;
a0d0e21e
LW
3353 break;
3354 case CLOSE:
c277df42 3355 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3356 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3357 PL_regendp[n] = locinput - PL_bostr;
eb160463 3358 if (n > (I32)*PL_reglastparen)
3280af22 3359 *PL_reglastparen = n;
a01268b5 3360 *PL_reglastcloseparen = n;
a0d0e21e 3361 break;
c277df42
IZ
3362 case GROUPP:
3363 n = ARG(scan); /* which paren pair */
eb160463 3364 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3365 break;
3366 case IFTHEN:
2c2d71f5 3367 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3368 if (sw)
3369 next = NEXTOPER(NEXTOPER(scan));
3370 else {
3371 next = scan + ARG(scan);
3372 if (OP(next) == IFTHEN) /* Fake one. */
3373 next = NEXTOPER(NEXTOPER(next));
3374 }
3375 break;
3376 case LOGICAL:
0f5d15d6 3377 logical = scan->flags;
c277df42 3378 break;
2ab05381
IZ
3379/*******************************************************************
3380 PL_regcc contains infoblock about the innermost (...)* loop, and
3381 a pointer to the next outer infoblock.
3382
3383 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3384
3385 1) After matching X, regnode for CURLYX is processed;
3386
9041c2e3 3387 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3388 regmatch() recursively with the starting point at WHILEM node;
3389
3390 3) Each hit of WHILEM node tries to match A and Z (in the order
3391 depending on the current iteration, min/max of {min,max} and
3392 greediness). The information about where are nodes for "A"
3393 and "Z" is read from the infoblock, as is info on how many times "A"
3394 was already matched, and greediness.
3395
3396 4) After A matches, the same WHILEM node is hit again.
3397
3398 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3399 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3400 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3401 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3402 of the external loop.
3403
3404 Currently present infoblocks form a tree with a stem formed by PL_curcc
3405 and whatever it mentions via ->next, and additional attached trees
3406 corresponding to temporarily unset infoblocks as in "5" above.
3407
9041c2e3 3408 In the following picture infoblocks for outer loop of
2ab05381
IZ
3409 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3410 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3411 infoblocks are drawn below the "reset" infoblock.
3412
3413 In fact in the picture below we do not show failed matches for Z and T
3414 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3415 more obvious *why* one needs to *temporary* unset infoblocks.]
3416
3417 Matched REx position InfoBlocks Comment
3418 (Y(A)*?Z)*?T x
3419 Y(A)*?Z)*?T x <- O
3420 Y (A)*?Z)*?T x <- O
3421 Y A)*?Z)*?T x <- O <- I
3422 YA )*?Z)*?T x <- O <- I
3423 YA A)*?Z)*?T x <- O <- I
3424 YAA )*?Z)*?T x <- O <- I
3425 YAA Z)*?T x <- O # Temporary unset I
3426 I
3427
3428 YAAZ Y(A)*?Z)*?T x <- O
3429 I
3430
3431 YAAZY (A)*?Z)*?T x <- O
3432 I
3433
3434 YAAZY A)*?Z)*?T x <- O <- I
3435 I
3436
3437 YAAZYA )*?Z)*?T x <- O <- I
3438 I
3439
3440 YAAZYA Z)*?T x <- O # Temporary unset I
3441 I,I
3442
3443 YAAZYAZ )*?T x <- O
3444 I,I
3445
3446 YAAZYAZ T x # Temporary unset O
3447 O
3448 I,I
3449
3450 YAAZYAZT x
3451 O
3452 I,I
3453 *******************************************************************/
a0d0e21e
LW
3454 case CURLYX: {
3455 CURCUR cc;
3280af22 3456 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3457 /* No need to save/restore up to this paren */
3458 I32 parenfloor = scan->flags;
c277df42
IZ
3459
3460 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3461 next += ARG(next);
3280af22
NIS
3462 cc.oldcc = PL_regcc;
3463 PL_regcc = &cc;
cb434fcc
IZ
3464 /* XXXX Probably it is better to teach regpush to support
3465 parenfloor > PL_regsize... */
eb160463 3466 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3467 parenfloor = *PL_reglastparen; /* Pessimization... */
3468 cc.parenfloor = parenfloor;
a0d0e21e
LW
3469 cc.cur = -1;
3470 cc.min = ARG1(scan);
3471 cc.max = ARG2(scan);
c277df42 3472 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3473 cc.next = next;
3474 cc.minmod = minmod;
3475 cc.lastloc = 0;
3280af22 3476 PL_reginput = locinput;
a0d0e21e
LW
3477 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3478 regcpblow(cp);
3280af22 3479 PL_regcc = cc.oldcc;
4633a7c4 3480 saySAME(n);
a0d0e21e
LW
3481 }
3482 /* NOT REACHED */
3483 case WHILEM: {
3484 /*
3485 * This is really hard to understand, because after we match
3486 * what we're trying to match, we must make sure the rest of
2c2d71f5 3487 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3488 * to go back UP the parse tree by recursing ever deeper. And
3489 * if it fails, we have to reset our parent's current state
3490 * that we can try again after backing off.
3491 */
3492
c277df42 3493 CHECKPOINT cp, lastcp;
3280af22 3494 CURCUR* cc = PL_regcc;
c277df42 3495 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3496 I32 cache_offset = 0, cache_bit = 0;
c277df42 3497
4633a7c4 3498 n = cc->cur + 1; /* how many we know we matched */
3280af22 3499 PL_reginput = locinput;
a0d0e21e 3500
a3621e74 3501 DEBUG_EXECUTE_r(
9041c2e3 3502 PerlIO_printf(Perl_debug_log,
91f3b821 3503 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3504 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3505 (long)n, (long)cc->min,
2797576d 3506 (long)cc->max, PTR2UV(cc))
c277df42