This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Revert patch #25993.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
dfe13c55
GS
119#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
121#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
123#define HOPc(pos,off) ((char*)HOP(pos,off))
124#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 125
efb30f32 126#define HOPBACK(pos, off) ( \
e54858b0 127 (PL_reg_match_utf8) \
efb30f32
HS
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)(pos - off) \
131 : (U8*)NULL \
132)
133#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
134
1aa99e6b
IH
135#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
137#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
139#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
141
1a4fad37
AL
142#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
144#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 148
5f80c4cf 149/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
150#define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
155)
156
cca55fe3
JP
157#define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
159)
e2d8ce26 160
a84d97b6
HS
161/*
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
164*/
cca55fe3 165#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 166 while (JUMPABLE(rn)) \
a84d97b6 167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 168 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
169 else if (OP(rn) == PLUS) \
170 rn = NEXTOPER(rn); \
a84d97b6
HS
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 173 else rn += NEXT_OFF(rn); \
5f80c4cf 174} STMT_END
74750237 175
acfe0abc 176static void restore_pos(pTHX_ void *arg);
51371543 177
76e3520e 178STATIC CHECKPOINT
cea2e8a9 179S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 180{
a3b680e6 181 const int retval = PL_savestack_ix;
b1ce53c5 182#define REGCP_PAREN_ELEMS 4
a3b680e6 183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
184 int p;
185
e49a9654
IH
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
188
a01268b5 189#define REGCP_OTHER_ELEMS 6
4b3c1a47 190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 191 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 192/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
3280af22 195 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
196 SSPUSHINT(p);
197 }
b1ce53c5 198/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
a01268b5 201 SSPUSHINT(*PL_reglastcloseparen);
3280af22 202 SSPUSHPTR(PL_reginput);
41123dfd
JH
203#define REGCP_FRAME_ELEMS 2
204/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 208
a0d0e21e
LW
209 return retval;
210}
211
c277df42 212/* These are needed since we do not localize EVAL nodes: */
a3621e74 213# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 216
a3621e74 217# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 218 PerlIO_printf(Perl_debug_log, \
faccc32b 219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 221
76e3520e 222STATIC char *
cea2e8a9 223S_regcppop(pTHX)
a0d0e21e 224{
b1ce53c5 225 I32 i;
a0d0e21e
LW
226 U32 paren = 0;
227 char *input;
b1ce53c5 228
a3621e74
YO
229 GET_RE_DEBUG_FLAGS_DECL;
230
b1ce53c5 231 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 232 i = SSPOPINT;
b1ce53c5
JH
233 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
234 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 235 input = (char *) SSPOPPTR;
a01268b5 236 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
237 *PL_reglastparen = SSPOPINT;
238 PL_regsize = SSPOPINT;
b1ce53c5
JH
239
240 /* Now restore the parentheses context. */
41123dfd
JH
241 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
242 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 243 I32 tmps;
a0d0e21e 244 paren = (U32)SSPOPINT;
3280af22 245 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
246 PL_regstartp[paren] = SSPOPINT;
247 tmps = SSPOPINT;
3280af22
NIS
248 if (paren <= *PL_reglastparen)
249 PL_regendp[paren] = tmps;
a3621e74 250 DEBUG_EXECUTE_r(
c3464db5 251 PerlIO_printf(Perl_debug_log,
b900a521 252 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 253 (UV)paren, (IV)PL_regstartp[paren],
b900a521 254 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 255 (IV)PL_regendp[paren],
3280af22 256 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 257 );
a0d0e21e 258 }
a3621e74 259 DEBUG_EXECUTE_r(
eb160463 260 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 261 PerlIO_printf(Perl_debug_log,
faccc32b
JH
262 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
263 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
264 }
265 );
daf18116 266#if 1
dafc8851
JH
267 /* It would seem that the similar code in regtry()
268 * already takes care of this, and in fact it is in
269 * a better location to since this code can #if 0-ed out
270 * but the code in regtry() is needed or otherwise tests
271 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
272 * (as of patchlevel 7877) will fail. Then again,
273 * this code seems to be necessary or otherwise
274 * building DynaLoader will fail:
275 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
276 * --jhi */
eb160463
GS
277 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
278 if ((I32)paren > PL_regsize)
cf93c79d
IZ
279 PL_regstartp[paren] = -1;
280 PL_regendp[paren] = -1;
a0d0e21e 281 }
dafc8851 282#endif
a0d0e21e
LW
283 return input;
284}
285
0f5d15d6 286STATIC char *
cea2e8a9 287S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6 288{
a3b680e6 289 const I32 tmp = PL_savestack_ix;
0f5d15d6
IZ
290
291 PL_savestack_ix = ss;
292 regcppop();
293 PL_savestack_ix = tmp;
942e002e 294 return Nullch;
0f5d15d6
IZ
295}
296
297typedef struct re_cc_state
298{
299 I32 ss;
300 regnode *node;
301 struct re_cc_state *prev;
302 CURCUR *cc;
303 regexp *re;
304} re_cc_state;
305
02db2b7b 306#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 307
29d1e993
HS
308#define TRYPAREN(paren, n, input) { \
309 if (paren) { \
310 if (n) { \
311 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
312 PL_regendp[paren] = input - PL_bostr; \
313 } \
314 else \
315 PL_regendp[paren] = -1; \
316 } \
317 if (regmatch(next)) \
318 sayYES; \
319 if (paren && n) \
320 PL_regendp[paren] = -1; \
321}
322
323
a687059c 324/*
e50aee73 325 * pregexec and friends
a687059c
LW
326 */
327
328/*
c277df42 329 - pregexec - match a regexp against a string
a687059c 330 */
c277df42 331I32
864dbfa3 332Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 333 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
334/* strend: pointer to null at end of string */
335/* strbeg: real beginning of string */
336/* minend: end of match must be >=minend after stringarg. */
337/* nosave: For optimizations. */
338{
339 return
9041c2e3 340 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
341 nosave ? 0 : REXEC_COPY_STR);
342}
0f5d15d6
IZ
343
344STATIC void
cea2e8a9 345S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
346{
347 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
348#ifdef DEBUGGING
349 PL_regprogram = prog->program;
350#endif
351 PL_regnpar = prog->nparens;
9041c2e3
NIS
352 PL_regdata = prog->data;
353 PL_reg_re = prog;
0f5d15d6 354}
22e551b9 355
9041c2e3 356/*
cad2e5aa
JH
357 * Need to implement the following flags for reg_anch:
358 *
359 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
360 * USE_INTUIT_ML
361 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
362 * INTUIT_AUTORITATIVE_ML
363 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * INTUIT_ONCE_ML
365 *
366 * Another flag for this function: SECOND_TIME (so that float substrs
367 * with giant delta may be not rechecked).
368 */
369
370/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
371
3f7c398e 372/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
373 Otherwise, only SvCUR(sv) is used to get strbeg. */
374
375/* XXXX We assume that strpos is strbeg unless sv. */
376
6eb5f6b9
JH
377/* XXXX Some places assume that there is a fixed substring.
378 An update may be needed if optimizer marks as "INTUITable"
379 RExen without fixed substrings. Similarly, it is assumed that
380 lengths of all the strings are no more than minlen, thus they
381 cannot come from lookahead.
382 (Or minlen should take into account lookahead.) */
383
2c2d71f5
JH
384/* A failure to find a constant substring means that there is no need to make
385 an expensive call to REx engine, thus we celebrate a failure. Similarly,
386 finding a substring too deep into the string means that less calls to
30944b6d
IZ
387 regtry() should be needed.
388
389 REx compiler's optimizer found 4 possible hints:
390 a) Anchored substring;
391 b) Fixed substring;
392 c) Whether we are anchored (beginning-of-line or \G);
393 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 394 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
395 string which does not contradict any of them.
396 */
2c2d71f5 397
6eb5f6b9
JH
398/* Most of decisions we do here should have been done at compile time.
399 The nodes of the REx which we used for the search should have been
400 deleted from the finite automaton. */
401
cad2e5aa
JH
402char *
403Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
404 char *strend, U32 flags, re_scream_pos_data *data)
405{
b7953727 406 register I32 start_shift = 0;
cad2e5aa 407 /* Should be nonnegative! */
b7953727 408 register I32 end_shift = 0;
2c2d71f5
JH
409 register char *s;
410 register SV *check;
a1933d95 411 char *strbeg;
cad2e5aa 412 char *t;
a3b680e6 413 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 414 I32 ml_anch;
6eb5f6b9 415 register char *other_last = Nullch; /* other substr checked before this */
b7953727 416 char *check_at = Nullch; /* check substr found at this pos */
1df70142 417 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 418#ifdef DEBUGGING
890ce7af
AL
419 const char * const i_strpos = strpos;
420 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 421#endif
a3621e74
YO
422
423 GET_RE_DEBUG_FLAGS_DECL;
424
a30b2f1f 425 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 426
b8d68ded 427 if (prog->reganch & ROPT_UTF8) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
429 "UTF-8 regex...\n"));
430 PL_reg_flags |= RF_utf8;
431 }
432
a3621e74 433 DEBUG_EXECUTE_r({
1df70142 434 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
435 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
436 strpos;
1df70142 437 const int len = PL_reg_match_utf8 ?
b8d68ded 438 strlen(s) : strend - strpos;
2a782b5b
JH
439 if (!PL_colorset)
440 reginitcolors();
b8d68ded 441 if (PL_reg_match_utf8)
a3621e74 442 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 443 "UTF-8 target...\n"));
2a782b5b 444 PerlIO_printf(Perl_debug_log,
a0288114 445 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 446 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
447 prog->precomp,
448 PL_colors[1],
449 (strlen(prog->precomp) > 60 ? "..." : ""),
450 PL_colors[0],
451 (int)(len > 60 ? 60 : len),
452 s, PL_colors[1],
453 (len > 60 ? "..." : "")
454 );
455 });
cad2e5aa 456
c344f387
JH
457 /* CHR_DIST() would be more correct here but it makes things slow. */
458 if (prog->minlen > strend - strpos) {
a3621e74 459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 460 "String too short... [re_intuit_start]\n"));
cad2e5aa 461 goto fail;
2c2d71f5 462 }
a1933d95 463 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 464 PL_regeol = strend;
33b8afdf
JH
465 if (do_utf8) {
466 if (!prog->check_utf8 && prog->check_substr)
467 to_utf8_substr(prog);
468 check = prog->check_utf8;
469 } else {
470 if (!prog->check_substr && prog->check_utf8)
471 to_byte_substr(prog);
472 check = prog->check_substr;
473 }
474 if (check == &PL_sv_undef) {
a3621e74 475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
476 "Non-utf string cannot match utf check string\n"));
477 goto fail;
478 }
2c2d71f5 479 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
480 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
481 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 482 && !multiline ) ); /* Check after \n? */
cad2e5aa 483
7e25d62c
JH
484 if (!ml_anch) {
485 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
486 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 487 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
488 && sv && !SvROK(sv)
489 && (strpos != strbeg)) {
a3621e74 490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
491 goto fail;
492 }
493 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 494 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 495 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
496 I32 slen;
497
1aa99e6b 498 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
499 if (SvTAIL(check)) {
500 slen = SvCUR(check); /* >= 1 */
cad2e5aa 501
9041c2e3 502 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 503 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 504 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 505 goto fail_finish;
cad2e5aa
JH
506 }
507 /* Now should match s[0..slen-2] */
508 slen--;
3f7c398e 509 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 510 || (slen > 1
3f7c398e 511 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 512 report_neq:
a3621e74 513 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
514 goto fail_finish;
515 }
cad2e5aa 516 }
3f7c398e 517 else if (*SvPVX_const(check) != *s
653099ff 518 || ((slen = SvCUR(check)) > 1
3f7c398e 519 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5
JH
520 goto report_neq;
521 goto success_at_start;
7e25d62c 522 }
cad2e5aa 523 }
2c2d71f5 524 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 525 s = strpos;
2c2d71f5 526 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 527 end_shift = prog->minlen - start_shift -
653099ff 528 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 529 if (!ml_anch) {
a3b680e6 530 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 531 - (SvTAIL(check) != 0);
a3b680e6 532 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
533
534 if (end_shift < eshift)
535 end_shift = eshift;
536 }
cad2e5aa 537 }
2c2d71f5 538 else { /* Can match at random position */
cad2e5aa
JH
539 ml_anch = 0;
540 s = strpos;
2c2d71f5
JH
541 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
542 /* Should be nonnegative! */
543 end_shift = prog->minlen - start_shift -
653099ff 544 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
545 }
546
2c2d71f5 547#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 548 if (end_shift < 0)
6bbae5e6 549 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
550#endif
551
2c2d71f5
JH
552 restart:
553 /* Find a possible match in the region s..strend by looking for
554 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 555 if (flags & REXEC_SCREAM) {
cad2e5aa 556 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 557 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 558
2c2d71f5
JH
559 if (PL_screamfirst[BmRARE(check)] >= 0
560 || ( BmRARE(check) == '\n'
561 && (BmPREVIOUS(check) == SvCUR(check) - 1)
562 && SvTAIL(check) ))
9041c2e3 563 s = screaminstr(sv, check,
2c2d71f5 564 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 565 else
2c2d71f5 566 goto fail_finish;
4addbd3b
HS
567 /* we may be pointing at the wrong string */
568 if (s && RX_MATCH_COPIED(prog))
3f7c398e 569 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
570 if (data)
571 *data->scream_olds = s;
572 }
f33976b4 573 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
574 s = fbm_instr((U8*)(s + start_shift),
575 (U8*)(strend - end_shift),
7fba1cd6 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 577 else
1aa99e6b
IH
578 s = fbm_instr(HOP3(s, start_shift, strend),
579 HOP3(strend, -end_shift, strbeg),
7fba1cd6 580 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
581
582 /* Update the count-of-usability, remove useless subpatterns,
583 unshift s. */
2c2d71f5 584
a0288114 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 586 (s ? "Found" : "Did not find"),
33b8afdf 587 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 588 PL_colors[0],
7b0972df 589 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 590 SvPVX_const(check),
2c2d71f5
JH
591 PL_colors[1], (SvTAIL(check) ? "$" : ""),
592 (s ? " at offset " : "...\n") ) );
593
594 if (!s)
595 goto fail_finish;
596
6eb5f6b9
JH
597 check_at = s;
598
2c2d71f5 599 /* Finish the diagnostic message */
a3621e74 600 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
601
602 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
603 Start with the other substr.
604 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 605 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
606 *always* match. Probably should be marked during compile...
607 Probably it is right to do no SCREAM here...
608 */
609
33b8afdf 610 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 611 /* Take into account the "other" substring. */
2c2d71f5
JH
612 /* XXXX May be hopelessly wrong for UTF... */
613 if (!other_last)
6eb5f6b9 614 other_last = strpos;
33b8afdf 615 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
616 do_other_anchored:
617 {
890ce7af
AL
618 char * const last = HOP3c(s, -start_shift, strbeg);
619 char *last1, *last2;
2c2d71f5 620 char *s1 = s;
33b8afdf 621 SV* must;
2c2d71f5 622
2c2d71f5
JH
623 t = s - prog->check_offset_max;
624 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 625 && (!do_utf8
1aa99e6b 626 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 627 && t > strpos)))
30944b6d 628 /* EMPTY */;
2c2d71f5
JH
629 else
630 t = strpos;
1aa99e6b 631 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
632 if (t < other_last) /* These positions already checked */
633 t = other_last;
1aa99e6b 634 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
635 if (last < last1)
636 last1 = last;
637 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
638 /* On end-of-str: see comment below. */
33b8afdf
JH
639 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
640 if (must == &PL_sv_undef) {
641 s = (char*)NULL;
a3621e74 642 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
643 }
644 else
645 s = fbm_instr(
646 (unsigned char*)t,
647 HOP3(HOP3(last1, prog->anchored_offset, strend)
648 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
649 must,
7fba1cd6 650 multiline ? FBMrf_MULTILINE : 0
33b8afdf 651 );
a3621e74 652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 653 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
654 (s ? "Found" : "Contradicts"),
655 PL_colors[0],
33b8afdf
JH
656 (int)(SvCUR(must)
657 - (SvTAIL(must)!=0)),
3f7c398e 658 SvPVX_const(must),
33b8afdf 659 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
660 if (!s) {
661 if (last1 >= last2) {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
663 ", giving up...\n"));
664 goto fail_finish;
665 }
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 667 ", trying floating at offset %ld...\n",
1aa99e6b
IH
668 (long)(HOP3c(s1, 1, strend) - i_strpos)));
669 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
670 s = HOP3c(last, 1, strend);
2c2d71f5
JH
671 goto restart;
672 }
673 else {
a3621e74 674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 675 (long)(s - i_strpos)));
1aa99e6b
IH
676 t = HOP3c(s, -prog->anchored_offset, strbeg);
677 other_last = HOP3c(s, 1, strend);
30944b6d 678 s = s1;
2c2d71f5
JH
679 if (t == strpos)
680 goto try_at_start;
2c2d71f5
JH
681 goto try_at_offset;
682 }
30944b6d 683 }
2c2d71f5
JH
684 }
685 else { /* Take into account the floating substring. */
33b8afdf
JH
686 char *last, *last1;
687 char *s1 = s;
688 SV* must;
689
690 t = HOP3c(s, -start_shift, strbeg);
691 last1 = last =
692 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
693 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
694 last = HOP3c(t, prog->float_max_offset, strend);
695 s = HOP3c(t, prog->float_min_offset, strend);
696 if (s < other_last)
697 s = other_last;
2c2d71f5 698 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
699 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
700 /* fbm_instr() takes into account exact value of end-of-str
701 if the check is SvTAIL(ed). Since false positives are OK,
702 and end-of-str is not later than strend we are OK. */
703 if (must == &PL_sv_undef) {
704 s = (char*)NULL;
a3621e74 705 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
706 }
707 else
2c2d71f5 708 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
709 (unsigned char*)last + SvCUR(must)
710 - (SvTAIL(must)!=0),
7fba1cd6 711 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 712 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
713 (s ? "Found" : "Contradicts"),
714 PL_colors[0],
715 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 716 SvPVX_const(must),
33b8afdf
JH
717 PL_colors[1], (SvTAIL(must) ? "$" : "")));
718 if (!s) {
719 if (last1 == last) {
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
721 ", giving up...\n"));
722 goto fail_finish;
2c2d71f5 723 }
a3621e74 724 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
725 ", trying anchored starting at offset %ld...\n",
726 (long)(s1 + 1 - i_strpos)));
727 other_last = last;
728 s = HOP3c(t, 1, strend);
729 goto restart;
730 }
731 else {
a3621e74 732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
733 (long)(s - i_strpos)));
734 other_last = s; /* Fix this later. --Hugo */
735 s = s1;
736 if (t == strpos)
737 goto try_at_start;
738 goto try_at_offset;
739 }
2c2d71f5 740 }
cad2e5aa 741 }
2c2d71f5
JH
742
743 t = s - prog->check_offset_max;
2c2d71f5 744 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 745 && (!do_utf8
1aa99e6b
IH
746 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
747 && t > strpos))) {
2c2d71f5
JH
748 /* Fixed substring is found far enough so that the match
749 cannot start at strpos. */
750 try_at_offset:
cad2e5aa 751 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
752 /* Eventually fbm_*() should handle this, but often
753 anchored_offset is not 0, so this check will not be wasted. */
754 /* XXXX In the code below we prefer to look for "^" even in
755 presence of anchored substrings. And we search even
756 beyond the found float position. These pessimizations
757 are historical artefacts only. */
758 find_anchor:
2c2d71f5 759 while (t < strend - prog->minlen) {
cad2e5aa 760 if (*t == '\n') {
4ee3650e 761 if (t < check_at - prog->check_offset_min) {
33b8afdf 762 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
763 /* Since we moved from the found position,
764 we definitely contradict the found anchored
30944b6d
IZ
765 substr. Due to the above check we do not
766 contradict "check" substr.
767 Thus we can arrive here only if check substr
768 is float. Redo checking for "other"=="fixed".
769 */
9041c2e3 770 strpos = t + 1;
a3621e74 771 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 772 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
773 goto do_other_anchored;
774 }
4ee3650e
GS
775 /* We don't contradict the found floating substring. */
776 /* XXXX Why not check for STCLASS? */
cad2e5aa 777 s = t + 1;
a3621e74 778 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 779 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
780 goto set_useful;
781 }
4ee3650e
GS
782 /* Position contradicts check-string */
783 /* XXXX probably better to look for check-string
784 than for "\n", so one should lower the limit for t? */
a3621e74 785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 786 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 787 other_last = strpos = s = t + 1;
cad2e5aa
JH
788 goto restart;
789 }
790 t++;
791 }
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 793 PL_colors[0], PL_colors[1]));
2c2d71f5 794 goto fail_finish;
cad2e5aa 795 }
f5952150 796 else {
a3621e74 797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 798 PL_colors[0], PL_colors[1]));
f5952150 799 }
cad2e5aa
JH
800 s = t;
801 set_useful:
33b8afdf 802 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
803 }
804 else {
f5952150 805 /* The found string does not prohibit matching at strpos,
2c2d71f5 806 - no optimization of calling REx engine can be performed,
f5952150
GS
807 unless it was an MBOL and we are not after MBOL,
808 or a future STCLASS check will fail this. */
2c2d71f5
JH
809 try_at_start:
810 /* Even in this situation we may use MBOL flag if strpos is offset
811 wrt the start of the string. */
05b4157f 812 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 813 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
814 /* May be due to an implicit anchor of m{.*foo} */
815 && !(prog->reganch & ROPT_IMPLICIT))
816 {
cad2e5aa
JH
817 t = strpos;
818 goto find_anchor;
819 }
a3621e74 820 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 821 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 822 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 823 );
2c2d71f5 824 success_at_start:
30944b6d 825 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
826 && (do_utf8 ? (
827 prog->check_utf8 /* Could be deleted already */
828 && --BmUSEFUL(prog->check_utf8) < 0
829 && (prog->check_utf8 == prog->float_utf8)
830 ) : (
831 prog->check_substr /* Could be deleted already */
832 && --BmUSEFUL(prog->check_substr) < 0
833 && (prog->check_substr == prog->float_substr)
834 )))
66e933ab 835 {
cad2e5aa 836 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
838 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
839 if (do_utf8 ? prog->check_substr : prog->check_utf8)
840 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
841 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
842 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 843 check = Nullsv; /* abort */
cad2e5aa 844 s = strpos;
3cf5c195
IZ
845 /* XXXX This is a remnant of the old implementation. It
846 looks wasteful, since now INTUIT can use many
6eb5f6b9 847 other heuristics. */
cad2e5aa
JH
848 prog->reganch &= ~RE_USE_INTUIT;
849 }
850 else
851 s = strpos;
852 }
853
6eb5f6b9
JH
854 /* Last resort... */
855 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
856 if (prog->regstclass) {
857 /* minlen == 0 is possible if regstclass is \b or \B,
858 and the fixed substr is ''$.
859 Since minlen is already taken into account, s+1 is before strend;
860 accidentally, minlen >= 1 guaranties no false positives at s + 1
861 even for \b or \B. But (minlen? 1 : 0) below assumes that
862 regstclass does not come from lookahead... */
863 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
864 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 865 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 866 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 867 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 868 : 1);
a3b680e6 869 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 870 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 871 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
872 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
873 cl_l, strend)
874 : strend);
6eb5f6b9
JH
875
876 t = s;
9926ca43 877 cache_re(prog);
06b5626a 878 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
879 if (!s) {
880#ifdef DEBUGGING
e1ec3a88 881 const char *what = 0;
6eb5f6b9
JH
882#endif
883 if (endpos == strend) {
a3621e74 884 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
885 "Could not match STCLASS...\n") );
886 goto fail;
887 }
a3621e74 888 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 889 "This position contradicts STCLASS...\n") );
653099ff
GS
890 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
891 goto fail;
6eb5f6b9 892 /* Contradict one of substrings */
33b8afdf
JH
893 if (prog->anchored_substr || prog->anchored_utf8) {
894 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 895 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 896 hop_and_restart:
1aa99e6b 897 s = HOP3c(t, 1, strend);
66e933ab
GS
898 if (s + start_shift + end_shift > strend) {
899 /* XXXX Should be taken into account earlier? */
a3621e74 900 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
901 "Could not match STCLASS...\n") );
902 goto fail;
903 }
5e39e1e5
HS
904 if (!check)
905 goto giveup;
a3621e74 906 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 907 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
908 what, (long)(s + start_shift - i_strpos)) );
909 goto restart;
910 }
66e933ab 911 /* Have both, check_string is floating */
6eb5f6b9
JH
912 if (t + start_shift >= check_at) /* Contradicts floating=check */
913 goto retry_floating_check;
914 /* Recheck anchored substring, but not floating... */
9041c2e3 915 s = check_at;
5e39e1e5
HS
916 if (!check)
917 goto giveup;
a3621e74 918 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 919 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
920 (long)(other_last - i_strpos)) );
921 goto do_other_anchored;
922 }
60e71179
GS
923 /* Another way we could have checked stclass at the
924 current position only: */
925 if (ml_anch) {
926 s = t = t + 1;
5e39e1e5
HS
927 if (!check)
928 goto giveup;
a3621e74 929 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 930 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 931 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 932 goto try_at_offset;
66e933ab 933 }
33b8afdf 934 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 935 goto fail;
6eb5f6b9
JH
936 /* Check is floating subtring. */
937 retry_floating_check:
938 t = check_at - start_shift;
a3621e74 939 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
940 goto hop_and_restart;
941 }
b7953727 942 if (t != s) {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 944 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
945 (long)(t - i_strpos), (long)(s - i_strpos))
946 );
947 }
948 else {
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
950 "Does not contradict STCLASS...\n");
951 );
952 }
6eb5f6b9 953 }
5e39e1e5 954 giveup:
a3621e74 955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
956 PL_colors[4], (check ? "Guessed" : "Giving up"),
957 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 958 return s;
2c2d71f5
JH
959
960 fail_finish: /* Substring not found */
33b8afdf
JH
961 if (prog->check_substr || prog->check_utf8) /* could be removed already */
962 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 963 fail:
a3621e74 964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 965 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
966 return Nullch;
967}
9661b544 968
6eb5f6b9 969/* We know what class REx starts with. Try to find this position... */
3c3eec57 970STATIC char *
a3b680e6 971S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 972{
27da23d5 973 dVAR;
1df70142 974 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 975 char *m;
d8093b23 976 STRLEN ln;
5dab1207 977 STRLEN lnc;
078c425b 978 register STRLEN uskip;
d8093b23
G
979 unsigned int c1;
980 unsigned int c2;
6eb5f6b9
JH
981 char *e;
982 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 983 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 984
6eb5f6b9
JH
985 /* We know what class it must start with. */
986 switch (OP(c)) {
6eb5f6b9 987 case ANYOF:
388cc4de 988 if (do_utf8) {
078c425b 989 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
990 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
991 !UTF8_IS_INVARIANT((U8)s[0]) ?
992 reginclass(c, (U8*)s, 0, do_utf8) :
993 REGINCLASS(c, (U8*)s)) {
994 if (tmp && (norun || regtry(prog, s)))
995 goto got_it;
996 else
997 tmp = doevery;
998 }
999 else
1000 tmp = 1;
078c425b 1001 s += uskip;
388cc4de
HS
1002 }
1003 }
1004 else {
1005 while (s < strend) {
1006 STRLEN skip = 1;
1007
1008 if (REGINCLASS(c, (U8*)s) ||
1009 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1010 /* The assignment of 2 is intentional:
1011 * for the folded sharp s, the skip is 2. */
1012 (skip = SHARP_S_SKIP))) {
1013 if (tmp && (norun || regtry(prog, s)))
1014 goto got_it;
1015 else
1016 tmp = doevery;
1017 }
1018 else
1019 tmp = 1;
1020 s += skip;
1021 }
a0d0e21e 1022 }
6eb5f6b9 1023 break;
f33976b4
DB
1024 case CANY:
1025 while (s < strend) {
1026 if (tmp && (norun || regtry(prog, s)))
1027 goto got_it;
1028 else
1029 tmp = doevery;
1030 s++;
1031 }
1032 break;
6eb5f6b9 1033 case EXACTF:
5dab1207
NIS
1034 m = STRING(c);
1035 ln = STR_LEN(c); /* length to match in octets/bytes */
1036 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1037 if (UTF) {
a2a2844f 1038 STRLEN ulen1, ulen2;
5dab1207 1039 U8 *sm = (U8 *) m;
89ebb4a3
JH
1040 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1041 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1042 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1043
1044 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1045 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1046
89ebb4a3 1047 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1048 0, uniflags);
89ebb4a3 1049 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1050 0, uniflags);
5dab1207
NIS
1051 lnc = 0;
1052 while (sm < ((U8 *) m + ln)) {
1053 lnc++;
1054 sm += UTF8SKIP(sm);
1055 }
1aa99e6b
IH
1056 }
1057 else {
1058 c1 = *(U8*)m;
1059 c2 = PL_fold[c1];
1060 }
6eb5f6b9
JH
1061 goto do_exactf;
1062 case EXACTFL:
5dab1207
NIS
1063 m = STRING(c);
1064 ln = STR_LEN(c);
1065 lnc = (I32) ln;
d8093b23 1066 c1 = *(U8*)m;
6eb5f6b9
JH
1067 c2 = PL_fold_locale[c1];
1068 do_exactf:
db12adc6 1069 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1070
6eb5f6b9
JH
1071 if (norun && e < s)
1072 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1073
60a8b682
JH
1074 /* The idea in the EXACTF* cases is to first find the
1075 * first character of the EXACTF* node and then, if
1076 * necessary, case-insensitively compare the full
1077 * text of the node. The c1 and c2 are the first
1078 * characters (though in Unicode it gets a bit
1079 * more complicated because there are more cases
7f16dd3d
JH
1080 * than just upper and lower: one needs to use
1081 * the so-called folding case for case-insensitive
1082 * matching (called "loose matching" in Unicode).
1083 * ibcmp_utf8() will do just that. */
60a8b682 1084
1aa99e6b 1085 if (do_utf8) {
575cac57 1086 UV c, f;
89ebb4a3 1087 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1088 STRLEN len, foldlen;
041457d9 1089 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1090 if (c1 == c2) {
5dab1207
NIS
1091 /* Upper and lower of 1st char are equal -
1092 * probably not a "letter". */
1aa99e6b 1093 while (s <= e) {
89ebb4a3 1094 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1095 uniflags);
80aecb99
JH
1096 if ( c == c1
1097 && (ln == len ||
66423254 1098 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1099 m, (char **)0, ln, (bool)UTF))
55da9344 1100 && (norun || regtry(prog, s)) )
1aa99e6b 1101 goto got_it;
80aecb99 1102 else {
1df70142 1103 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1104 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1105 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1106 if ( f != c
1107 && (f == c1 || f == c2)
1108 && (ln == foldlen ||
66423254
JH
1109 !ibcmp_utf8((char *) foldbuf,
1110 (char **)0, foldlen, do_utf8,
d07ddd77 1111 m,
eb160463 1112 (char **)0, ln, (bool)UTF))
80aecb99
JH
1113 && (norun || regtry(prog, s)) )
1114 goto got_it;
1115 }
1aa99e6b
IH
1116 s += len;
1117 }
09091399
JH
1118 }
1119 else {
1aa99e6b 1120 while (s <= e) {
89ebb4a3 1121 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1122 uniflags);
80aecb99 1123
60a8b682 1124 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1125 * Note that not all the possible combinations
1126 * are handled here: some of them are handled
1127 * by the standard folding rules, and some of
1128 * them (the character class or ANYOF cases)
1129 * are handled during compiletime in
1130 * regexec.c:S_regclass(). */
880bd946
JH
1131 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1132 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1133 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1134
1135 if ( (c == c1 || c == c2)
1136 && (ln == len ||
66423254 1137 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1138 m, (char **)0, ln, (bool)UTF))
55da9344 1139 && (norun || regtry(prog, s)) )
1aa99e6b 1140 goto got_it;
80aecb99 1141 else {
1df70142 1142 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1143 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1144 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1145 if ( f != c
1146 && (f == c1 || f == c2)
1147 && (ln == foldlen ||
a6872d42 1148 !ibcmp_utf8((char *) foldbuf,
66423254 1149 (char **)0, foldlen, do_utf8,
d07ddd77 1150 m,
eb160463 1151 (char **)0, ln, (bool)UTF))
80aecb99
JH
1152 && (norun || regtry(prog, s)) )
1153 goto got_it;
1154 }
1aa99e6b
IH
1155 s += len;
1156 }
09091399 1157 }
1aa99e6b
IH
1158 }
1159 else {
1160 if (c1 == c2)
1161 while (s <= e) {
1162 if ( *(U8*)s == c1
1163 && (ln == 1 || !(OP(c) == EXACTF
1164 ? ibcmp(s, m, ln)
1165 : ibcmp_locale(s, m, ln)))
1166 && (norun || regtry(prog, s)) )
1167 goto got_it;
1168 s++;
1169 }
1170 else
1171 while (s <= e) {
1172 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1173 && (ln == 1 || !(OP(c) == EXACTF
1174 ? ibcmp(s, m, ln)
1175 : ibcmp_locale(s, m, ln)))
1176 && (norun || regtry(prog, s)) )
1177 goto got_it;
1178 s++;
1179 }
b3c9acc1
IZ
1180 }
1181 break;
bbce6d69 1182 case BOUNDL:
3280af22 1183 PL_reg_flags |= RF_tainted;
bbce6d69 1184 /* FALL THROUGH */
a0d0e21e 1185 case BOUND:
ffc61ed2 1186 if (do_utf8) {
12d33761 1187 if (s == PL_bostr)
ffc61ed2
JH
1188 tmp = '\n';
1189 else {
b4f7163a 1190 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1191
b4f7163a 1192 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1193 }
1194 tmp = ((OP(c) == BOUND ?
9041c2e3 1195 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1196 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1197 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1198 if (tmp == !(OP(c) == BOUND ?
3568d838 1199 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1200 isALNUM_LC_utf8((U8*)s)))
1201 {
1202 tmp = !tmp;
1203 if ((norun || regtry(prog, s)))
1204 goto got_it;
1205 }
078c425b 1206 s += uskip;
a687059c 1207 }
a0d0e21e 1208 }
667bb95a 1209 else {
12d33761 1210 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1211 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1212 while (s < strend) {
1213 if (tmp ==
1214 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1215 tmp = !tmp;
1216 if ((norun || regtry(prog, s)))
1217 goto got_it;
1218 }
1219 s++;
a0ed51b3 1220 }
a0ed51b3 1221 }
6eb5f6b9 1222 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1223 goto got_it;
1224 break;
bbce6d69 1225 case NBOUNDL:
3280af22 1226 PL_reg_flags |= RF_tainted;
bbce6d69 1227 /* FALL THROUGH */
a0d0e21e 1228 case NBOUND:
ffc61ed2 1229 if (do_utf8) {
12d33761 1230 if (s == PL_bostr)
ffc61ed2
JH
1231 tmp = '\n';
1232 else {
b4f7163a 1233 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1234
b4f7163a 1235 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1236 }
1237 tmp = ((OP(c) == NBOUND ?
9041c2e3 1238 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1239 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1240 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1241 if (tmp == !(OP(c) == NBOUND ?
3568d838 1242 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1243 isALNUM_LC_utf8((U8*)s)))
1244 tmp = !tmp;
1245 else if ((norun || regtry(prog, s)))
1246 goto got_it;
078c425b 1247 s += uskip;
ffc61ed2 1248 }
a0d0e21e 1249 }
667bb95a 1250 else {
12d33761 1251 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1252 tmp = ((OP(c) == NBOUND ?
1253 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1254 while (s < strend) {
1255 if (tmp ==
1256 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1257 tmp = !tmp;
1258 else if ((norun || regtry(prog, s)))
1259 goto got_it;
1260 s++;
1261 }
a0ed51b3 1262 }
6eb5f6b9 1263 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1264 goto got_it;
1265 break;
a0d0e21e 1266 case ALNUM:
ffc61ed2 1267 if (do_utf8) {
1a4fad37 1268 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1269 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1270 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1271 if (tmp && (norun || regtry(prog, s)))
1272 goto got_it;
1273 else
1274 tmp = doevery;
1275 }
bbce6d69 1276 else
ffc61ed2 1277 tmp = 1;
078c425b 1278 s += uskip;
bbce6d69 1279 }
bbce6d69 1280 }
ffc61ed2
JH
1281 else {
1282 while (s < strend) {
1283 if (isALNUM(*s)) {
1284 if (tmp && (norun || regtry(prog, s)))
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
a0ed51b3 1289 else
ffc61ed2
JH
1290 tmp = 1;
1291 s++;
a0ed51b3 1292 }
a0ed51b3
LW
1293 }
1294 break;
bbce6d69 1295 case ALNUML:
3280af22 1296 PL_reg_flags |= RF_tainted;
ffc61ed2 1297 if (do_utf8) {
078c425b 1298 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1299 if (isALNUM_LC_utf8((U8*)s)) {
1300 if (tmp && (norun || regtry(prog, s)))
1301 goto got_it;
1302 else
1303 tmp = doevery;
1304 }
a687059c 1305 else
ffc61ed2 1306 tmp = 1;
078c425b 1307 s += uskip;
a0d0e21e 1308 }
a0d0e21e 1309 }
ffc61ed2
JH
1310 else {
1311 while (s < strend) {
1312 if (isALNUM_LC(*s)) {
1313 if (tmp && (norun || regtry(prog, s)))
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a0ed51b3 1318 else
ffc61ed2
JH
1319 tmp = 1;
1320 s++;
a0ed51b3 1321 }
a0ed51b3
LW
1322 }
1323 break;
a0d0e21e 1324 case NALNUM:
ffc61ed2 1325 if (do_utf8) {
1a4fad37 1326 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1327 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1328 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1329 if (tmp && (norun || regtry(prog, s)))
1330 goto got_it;
1331 else
1332 tmp = doevery;
1333 }
bbce6d69 1334 else
ffc61ed2 1335 tmp = 1;
078c425b 1336 s += uskip;
bbce6d69 1337 }
bbce6d69 1338 }
ffc61ed2
JH
1339 else {
1340 while (s < strend) {
1341 if (!isALNUM(*s)) {
1342 if (tmp && (norun || regtry(prog, s)))
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0ed51b3 1347 else
ffc61ed2
JH
1348 tmp = 1;
1349 s++;
a0ed51b3 1350 }
a0ed51b3
LW
1351 }
1352 break;
bbce6d69 1353 case NALNUML:
3280af22 1354 PL_reg_flags |= RF_tainted;
ffc61ed2 1355 if (do_utf8) {
078c425b 1356 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1357 if (!isALNUM_LC_utf8((U8*)s)) {
1358 if (tmp && (norun || regtry(prog, s)))
1359 goto got_it;
1360 else
1361 tmp = doevery;
1362 }
a687059c 1363 else
ffc61ed2 1364 tmp = 1;
078c425b 1365 s += uskip;
a687059c 1366 }
a0d0e21e 1367 }
ffc61ed2
JH
1368 else {
1369 while (s < strend) {
1370 if (!isALNUM_LC(*s)) {
1371 if (tmp && (norun || regtry(prog, s)))
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
a0ed51b3 1376 else
ffc61ed2
JH
1377 tmp = 1;
1378 s++;
a0ed51b3 1379 }
a0ed51b3
LW
1380 }
1381 break;
a0d0e21e 1382 case SPACE:
ffc61ed2 1383 if (do_utf8) {
1a4fad37 1384 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1385 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1386 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1387 if (tmp && (norun || regtry(prog, s)))
1388 goto got_it;
1389 else
1390 tmp = doevery;
1391 }
a0d0e21e 1392 else
ffc61ed2 1393 tmp = 1;
078c425b 1394 s += uskip;
2304df62 1395 }
a0d0e21e 1396 }
ffc61ed2
JH
1397 else {
1398 while (s < strend) {
1399 if (isSPACE(*s)) {
1400 if (tmp && (norun || regtry(prog, s)))
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0ed51b3 1405 else
ffc61ed2
JH
1406 tmp = 1;
1407 s++;
a0ed51b3 1408 }
a0ed51b3
LW
1409 }
1410 break;
bbce6d69 1411 case SPACEL:
3280af22 1412 PL_reg_flags |= RF_tainted;
ffc61ed2 1413 if (do_utf8) {
078c425b 1414 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1415 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1416 if (tmp && (norun || regtry(prog, s)))
1417 goto got_it;
1418 else
1419 tmp = doevery;
1420 }
bbce6d69 1421 else
ffc61ed2 1422 tmp = 1;
078c425b 1423 s += uskip;
bbce6d69 1424 }
bbce6d69 1425 }
ffc61ed2
JH
1426 else {
1427 while (s < strend) {
1428 if (isSPACE_LC(*s)) {
1429 if (tmp && (norun || regtry(prog, s)))
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
a0ed51b3 1434 else
ffc61ed2
JH
1435 tmp = 1;
1436 s++;
a0ed51b3 1437 }
a0ed51b3
LW
1438 }
1439 break;
a0d0e21e 1440 case NSPACE:
ffc61ed2 1441 if (do_utf8) {
1a4fad37 1442 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1443 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1444 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1445 if (tmp && (norun || regtry(prog, s)))
1446 goto got_it;
1447 else
1448 tmp = doevery;
1449 }
a0d0e21e 1450 else
ffc61ed2 1451 tmp = 1;
078c425b 1452 s += uskip;
a687059c 1453 }
a0d0e21e 1454 }
ffc61ed2
JH
1455 else {
1456 while (s < strend) {
1457 if (!isSPACE(*s)) {
1458 if (tmp && (norun || regtry(prog, s)))
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0ed51b3 1463 else
ffc61ed2
JH
1464 tmp = 1;
1465 s++;
a0ed51b3 1466 }
a0ed51b3
LW
1467 }
1468 break;
bbce6d69 1469 case NSPACEL:
3280af22 1470 PL_reg_flags |= RF_tainted;
ffc61ed2 1471 if (do_utf8) {
078c425b 1472 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1473 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1474 if (tmp && (norun || regtry(prog, s)))
1475 goto got_it;
1476 else
1477 tmp = doevery;
1478 }
bbce6d69 1479 else
ffc61ed2 1480 tmp = 1;
078c425b 1481 s += uskip;
bbce6d69 1482 }
bbce6d69 1483 }
ffc61ed2
JH
1484 else {
1485 while (s < strend) {
1486 if (!isSPACE_LC(*s)) {
1487 if (tmp && (norun || regtry(prog, s)))
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
a0ed51b3 1492 else
ffc61ed2
JH
1493 tmp = 1;
1494 s++;
a0ed51b3 1495 }
a0ed51b3
LW
1496 }
1497 break;
a0d0e21e 1498 case DIGIT:
ffc61ed2 1499 if (do_utf8) {
1a4fad37 1500 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1501 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1502 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1503 if (tmp && (norun || regtry(prog, s)))
1504 goto got_it;
1505 else
1506 tmp = doevery;
1507 }
a0d0e21e 1508 else
ffc61ed2 1509 tmp = 1;
078c425b 1510 s += uskip;
2b69d0c2 1511 }
a0d0e21e 1512 }
ffc61ed2
JH
1513 else {
1514 while (s < strend) {
1515 if (isDIGIT(*s)) {
1516 if (tmp && (norun || regtry(prog, s)))
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0ed51b3 1521 else
ffc61ed2
JH
1522 tmp = 1;
1523 s++;
a0ed51b3 1524 }
a0ed51b3
LW
1525 }
1526 break;
b8c5462f
JH
1527 case DIGITL:
1528 PL_reg_flags |= RF_tainted;
ffc61ed2 1529 if (do_utf8) {
078c425b 1530 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1531 if (isDIGIT_LC_utf8((U8*)s)) {
1532 if (tmp && (norun || regtry(prog, s)))
1533 goto got_it;
1534 else
1535 tmp = doevery;
1536 }
b8c5462f 1537 else
ffc61ed2 1538 tmp = 1;
078c425b 1539 s += uskip;
b8c5462f 1540 }
b8c5462f 1541 }
ffc61ed2
JH
1542 else {
1543 while (s < strend) {
1544 if (isDIGIT_LC(*s)) {
1545 if (tmp && (norun || regtry(prog, s)))
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2
JH
1551 tmp = 1;
1552 s++;
b8c5462f 1553 }
b8c5462f
JH
1554 }
1555 break;
a0d0e21e 1556 case NDIGIT:
ffc61ed2 1557 if (do_utf8) {
1a4fad37 1558 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1559 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1560 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1561 if (tmp && (norun || regtry(prog, s)))
1562 goto got_it;
1563 else
1564 tmp = doevery;
1565 }
a0d0e21e 1566 else
ffc61ed2 1567 tmp = 1;
078c425b 1568 s += uskip;
a687059c 1569 }
a0d0e21e 1570 }
ffc61ed2
JH
1571 else {
1572 while (s < strend) {
1573 if (!isDIGIT(*s)) {
1574 if (tmp && (norun || regtry(prog, s)))
1575 goto got_it;
1576 else
1577 tmp = doevery;
1578 }
a0ed51b3 1579 else
ffc61ed2
JH
1580 tmp = 1;
1581 s++;
a0ed51b3 1582 }
a0ed51b3
LW
1583 }
1584 break;
b8c5462f
JH
1585 case NDIGITL:
1586 PL_reg_flags |= RF_tainted;
ffc61ed2 1587 if (do_utf8) {
078c425b 1588 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1589 if (!isDIGIT_LC_utf8((U8*)s)) {
1590 if (tmp && (norun || regtry(prog, s)))
1591 goto got_it;
1592 else
1593 tmp = doevery;
1594 }
b8c5462f 1595 else
ffc61ed2 1596 tmp = 1;
078c425b 1597 s += uskip;
b8c5462f 1598 }
a0ed51b3 1599 }
ffc61ed2
JH
1600 else {
1601 while (s < strend) {
1602 if (!isDIGIT_LC(*s)) {
1603 if (tmp && (norun || regtry(prog, s)))
1604 goto got_it;
1605 else
1606 tmp = doevery;
1607 }
cf93c79d 1608 else
ffc61ed2
JH
1609 tmp = 1;
1610 s++;
b8c5462f 1611 }
b8c5462f
JH
1612 }
1613 break;
b3c9acc1 1614 default:
3c3eec57
GS
1615 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1616 break;
d6a28714 1617 }
6eb5f6b9
JH
1618 return 0;
1619 got_it:
1620 return s;
1621}
1622
1623/*
1624 - regexec_flags - match a regexp against a string
1625 */
1626I32
1627Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1628 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1629/* strend: pointer to null at end of string */
1630/* strbeg: real beginning of string */
1631/* minend: end of match must be >=minend after stringarg. */
1632/* data: May be used for some additional optimizations. */
1633/* nosave: For optimizations. */
1634{
6eb5f6b9
JH
1635 register char *s;
1636 register regnode *c;
1637 register char *startpos = stringarg;
6eb5f6b9
JH
1638 I32 minlen; /* must match at least this many chars */
1639 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1640 I32 end_shift = 0; /* Same for the end. */ /* CC */
1641 I32 scream_pos = -1; /* Internal iterator of scream. */
1642 char *scream_olds;
1643 SV* oreplsv = GvSV(PL_replgv);
1df70142 1644 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1645 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1646#ifdef DEBUGGING
9e55ce06
JH
1647 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1648 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1649#endif
a3621e74
YO
1650
1651 GET_RE_DEBUG_FLAGS_DECL;
1652
9d4ba2ae 1653 PERL_UNUSED_ARG(data);
a30b2f1f 1654 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1655
1656 PL_regcc = 0;
1657
1658 cache_re(prog);
1659#ifdef DEBUGGING
aea4f609 1660 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1661#endif
1662
1663 /* Be paranoid... */
1664 if (prog == NULL || startpos == NULL) {
1665 Perl_croak(aTHX_ "NULL regexp parameter");
1666 return 0;
1667 }
1668
1669 minlen = prog->minlen;
61a36c01 1670 if (strend - startpos < minlen) {
a3621e74 1671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1672 "String too short [regexec_flags]...\n"));
1673 goto phooey;
1aa99e6b 1674 }
6eb5f6b9 1675
6eb5f6b9
JH
1676 /* Check validity of program. */
1677 if (UCHARAT(prog->program) != REG_MAGIC) {
1678 Perl_croak(aTHX_ "corrupted regexp program");
1679 }
1680
1681 PL_reg_flags = 0;
1682 PL_reg_eval_set = 0;
1683 PL_reg_maxiter = 0;
1684
1685 if (prog->reganch & ROPT_UTF8)
1686 PL_reg_flags |= RF_utf8;
1687
1688 /* Mark beginning of line for ^ and lookbehind. */
1689 PL_regbol = startpos;
1690 PL_bostr = strbeg;
1691 PL_reg_sv = sv;
1692
1693 /* Mark end of line for $ (and such) */
1694 PL_regeol = strend;
1695
1696 /* see how far we have to get to not match where we matched before */
1697 PL_regtill = startpos+minend;
1698
1699 /* We start without call_cc context. */
1700 PL_reg_call_cc = 0;
1701
1702 /* If there is a "must appear" string, look for it. */
1703 s = startpos;
1704
1705 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1706 MAGIC *mg;
1707
1708 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1709 PL_reg_ganch = startpos;
1710 else if (sv && SvTYPE(sv) >= SVt_PVMG
1711 && SvMAGIC(sv)
14befaf4
DM
1712 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1713 && mg->mg_len >= 0) {
6eb5f6b9
JH
1714 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1715 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1716 if (s > PL_reg_ganch)
6eb5f6b9
JH
1717 goto phooey;
1718 s = PL_reg_ganch;
1719 }
1720 }
1721 else /* pos() not defined */
1722 PL_reg_ganch = strbeg;
1723 }
1724
33b8afdf 1725 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1726 re_scream_pos_data d;
1727
1728 d.scream_olds = &scream_olds;
1729 d.scream_pos = &scream_pos;
1730 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1731 if (!s) {
a3621e74 1732 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1733 goto phooey; /* not present */
3fa9c3d7 1734 }
6eb5f6b9
JH
1735 }
1736
a3621e74 1737 DEBUG_EXECUTE_r({
1df70142
AL
1738 const char * const s0 = UTF
1739 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1740 UNI_DISPLAY_REGEX)
1741 : prog->precomp;
1742 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1743 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1744 UNI_DISPLAY_REGEX) : startpos;
1df70142 1745 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1746 if (!PL_colorset)
1747 reginitcolors();
1748 PerlIO_printf(Perl_debug_log,
a0288114 1749 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1750 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1751 len0, len0, s0,
2a782b5b 1752 PL_colors[1],
9e55ce06 1753 len0 > 60 ? "..." : "",
2a782b5b 1754 PL_colors[0],
9e55ce06
JH
1755 (int)(len1 > 60 ? 60 : len1),
1756 s1, PL_colors[1],
1757 (len1 > 60 ? "..." : "")
2a782b5b
JH
1758 );
1759 });
6eb5f6b9
JH
1760
1761 /* Simplest case: anchored match need be tried only once. */
1762 /* [unless only anchor is BOL and multiline is set] */
1763 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1764 if (s == startpos && regtry(prog, startpos))
1765 goto got_it;
7fba1cd6 1766 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1767 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1768 {
1769 char *end;
1770
1771 if (minlen)
1772 dontbother = minlen - 1;
1aa99e6b 1773 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1774 /* for multiline we only have to try after newlines */
33b8afdf 1775 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1776 if (s == startpos)
1777 goto after_try;
1778 while (1) {
1779 if (regtry(prog, s))
1780 goto got_it;
1781 after_try:
1782 if (s >= end)
1783 goto phooey;
1784 if (prog->reganch & RE_USE_INTUIT) {
1785 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1786 if (!s)
1787 goto phooey;
1788 }
1789 else
1790 s++;
1791 }
1792 } else {
1793 if (s > startpos)
1794 s--;
1795 while (s < end) {
1796 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1797 if (regtry(prog, s))
1798 goto got_it;
1799 }
1800 }
1801 }
1802 }
1803 goto phooey;
1804 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1805 if (regtry(prog, PL_reg_ganch))
1806 goto got_it;
1807 goto phooey;
1808 }
1809
1810 /* Messy cases: unanchored match. */
33b8afdf 1811 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1812 /* we have /x+whatever/ */
1813 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1814 char ch;
bf93d4cc
GS
1815#ifdef DEBUGGING
1816 int did_match = 0;
1817#endif
33b8afdf
JH
1818 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1819 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1820 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1821
1aa99e6b 1822 if (do_utf8) {
6eb5f6b9
JH
1823 while (s < strend) {
1824 if (*s == ch) {
a3621e74 1825 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1826 if (regtry(prog, s)) goto got_it;
1827 s += UTF8SKIP(s);
1828 while (s < strend && *s == ch)
1829 s += UTF8SKIP(s);
1830 }
1831 s += UTF8SKIP(s);
1832 }
1833 }
1834 else {
1835 while (s < strend) {
1836 if (*s == ch) {
a3621e74 1837 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1838 if (regtry(prog, s)) goto got_it;
1839 s++;
1840 while (s < strend && *s == ch)
1841 s++;
1842 }
1843 s++;
1844 }
1845 }
a3621e74 1846 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1847 PerlIO_printf(Perl_debug_log,
b7953727
JH
1848 "Did not find anchored character...\n")
1849 );
6eb5f6b9 1850 }
33b8afdf
JH
1851 else if (prog->anchored_substr != Nullsv
1852 || prog->anchored_utf8 != Nullsv
1853 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1854 && prog->float_max_offset < strend - s)) {
1855 SV *must;
1856 I32 back_max;
1857 I32 back_min;
1858 char *last;
6eb5f6b9 1859 char *last1; /* Last position checked before */
bf93d4cc
GS
1860#ifdef DEBUGGING
1861 int did_match = 0;
1862#endif
33b8afdf
JH
1863 if (prog->anchored_substr || prog->anchored_utf8) {
1864 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1865 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1866 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1867 back_max = back_min = prog->anchored_offset;
1868 } else {
1869 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1870 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1871 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1872 back_max = prog->float_max_offset;
1873 back_min = prog->float_min_offset;
1874 }
1875 if (must == &PL_sv_undef)
1876 /* could not downgrade utf8 check substring, so must fail */
1877 goto phooey;
1878
1879 last = HOP3c(strend, /* Cannot start after this */
1880 -(I32)(CHR_SVLEN(must)
1881 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1882
1883 if (s > PL_bostr)
1884 last1 = HOPc(s, -1);
1885 else
1886 last1 = s - 1; /* bogus */
1887
a0288114 1888 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1889 check_substr==must. */
1890 scream_pos = -1;
1891 dontbother = end_shift;
1892 strend = HOPc(strend, -dontbother);
1893 while ( (s <= last) &&
9041c2e3 1894 ((flags & REXEC_SCREAM)
1aa99e6b 1895 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1896 end_shift, &scream_pos, 0))
1aa99e6b 1897 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1898 (unsigned char*)strend, must,
7fba1cd6 1899 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1900 /* we may be pointing at the wrong string */
1901 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1902 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1903 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1904 if (HOPc(s, -back_max) > last1) {
1905 last1 = HOPc(s, -back_min);
1906 s = HOPc(s, -back_max);
1907 }
1908 else {
1909 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1910
1911 last1 = HOPc(s, -back_min);
1912 s = t;
1913 }
1aa99e6b 1914 if (do_utf8) {
6eb5f6b9
JH
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1917 goto got_it;
1918 s += UTF8SKIP(s);
1919 }
1920 }
1921 else {
1922 while (s <= last1) {
1923 if (regtry(prog, s))
1924 goto got_it;
1925 s++;
1926 }
1927 }
1928 }
a3621e74 1929 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1930 PerlIO_printf(Perl_debug_log,
a0288114 1931 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1932 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1933 ? "anchored" : "floating"),
1934 PL_colors[0],
1935 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1936 SvPVX_const(must),
b7953727
JH
1937 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1938 );
6eb5f6b9
JH
1939 goto phooey;
1940 }
155aba94 1941 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1942 if (minlen) {
1943 I32 op = (U8)OP(prog->regstclass);
66e933ab 1944 /* don't bother with what can't match */
f14c76ed
RGS
1945 if (PL_regkind[op] != EXACT && op != CANY)
1946 strend = HOPc(strend, -(minlen - 1));
1947 }
a3621e74 1948 DEBUG_EXECUTE_r({
ffc61ed2 1949 SV *prop = sv_newmortal();
cfd0369c
NC
1950 const char *s0;
1951 const char *s1;
9e55ce06
JH
1952 int len0;
1953 int len1;
1954
ffc61ed2 1955 regprop(prop, c);
9e55ce06 1956 s0 = UTF ?
3f7c398e 1957 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1958 UNI_DISPLAY_REGEX) :
cfd0369c 1959 SvPVX_const(prop);
9e55ce06
JH
1960 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1961 s1 = UTF ?
c728cb41 1962 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1963 len1 = UTF ? SvCUR(dsv1) : strend - s;
1964 PerlIO_printf(Perl_debug_log,
a0288114 1965 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1966 len0, len0, s0,
1967 len1, len1, s1);
ffc61ed2 1968 });
06b5626a 1969 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1970 goto got_it;
a3621e74 1971 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1972 }
1973 else {
1974 dontbother = 0;
33b8afdf
JH
1975 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1976 /* Trim the end. */
d6a28714 1977 char *last;
33b8afdf
JH
1978 SV* float_real;
1979
1980 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1981 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1982 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1983
1984 if (flags & REXEC_SCREAM) {
33b8afdf 1985 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1986 end_shift, &scream_pos, 1); /* last one */
1987 if (!last)
ffc61ed2 1988 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1989 /* we may be pointing at the wrong string */
1990 else if (RX_MATCH_COPIED(prog))
3f7c398e 1991 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1992 }
d6a28714
JH
1993 else {
1994 STRLEN len;
cfd0369c 1995 const char * const little = SvPV_const(float_real, len);
d6a28714 1996
33b8afdf 1997 if (SvTAIL(float_real)) {
d6a28714
JH
1998 if (memEQ(strend - len + 1, little, len - 1))
1999 last = strend - len + 1;
7fba1cd6 2000 else if (!multiline)
9041c2e3 2001 last = memEQ(strend - len, little, len)
d6a28714 2002 ? strend - len : Nullch;
b8c5462f 2003 else
d6a28714
JH
2004 goto find_last;
2005 } else {
2006 find_last:
9041c2e3 2007 if (len)
d6a28714 2008 last = rninstr(s, strend, little, little + len);
b8c5462f 2009 else
a0288114 2010 last = strend; /* matching "$" */
b8c5462f 2011 }
b8c5462f 2012 }
bf93d4cc 2013 if (last == NULL) {
a3621e74 2014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2015 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2016 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2017 goto phooey; /* Should not happen! */
2018 }
d6a28714
JH
2019 dontbother = strend - last + prog->float_min_offset;
2020 }
2021 if (minlen && (dontbother < minlen))
2022 dontbother = minlen - 1;
2023 strend -= dontbother; /* this one's always in bytes! */
2024 /* We don't know much -- general case. */
1aa99e6b 2025 if (do_utf8) {
d6a28714
JH
2026 for (;;) {
2027 if (regtry(prog, s))
2028 goto got_it;
2029 if (s >= strend)
2030 break;
b8c5462f 2031 s += UTF8SKIP(s);
d6a28714
JH
2032 };
2033 }
2034 else {
2035 do {
2036 if (regtry(prog, s))
2037 goto got_it;
2038 } while (s++ < strend);
2039 }
2040 }
2041
2042 /* Failure. */
2043 goto phooey;
2044
2045got_it:
2046 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2047
2048 if (PL_reg_eval_set) {
2049 /* Preserve the current value of $^R */
2050 if (oreplsv != GvSV(PL_replgv))
2051 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2052 restored, the value remains
2053 the same. */
acfe0abc 2054 restore_pos(aTHX_ 0);
d6a28714
JH
2055 }
2056
2057 /* make sure $`, $&, $', and $digit will work later */
2058 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2059 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2060 if (flags & REXEC_COPY_STR) {
2061 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2062#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2063 if ((SvIsCOW(sv)
2064 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2065 if (DEBUG_C_TEST) {
2066 PerlIO_printf(Perl_debug_log,
2067 "Copy on write: regexp capture, type %d\n",
2068 (int) SvTYPE(sv));
2069 }
2070 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2071 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2072 assert (SvPOKp(prog->saved_copy));
2073 } else
2074#endif
2075 {
2076 RX_MATCH_COPIED_on(prog);
2077 s = savepvn(strbeg, i);
2078 prog->subbeg = s;
2079 }
d6a28714 2080 prog->sublen = i;
d6a28714
JH
2081 }
2082 else {
2083 prog->subbeg = strbeg;
2084 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2085 }
2086 }
9041c2e3 2087
d6a28714
JH
2088 return 1;
2089
2090phooey:
a3621e74 2091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2092 PL_colors[4], PL_colors[5]));
d6a28714 2093 if (PL_reg_eval_set)
acfe0abc 2094 restore_pos(aTHX_ 0);
d6a28714
JH
2095 return 0;
2096}
2097
2098/*
2099 - regtry - try match at specific point
2100 */
2101STATIC I32 /* 0 failure, 1 success */
2102S_regtry(pTHX_ regexp *prog, char *startpos)
2103{
d6a28714
JH
2104 register I32 i;
2105 register I32 *sp;
2106 register I32 *ep;
2107 CHECKPOINT lastcp;
a3621e74 2108 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2109
02db2b7b
IZ
2110#ifdef DEBUGGING
2111 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2112#endif
d6a28714
JH
2113 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2114 MAGIC *mg;
2115
2116 PL_reg_eval_set = RS_init;
a3621e74 2117 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2118 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2119 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2120 ));
e8347627 2121 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2122 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2123 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2124 SAVETMPS;
2125 /* Apparently this is not needed, judging by wantarray. */
e8347627 2126 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2127 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2128
2129 if (PL_reg_sv) {
2130 /* Make $_ available to executed code. */
2131 if (PL_reg_sv != DEFSV) {
59f00321 2132 SAVE_DEFSV;
d6a28714 2133 DEFSV = PL_reg_sv;
b8c5462f 2134 }
d6a28714 2135
9041c2e3 2136 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2137 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2138 /* prepare for quick setting of pos */
14befaf4
DM
2139 sv_magic(PL_reg_sv, (SV*)0,
2140 PERL_MAGIC_regex_global, Nullch, 0);
2141 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2142 mg->mg_len = -1;
b8c5462f 2143 }
d6a28714
JH
2144 PL_reg_magic = mg;
2145 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2146 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2147 }
09687e5a 2148 if (!PL_reg_curpm) {
a02a5408 2149 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2150#ifdef USE_ITHREADS
2151 {
2152 SV* repointer = newSViv(0);
577e12cc 2153 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2154 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2155 av_push(PL_regex_padav,repointer);
2156 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2157 PL_regex_pad = AvARRAY(PL_regex_padav);
2158 }
2159#endif
2160 }
aaa362c4 2161 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2162 PL_reg_oldcurpm = PL_curpm;
2163 PL_curpm = PL_reg_curpm;
2164 if (RX_MATCH_COPIED(prog)) {
2165 /* Here is a serious problem: we cannot rewrite subbeg,
2166 since it may be needed if this match fails. Thus
2167 $` inside (?{}) could fail... */
2168 PL_reg_oldsaved = prog->subbeg;
2169 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2170#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2171 PL_nrs = prog->saved_copy;
2172#endif
d6a28714
JH
2173 RX_MATCH_COPIED_off(prog);
2174 }
2175 else
2176 PL_reg_oldsaved = Nullch;
2177 prog->subbeg = PL_bostr;
2178 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2179 }
973dddac 2180 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2181 PL_reginput = startpos;
2182 PL_regstartp = prog->startp;
2183 PL_regendp = prog->endp;
2184 PL_reglastparen = &prog->lastparen;
a01268b5 2185 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2186 prog->lastparen = 0;
03994de8 2187 prog->lastcloseparen = 0;
d6a28714 2188 PL_regsize = 0;
a3621e74 2189 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2190 if (PL_reg_start_tmpl <= prog->nparens) {
2191 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2192 if(PL_reg_start_tmp)
2193 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2194 else
a02a5408 2195 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2196 }
2197
2198 /* XXXX What this code is doing here?!!! There should be no need
2199 to do this again and again, PL_reglastparen should take care of
3dd2943c 2200 this! --ilya*/
dafc8851
JH
2201
2202 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2203 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2204 * PL_reglastparen), is not needed at all by the test suite
2205 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2206 * enough, for building DynaLoader, or otherwise this
2207 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2208 * will happen. Meanwhile, this code *is* needed for the
2209 * above-mentioned test suite tests to succeed. The common theme
2210 * on those tests seems to be returning null fields from matches.
2211 * --jhi */
dafc8851 2212#if 1
d6a28714
JH
2213 sp = prog->startp;
2214 ep = prog->endp;
2215 if (prog->nparens) {
eb160463 2216 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2217 *++sp = -1;
2218 *++ep = -1;
2219 }
2220 }
dafc8851 2221#endif
02db2b7b 2222 REGCP_SET(lastcp);
d6a28714
JH
2223 if (regmatch(prog->program + 1)) {
2224 prog->endp[0] = PL_reginput - PL_bostr;
2225 return 1;
2226 }
02db2b7b 2227 REGCP_UNWIND(lastcp);
d6a28714
JH
2228 return 0;
2229}
2230
02db2b7b
IZ
2231#define RE_UNWIND_BRANCH 1
2232#define RE_UNWIND_BRANCHJ 2
2233
2234union re_unwind_t;
2235
2236typedef struct { /* XX: makes sense to enlarge it... */
2237 I32 type;
2238 I32 prev;
2239 CHECKPOINT lastcp;
2240} re_unwind_generic_t;
2241
2242typedef struct {
2243 I32 type;
2244 I32 prev;
2245 CHECKPOINT lastcp;
2246 I32 lastparen;
2247 regnode *next;
2248 char *locinput;
2249 I32 nextchr;
2250#ifdef DEBUGGING
2251 int regindent;
2252#endif
2253} re_unwind_branch_t;
2254
2255typedef union re_unwind_t {
2256 I32 type;
2257 re_unwind_generic_t generic;
2258 re_unwind_branch_t branch;
2259} re_unwind_t;
2260
8ba1375e
MJD
2261#define sayYES goto yes
2262#define sayNO goto no
e0f9d4a8 2263#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2264#define sayYES_FINAL goto yes_final
2265#define sayYES_LOUD goto yes_loud
2266#define sayNO_FINAL goto no_final
2267#define sayNO_SILENT goto do_no
2268#define saySAME(x) if (x) goto yes; else goto no
2269
3ab3c9b4
HS
2270#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2271#define POSCACHE_SEEN 1 /* we know what we're caching */
2272#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2273#define CACHEsayYES STMT_START { \
2274 if (cache_offset | cache_bit) { \
2275 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2276 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2277 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2278 /* cache records failure, but this is success */ \
2279 DEBUG_r( \
2280 PerlIO_printf(Perl_debug_log, \
2281 "%*s (remove success from failure cache)\n", \
2282 REPORT_CODE_OFF+PL_regindent*2, "") \
2283 ); \
2284 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2285 } \
2286 } \
2287 sayYES; \
2288} STMT_END
2289#define CACHEsayNO STMT_START { \
2290 if (cache_offset | cache_bit) { \
2291 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2292 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2293 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2294 /* cache records success, but this is failure */ \
2295 DEBUG_r( \
2296 PerlIO_printf(Perl_debug_log, \
2297 "%*s (remove failure from success cache)\n", \
2298 REPORT_CODE_OFF+PL_regindent*2, "") \
2299 ); \
2300 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2301 } \
2302 } \
2303 sayNO; \
2304} STMT_END
2305
a3621e74
YO
2306/* this is used to determine how far from the left messages like
2307 'failed...' are printed. Currently 29 makes these messages line
2308 up with the opcode they refer to. Earlier perls used 25 which
2309 left these messages outdented making reviewing a debug output
2310 quite difficult.
2311*/
2312#define REPORT_CODE_OFF 29
2313
2314
2315/* Make sure there is a test for this +1 options in re_tests */
2316#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2317
2318#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2319 if ( trie->states[ state ].wordnum ) { \
2320 if ( !accepted ) { \
2321 ENTER; \
2322 SAVETMPS; \
2323 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2324 sv_accept_buff=NEWSV( 1234, \
2325 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2326 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2327 SvPOK_on( sv_accept_buff ); \
2328 sv_2mortal( sv_accept_buff ); \
2329 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2330 } else { \
2331 if ( accepted >= bufflen ) { \
2332 bufflen *= 2; \
2333 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2334 bufflen * sizeof(reg_trie_accepted) ); \
2335 } \
2336 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2337 + sizeof( reg_trie_accepted ) ); \
2338 } \
2339 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2340 accept_buff[ accepted ].endpos = uc; \
2341 ++accepted; \
2342 } } STMT_END
2343
2344#define TRIE_HANDLE_CHAR STMT_START { \
2345 if ( uvc < 256 ) { \
2346 charid = trie->charmap[ uvc ]; \
2347 } else { \
2348 charid = 0; \
2349 if( trie->widecharmap ) { \
2350 SV** svpp = (SV**)NULL; \
2351 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2352 sizeof( UV ), 0 ); \
2353 if ( svpp ) { \
2354 charid = (U16)SvIV( *svpp ); \
2355 } \
2356 } \
2357 } \
2358 if ( charid && \
cc601c31
YO
2359 ( base + charid > trie->uniquecharcount ) && \
2360 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2361 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2362 { \
2363 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2364 } else { \
2365 state = 0; \
2366 } \
2367 uc += len; \
2368 } STMT_END
8ba1375e 2369
d6a28714
JH
2370/*
2371 - regmatch - main matching routine
2372 *
2373 * Conceptually the strategy is simple: check to see whether the current
2374 * node matches, call self recursively to see whether the rest matches,
2375 * and then act accordingly. In practice we make some effort to avoid
2376 * recursion, in particular by going through "ordinary" nodes (that don't
2377 * need to know whether the rest of the match failed) by a loop instead of
2378 * by recursion.
2379 */
2380/* [lwall] I've hoisted the register declarations to the outer block in order to
2381 * maybe save a little bit of pushing and popping on the stack. It also takes
2382 * advantage of machines that use a register save mask on subroutine entry.
2383 */
2384STATIC I32 /* 0 failure, 1 success */
2385S_regmatch(pTHX_ regnode *prog)
2386{
27da23d5 2387 dVAR;
d6a28714
JH
2388 register regnode *scan; /* Current node. */
2389 regnode *next; /* Next node. */
2390 regnode *inner; /* Next node in internal branch. */
2391 register I32 nextchr; /* renamed nextchr - nextchar colides with
2392 function of same name */
2393 register I32 n; /* no or next */
b7953727
JH
2394 register I32 ln = 0; /* len or last */
2395 register char *s = Nullch; /* operand or save */
d6a28714 2396 register char *locinput = PL_reginput;
b7953727 2397 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2398 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2399 I32 unwind = 0;
a3621e74
YO
2400
2401 /* used by the trie code */
ab74612d
NC
2402 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2403 reg_trie_accepted *accept_buff = 0; /* "" */
2404 reg_trie_data *trie; /* what trie are we using right now */
2405 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2406
b7953727 2407#if 0
02db2b7b 2408 I32 firstcp = PL_savestack_ix;
b7953727 2409#endif
0d46e09a 2410 register const bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2411#ifdef DEBUGGING
ce333219
JH
2412 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2413 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2414 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2415
ab74612d 2416 SV *re_debug_flags = NULL;
2a782b5b 2417#endif
041457d9 2418 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
02db2b7b 2419
a3621e74
YO
2420 GET_RE_DEBUG_FLAGS;
2421
d6a28714
JH
2422#ifdef DEBUGGING
2423 PL_regindent++;
2424#endif
2425
a3621e74 2426
d6a28714
JH
2427 /* Note that nextchr is a byte even in UTF */
2428 nextchr = UCHARAT(locinput);
2429 scan = prog;
2430 while (scan != NULL) {
8ba1375e 2431
a3621e74 2432 DEBUG_EXECUTE_r( {
d6a28714 2433 SV *prop = sv_newmortal();
1df70142
AL
2434 const int docolor = *PL_colors[0];
2435 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2436 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2437 /* The part of the string before starttry has one color
2438 (pref0_len chars), between starttry and current
2439 position another one (pref_len - pref0_len chars),
2440 after the current position the third one.
2441 We assume that pref0_len <= pref_len, otherwise we
2442 decrease pref0_len. */
9041c2e3 2443 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2444 ? (5 + taill) - l : locinput - PL_bostr;
2445 int pref0_len;
d6a28714 2446
df1ffd02 2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2448 pref_len++;
2449 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2450 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2451 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2452 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2453 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2454 l--;
d6a28714
JH
2455 if (pref0_len < 0)
2456 pref0_len = 0;
2457 if (pref0_len > pref_len)
2458 pref0_len = pref_len;
2459 regprop(prop, scan);
2a782b5b 2460 {
1df70142 2461 const char * const s0 =
f14c76ed 2462 do_utf8 && OP(scan) != CANY ?
2a782b5b 2463 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2464 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2465 locinput - pref_len;
1df70142
AL
2466 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2467 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2468 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2469 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2470 locinput - pref_len + pref0_len;
1df70142
AL
2471 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2472 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2473 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2474 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2475 locinput;
1df70142 2476 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2477 PerlIO_printf(Perl_debug_log,
2478 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2479 (IV)(locinput - PL_bostr),
2480 PL_colors[4],
2481 len0, s0,
2482 PL_colors[5],
2483 PL_colors[2],
2484 len1, s1,
2485 PL_colors[3],
2486 (docolor ? "" : "> <"),
2487 PL_colors[0],
2488 len2, s2,
2489 PL_colors[1],
2490 15 - l - pref_len + 1,
2491 "",
2492 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2493 SvPVX_const(prop));
2a782b5b
JH
2494 }
2495 });
d6a28714
JH
2496
2497 next = scan + NEXT_OFF(scan);
2498 if (next == scan)
2499 next = NULL;
2500
2501 switch (OP(scan)) {
2502 case BOL:
7fba1cd6 2503 if (locinput == PL_bostr)
d6a28714
JH
2504 {
2505 /* regtill = regbol; */
b8c5462f
JH
2506 break;
2507 }
d6a28714
JH
2508 sayNO;
2509 case MBOL:
12d33761
HS
2510 if (locinput == PL_bostr ||
2511 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2512 {
b8c5462f
JH
2513 break;
2514 }
d6a28714
JH
2515 sayNO;
2516 case SBOL:
c2a73568 2517 if (locinput == PL_bostr)
b8c5462f 2518 break;
d6a28714
JH
2519 sayNO;
2520 case GPOS:
2521 if (locinput == PL_reg_ganch)
2522 break;
2523 sayNO;
2524 case EOL:
d6a28714
JH
2525 goto seol;
2526 case MEOL:
d6a28714 2527 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2528 sayNO;
b8c5462f 2529 break;
d6a28714
JH
2530 case SEOL:
2531 seol:
2532 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2533 sayNO;
d6a28714 2534 if (PL_regeol - locinput > 1)
b8c5462f 2535 sayNO;
b8c5462f 2536 break;
d6a28714
JH
2537 case EOS:
2538 if (PL_regeol != locinput)
b8c5462f 2539 sayNO;
d6a28714 2540 break;
ffc61ed2 2541 case SANY:
d6a28714 2542 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2543 sayNO;
f33976b4
DB
2544 if (do_utf8) {
2545 locinput += PL_utf8skip[nextchr];
2546 if (locinput > PL_regeol)
2547 sayNO;
2548 nextchr = UCHARAT(locinput);
2549 }
2550 else
2551 nextchr = UCHARAT(++locinput);
2552 break;
2553 case CANY:
2554 if (!nextchr && locinput >= PL_regeol)
2555 sayNO;
b8c5462f 2556 nextchr = UCHARAT(++locinput);
a0d0e21e 2557 break;
ffc61ed2 2558 case REG_ANY:
1aa99e6b
IH
2559 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2560 sayNO;
2561 if (do_utf8) {
b8c5462f 2562 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2563 if (locinput > PL_regeol)
2564 sayNO;
a0ed51b3 2565 nextchr = UCHARAT(locinput);
a0ed51b3 2566 }
1aa99e6b
IH
2567 else
2568 nextchr = UCHARAT(++locinput);
a0ed51b3 2569 break;
a3621e74
YO
2570
2571
2572
2573 /*
2574 traverse the TRIE keeping track of all accepting states
2575 we transition through until we get to a failing node.
2576
2577 we use two slightly different pieces of code to handle
2578 the traversal depending on whether its case sensitive or
2579 not. we reuse the accept code however. (this should probably
2580 be turned into a macro.)
2581
2582 */
2583 case TRIEF:
2584 case TRIEFL:
2585 {
a3621e74
YO
2586 U8 *uc = ( U8* )locinput;
2587 U32 state = 1;
2588 U16 charid = 0;
2589 U32 base = 0;
2590 UV uvc = 0;
2591 STRLEN len = 0;
2592 STRLEN foldlen = 0;
a3621e74
YO
2593 U8 *uscan = (U8*)NULL;
2594 STRLEN bufflen=0;
2595 accepted = 0;
2596
2597 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2598
2599 while ( state && uc <= (U8*)PL_regeol ) {
2600
2601 TRIE_CHECK_STATE_IS_ACCEPTING;
2602
2603 base = trie->states[ state ].trans.base;
2604
2605 DEBUG_TRIE_EXECUTE_r(
2606 PerlIO_printf( Perl_debug_log,
e4584336 2607 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2608 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2609 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2610 );
2611
2612 if ( base ) {
2613
108bb1ad 2614 if ( do_utf8 ) {
a3621e74
YO
2615 if ( foldlen>0 ) {
2616 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2617 foldlen -= len;
2618 uscan += len;
2619 len=0;
2620 } else {
1df70142 2621 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2622 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2623 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2624 foldlen -= UNISKIP( uvc );
2625 uscan = foldbuf + UNISKIP( uvc );
2626 }
2627 } else {
e4584336 2628 uvc = (UV)*uc;
a3621e74
YO
2629 len = 1;
2630 }
2631
2632 TRIE_HANDLE_CHAR;
2633
2634 } else {
2635 state = 0;
2636 }
2637 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2638 PerlIO_printf( Perl_debug_log,
2639 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2640 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2641 );
2642 }
2643 if ( !accepted ) {
2644 sayNO;
2645 } else {
2646 goto TrieAccept;
2647 }
2648 }
2649 /* unreached codepoint: we jump into the middle of the next case
2650 from previous if blocks */
2651 case TRIE:
2652 {
a3621e74
YO
2653 U8 *uc = (U8*)locinput;
2654 U32 state = 1;
2655 U16 charid = 0;
2656 U32 base = 0;
2657 UV uvc = 0;
2658 STRLEN len = 0;
2659 STRLEN bufflen = 0;
2660 accepted = 0;
2661
2662 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2663
2664 while ( state && uc <= (U8*)PL_regeol ) {
2665
2666 TRIE_CHECK_STATE_IS_ACCEPTING;
2667
2668 base = trie->states[ state ].trans.base;
2669
2670 DEBUG_TRIE_EXECUTE_r(
2671 PerlIO_printf( Perl_debug_log,
e4584336 2672 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2673 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2674 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2675 );
2676
2677 if ( base ) {
2678
108bb1ad 2679 if ( do_utf8 ) {
a3621e74
YO
2680 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2681 } else {
2682 uvc = (U32)*uc;
2683 len = 1;
2684 }
2685
2686 TRIE_HANDLE_CHAR;
2687
2688 } else {
2689 state = 0;
2690 }
2691 DEBUG_TRIE_EXECUTE_r(
2692 PerlIO_printf( Perl_debug_log,
e4584336
RB
2693 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2694 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2695 );
2696 }
2697 if ( !accepted ) {
2698 sayNO;
2699 }
2700 }
2701
2702
2703 /*
2704 There was at least one accepting state that we
2705 transitioned through. Presumably the number of accepting
2706 states is going to be low, typically one or two. So we
2707 simply scan through to find the one with lowest wordnum.
2708 Once we find it, we swap the last state into its place
2709 and decrement the size. We then try to match the rest of
2710 the pattern at the point where the word ends, if we
2711 succeed then we end the loop, otherwise the loop
2712 eventually terminates once all of the accepting states
2713 have been tried.
2714 */
2715 TrieAccept:
2716 {
2717 int gotit = 0;
2718
2719 if ( accepted == 1 ) {
2720 DEBUG_EXECUTE_r({
2721 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2722 PerlIO_printf( Perl_debug_log,
2723 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2724 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2725 accept_buff[ 0 ].wordnum,
cfd0369c 2726 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2727 PL_colors[5] );
2728 });
cc601c31 2729 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2730 /* in this case we free tmps/leave before we call regmatch
2731 as we wont be using accept_buff again. */
2732 FREETMPS;
2733 LEAVE;
2734 gotit = regmatch( scan + NEXT_OFF( scan ) );
2735 } else {
2736 DEBUG_EXECUTE_r(
e4584336
RB
2737 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2738 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2739 PL_colors[5] );
2740 );
2741 while ( !gotit && accepted-- ) {
2742 U32 best = 0;
2743 U32 cur;
2744 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2745 DEBUG_TRIE_EXECUTE_r(
2746 PerlIO_printf( Perl_debug_log,
2747 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2748 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2749 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2750 accept_buff[ cur ].wordnum, PL_colors[5] );
2751 );
a3621e74
YO
2752
2753 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2754 best = cur;
2755 }
2756 DEBUG_EXECUTE_r({
2757 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2758 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2759 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2760 accept_buff[best].wordnum,
cfd0369c 2761 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2762 PL_colors[5] );
2763 });
2764 if ( best<accepted ) {
2765 reg_trie_accepted tmp = accept_buff[ best ];
2766 accept_buff[ best ] = accept_buff[ accepted ];
2767 accept_buff[ accepted ] = tmp;
2768 best = accepted;
2769 }
cc601c31 2770 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2771
2772 /*
2773 as far as I can tell we only need the SAVETMPS/FREETMPS
2774 for re's with EVAL in them but I'm leaving them in for
2775 all until I can be sure.
2776 */
2777 SAVETMPS;
2778 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2779 FREETMPS;
2780 }
2781 FREETMPS;
2782 LEAVE;
2783 }
2784
2785 if ( gotit ) {
2786 sayYES;
2787 } else {
2788 sayNO;
2789 }
2790 }
2791 /* unreached codepoint */
d6a28714 2792 case EXACT:
cd439c50
IZ
2793 s = STRING(scan);
2794 ln = STR_LEN(scan);
eb160463 2795 if (do_utf8 != UTF) {
bc517b45 2796 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2797 char *l = locinput;
a3b680e6 2798 const char *e = s + ln;
a72c7584 2799
5ff6fc6d
JH
2800 if (do_utf8) {
2801 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2802 while (s < e) {
a3b680e6 2803 STRLEN ulen;
1aa99e6b 2804 if (l >= PL_regeol)
5ff6fc6d
JH
2805 sayNO;
2806 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2807 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2808 uniflags))
5ff6fc6d 2809 sayNO;
bc517b45 2810 l += ulen;
5ff6fc6d 2811 s ++;
1aa99e6b 2812 }
5ff6fc6d
JH
2813 }
2814 else {
2815 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2816 while (s < e) {
a3b680e6 2817 STRLEN ulen;
1aa99e6b
IH
2818 if (l >= PL_regeol)
2819 sayNO;
5ff6fc6d 2820 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2821 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2822 uniflags))
1aa99e6b 2823 sayNO;
bc517b45 2824 s += ulen;
a72c7584 2825 l ++;
1aa99e6b 2826 }
5ff6fc6d 2827 }
1aa99e6b
IH
2828 locinput = l;
2829 nextchr = UCHARAT(locinput);
2830 break;
2831 }
bc517b45 2832 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2833 /* Inline the first character, for speed. */
2834 if (UCHARAT(s) != nextchr)
2835 sayNO;
2836 if (PL_regeol - locinput < ln)
2837 sayNO;
2838 if (ln > 1 && memNE(s, locinput, ln))
2839 sayNO;
2840 locinput += ln;
2841 nextchr = UCHARAT(locinput);
2842 break;
2843 case EXACTFL:
b8c5462f
JH
2844 PL_reg_flags |= RF_tainted;
2845 /* FALL THROUGH */
d6a28714 2846 case EXACTF:
cd439c50
IZ
2847 s = STRING(scan);
2848 ln = STR_LEN(scan);
d6a28714 2849
d07ddd77
JH
2850 if (do_utf8 || UTF) {
2851 /* Either target or the pattern are utf8. */
d6a28714 2852 char *l = locinput;
d07ddd77 2853 char *e = PL_regeol;
bc517b45 2854
eb160463 2855 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2856 l, &e, 0, do_utf8)) {
5486206c
JH
2857 /* One more case for the sharp s:
2858 * pack("U0U*", 0xDF) =~ /ss/i,
2859 * the 0xC3 0x9F are the UTF-8
2860 * byte sequence for the U+00DF. */
2861 if (!(do_utf8 &&
2862 toLOWER(s[0]) == 's' &&
2863 ln >= 2 &&
2864 toLOWER(s[1]) == 's' &&
2865 (U8)l[0] == 0xC3 &&
2866 e - l >= 2 &&
2867 (U8)l[1] == 0x9F))
2868 sayNO;
2869 }
d07ddd77
JH
2870 locinput = e;
2871 nextchr = UCHARAT(locinput);
2872 break;
a0ed51b3 2873 }
d6a28714 2874
bc517b45
JH
2875 /* Neither the target and the pattern are utf8. */
2876
d6a28714
JH
2877 /* Inline the first character, for speed. */
2878 if (UCHARAT(s) != nextchr &&
2879 UCHARAT(s) != ((OP(scan) == EXACTF)
2880 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2881 sayNO;
d6a28714 2882 if (PL_regeol - locinput < ln)
b8c5462f 2883 sayNO;
d6a28714
JH
2884 if (ln > 1 && (OP(scan) == EXACTF
2885 ? ibcmp(s, locinput, ln)
2886 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2887 sayNO;
d6a28714
JH
2888 locinput += ln;
2889 nextchr = UCHARAT(locinput);
a0d0e21e 2890 break;
d6a28714 2891 case ANYOF:
ffc61ed2 2892 if (do_utf8) {
9e55ce06
JH
2893 STRLEN inclasslen = PL_regeol - locinput;
2894
ba7b4546 2895 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2896 sayNO_ANYOF;
ffc61ed2
JH
2897 if (locinput >= PL_regeol)
2898 sayNO;
0f0076b4 2899 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2900 nextchr = UCHARAT(locinput);
e0f9d4a8 2901 break;
ffc61ed2
JH
2902 }
2903 else {
2904 if (nextchr < 0)
2905 nextchr = UCHARAT(locinput);
7d3e948e 2906 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2907 sayNO_ANYOF;
ffc61ed2
JH
2908 if (!nextchr && locinput >= PL_regeol)
2909 sayNO;
2910 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2911 break;
2912 }
2913 no_anyof:
2914 /* If we might have the case of the German sharp s
2915 * in a casefolding Unicode character class. */
2916
ebc501f0
JH
2917 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2918 locinput += SHARP_S_SKIP;
e0f9d4a8 2919 nextchr = UCHARAT(locinput);
ffc61ed2 2920 }
e0f9d4a8
JH
2921 else
2922 sayNO;
b8c5462f 2923 break;
d6a28714 2924 case ALNUML:
b8c5462f
JH
2925 PL_reg_flags |= RF_tainted;
2926 /* FALL THROUGH */
d6a28714 2927 case ALNUM:
b8c5462f 2928 if (!nextchr)
4633a7c4 2929 sayNO;
ffc61ed2 2930 if (do_utf8) {
1a4fad37 2931 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2932 if (!(OP(scan) == ALNUM
3568d838 2933 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2934 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2935 {
2936 sayNO;
a0ed51b3 2937 }
b8c5462f 2938 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2939 nextchr = UCHARAT(locinput);
2940 break;
2941 }
ffc61ed2 2942 if (!(OP(scan) == ALNUM
d6a28714 2943 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2944 sayNO;
b8c5462f 2945 nextchr = UCHARAT(++locinput);
a0d0e21e 2946 break;
d6a28714 2947 case NALNUML:
b8c5462f
JH
2948 PL_reg_flags |= RF_tainted;
2949 /* FALL THROUGH */
d6a28714
JH
2950 case NALNUM:
2951 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2952 sayNO;
ffc61ed2 2953 if (do_utf8) {
1a4fad37 2954 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2955 if (OP(scan) == NALNUM
3568d838 2956 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2957 : isALNUM_LC_utf8((U8*)locinput))
2958 {
b8c5462f 2959 sayNO;
d6a28714 2960 }
b8c5462f
JH
2961 locinput += PL_utf8skip[nextchr];
2962 nextchr = UCHARAT(locinput);
2963 break;
2964 }
ffc61ed2 2965 if (OP(scan) == NALNUM
d6a28714 2966 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2967 sayNO;
76e3520e 2968 nextchr = UCHARAT(++locinput);
a0d0e21e 2969 break;
d6a28714
JH
2970 case BOUNDL:
2971 case NBOUNDL:
3280af22 2972 PL_reg_flags |= RF_tainted;
bbce6d69 2973 /* FALL THROUGH */
d6a28714
JH
2974 case BOUND:
2975 case NBOUND:
2976 /* was last char in word? */
ffc61ed2 2977 if (do_utf8) {
12d33761
HS
2978 if (locinput == PL_bostr)
2979 ln = '\n';
ffc61ed2 2980 else {
a3b680e6 2981 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2982
b4f7163a 2983 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2984 }
2985 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2986 ln = isALNUM_uni(ln);
1a4fad37 2987 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2988 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2989 }
2990 else {
9041c2e3 2991 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2992 n = isALNUM_LC_utf8((U8*)locinput);
2993 }
a0ed51b3 2994 }
d6a28714 2995 else {
12d33761
HS
2996 ln = (locinput != PL_bostr) ?
2997 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2998 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2999 ln = isALNUM(ln);
3000 n = isALNUM(nextchr);
3001 }
3002 else {
3003 ln = isALNUM_LC(ln);
3004 n = isALNUM_LC(nextchr);
3005 }
d6a28714 3006 }
ffc61ed2
JH
3007 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3008 OP(scan) == BOUNDL))
3009 sayNO;
a0ed51b3 3010 break;
d6a28714 3011 case SPACEL:
3280af22 3012 PL_reg_flags |= RF_tainted;
bbce6d69 3013 /* FALL THROUGH */
d6a28714 3014 case SPACE:
9442cb0e 3015 if (!nextchr)
4633a7c4 3016 sayNO;
1aa99e6b 3017 if (do_utf8) {
fd400ab9 3018 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3019 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3020 if (!(OP(scan) == SPACE
3568d838 3021 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3022 : isSPACE_LC_utf8((U8*)locinput)))
3023 {
3024 sayNO;
3025 }
3026 locinput += PL_utf8skip[nextchr];
3027 nextchr = UCHARAT(locinput);
3028 break;
d6a28714 3029 }
ffc61ed2
JH
3030 if (!(OP(scan) == SPACE
3031 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3032 sayNO;
3033 nextchr = UCHARAT(++locinput);
3034 }
3035 else {
3036 if (!(OP(scan) == SPACE
3037 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3038 sayNO;
3039 nextchr = UCHARAT(++locinput);
a0ed51b3 3040 }
a0ed51b3 3041 break;
d6a28714 3042 case NSPACEL:
3280af22 3043 PL_reg_flags |= RF_tainted;
bbce6d69 3044 /* FALL THROUGH */
d6a28714 3045 case NSPACE:
9442cb0e 3046 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3047 sayNO;
1aa99e6b 3048 if (do_utf8) {
1a4fad37 3049 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3050 if (OP(scan) == NSPACE
3568d838 3051 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3052 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3053 {
3054 sayNO;
3055 }
3056 locinput += PL_utf8skip[nextchr];
3057 nextchr = UCHARAT(locinput);
3058 break;
a0ed51b3 3059 }
ffc61ed2 3060 if (OP(scan) == NSPACE
d6a28714 3061 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3062 sayNO;
76e3520e 3063 nextchr = UCHARAT(++locinput);
a0d0e21e 3064 break;
d6a28714 3065 case DIGITL:
a0ed51b3
LW
3066 PL_reg_flags |= RF_tainted;
3067 /* FALL THROUGH */
d6a28714 3068 case DIGIT:
9442cb0e 3069 if (!nextchr)
a0ed51b3 3070 sayNO;
1aa99e6b 3071 if (do_utf8) {
1a4fad37 3072 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3073 if (!(OP(scan) == DIGIT
3568d838 3074 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3075 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3076 {
a0ed51b3 3077 sayNO;
dfe13c55 3078 }
6f06b55f 3079 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3080 nextchr = UCHARAT(locinput);
3081 break;
3082 }
ffc61ed2 3083 if (!(OP(scan) == DIGIT
9442cb0e 3084 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3085 sayNO;
3086 nextchr = UCHARAT(++locinput);
3087 break;
d6a28714 3088 case NDIGITL:
b8c5462f
JH
3089 PL_reg_flags |= RF_tainted;
3090 /* FALL THROUGH */
d6a28714 3091 case NDIGIT:
9442cb0e 3092 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3093 sayNO;
1aa99e6b 3094 if (do_utf8) {
1a4fad37 3095 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3096 if (OP(scan) == NDIGIT
3568d838 3097 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3098 : isDIGIT_LC_utf8((U8*)locinput))
3099 {
a0ed51b3 3100 sayNO;
9442cb0e 3101 }
6f06b55f 3102 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3103 nextchr = UCHARAT(locinput);
3104 break;
3105 }
ffc61ed2 3106 if (OP(scan) == NDIGIT
9442cb0e 3107 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3108 sayNO;
3109 nextchr = UCHARAT(++locinput);
3110 break;
3111 case CLUMP:
b7c83a7e 3112 if (locinput >= PL_regeol)
a0ed51b3 3113 sayNO;
b7c83a7e 3114 if (do_utf8) {
1a4fad37 3115 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3116 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3117 sayNO;
3118 locinput += PL_utf8skip[nextchr];
3119 while (locinput < PL_regeol &&
3120 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3121 locinput += UTF8SKIP(locinput);
3122 if (locinput > PL_regeol)
3123 sayNO;
eb08e2da
JH
3124 }
3125 else
3126 locinput++;
a0ed51b3
LW
3127 nextchr = UCHARAT(locinput);
3128 break;
c8756f30 3129 case REFFL:
3280af22 3130 PL_reg_flags |= RF_tainted;
c8756f30 3131 /* FALL THROUGH */
c277df42 3132 case REF:
c8756f30 3133 case REFF:
c277df42 3134 n = ARG(scan); /* which paren pair */
cf93c79d 3135 ln = PL_regstartp[n];
2c2d71f5 3136 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3137 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3138 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3139 if (ln == PL_regendp[n])
a0d0e21e 3140 break;
a0ed51b3 3141
cf93c79d 3142 s = PL_bostr + ln;
1aa99e6b 3143 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3144 char *l = locinput;
a3b680e6 3145 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3146 /*
3147 * Note that we can't do the "other character" lookup trick as
3148 * in the 8-bit case (no pun intended) because in Unicode we
3149 * have to map both upper and title case to lower case.
3150 */
3151 if (OP(scan) == REFF) {
3152 while (s < e) {
a3b680e6
AL
3153 STRLEN ulen1, ulen2;
3154 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3155 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3156
a0ed51b3
LW
3157 if (l >= PL_regeol)
3158 sayNO;
a2a2844f
JH
3159 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3160 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3161 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3162 sayNO;
a2a2844f
JH
3163 s += ulen1;
3164 l += ulen2;
a0ed51b3
LW
3165 }
3166 }
3167 locinput = l;
3168 nextchr = UCHARAT(locinput);
3169 break;
3170 }
3171
a0d0e21e 3172 /* Inline the first character, for speed. */
76e3520e 3173 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3174 (OP(scan) == REF ||
3175 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3176 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3177 sayNO;
cf93c79d 3178 ln = PL_regendp[n] - ln;
3280af22 3179 if (locinput + ln > PL_regeol)
4633a7c4 3180 sayNO;
c8756f30
AK
3181 if (ln > 1 && (OP(scan) == REF
3182 ? memNE(s, locinput, ln)
3183 : (OP(scan) == REFF
3184 ? ibcmp(s, locinput, ln)
3185 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3186 sayNO;
a0d0e21e 3187 locinput += ln;
76e3520e 3188 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3189 break;
3190
3191 case NOTHING:
c277df42 3192 case TAIL:
a0d0e21e
LW
3193 break;
3194 case BACK:
3195 break;
c277df42
IZ
3196 case EVAL:
3197 {
3198 dSP;
533c011a 3199 OP_4tree *oop = PL_op;
3280af22 3200 COP *ocurcop = PL_curcop;
f3548bdc 3201 PAD *old_comppad;
c277df42 3202 SV *ret;
080c2dec 3203 struct regexp *oreg = PL_reg_re;
9041c2e3 3204
c277df42 3205 n = ARG(scan);
533c011a 3206 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3207 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3208 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3209 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3210
8e5e9ebe
RGS
3211 {
3212 SV **before = SP;
3213 CALLRUNOPS(aTHX); /* Scalar context. */
3214 SPAGAIN;
3215 if (SP == before)
075aa684 3216 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3217 else {
3218 ret = POPs;
3219 PUTBACK;
3220 }
3221 }
3222
0f5d15d6 3223 PL_op = oop;
f3548bdc 3224 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3225 PL_curcop = ocurcop;
c277df42 3226 if (logical) {
0f5d15d6
IZ
3227 if (logical == 2) { /* Postponed subexpression. */
3228 regexp *re;
22c35a8c 3229 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3230 re_cc_state state;
0f5d15d6 3231 CHECKPOINT cp, lastcp;
cb50f42d 3232 int toggleutf;
faf82a0b 3233 register SV *sv;
0f5d15d6 3234
faf82a0b
AE
3235 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3236 mg = mg_find(sv, PERL_MAGIC_qr);
3237 else if (SvSMAGICAL(ret)) {
3238 if (SvGMAGICAL(ret))
3239 sv_unmagic(ret, PERL_MAGIC_qr);
3240 else
3241 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3242 }
faf82a0b 3243
0f5d15d6
IZ
3244 if (mg) {
3245 re = (regexp *)mg->mg_obj;
df0003d4 3246 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3247 }
3248 else {
3249 STRLEN len;
83003860 3250 const char *t = SvPV_const(ret, len);
0f5d15d6 3251 PMOP pm;
a3b680e6
AL
3252 char * const oprecomp = PL_regprecomp;
3253 const I32 osize = PL_regsize;
3254 const I32 onpar = PL_regnpar;
0f5d15d6 3255
5fcd1c1b 3256 Zero(&pm, 1, PMOP);
cb50f42d 3257 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3258 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3259 if (!(SvFLAGS(ret)
faf82a0b
AE
3260 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3261 | SVs_GMG)))
14befaf4
DM
3262 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3263 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3264 PL_regprecomp = oprecomp;
3265 PL_regsize = osize;
3266 PL_regnpar = onpar;
3267 }
a3621e74 3268 DEBUG_EXECUTE_r(
9041c2e3 3269 PerlIO_printf(Perl_debug_log,
a0288114 3270 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3271 PL_colors[0],
3272 re->precomp,
3273 PL_colors[1],
3274 (strlen(re->precomp) > 60 ? "..." : ""))
3275 );
3276 state.node = next;
3277 state.prev = PL_reg_call_cc;
3278 state.cc = PL_regcc;
3279 state.re = PL_reg_re;
3280
2ab05381 3281 PL_regcc = 0;
9041c2e3 3282
0f5d15d6 3283 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3284 REGCP_SET(lastcp);
0f5d15d6
IZ
3285 cache_re(re);
3286 state.ss = PL_savestack_ix;
3287 *PL_reglastparen = 0;
a01268b5 3288 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3289 PL_reg_call_cc = &state;
3290 PL_reginput = locinput;
cb50f42d
YST
3291 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3292 ((re->reganch & ROPT_UTF8) != 0);
3293 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3294
3295 /* XXXX This is too dramatic a measure... */
3296 PL_reg_maxiter = 0;
3297
0f5d15d6 3298 if (regmatch(re->program + 1)) {
2c914db6
IZ
3299 /* Even though we succeeded, we need to restore
3300 global variables, since we may be wrapped inside
3301 SUSPEND, thus the match may be not finished yet. */
3302
3303 /* XXXX Do this only if SUSPENDed? */
3304 PL_reg_call_cc = state.prev;
3305 PL_regcc = state.cc;
3306 PL_reg_re = state.re;
3307 cache_re(PL_reg_re);
cb50f42d 3308 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3309
3310 /* XXXX This is too dramatic a measure... */
3311 PL_reg_maxiter = 0;
3312
3313 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3314 ReREFCNT_dec(re);
3315 regcpblow(cp);
3316 sayYES;
3317 }
0f5d15d6 3318 ReREFCNT_dec(re);
02db2b7b 3319 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3320 regcppop();
3321 PL_reg_call_cc = state.prev;
3322 PL_regcc = state.cc;
3323 PL_reg_re = state.re;
d3790889 3324 cache_re(PL_reg_re);
cb50f42d 3325 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3326
3327 /* XXXX This is too dramatic a measure... */
3328 PL_reg_maxiter = 0;
3329
8e514ae6 3330 logical = 0;
0f5d15d6
IZ
3331 sayNO;
3332 }
c277df42 3333 sw = SvTRUE(ret);
0f5d15d6 3334 logical = 0;
a0ed51b3 3335 }
080c2dec 3336 else {
3280af22 3337 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3338 cache_re(oreg);
3339 }
c277df42
IZ
3340 break;
3341 }
a0d0e21e 3342 case OPEN:
c277df42 3343 n = ARG(scan); /* which paren pair */
3280af22
NIS
3344 PL_reg_start_tmp[n] = locinput;
3345 if (n > PL_regsize)
3346 PL_regsize = n;
a0d0e21e
LW
3347 break;
3348 case CLOSE:
c277df42 3349 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3350 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3351 PL_regendp[n] = locinput - PL_bostr;
eb160463 3352 if (n > (I32)*PL_reglastparen)
3280af22 3353 *PL_reglastparen = n;
a01268b5 3354 *PL_reglastcloseparen = n;
a0d0e21e 3355 break;
c277df42
IZ
3356 case GROUPP:
3357 n = ARG(scan); /* which paren pair */
eb160463 3358 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3359 break;
3360 case IFTHEN:
2c2d71f5 3361 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3362 if (sw)
3363 next = NEXTOPER(NEXTOPER(scan));
3364 else {
3365 next = scan + ARG(scan);
3366 if (OP(next) == IFTHEN) /* Fake one. */
3367 next = NEXTOPER(NEXTOPER(next));
3368 }
3369 break;
3370 case LOGICAL:
0f5d15d6 3371 logical = scan->flags;
c277df42 3372 break;
2ab05381
IZ
3373/*******************************************************************
3374 PL_regcc contains infoblock about the innermost (...)* loop, and
3375 a pointer to the next outer infoblock.
3376
3377 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3378
3379 1) After matching X, regnode for CURLYX is processed;
3380
9041c2e3 3381 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3382 regmatch() recursively with the starting point at WHILEM node;
3383
3384 3) Each hit of WHILEM node tries to match A and Z (in the order
3385 depending on the current iteration, min/max of {min,max} and
3386 greediness). The information about where are nodes for "A"
3387 and "Z" is read from the infoblock, as is info on how many times "A"
3388 was already matched, and greediness.
3389
3390 4) After A matches, the same WHILEM node is hit again.
3391
3392 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3393 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3394 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3395 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3396 of the external loop.
3397
3398 Currently present infoblocks form a tree with a stem formed by PL_curcc
3399 and whatever it mentions via ->next, and additional attached trees
3400 corresponding to temporarily unset infoblocks as in "5" above.
3401
9041c2e3 3402 In the following picture infoblocks for outer loop of
2ab05381
IZ
3403 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3404 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3405 infoblocks are drawn below the "reset" infoblock.
3406
3407 In fact in the picture below we do not show failed matches for Z and T
3408 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3409 more obvious *why* one needs to *temporary* unset infoblocks.]
3410
3411 Matched REx position InfoBlocks Comment
3412 (Y(A)*?Z)*?T x
3413 Y(A)*?Z)*?T x <- O
3414 Y (A)*?Z)*?T x <- O
3415 Y A)*?Z)*?T x <- O <- I
3416 YA )*?Z)*?T x <- O <- I
3417 YA A)*?Z)*?T x <- O <- I
3418 YAA )*?Z)*?T x <- O <- I
3419 YAA Z)*?T x <- O # Temporary unset I
3420 I
3421
3422 YAAZ Y(A)*?Z)*?T x <- O
3423 I
3424
3425 YAAZY (A)*?Z)*?T x <- O
3426 I
3427
3428 YAAZY A)*?Z)*?T x <- O <- I
3429 I
3430
3431 YAAZYA )*?Z)*?T x <- O <- I
3432 I
3433
3434 YAAZYA Z)*?T x <- O # Temporary unset I
3435 I,I
3436
3437 YAAZYAZ )*?T x <- O
3438 I,I
3439
3440 YAAZYAZ T x # Temporary unset O
3441 O
3442 I,I
3443
3444 YAAZYAZT x
3445 O
3446 I,I
3447 *******************************************************************/
a0d0e21e
LW
3448 case CURLYX: {
3449 CURCUR cc;
3280af22 3450 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3451 /* No need to save/restore up to this paren */
3452 I32 parenfloor = scan->flags;
c277df42
IZ
3453
3454 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3455 next += ARG(next);
3280af22
NIS
3456 cc.oldcc = PL_regcc;
3457 PL_regcc = &cc;
cb434fcc
IZ
3458 /* XXXX Probably it is better to teach regpush to support
3459 parenfloor > PL_regsize... */
eb160463 3460 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3461 parenfloor = *PL_reglastparen; /* Pessimization... */
3462 cc.parenfloor = parenfloor;
a0d0e21e
LW
3463 cc.cur = -1;
3464 cc.min = ARG1(scan);
3465 cc.max = ARG2(scan);
c277df42 3466 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3467 cc.next = next;
3468 cc.minmod = minmod;
3469 cc.lastloc = 0;
3280af22 3470 PL_reginput = locinput;
a0d0e21e
LW
3471 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3472 regcpblow(cp);
3280af22 3473 PL_regcc = cc.oldcc;
4633a7c4 3474 saySAME(n);
a0d0e21e
LW
3475 }
3476 /* NOT REACHED */
3477 case WHILEM: {
3478 /*
3479 * This is really hard to understand, because after we match
3480 * what we're trying to match, we must make sure the rest of
2c2d71f5 3481 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3482 * to go back UP the parse tree by recursing ever deeper. And
3483 * if it fails, we have to reset our parent's current state
3484 * that we can try again after backing off.
3485 */
3486
c277df42 3487 CHECKPOINT cp, lastcp;
3280af22 3488 CURCUR* cc = PL_regcc;
c277df42 3489 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3490 I32 cache_offset = 0, cache_bit = 0;
c277df42 3491
4633a7c4 3492 n = cc->cur + 1; /* how many we know we matched */
3280af22 3493 PL_reginput = locinput;
a0d0e21e 3494
a3621e74 3495 DEBUG_EXECUTE_r(
9041c2e3 3496 PerlIO_printf(Perl_debug_log,
91f3b821 3497 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3498 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3499 (long)n, (long)cc-&