This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Abstract all reads/writes of the hints in COPs with 2 new macros,
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 **** Alterations to Henry's code are...
79 ****
4bb101f2 80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 81 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 82 ****
9ef589d8
LW
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
a687059c
LW
85 *
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
89 */
90#include "EXTERN.h"
864dbfa3 91#define PERL_IN_REGEXEC_C
a687059c 92#include "perl.h"
0f5d15d6 93
a687059c
LW
94#include "regcomp.h"
95
c277df42
IZ
96#define RF_tainted 1 /* tainted information used? */
97#define RF_warned 2 /* warned about big count? */
ce862d02 98#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
99#define RF_utf8 8 /* String contains multibyte chars? */
100
eb160463 101#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
102
103#define RS_init 1 /* eval environment created */
104#define RS_set 2 /* replsv value is set */
c277df42 105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
ba7b4546 110#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 111
c277df42
IZ
112/*
113 * Forwards.
114 */
115
33b8afdf 116#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 117#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 118
52657f30
AL
119#define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 : (U8*)(pos + off)))
122#define HOPBACKc(pos, off) ((char*) \
123 ((PL_reg_match_utf8) \
124 ? reghopmaybe3((U8*)pos, -off, ((U8*)(off < 0 ? PL_regeol : PL_bostr))) \
efb30f32
HS
125 : (pos - off >= PL_bostr) \
126 ? (U8*)(pos - off) \
52657f30 127 : (U8*)NULL) \
efb30f32 128)
efb30f32 129
1aa99e6b 130#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c 131#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 132#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 133
1a4fad37
AL
134#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
136#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
137#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
138#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
139#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 140
5f80c4cf 141/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
142#define JUMPABLE(rn) ( \
143 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
144 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
145 OP(rn) == PLUS || OP(rn) == MINMOD || \
146 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
147)
148
cca55fe3
JP
149#define HAS_TEXT(rn) ( \
150 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
151)
e2d8ce26 152
a84d97b6
HS
153/*
154 Search for mandatory following text node; for lookahead, the text must
155 follow but for lookbehind (rn->flags != 0) we skip to the next step.
156*/
cca55fe3 157#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 158 while (JUMPABLE(rn)) \
a84d97b6 159 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 160 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
161 else if (OP(rn) == PLUS) \
162 rn = NEXTOPER(rn); \
a84d97b6
HS
163 else if (OP(rn) == IFMATCH) \
164 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 165 else rn += NEXT_OFF(rn); \
5f80c4cf 166} STMT_END
74750237 167
acfe0abc 168static void restore_pos(pTHX_ void *arg);
51371543 169
76e3520e 170STATIC CHECKPOINT
cea2e8a9 171S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 172{
97aff369 173 dVAR;
a3b680e6 174 const int retval = PL_savestack_ix;
b1ce53c5 175#define REGCP_PAREN_ELEMS 4
a3b680e6 176 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
177 int p;
178
e49a9654
IH
179 if (paren_elems_to_push < 0)
180 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
181
a01268b5 182#define REGCP_OTHER_ELEMS 6
4b3c1a47 183 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 184 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 185/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
186 SSPUSHINT(PL_regendp[p]);
187 SSPUSHINT(PL_regstartp[p]);
3280af22 188 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
189 SSPUSHINT(p);
190 }
b1ce53c5 191/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
192 SSPUSHINT(PL_regsize);
193 SSPUSHINT(*PL_reglastparen);
a01268b5 194 SSPUSHINT(*PL_reglastcloseparen);
3280af22 195 SSPUSHPTR(PL_reginput);
41123dfd
JH
196#define REGCP_FRAME_ELEMS 2
197/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
198 * are needed for the regexp context stack bookkeeping. */
199 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 200 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 201
a0d0e21e
LW
202 return retval;
203}
204
c277df42 205/* These are needed since we do not localize EVAL nodes: */
a3621e74 206# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 207 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 208 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 209
a3621e74 210# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 211 PerlIO_printf(Perl_debug_log, \
faccc32b 212 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 213 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 214
76e3520e 215STATIC char *
cea2e8a9 216S_regcppop(pTHX)
a0d0e21e 217{
97aff369 218 dVAR;
b1ce53c5 219 I32 i;
a0d0e21e
LW
220 U32 paren = 0;
221 char *input;
b1ce53c5 222
a3621e74
YO
223 GET_RE_DEBUG_FLAGS_DECL;
224
b1ce53c5 225 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 226 i = SSPOPINT;
b1ce53c5
JH
227 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
228 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 229 input = (char *) SSPOPPTR;
a01268b5 230 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
231 *PL_reglastparen = SSPOPINT;
232 PL_regsize = SSPOPINT;
b1ce53c5
JH
233
234 /* Now restore the parentheses context. */
41123dfd
JH
235 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
236 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 237 I32 tmps;
a0d0e21e 238 paren = (U32)SSPOPINT;
3280af22 239 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
240 PL_regstartp[paren] = SSPOPINT;
241 tmps = SSPOPINT;
3280af22
NIS
242 if (paren <= *PL_reglastparen)
243 PL_regendp[paren] = tmps;
a3621e74 244 DEBUG_EXECUTE_r(
c3464db5 245 PerlIO_printf(Perl_debug_log,
b900a521 246 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 247 (UV)paren, (IV)PL_regstartp[paren],
b900a521 248 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 249 (IV)PL_regendp[paren],
3280af22 250 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 251 );
a0d0e21e 252 }
a3621e74 253 DEBUG_EXECUTE_r(
eb160463 254 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 255 PerlIO_printf(Perl_debug_log,
faccc32b
JH
256 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
257 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
258 }
259 );
daf18116 260#if 1
dafc8851
JH
261 /* It would seem that the similar code in regtry()
262 * already takes care of this, and in fact it is in
263 * a better location to since this code can #if 0-ed out
264 * but the code in regtry() is needed or otherwise tests
265 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
266 * (as of patchlevel 7877) will fail. Then again,
267 * this code seems to be necessary or otherwise
268 * building DynaLoader will fail:
269 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
270 * --jhi */
eb160463
GS
271 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
272 if ((I32)paren > PL_regsize)
cf93c79d
IZ
273 PL_regstartp[paren] = -1;
274 PL_regendp[paren] = -1;
a0d0e21e 275 }
dafc8851 276#endif
a0d0e21e
LW
277 return input;
278}
279
0f5d15d6
IZ
280typedef struct re_cc_state
281{
282 I32 ss;
283 regnode *node;
284 struct re_cc_state *prev;
285 CURCUR *cc;
286 regexp *re;
287} re_cc_state;
288
02db2b7b 289#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 290
95b24440 291#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
292 if (paren) { \
293 if (n) { \
294 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
295 PL_regendp[paren] = input - PL_bostr; \
296 } \
297 else \
298 PL_regendp[paren] = -1; \
299 } \
95b24440
DM
300 REGMATCH(next, where); \
301 if (result) \
29d1e993
HS
302 sayYES; \
303 if (paren && n) \
304 PL_regendp[paren] = -1; \
305}
306
307
a687059c 308/*
e50aee73 309 * pregexec and friends
a687059c
LW
310 */
311
312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
864dbfa3 316Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
323 return
9041c2e3 324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
325 nosave ? 0 : REXEC_COPY_STR);
326}
0f5d15d6
IZ
327
328STATIC void
cea2e8a9 329S_cache_re(pTHX_ regexp *prog)
0f5d15d6 330{
97aff369 331 dVAR;
0f5d15d6
IZ
332 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
333#ifdef DEBUGGING
334 PL_regprogram = prog->program;
335#endif
336 PL_regnpar = prog->nparens;
9041c2e3
NIS
337 PL_regdata = prog->data;
338 PL_reg_re = prog;
0f5d15d6 339}
22e551b9 340
9041c2e3 341/*
cad2e5aa
JH
342 * Need to implement the following flags for reg_anch:
343 *
344 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
345 * USE_INTUIT_ML
346 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
347 * INTUIT_AUTORITATIVE_ML
348 * INTUIT_ONCE_NOML - Intuit can match in one location only.
349 * INTUIT_ONCE_ML
350 *
351 * Another flag for this function: SECOND_TIME (so that float substrs
352 * with giant delta may be not rechecked).
353 */
354
355/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
356
3f7c398e 357/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
358 Otherwise, only SvCUR(sv) is used to get strbeg. */
359
360/* XXXX We assume that strpos is strbeg unless sv. */
361
6eb5f6b9
JH
362/* XXXX Some places assume that there is a fixed substring.
363 An update may be needed if optimizer marks as "INTUITable"
364 RExen without fixed substrings. Similarly, it is assumed that
365 lengths of all the strings are no more than minlen, thus they
366 cannot come from lookahead.
367 (Or minlen should take into account lookahead.) */
368
2c2d71f5
JH
369/* A failure to find a constant substring means that there is no need to make
370 an expensive call to REx engine, thus we celebrate a failure. Similarly,
371 finding a substring too deep into the string means that less calls to
30944b6d
IZ
372 regtry() should be needed.
373
374 REx compiler's optimizer found 4 possible hints:
375 a) Anchored substring;
376 b) Fixed substring;
377 c) Whether we are anchored (beginning-of-line or \G);
378 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 379 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
380 string which does not contradict any of them.
381 */
2c2d71f5 382
6eb5f6b9
JH
383/* Most of decisions we do here should have been done at compile time.
384 The nodes of the REx which we used for the search should have been
385 deleted from the finite automaton. */
386
cad2e5aa
JH
387char *
388Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
389 char *strend, U32 flags, re_scream_pos_data *data)
390{
97aff369 391 dVAR;
b7953727 392 register I32 start_shift = 0;
cad2e5aa 393 /* Should be nonnegative! */
b7953727 394 register I32 end_shift = 0;
2c2d71f5
JH
395 register char *s;
396 register SV *check;
a1933d95 397 char *strbeg;
cad2e5aa 398 char *t;
a3b680e6 399 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 400 I32 ml_anch;
bd61b366
SS
401 register char *other_last = NULL; /* other substr checked before this */
402 char *check_at = NULL; /* check substr found at this pos */
1df70142 403 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 404#ifdef DEBUGGING
890ce7af
AL
405 const char * const i_strpos = strpos;
406 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 407#endif
a3621e74
YO
408
409 GET_RE_DEBUG_FLAGS_DECL;
410
a30b2f1f 411 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 412
b8d68ded 413 if (prog->reganch & ROPT_UTF8) {
a3621e74 414 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
415 "UTF-8 regex...\n"));
416 PL_reg_flags |= RF_utf8;
417 }
418
a3621e74 419 DEBUG_EXECUTE_r({
1df70142 420 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
421 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
422 strpos;
1df70142 423 const int len = PL_reg_match_utf8 ?
b8d68ded 424 strlen(s) : strend - strpos;
2a782b5b
JH
425 if (!PL_colorset)
426 reginitcolors();
b8d68ded 427 if (PL_reg_match_utf8)
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 429 "UTF-8 target...\n"));
2a782b5b 430 PerlIO_printf(Perl_debug_log,
a0288114 431 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 432 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
433 prog->precomp,
434 PL_colors[1],
435 (strlen(prog->precomp) > 60 ? "..." : ""),
436 PL_colors[0],
437 (int)(len > 60 ? 60 : len),
438 s, PL_colors[1],
439 (len > 60 ? "..." : "")
440 );
441 });
cad2e5aa 442
c344f387
JH
443 /* CHR_DIST() would be more correct here but it makes things slow. */
444 if (prog->minlen > strend - strpos) {
a3621e74 445 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 446 "String too short... [re_intuit_start]\n"));
cad2e5aa 447 goto fail;
2c2d71f5 448 }
a1933d95 449 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 450 PL_regeol = strend;
33b8afdf
JH
451 if (do_utf8) {
452 if (!prog->check_utf8 && prog->check_substr)
453 to_utf8_substr(prog);
454 check = prog->check_utf8;
455 } else {
456 if (!prog->check_substr && prog->check_utf8)
457 to_byte_substr(prog);
458 check = prog->check_substr;
459 }
460 if (check == &PL_sv_undef) {
a3621e74 461 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
462 "Non-utf string cannot match utf check string\n"));
463 goto fail;
464 }
2c2d71f5 465 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
466 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
467 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 468 && !multiline ) ); /* Check after \n? */
cad2e5aa 469
7e25d62c
JH
470 if (!ml_anch) {
471 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
472 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 473 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
474 && sv && !SvROK(sv)
475 && (strpos != strbeg)) {
a3621e74 476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
477 goto fail;
478 }
479 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 480 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 481 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
482 I32 slen;
483
1aa99e6b 484 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
485 if (SvTAIL(check)) {
486 slen = SvCUR(check); /* >= 1 */
cad2e5aa 487
9041c2e3 488 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 489 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 490 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 491 goto fail_finish;
cad2e5aa
JH
492 }
493 /* Now should match s[0..slen-2] */
494 slen--;
3f7c398e 495 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 496 || (slen > 1
3f7c398e 497 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 498 report_neq:
a3621e74 499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
500 goto fail_finish;
501 }
cad2e5aa 502 }
3f7c398e 503 else if (*SvPVX_const(check) != *s
653099ff 504 || ((slen = SvCUR(check)) > 1
3f7c398e 505 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 506 goto report_neq;
c315bfe8 507 check_at = s;
2c2d71f5 508 goto success_at_start;
7e25d62c 509 }
cad2e5aa 510 }
2c2d71f5 511 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 512 s = strpos;
2c2d71f5 513 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 514 end_shift = prog->minlen - start_shift -
653099ff 515 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 516 if (!ml_anch) {
a3b680e6 517 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 518 - (SvTAIL(check) != 0);
a3b680e6 519 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
520
521 if (end_shift < eshift)
522 end_shift = eshift;
523 }
cad2e5aa 524 }
2c2d71f5 525 else { /* Can match at random position */
cad2e5aa
JH
526 ml_anch = 0;
527 s = strpos;
2c2d71f5
JH
528 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
529 /* Should be nonnegative! */
530 end_shift = prog->minlen - start_shift -
653099ff 531 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
532 }
533
2c2d71f5 534#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 535 if (end_shift < 0)
6bbae5e6 536 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
537#endif
538
2c2d71f5
JH
539 restart:
540 /* Find a possible match in the region s..strend by looking for
541 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 542 if (flags & REXEC_SCREAM) {
cad2e5aa 543 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 544 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 545
2c2d71f5
JH
546 if (PL_screamfirst[BmRARE(check)] >= 0
547 || ( BmRARE(check) == '\n'
548 && (BmPREVIOUS(check) == SvCUR(check) - 1)
549 && SvTAIL(check) ))
9041c2e3 550 s = screaminstr(sv, check,
2c2d71f5 551 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 552 else
2c2d71f5 553 goto fail_finish;
4addbd3b
HS
554 /* we may be pointing at the wrong string */
555 if (s && RX_MATCH_COPIED(prog))
3f7c398e 556 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
557 if (data)
558 *data->scream_olds = s;
559 }
f33976b4 560 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
561 s = fbm_instr((U8*)(s + start_shift),
562 (U8*)(strend - end_shift),
7fba1cd6 563 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 564 else
1aa99e6b
IH
565 s = fbm_instr(HOP3(s, start_shift, strend),
566 HOP3(strend, -end_shift, strbeg),
7fba1cd6 567 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
568
569 /* Update the count-of-usability, remove useless subpatterns,
570 unshift s. */
2c2d71f5 571
a0288114 572 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 573 (s ? "Found" : "Did not find"),
33b8afdf 574 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 575 PL_colors[0],
7b0972df 576 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 577 SvPVX_const(check),
2c2d71f5
JH
578 PL_colors[1], (SvTAIL(check) ? "$" : ""),
579 (s ? " at offset " : "...\n") ) );
580
581 if (!s)
582 goto fail_finish;
583
6eb5f6b9
JH
584 check_at = s;
585
2c2d71f5 586 /* Finish the diagnostic message */
a3621e74 587 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
588
589 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
590 Start with the other substr.
591 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 592 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
593 *always* match. Probably should be marked during compile...
594 Probably it is right to do no SCREAM here...
595 */
596
33b8afdf 597 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 598 /* Take into account the "other" substring. */
2c2d71f5
JH
599 /* XXXX May be hopelessly wrong for UTF... */
600 if (!other_last)
6eb5f6b9 601 other_last = strpos;
33b8afdf 602 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
603 do_other_anchored:
604 {
890ce7af
AL
605 char * const last = HOP3c(s, -start_shift, strbeg);
606 char *last1, *last2;
2c2d71f5 607 char *s1 = s;
33b8afdf 608 SV* must;
2c2d71f5 609
2c2d71f5
JH
610 t = s - prog->check_offset_max;
611 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 612 && (!do_utf8
1aa99e6b 613 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 614 && t > strpos)))
30944b6d 615 /* EMPTY */;
2c2d71f5
JH
616 else
617 t = strpos;
1aa99e6b 618 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
619 if (t < other_last) /* These positions already checked */
620 t = other_last;
1aa99e6b 621 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
622 if (last < last1)
623 last1 = last;
624 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
625 /* On end-of-str: see comment below. */
33b8afdf
JH
626 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
627 if (must == &PL_sv_undef) {
628 s = (char*)NULL;
a3621e74 629 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
630 }
631 else
632 s = fbm_instr(
633 (unsigned char*)t,
634 HOP3(HOP3(last1, prog->anchored_offset, strend)
635 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
636 must,
7fba1cd6 637 multiline ? FBMrf_MULTILINE : 0
33b8afdf 638 );
a3621e74 639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 640 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
641 (s ? "Found" : "Contradicts"),
642 PL_colors[0],
33b8afdf
JH
643 (int)(SvCUR(must)
644 - (SvTAIL(must)!=0)),
3f7c398e 645 SvPVX_const(must),
33b8afdf 646 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
647 if (!s) {
648 if (last1 >= last2) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
650 ", giving up...\n"));
651 goto fail_finish;
652 }
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 654 ", trying floating at offset %ld...\n",
1aa99e6b
IH
655 (long)(HOP3c(s1, 1, strend) - i_strpos)));
656 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
657 s = HOP3c(last, 1, strend);
2c2d71f5
JH
658 goto restart;
659 }
660 else {
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 662 (long)(s - i_strpos)));
1aa99e6b
IH
663 t = HOP3c(s, -prog->anchored_offset, strbeg);
664 other_last = HOP3c(s, 1, strend);
30944b6d 665 s = s1;
2c2d71f5
JH
666 if (t == strpos)
667 goto try_at_start;
2c2d71f5
JH
668 goto try_at_offset;
669 }
30944b6d 670 }
2c2d71f5
JH
671 }
672 else { /* Take into account the floating substring. */
33b8afdf
JH
673 char *last, *last1;
674 char *s1 = s;
675 SV* must;
676
677 t = HOP3c(s, -start_shift, strbeg);
678 last1 = last =
679 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
680 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
681 last = HOP3c(t, prog->float_max_offset, strend);
682 s = HOP3c(t, prog->float_min_offset, strend);
683 if (s < other_last)
684 s = other_last;
2c2d71f5 685 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
686 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
687 /* fbm_instr() takes into account exact value of end-of-str
688 if the check is SvTAIL(ed). Since false positives are OK,
689 and end-of-str is not later than strend we are OK. */
690 if (must == &PL_sv_undef) {
691 s = (char*)NULL;
a3621e74 692 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
693 }
694 else
2c2d71f5 695 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
696 (unsigned char*)last + SvCUR(must)
697 - (SvTAIL(must)!=0),
7fba1cd6 698 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 699 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
700 (s ? "Found" : "Contradicts"),
701 PL_colors[0],
702 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 703 SvPVX_const(must),
33b8afdf
JH
704 PL_colors[1], (SvTAIL(must) ? "$" : "")));
705 if (!s) {
706 if (last1 == last) {
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
708 ", giving up...\n"));
709 goto fail_finish;
2c2d71f5 710 }
a3621e74 711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
712 ", trying anchored starting at offset %ld...\n",
713 (long)(s1 + 1 - i_strpos)));
714 other_last = last;
715 s = HOP3c(t, 1, strend);
716 goto restart;
717 }
718 else {
a3621e74 719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
720 (long)(s - i_strpos)));
721 other_last = s; /* Fix this later. --Hugo */
722 s = s1;
723 if (t == strpos)
724 goto try_at_start;
725 goto try_at_offset;
726 }
2c2d71f5 727 }
cad2e5aa 728 }
2c2d71f5
JH
729
730 t = s - prog->check_offset_max;
2c2d71f5 731 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 732 && (!do_utf8
1aa99e6b
IH
733 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
734 && t > strpos))) {
2c2d71f5
JH
735 /* Fixed substring is found far enough so that the match
736 cannot start at strpos. */
737 try_at_offset:
cad2e5aa 738 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
739 /* Eventually fbm_*() should handle this, but often
740 anchored_offset is not 0, so this check will not be wasted. */
741 /* XXXX In the code below we prefer to look for "^" even in
742 presence of anchored substrings. And we search even
743 beyond the found float position. These pessimizations
744 are historical artefacts only. */
745 find_anchor:
2c2d71f5 746 while (t < strend - prog->minlen) {
cad2e5aa 747 if (*t == '\n') {
4ee3650e 748 if (t < check_at - prog->check_offset_min) {
33b8afdf 749 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
750 /* Since we moved from the found position,
751 we definitely contradict the found anchored
30944b6d
IZ
752 substr. Due to the above check we do not
753 contradict "check" substr.
754 Thus we can arrive here only if check substr
755 is float. Redo checking for "other"=="fixed".
756 */
9041c2e3 757 strpos = t + 1;
a3621e74 758 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 759 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
760 goto do_other_anchored;
761 }
4ee3650e
GS
762 /* We don't contradict the found floating substring. */
763 /* XXXX Why not check for STCLASS? */
cad2e5aa 764 s = t + 1;
a3621e74 765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 766 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
767 goto set_useful;
768 }
4ee3650e
GS
769 /* Position contradicts check-string */
770 /* XXXX probably better to look for check-string
771 than for "\n", so one should lower the limit for t? */
a3621e74 772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 773 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 774 other_last = strpos = s = t + 1;
cad2e5aa
JH
775 goto restart;
776 }
777 t++;
778 }
a3621e74 779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 780 PL_colors[0], PL_colors[1]));
2c2d71f5 781 goto fail_finish;
cad2e5aa 782 }
f5952150 783 else {
a3621e74 784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 785 PL_colors[0], PL_colors[1]));
f5952150 786 }
cad2e5aa
JH
787 s = t;
788 set_useful:
33b8afdf 789 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
790 }
791 else {
f5952150 792 /* The found string does not prohibit matching at strpos,
2c2d71f5 793 - no optimization of calling REx engine can be performed,
f5952150
GS
794 unless it was an MBOL and we are not after MBOL,
795 or a future STCLASS check will fail this. */
2c2d71f5
JH
796 try_at_start:
797 /* Even in this situation we may use MBOL flag if strpos is offset
798 wrt the start of the string. */
05b4157f 799 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 800 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
801 /* May be due to an implicit anchor of m{.*foo} */
802 && !(prog->reganch & ROPT_IMPLICIT))
803 {
cad2e5aa
JH
804 t = strpos;
805 goto find_anchor;
806 }
a3621e74 807 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 808 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 809 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 810 );
2c2d71f5 811 success_at_start:
30944b6d 812 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
813 && (do_utf8 ? (
814 prog->check_utf8 /* Could be deleted already */
815 && --BmUSEFUL(prog->check_utf8) < 0
816 && (prog->check_utf8 == prog->float_utf8)
817 ) : (
818 prog->check_substr /* Could be deleted already */
819 && --BmUSEFUL(prog->check_substr) < 0
820 && (prog->check_substr == prog->float_substr)
821 )))
66e933ab 822 {
cad2e5aa 823 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
825 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
826 if (do_utf8 ? prog->check_substr : prog->check_utf8)
827 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
828 prog->check_substr = prog->check_utf8 = NULL; /* disable */
829 prog->float_substr = prog->float_utf8 = NULL; /* clear */
830 check = NULL; /* abort */
cad2e5aa 831 s = strpos;
3cf5c195
IZ
832 /* XXXX This is a remnant of the old implementation. It
833 looks wasteful, since now INTUIT can use many
6eb5f6b9 834 other heuristics. */
cad2e5aa
JH
835 prog->reganch &= ~RE_USE_INTUIT;
836 }
837 else
838 s = strpos;
839 }
840
6eb5f6b9
JH
841 /* Last resort... */
842 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
843 if (prog->regstclass) {
844 /* minlen == 0 is possible if regstclass is \b or \B,
845 and the fixed substr is ''$.
846 Since minlen is already taken into account, s+1 is before strend;
847 accidentally, minlen >= 1 guaranties no false positives at s + 1
848 even for \b or \B. But (minlen? 1 : 0) below assumes that
849 regstclass does not come from lookahead... */
850 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
851 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 852 const U8* const str = (U8*)STRING(prog->regstclass);
06b5626a 853 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 854 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 855 : 1);
a3b680e6 856 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 857 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 858 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
859 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
860 cl_l, strend)
861 : strend);
6eb5f6b9
JH
862
863 t = s;
9926ca43 864 cache_re(prog);
06b5626a 865 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
866 if (!s) {
867#ifdef DEBUGGING
cbbf8932 868 const char *what = NULL;
6eb5f6b9
JH
869#endif
870 if (endpos == strend) {
a3621e74 871 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
872 "Could not match STCLASS...\n") );
873 goto fail;
874 }
a3621e74 875 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 876 "This position contradicts STCLASS...\n") );
653099ff
GS
877 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
878 goto fail;
6eb5f6b9 879 /* Contradict one of substrings */
33b8afdf
JH
880 if (prog->anchored_substr || prog->anchored_utf8) {
881 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 882 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 883 hop_and_restart:
1aa99e6b 884 s = HOP3c(t, 1, strend);
66e933ab
GS
885 if (s + start_shift + end_shift > strend) {
886 /* XXXX Should be taken into account earlier? */
a3621e74 887 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
888 "Could not match STCLASS...\n") );
889 goto fail;
890 }
5e39e1e5
HS
891 if (!check)
892 goto giveup;
a3621e74 893 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 894 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
895 what, (long)(s + start_shift - i_strpos)) );
896 goto restart;
897 }
66e933ab 898 /* Have both, check_string is floating */
6eb5f6b9
JH
899 if (t + start_shift >= check_at) /* Contradicts floating=check */
900 goto retry_floating_check;
901 /* Recheck anchored substring, but not floating... */
9041c2e3 902 s = check_at;
5e39e1e5
HS
903 if (!check)
904 goto giveup;
a3621e74 905 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 906 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
907 (long)(other_last - i_strpos)) );
908 goto do_other_anchored;
909 }
60e71179
GS
910 /* Another way we could have checked stclass at the
911 current position only: */
912 if (ml_anch) {
913 s = t = t + 1;
5e39e1e5
HS
914 if (!check)
915 goto giveup;
a3621e74 916 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 917 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 918 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 919 goto try_at_offset;
66e933ab 920 }
33b8afdf 921 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 922 goto fail;
6eb5f6b9
JH
923 /* Check is floating subtring. */
924 retry_floating_check:
925 t = check_at - start_shift;
a3621e74 926 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
927 goto hop_and_restart;
928 }
b7953727 929 if (t != s) {
a3621e74 930 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 931 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
932 (long)(t - i_strpos), (long)(s - i_strpos))
933 );
934 }
935 else {
a3621e74 936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
937 "Does not contradict STCLASS...\n");
938 );
939 }
6eb5f6b9 940 }
5e39e1e5 941 giveup:
a3621e74 942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
943 PL_colors[4], (check ? "Guessed" : "Giving up"),
944 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 945 return s;
2c2d71f5
JH
946
947 fail_finish: /* Substring not found */
33b8afdf
JH
948 if (prog->check_substr || prog->check_utf8) /* could be removed already */
949 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 950 fail:
a3621e74 951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 952 PL_colors[4], PL_colors[5]));
bd61b366 953 return NULL;
cad2e5aa 954}
9661b544 955
6eb5f6b9 956/* We know what class REx starts with. Try to find this position... */
3c3eec57 957STATIC char *
a3b680e6 958S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
a687059c 959{
27da23d5 960 dVAR;
1df70142 961 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 962 char *m;
d8093b23 963 STRLEN ln;
5dab1207 964 STRLEN lnc;
078c425b 965 register STRLEN uskip;
d8093b23
G
966 unsigned int c1;
967 unsigned int c2;
6eb5f6b9
JH
968 char *e;
969 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 970 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 971
6eb5f6b9
JH
972 /* We know what class it must start with. */
973 switch (OP(c)) {
6eb5f6b9 974 case ANYOF:
388cc4de 975 if (do_utf8) {
078c425b 976 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
977 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
978 !UTF8_IS_INVARIANT((U8)s[0]) ?
979 reginclass(c, (U8*)s, 0, do_utf8) :
980 REGINCLASS(c, (U8*)s)) {
981 if (tmp && (norun || regtry(prog, s)))
982 goto got_it;
983 else
984 tmp = doevery;
985 }
986 else
987 tmp = 1;
078c425b 988 s += uskip;
388cc4de
HS
989 }
990 }
991 else {
992 while (s < strend) {
993 STRLEN skip = 1;
994
995 if (REGINCLASS(c, (U8*)s) ||
996 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
997 /* The assignment of 2 is intentional:
998 * for the folded sharp s, the skip is 2. */
999 (skip = SHARP_S_SKIP))) {
1000 if (tmp && (norun || regtry(prog, s)))
1001 goto got_it;
1002 else
1003 tmp = doevery;
1004 }
1005 else
1006 tmp = 1;
1007 s += skip;
1008 }
a0d0e21e 1009 }
6eb5f6b9 1010 break;
f33976b4
DB
1011 case CANY:
1012 while (s < strend) {
1013 if (tmp && (norun || regtry(prog, s)))
1014 goto got_it;
1015 else
1016 tmp = doevery;
1017 s++;
1018 }
1019 break;
6eb5f6b9 1020 case EXACTF:
5dab1207
NIS
1021 m = STRING(c);
1022 ln = STR_LEN(c); /* length to match in octets/bytes */
1023 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1024 if (UTF) {
a2a2844f 1025 STRLEN ulen1, ulen2;
5dab1207 1026 U8 *sm = (U8 *) m;
89ebb4a3
JH
1027 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1028 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
041457d9 1029 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a2a2844f
JH
1030
1031 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1032 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1033
89ebb4a3 1034 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1035 0, uniflags);
89ebb4a3 1036 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1037 0, uniflags);
5dab1207
NIS
1038 lnc = 0;
1039 while (sm < ((U8 *) m + ln)) {
1040 lnc++;
1041 sm += UTF8SKIP(sm);
1042 }
1aa99e6b
IH
1043 }
1044 else {
1045 c1 = *(U8*)m;
1046 c2 = PL_fold[c1];
1047 }
6eb5f6b9
JH
1048 goto do_exactf;
1049 case EXACTFL:
5dab1207
NIS
1050 m = STRING(c);
1051 ln = STR_LEN(c);
1052 lnc = (I32) ln;
d8093b23 1053 c1 = *(U8*)m;
6eb5f6b9
JH
1054 c2 = PL_fold_locale[c1];
1055 do_exactf:
db12adc6 1056 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1057
6eb5f6b9
JH
1058 if (norun && e < s)
1059 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1060
60a8b682
JH
1061 /* The idea in the EXACTF* cases is to first find the
1062 * first character of the EXACTF* node and then, if
1063 * necessary, case-insensitively compare the full
1064 * text of the node. The c1 and c2 are the first
1065 * characters (though in Unicode it gets a bit
1066 * more complicated because there are more cases
7f16dd3d
JH
1067 * than just upper and lower: one needs to use
1068 * the so-called folding case for case-insensitive
1069 * matching (called "loose matching" in Unicode).
1070 * ibcmp_utf8() will do just that. */
60a8b682 1071
1aa99e6b 1072 if (do_utf8) {
575cac57 1073 UV c, f;
89ebb4a3 1074 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1075 STRLEN len, foldlen;
041457d9 1076 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
09091399 1077 if (c1 == c2) {
5dab1207
NIS
1078 /* Upper and lower of 1st char are equal -
1079 * probably not a "letter". */
1aa99e6b 1080 while (s <= e) {
89ebb4a3 1081 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1082 uniflags);
80aecb99
JH
1083 if ( c == c1
1084 && (ln == len ||
66423254 1085 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1086 m, (char **)0, ln, (bool)UTF))
55da9344 1087 && (norun || regtry(prog, s)) )
1aa99e6b 1088 goto got_it;
80aecb99 1089 else {
1df70142 1090 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1091 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1092 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1093 if ( f != c
1094 && (f == c1 || f == c2)
1095 && (ln == foldlen ||
66423254
JH
1096 !ibcmp_utf8((char *) foldbuf,
1097 (char **)0, foldlen, do_utf8,
d07ddd77 1098 m,
eb160463 1099 (char **)0, ln, (bool)UTF))
80aecb99
JH
1100 && (norun || regtry(prog, s)) )
1101 goto got_it;
1102 }
1aa99e6b
IH
1103 s += len;
1104 }
09091399
JH
1105 }
1106 else {
1aa99e6b 1107 while (s <= e) {
89ebb4a3 1108 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1109 uniflags);
80aecb99 1110
60a8b682 1111 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1112 * Note that not all the possible combinations
1113 * are handled here: some of them are handled
1114 * by the standard folding rules, and some of
1115 * them (the character class or ANYOF cases)
1116 * are handled during compiletime in
1117 * regexec.c:S_regclass(). */
880bd946
JH
1118 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1119 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1120 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1121
1122 if ( (c == c1 || c == c2)
1123 && (ln == len ||
66423254 1124 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1125 m, (char **)0, ln, (bool)UTF))
55da9344 1126 && (norun || regtry(prog, s)) )
1aa99e6b 1127 goto got_it;
80aecb99 1128 else {
1df70142 1129 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1130 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1131 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1132 if ( f != c
1133 && (f == c1 || f == c2)
1134 && (ln == foldlen ||
a6872d42 1135 !ibcmp_utf8((char *) foldbuf,
66423254 1136 (char **)0, foldlen, do_utf8,
d07ddd77 1137 m,
eb160463 1138 (char **)0, ln, (bool)UTF))
80aecb99
JH
1139 && (norun || regtry(prog, s)) )
1140 goto got_it;
1141 }
1aa99e6b
IH
1142 s += len;
1143 }
09091399 1144 }
1aa99e6b
IH
1145 }
1146 else {
1147 if (c1 == c2)
1148 while (s <= e) {
1149 if ( *(U8*)s == c1
1150 && (ln == 1 || !(OP(c) == EXACTF
1151 ? ibcmp(s, m, ln)
1152 : ibcmp_locale(s, m, ln)))
1153 && (norun || regtry(prog, s)) )
1154 goto got_it;
1155 s++;
1156 }
1157 else
1158 while (s <= e) {
1159 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1160 && (ln == 1 || !(OP(c) == EXACTF
1161 ? ibcmp(s, m, ln)
1162 : ibcmp_locale(s, m, ln)))
1163 && (norun || regtry(prog, s)) )
1164 goto got_it;
1165 s++;
1166 }
b3c9acc1
IZ
1167 }
1168 break;
bbce6d69 1169 case BOUNDL:
3280af22 1170 PL_reg_flags |= RF_tainted;
bbce6d69 1171 /* FALL THROUGH */
a0d0e21e 1172 case BOUND:
ffc61ed2 1173 if (do_utf8) {
12d33761 1174 if (s == PL_bostr)
ffc61ed2
JH
1175 tmp = '\n';
1176 else {
6136c704 1177 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1178 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1179 }
1180 tmp = ((OP(c) == BOUND ?
9041c2e3 1181 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1182 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1183 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1184 if (tmp == !(OP(c) == BOUND ?
3568d838 1185 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1186 isALNUM_LC_utf8((U8*)s)))
1187 {
1188 tmp = !tmp;
1189 if ((norun || regtry(prog, s)))
1190 goto got_it;
1191 }
078c425b 1192 s += uskip;
a687059c 1193 }
a0d0e21e 1194 }
667bb95a 1195 else {
12d33761 1196 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1197 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1198 while (s < strend) {
1199 if (tmp ==
1200 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1201 tmp = !tmp;
1202 if ((norun || regtry(prog, s)))
1203 goto got_it;
1204 }
1205 s++;
a0ed51b3 1206 }
a0ed51b3 1207 }
6eb5f6b9 1208 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1209 goto got_it;
1210 break;
bbce6d69 1211 case NBOUNDL:
3280af22 1212 PL_reg_flags |= RF_tainted;
bbce6d69 1213 /* FALL THROUGH */
a0d0e21e 1214 case NBOUND:
ffc61ed2 1215 if (do_utf8) {
12d33761 1216 if (s == PL_bostr)
ffc61ed2
JH
1217 tmp = '\n';
1218 else {
6136c704 1219 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
b4f7163a 1220 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1221 }
1222 tmp = ((OP(c) == NBOUND ?
9041c2e3 1223 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1224 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1225 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1226 if (tmp == !(OP(c) == NBOUND ?
3568d838 1227 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1228 isALNUM_LC_utf8((U8*)s)))
1229 tmp = !tmp;
1230 else if ((norun || regtry(prog, s)))
1231 goto got_it;
078c425b 1232 s += uskip;
ffc61ed2 1233 }
a0d0e21e 1234 }
667bb95a 1235 else {
12d33761 1236 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1237 tmp = ((OP(c) == NBOUND ?
1238 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1239 while (s < strend) {
1240 if (tmp ==
1241 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1242 tmp = !tmp;
1243 else if ((norun || regtry(prog, s)))
1244 goto got_it;
1245 s++;
1246 }
a0ed51b3 1247 }
6eb5f6b9 1248 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1249 goto got_it;
1250 break;
a0d0e21e 1251 case ALNUM:
ffc61ed2 1252 if (do_utf8) {
1a4fad37 1253 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1254 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1255 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1256 if (tmp && (norun || regtry(prog, s)))
1257 goto got_it;
1258 else
1259 tmp = doevery;
1260 }
bbce6d69 1261 else
ffc61ed2 1262 tmp = 1;
078c425b 1263 s += uskip;
bbce6d69 1264 }
bbce6d69 1265 }
ffc61ed2
JH
1266 else {
1267 while (s < strend) {
1268 if (isALNUM(*s)) {
1269 if (tmp && (norun || regtry(prog, s)))
1270 goto got_it;
1271 else
1272 tmp = doevery;
1273 }
a0ed51b3 1274 else
ffc61ed2
JH
1275 tmp = 1;
1276 s++;
a0ed51b3 1277 }
a0ed51b3
LW
1278 }
1279 break;
bbce6d69 1280 case ALNUML:
3280af22 1281 PL_reg_flags |= RF_tainted;
ffc61ed2 1282 if (do_utf8) {
078c425b 1283 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1284 if (isALNUM_LC_utf8((U8*)s)) {
1285 if (tmp && (norun || regtry(prog, s)))
1286 goto got_it;
1287 else
1288 tmp = doevery;
1289 }
a687059c 1290 else
ffc61ed2 1291 tmp = 1;
078c425b 1292 s += uskip;
a0d0e21e 1293 }
a0d0e21e 1294 }
ffc61ed2
JH
1295 else {
1296 while (s < strend) {
1297 if (isALNUM_LC(*s)) {
1298 if (tmp && (norun || regtry(prog, s)))
1299 goto got_it;
1300 else
1301 tmp = doevery;
1302 }
a0ed51b3 1303 else
ffc61ed2
JH
1304 tmp = 1;
1305 s++;
a0ed51b3 1306 }
a0ed51b3
LW
1307 }
1308 break;
a0d0e21e 1309 case NALNUM:
ffc61ed2 1310 if (do_utf8) {
1a4fad37 1311 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1312 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1313 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1314 if (tmp && (norun || regtry(prog, s)))
1315 goto got_it;
1316 else
1317 tmp = doevery;
1318 }
bbce6d69 1319 else
ffc61ed2 1320 tmp = 1;
078c425b 1321 s += uskip;
bbce6d69 1322 }
bbce6d69 1323 }
ffc61ed2
JH
1324 else {
1325 while (s < strend) {
1326 if (!isALNUM(*s)) {
1327 if (tmp && (norun || regtry(prog, s)))
1328 goto got_it;
1329 else
1330 tmp = doevery;
1331 }
a0ed51b3 1332 else
ffc61ed2
JH
1333 tmp = 1;
1334 s++;
a0ed51b3 1335 }
a0ed51b3
LW
1336 }
1337 break;
bbce6d69 1338 case NALNUML:
3280af22 1339 PL_reg_flags |= RF_tainted;
ffc61ed2 1340 if (do_utf8) {
078c425b 1341 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1342 if (!isALNUM_LC_utf8((U8*)s)) {
1343 if (tmp && (norun || regtry(prog, s)))
1344 goto got_it;
1345 else
1346 tmp = doevery;
1347 }
a687059c 1348 else
ffc61ed2 1349 tmp = 1;
078c425b 1350 s += uskip;
a687059c 1351 }
a0d0e21e 1352 }
ffc61ed2
JH
1353 else {
1354 while (s < strend) {
1355 if (!isALNUM_LC(*s)) {
1356 if (tmp && (norun || regtry(prog, s)))
1357 goto got_it;
1358 else
1359 tmp = doevery;
1360 }
a0ed51b3 1361 else
ffc61ed2
JH
1362 tmp = 1;
1363 s++;
a0ed51b3 1364 }
a0ed51b3
LW
1365 }
1366 break;
a0d0e21e 1367 case SPACE:
ffc61ed2 1368 if (do_utf8) {
1a4fad37 1369 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1370 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1371 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1372 if (tmp && (norun || regtry(prog, s)))
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
a0d0e21e 1377 else
ffc61ed2 1378 tmp = 1;
078c425b 1379 s += uskip;
2304df62 1380 }
a0d0e21e 1381 }
ffc61ed2
JH
1382 else {
1383 while (s < strend) {
1384 if (isSPACE(*s)) {
1385 if (tmp && (norun || regtry(prog, s)))
1386 goto got_it;
1387 else
1388 tmp = doevery;
1389 }
a0ed51b3 1390 else
ffc61ed2
JH
1391 tmp = 1;
1392 s++;
a0ed51b3 1393 }
a0ed51b3
LW
1394 }
1395 break;
bbce6d69 1396 case SPACEL:
3280af22 1397 PL_reg_flags |= RF_tainted;
ffc61ed2 1398 if (do_utf8) {
078c425b 1399 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1400 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1401 if (tmp && (norun || regtry(prog, s)))
1402 goto got_it;
1403 else
1404 tmp = doevery;
1405 }
bbce6d69 1406 else
ffc61ed2 1407 tmp = 1;
078c425b 1408 s += uskip;
bbce6d69 1409 }
bbce6d69 1410 }
ffc61ed2
JH
1411 else {
1412 while (s < strend) {
1413 if (isSPACE_LC(*s)) {
1414 if (tmp && (norun || regtry(prog, s)))
1415 goto got_it;
1416 else
1417 tmp = doevery;
1418 }
a0ed51b3 1419 else
ffc61ed2
JH
1420 tmp = 1;
1421 s++;
a0ed51b3 1422 }
a0ed51b3
LW
1423 }
1424 break;
a0d0e21e 1425 case NSPACE:
ffc61ed2 1426 if (do_utf8) {
1a4fad37 1427 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1428 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1429 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1430 if (tmp && (norun || regtry(prog, s)))
1431 goto got_it;
1432 else
1433 tmp = doevery;
1434 }
a0d0e21e 1435 else
ffc61ed2 1436 tmp = 1;
078c425b 1437 s += uskip;
a687059c 1438 }
a0d0e21e 1439 }
ffc61ed2
JH
1440 else {
1441 while (s < strend) {
1442 if (!isSPACE(*s)) {
1443 if (tmp && (norun || regtry(prog, s)))
1444 goto got_it;
1445 else
1446 tmp = doevery;
1447 }
a0ed51b3 1448 else
ffc61ed2
JH
1449 tmp = 1;
1450 s++;
a0ed51b3 1451 }
a0ed51b3
LW
1452 }
1453 break;
bbce6d69 1454 case NSPACEL:
3280af22 1455 PL_reg_flags |= RF_tainted;
ffc61ed2 1456 if (do_utf8) {
078c425b 1457 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1458 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1459 if (tmp && (norun || regtry(prog, s)))
1460 goto got_it;
1461 else
1462 tmp = doevery;
1463 }
bbce6d69 1464 else
ffc61ed2 1465 tmp = 1;
078c425b 1466 s += uskip;
bbce6d69 1467 }
bbce6d69 1468 }
ffc61ed2
JH
1469 else {
1470 while (s < strend) {
1471 if (!isSPACE_LC(*s)) {
1472 if (tmp && (norun || regtry(prog, s)))
1473 goto got_it;
1474 else
1475 tmp = doevery;
1476 }
a0ed51b3 1477 else
ffc61ed2
JH
1478 tmp = 1;
1479 s++;
a0ed51b3 1480 }
a0ed51b3
LW
1481 }
1482 break;
a0d0e21e 1483 case DIGIT:
ffc61ed2 1484 if (do_utf8) {
1a4fad37 1485 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1486 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1487 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1488 if (tmp && (norun || regtry(prog, s)))
1489 goto got_it;
1490 else
1491 tmp = doevery;
1492 }
a0d0e21e 1493 else
ffc61ed2 1494 tmp = 1;
078c425b 1495 s += uskip;
2b69d0c2 1496 }
a0d0e21e 1497 }
ffc61ed2
JH
1498 else {
1499 while (s < strend) {
1500 if (isDIGIT(*s)) {
1501 if (tmp && (norun || regtry(prog, s)))
1502 goto got_it;
1503 else
1504 tmp = doevery;
1505 }
a0ed51b3 1506 else
ffc61ed2
JH
1507 tmp = 1;
1508 s++;
a0ed51b3 1509 }
a0ed51b3
LW
1510 }
1511 break;
b8c5462f
JH
1512 case DIGITL:
1513 PL_reg_flags |= RF_tainted;
ffc61ed2 1514 if (do_utf8) {
078c425b 1515 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1516 if (isDIGIT_LC_utf8((U8*)s)) {
1517 if (tmp && (norun || regtry(prog, s)))
1518 goto got_it;
1519 else
1520 tmp = doevery;
1521 }
b8c5462f 1522 else
ffc61ed2 1523 tmp = 1;
078c425b 1524 s += uskip;
b8c5462f 1525 }
b8c5462f 1526 }
ffc61ed2
JH
1527 else {
1528 while (s < strend) {
1529 if (isDIGIT_LC(*s)) {
1530 if (tmp && (norun || regtry(prog, s)))
1531 goto got_it;
1532 else
1533 tmp = doevery;
1534 }
b8c5462f 1535 else
ffc61ed2
JH
1536 tmp = 1;
1537 s++;
b8c5462f 1538 }
b8c5462f
JH
1539 }
1540 break;
a0d0e21e 1541 case NDIGIT:
ffc61ed2 1542 if (do_utf8) {
1a4fad37 1543 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1544 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1545 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1546 if (tmp && (norun || regtry(prog, s)))
1547 goto got_it;
1548 else
1549 tmp = doevery;
1550 }
a0d0e21e 1551 else
ffc61ed2 1552 tmp = 1;
078c425b 1553 s += uskip;
a687059c 1554 }
a0d0e21e 1555 }
ffc61ed2
JH
1556 else {
1557 while (s < strend) {
1558 if (!isDIGIT(*s)) {
1559 if (tmp && (norun || regtry(prog, s)))
1560 goto got_it;
1561 else
1562 tmp = doevery;
1563 }
a0ed51b3 1564 else
ffc61ed2
JH
1565 tmp = 1;
1566 s++;
a0ed51b3 1567 }
a0ed51b3
LW
1568 }
1569 break;
b8c5462f
JH
1570 case NDIGITL:
1571 PL_reg_flags |= RF_tainted;
ffc61ed2 1572 if (do_utf8) {
078c425b 1573 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1574 if (!isDIGIT_LC_utf8((U8*)s)) {
1575 if (tmp && (norun || regtry(prog, s)))
1576 goto got_it;
1577 else
1578 tmp = doevery;
1579 }
b8c5462f 1580 else
ffc61ed2 1581 tmp = 1;
078c425b 1582 s += uskip;
b8c5462f 1583 }
a0ed51b3 1584 }
ffc61ed2
JH
1585 else {
1586 while (s < strend) {
1587 if (!isDIGIT_LC(*s)) {
1588 if (tmp && (norun || regtry(prog, s)))
1589 goto got_it;
1590 else
1591 tmp = doevery;
1592 }
cf93c79d 1593 else
ffc61ed2
JH
1594 tmp = 1;
1595 s++;
b8c5462f 1596 }
b8c5462f
JH
1597 }
1598 break;
b3c9acc1 1599 default:
3c3eec57
GS
1600 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1601 break;
d6a28714 1602 }
6eb5f6b9
JH
1603 return 0;
1604 got_it:
1605 return s;
1606}
1607
1608/*
1609 - regexec_flags - match a regexp against a string
1610 */
1611I32
1612Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1613 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1614/* strend: pointer to null at end of string */
1615/* strbeg: real beginning of string */
1616/* minend: end of match must be >=minend after stringarg. */
1617/* data: May be used for some additional optimizations. */
1618/* nosave: For optimizations. */
1619{
97aff369 1620 dVAR;
6eb5f6b9
JH
1621 register char *s;
1622 register regnode *c;
1623 register char *startpos = stringarg;
6eb5f6b9
JH
1624 I32 minlen; /* must match at least this many chars */
1625 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1626 I32 end_shift = 0; /* Same for the end. */ /* CC */
1627 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1628 char *scream_olds = NULL;
6eb5f6b9 1629 SV* oreplsv = GvSV(PL_replgv);
1df70142 1630 const bool do_utf8 = DO_UTF8(sv);
a3b680e6 1631 const I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1632#ifdef DEBUGGING
6136c704
AL
1633 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
1634 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1635#endif
a3621e74
YO
1636
1637 GET_RE_DEBUG_FLAGS_DECL;
1638
9d4ba2ae 1639 PERL_UNUSED_ARG(data);
a30b2f1f 1640 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9 1641
6eb5f6b9
JH
1642 cache_re(prog);
1643#ifdef DEBUGGING
aea4f609 1644 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1645#endif
1646
1647 /* Be paranoid... */
1648 if (prog == NULL || startpos == NULL) {
1649 Perl_croak(aTHX_ "NULL regexp parameter");
1650 return 0;
1651 }
1652
1653 minlen = prog->minlen;
61a36c01 1654 if (strend - startpos < minlen) {
a3621e74 1655 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1656 "String too short [regexec_flags]...\n"));
1657 goto phooey;
1aa99e6b 1658 }
6eb5f6b9 1659
6eb5f6b9
JH
1660 /* Check validity of program. */
1661 if (UCHARAT(prog->program) != REG_MAGIC) {
1662 Perl_croak(aTHX_ "corrupted regexp program");
1663 }
1664
1665 PL_reg_flags = 0;
1666 PL_reg_eval_set = 0;
1667 PL_reg_maxiter = 0;
1668
1669 if (prog->reganch & ROPT_UTF8)
1670 PL_reg_flags |= RF_utf8;
1671
1672 /* Mark beginning of line for ^ and lookbehind. */
1673 PL_regbol = startpos;
1674 PL_bostr = strbeg;
1675 PL_reg_sv = sv;
1676
1677 /* Mark end of line for $ (and such) */
1678 PL_regeol = strend;
1679
1680 /* see how far we have to get to not match where we matched before */
1681 PL_regtill = startpos+minend;
1682
1683 /* We start without call_cc context. */
1684 PL_reg_call_cc = 0;
1685
1686 /* If there is a "must appear" string, look for it. */
1687 s = startpos;
1688
1689 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1690 MAGIC *mg;
1691
1692 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1693 PL_reg_ganch = startpos;
1694 else if (sv && SvTYPE(sv) >= SVt_PVMG
1695 && SvMAGIC(sv)
14befaf4
DM
1696 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1697 && mg->mg_len >= 0) {
6eb5f6b9
JH
1698 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1699 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1700 if (s > PL_reg_ganch)
6eb5f6b9
JH
1701 goto phooey;
1702 s = PL_reg_ganch;
1703 }
1704 }
1705 else /* pos() not defined */
1706 PL_reg_ganch = strbeg;
1707 }
1708
a0714e2c 1709 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1710 re_scream_pos_data d;
1711
1712 d.scream_olds = &scream_olds;
1713 d.scream_pos = &scream_pos;
1714 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1715 if (!s) {
a3621e74 1716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1717 goto phooey; /* not present */
3fa9c3d7 1718 }
6eb5f6b9
JH
1719 }
1720
a3621e74 1721 DEBUG_EXECUTE_r({
1df70142
AL
1722 const char * const s0 = UTF
1723 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1724 UNI_DISPLAY_REGEX)
1725 : prog->precomp;
1726 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1727 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1728 UNI_DISPLAY_REGEX) : startpos;
1df70142 1729 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1730 if (!PL_colorset)
1731 reginitcolors();
1732 PerlIO_printf(Perl_debug_log,
a0288114 1733 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1734 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1735 len0, len0, s0,
2a782b5b 1736 PL_colors[1],
9e55ce06 1737 len0 > 60 ? "..." : "",
2a782b5b 1738 PL_colors[0],
9e55ce06
JH
1739 (int)(len1 > 60 ? 60 : len1),
1740 s1, PL_colors[1],
1741 (len1 > 60 ? "..." : "")
2a782b5b
JH
1742 );
1743 });
6eb5f6b9
JH
1744
1745 /* Simplest case: anchored match need be tried only once. */
1746 /* [unless only anchor is BOL and multiline is set] */
1747 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1748 if (s == startpos && regtry(prog, startpos))
1749 goto got_it;
7fba1cd6 1750 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1751 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1752 {
1753 char *end;
1754
1755 if (minlen)
1756 dontbother = minlen - 1;
1aa99e6b 1757 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1758 /* for multiline we only have to try after newlines */
33b8afdf 1759 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1760 if (s == startpos)
1761 goto after_try;
1762 while (1) {
1763 if (regtry(prog, s))
1764 goto got_it;
1765 after_try:
1766 if (s >= end)
1767 goto phooey;
1768 if (prog->reganch & RE_USE_INTUIT) {
1769 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1770 if (!s)
1771 goto phooey;
1772 }
1773 else
1774 s++;
1775 }
1776 } else {
1777 if (s > startpos)
1778 s--;
1779 while (s < end) {
1780 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1781 if (regtry(prog, s))
1782 goto got_it;
1783 }
1784 }
1785 }
1786 }
1787 goto phooey;
1788 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1789 if (regtry(prog, PL_reg_ganch))
1790 goto got_it;
1791 goto phooey;
1792 }
1793
1794 /* Messy cases: unanchored match. */
33b8afdf 1795 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1796 /* we have /x+whatever/ */
1797 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1798 char ch;
bf93d4cc
GS
1799#ifdef DEBUGGING
1800 int did_match = 0;
1801#endif
33b8afdf
JH
1802 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1803 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1804 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1805
1aa99e6b 1806 if (do_utf8) {
6eb5f6b9
JH
1807 while (s < strend) {
1808 if (*s == ch) {
a3621e74 1809 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1810 if (regtry(prog, s)) goto got_it;
1811 s += UTF8SKIP(s);
1812 while (s < strend && *s == ch)
1813 s += UTF8SKIP(s);
1814 }
1815 s += UTF8SKIP(s);
1816 }
1817 }
1818 else {
1819 while (s < strend) {
1820 if (*s == ch) {
a3621e74 1821 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1822 if (regtry(prog, s)) goto got_it;
1823 s++;
1824 while (s < strend && *s == ch)
1825 s++;
1826 }
1827 s++;
1828 }
1829 }
a3621e74 1830 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1831 PerlIO_printf(Perl_debug_log,
b7953727
JH
1832 "Did not find anchored character...\n")
1833 );
6eb5f6b9 1834 }
a0714e2c
SS
1835 else if (prog->anchored_substr != NULL
1836 || prog->anchored_utf8 != NULL
1837 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1838 && prog->float_max_offset < strend - s)) {
1839 SV *must;
1840 I32 back_max;
1841 I32 back_min;
1842 char *last;
6eb5f6b9 1843 char *last1; /* Last position checked before */
bf93d4cc
GS
1844#ifdef DEBUGGING
1845 int did_match = 0;
1846#endif
33b8afdf
JH
1847 if (prog->anchored_substr || prog->anchored_utf8) {
1848 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1849 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1850 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1851 back_max = back_min = prog->anchored_offset;
1852 } else {
1853 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1854 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1855 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1856 back_max = prog->float_max_offset;
1857 back_min = prog->float_min_offset;
1858 }
1859 if (must == &PL_sv_undef)
1860 /* could not downgrade utf8 check substring, so must fail */
1861 goto phooey;
1862
1863 last = HOP3c(strend, /* Cannot start after this */
1864 -(I32)(CHR_SVLEN(must)
1865 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1866
1867 if (s > PL_bostr)
1868 last1 = HOPc(s, -1);
1869 else
1870 last1 = s - 1; /* bogus */
1871
a0288114 1872 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1873 check_substr==must. */
1874 scream_pos = -1;
1875 dontbother = end_shift;
1876 strend = HOPc(strend, -dontbother);
1877 while ( (s <= last) &&
9041c2e3 1878 ((flags & REXEC_SCREAM)
1aa99e6b 1879 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1880 end_shift, &scream_pos, 0))
1aa99e6b 1881 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1882 (unsigned char*)strend, must,
7fba1cd6 1883 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1884 /* we may be pointing at the wrong string */
1885 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1886 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1887 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1888 if (HOPc(s, -back_max) > last1) {
1889 last1 = HOPc(s, -back_min);
1890 s = HOPc(s, -back_max);
1891 }
1892 else {
52657f30 1893 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1894
1895 last1 = HOPc(s, -back_min);
52657f30 1896 s = t;
6eb5f6b9 1897 }
1aa99e6b 1898 if (do_utf8) {
6eb5f6b9
JH
1899 while (s <= last1) {
1900 if (regtry(prog, s))
1901 goto got_it;
1902 s += UTF8SKIP(s);
1903 }
1904 }
1905 else {
1906 while (s <= last1) {
1907 if (regtry(prog, s))
1908 goto got_it;
1909 s++;
1910 }
1911 }
1912 }
a3621e74 1913 DEBUG_EXECUTE_r(if (!did_match)
b7953727 1914 PerlIO_printf(Perl_debug_log,
a0288114 1915 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 1916 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1917 ? "anchored" : "floating"),
1918 PL_colors[0],
1919 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 1920 SvPVX_const(must),
b7953727
JH
1921 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1922 );
6eb5f6b9
JH
1923 goto phooey;
1924 }
155aba94 1925 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1926 if (minlen) {
1927 I32 op = (U8)OP(prog->regstclass);
66e933ab 1928 /* don't bother with what can't match */
f14c76ed
RGS
1929 if (PL_regkind[op] != EXACT && op != CANY)
1930 strend = HOPc(strend, -(minlen - 1));
1931 }
a3621e74 1932 DEBUG_EXECUTE_r({
ffc61ed2 1933 SV *prop = sv_newmortal();
cfd0369c
NC
1934 const char *s0;
1935 const char *s1;
9e55ce06
JH
1936 int len0;
1937 int len1;
1938
ffc61ed2 1939 regprop(prop, c);
9e55ce06 1940 s0 = UTF ?
3f7c398e 1941 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 1942 UNI_DISPLAY_REGEX) :
cfd0369c 1943 SvPVX_const(prop);
9e55ce06
JH
1944 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1945 s1 = UTF ?
c728cb41 1946 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1947 len1 = UTF ? SvCUR(dsv1) : strend - s;
1948 PerlIO_printf(Perl_debug_log,
a0288114 1949 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
9e55ce06
JH
1950 len0, len0, s0,
1951 len1, len1, s1);
ffc61ed2 1952 });
06b5626a 1953 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1954 goto got_it;
a3621e74 1955 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1956 }
1957 else {
1958 dontbother = 0;
a0714e2c 1959 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1960 /* Trim the end. */
d6a28714 1961 char *last;
33b8afdf
JH
1962 SV* float_real;
1963
1964 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1965 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1966 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1967
1968 if (flags & REXEC_SCREAM) {
33b8afdf 1969 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1970 end_shift, &scream_pos, 1); /* last one */
1971 if (!last)
ffc61ed2 1972 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1973 /* we may be pointing at the wrong string */
1974 else if (RX_MATCH_COPIED(prog))
3f7c398e 1975 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1976 }
d6a28714
JH
1977 else {
1978 STRLEN len;
cfd0369c 1979 const char * const little = SvPV_const(float_real, len);
d6a28714 1980
33b8afdf 1981 if (SvTAIL(float_real)) {
d6a28714
JH
1982 if (memEQ(strend - len + 1, little, len - 1))
1983 last = strend - len + 1;
7fba1cd6 1984 else if (!multiline)
9041c2e3 1985 last = memEQ(strend - len, little, len)
bd61b366 1986 ? strend - len : NULL;
b8c5462f 1987 else
d6a28714
JH
1988 goto find_last;
1989 } else {
1990 find_last:
9041c2e3 1991 if (len)
d6a28714 1992 last = rninstr(s, strend, little, little + len);
b8c5462f 1993 else
a0288114 1994 last = strend; /* matching "$" */
b8c5462f 1995 }
b8c5462f 1996 }
bf93d4cc 1997 if (last == NULL) {
a3621e74 1998 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1999 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2000 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2001 goto phooey; /* Should not happen! */
2002 }
d6a28714
JH
2003 dontbother = strend - last + prog->float_min_offset;
2004 }
2005 if (minlen && (dontbother < minlen))
2006 dontbother = minlen - 1;
2007 strend -= dontbother; /* this one's always in bytes! */
2008 /* We don't know much -- general case. */
1aa99e6b 2009 if (do_utf8) {
d6a28714
JH
2010 for (;;) {
2011 if (regtry(prog, s))
2012 goto got_it;
2013 if (s >= strend)
2014 break;
b8c5462f 2015 s += UTF8SKIP(s);
d6a28714
JH
2016 };
2017 }
2018 else {
2019 do {
2020 if (regtry(prog, s))
2021 goto got_it;
2022 } while (s++ < strend);
2023 }
2024 }
2025
2026 /* Failure. */
2027 goto phooey;
2028
2029got_it:
2030 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2031
2032 if (PL_reg_eval_set) {
2033 /* Preserve the current value of $^R */
2034 if (oreplsv != GvSV(PL_replgv))
2035 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2036 restored, the value remains
2037 the same. */
acfe0abc 2038 restore_pos(aTHX_ 0);
d6a28714
JH
2039 }
2040
2041 /* make sure $`, $&, $', and $digit will work later */
2042 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2043 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2044 if (flags & REXEC_COPY_STR) {
2045 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2046#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2047 if ((SvIsCOW(sv)
2048 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2049 if (DEBUG_C_TEST) {
2050 PerlIO_printf(Perl_debug_log,
2051 "Copy on write: regexp capture, type %d\n",
2052 (int) SvTYPE(sv));
2053 }
2054 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2055 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2056 assert (SvPOKp(prog->saved_copy));
2057 } else
2058#endif
2059 {
2060 RX_MATCH_COPIED_on(prog);
2061 s = savepvn(strbeg, i);
2062 prog->subbeg = s;
2063 }
d6a28714 2064 prog->sublen = i;
d6a28714
JH
2065 }
2066 else {
2067 prog->subbeg = strbeg;
2068 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2069 }
2070 }
9041c2e3 2071
d6a28714
JH
2072 return 1;
2073
2074phooey:
a3621e74 2075 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2076 PL_colors[4], PL_colors[5]));
d6a28714 2077 if (PL_reg_eval_set)
acfe0abc 2078 restore_pos(aTHX_ 0);
d6a28714
JH
2079 return 0;
2080}
2081
2082/*
2083 - regtry - try match at specific point
2084 */
2085STATIC I32 /* 0 failure, 1 success */
2086S_regtry(pTHX_ regexp *prog, char *startpos)
2087{
97aff369 2088 dVAR;
d6a28714
JH
2089 register I32 i;
2090 register I32 *sp;
2091 register I32 *ep;
2092 CHECKPOINT lastcp;
a3621e74 2093 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2094
02db2b7b
IZ
2095#ifdef DEBUGGING
2096 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2097#endif
d6a28714
JH
2098 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2099 MAGIC *mg;
2100
2101 PL_reg_eval_set = RS_init;
a3621e74 2102 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2103 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2104 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2105 ));
e8347627 2106 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2107 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2108 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2109 SAVETMPS;
2110 /* Apparently this is not needed, judging by wantarray. */
e8347627 2111 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2112 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2113
2114 if (PL_reg_sv) {
2115 /* Make $_ available to executed code. */
2116 if (PL_reg_sv != DEFSV) {
59f00321 2117 SAVE_DEFSV;
d6a28714 2118 DEFSV = PL_reg_sv;
b8c5462f 2119 }
d6a28714 2120
9041c2e3 2121 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2122 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2123 /* prepare for quick setting of pos */
d300d9fa
NC
2124#ifdef PERL_OLD_COPY_ON_WRITE
2125 if (SvIsCOW(sv))
2126 sv_force_normal_flags(sv, 0);
2127#endif
2128 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
2129 &PL_vtbl_mglob, NULL, 0);
d6a28714 2130 mg->mg_len = -1;
b8c5462f 2131 }
d6a28714
JH
2132 PL_reg_magic = mg;
2133 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2134 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2135 }
09687e5a 2136 if (!PL_reg_curpm) {
a02a5408 2137 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2138#ifdef USE_ITHREADS
2139 {
2140 SV* repointer = newSViv(0);
577e12cc 2141 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2142 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2143 av_push(PL_regex_padav,repointer);
2144 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2145 PL_regex_pad = AvARRAY(PL_regex_padav);
2146 }
2147#endif
2148 }
aaa362c4 2149 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2150 PL_reg_oldcurpm = PL_curpm;
2151 PL_curpm = PL_reg_curpm;
2152 if (RX_MATCH_COPIED(prog)) {
2153 /* Here is a serious problem: we cannot rewrite subbeg,
2154 since it may be needed if this match fails. Thus
2155 $` inside (?{}) could fail... */
2156 PL_reg_oldsaved = prog->subbeg;
2157 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2158#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2159 PL_nrs = prog->saved_copy;
2160#endif
d6a28714
JH
2161 RX_MATCH_COPIED_off(prog);
2162 }
2163 else
bd61b366 2164 PL_reg_oldsaved = NULL;
d6a28714
JH
2165 prog->subbeg = PL_bostr;
2166 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2167 }
973dddac 2168 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2169 PL_reginput = startpos;
2170 PL_regstartp = prog->startp;
2171 PL_regendp = prog->endp;
2172 PL_reglastparen = &prog->lastparen;
a01268b5 2173 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2174 prog->lastparen = 0;
03994de8 2175 prog->lastcloseparen = 0;
d6a28714 2176 PL_regsize = 0;
a3621e74 2177 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2178 if (PL_reg_start_tmpl <= prog->nparens) {
2179 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2180 if(PL_reg_start_tmp)
2181 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2182 else
a02a5408 2183 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2184 }
2185
2186 /* XXXX What this code is doing here?!!! There should be no need
2187 to do this again and again, PL_reglastparen should take care of
3dd2943c 2188 this! --ilya*/
dafc8851
JH
2189
2190 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2191 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2192 * PL_reglastparen), is not needed at all by the test suite
2193 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2194 * enough, for building DynaLoader, or otherwise this
2195 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2196 * will happen. Meanwhile, this code *is* needed for the
2197 * above-mentioned test suite tests to succeed. The common theme
2198 * on those tests seems to be returning null fields from matches.
2199 * --jhi */
dafc8851 2200#if 1
d6a28714
JH
2201 sp = prog->startp;
2202 ep = prog->endp;
2203 if (prog->nparens) {
eb160463 2204 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2205 *++sp = -1;
2206 *++ep = -1;
2207 }
2208 }
dafc8851 2209#endif
02db2b7b 2210 REGCP_SET(lastcp);
d6a28714
JH
2211 if (regmatch(prog->program + 1)) {
2212 prog->endp[0] = PL_reginput - PL_bostr;
2213 return 1;
2214 }
02db2b7b 2215 REGCP_UNWIND(lastcp);
d6a28714
JH
2216 return 0;
2217}
2218
02db2b7b
IZ
2219#define RE_UNWIND_BRANCH 1
2220#define RE_UNWIND_BRANCHJ 2
2221
2222union re_unwind_t;
2223
2224typedef struct { /* XX: makes sense to enlarge it... */
2225 I32 type;
2226 I32 prev;
2227 CHECKPOINT lastcp;
2228} re_unwind_generic_t;
2229
2230typedef struct {
2231 I32 type;
2232 I32 prev;
2233 CHECKPOINT lastcp;
2234 I32 lastparen;
2235 regnode *next;
2236 char *locinput;
2237 I32 nextchr;
2238#ifdef DEBUGGING
2239 int regindent;
2240#endif
2241} re_unwind_branch_t;
2242
2243typedef union re_unwind_t {
2244 I32 type;
2245 re_unwind_generic_t generic;
2246 re_unwind_branch_t branch;
2247} re_unwind_t;
2248
8ba1375e
MJD
2249#define sayYES goto yes
2250#define sayNO goto no
e0f9d4a8 2251#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2252#define sayYES_FINAL goto yes_final
2253#define sayYES_LOUD goto yes_loud
2254#define sayNO_FINAL goto no_final
2255#define sayNO_SILENT goto do_no
2256#define saySAME(x) if (x) goto yes; else goto no
2257
3ab3c9b4
HS
2258#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2259#define POSCACHE_SEEN 1 /* we know what we're caching */
2260#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2261#define CACHEsayYES STMT_START { \
2262 if (cache_offset | cache_bit) { \
2263 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2264 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2265 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2266 /* cache records failure, but this is success */ \
2267 DEBUG_r( \
2268 PerlIO_printf(Perl_debug_log, \
2269 "%*s (remove success from failure cache)\n", \
2270 REPORT_CODE_OFF+PL_regindent*2, "") \
2271 ); \
2272 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2273 } \
2274 } \
2275 sayYES; \
2276} STMT_END
2277#define CACHEsayNO STMT_START { \
2278 if (cache_offset | cache_bit) { \
2279 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2280 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2281 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2282 /* cache records success, but this is failure */ \
2283 DEBUG_r( \
2284 PerlIO_printf(Perl_debug_log, \
2285 "%*s (remove failure from success cache)\n", \
2286 REPORT_CODE_OFF+PL_regindent*2, "") \
2287 ); \
2288 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2289 } \
2290 } \
2291 sayNO; \
2292} STMT_END
2293
a3621e74
YO
2294/* this is used to determine how far from the left messages like
2295 'failed...' are printed. Currently 29 makes these messages line
2296 up with the opcode they refer to. Earlier perls used 25 which
2297 left these messages outdented making reviewing a debug output
2298 quite difficult.
2299*/
2300#define REPORT_CODE_OFF 29
2301
2302
2303/* Make sure there is a test for this +1 options in re_tests */
2304#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2305
5b47454d 2306
95b24440
DM
2307/* simulate a recursive call to regmatch */
2308
2309#define REGMATCH(ns, where) \
2310 new_scan = (ns); \
2311 resume_state = resume_##where; \
2312 goto start_recurse; \
2313 resume_point_##where:
2314
2315typedef enum {
2316 resume_TRIE1,
2317 resume_TRIE2,
2318 resume_CURLYX,
2319 resume_WHILEM1,
2320 resume_WHILEM2,
2321 resume_WHILEM3,
2322 resume_WHILEM4,
2323 resume_WHILEM5,
2324 resume_WHILEM6,
2325 resume_CURLYM1,
2326 resume_CURLYM2,
2327 resume_CURLYM3,
2328 resume_CURLYM4,
2329 resume_IFMATCH,
2330 resume_PLUS1,
2331 resume_PLUS2,
2332 resume_PLUS3,
2333 resume_PLUS4,
2334 resume_END
2335} resume_states;
2336
2337
2338struct regmatch_state {
2339 struct regmatch_state *prev_state;
2340 resume_states resume_state;
2341 regnode *scan;
2342 regnode *next;
2343 bool minmod;
2344 bool sw;
2345 int logical;
2346 I32 unwind;
2347 CURCUR *cc;
2348 char *locinput;
2349
2350 I32 n;
2351 I32 ln;
2352 I32 c1, c2, paren;
2353 CHECKPOINT cp, lastcp;
2354 CURCUR *oldcc;
2355 char *lastloc;
2356 I32 cache_offset, cache_bit;
2357 I32 curlym_l;
2358 I32 matches;
2359 I32 maxwanted;
2360 char *e;
2361 char *old;
2362 int count;
2363 re_cc_state *cur_call_cc;
2364 regexp *end_re;
2365 reg_trie_accepted *accept_buff;
2366 U32 accepted;
2367
2368 re_cc_state *reg_call_cc; /* saved value of PL_reg_call_cc */
2369};
8ba1375e 2370
d6a28714
JH
2371/*
2372 - regmatch - main matching routine
2373 *
2374 * Conceptually the strategy is simple: check to see whether the current
2375 * node matches, call self recursively to see whether the rest matches,
2376 * and then act accordingly. In practice we make some effort to avoid
2377 * recursion, in particular by going through "ordinary" nodes (that don't
2378 * need to know whether the rest of the match failed) by a loop instead of
2379 * by recursion.
2380 */
2381/* [lwall] I've hoisted the register declarations to the outer block in order to
2382 * maybe save a little bit of pushing and popping on the stack. It also takes
2383 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2384 *
2385 * This function used to be heavily recursive, but since this had the
2386 * effect of blowing the CPU stack on complex regexes, it has been
2387 * restructured to be iterative, and to save state onto the heap rather
2388 * than the stack. Essentially whereever regmatch() used to be called, it
2389 * pushes the current state, notes where to return, then jumps back into
2390 * the main loop.
2391 *
2392 * Originally the structure of this function used to look something like
2393
2394 S_regmatch() {
2395 int a = 1, b = 2;
2396 ...
2397 while (scan != NULL) {
2398 ...
2399 switch (OP(scan)) {
2400 case FOO: {
2401 int local = 3;
2402 ...
2403 if (regmatch(...)) // recurse
2404 goto yes;
2405 }
2406 ...
2407 }
2408 }
2409 yes:
2410 return 1;
2411 }
2412
2413 * Now it looks something like this:
2414
2415 struct regmatch_state {
2416 int a, b, local;
2417 int resume_state;
2418 };
2419
2420 S_regmatch() {
2421 int a = 1, b = 2;
2422 int local;
2423 ...
2424 while (scan != NULL) {
2425 ...
2426 switch (OP(scan)) {
2427 case FOO: {
2428 local = 3;
2429 ...
2430 resume_state = resume_FOO;
2431 new_scan = ...;
2432 goto start_recurse;
2433 resume_point_FOO:
2434
2435 if (result) // recurse
2436 goto yes;
2437 }
2438 ...
2439 }
2440 start_recurse:
2441 ...push a, b, local onto the heap
2442 a = 1; b = 2;
2443 scan = new_scan;
2444 }
2445 yes:
2446 result = 1;
2447 if (states pushed on heap) {
2448 ... restore a, b, local from heap
2449 switch (resume_state) {
2450 case resume_FOO:
2451 goto resume_point_FOO;
2452 ...
2453 }
2454 }
2455 return result
2456 }
2457
2458 * WARNING: this means that any line in this function that contains a
2459 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2460 * regmatch() using gotos instead. Thus the values of any local variables
2461 * not saved in the regmatch_state structure will have been lost when
2462 * execution resumes on the next line .
d6a28714 2463 */
95b24440
DM
2464
2465
d6a28714
JH
2466STATIC I32 /* 0 failure, 1 success */
2467S_regmatch(pTHX_ regnode *prog)
2468{
27da23d5 2469 dVAR;
95b24440
DM
2470 register const bool do_utf8 = PL_reg_match_utf8;
2471 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2472
2473 /************************************************************
2474 * the following variabes are saved and restored on each fake
2475 * recursive call to regmatch:
2476 *
2477 * The first ones contain state that needs to be maintained
2478 * across the main while loop: */
2479
2480 struct regmatch_state *prev_state = NULL; /* stack of pushed states */
2481 resume_states resume_state; /* where to jump to on return */
d6a28714
JH
2482 register regnode *scan; /* Current node. */
2483 regnode *next; /* Next node. */
95b24440
DM
2484 bool minmod = 0; /* the next {n.m} is a {n,m}? */
2485 bool sw = 0; /* the condition value in (?(cond)a|b) */
2486 int logical = 0;
2487 I32 unwind = 0; /* savestack index of current unwind block */
2488 CURCUR *cc = NULL; /* current innermost curly struct */
d6a28714 2489 register char *locinput = PL_reginput;
a3621e74 2490
95b24440
DM
2491 /* while the rest of these are local to an individual branch, and
2492 * have only been hoisted into this outer scope to allow for saving and
2493 * restoration - thus they can be safely reused in other branches.
2494 * Note that they are only initialized here to silence compiler
2495 * warnings :-( */
a3621e74 2496
95b24440
DM
2497 register I32 n = 0; /* no or next */
2498 register I32 ln = 0; /* len or last */
2499 register I32 c1 = 0, c2 = 0, paren = 0; /* case fold search, parenth */
2500 CHECKPOINT cp = 0; /* remember current savestack indexes */
2501 CHECKPOINT lastcp = 0;
2502 CURCUR *oldcc = NULL; /* tmp copy of cc */
2503 char *lastloc = NULL; /* Detection of 0-len. */
2504 I32 cache_offset = 0;
2505 I32 cache_bit = 0;
2506 I32 curlym_l = 0;
2507 I32 matches = 0;
2508 I32 maxwanted = 0;
2509 char *e = NULL;
2510 char *old = NULL;
2511 int count = 0;
2512 re_cc_state *cur_call_cc = NULL;
2513 regexp *end_re = NULL;
2514 reg_trie_accepted *accept_buff = NULL;
2515 U32 accepted = 0; /* how many accepting states we have seen */
2516
2517 /************************************************************
2518 * these variables are NOT saved: */
2519
2520 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2521 regnode *new_scan; /* node to begin exedcution at when recursing */
2522 bool result; /* return value of S_regmatch */
2523
2524 regnode *inner; /* Next node in internal branch. */
2525
2526#ifdef DEBUGGING
ab74612d 2527 SV *re_debug_flags = NULL;
a3621e74 2528 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2529 PL_regindent++;
2530#endif
2531
2532 /* Note that nextchr is a byte even in UTF */
2533 nextchr = UCHARAT(locinput);
2534 scan = prog;
2535 while (scan != NULL) {
8ba1375e 2536
a3621e74 2537 DEBUG_EXECUTE_r( {
6136c704 2538 SV * const prop = sv_newmortal();
1df70142
AL
2539 const int docolor = *PL_colors[0];
2540 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2541 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2542 /* The part of the string before starttry has one color
2543 (pref0_len chars), between starttry and current
2544 position another one (pref_len - pref0_len chars),
2545 after the current position the third one.
2546 We assume that pref0_len <= pref_len, otherwise we
2547 decrease pref0_len. */
9041c2e3 2548 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2549 ? (5 + taill) - l : locinput - PL_bostr;
2550 int pref0_len;
d6a28714 2551
df1ffd02 2552 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2553 pref_len++;
2554 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2555 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2556 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2557 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2558 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2559 l--;
d6a28714
JH
2560 if (pref0_len < 0)
2561 pref0_len = 0;
2562 if (pref0_len > pref_len)
2563 pref0_len = pref_len;
2564 regprop(prop, scan);
2a782b5b 2565 {
1df70142 2566 const char * const s0 =
f14c76ed 2567 do_utf8 && OP(scan) != CANY ?
95b24440 2568 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
c728cb41 2569 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2570 locinput - pref_len;
1df70142
AL
2571 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2572 const char * const s1 = do_utf8 && OP(scan) != CANY ?
95b24440
DM
2573 pv_uni_display(PERL_DEBUG_PAD(1),
2574 (U8*)(locinput - pref_len + pref0_len),
c728cb41 2575 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2576 locinput - pref_len + pref0_len;
1df70142
AL
2577 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2578 const char * const s2 = do_utf8 && OP(scan) != CANY ?
95b24440 2579 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
c728cb41 2580 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2581 locinput;
1df70142 2582 const int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2583 PerlIO_printf(Perl_debug_log,
2584 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2585 (IV)(locinput - PL_bostr),
2586 PL_colors[4],
2587 len0, s0,
2588 PL_colors[5],
2589 PL_colors[2],
2590 len1, s1,
2591 PL_colors[3],
2592 (docolor ? "" : "> <"),
2593 PL_colors[0],
2594 len2, s2,
2595 PL_colors[1],
2596 15 - l - pref_len + 1,
2597 "",
2598 (IV)(scan - PL_regprogram), PL_regindent*2, "",
3f7c398e 2599 SvPVX_const(prop));
2a782b5b
JH
2600 }
2601 });
d6a28714
JH
2602
2603 next = scan + NEXT_OFF(scan);
2604 if (next == scan)
2605 next = NULL;
2606
2607 switch (OP(scan)) {
2608 case BOL:
7fba1cd6 2609 if (locinput == PL_bostr)
d6a28714
JH
2610 {
2611 /* regtill = regbol; */
b8c5462f
JH
2612 break;
2613 }
d6a28714
JH
2614 sayNO;
2615 case MBOL:
12d33761
HS
2616 if (locinput == PL_bostr ||
2617 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2618 {
b8c5462f
JH
2619 break;
2620 }
d6a28714
JH
2621 sayNO;
2622 case SBOL:
c2a73568 2623 if (locinput == PL_bostr)
b8c5462f 2624 break;
d6a28714
JH
2625 sayNO;
2626 case GPOS:
2627 if (locinput == PL_reg_ganch)
2628 break;
2629 sayNO;
2630 case EOL:
d6a28714
JH
2631 goto seol;
2632 case MEOL:
d6a28714 2633 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2634 sayNO;
b8c5462f 2635 break;
d6a28714
JH
2636 case SEOL:
2637 seol:
2638 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2639 sayNO;
d6a28714 2640 if (PL_regeol - locinput > 1)
b8c5462f 2641 sayNO;
b8c5462f 2642 break;
d6a28714
JH
2643 case EOS:
2644 if (PL_regeol != locinput)
b8c5462f 2645 sayNO;
d6a28714 2646 break;
ffc61ed2 2647 case SANY:
d6a28714 2648 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2649 sayNO;
f33976b4
DB
2650 if (do_utf8) {
2651 locinput += PL_utf8skip[nextchr];
2652 if (locinput > PL_regeol)
2653 sayNO;
2654 nextchr = UCHARAT(locinput);
2655 }
2656 else
2657 nextchr = UCHARAT(++locinput);
2658 break;
2659 case CANY:
2660 if (!nextchr && locinput >= PL_regeol)
2661 sayNO;
b8c5462f 2662 nextchr = UCHARAT(++locinput);
a0d0e21e 2663 break;
ffc61ed2 2664 case REG_ANY:
1aa99e6b
IH
2665 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2666 sayNO;
2667 if (do_utf8) {
b8c5462f 2668 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2669 if (locinput > PL_regeol)
2670 sayNO;
a0ed51b3 2671 nextchr = UCHARAT(locinput);
a0ed51b3 2672 }
1aa99e6b
IH
2673 else
2674 nextchr = UCHARAT(++locinput);
a0ed51b3 2675 break;
a3621e74
YO
2676
2677
2678
2679 /*
2680 traverse the TRIE keeping track of all accepting states
2681 we transition through until we get to a failing node.
2682
a3621e74
YO
2683
2684 */
5b47454d 2685 case TRIE:
a3621e74
YO
2686 case TRIEF:
2687 case TRIEFL:
2688 {
a3621e74
YO
2689 U8 *uc = ( U8* )locinput;
2690 U32 state = 1;
2691 U16 charid = 0;
2692 U32 base = 0;
2693 UV uvc = 0;
2694 STRLEN len = 0;
2695 STRLEN foldlen = 0;
a3621e74
YO
2696 U8 *uscan = (U8*)NULL;
2697 STRLEN bufflen=0;
95b24440 2698 SV *sv_accept_buff = NULL;
5b47454d
DM
2699 const enum { trie_plain, trie_utf8, trie_uft8_fold }
2700 trie_type = do_utf8 ?
2701 (OP(scan) == TRIE ? trie_utf8 : trie_uft8_fold)
2702 : trie_plain;
2703
7087a21c
NC
2704 /* what trie are we using right now */
2705 reg_trie_data *trie
2706 = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
95b24440
DM
2707 accepted = 0; /* how many accepting states we have seen */
2708 result = 0;
a3621e74
YO
2709
2710 while ( state && uc <= (U8*)PL_regeol ) {
2711
5b47454d
DM
2712 if (trie->states[ state ].wordnum) {
2713 if (!accepted ) {
2714 ENTER;
2715 SAVETMPS;
2716 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2717 sv_accept_buff=newSV(bufflen *
2718 sizeof(reg_trie_accepted) - 1);
2719 SvCUR_set(sv_accept_buff,
2720 sizeof(reg_trie_accepted));
2721 SvPOK_on(sv_accept_buff);
2722 sv_2mortal(sv_accept_buff);
2723 accept_buff =
2724 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2725 }
2726 else {
2727 if (accepted >= bufflen) {
2728 bufflen *= 2;
2729 accept_buff =(reg_trie_accepted*)
2730 SvGROW(sv_accept_buff,
2731 bufflen * sizeof(reg_trie_accepted));
2732 }
2733 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2734 + sizeof(reg_trie_accepted));
2735 }
2736 accept_buff[accepted].wordnum = trie->states[state].wordnum;
2737 accept_buff[accepted].endpos = uc;
2738 ++accepted;
2739 }
a3621e74
YO
2740
2741 base = trie->states[ state ].trans.base;
2742
2743 DEBUG_TRIE_EXECUTE_r(
2744 PerlIO_printf( Perl_debug_log,
e4584336 2745 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2746 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2747 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2748 );
2749
2750 if ( base ) {
5b47454d
DM
2751 switch (trie_type) {
2752 case trie_uft8_fold:
a3621e74
YO
2753 if ( foldlen>0 ) {
2754 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2755 foldlen -= len;
2756 uscan += len;
2757 len=0;
2758 } else {
1df70142 2759 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2760 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2761 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2762 foldlen -= UNISKIP( uvc );
2763 uscan = foldbuf + UNISKIP( uvc );
2764 }
5b47454d
DM
2765 break;
2766 case trie_utf8:
2767 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2768 &len, uniflags );
2769 break;
2770 case trie_plain:
e4584336 2771 uvc = (UV)*uc;
a3621e74
YO
2772 len = 1;
2773 }
2774
5b47454d
DM
2775 if (uvc < 256) {
2776 charid = trie->charmap[ uvc ];
2777 }
2778 else {
2779 charid = 0;
2780 if (trie->widecharmap) {
2781 SV** svpp = (SV**)NULL;
2782 svpp = hv_fetch(trie->widecharmap,
2783 (char*)&uvc, sizeof(UV), 0);
2784 if (svpp)
2785 charid = (U16)SvIV(*svpp);
2786 }
2787 }
a3621e74 2788
5b47454d
DM
2789 if (charid &&
2790 (base + charid > trie->uniquecharcount )
2791 && (base + charid - 1 - trie->uniquecharcount
2792 < trie->lasttrans)
2793 && trie->trans[base + charid - 1 -
2794 trie->uniquecharcount].check == state)
2795 {
2796 state = trie->trans[base + charid - 1 -
2797 trie->uniquecharcount ].next;
2798 }
2799 else {
2800 state = 0;
2801 }
2802 uc += len;
2803
2804 }
2805 else {
a3621e74
YO
2806 state = 0;
2807 }
2808 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2809 PerlIO_printf( Perl_debug_log,
2810 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2811 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2812 );
2813 }
5b47454d 2814 if (!accepted )
a3621e74 2815 sayNO;
a3621e74
YO
2816
2817 /*
2818 There was at least one accepting state that we
2819 transitioned through. Presumably the number of accepting
2820 states is going to be low, typically one or two. So we
2821 simply scan through to find the one with lowest wordnum.
2822 Once we find it, we swap the last state into its place
2823 and decrement the size. We then try to match the rest of
2824 the pattern at the point where the word ends, if we
2825 succeed then we end the loop, otherwise the loop
2826 eventually terminates once all of the accepting states
2827 have been tried.
2828 */
a3621e74
YO
2829
2830 if ( accepted == 1 ) {
2831 DEBUG_EXECUTE_r({
2832 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2833 PerlIO_printf( Perl_debug_log,
2834 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2835 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2836 accept_buff[ 0 ].wordnum,
cfd0369c 2837 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
a3621e74
YO
2838 PL_colors[5] );
2839 });
cc601c31 2840 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2841 /* in this case we free tmps/leave before we call regmatch
2842 as we wont be using accept_buff again. */
2843 FREETMPS;
2844 LEAVE;
95b24440
DM
2845 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
2846 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2847 } else {
2848 DEBUG_EXECUTE_r(
e4584336
RB
2849 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2850 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2851 PL_colors[5] );
2852 );
95b24440 2853 while ( !result && accepted-- ) {
a3621e74
YO
2854 U32 best = 0;
2855 U32 cur;
2856 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2857 DEBUG_TRIE_EXECUTE_r(
2858 PerlIO_printf( Perl_debug_log,
2859 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2860 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2861 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2862 accept_buff[ cur ].wordnum, PL_colors[5] );
2863 );
a3621e74
YO
2864
2865 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2866 best = cur;
2867 }
2868 DEBUG_EXECUTE_r({
6136c704 2869 SV ** const tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
a3621e74 2870 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2871 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74 2872 accept_buff[best].wordnum,
cfd0369c 2873 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",scan,
a3621e74
YO
2874 PL_colors[5] );
2875 });
2876 if ( best<accepted ) {
2877 reg_trie_accepted tmp = accept_buff[ best ];
2878 accept_buff[ best ] = accept_buff[ accepted ];
2879 accept_buff[ accepted ] = tmp;
2880 best = accepted;
2881 }
cc601c31 2882 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2883
2884 /*
2885 as far as I can tell we only need the SAVETMPS/FREETMPS
2886 for re's with EVAL in them but I'm leaving them in for
2887 all until I can be sure.
2888 */
2889 SAVETMPS;
95b24440
DM
2890 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
2891 /*** all unsaved local vars undefined at this point */
a3621e74
YO
2892 FREETMPS;
2893 }
2894 FREETMPS;
2895 LEAVE;
2896 }
2897
95b24440 2898 if (result) {
a3621e74
YO
2899 sayYES;
2900 } else {
2901 sayNO;
2902 }
2903 }
2904 /* unreached codepoint */
95b24440
DM
2905 case EXACT: {
2906 char *s = STRING(scan);
cd439c50 2907 ln = STR_LEN(scan);
eb160463 2908 if (do_utf8 != UTF) {
bc517b45 2909 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2910 char *l = locinput;
a3b680e6 2911 const char *e = s + ln;
a72c7584 2912
5ff6fc6d
JH
2913 if (do_utf8) {
2914 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2915 while (s < e) {
a3b680e6 2916 STRLEN ulen;
1aa99e6b 2917 if (l >= PL_regeol)
5ff6fc6d
JH
2918 sayNO;
2919 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2920 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2921 uniflags))
5ff6fc6d 2922 sayNO;
bc517b45 2923 l += ulen;
5ff6fc6d 2924 s ++;
1aa99e6b 2925 }
5ff6fc6d
JH
2926 }
2927 else {
2928 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2929 while (s < e) {
a3b680e6 2930 STRLEN ulen;
1aa99e6b
IH
2931 if (l >= PL_regeol)
2932 sayNO;
5ff6fc6d 2933 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2934 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2935 uniflags))
1aa99e6b 2936 sayNO;
bc517b45 2937 s += ulen;
a72c7584 2938 l ++;
1aa99e6b 2939 }
5ff6fc6d 2940 }
1aa99e6b
IH
2941 locinput = l;
2942 nextchr = UCHARAT(locinput);
2943 break;
2944 }
bc517b45 2945 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2946 /* Inline the first character, for speed. */
2947 if (UCHARAT(s) != nextchr)
2948 sayNO;
2949 if (PL_regeol - locinput < ln)
2950 sayNO;
2951 if (ln > 1 && memNE(s, locinput, ln))
2952 sayNO;
2953 locinput += ln;
2954 nextchr = UCHARAT(locinput);
2955 break;
95b24440 2956 }
d6a28714 2957 case EXACTFL:
b8c5462f
JH
2958 PL_reg_flags |= RF_tainted;
2959 /* FALL THROUGH */
95b24440
DM
2960 case EXACTF: {
2961 char *s = STRING(scan);
cd439c50 2962 ln = STR_LEN(scan);
d6a28714 2963
d07ddd77
JH
2964 if (do_utf8 || UTF) {
2965 /* Either target or the pattern are utf8. */
d6a28714 2966 char *l = locinput;
d07ddd77 2967 char *e = PL_regeol;
bc517b45 2968
eb160463 2969 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2970 l, &e, 0, do_utf8)) {
5486206c
JH
2971 /* One more case for the sharp s:
2972 * pack("U0U*", 0xDF) =~ /ss/i,
2973 * the 0xC3 0x9F are the UTF-8
2974 * byte sequence for the U+00DF. */
2975 if (!(do_utf8 &&
2976 toLOWER(s[0]) == 's' &&
2977 ln >= 2 &&
2978 toLOWER(s[1]) == 's' &&
2979 (U8)l[0] == 0xC3 &&
2980 e - l >= 2 &&
2981 (U8)l[1] == 0x9F))
2982 sayNO;
2983 }
d07ddd77
JH
2984 locinput = e;
2985 nextchr = UCHARAT(locinput);
2986 break;
a0ed51b3 2987 }
d6a28714 2988
bc517b45
JH
2989 /* Neither the target and the pattern are utf8. */
2990
d6a28714
JH
2991 /* Inline the first character, for speed. */
2992 if (UCHARAT(s) != nextchr &&
2993 UCHARAT(s) != ((OP(scan) == EXACTF)
2994 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2995 sayNO;
d6a28714 2996 if (PL_regeol - locinput < ln)
b8c5462f 2997 sayNO;
d6a28714
JH
2998 if (ln > 1 && (OP(scan) == EXACTF
2999 ? ibcmp(s, locinput, ln)
3000 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3001 sayNO;
d6a28714
JH
3002 locinput += ln;
3003 nextchr = UCHARAT(locinput);
a0d0e21e 3004 break;
95b24440 3005 }
d6a28714 3006 case ANYOF:
ffc61ed2 3007 if (do_utf8) {
9e55ce06
JH
3008 STRLEN inclasslen = PL_regeol - locinput;
3009
ba7b4546 3010 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3011 sayNO_ANYOF;
ffc61ed2
JH
3012 if (locinput >= PL_regeol)
3013 sayNO;
0f0076b4 3014 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3015 nextchr = UCHARAT(locinput);
e0f9d4a8 3016 break;
ffc61ed2
JH
3017 }
3018 else {
3019 if (nextchr < 0)
3020 nextchr = UCHARAT(locinput);
7d3e948e 3021 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 3022 sayNO_ANYOF;
ffc61ed2
JH
3023 if (!nextchr && locinput >= PL_regeol)
3024 sayNO;
3025 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3026 break;
3027 }
3028 no_anyof:
3029 /* If we might have the case of the German sharp s
3030 * in a casefolding Unicode character class. */
3031
ebc501f0
JH
3032 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3033 locinput += SHARP_S_SKIP;
e0f9d4a8 3034 nextchr = UCHARAT(locinput);
ffc61ed2 3035 }
e0f9d4a8
JH
3036 else
3037 sayNO;
b8c5462f 3038 break;
d6a28714 3039 case ALNUML:
b8c5462f
JH
3040 PL_reg_flags |= RF_tainted;
3041 /* FALL THROUGH */
d6a28714 3042 case ALNUM:
b8c5462f 3043 if (!nextchr)
4633a7c4 3044 sayNO;
ffc61ed2 3045 if (do_utf8) {
1a4fad37 3046 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3047 if (!(OP(scan) == ALNUM
3568d838 3048 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3049 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3050 {
3051 sayNO;
a0ed51b3 3052 }
b8c5462f 3053 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3054 nextchr = UCHARAT(locinput);
3055 break;
3056 }
ffc61ed2 3057 if (!(OP(scan) == ALNUM
d6a28714 3058 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3059 sayNO;
b8c5462f 3060 nextchr = UCHARAT(++locinput);
a0d0e21e 3061 break;
d6a28714 3062 case NALNUML:
b8c5462f
JH
3063 PL_reg_flags |= RF_tainted;
3064 /* FALL THROUGH */
d6a28714
JH
3065 case NALNUM:
3066 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3067 sayNO;
ffc61ed2 3068 if (do_utf8) {
1a4fad37 3069 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3070 if (OP(scan) == NALNUM
3568d838 3071 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3072 : isALNUM_LC_utf8((U8*)locinput))
3073 {
b8c5462f 3074 sayNO;
d6a28714 3075 }
b8c5462f
JH
3076 locinput += PL_utf8skip[nextchr];
3077 nextchr = UCHARAT(locinput);
3078 break;
3079 }
ffc61ed2 3080 if (OP(scan) == NALNUM
d6a28714 3081 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3082 sayNO;
76e3520e 3083 nextchr = UCHARAT(++locinput);
a0d0e21e 3084 break;
d6a28714
JH
3085 case BOUNDL:
3086 case NBOUNDL:
3280af22 3087 PL_reg_flags |= RF_tainted;
bbce6d69 3088 /* FALL THROUGH */
d6a28714
JH
3089 case BOUND:
3090 case NBOUND:
3091 /* was last char in word? */
ffc61ed2 3092 if (do_utf8) {
12d33761
HS
3093 if (locinput == PL_bostr)
3094 ln = '\n';
ffc61ed2 3095 else {
a3b680e6 3096 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3097
b4f7163a 3098 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
3099 }
3100 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3101 ln = isALNUM_uni(ln);
1a4fad37 3102 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3103 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3104 }
3105 else {
9041c2e3 3106 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
3107 n = isALNUM_LC_utf8((U8*)locinput);
3108 }
a0ed51b3 3109 }
d6a28714 3110 else {
12d33761
HS
3111 ln = (locinput != PL_bostr) ?
3112 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
3113 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3114 ln = isALNUM(ln);
3115 n = isALNUM(nextchr);
3116 }
3117 else {
3118 ln = isALNUM_LC(ln);
3119 n = isALNUM_LC(nextchr);
3120 }
d6a28714 3121 }
ffc61ed2
JH
3122 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3123 OP(scan) == BOUNDL))
3124 sayNO;
a0ed51b3 3125 break;
d6a28714 3126 case SPACEL:
3280af22 3127 PL_reg_flags |= RF_tainted;
bbce6d69 3128 /* FALL THROUGH */
d6a28714 3129 case SPACE:
9442cb0e 3130 if (!nextchr)
4633a7c4 3131 sayNO;
1aa99e6b 3132 if (do_utf8) {
fd400ab9 3133 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3134 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3135 if (!(OP(scan) == SPACE
3568d838 3136 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3137 : isSPACE_LC_utf8((U8*)locinput)))
3138 {
3139 sayNO;
3140 }
3141 locinput += PL_utf8skip[nextchr];
3142 nextchr = UCHARAT(locinput);
3143 break;
d6a28714 3144 }
ffc61ed2
JH
3145 if (!(OP(scan) == SPACE
3146 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3147 sayNO;
3148 nextchr = UCHARAT(++locinput);
3149 }
3150 else {
3151 if (!(OP(scan) == SPACE
3152 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3153 sayNO;
3154 nextchr = UCHARAT(++locinput);
a0ed51b3 3155 }
a0ed51b3 3156 break;
d6a28714 3157 case NSPACEL:
3280af22 3158 PL_reg_flags |= RF_tainted;
bbce6d69 3159 /* FALL THROUGH */
d6a28714 3160 case NSPACE:
9442cb0e 3161 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3162 sayNO;
1aa99e6b 3163 if (do_utf8) {
1a4fad37 3164 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3165 if (OP(scan) == NSPACE
3568d838 3166 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3167 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3168 {
3169 sayNO;
3170 }
3171 locinput += PL_utf8skip[nextchr];
3172 nextchr = UCHARAT(locinput);
3173 break;
a0ed51b3 3174 }
ffc61ed2 3175 if (OP(scan) == NSPACE
d6a28714 3176 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3177 sayNO;
76e3520e 3178 nextchr = UCHARAT(++locinput);
a0d0e21e 3179 break;
d6a28714 3180 case DIGITL:
a0ed51b3
LW
3181 PL_reg_flags |= RF_tainted;
3182 /* FALL THROUGH */
d6a28714 3183 case DIGIT:
9442cb0e 3184 if (!nextchr)
a0ed51b3 3185 sayNO;
1aa99e6b 3186 if (do_utf8) {
1a4fad37 3187 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3188 if (!(OP(scan) == DIGIT
3568d838 3189 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3190 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3191 {
a0ed51b3 3192 sayNO;
dfe13c55 3193 }
6f06b55f 3194 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3195 nextchr = UCHARAT(locinput);
3196 break;
3197 }
ffc61ed2 3198 if (!(OP(scan) == DIGIT
9442cb0e 3199 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3200 sayNO;
3201 nextchr = UCHARAT(++locinput);
3202 break;
d6a28714 3203 case NDIGITL:
b8c5462f
JH
3204 PL_reg_flags |= RF_tainted;
3205 /* FALL THROUGH */
d6a28714 3206 case NDIGIT:
9442cb0e 3207 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3208 sayNO;
1aa99e6b 3209 if (do_utf8) {
1a4fad37 3210 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3211 if (OP(scan) == NDIGIT
3568d838 3212 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3213 : isDIGIT_LC_utf8((U8*)locinput))
3214 {
a0ed51b3 3215 sayNO;
9442cb0e 3216 }
6f06b55f 3217 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3218 nextchr = UCHARAT(locinput);
3219 break;
3220 }
ffc61ed2 3221 if (OP(scan) == NDIGIT
9442cb0e 3222 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3223 sayNO;
3224 nextchr = UCHARAT(++locinput);
3225 break;
3226 case CLUMP:
b7c83a7e 3227 if (locinput >= PL_regeol)
a0ed51b3 3228 sayNO;
b7c83a7e 3229 if (do_utf8) {
1a4fad37 3230 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3231 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3232 sayNO;
3233 locinput += PL_utf8skip[nextchr];
3234 while (locinput < PL_regeol &&
3235 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3236 locinput += UTF8SKIP(locinput);
3237 if (locinput > PL_regeol)
3238 sayNO;
eb08e2da
JH
3239 }
3240 else
3241 locinput++;
a0ed51b3
LW
3242 nextchr = UCHARAT(locinput);
3243 break;
c8756f30 3244 case REFFL:
3280af22 3245 PL_reg_flags |= RF_tainted;
c8756f30 3246 /* FALL THROUGH */
c277df42 3247 case REF:
95b24440
DM
3248 case REFF: {
3249 char *s;
c277df42 3250 n = ARG(scan); /* which paren pair */
cf93c79d 3251 ln = PL_regstartp[n];
2c2d71f5 3252 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3253 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3254 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3255 if (ln == PL_regendp[n])
a0d0e21e 3256 break;
a0ed51b3 3257
cf93c79d 3258 s = PL_bostr + ln;
1aa99e6b 3259 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3260 char *l = locinput;
a3b680e6 3261 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3262 /*
3263 * Note that we can't do the "other character" lookup trick as
3264 * in the 8-bit case (no pun intended) because in Unicode we
3265 * have to map both upper and title case to lower case.
3266 */
3267 if (OP(scan) == REFF) {
3268 while (s < e) {
a3b680e6
AL
3269 STRLEN ulen1, ulen2;
3270 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3271 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3272
a0ed51b3
LW
3273 if (l >= PL_regeol)
3274 sayNO;
a2a2844f
JH
3275 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3276 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3277 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3278 sayNO;
a2a2844f
JH
3279 s += ulen1;
3280 l += ulen2;
a0ed51b3
LW
3281 }
3282 }
3283 locinput = l;
3284 nextchr = UCHARAT(locinput);
3285 break;
3286 }
3287
a0d0e21e 3288 /* Inline the first character, for speed. */
76e3520e 3289 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3290 (OP(scan) == REF ||
3291 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3292 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3293 sayNO;
cf93c79d 3294 ln = PL_regendp[n] - ln;
3280af22 3295 if (locinput + ln > PL_regeol)
4633a7c4 3296 sayNO;
c8756f30
AK
3297 if (ln > 1 && (OP(scan) == REF
3298 ? memNE(s, locinput, ln)
3299 : (OP(scan) == REFF
3300 ? ibcmp(s, locinput, ln)
3301 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3302 sayNO;
a0d0e21e 3303 locinput += ln;
76e3520e 3304 nextchr = UCHARAT(locinput);
a0d0e21e 3305 break;
95b24440 3306 }
a0d0e21e
LW
3307
3308 case NOTHING:
c277df42 3309 case TAIL:
a0d0e21e
LW
3310 break;
3311 case BACK:
3312 break;
c277df42
IZ
3313 case EVAL:
3314 {
3315 dSP;
6136c704
AL
3316 OP_4tree * const oop = PL_op;
3317 COP * const ocurcop = PL_curcop;
f3548bdc 3318 PAD *old_comppad;
c277df42 3319 SV *ret;
6136c704 3320 struct regexp * const oreg = PL_reg_re;
9041c2e3 3321
c277df42 3322 n = ARG(scan);
533c011a 3323 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3324 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3325 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3326 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3327
8e5e9ebe 3328 {
6136c704 3329 SV ** const before = SP;
8e5e9ebe
RGS
3330 CALLRUNOPS(aTHX); /* Scalar context. */
3331 SPAGAIN;
3332 if (SP == before)
075aa684 3333 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3334 else {
3335 ret = POPs;
3336 PUTBACK;
3337 }
3338 }
3339
0f5d15d6 3340 PL_op = oop;
f3548bdc 3341 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3342 PL_curcop = ocurcop;
c277df42 3343 if (logical) {
0f5d15d6
IZ
3344 if (logical == 2) { /* Postponed subexpression. */
3345 regexp *re;
6136c704 3346 MAGIC *mg = NULL;
0f5d15d6 3347 re_cc_state state;
cb50f42d 3348 int toggleutf;
faf82a0b 3349 register SV *sv;
0f5d15d6 3350
faf82a0b
AE
3351 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3352 mg = mg_find(sv, PERL_MAGIC_qr);
3353 else if (SvSMAGICAL(ret)) {
3354 if (SvGMAGICAL(ret))
3355 sv_unmagic(ret, PERL_MAGIC_qr);
3356 else
3357 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3358 }
faf82a0b 3359
0f5d15d6
IZ
3360 if (mg) {
3361 re = (regexp *)mg->mg_obj;
df0003d4 3362 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3363 }
3364 else {
3365 STRLEN len;
6136c704 3366 const char * const t = SvPV_const(ret, len);
0f5d15d6 3367 PMOP pm;
a3b680e6
AL
3368 char * const oprecomp = PL_regprecomp;
3369 const I32 osize = PL_regsize;
3370 const I32 onpar = PL_regnpar;
0f5d15d6 3371
5fcd1c1b 3372 Zero(&pm, 1, PMOP);
cb50f42d 3373 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3374 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3375 if (!(SvFLAGS(ret)
faf82a0b
AE
3376 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3377 | SVs_GMG)))
14befaf4
DM
3378 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3379 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3380 PL_regprecomp = oprecomp;
3381 PL_regsize = osize;
3382 PL_regnpar = onpar;
3383 }
a3621e74 3384 DEBUG_EXECUTE_r(
9041c2e3 3385 PerlIO_printf(Perl_debug_log,
a0288114 3386 "Entering embedded \"%s%.60s%s%s\"\n",
0f5d15d6
IZ
3387 PL_colors[0],
3388 re->precomp,
3389 PL_colors[1],
3390 (strlen(re->precomp) > 60 ? "..." : ""))
3391 );
3392 state.node = next;
3393 state.prev = PL_reg_call_cc;
95b24440 3394 state.cc = cc;
0f5d15d6
IZ
3395 state.re = PL_reg_re;
3396
95b24440 3397 cc = 0;
9041c2e3 3398
0f5d15d6 3399 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3400 REGCP_SET(lastcp);
0f5d15d6
IZ
3401 cache_re(re);
3402 state.ss = PL_savestack_ix;
3403 *PL_reglastparen = 0;
a01268b5 3404 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3405 PL_reg_call_cc = &state;
3406 PL_reginput = locinput;
cb50f42d
YST
3407 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3408 ((re->reganch & ROPT_UTF8) != 0);
3409 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3410
3411 /* XXXX This is too dramatic a measure... */
3412 PL_reg_maxiter = 0;
3413
95b24440 3414 /* XXX the only recursion left in regmatch() */
0f5d15d6 3415 if (regmatch(re->program + 1)) {
2c914db6
IZ
3416 /* Even though we succeeded, we need to restore
3417 global variables, since we may be wrapped inside
3418 SUSPEND, thus the match may be not finished yet. */
3419
3420 /* XXXX Do this only if SUSPENDed? */
3421 PL_reg_call_cc = state.prev;
95b24440 3422 cc = state.cc;
2c914db6
IZ
3423 PL_reg_re = state.re;
3424 cache_re(PL_reg_re);
cb50f42d 3425 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3426
3427 /* XXXX This is too dramatic a measure... */
3428 PL_reg_maxiter = 0;
3429
3430 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3431 ReREFCNT_dec(re);
3432 regcpblow(cp);
3433 sayYES;
3434 }
0f5d15d6 3435 ReREFCNT_dec(re);
02db2b7b 3436 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3437 regcppop();
3438 PL_reg_call_cc = state.prev;
95b24440 3439 cc = state.cc;
0f5d15d6 3440 PL_reg_re = state.re;
d3790889 3441 cache_re(PL_reg_re);
cb50f42d 3442 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3443
3444 /* XXXX This is too dramatic a measure... */
3445 PL_reg_maxiter = 0;
3446
8e514ae6 3447 logical = 0;
0f5d15d6
IZ
3448 sayNO;
3449 }
c277df42 3450 sw = SvTRUE(ret);
0f5d15d6 3451 logical = 0;
a0ed51b3 3452 }
080c2dec 3453 else {
3280af22 3454 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3455 cache_re(oreg);
3456 }
c277df42
IZ
3457 break;
3458 }
a0d0e21e 3459 case OPEN:
c277df42 3460 n = ARG(scan); /* which paren pair */
3280af22
NIS
3461 PL_reg_start_tmp[n] = locinput;
3462 if (n > PL_regsize)
3463 PL_regsize = n;
a0d0e21e
LW
3464 break;
3465 case CLOSE:
c277df42 3466 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3467 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3468 PL_regendp[n] = locinput - PL_bostr;
eb160463 3469 if (n > (I32)*PL_reglastparen)
3280af22 3470 *PL_reglastparen = n;
a01268b5 3471 *PL_reglastcloseparen = n;
a0d0e21e 3472 break;
c277df42
IZ
3473 case GROUPP:
3474 n = ARG(scan); /* which paren pair */
eb160463 3475 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3476 break;
3477 case IFTHEN:
2c2d71f5 3478 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3479 if (sw)
3480 next = NEXTOPER(NEXTOPER(scan));
3481 else {
3482 next = scan + ARG(scan);
3483 if (OP(next) == IFTHEN) /* Fake one. */
3484 next = NEXTOPER(NEXTOPER(next));
3485 }
3486 break;
3487 case LOGICAL:
0f5d15d6 3488 logical = scan->flags;
c277df42 3489 break;
2ab05381 3490/*******************************************************************
95b24440 3491 cc contains infoblock about the innermost (...)* loop, and
2ab05381
IZ
3492 a pointer to the next outer infoblock.
3493
3494 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3495
95b24440 3496 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3497
95b24440
DM
3498 2) This regnode mallocs an infoblock, and calls regmatch() recursively
3499 with the starting point at WHILEM node;
2ab05381
IZ
3500
3501 3) Each hit of WHILEM node tries to match A and Z (in the order
3502 depending on the current iteration, min/max of {min,max} and
3503 greediness). The information about where are nodes for "A"
3504 and "Z" is read from the infoblock, as is info on how many times "A"
3505 was already matched, and greediness.