This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #34976] substr uses utf8 length cache incorrectly
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
a687059c
LW
86 *
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
90 */
91#include "EXTERN.h"
864dbfa3 92#define PERL_IN_REGEXEC_C
a687059c 93#include "perl.h"
0f5d15d6 94
a687059c
LW
95#include "regcomp.h"
96
c277df42
IZ
97#define RF_tainted 1 /* tainted information used? */
98#define RF_warned 2 /* warned about big count? */
ce862d02 99#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
100#define RF_utf8 8 /* String contains multibyte chars? */
101
eb160463 102#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
103
104#define RS_init 1 /* eval environment created */
105#define RS_set 2 /* replsv value is set */
c277df42 106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
ba7b4546 111#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 112
c277df42
IZ
113/*
114 * Forwards.
115 */
116
33b8afdf 117#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 118#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 119
dfe13c55
GS
120#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
122#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
124#define HOPc(pos,off) ((char*)HOP(pos,off))
125#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 126
efb30f32 127#define HOPBACK(pos, off) ( \
e54858b0 128 (PL_reg_match_utf8) \
efb30f32
HS
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
131 ? (U8*)(pos - off) \
132 : (U8*)NULL \
133)
134#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135
1aa99e6b
IH
136#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
138#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
140#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142
bfed75c6 143#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((const U8*)b); LEAVE; } } STMT_END
51371543 144
5f80c4cf 145/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
146#define JUMPABLE(rn) ( \
147 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
148 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
149 OP(rn) == PLUS || OP(rn) == MINMOD || \
150 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
151)
152
cca55fe3
JP
153#define HAS_TEXT(rn) ( \
154 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
155)
e2d8ce26 156
a84d97b6
HS
157/*
158 Search for mandatory following text node; for lookahead, the text must
159 follow but for lookbehind (rn->flags != 0) we skip to the next step.
160*/
cca55fe3 161#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 162 while (JUMPABLE(rn)) \
a84d97b6 163 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 164 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
165 else if (OP(rn) == PLUS) \
166 rn = NEXTOPER(rn); \
a84d97b6
HS
167 else if (OP(rn) == IFMATCH) \
168 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 169 else rn += NEXT_OFF(rn); \
5f80c4cf 170} STMT_END
74750237 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
3280af22 177 int retval = PL_savestack_ix;
b1ce53c5
JH
178#define REGCP_PAREN_ELEMS 4
179 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
180 int p;
181
e49a9654
IH
182 if (paren_elems_to_push < 0)
183 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
184
a01268b5 185#define REGCP_OTHER_ELEMS 6
4b3c1a47 186 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 187 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 188/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
189 SSPUSHINT(PL_regendp[p]);
190 SSPUSHINT(PL_regstartp[p]);
3280af22 191 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
192 SSPUSHINT(p);
193 }
b1ce53c5 194/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
195 SSPUSHINT(PL_regsize);
196 SSPUSHINT(*PL_reglastparen);
a01268b5 197 SSPUSHINT(*PL_reglastcloseparen);
3280af22 198 SSPUSHPTR(PL_reginput);
41123dfd
JH
199#define REGCP_FRAME_ELEMS 2
200/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
201 * are needed for the regexp context stack bookkeeping. */
202 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 203 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 204
a0d0e21e
LW
205 return retval;
206}
207
c277df42 208/* These are needed since we do not localize EVAL nodes: */
a3621e74 209# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 210 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 211 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 212
a3621e74 213# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 214 PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 216 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 217
76e3520e 218STATIC char *
cea2e8a9 219S_regcppop(pTHX)
a0d0e21e 220{
b1ce53c5 221 I32 i;
a0d0e21e
LW
222 U32 paren = 0;
223 char *input;
cf93c79d 224 I32 tmps;
b1ce53c5 225
a3621e74
YO
226 GET_RE_DEBUG_FLAGS_DECL;
227
b1ce53c5 228 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 229 i = SSPOPINT;
b1ce53c5
JH
230 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
231 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 232 input = (char *) SSPOPPTR;
a01268b5 233 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
234 *PL_reglastparen = SSPOPINT;
235 PL_regsize = SSPOPINT;
b1ce53c5
JH
236
237 /* Now restore the parentheses context. */
41123dfd
JH
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 240 paren = (U32)SSPOPINT;
3280af22 241 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
242 PL_regstartp[paren] = SSPOPINT;
243 tmps = SSPOPINT;
3280af22
NIS
244 if (paren <= *PL_reglastparen)
245 PL_regendp[paren] = tmps;
a3621e74 246 DEBUG_EXECUTE_r(
c3464db5 247 PerlIO_printf(Perl_debug_log,
b900a521 248 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 249 (UV)paren, (IV)PL_regstartp[paren],
b900a521 250 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 251 (IV)PL_regendp[paren],
3280af22 252 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 253 );
a0d0e21e 254 }
a3621e74 255 DEBUG_EXECUTE_r(
eb160463 256 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 257 PerlIO_printf(Perl_debug_log,
faccc32b
JH
258 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
259 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
260 }
261 );
daf18116 262#if 1
dafc8851
JH
263 /* It would seem that the similar code in regtry()
264 * already takes care of this, and in fact it is in
265 * a better location to since this code can #if 0-ed out
266 * but the code in regtry() is needed or otherwise tests
267 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
268 * (as of patchlevel 7877) will fail. Then again,
269 * this code seems to be necessary or otherwise
270 * building DynaLoader will fail:
271 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
272 * --jhi */
eb160463
GS
273 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
274 if ((I32)paren > PL_regsize)
cf93c79d
IZ
275 PL_regstartp[paren] = -1;
276 PL_regendp[paren] = -1;
a0d0e21e 277 }
dafc8851 278#endif
a0d0e21e
LW
279 return input;
280}
281
0f5d15d6 282STATIC char *
cea2e8a9 283S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
284{
285 I32 tmp = PL_savestack_ix;
286
287 PL_savestack_ix = ss;
288 regcppop();
289 PL_savestack_ix = tmp;
942e002e 290 return Nullch;
0f5d15d6
IZ
291}
292
293typedef struct re_cc_state
294{
295 I32 ss;
296 regnode *node;
297 struct re_cc_state *prev;
298 CURCUR *cc;
299 regexp *re;
300} re_cc_state;
301
02db2b7b 302#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 303
29d1e993
HS
304#define TRYPAREN(paren, n, input) { \
305 if (paren) { \
306 if (n) { \
307 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
308 PL_regendp[paren] = input - PL_bostr; \
309 } \
310 else \
311 PL_regendp[paren] = -1; \
312 } \
313 if (regmatch(next)) \
314 sayYES; \
315 if (paren && n) \
316 PL_regendp[paren] = -1; \
317}
318
319
a687059c 320/*
e50aee73 321 * pregexec and friends
a687059c
LW
322 */
323
324/*
c277df42 325 - pregexec - match a regexp against a string
a687059c 326 */
c277df42 327I32
864dbfa3 328Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 329 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
330/* strend: pointer to null at end of string */
331/* strbeg: real beginning of string */
332/* minend: end of match must be >=minend after stringarg. */
333/* nosave: For optimizations. */
334{
335 return
9041c2e3 336 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
337 nosave ? 0 : REXEC_COPY_STR);
338}
0f5d15d6
IZ
339
340STATIC void
cea2e8a9 341S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
342{
343 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
344#ifdef DEBUGGING
345 PL_regprogram = prog->program;
346#endif
347 PL_regnpar = prog->nparens;
9041c2e3
NIS
348 PL_regdata = prog->data;
349 PL_reg_re = prog;
0f5d15d6 350}
22e551b9 351
9041c2e3 352/*
cad2e5aa
JH
353 * Need to implement the following flags for reg_anch:
354 *
355 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
356 * USE_INTUIT_ML
357 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
358 * INTUIT_AUTORITATIVE_ML
359 * INTUIT_ONCE_NOML - Intuit can match in one location only.
360 * INTUIT_ONCE_ML
361 *
362 * Another flag for this function: SECOND_TIME (so that float substrs
363 * with giant delta may be not rechecked).
364 */
365
366/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
367
2c2d71f5 368/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
369 Otherwise, only SvCUR(sv) is used to get strbeg. */
370
371/* XXXX We assume that strpos is strbeg unless sv. */
372
6eb5f6b9
JH
373/* XXXX Some places assume that there is a fixed substring.
374 An update may be needed if optimizer marks as "INTUITable"
375 RExen without fixed substrings. Similarly, it is assumed that
376 lengths of all the strings are no more than minlen, thus they
377 cannot come from lookahead.
378 (Or minlen should take into account lookahead.) */
379
2c2d71f5
JH
380/* A failure to find a constant substring means that there is no need to make
381 an expensive call to REx engine, thus we celebrate a failure. Similarly,
382 finding a substring too deep into the string means that less calls to
30944b6d
IZ
383 regtry() should be needed.
384
385 REx compiler's optimizer found 4 possible hints:
386 a) Anchored substring;
387 b) Fixed substring;
388 c) Whether we are anchored (beginning-of-line or \G);
389 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 390 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
391 string which does not contradict any of them.
392 */
2c2d71f5 393
6eb5f6b9
JH
394/* Most of decisions we do here should have been done at compile time.
395 The nodes of the REx which we used for the search should have been
396 deleted from the finite automaton. */
397
cad2e5aa
JH
398char *
399Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
400 char *strend, U32 flags, re_scream_pos_data *data)
401{
b7953727 402 register I32 start_shift = 0;
cad2e5aa 403 /* Should be nonnegative! */
b7953727 404 register I32 end_shift = 0;
2c2d71f5
JH
405 register char *s;
406 register SV *check;
a1933d95 407 char *strbeg;
cad2e5aa 408 char *t;
33b8afdf 409 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 410 I32 ml_anch;
6eb5f6b9 411 register char *other_last = Nullch; /* other substr checked before this */
b7953727 412 char *check_at = Nullch; /* check substr found at this pos */
7fba1cd6 413 I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
414#ifdef DEBUGGING
415 char *i_strpos = strpos;
ce333219 416 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 417#endif
a3621e74
YO
418
419 GET_RE_DEBUG_FLAGS_DECL;
420
a30b2f1f 421 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 422
b8d68ded 423 if (prog->reganch & ROPT_UTF8) {
a3621e74 424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
425 "UTF-8 regex...\n"));
426 PL_reg_flags |= RF_utf8;
427 }
428
a3621e74 429 DEBUG_EXECUTE_r({
b8d68ded 430 char *s = PL_reg_match_utf8 ?
c728cb41
JH
431 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
432 strpos;
b8d68ded
JH
433 int len = PL_reg_match_utf8 ?
434 strlen(s) : strend - strpos;
2a782b5b
JH
435 if (!PL_colorset)
436 reginitcolors();
b8d68ded 437 if (PL_reg_match_utf8)
a3621e74 438 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 439 "UTF-8 target...\n"));
2a782b5b
JH
440 PerlIO_printf(Perl_debug_log,
441 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
e4584336 442 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
443 prog->precomp,
444 PL_colors[1],
445 (strlen(prog->precomp) > 60 ? "..." : ""),
446 PL_colors[0],
447 (int)(len > 60 ? 60 : len),
448 s, PL_colors[1],
449 (len > 60 ? "..." : "")
450 );
451 });
cad2e5aa 452
c344f387
JH
453 /* CHR_DIST() would be more correct here but it makes things slow. */
454 if (prog->minlen > strend - strpos) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 456 "String too short... [re_intuit_start]\n"));
cad2e5aa 457 goto fail;
2c2d71f5 458 }
a1933d95 459 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 460 PL_regeol = strend;
33b8afdf
JH
461 if (do_utf8) {
462 if (!prog->check_utf8 && prog->check_substr)
463 to_utf8_substr(prog);
464 check = prog->check_utf8;
465 } else {
466 if (!prog->check_substr && prog->check_utf8)
467 to_byte_substr(prog);
468 check = prog->check_substr;
469 }
470 if (check == &PL_sv_undef) {
a3621e74 471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
472 "Non-utf string cannot match utf check string\n"));
473 goto fail;
474 }
2c2d71f5 475 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
476 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
477 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 478 && !multiline ) ); /* Check after \n? */
cad2e5aa 479
7e25d62c
JH
480 if (!ml_anch) {
481 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
482 | ROPT_IMPLICIT)) /* not a real BOL */
483 /* SvCUR is not set on references: SvRV and SvPVX overlap */
484 && sv && !SvROK(sv)
485 && (strpos != strbeg)) {
a3621e74 486 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
487 goto fail;
488 }
489 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 490 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 491 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
492 I32 slen;
493
1aa99e6b 494 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
495 if (SvTAIL(check)) {
496 slen = SvCUR(check); /* >= 1 */
cad2e5aa 497
9041c2e3 498 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 499 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 500 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 501 goto fail_finish;
cad2e5aa
JH
502 }
503 /* Now should match s[0..slen-2] */
504 slen--;
653099ff 505 if (slen && (*SvPVX(check) != *s
cad2e5aa 506 || (slen > 1
653099ff 507 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5 508 report_neq:
a3621e74 509 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
510 goto fail_finish;
511 }
cad2e5aa 512 }
653099ff
GS
513 else if (*SvPVX(check) != *s
514 || ((slen = SvCUR(check)) > 1
515 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
516 goto report_neq;
517 goto success_at_start;
7e25d62c 518 }
cad2e5aa 519 }
2c2d71f5 520 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 521 s = strpos;
2c2d71f5 522 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 525 if (!ml_anch) {
653099ff
GS
526 I32 end = prog->check_offset_max + CHR_SVLEN(check)
527 - (SvTAIL(check) != 0);
1aa99e6b 528 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
529
530 if (end_shift < eshift)
531 end_shift = eshift;
532 }
cad2e5aa 533 }
2c2d71f5 534 else { /* Can match at random position */
cad2e5aa
JH
535 ml_anch = 0;
536 s = strpos;
2c2d71f5
JH
537 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
538 /* Should be nonnegative! */
539 end_shift = prog->minlen - start_shift -
653099ff 540 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
541 }
542
2c2d71f5 543#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 544 if (end_shift < 0)
6bbae5e6 545 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
546#endif
547
2c2d71f5
JH
548 restart:
549 /* Find a possible match in the region s..strend by looking for
550 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 551 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
552 I32 p = -1; /* Internal iterator of scream. */
553 I32 *pp = data ? data->scream_pos : &p;
554
2c2d71f5
JH
555 if (PL_screamfirst[BmRARE(check)] >= 0
556 || ( BmRARE(check) == '\n'
557 && (BmPREVIOUS(check) == SvCUR(check) - 1)
558 && SvTAIL(check) ))
9041c2e3 559 s = screaminstr(sv, check,
2c2d71f5 560 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 561 else
2c2d71f5 562 goto fail_finish;
4addbd3b
HS
563 /* we may be pointing at the wrong string */
564 if (s && RX_MATCH_COPIED(prog))
7ef91622 565 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
566 if (data)
567 *data->scream_olds = s;
568 }
f33976b4 569 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
570 s = fbm_instr((U8*)(s + start_shift),
571 (U8*)(strend - end_shift),
7fba1cd6 572 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 573 else
1aa99e6b
IH
574 s = fbm_instr(HOP3(s, start_shift, strend),
575 HOP3(strend, -end_shift, strbeg),
7fba1cd6 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
577
578 /* Update the count-of-usability, remove useless subpatterns,
579 unshift s. */
2c2d71f5 580
a3621e74 581 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
2c2d71f5 582 (s ? "Found" : "Did not find"),
33b8afdf 583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 584 PL_colors[0],
7b0972df
JH
585 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
586 SvPVX(check),
2c2d71f5
JH
587 PL_colors[1], (SvTAIL(check) ? "$" : ""),
588 (s ? " at offset " : "...\n") ) );
589
590 if (!s)
591 goto fail_finish;
592
6eb5f6b9
JH
593 check_at = s;
594
2c2d71f5 595 /* Finish the diagnostic message */
a3621e74 596 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
597
598 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
599 Start with the other substr.
600 XXXX no SCREAM optimization yet - and a very coarse implementation
601 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
602 *always* match. Probably should be marked during compile...
603 Probably it is right to do no SCREAM here...
604 */
605
33b8afdf 606 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 607 /* Take into account the "other" substring. */
2c2d71f5
JH
608 /* XXXX May be hopelessly wrong for UTF... */
609 if (!other_last)
6eb5f6b9 610 other_last = strpos;
33b8afdf 611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
612 do_other_anchored:
613 {
1aa99e6b 614 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 615 char *s1 = s;
33b8afdf 616 SV* must;
2c2d71f5 617
2c2d71f5
JH
618 t = s - prog->check_offset_max;
619 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 620 && (!do_utf8
1aa99e6b 621 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 622 && t > strpos)))
30944b6d 623 /* EMPTY */;
2c2d71f5
JH
624 else
625 t = strpos;
1aa99e6b 626 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
627 if (t < other_last) /* These positions already checked */
628 t = other_last;
1aa99e6b 629 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
630 if (last < last1)
631 last1 = last;
632 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
633 /* On end-of-str: see comment below. */
33b8afdf
JH
634 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
635 if (must == &PL_sv_undef) {
636 s = (char*)NULL;
a3621e74 637 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
638 }
639 else
640 s = fbm_instr(
641 (unsigned char*)t,
642 HOP3(HOP3(last1, prog->anchored_offset, strend)
643 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
644 must,
7fba1cd6 645 multiline ? FBMrf_MULTILINE : 0
33b8afdf 646 );
a3621e74 647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1aa99e6b 648 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
649 (s ? "Found" : "Contradicts"),
650 PL_colors[0],
33b8afdf
JH
651 (int)(SvCUR(must)
652 - (SvTAIL(must)!=0)),
653 SvPVX(must),
654 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
655 if (!s) {
656 if (last1 >= last2) {
a3621e74 657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
658 ", giving up...\n"));
659 goto fail_finish;
660 }
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 662 ", trying floating at offset %ld...\n",
1aa99e6b
IH
663 (long)(HOP3c(s1, 1, strend) - i_strpos)));
664 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
665 s = HOP3c(last, 1, strend);
2c2d71f5
JH
666 goto restart;
667 }
668 else {
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 670 (long)(s - i_strpos)));
1aa99e6b
IH
671 t = HOP3c(s, -prog->anchored_offset, strbeg);
672 other_last = HOP3c(s, 1, strend);
30944b6d 673 s = s1;
2c2d71f5
JH
674 if (t == strpos)
675 goto try_at_start;
2c2d71f5
JH
676 goto try_at_offset;
677 }
30944b6d 678 }
2c2d71f5
JH
679 }
680 else { /* Take into account the floating substring. */
33b8afdf
JH
681 char *last, *last1;
682 char *s1 = s;
683 SV* must;
684
685 t = HOP3c(s, -start_shift, strbeg);
686 last1 = last =
687 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
688 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
689 last = HOP3c(t, prog->float_max_offset, strend);
690 s = HOP3c(t, prog->float_min_offset, strend);
691 if (s < other_last)
692 s = other_last;
2c2d71f5 693 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
694 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
695 /* fbm_instr() takes into account exact value of end-of-str
696 if the check is SvTAIL(ed). Since false positives are OK,
697 and end-of-str is not later than strend we are OK. */
698 if (must == &PL_sv_undef) {
699 s = (char*)NULL;
a3621e74 700 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
701 }
702 else
2c2d71f5 703 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
704 (unsigned char*)last + SvCUR(must)
705 - (SvTAIL(must)!=0),
7fba1cd6 706 must, multiline ? FBMrf_MULTILINE : 0);
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
33b8afdf
JH
708 (s ? "Found" : "Contradicts"),
709 PL_colors[0],
710 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
711 SvPVX(must),
712 PL_colors[1], (SvTAIL(must) ? "$" : "")));
713 if (!s) {
714 if (last1 == last) {
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
716 ", giving up...\n"));
717 goto fail_finish;
2c2d71f5 718 }
a3621e74 719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
720 ", trying anchored starting at offset %ld...\n",
721 (long)(s1 + 1 - i_strpos)));
722 other_last = last;
723 s = HOP3c(t, 1, strend);
724 goto restart;
725 }
726 else {
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
728 (long)(s - i_strpos)));
729 other_last = s; /* Fix this later. --Hugo */
730 s = s1;
731 if (t == strpos)
732 goto try_at_start;
733 goto try_at_offset;
734 }
2c2d71f5 735 }
cad2e5aa 736 }
2c2d71f5
JH
737
738 t = s - prog->check_offset_max;
2c2d71f5 739 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 740 && (!do_utf8
1aa99e6b
IH
741 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
742 && t > strpos))) {
2c2d71f5
JH
743 /* Fixed substring is found far enough so that the match
744 cannot start at strpos. */
745 try_at_offset:
cad2e5aa 746 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
747 /* Eventually fbm_*() should handle this, but often
748 anchored_offset is not 0, so this check will not be wasted. */
749 /* XXXX In the code below we prefer to look for "^" even in
750 presence of anchored substrings. And we search even
751 beyond the found float position. These pessimizations
752 are historical artefacts only. */
753 find_anchor:
2c2d71f5 754 while (t < strend - prog->minlen) {
cad2e5aa 755 if (*t == '\n') {
4ee3650e 756 if (t < check_at - prog->check_offset_min) {
33b8afdf 757 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
758 /* Since we moved from the found position,
759 we definitely contradict the found anchored
30944b6d
IZ
760 substr. Due to the above check we do not
761 contradict "check" substr.
762 Thus we can arrive here only if check substr
763 is float. Redo checking for "other"=="fixed".
764 */
9041c2e3 765 strpos = t + 1;
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 767 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
768 goto do_other_anchored;
769 }
4ee3650e
GS
770 /* We don't contradict the found floating substring. */
771 /* XXXX Why not check for STCLASS? */
cad2e5aa 772 s = t + 1;
a3621e74 773 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 774 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
775 goto set_useful;
776 }
4ee3650e
GS
777 /* Position contradicts check-string */
778 /* XXXX probably better to look for check-string
779 than for "\n", so one should lower the limit for t? */
a3621e74 780 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 781 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 782 other_last = strpos = s = t + 1;
cad2e5aa
JH
783 goto restart;
784 }
785 t++;
786 }
a3621e74 787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 788 PL_colors[0], PL_colors[1]));
2c2d71f5 789 goto fail_finish;
cad2e5aa 790 }
f5952150 791 else {
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 793 PL_colors[0], PL_colors[1]));
f5952150 794 }
cad2e5aa
JH
795 s = t;
796 set_useful:
33b8afdf 797 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
798 }
799 else {
f5952150 800 /* The found string does not prohibit matching at strpos,
2c2d71f5 801 - no optimization of calling REx engine can be performed,
f5952150
GS
802 unless it was an MBOL and we are not after MBOL,
803 or a future STCLASS check will fail this. */
2c2d71f5
JH
804 try_at_start:
805 /* Even in this situation we may use MBOL flag if strpos is offset
806 wrt the start of the string. */
05b4157f 807 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 808 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
809 /* May be due to an implicit anchor of m{.*foo} */
810 && !(prog->reganch & ROPT_IMPLICIT))
811 {
cad2e5aa
JH
812 t = strpos;
813 goto find_anchor;
814 }
a3621e74 815 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 816 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 817 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 818 );
2c2d71f5 819 success_at_start:
30944b6d 820 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
821 && (do_utf8 ? (
822 prog->check_utf8 /* Could be deleted already */
823 && --BmUSEFUL(prog->check_utf8) < 0
824 && (prog->check_utf8 == prog->float_utf8)
825 ) : (
826 prog->check_substr /* Could be deleted already */
827 && --BmUSEFUL(prog->check_substr) < 0
828 && (prog->check_substr == prog->float_substr)
829 )))
66e933ab 830 {
cad2e5aa 831 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
833 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
834 if (do_utf8 ? prog->check_substr : prog->check_utf8)
835 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
836 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
837 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 838 check = Nullsv; /* abort */
cad2e5aa 839 s = strpos;
3cf5c195
IZ
840 /* XXXX This is a remnant of the old implementation. It
841 looks wasteful, since now INTUIT can use many
6eb5f6b9 842 other heuristics. */
cad2e5aa
JH
843 prog->reganch &= ~RE_USE_INTUIT;
844 }
845 else
846 s = strpos;
847 }
848
6eb5f6b9
JH
849 /* Last resort... */
850 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
851 if (prog->regstclass) {
852 /* minlen == 0 is possible if regstclass is \b or \B,
853 and the fixed substr is ''$.
854 Since minlen is already taken into account, s+1 is before strend;
855 accidentally, minlen >= 1 guaranties no false positives at s + 1
856 even for \b or \B. But (minlen? 1 : 0) below assumes that
857 regstclass does not come from lookahead... */
858 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
859 This leaves EXACTF only, which is dealt with in find_byclass(). */
06b5626a
AL
860 const U8* str = (U8*)STRING(prog->regstclass);
861 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 862 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 863 : 1);
33b8afdf 864 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 865 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 866 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
867 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
868 cl_l, strend)
869 : strend);
6eb5f6b9
JH
870
871 t = s;
9926ca43 872 cache_re(prog);
06b5626a 873 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
874 if (!s) {
875#ifdef DEBUGGING
e1ec3a88 876 const char *what = 0;
6eb5f6b9
JH
877#endif
878 if (endpos == strend) {
a3621e74 879 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
880 "Could not match STCLASS...\n") );
881 goto fail;
882 }
a3621e74 883 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 884 "This position contradicts STCLASS...\n") );
653099ff
GS
885 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
886 goto fail;
6eb5f6b9 887 /* Contradict one of substrings */
33b8afdf
JH
888 if (prog->anchored_substr || prog->anchored_utf8) {
889 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 890 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 891 hop_and_restart:
1aa99e6b 892 s = HOP3c(t, 1, strend);
66e933ab
GS
893 if (s + start_shift + end_shift > strend) {
894 /* XXXX Should be taken into account earlier? */
a3621e74 895 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
896 "Could not match STCLASS...\n") );
897 goto fail;
898 }
5e39e1e5
HS
899 if (!check)
900 goto giveup;
a3621e74 901 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 902 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
903 what, (long)(s + start_shift - i_strpos)) );
904 goto restart;
905 }
66e933ab 906 /* Have both, check_string is floating */
6eb5f6b9
JH
907 if (t + start_shift >= check_at) /* Contradicts floating=check */
908 goto retry_floating_check;
909 /* Recheck anchored substring, but not floating... */
9041c2e3 910 s = check_at;
5e39e1e5
HS
911 if (!check)
912 goto giveup;
a3621e74 913 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 914 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
915 (long)(other_last - i_strpos)) );
916 goto do_other_anchored;
917 }
60e71179
GS
918 /* Another way we could have checked stclass at the
919 current position only: */
920 if (ml_anch) {
921 s = t = t + 1;
5e39e1e5
HS
922 if (!check)
923 goto giveup;
a3621e74 924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 925 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 926 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 927 goto try_at_offset;
66e933ab 928 }
33b8afdf 929 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 930 goto fail;
6eb5f6b9
JH
931 /* Check is floating subtring. */
932 retry_floating_check:
933 t = check_at - start_shift;
a3621e74 934 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
935 goto hop_and_restart;
936 }
b7953727 937 if (t != s) {
a3621e74 938 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 939 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
940 (long)(t - i_strpos), (long)(s - i_strpos))
941 );
942 }
943 else {
a3621e74 944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
945 "Does not contradict STCLASS...\n");
946 );
947 }
6eb5f6b9 948 }
5e39e1e5 949 giveup:
a3621e74 950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
951 PL_colors[4], (check ? "Guessed" : "Giving up"),
952 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 953 return s;
2c2d71f5
JH
954
955 fail_finish: /* Substring not found */
33b8afdf
JH
956 if (prog->check_substr || prog->check_utf8) /* could be removed already */
957 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 958 fail:
a3621e74 959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 960 PL_colors[4], PL_colors[5]));
cad2e5aa
JH
961 return Nullch;
962}
9661b544 963
6eb5f6b9 964/* We know what class REx starts with. Try to find this position... */
3c3eec57 965STATIC char *
06b5626a 966S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, I32 norun)
a687059c 967{
6eb5f6b9
JH
968 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23
G
973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 977 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1036
1037 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1038 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1039
89ebb4a3 1040 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1041 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1042 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1043 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1044 lnc = 0;
1045 while (sm < ((U8 *) m + ln)) {
1046 lnc++;
1047 sm += UTF8SKIP(sm);
1048 }
1aa99e6b
IH
1049 }
1050 else {
1051 c1 = *(U8*)m;
1052 c2 = PL_fold[c1];
1053 }
6eb5f6b9
JH
1054 goto do_exactf;
1055 case EXACTFL:
5dab1207
NIS
1056 m = STRING(c);
1057 ln = STR_LEN(c);
1058 lnc = (I32) ln;
d8093b23 1059 c1 = *(U8*)m;
6eb5f6b9
JH
1060 c2 = PL_fold_locale[c1];
1061 do_exactf:
db12adc6 1062 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1063
6eb5f6b9
JH
1064 if (norun && e < s)
1065 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1066
60a8b682
JH
1067 /* The idea in the EXACTF* cases is to first find the
1068 * first character of the EXACTF* node and then, if
1069 * necessary, case-insensitively compare the full
1070 * text of the node. The c1 and c2 are the first
1071 * characters (though in Unicode it gets a bit
1072 * more complicated because there are more cases
7f16dd3d
JH
1073 * than just upper and lower: one needs to use
1074 * the so-called folding case for case-insensitive
1075 * matching (called "loose matching" in Unicode).
1076 * ibcmp_utf8() will do just that. */
60a8b682 1077
1aa99e6b 1078 if (do_utf8) {
575cac57 1079 UV c, f;
89ebb4a3
JH
1080 U8 tmpbuf [UTF8_MAXBYTES+1];
1081 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
575cac57 1082 STRLEN len, foldlen;
d7f013c8 1083
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1089 ckWARN(WARN_UTF8) ?
1090 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1091 if ( c == c1
1092 && (ln == len ||
66423254 1093 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1094 m, (char **)0, ln, (bool)UTF))
55da9344 1095 && (norun || regtry(prog, s)) )
1aa99e6b 1096 goto got_it;
80aecb99
JH
1097 else {
1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1116 ckWARN(WARN_UTF8) ?
1117 0 : UTF8_ALLOW_ANY);
80aecb99 1118
60a8b682 1119 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1120 * Note that not all the possible combinations
1121 * are handled here: some of them are handled
1122 * by the standard folding rules, and some of
1123 * them (the character class or ANYOF cases)
1124 * are handled during compiletime in
1125 * regexec.c:S_regclass(). */
880bd946
JH
1126 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1127 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1128 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1129
1130 if ( (c == c1 || c == c2)
1131 && (ln == len ||
66423254 1132 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1133 m, (char **)0, ln, (bool)UTF))
55da9344 1134 && (norun || regtry(prog, s)) )
1aa99e6b 1135 goto got_it;
80aecb99
JH
1136 else {
1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
b4f7163a 1184 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1185
b4f7163a 1186 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1187 }
1188 tmp = ((OP(c) == BOUND ?
9041c2e3 1189 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1190 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1191 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1192 if (tmp == !(OP(c) == BOUND ?
3568d838 1193 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1194 isALNUM_LC_utf8((U8*)s)))
1195 {
1196 tmp = !tmp;
1197 if ((norun || regtry(prog, s)))
1198 goto got_it;
1199 }
078c425b 1200 s += uskip;
a687059c 1201 }
a0d0e21e 1202 }
667bb95a 1203 else {
12d33761 1204 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1205 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1206 while (s < strend) {
1207 if (tmp ==
1208 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1209 tmp = !tmp;
1210 if ((norun || regtry(prog, s)))
1211 goto got_it;
1212 }
1213 s++;
a0ed51b3 1214 }
a0ed51b3 1215 }
6eb5f6b9 1216 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1217 goto got_it;
1218 break;
bbce6d69 1219 case NBOUNDL:
3280af22 1220 PL_reg_flags |= RF_tainted;
bbce6d69 1221 /* FALL THROUGH */
a0d0e21e 1222 case NBOUND:
ffc61ed2 1223 if (do_utf8) {
12d33761 1224 if (s == PL_bostr)
ffc61ed2
JH
1225 tmp = '\n';
1226 else {
b4f7163a 1227 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1228
b4f7163a 1229 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1230 }
1231 tmp = ((OP(c) == NBOUND ?
9041c2e3 1232 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1233 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1234 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1235 if (tmp == !(OP(c) == NBOUND ?
3568d838 1236 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1237 isALNUM_LC_utf8((U8*)s)))
1238 tmp = !tmp;
1239 else if ((norun || regtry(prog, s)))
1240 goto got_it;
078c425b 1241 s += uskip;
ffc61ed2 1242 }
a0d0e21e 1243 }
667bb95a 1244 else {
12d33761 1245 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1246 tmp = ((OP(c) == NBOUND ?
1247 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1248 while (s < strend) {
1249 if (tmp ==
1250 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1251 tmp = !tmp;
1252 else if ((norun || regtry(prog, s)))
1253 goto got_it;
1254 s++;
1255 }
a0ed51b3 1256 }
6eb5f6b9 1257 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1258 goto got_it;
1259 break;
a0d0e21e 1260 case ALNUM:
ffc61ed2 1261 if (do_utf8) {
8269fa76 1262 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1263 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1264 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1265 if (tmp && (norun || regtry(prog, s)))
1266 goto got_it;
1267 else
1268 tmp = doevery;
1269 }
bbce6d69 1270 else
ffc61ed2 1271 tmp = 1;
078c425b 1272 s += uskip;
bbce6d69 1273 }
bbce6d69 1274 }
ffc61ed2
JH
1275 else {
1276 while (s < strend) {
1277 if (isALNUM(*s)) {
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
a0ed51b3 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s++;
a0ed51b3 1286 }
a0ed51b3
LW
1287 }
1288 break;
bbce6d69 1289 case ALNUML:
3280af22 1290 PL_reg_flags |= RF_tainted;
ffc61ed2 1291 if (do_utf8) {
078c425b 1292 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1293 if (isALNUM_LC_utf8((U8*)s)) {
1294 if (tmp && (norun || regtry(prog, s)))
1295 goto got_it;
1296 else
1297 tmp = doevery;
1298 }
a687059c 1299 else
ffc61ed2 1300 tmp = 1;
078c425b 1301 s += uskip;
a0d0e21e 1302 }
a0d0e21e 1303 }
ffc61ed2
JH
1304 else {
1305 while (s < strend) {
1306 if (isALNUM_LC(*s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a0ed51b3 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s++;
a0ed51b3 1315 }
a0ed51b3
LW
1316 }
1317 break;
a0d0e21e 1318 case NALNUM:
ffc61ed2 1319 if (do_utf8) {
8269fa76 1320 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1321 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1322 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1323 if (tmp && (norun || regtry(prog, s)))
1324 goto got_it;
1325 else
1326 tmp = doevery;
1327 }
bbce6d69 1328 else
ffc61ed2 1329 tmp = 1;
078c425b 1330 s += uskip;
bbce6d69 1331 }
bbce6d69 1332 }
ffc61ed2
JH
1333 else {
1334 while (s < strend) {
1335 if (!isALNUM(*s)) {
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0ed51b3 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s++;
a0ed51b3 1344 }
a0ed51b3
LW
1345 }
1346 break;
bbce6d69 1347 case NALNUML:
3280af22 1348 PL_reg_flags |= RF_tainted;
ffc61ed2 1349 if (do_utf8) {
078c425b 1350 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1351 if (!isALNUM_LC_utf8((U8*)s)) {
1352 if (tmp && (norun || regtry(prog, s)))
1353 goto got_it;
1354 else
1355 tmp = doevery;
1356 }
a687059c 1357 else
ffc61ed2 1358 tmp = 1;
078c425b 1359 s += uskip;
a687059c 1360 }
a0d0e21e 1361 }
ffc61ed2
JH
1362 else {
1363 while (s < strend) {
1364 if (!isALNUM_LC(*s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
a0ed51b3 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s++;
a0ed51b3 1373 }
a0ed51b3
LW
1374 }
1375 break;
a0d0e21e 1376 case SPACE:
ffc61ed2 1377 if (do_utf8) {
8269fa76 1378 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1379 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1380 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1381 if (tmp && (norun || regtry(prog, s)))
1382 goto got_it;
1383 else
1384 tmp = doevery;
1385 }
a0d0e21e 1386 else
ffc61ed2 1387 tmp = 1;
078c425b 1388 s += uskip;
2304df62 1389 }
a0d0e21e 1390 }
ffc61ed2
JH
1391 else {
1392 while (s < strend) {
1393 if (isSPACE(*s)) {
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0ed51b3 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s++;
a0ed51b3 1402 }
a0ed51b3
LW
1403 }
1404 break;
bbce6d69 1405 case SPACEL:
3280af22 1406 PL_reg_flags |= RF_tainted;
ffc61ed2 1407 if (do_utf8) {
078c425b 1408 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1409 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1410 if (tmp && (norun || regtry(prog, s)))
1411 goto got_it;
1412 else
1413 tmp = doevery;
1414 }
bbce6d69 1415 else
ffc61ed2 1416 tmp = 1;
078c425b 1417 s += uskip;
bbce6d69 1418 }
bbce6d69 1419 }
ffc61ed2
JH
1420 else {
1421 while (s < strend) {
1422 if (isSPACE_LC(*s)) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
a0ed51b3 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s++;
a0ed51b3 1431 }
a0ed51b3
LW
1432 }
1433 break;
a0d0e21e 1434 case NSPACE:
ffc61ed2 1435 if (do_utf8) {
8269fa76 1436 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1437 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1438 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1439 if (tmp && (norun || regtry(prog, s)))
1440 goto got_it;
1441 else
1442 tmp = doevery;
1443 }
a0d0e21e 1444 else
ffc61ed2 1445 tmp = 1;
078c425b 1446 s += uskip;
a687059c 1447 }
a0d0e21e 1448 }
ffc61ed2
JH
1449 else {
1450 while (s < strend) {
1451 if (!isSPACE(*s)) {
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0ed51b3 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s++;
a0ed51b3 1460 }
a0ed51b3
LW
1461 }
1462 break;
bbce6d69 1463 case NSPACEL:
3280af22 1464 PL_reg_flags |= RF_tainted;
ffc61ed2 1465 if (do_utf8) {
078c425b 1466 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1467 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1468 if (tmp && (norun || regtry(prog, s)))
1469 goto got_it;
1470 else
1471 tmp = doevery;
1472 }
bbce6d69 1473 else
ffc61ed2 1474 tmp = 1;
078c425b 1475 s += uskip;
bbce6d69 1476 }
bbce6d69 1477 }
ffc61ed2
JH
1478 else {
1479 while (s < strend) {
1480 if (!isSPACE_LC(*s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
a0ed51b3 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s++;
a0ed51b3 1489 }
a0ed51b3
LW
1490 }
1491 break;
a0d0e21e 1492 case DIGIT:
ffc61ed2 1493 if (do_utf8) {
8269fa76 1494 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1495 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1496 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1497 if (tmp && (norun || regtry(prog, s)))
1498 goto got_it;
1499 else
1500 tmp = doevery;
1501 }
a0d0e21e 1502 else
ffc61ed2 1503 tmp = 1;
078c425b 1504 s += uskip;
2b69d0c2 1505 }
a0d0e21e 1506 }
ffc61ed2
JH
1507 else {
1508 while (s < strend) {
1509 if (isDIGIT(*s)) {
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0ed51b3 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s++;
a0ed51b3 1518 }
a0ed51b3
LW
1519 }
1520 break;
b8c5462f
JH
1521 case DIGITL:
1522 PL_reg_flags |= RF_tainted;
ffc61ed2 1523 if (do_utf8) {
078c425b 1524 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1525 if (isDIGIT_LC_utf8((U8*)s)) {
1526 if (tmp && (norun || regtry(prog, s)))
1527 goto got_it;
1528 else
1529 tmp = doevery;
1530 }
b8c5462f 1531 else
ffc61ed2 1532 tmp = 1;
078c425b 1533 s += uskip;
b8c5462f 1534 }
b8c5462f 1535 }
ffc61ed2
JH
1536 else {
1537 while (s < strend) {
1538 if (isDIGIT_LC(*s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s++;
b8c5462f 1547 }
b8c5462f
JH
1548 }
1549 break;
a0d0e21e 1550 case NDIGIT:
ffc61ed2 1551 if (do_utf8) {
8269fa76 1552 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1553 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1554 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1555 if (tmp && (norun || regtry(prog, s)))
1556 goto got_it;
1557 else
1558 tmp = doevery;
1559 }
a0d0e21e 1560 else
ffc61ed2 1561 tmp = 1;
078c425b 1562 s += uskip;
a687059c 1563 }
a0d0e21e 1564 }
ffc61ed2
JH
1565 else {
1566 while (s < strend) {
1567 if (!isDIGIT(*s)) {
1568 if (tmp && (norun || regtry(prog, s)))
1569 goto got_it;
1570 else
1571 tmp = doevery;
1572 }
a0ed51b3 1573 else
ffc61ed2
JH
1574 tmp = 1;
1575 s++;
a0ed51b3 1576 }
a0ed51b3
LW
1577 }
1578 break;
b8c5462f
JH
1579 case NDIGITL:
1580 PL_reg_flags |= RF_tainted;
ffc61ed2 1581 if (do_utf8) {
078c425b 1582 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1583 if (!isDIGIT_LC_utf8((U8*)s)) {
1584 if (tmp && (norun || regtry(prog, s)))
1585 goto got_it;
1586 else
1587 tmp = doevery;
1588 }
b8c5462f 1589 else
ffc61ed2 1590 tmp = 1;
078c425b 1591 s += uskip;
b8c5462f 1592 }
a0ed51b3 1593 }
ffc61ed2
JH
1594 else {
1595 while (s < strend) {
1596 if (!isDIGIT_LC(*s)) {
1597 if (tmp && (norun || regtry(prog, s)))
1598 goto got_it;
1599 else
1600 tmp = doevery;
1601 }
cf93c79d 1602 else
ffc61ed2
JH
1603 tmp = 1;
1604 s++;
b8c5462f 1605 }
b8c5462f
JH
1606 }
1607 break;
b3c9acc1 1608 default:
3c3eec57
GS
1609 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1610 break;
d6a28714 1611 }
6eb5f6b9
JH
1612 return 0;
1613 got_it:
1614 return s;
1615}
1616
1617/*
1618 - regexec_flags - match a regexp against a string
1619 */
1620I32
1621Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1622 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1623/* strend: pointer to null at end of string */
1624/* strbeg: real beginning of string */
1625/* minend: end of match must be >=minend after stringarg. */
1626/* data: May be used for some additional optimizations. */
1627/* nosave: For optimizations. */
1628{
6eb5f6b9
JH
1629 register char *s;
1630 register regnode *c;
1631 register char *startpos = stringarg;
6eb5f6b9
JH
1632 I32 minlen; /* must match at least this many chars */
1633 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1634 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1635 constant substr. */ /* CC */
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
1638 char *scream_olds;
1639 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1640 bool do_utf8 = DO_UTF8(sv);
7fba1cd6 1641 I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1642#ifdef DEBUGGING
9e55ce06
JH
1643 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1644 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1645#endif
a3621e74
YO
1646
1647 GET_RE_DEBUG_FLAGS_DECL;
1648
a30b2f1f 1649 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1650
1651 PL_regcc = 0;
1652
1653 cache_re(prog);
1654#ifdef DEBUGGING
aea4f609 1655 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1656#endif
1657
1658 /* Be paranoid... */
1659 if (prog == NULL || startpos == NULL) {
1660 Perl_croak(aTHX_ "NULL regexp parameter");
1661 return 0;
1662 }
1663
1664 minlen = prog->minlen;
61a36c01 1665 if (strend - startpos < minlen) {
a3621e74 1666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1667 "String too short [regexec_flags]...\n"));
1668 goto phooey;
1aa99e6b 1669 }
6eb5f6b9 1670
6eb5f6b9
JH
1671 /* Check validity of program. */
1672 if (UCHARAT(prog->program) != REG_MAGIC) {
1673 Perl_croak(aTHX_ "corrupted regexp program");
1674 }
1675
1676 PL_reg_flags = 0;
1677 PL_reg_eval_set = 0;
1678 PL_reg_maxiter = 0;
1679
1680 if (prog->reganch & ROPT_UTF8)
1681 PL_reg_flags |= RF_utf8;
1682
1683 /* Mark beginning of line for ^ and lookbehind. */
1684 PL_regbol = startpos;
1685 PL_bostr = strbeg;
1686 PL_reg_sv = sv;
1687
1688 /* Mark end of line for $ (and such) */
1689 PL_regeol = strend;
1690
1691 /* see how far we have to get to not match where we matched before */
1692 PL_regtill = startpos+minend;
1693
1694 /* We start without call_cc context. */
1695 PL_reg_call_cc = 0;
1696
1697 /* If there is a "must appear" string, look for it. */
1698 s = startpos;
1699
1700 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1701 MAGIC *mg;
1702
1703 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1704 PL_reg_ganch = startpos;
1705 else if (sv && SvTYPE(sv) >= SVt_PVMG
1706 && SvMAGIC(sv)
14befaf4
DM
1707 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1708 && mg->mg_len >= 0) {
6eb5f6b9
JH
1709 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1710 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1711 if (s > PL_reg_ganch)
6eb5f6b9
JH
1712 goto phooey;
1713 s = PL_reg_ganch;
1714 }
1715 }
1716 else /* pos() not defined */
1717 PL_reg_ganch = strbeg;
1718 }
1719
33b8afdf 1720 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1721 re_scream_pos_data d;
1722
1723 d.scream_olds = &scream_olds;
1724 d.scream_pos = &scream_pos;
1725 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1726 if (!s) {
a3621e74 1727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1728 goto phooey; /* not present */
3fa9c3d7 1729 }
6eb5f6b9
JH
1730 }
1731
a3621e74 1732 DEBUG_EXECUTE_r({
9e55ce06
JH
1733 char *s0 = UTF ?
1734 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1735 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1736 prog->precomp;
1737 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1738 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1739 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1740 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1741 if (!PL_colorset)
1742 reginitcolors();
1743 PerlIO_printf(Perl_debug_log,
9e55ce06 1744 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
e4584336 1745 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1746 len0, len0, s0,
2a782b5b 1747 PL_colors[1],
9e55ce06 1748 len0 > 60 ? "..." : "",
2a782b5b 1749 PL_colors[0],
9e55ce06
JH
1750 (int)(len1 > 60 ? 60 : len1),
1751 s1, PL_colors[1],
1752 (len1 > 60 ? "..." : "")
2a782b5b
JH
1753 );
1754 });
6eb5f6b9
JH
1755
1756 /* Simplest case: anchored match need be tried only once. */
1757 /* [unless only anchor is BOL and multiline is set] */
1758 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1759 if (s == startpos && regtry(prog, startpos))
1760 goto got_it;
7fba1cd6 1761 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1762 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1763 {
1764 char *end;
1765
1766 if (minlen)
1767 dontbother = minlen - 1;
1aa99e6b 1768 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1769 /* for multiline we only have to try after newlines */
33b8afdf 1770 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1771 if (s == startpos)
1772 goto after_try;
1773 while (1) {
1774 if (regtry(prog, s))
1775 goto got_it;
1776 after_try:
1777 if (s >= end)
1778 goto phooey;
1779 if (prog->reganch & RE_USE_INTUIT) {
1780 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1781 if (!s)
1782 goto phooey;
1783 }
1784 else
1785 s++;
1786 }
1787 } else {
1788 if (s > startpos)
1789 s--;
1790 while (s < end) {
1791 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1792 if (regtry(prog, s))
1793 goto got_it;
1794 }
1795 }
1796 }
1797 }
1798 goto phooey;
1799 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1800 if (regtry(prog, PL_reg_ganch))
1801 goto got_it;
1802 goto phooey;
1803 }
1804
1805 /* Messy cases: unanchored match. */
33b8afdf 1806 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1807 /* we have /x+whatever/ */
1808 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1809 char ch;
bf93d4cc
GS
1810#ifdef DEBUGGING
1811 int did_match = 0;
1812#endif
33b8afdf
JH
1813 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1814 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1815 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1816
1aa99e6b 1817 if (do_utf8) {
6eb5f6b9
JH
1818 while (s < strend) {
1819 if (*s == ch) {
a3621e74 1820 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1821 if (regtry(prog, s)) goto got_it;
1822 s += UTF8SKIP(s);
1823 while (s < strend && *s == ch)
1824 s += UTF8SKIP(s);
1825 }
1826 s += UTF8SKIP(s);
1827 }
1828 }
1829 else {
1830 while (s < strend) {
1831 if (*s == ch) {
a3621e74 1832 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1833 if (regtry(prog, s)) goto got_it;
1834 s++;
1835 while (s < strend && *s == ch)
1836 s++;
1837 }
1838 s++;
1839 }
1840 }
a3621e74 1841 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1842 PerlIO_printf(Perl_debug_log,
b7953727
JH
1843 "Did not find anchored character...\n")
1844 );
6eb5f6b9
JH
1845 }
1846 /*SUPPRESS 560*/
33b8afdf
JH
1847 else if (prog->anchored_substr != Nullsv
1848 || prog->anchored_utf8 != Nullsv
1849 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1850 && prog->float_max_offset < strend - s)) {
1851 SV *must;
1852 I32 back_max;
1853 I32 back_min;
1854 char *last;
6eb5f6b9 1855 char *last1; /* Last position checked before */
bf93d4cc
GS
1856#ifdef DEBUGGING
1857 int did_match = 0;
1858#endif
33b8afdf
JH
1859 if (prog->anchored_substr || prog->anchored_utf8) {
1860 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1861 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1862 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1863 back_max = back_min = prog->anchored_offset;
1864 } else {
1865 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1868 back_max = prog->float_max_offset;
1869 back_min = prog->float_min_offset;
1870 }
1871 if (must == &PL_sv_undef)
1872 /* could not downgrade utf8 check substring, so must fail */
1873 goto phooey;
1874
1875 last = HOP3c(strend, /* Cannot start after this */
1876 -(I32)(CHR_SVLEN(must)
1877 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1878
1879 if (s > PL_bostr)
1880 last1 = HOPc(s, -1);
1881 else
1882 last1 = s - 1; /* bogus */
1883
1884 /* XXXX check_substr already used to find `s', can optimize if
1885 check_substr==must. */
1886 scream_pos = -1;
1887 dontbother = end_shift;
1888 strend = HOPc(strend, -dontbother);
1889 while ( (s <= last) &&
9041c2e3 1890 ((flags & REXEC_SCREAM)
1aa99e6b 1891 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1892 end_shift, &scream_pos, 0))
1aa99e6b 1893 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1894 (unsigned char*)strend, must,
7fba1cd6 1895 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1896 /* we may be pointing at the wrong string */
1897 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1898 s = strbeg + (s - SvPVX(sv));
a3621e74 1899 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1900 if (HOPc(s, -back_max) > last1) {
1901 last1 = HOPc(s, -back_min);
1902 s = HOPc(s, -back_max);
1903 }
1904 else {
1905 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1906
1907 last1 = HOPc(s, -back_min);
1908 s = t;
1909 }
1aa99e6b 1910 if (do_utf8) {
6eb5f6b9
JH
1911 while (s <= last1) {
1912 if (regtry(prog, s))
1913 goto got_it;
1914 s += UTF8SKIP(s);
1915 }
1916 }
1917 else {
1918 while (s <= last1) {
1919 if (regtry(prog, s))
1920 goto got_it;
1921 s++;
1922 }
1923 }
1924 }
a3621e74 1925 DEBUG_EXECUTE_r(if (!did_match)
b7953727
JH
1926 PerlIO_printf(Perl_debug_log,
1927 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1928 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1929 ? "anchored" : "floating"),
1930 PL_colors[0],
1931 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1932 SvPVX(must),
b7953727
JH
1933 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1934 );
6eb5f6b9
JH
1935 goto phooey;
1936 }
155aba94 1937 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1938 if (minlen) {
1939 I32 op = (U8)OP(prog->regstclass);
66e933ab 1940 /* don't bother with what can't match */
f14c76ed
RGS
1941 if (PL_regkind[op] != EXACT && op != CANY)
1942 strend = HOPc(strend, -(minlen - 1));
1943 }
a3621e74 1944 DEBUG_EXECUTE_r({
ffc61ed2 1945 SV *prop = sv_newmortal();
9e55ce06
JH
1946 char *s0;
1947 char *s1;
1948 int len0;
1949 int len1;
1950
ffc61ed2 1951 regprop(prop, c);
9e55ce06
JH
1952 s0 = UTF ?
1953 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1954 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1955 SvPVX(prop);
1956 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1957 s1 = UTF ?
c728cb41 1958 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1959 len1 = UTF ? SvCUR(dsv1) : strend - s;
1960 PerlIO_printf(Perl_debug_log,
1961 "Matching stclass `%*.*s' against `%*.*s'\n",
1962 len0, len0, s0,
1963 len1, len1, s1);
ffc61ed2 1964 });
06b5626a 1965 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1966 goto got_it;
a3621e74 1967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1968 }
1969 else {
1970 dontbother = 0;
33b8afdf
JH
1971 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1972 /* Trim the end. */
d6a28714 1973 char *last;
33b8afdf
JH
1974 SV* float_real;
1975
1976 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1977 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1978 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1979
1980 if (flags & REXEC_SCREAM) {
33b8afdf 1981 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1982 end_shift, &scream_pos, 1); /* last one */
1983 if (!last)
ffc61ed2 1984 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1985 /* we may be pointing at the wrong string */
1986 else if (RX_MATCH_COPIED(prog))
7ef91622 1987 s = strbeg + (s - SvPVX(sv));
b8c5462f 1988 }
d6a28714
JH
1989 else {
1990 STRLEN len;
06b5626a 1991 const char * const little = SvPV(float_real, len);
d6a28714 1992
33b8afdf 1993 if (SvTAIL(float_real)) {
d6a28714
JH
1994 if (memEQ(strend - len + 1, little, len - 1))
1995 last = strend - len + 1;
7fba1cd6 1996 else if (!multiline)
9041c2e3 1997 last = memEQ(strend - len, little, len)
d6a28714 1998 ? strend - len : Nullch;
b8c5462f 1999 else
d6a28714
JH
2000 goto find_last;
2001 } else {
2002 find_last:
9041c2e3 2003 if (len)
d6a28714 2004 last = rninstr(s, strend, little, little + len);
b8c5462f 2005 else
d6a28714 2006 last = strend; /* matching `$' */
b8c5462f 2007 }
b8c5462f 2008 }
bf93d4cc 2009 if (last == NULL) {
a3621e74 2010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2011 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2012 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2013 goto phooey; /* Should not happen! */
2014 }
d6a28714
JH
2015 dontbother = strend - last + prog->float_min_offset;
2016 }
2017 if (minlen && (dontbother < minlen))
2018 dontbother = minlen - 1;
2019 strend -= dontbother; /* this one's always in bytes! */
2020 /* We don't know much -- general case. */
1aa99e6b 2021 if (do_utf8) {
d6a28714
JH
2022 for (;;) {
2023 if (regtry(prog, s))
2024 goto got_it;
2025 if (s >= strend)
2026 break;
b8c5462f 2027 s += UTF8SKIP(s);
d6a28714
JH
2028 };
2029 }
2030 else {
2031 do {
2032 if (regtry(prog, s))
2033 goto got_it;
2034 } while (s++ < strend);
2035 }
2036 }
2037
2038 /* Failure. */
2039 goto phooey;
2040
2041got_it:
2042 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2043
2044 if (PL_reg_eval_set) {
2045 /* Preserve the current value of $^R */
2046 if (oreplsv != GvSV(PL_replgv))
2047 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2048 restored, the value remains
2049 the same. */
acfe0abc 2050 restore_pos(aTHX_ 0);
d6a28714
JH
2051 }
2052
2053 /* make sure $`, $&, $', and $digit will work later */
2054 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2055 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2056 if (flags & REXEC_COPY_STR) {
2057 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2058#ifdef PERL_COPY_ON_WRITE
2059 if ((SvIsCOW(sv)
2060 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2061 if (DEBUG_C_TEST) {
2062 PerlIO_printf(Perl_debug_log,
2063 "Copy on write: regexp capture, type %d\n",
2064 (int) SvTYPE(sv));
2065 }
2066 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2067 prog->subbeg = SvPVX(prog->saved_copy);
2068 assert (SvPOKp(prog->saved_copy));
2069 } else
2070#endif
2071 {
2072 RX_MATCH_COPIED_on(prog);
2073 s = savepvn(strbeg, i);
2074 prog->subbeg = s;
2075 }
d6a28714 2076 prog->sublen = i;
d6a28714
JH
2077 }
2078 else {
2079 prog->subbeg = strbeg;
2080 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2081 }
2082 }
9041c2e3 2083
d6a28714
JH
2084 return 1;
2085
2086phooey:
a3621e74 2087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2088 PL_colors[4], PL_colors[5]));
d6a28714 2089 if (PL_reg_eval_set)
acfe0abc 2090 restore_pos(aTHX_ 0);
d6a28714
JH
2091 return 0;
2092}
2093
2094/*
2095 - regtry - try match at specific point
2096 */
2097STATIC I32 /* 0 failure, 1 success */
2098S_regtry(pTHX_ regexp *prog, char *startpos)
2099{
d6a28714
JH
2100 register I32 i;
2101 register I32 *sp;
2102 register I32 *ep;
2103 CHECKPOINT lastcp;
a3621e74 2104 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2105
02db2b7b
IZ
2106#ifdef DEBUGGING
2107 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2108#endif
d6a28714
JH
2109 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2110 MAGIC *mg;
2111
2112 PL_reg_eval_set = RS_init;
a3621e74 2113 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2114 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2115 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2116 ));
e8347627 2117 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2118 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2119 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2120 SAVETMPS;
2121 /* Apparently this is not needed, judging by wantarray. */
e8347627 2122 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2123 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2124
2125 if (PL_reg_sv) {
2126 /* Make $_ available to executed code. */
2127 if (PL_reg_sv != DEFSV) {
59f00321 2128 SAVE_DEFSV;
d6a28714 2129 DEFSV = PL_reg_sv;
b8c5462f 2130 }
d6a28714 2131
9041c2e3 2132 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2133 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2134 /* prepare for quick setting of pos */
14befaf4
DM
2135 sv_magic(PL_reg_sv, (SV*)0,
2136 PERL_MAGIC_regex_global, Nullch, 0);
2137 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2138 mg->mg_len = -1;
b8c5462f 2139 }
d6a28714
JH
2140 PL_reg_magic = mg;
2141 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2142 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2143 }
09687e5a 2144 if (!PL_reg_curpm) {
e4584336 2145 Newz(22, PL_reg_curpm, 1, PMOP);
09687e5a
AB
2146#ifdef USE_ITHREADS
2147 {
2148 SV* repointer = newSViv(0);
577e12cc 2149 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2150 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2151 av_push(PL_regex_padav,repointer);
2152 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2153 PL_regex_pad = AvARRAY(PL_regex_padav);
2154 }
2155#endif
2156 }
aaa362c4 2157 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2158 PL_reg_oldcurpm = PL_curpm;
2159 PL_curpm = PL_reg_curpm;
2160 if (RX_MATCH_COPIED(prog)) {
2161 /* Here is a serious problem: we cannot rewrite subbeg,
2162 since it may be needed if this match fails. Thus
2163 $` inside (?{}) could fail... */
2164 PL_reg_oldsaved = prog->subbeg;
2165 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2166#ifdef PERL_COPY_ON_WRITE
2167 PL_nrs = prog->saved_copy;
2168#endif
d6a28714
JH
2169 RX_MATCH_COPIED_off(prog);
2170 }
2171 else
2172 PL_reg_oldsaved = Nullch;
2173 prog->subbeg = PL_bostr;
2174 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2175 }
2176 prog->startp[0] = startpos - PL_bostr;
2177 PL_reginput = startpos;
2178 PL_regstartp = prog->startp;
2179 PL_regendp = prog->endp;
2180 PL_reglastparen = &prog->lastparen;
a01268b5 2181 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2182 prog->lastparen = 0;
03994de8 2183 prog->lastcloseparen = 0;
d6a28714 2184 PL_regsize = 0;
a3621e74 2185 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2186 if (PL_reg_start_tmpl <= prog->nparens) {
2187 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2188 if(PL_reg_start_tmp)
2189 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2190 else
e4584336 2191 New(22, PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2192 }
2193
2194 /* XXXX What this code is doing here?!!! There should be no need
2195 to do this again and again, PL_reglastparen should take care of
3dd2943c 2196 this! --ilya*/
dafc8851
JH
2197
2198 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2199 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2200 * PL_reglastparen), is not needed at all by the test suite
2201 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2202 * enough, for building DynaLoader, or otherwise this
2203 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2204 * will happen. Meanwhile, this code *is* needed for the
2205 * above-mentioned test suite tests to succeed. The common theme
2206 * on those tests seems to be returning null fields from matches.
2207 * --jhi */
dafc8851 2208#if 1
d6a28714
JH
2209 sp = prog->startp;
2210 ep = prog->endp;
2211 if (prog->nparens) {
eb160463 2212 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2213 *++sp = -1;
2214 *++ep = -1;
2215 }
2216 }
dafc8851 2217#endif
02db2b7b 2218 REGCP_SET(lastcp);
d6a28714
JH
2219 if (regmatch(prog->program + 1)) {
2220 prog->endp[0] = PL_reginput - PL_bostr;
2221 return 1;
2222 }
02db2b7b 2223 REGCP_UNWIND(lastcp);
d6a28714
JH
2224 return 0;
2225}
2226
02db2b7b
IZ
2227#define RE_UNWIND_BRANCH 1
2228#define RE_UNWIND_BRANCHJ 2
2229
2230union re_unwind_t;
2231
2232typedef struct { /* XX: makes sense to enlarge it... */
2233 I32 type;
2234 I32 prev;
2235 CHECKPOINT lastcp;
2236} re_unwind_generic_t;
2237
2238typedef struct {
2239 I32 type;
2240 I32 prev;
2241 CHECKPOINT lastcp;
2242 I32 lastparen;
2243 regnode *next;
2244 char *locinput;
2245 I32 nextchr;
2246#ifdef DEBUGGING
2247 int regindent;
2248#endif
2249} re_unwind_branch_t;
2250
2251typedef union re_unwind_t {
2252 I32 type;
2253 re_unwind_generic_t generic;
2254 re_unwind_branch_t branch;
2255} re_unwind_t;
2256
8ba1375e
MJD
2257#define sayYES goto yes
2258#define sayNO goto no
e0f9d4a8 2259#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2260#define sayYES_FINAL goto yes_final
2261#define sayYES_LOUD goto yes_loud
2262#define sayNO_FINAL goto no_final
2263#define sayNO_SILENT goto do_no
2264#define saySAME(x) if (x) goto yes; else goto no
2265
3ab3c9b4
HS
2266#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2267#define POSCACHE_SEEN 1 /* we know what we're caching */
2268#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2269#define CACHEsayYES STMT_START { \
2270 if (cache_offset | cache_bit) { \
2271 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2272 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2273 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2274 /* cache records failure, but this is success */ \
2275 DEBUG_r( \
2276 PerlIO_printf(Perl_debug_log, \
2277 "%*s (remove success from failure cache)\n", \
2278 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 ); \
2280 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2281 } \
2282 } \
2283 sayYES; \
2284} STMT_END
2285#define CACHEsayNO STMT_START { \
2286 if (cache_offset | cache_bit) { \
2287 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2288 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2289 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2290 /* cache records success, but this is failure */ \
2291 DEBUG_r( \
2292 PerlIO_printf(Perl_debug_log, \
2293 "%*s (remove failure from success cache)\n", \
2294 REPORT_CODE_OFF+PL_regindent*2, "") \
2295 ); \
2296 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2297 } \
2298 } \
2299 sayNO; \
2300} STMT_END
2301
a3621e74
YO
2302/* this is used to determine how far from the left messages like
2303 'failed...' are printed. Currently 29 makes these messages line
2304 up with the opcode they refer to. Earlier perls used 25 which
2305 left these messages outdented making reviewing a debug output
2306 quite difficult.
2307*/
2308#define REPORT_CODE_OFF 29
2309
2310
2311/* Make sure there is a test for this +1 options in re_tests */
2312#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2313
2314#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2315 if ( trie->states[ state ].wordnum ) { \
2316 if ( !accepted ) { \
2317 ENTER; \
2318 SAVETMPS; \
2319 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2320 sv_accept_buff=NEWSV( 1234, \
2321 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2322 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2323 SvPOK_on( sv_accept_buff ); \
2324 sv_2mortal( sv_accept_buff ); \
2325 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2326 } else { \
2327 if ( accepted >= bufflen ) { \
2328 bufflen *= 2; \
2329 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2330 bufflen * sizeof(reg_trie_accepted) ); \
2331 } \
2332 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2333 + sizeof( reg_trie_accepted ) ); \
2334 } \
2335 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2336 accept_buff[ accepted ].endpos = uc; \
2337 ++accepted; \
2338 } } STMT_END
2339
2340#define TRIE_HANDLE_CHAR STMT_START { \
2341 if ( uvc < 256 ) { \
2342 charid = trie->charmap[ uvc ]; \
2343 } else { \
2344 charid = 0; \
2345 if( trie->widecharmap ) { \
2346 SV** svpp = (SV**)NULL; \
2347 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2348 sizeof( UV ), 0 ); \
2349 if ( svpp ) { \
2350 charid = (U16)SvIV( *svpp ); \
2351 } \
2352 } \
2353 } \
2354 if ( charid && \
cc601c31
YO
2355 ( base + charid > trie->uniquecharcount ) && \
2356 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2357 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2358 { \
2359 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2360 } else { \
2361 state = 0; \
2362 } \
2363 uc += len; \
2364 } STMT_END
8ba1375e 2365
d6a28714
JH
2366/*
2367 - regmatch - main matching routine
2368 *
2369 * Conceptually the strategy is simple: check to see whether the current
2370 * node matches, call self recursively to see whether the rest matches,
2371 * and then act accordingly. In practice we make some effort to avoid
2372 * recursion, in particular by going through "ordinary" nodes (that don't
2373 * need to know whether the rest of the match failed) by a loop instead of
2374 * by recursion.
2375 */
2376/* [lwall] I've hoisted the register declarations to the outer block in order to
2377 * maybe save a little bit of pushing and popping on the stack. It also takes
2378 * advantage of machines that use a register save mask on subroutine entry.
2379 */
2380STATIC I32 /* 0 failure, 1 success */
2381S_regmatch(pTHX_ regnode *prog)
2382{
d6a28714
JH
2383 register regnode *scan; /* Current node. */
2384 regnode *next; /* Next node. */
2385 regnode *inner; /* Next node in internal branch. */
2386 register I32 nextchr; /* renamed nextchr - nextchar colides with
2387 function of same name */
2388 register I32 n; /* no or next */
b7953727
JH
2389 register I32 ln = 0; /* len or last */
2390 register char *s = Nullch; /* operand or save */
d6a28714 2391 register char *locinput = PL_reginput;
b7953727 2392 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2393 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2394 I32 unwind = 0;
a3621e74
YO
2395
2396 /* used by the trie code */
ab74612d
NC
2397 SV *sv_accept_buff = 0; /* accepting states we have traversed */
2398 reg_trie_accepted *accept_buff = 0; /* "" */
2399 reg_trie_data *trie; /* what trie are we using right now */
2400 U32 accepted = 0; /* how many accepting states we have seen*/
a3621e74 2401
b7953727 2402#if 0
02db2b7b 2403 I32 firstcp = PL_savestack_ix;
b7953727 2404#endif
53c4c00c 2405 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2406#ifdef DEBUGGING
ce333219
JH
2407 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2408 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2409 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74 2410
ab74612d 2411 SV *re_debug_flags = NULL;
2a782b5b 2412#endif
02db2b7b 2413
a3621e74
YO
2414 GET_RE_DEBUG_FLAGS;
2415
d6a28714
JH
2416#ifdef DEBUGGING
2417 PL_regindent++;
2418#endif
2419
a3621e74 2420
d6a28714
JH
2421 /* Note that nextchr is a byte even in UTF */
2422 nextchr = UCHARAT(locinput);
2423 scan = prog;
2424 while (scan != NULL) {
8ba1375e 2425
a3621e74 2426 DEBUG_EXECUTE_r( {
d6a28714
JH
2427 SV *prop = sv_newmortal();
2428 int docolor = *PL_colors[0];
2429 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2430 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2431 /* The part of the string before starttry has one color
2432 (pref0_len chars), between starttry and current
2433 position another one (pref_len - pref0_len chars),
2434 after the current position the third one.
2435 We assume that pref0_len <= pref_len, otherwise we
2436 decrease pref0_len. */
9041c2e3 2437 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2438 ? (5 + taill) - l : locinput - PL_bostr;
2439 int pref0_len;
d6a28714 2440
df1ffd02 2441 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2442 pref_len++;
2443 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2444 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2445 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2446 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2448 l--;
d6a28714
JH
2449 if (pref0_len < 0)
2450 pref0_len = 0;
2451 if (pref0_len > pref_len)
2452 pref0_len = pref_len;
2453 regprop(prop, scan);
2a782b5b
JH
2454 {
2455 char *s0 =
f14c76ed 2456 do_utf8 && OP(scan) != CANY ?
2a782b5b 2457 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2458 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2459 locinput - pref_len;
df1ffd02 2460 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2461 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2462 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2463 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2464 locinput - pref_len + pref0_len;
df1ffd02 2465 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2466 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2467 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2468 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2469 locinput;
df1ffd02 2470 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2471 PerlIO_printf(Perl_debug_log,
2472 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2473 (IV)(locinput - PL_bostr),
2474 PL_colors[4],
2475 len0, s0,
2476 PL_colors[5],
2477 PL_colors[2],
2478 len1, s1,
2479 PL_colors[3],
2480 (docolor ? "" : "> <"),
2481 PL_colors[0],
2482 len2, s2,
2483 PL_colors[1],
2484 15 - l - pref_len + 1,
2485 "",
2486 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2487 SvPVX(prop));
2488 }
2489 });
d6a28714
JH
2490
2491 next = scan + NEXT_OFF(scan);
2492 if (next == scan)
2493 next = NULL;
2494
2495 switch (OP(scan)) {
2496 case BOL:
7fba1cd6 2497 if (locinput == PL_bostr)
d6a28714
JH
2498 {
2499 /* regtill = regbol; */
b8c5462f
JH
2500 break;
2501 }
d6a28714
JH
2502 sayNO;
2503 case MBOL:
12d33761
HS
2504 if (locinput == PL_bostr ||
2505 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2506 {
b8c5462f
JH
2507 break;
2508 }
d6a28714
JH
2509 sayNO;
2510 case SBOL:
c2a73568 2511 if (locinput == PL_bostr)
b8c5462f 2512 break;
d6a28714
JH
2513 sayNO;
2514 case GPOS:
2515 if (locinput == PL_reg_ganch)
2516 break;
2517 sayNO;
2518 case EOL:
d6a28714
JH
2519 goto seol;
2520 case MEOL:
d6a28714 2521 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2522 sayNO;
b8c5462f 2523 break;
d6a28714
JH
2524 case SEOL:
2525 seol:
2526 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2527 sayNO;
d6a28714 2528 if (PL_regeol - locinput > 1)
b8c5462f 2529 sayNO;
b8c5462f 2530 break;
d6a28714
JH
2531 case EOS:
2532 if (PL_regeol != locinput)
b8c5462f 2533 sayNO;
d6a28714 2534 break;
ffc61ed2 2535 case SANY:
d6a28714 2536 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2537 sayNO;
f33976b4
DB
2538 if (do_utf8) {
2539 locinput += PL_utf8skip[nextchr];
2540 if (locinput > PL_regeol)
2541 sayNO;
2542 nextchr = UCHARAT(locinput);
2543 }
2544 else
2545 nextchr = UCHARAT(++locinput);
2546 break;
2547 case CANY:
2548 if (!nextchr && locinput >= PL_regeol)
2549 sayNO;
b8c5462f 2550 nextchr = UCHARAT(++locinput);
a0d0e21e 2551 break;
ffc61ed2 2552 case REG_ANY:
1aa99e6b
IH
2553 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2554 sayNO;
2555 if (do_utf8) {
b8c5462f 2556 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2557 if (locinput > PL_regeol)
2558 sayNO;
a0ed51b3 2559 nextchr = UCHARAT(locinput);
a0ed51b3 2560 }
1aa99e6b
IH
2561 else
2562 nextchr = UCHARAT(++locinput);
a0ed51b3 2563 break;
a3621e74
YO
2564
2565
2566
2567 /*
2568 traverse the TRIE keeping track of all accepting states
2569 we transition through until we get to a failing node.
2570
2571 we use two slightly different pieces of code to handle
2572 the traversal depending on whether its case sensitive or
2573 not. we reuse the accept code however. (this should probably
2574 be turned into a macro.)
2575
2576 */
2577 case TRIEF:
2578 case TRIEFL:
2579 {
2580
2581 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2582 U8 *uc = ( U8* )locinput;
2583 U32 state = 1;
2584 U16 charid = 0;
2585 U32 base = 0;
2586 UV uvc = 0;
2587 STRLEN len = 0;
2588 STRLEN foldlen = 0;
2589 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2590 U8 *uscan = (U8*)NULL;
2591 STRLEN bufflen=0;
2592 accepted = 0;
2593
2594 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2595
2596 while ( state && uc <= (U8*)PL_regeol ) {
2597
2598 TRIE_CHECK_STATE_IS_ACCEPTING;
2599
2600 base = trie->states[ state ].trans.base;
2601
2602 DEBUG_TRIE_EXECUTE_r(
2603 PerlIO_printf( Perl_debug_log,
e4584336 2604 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2605 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2606 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2607 );
2608
2609 if ( base ) {
2610
2611 if ( do_utf8 || UTF ) {
2612 if ( foldlen>0 ) {
2613 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2614 foldlen -= len;
2615 uscan += len;
2616 len=0;
2617 } else {
2618 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2619 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2620 foldlen -= UNISKIP( uvc );
2621 uscan = foldbuf + UNISKIP( uvc );
2622 }
2623 } else {
e4584336 2624 uvc = (UV)*uc;
a3621e74
YO
2625 len = 1;
2626 }
2627
2628 TRIE_HANDLE_CHAR;
2629
2630 } else {
2631 state = 0;
2632 }
2633 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2634 PerlIO_printf( Perl_debug_log,
2635 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2636 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2637 );
2638 }
2639 if ( !accepted ) {
2640 sayNO;
2641 } else {
2642 goto TrieAccept;
2643 }
2644 }
2645 /* unreached codepoint: we jump into the middle of the next case
2646 from previous if blocks */
2647 case TRIE:
2648 {
2649 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2650 U8 *uc = (U8*)locinput;
2651 U32 state = 1;
2652 U16 charid = 0;
2653 U32 base = 0;
2654 UV uvc = 0;
2655 STRLEN len = 0;
2656 STRLEN bufflen = 0;
2657 accepted = 0;
2658
2659 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2660
2661 while ( state && uc <= (U8*)PL_regeol ) {
2662
2663 TRIE_CHECK_STATE_IS_ACCEPTING;
2664
2665 base = trie->states[ state ].trans.base;
2666
2667 DEBUG_TRIE_EXECUTE_r(
2668 PerlIO_printf( Perl_debug_log,
e4584336 2669 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
a3621e74 2670 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
e4584336 2671 (UV)state, (UV)base, (UV)accepted );
a3621e74
YO
2672 );
2673
2674 if ( base ) {
2675
2676 if ( do_utf8 || UTF ) {
2677 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2678 } else {
2679 uvc = (U32)*uc;
2680 len = 1;
2681 }
2682
2683 TRIE_HANDLE_CHAR;
2684
2685 } else {
2686 state = 0;
2687 }
2688 DEBUG_TRIE_EXECUTE_r(
2689 PerlIO_printf( Perl_debug_log,
e4584336
RB
2690 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2691 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2692 );
2693 }
2694 if ( !accepted ) {
2695 sayNO;
2696 }
2697 }
2698
2699
2700 /*
2701 There was at least one accepting state that we
2702 transitioned through. Presumably the number of accepting
2703 states is going to be low, typically one or two. So we
2704 simply scan through to find the one with lowest wordnum.
2705 Once we find it, we swap the last state into its place
2706 and decrement the size. We then try to match the rest of
2707 the pattern at the point where the word ends, if we
2708 succeed then we end the loop, otherwise the loop
2709 eventually terminates once all of the accepting states
2710 have been tried.
2711 */
2712 TrieAccept:
2713 {
2714 int gotit = 0;
2715
2716 if ( accepted == 1 ) {
2717 DEBUG_EXECUTE_r({
2718 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2719 PerlIO_printf( Perl_debug_log,
2720 "%*s %sonly one match : #%d <%s>%s\n",
e4584336 2721 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74
YO
2722 accept_buff[ 0 ].wordnum,
2723 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2724 PL_colors[5] );
2725 });
cc601c31 2726 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2727 /* in this case we free tmps/leave before we call regmatch
2728 as we wont be using accept_buff again. */
2729 FREETMPS;
2730 LEAVE;
2731 gotit = regmatch( scan + NEXT_OFF( scan ) );
2732 } else {
2733 DEBUG_EXECUTE_r(
e4584336
RB
2734 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
2735 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)accepted,
a3621e74
YO
2736 PL_colors[5] );
2737 );
2738 while ( !gotit && accepted-- ) {
2739 U32 best = 0;
2740 U32 cur;
2741 for( cur = 1 ; cur <= accepted ; cur++ ) {
e4584336
RB
2742 DEBUG_TRIE_EXECUTE_r(
2743 PerlIO_printf( Perl_debug_log,
2744 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2745 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2746 (IV)best, accept_buff[ best ].wordnum, (IV)cur,
2747 accept_buff[ cur ].wordnum, PL_colors[5] );
2748 );
a3621e74
YO
2749
2750 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2751 best = cur;
2752 }
2753 DEBUG_EXECUTE_r({
2754 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2755 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
e4584336 2756 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
a3621e74
YO
2757 accept_buff[best].wordnum,
2758 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2759 PL_colors[5] );
2760 });
2761 if ( best<accepted ) {
2762 reg_trie_accepted tmp = accept_buff[ best ];
2763 accept_buff[ best ] = accept_buff[ accepted ];
2764 accept_buff[ accepted ] = tmp;
2765 best = accepted;
2766 }
cc601c31 2767 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2768
2769 /*
2770 as far as I can tell we only need the SAVETMPS/FREETMPS
2771 for re's with EVAL in them but I'm leaving them in for
2772 all until I can be sure.
2773 */
2774 SAVETMPS;
2775 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2776 FREETMPS;
2777 }
2778 FREETMPS;
2779 LEAVE;
2780 }
2781
2782 if ( gotit ) {
2783 sayYES;
2784 } else {
2785 sayNO;
2786 }
2787 }
2788 /* unreached codepoint */
d6a28714 2789 case EXACT:
cd439c50
IZ
2790 s = STRING(scan);
2791 ln = STR_LEN(scan);
eb160463 2792 if (do_utf8 != UTF) {
bc517b45 2793 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2794 char *l = locinput;
2795 char *e = s + ln;
bc517b45 2796 STRLEN ulen;
a72c7584 2797
5ff6fc6d
JH
2798 if (do_utf8) {
2799 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2800 while (s < e) {
2801 if (l >= PL_regeol)
5ff6fc6d
JH
2802 sayNO;
2803 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2804 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2805 ckWARN(WARN_UTF8) ?
2806 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2807 sayNO;
bc517b45 2808 l += ulen;
5ff6fc6d 2809 s ++;
1aa99e6b 2810 }
5ff6fc6d
JH
2811 }
2812 else {
2813 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2814 while (s < e) {
2815 if (l >= PL_regeol)
2816 sayNO;
5ff6fc6d 2817 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2818 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2819 ckWARN(WARN_UTF8) ?
2820 0 : UTF8_ALLOW_ANY))
1aa99e6b 2821 sayNO;
bc517b45 2822 s += ulen;
a72c7584 2823 l ++;
1aa99e6b 2824 }
5ff6fc6d 2825 }
1aa99e6b
IH
2826 locinput = l;
2827 nextchr = UCHARAT(locinput);
2828 break;
2829 }
bc517b45 2830 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2831 /* Inline the first character, for speed. */
2832 if (UCHARAT(s) != nextchr)
2833 sayNO;
2834 if (PL_regeol - locinput < ln)
2835 sayNO;
2836 if (ln > 1 && memNE(s, locinput, ln))
2837 sayNO;
2838 locinput += ln;
2839 nextchr = UCHARAT(locinput);
2840 break;
2841 case EXACTFL:
b8c5462f
JH
2842 PL_reg_flags |= RF_tainted;
2843 /* FALL THROUGH */
d6a28714 2844 case EXACTF:
cd439c50
IZ
2845 s = STRING(scan);
2846 ln = STR_LEN(scan);
d6a28714 2847
d07ddd77
JH
2848 if (do_utf8 || UTF) {
2849 /* Either target or the pattern are utf8. */
d6a28714 2850 char *l = locinput;
d07ddd77 2851 char *e = PL_regeol;
bc517b45 2852
eb160463 2853 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2854 l, &e, 0, do_utf8)) {
5486206c
JH
2855 /* One more case for the sharp s:
2856 * pack("U0U*", 0xDF) =~ /ss/i,
2857 * the 0xC3 0x9F are the UTF-8
2858 * byte sequence for the U+00DF. */
2859 if (!(do_utf8 &&
2860 toLOWER(s[0]) == 's' &&
2861 ln >= 2 &&
2862 toLOWER(s[1]) == 's' &&
2863 (U8)l[0] == 0xC3 &&
2864 e - l >= 2 &&
2865 (U8)l[1] == 0x9F))
2866 sayNO;
2867 }
d07ddd77
JH
2868 locinput = e;
2869 nextchr = UCHARAT(locinput);
2870 break;
a0ed51b3 2871 }
d6a28714 2872
bc517b45
JH
2873 /* Neither the target and the pattern are utf8. */
2874
d6a28714
JH
2875 /* Inline the first character, for speed. */
2876 if (UCHARAT(s) != nextchr &&
2877 UCHARAT(s) != ((OP(scan) == EXACTF)
2878 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2879 sayNO;
d6a28714 2880 if (PL_regeol - locinput < ln)
b8c5462f 2881 sayNO;
d6a28714
JH
2882 if (ln > 1 && (OP(scan) == EXACTF
2883 ? ibcmp(s, locinput, ln)
2884 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2885 sayNO;
d6a28714
JH
2886 locinput += ln;
2887 nextchr = UCHARAT(locinput);
a0d0e21e 2888 break;
d6a28714 2889 case ANYOF:
ffc61ed2 2890 if (do_utf8) {
9e55ce06
JH
2891 STRLEN inclasslen = PL_regeol - locinput;
2892
ba7b4546 2893 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2894 sayNO_ANYOF;
ffc61ed2
JH
2895 if (locinput >= PL_regeol)
2896 sayNO;
0f0076b4 2897 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2898 nextchr = UCHARAT(locinput);
e0f9d4a8 2899 break;
ffc61ed2
JH
2900 }
2901 else {
2902 if (nextchr < 0)
2903 nextchr = UCHARAT(locinput);
7d3e948e 2904 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2905 sayNO_ANYOF;
ffc61ed2
JH
2906 if (!nextchr && locinput >= PL_regeol)
2907 sayNO;
2908 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2909 break;
2910 }
2911 no_anyof:
2912 /* If we might have the case of the German sharp s
2913 * in a casefolding Unicode character class. */
2914
ebc501f0
JH
2915 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2916 locinput += SHARP_S_SKIP;
e0f9d4a8 2917 nextchr = UCHARAT(locinput);
ffc61ed2 2918 }
e0f9d4a8
JH
2919 else
2920 sayNO;
b8c5462f 2921 break;
d6a28714 2922 case ALNUML:
b8c5462f
JH
2923 PL_reg_flags |= RF_tainted;
2924 /* FALL THROUGH */
d6a28714 2925 case ALNUM:
b8c5462f 2926 if (!nextchr)
4633a7c4 2927 sayNO;
ffc61ed2 2928 if (do_utf8) {
ad24be35 2929 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2930 if (!(OP(scan) == ALNUM
3568d838 2931 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2932 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2933 {
2934 sayNO;
a0ed51b3 2935 }
b8c5462f 2936 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2937 nextchr = UCHARAT(locinput);
2938 break;
2939 }
ffc61ed2 2940 if (!(OP(scan) == ALNUM
d6a28714 2941 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2942 sayNO;
b8c5462f 2943 nextchr = UCHARAT(++locinput);
a0d0e21e 2944 break;
d6a28714 2945 case NALNUML:
b8c5462f
JH
2946 PL_reg_flags |= RF_tainted;
2947 /* FALL THROUGH */
d6a28714
JH
2948 case NALNUM:
2949 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2950 sayNO;
ffc61ed2 2951 if (do_utf8) {
8269fa76 2952 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2953 if (OP(scan) == NALNUM
3568d838 2954 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2955 : isALNUM_LC_utf8((U8*)locinput))
2956 {
b8c5462f 2957 sayNO;
d6a28714 2958 }
b8c5462f
JH
2959 locinput += PL_utf8skip[nextchr];
2960 nextchr = UCHARAT(locinput);
2961 break;
2962 }
ffc61ed2 2963 if (OP(scan) == NALNUM
d6a28714 2964 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2965 sayNO;
76e3520e 2966 nextchr = UCHARAT(++locinput);
a0d0e21e 2967 break;
d6a28714
JH
2968 case BOUNDL:
2969 case NBOUNDL:
3280af22 2970 PL_reg_flags |= RF_tainted;
bbce6d69 2971 /* FALL THROUGH */
d6a28714
JH
2972 case BOUND:
2973 case NBOUND:
2974 /* was last char in word? */
ffc61ed2 2975 if (do_utf8) {
12d33761
HS
2976 if (locinput == PL_bostr)
2977 ln = '\n';
ffc61ed2 2978 else {
b4f7163a 2979 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2980
b4f7163a 2981 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2982 }
2983 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2984 ln = isALNUM_uni(ln);
8269fa76 2985 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2986 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2987 }
2988 else {
9041c2e3 2989 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2990 n = isALNUM_LC_utf8((U8*)locinput);
2991 }
a0ed51b3 2992 }
d6a28714 2993 else {
12d33761
HS
2994 ln = (locinput != PL_bostr) ?
2995 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2996 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2997 ln = isALNUM(ln);
2998 n = isALNUM(nextchr);
2999 }
3000 else {
3001 ln = isALNUM_LC(ln);
3002 n = isALNUM_LC(nextchr);
3003 }
d6a28714 3004 }
ffc61ed2
JH
3005 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3006 OP(scan) == BOUNDL))
3007 sayNO;
a0ed51b3 3008 break;
d6a28714 3009 case SPACEL:
3280af22 3010 PL_reg_flags |= RF_tainted;
bbce6d69 3011 /* FALL THROUGH */
d6a28714 3012 case SPACE:
9442cb0e 3013 if (!nextchr)
4633a7c4 3014 sayNO;
1aa99e6b 3015 if (do_utf8) {
fd400ab9 3016 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 3017 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 3018 if (!(OP(scan) == SPACE
3568d838 3019 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3020 : isSPACE_LC_utf8((U8*)locinput)))
3021 {
3022 sayNO;
3023 }
3024 locinput += PL_utf8skip[nextchr];
3025 nextchr = UCHARAT(locinput);
3026 break;
d6a28714 3027 }
ffc61ed2
JH
3028 if (!(OP(scan) == SPACE
3029 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3030 sayNO;
3031 nextchr = UCHARAT(++locinput);
3032 }
3033 else {
3034 if (!(OP(scan) == SPACE
3035 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3036 sayNO;
3037 nextchr = UCHARAT(++locinput);
a0ed51b3 3038 }
a0ed51b3 3039 break;
d6a28714 3040 case NSPACEL:
3280af22 3041 PL_reg_flags |= RF_tainted;
bbce6d69 3042 /* FALL THROUGH */
d6a28714 3043 case NSPACE:
9442cb0e 3044 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3045 sayNO;
1aa99e6b 3046 if (do_utf8) {
8269fa76 3047 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 3048 if (OP(scan) == NSPACE
3568d838 3049 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3050 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3051 {
3052 sayNO;
3053 }
3054 locinput += PL_utf8skip[nextchr];
3055 nextchr = UCHARAT(locinput);
3056 break;
a0ed51b3 3057 }
ffc61ed2 3058 if (OP(scan) == NSPACE
d6a28714 3059 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3060 sayNO;
76e3520e 3061 nextchr = UCHARAT(++locinput);
a0d0e21e 3062 break;
d6a28714 3063 case DIGITL:
a0ed51b3
LW
3064 PL_reg_flags |= RF_tainted;
3065 /* FALL THROUGH */
d6a28714 3066 case DIGIT:
9442cb0e 3067 if (!nextchr)
a0ed51b3 3068 sayNO;
1aa99e6b 3069 if (do_utf8) {
8269fa76 3070 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3071 if (!(OP(scan) == DIGIT
3568d838 3072 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3073 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3074 {
a0ed51b3 3075 sayNO;
dfe13c55 3076 }
6f06b55f 3077 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3078 nextchr = UCHARAT(locinput);
3079 break;
3080 }
ffc61ed2 3081 if (!(OP(scan) == DIGIT
9442cb0e 3082 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3083 sayNO;
3084 nextchr = UCHARAT(++locinput);
3085 break;
d6a28714 3086 case NDIGITL:
b8c5462f
JH
3087 PL_reg_flags |= RF_tainted;
3088 /* FALL THROUGH */
d6a28714 3089 case NDIGIT:
9442cb0e 3090 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3091 sayNO;
1aa99e6b 3092 if (do_utf8) {
8269fa76 3093 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3094 if (OP(scan) == NDIGIT
3568d838 3095 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3096 : isDIGIT_LC_utf8((U8*)locinput))
3097 {
a0ed51b3 3098 sayNO;
9442cb0e 3099 }
6f06b55f 3100 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3101 nextchr = UCHARAT(locinput);
3102 break;
3103 }
ffc61ed2 3104 if (OP(scan) == NDIGIT
9442cb0e 3105 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3106 sayNO;
3107 nextchr = UCHARAT(++locinput);
3108 break;
3109 case CLUMP:
b7c83a7e 3110 if (locinput >= PL_regeol)
a0ed51b3 3111 sayNO;
b7c83a7e
JH
3112 if (do_utf8) {
3113 LOAD_UTF8_CHARCLASS(mark,"~");
3114 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3115 sayNO;
3116 locinput += PL_utf8skip[nextchr];
3117 while (locinput < PL_regeol &&
3118 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3119 locinput += UTF8SKIP(locinput);
3120 if (locinput > PL_regeol)
3121 sayNO;
eb08e2da
JH
3122 }
3123 else
3124 locinput++;
a0ed51b3
LW
3125 nextchr = UCHARAT(locinput);
3126 break;
c8756f30 3127 case REFFL:
3280af22 3128 PL_reg_flags |= RF_tainted;
c8756f30 3129 /* FALL THROUGH */
c277df42 3130 case REF:
c8756f30 3131 case REFF:
c277df42 3132 n = ARG(scan); /* which paren pair */
cf93c79d 3133 ln = PL_regstartp[n];
2c2d71f5 3134 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3135 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3136 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3137 if (ln == PL_regendp[n])
a0d0e21e 3138 break;
a0ed51b3 3139
cf93c79d 3140 s = PL_bostr + ln;
1aa99e6b 3141 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3142 char *l = locinput;
cf93c79d 3143 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3144 /*
3145 * Note that we can't do the "other character" lookup trick as
3146 * in the 8-bit case (no pun intended) because in Unicode we
3147 * have to map both upper and title case to lower case.
3148 */
3149 if (OP(scan) == REFF) {
a2a2844f 3150 STRLEN ulen1, ulen2;
89ebb4a3
JH
3151 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3152 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a0ed51b3
LW
3153 while (s < e) {
3154 if (l >= PL_regeol)
3155 sayNO;
a2a2844f
JH
3156 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3157 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3158 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3159 sayNO;
a2a2844f
JH
3160 s += ulen1;
3161 l += ulen2;
a0ed51b3
LW
3162 }
3163 }
3164 locinput = l;
3165 nextchr = UCHARAT(locinput);
3166 break;
3167 }
3168
a0d0e21e 3169 /* Inline the first character, for speed. */
76e3520e 3170 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3171 (OP(scan) == REF ||
3172 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3173 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3174 sayNO;
cf93c79d 3175 ln = PL_regendp[n] - ln;
3280af22 3176 if (locinput + ln > PL_regeol)
4633a7c4 3177 sayNO;
c8756f30
AK
3178 if (ln > 1 && (OP(scan) == REF
3179 ? memNE(s, locinput, ln)
3180 : (OP(scan) == REFF
3181 ? ibcmp(s, locinput, ln)
3182 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3183 sayNO;
a0d0e21e 3184 locinput += ln;
76e3520e 3185 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3186 break;
3187
3188 case NOTHING:
c277df42 3189 case TAIL:
a0d0e21e
LW
3190 break;
3191 case BACK:
3192 break;
c277df42
IZ
3193 case EVAL:
3194 {
3195 dSP;
533c011a 3196 OP_4tree *oop = PL_op;
3280af22 3197 COP *ocurcop = PL_curcop;
f3548bdc 3198 PAD *old_comppad;
c277df42 3199 SV *ret;
080c2dec 3200 struct regexp *oreg = PL_reg_re;
9041c2e3 3201
c277df42 3202 n = ARG(scan);
533c011a 3203 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3204 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3205 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3206 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3207
8e5e9ebe
RGS
3208 {
3209 SV **before = SP;
3210 CALLRUNOPS(aTHX); /* Scalar context. */
3211 SPAGAIN;
3212 if (SP == before)
075aa684 3213 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3214 else {
3215 ret = POPs;
3216 PUTBACK;
3217 }
3218 }
3219
0f5d15d6 3220 PL_op = oop;
f3548bdc 3221 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3222 PL_curcop = ocurcop;
c277df42 3223 if (logical) {
0f5d15d6
IZ
3224 if (logical == 2) { /* Postponed subexpression. */
3225 regexp *re;
22c35a8c 3226 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3227 re_cc_state state;
0f5d15d6 3228 CHECKPOINT cp, lastcp;
cb50f42d 3229 int toggleutf;
faf82a0b 3230 register SV *sv;
0f5d15d6 3231
faf82a0b
AE
3232 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3233 mg = mg_find(sv, PERL_MAGIC_qr);
3234 else if (SvSMAGICAL(ret)) {
3235 if (SvGMAGICAL(ret))
3236 sv_unmagic(ret, PERL_MAGIC_qr);
3237 else
3238 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3239 }
faf82a0b 3240
0f5d15d6
IZ
3241 if (mg) {
3242 re = (regexp *)mg->mg_obj;
df0003d4 3243 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3244 }
3245 else {
3246 STRLEN len;
3247 char *t = SvPV(ret, len);
3248 PMOP pm;
3249 char *oprecomp = PL_regprecomp;
3250 I32 osize = PL_regsize;
3251 I32 onpar = PL_regnpar;
3252
5fcd1c1b 3253 Zero(&pm, 1, PMOP);
cb50f42d 3254 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 3255 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 3256 if (!(SvFLAGS(ret)
faf82a0b
AE
3257 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3258 | SVs_GMG)))
14befaf4
DM
3259 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3260 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3261 PL_regprecomp = oprecomp;
3262 PL_regsize = osize;
3263 PL_regnpar = onpar;
3264 }
a3621e74 3265 DEBUG_EXECUTE_r(
9041c2e3 3266 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
3267 "Entering embedded `%s%.60s%s%s'\n",
3268 PL_colors[0],
3269 re->precomp,
3270 PL_colors[1],
3271 (strlen(re->precomp) > 60 ? "..." : ""))
3272 );
3273 state.node = next;
3274 state.prev = PL_reg_call_cc;
3275 state.cc = PL_regcc;
3276 state.re = PL_reg_re;
3277
2ab05381 3278 PL_regcc = 0;
9041c2e3 3279
0f5d15d6 3280 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3281 REGCP_SET(lastcp);
0f5d15d6
IZ
3282 cache_re(re);
3283 state.ss = PL_savestack_ix;
3284 *PL_reglastparen = 0;
a01268b5 3285 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3286 PL_reg_call_cc = &state;
3287 PL_reginput = locinput;
cb50f42d
YST
3288 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3289 ((re->reganch & ROPT_UTF8) != 0);
3290 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3291
3292 /* XXXX This is too dramatic a measure... */
3293 PL_reg_maxiter = 0;
3294
0f5d15d6 3295 if (regmatch(re->program + 1)) {
2c914db6
IZ
3296 /* Even though we succeeded, we need to restore
3297 global variables, since we may be wrapped inside
3298 SUSPEND, thus the match may be not finished yet. */
3299
3300 /* XXXX Do this only if SUSPENDed? */
3301 PL_reg_call_cc = state.prev;
3302 PL_regcc = state.cc;
3303 PL_reg_re = state.re;
3304 cache_re(PL_reg_re);
cb50f42d 3305 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3306
3307 /* XXXX This is too dramatic a measure... */
3308 PL_reg_maxiter = 0;
3309
3310 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3311 ReREFCNT_dec(re);
3312 regcpblow(cp);
3313 sayYES;
3314 }
0f5d15d6 3315 ReREFCNT_dec(re);
02db2b7b 3316 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3317 regcppop();
3318 PL_reg_call_cc = state.prev;
3319 PL_regcc = state.cc;
3320 PL_reg_re = state.re;
d3790889 3321 cache_re(PL_reg_re);
cb50f42d 3322 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3323
3324 /* XXXX This is too dramatic a measure... */
3325 PL_reg_maxiter = 0;
3326
8e514ae6 3327 logical = 0;
0f5d15d6
IZ
3328 sayNO;
3329 }
c277df42 3330 sw = SvTRUE(ret);
0f5d15d6 3331 logical = 0;
a0ed51b3 3332 }
080c2dec 3333 else {
3280af22 3334 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3335 cache_re(oreg);
3336 }
c277df42
IZ
3337 break;
3338 }
a0d0e21e 3339 case OPEN:
c277df42 3340 n = ARG(scan); /* which paren pair */
3280af22
NIS
3341 PL_reg_start_tmp[n] = locinput;
3342 if (n > PL_regsize)
3343 PL_regsize = n;
a0d0e21e
LW
3344 break;
3345 case CLOSE:
c277df42 3346 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3347 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3348 PL_regendp[n] = locinput - PL_bostr;
eb160463 3349 if (n > (I32)*PL_reglastparen)
3280af22 3350 *PL_reglastparen = n;
a01268b5 3351 *PL_reglastcloseparen = n;
a0d0e21e 3352 break;
c277df42
IZ
3353 case GROUPP:
3354 n = ARG(scan); /* which paren pair */
eb160463 3355 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3356 break;
3357 case IFTHEN:
2c2d71f5 3358 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3359 if (sw)
3360 next = NEXTOPER(NEXTOPER(scan));
3361 else {
3362 next = scan + ARG(scan);
3363 if (OP(next) == IFTHEN) /* Fake one. */
3364 next = NEXTOPER(NEXTOPER(next));
3365 }
3366 break;
3367 case LOGICAL:
0f5d15d6 3368 logical = scan->flags;
c277df42 3369 break;
2ab05381
IZ
3370/*******************************************************************
3371 PL_regcc contains infoblock about the innermost (...)* loop, and
3372 a pointer to the next outer infoblock.
3373
3374 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3375
3376 1) After matching X, regnode for CURLYX is processed;
3377
9041c2e3 3378 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3379 regmatch() recursively with the starting point at WHILEM node;
3380
3381 3) Each hit of WHILEM node tries to match A and Z (in the order
3382 depending on the current iteration, min/max of {min,max} and
3383 greediness). The information about where are nodes for "A"
3384 and "Z" is read from the infoblock, as is info on how many times "A"
3385 was already matched, and greediness.
3386
3387 4) After A matches, the same WHILEM node is hit again.
3388
3389 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3390 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3391 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3392 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3393 of the external loop.
3394
3395 Currently present infoblocks form a tree with a stem formed by PL_curcc
3396 and whatever it mentions via ->next, and additional attached trees
3397 corresponding to temporarily unset infoblocks as in "5" above.
3398
9041c2e3 3399 In the following picture infoblocks for outer loop of
2ab05381
IZ
3400 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3401 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3402 infoblocks are drawn below the "reset" infoblock.
3403
3404 In fact in the picture below we do not show failed matches for Z and T
3405 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3406 more obvious *why* one needs to *temporary* unset infoblocks.]
3407
3408 Matched REx position InfoBlocks Comment
3409 (Y(A)*?Z)*?T x
3410 Y(A)*?Z)*?T x <- O
3411 Y (A)*?Z)*?T x <- O
3412 Y A)*?Z)*?T x <- O <- I
3413 YA )*?Z)*?T x <- O <- I
3414 YA A)*?Z)*?T x <- O <- I
3415 YAA )*?Z)*?T x <- O <- I
3416 YAA Z)*?T x <- O # Temporary unset I
3417 I
3418
3419 YAAZ Y(A)*?Z)*?T x <- O
3420 I
3421
3422 YAAZY (A)*?Z)*?T x <- O
3423 I
3424
3425 YAAZY A)*?Z)*?T x <- O <- I
3426 I
3427
3428 YAAZYA )*?Z)*?T x <- O <- I
3429 I
3430
3431 YAAZYA Z)*?T x <- O # Temporary unset I
3432 I,I
3433
3434 YAAZYAZ )*?T x <- O
3435 I,I
3436
3437 YAAZYAZ T x # Temporary unset O
3438 O
3439 I,I
3440
3441 YAAZYAZT x
3442 O
3443 I,I
3444 *******************************************************************/
a0d0e21e
LW
3445 case CURLYX: {
3446 CURCUR cc;
3280af22 3447 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3448 /* No need to save/restore up to this paren */
3449 I32 parenfloor = scan->flags;
c277df42
IZ
3450
3451 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3452 next += ARG(next);
3280af22
NIS
3453 cc.oldcc = PL_regcc;
3454 PL_regcc = &cc;
cb434fcc
IZ
3455 /* XXXX Probably it is better to teach regpush to support
3456 parenfloor > PL_regsize... */
eb160463 3457 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3458 parenfloor = *PL_reglastparen; /* Pessimization... */
3459 cc.parenfloor = parenfloor;
a0d0e21e
LW
3460 cc.cur = -1;
3461 cc.min = ARG1(scan);
3462 cc.max = ARG2(scan);
c277df42 3463 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3464 cc.next = next;
3465 cc.minmod = minmod;
3466 cc.lastloc = 0;
3280af22 3467 PL_reginput = locinput;
a0d0e21e
LW
3468 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3469 regcpblow(cp);
3280af22 3470 PL_regcc = cc.oldcc;
4633a7c4 3471 saySAME(n);
a0d0e21e
LW
3472 }
3473 /* NOT REACHED */
3474 case WHILEM: {
3475 /*
3476 * This is really hard to understand, because after we match
3477 * what we're trying to match, we must make sure the rest of
2c2d71f5 3478 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3479 * to go back UP the parse tree by recursing ever deeper. And
3480 * if it fails, we have to reset our parent's current state
3481 * that we can try again after backing off.
3482 */
3483
c277df42 3484 CHECKPOINT cp, lastcp;
3280af22 3485 CURCUR* cc = PL_regcc;
c277df42 3486 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3487 I32 cache_offset = 0, cache_bit = 0;
c277df42 3488
4633a7c4 3489 n = cc->cur + 1; /* how many we know we matched */
3280af22 3490 PL_reginput = locinput;
a0d0e21e 3491
a3621e74 3492 DEBUG_EXECUTE_r(
9041c2e3 3493 PerlIO_printf(Perl_debug_log,
91f3b821 3494 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3495 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3496 (long)n, (long)cc->min,
2797576d 3497 (long)cc->max, PTR2UV(cc))
c277df42 3498 );
4633a7c4 3499
a0d0e21e
LW
3500 /* If degenerate scan matches "", assume scan done. */
3501
579cf2c3 3502 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3503 PL_regcc = cc->oldcc;
2ab05381
IZ
3504 if (PL_regcc)
3505 ln = PL_regcc->cur;
a3621e74 3506 DEBUG_EXECUTE_r(
c3464db5
DD
3507 PerlIO_printf(Perl_debug_log,
3508 "%*s empty match detected, try continuation...\n",
3280af22 3509 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3510 );
a0d0e21e 3511 if (regmatch(cc->next))
4633a7c4 3512 sayYES;
2ab05381
IZ
3513 if (PL_regcc)
3514 PL_regcc->cur = ln;
3280af22 3515 PL_regcc = cc;
4633a7c4 3516 sayNO;
a0d0e21e
LW
3517 }
3518
3519 /* First just match a string of min scans. */
3520
3521 if (n < cc->min) {
3522 cc->cur = n;
3523 cc->lastloc = locinput;
4633a7c4
LW
3524 if (regmatch(cc->scan))
3525 sayYES;
3526 cc->cur = n - 1;
c277df42 3527 cc->lastloc = lastloc;
4633a7c4 3528 sayNO;
a0d0e21e
LW
3529 }
3530
2c2d71f5
JH
3531 if (scan->flags) {
3532 /* Check whether we already were at this position.
3533 Postpone detection until we know the match is not
3534 *that* muc