This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Better version of the Aho-Corasick patch and lots of benchmarks.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e 33#ifdef PERL_EXT_RE_BUILD
54df2634 34#include "re_top.h"
9041c2e3 35#endif
56953603 36
a687059c 37/*
e50aee73 38 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
39 *
40 * Copyright (c) 1986 by University of Toronto.
41 * Written by Henry Spencer. Not derived from licensed software.
42 *
43 * Permission is granted to anyone to use this software for any
44 * purpose on any computer system, and to redistribute it freely,
45 * subject to the following restrictions:
46 *
47 * 1. The author is not responsible for the consequences of use of
48 * this software, no matter how awful, even if they arise
49 * from defects in it.
50 *
51 * 2. The origin of this software must not be misrepresented, either
52 * by explicit claim or by omission.
53 *
54 * 3. Altered versions must be plainly marked as such, and must not
55 * be misrepresented as being the original software.
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
a687059c
LW
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69#include "EXTERN.h"
864dbfa3 70#define PERL_IN_REGEXEC_C
a687059c 71#include "perl.h"
0f5d15d6 72
54df2634
NC
73#ifdef PERL_IN_XSUB_RE
74# include "re_comp.h"
75#else
76# include "regcomp.h"
77#endif
a687059c 78
c277df42
IZ
79#define RF_tainted 1 /* tainted information used? */
80#define RF_warned 2 /* warned about big count? */
ce862d02 81#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
82#define RF_utf8 8 /* String contains multibyte chars? */
83
eb160463 84#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
85
86#define RS_init 1 /* eval environment created */
87#define RS_set 2 /* replsv value is set */
c277df42 88
a687059c
LW
89#ifndef STATIC
90#define STATIC static
91#endif
92
32fc9b6a 93#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 94
c277df42
IZ
95/*
96 * Forwards.
97 */
98
33b8afdf 99#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 100#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 101
3dab1dad
YO
102#define HOPc(pos,off) \
103 (char *)(PL_reg_match_utf8 \
52657f30 104 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
105 : (U8*)(pos + off))
106#define HOPBACKc(pos, off) \
07be1b83
YO
107 (char*)(PL_reg_match_utf8\
108 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
109 : (pos - off >= PL_bostr) \
8e11feef 110 ? (U8*)pos - off \
3dab1dad 111 : NULL)
efb30f32 112
53c4c00c 113#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 114#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 115
1a4fad37
AL
116#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
117 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
118#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
119#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
120#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
121#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 122
3dab1dad
YO
123/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
124
5f80c4cf 125/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
126#define JUMPABLE(rn) ( \
127 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
128 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
129 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 130 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
131)
132
cca55fe3 133#define HAS_TEXT(rn) ( \
3dab1dad 134 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 135)
e2d8ce26 136
a84d97b6
HS
137/*
138 Search for mandatory following text node; for lookahead, the text must
139 follow but for lookbehind (rn->flags != 0) we skip to the next step.
140*/
cca55fe3 141#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
142 while (JUMPABLE(rn)) { \
143 const OPCODE type = OP(rn); \
144 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 145 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 146 else if (type == PLUS) \
cca55fe3 147 rn = NEXTOPER(rn); \
3dab1dad 148 else if (type == IFMATCH) \
a84d97b6 149 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 150 else rn += NEXT_OFF(rn); \
3dab1dad 151 } \
5f80c4cf 152} STMT_END
74750237 153
acfe0abc 154static void restore_pos(pTHX_ void *arg);
51371543 155
76e3520e 156STATIC CHECKPOINT
cea2e8a9 157S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 158{
97aff369 159 dVAR;
a3b680e6 160 const int retval = PL_savestack_ix;
b1ce53c5 161#define REGCP_PAREN_ELEMS 4
a3b680e6 162 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
163 int p;
164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
175 SSPUSHINT(p);
176 }
b1ce53c5 177/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
178 SSPUSHINT(PL_regsize);
179 SSPUSHINT(*PL_reglastparen);
a01268b5 180 SSPUSHINT(*PL_reglastcloseparen);
3280af22 181 SSPUSHPTR(PL_reginput);
41123dfd
JH
182#define REGCP_FRAME_ELEMS 2
183/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
184 * are needed for the regexp context stack bookkeeping. */
185 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 186 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 187
a0d0e21e
LW
188 return retval;
189}
190
c277df42 191/* These are needed since we do not localize EVAL nodes: */
a3621e74 192# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 193 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 194 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 195
a3621e74 196# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 197 PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 199 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 200
76e3520e 201STATIC char *
097eb12c 202S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 203{
97aff369 204 dVAR;
b1ce53c5 205 I32 i;
a0d0e21e 206 char *input;
b1ce53c5 207
a3621e74
YO
208 GET_RE_DEBUG_FLAGS_DECL;
209
b1ce53c5 210 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 211 i = SSPOPINT;
b1ce53c5
JH
212 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
213 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 214 input = (char *) SSPOPPTR;
a01268b5 215 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
216 *PL_reglastparen = SSPOPINT;
217 PL_regsize = SSPOPINT;
b1ce53c5
JH
218
219 /* Now restore the parentheses context. */
41123dfd
JH
220 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
221 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 222 I32 tmps;
097eb12c 223 U32 paren = (U32)SSPOPINT;
3280af22 224 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
225 PL_regstartp[paren] = SSPOPINT;
226 tmps = SSPOPINT;
3280af22
NIS
227 if (paren <= *PL_reglastparen)
228 PL_regendp[paren] = tmps;
a3621e74 229 DEBUG_EXECUTE_r(
c3464db5 230 PerlIO_printf(Perl_debug_log,
b900a521 231 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 232 (UV)paren, (IV)PL_regstartp[paren],
b900a521 233 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 234 (IV)PL_regendp[paren],
3280af22 235 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 236 );
a0d0e21e 237 }
a3621e74 238 DEBUG_EXECUTE_r(
bb7a0f54 239 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 240 PerlIO_printf(Perl_debug_log,
faccc32b 241 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 242 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
243 }
244 );
daf18116 245#if 1
dafc8851
JH
246 /* It would seem that the similar code in regtry()
247 * already takes care of this, and in fact it is in
248 * a better location to since this code can #if 0-ed out
249 * but the code in regtry() is needed or otherwise tests
250 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
251 * (as of patchlevel 7877) will fail. Then again,
252 * this code seems to be necessary or otherwise
253 * building DynaLoader will fail:
254 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
255 * --jhi */
bb7a0f54 256 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
257 if (i > PL_regsize)
258 PL_regstartp[i] = -1;
259 PL_regendp[i] = -1;
a0d0e21e 260 }
dafc8851 261#endif
a0d0e21e
LW
262 return input;
263}
264
02db2b7b 265#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 266
95b24440 267#define TRYPAREN(paren, n, input, where) { \
29d1e993
HS
268 if (paren) { \
269 if (n) { \
270 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
271 PL_regendp[paren] = input - PL_bostr; \
272 } \
273 else \
274 PL_regendp[paren] = -1; \
275 } \
95b24440
DM
276 REGMATCH(next, where); \
277 if (result) \
29d1e993
HS
278 sayYES; \
279 if (paren && n) \
280 PL_regendp[paren] = -1; \
281}
282
283
a687059c 284/*
e50aee73 285 * pregexec and friends
a687059c
LW
286 */
287
76234dfb 288#ifndef PERL_IN_XSUB_RE
a687059c 289/*
c277df42 290 - pregexec - match a regexp against a string
a687059c 291 */
c277df42 292I32
864dbfa3 293Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 294 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
295/* strend: pointer to null at end of string */
296/* strbeg: real beginning of string */
297/* minend: end of match must be >=minend after stringarg. */
298/* nosave: For optimizations. */
299{
300 return
9041c2e3 301 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
302 nosave ? 0 : REXEC_COPY_STR);
303}
76234dfb 304#endif
22e551b9 305
9041c2e3 306/*
cad2e5aa
JH
307 * Need to implement the following flags for reg_anch:
308 *
309 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
310 * USE_INTUIT_ML
311 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
312 * INTUIT_AUTORITATIVE_ML
313 * INTUIT_ONCE_NOML - Intuit can match in one location only.
314 * INTUIT_ONCE_ML
315 *
316 * Another flag for this function: SECOND_TIME (so that float substrs
317 * with giant delta may be not rechecked).
318 */
319
320/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
321
3f7c398e 322/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
323 Otherwise, only SvCUR(sv) is used to get strbeg. */
324
325/* XXXX We assume that strpos is strbeg unless sv. */
326
6eb5f6b9
JH
327/* XXXX Some places assume that there is a fixed substring.
328 An update may be needed if optimizer marks as "INTUITable"
329 RExen without fixed substrings. Similarly, it is assumed that
330 lengths of all the strings are no more than minlen, thus they
331 cannot come from lookahead.
332 (Or minlen should take into account lookahead.) */
333
2c2d71f5
JH
334/* A failure to find a constant substring means that there is no need to make
335 an expensive call to REx engine, thus we celebrate a failure. Similarly,
336 finding a substring too deep into the string means that less calls to
30944b6d
IZ
337 regtry() should be needed.
338
339 REx compiler's optimizer found 4 possible hints:
340 a) Anchored substring;
341 b) Fixed substring;
342 c) Whether we are anchored (beginning-of-line or \G);
343 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 344 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
345 string which does not contradict any of them.
346 */
2c2d71f5 347
6eb5f6b9
JH
348/* Most of decisions we do here should have been done at compile time.
349 The nodes of the REx which we used for the search should have been
350 deleted from the finite automaton. */
351
cad2e5aa
JH
352char *
353Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
354 char *strend, U32 flags, re_scream_pos_data *data)
355{
97aff369 356 dVAR;
b7953727 357 register I32 start_shift = 0;
cad2e5aa 358 /* Should be nonnegative! */
b7953727 359 register I32 end_shift = 0;
2c2d71f5
JH
360 register char *s;
361 register SV *check;
a1933d95 362 char *strbeg;
cad2e5aa 363 char *t;
a3b680e6 364 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 365 I32 ml_anch;
bd61b366
SS
366 register char *other_last = NULL; /* other substr checked before this */
367 char *check_at = NULL; /* check substr found at this pos */
1df70142 368 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 369#ifdef DEBUGGING
890ce7af
AL
370 const char * const i_strpos = strpos;
371 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 372#endif
a3621e74
YO
373
374 GET_RE_DEBUG_FLAGS_DECL;
375
a30b2f1f 376 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 377
b8d68ded 378 if (prog->reganch & ROPT_UTF8) {
a3621e74 379 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
380 "UTF-8 regex...\n"));
381 PL_reg_flags |= RF_utf8;
382 }
383
a3621e74 384 DEBUG_EXECUTE_r({
1df70142 385 const char *s = PL_reg_match_utf8 ?
c728cb41
JH
386 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
387 strpos;
1df70142 388 const int len = PL_reg_match_utf8 ?
bb7a0f54 389 (int)strlen(s) : strend - strpos;
2a782b5b
JH
390 if (!PL_colorset)
391 reginitcolors();
b8d68ded 392 if (PL_reg_match_utf8)
a3621e74 393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 394 "UTF-8 target...\n"));
2a782b5b 395 PerlIO_printf(Perl_debug_log,
a0288114 396 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
e4584336 397 PL_colors[4], PL_colors[5], PL_colors[0],
2a782b5b
JH
398 prog->precomp,
399 PL_colors[1],
400 (strlen(prog->precomp) > 60 ? "..." : ""),
401 PL_colors[0],
402 (int)(len > 60 ? 60 : len),
403 s, PL_colors[1],
404 (len > 60 ? "..." : "")
405 );
406 });
cad2e5aa 407
c344f387
JH
408 /* CHR_DIST() would be more correct here but it makes things slow. */
409 if (prog->minlen > strend - strpos) {
a3621e74 410 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 411 "String too short... [re_intuit_start]\n"));
cad2e5aa 412 goto fail;
2c2d71f5 413 }
a1933d95 414 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 415 PL_regeol = strend;
33b8afdf
JH
416 if (do_utf8) {
417 if (!prog->check_utf8 && prog->check_substr)
418 to_utf8_substr(prog);
419 check = prog->check_utf8;
420 } else {
421 if (!prog->check_substr && prog->check_utf8)
422 to_byte_substr(prog);
423 check = prog->check_substr;
424 }
425 if (check == &PL_sv_undef) {
a3621e74 426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
427 "Non-utf string cannot match utf check string\n"));
428 goto fail;
429 }
2c2d71f5 430 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
431 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
432 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 433 && !multiline ) ); /* Check after \n? */
cad2e5aa 434
7e25d62c
JH
435 if (!ml_anch) {
436 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
437 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 438 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
439 && sv && !SvROK(sv)
440 && (strpos != strbeg)) {
a3621e74 441 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
442 goto fail;
443 }
444 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 445 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 446 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
447 I32 slen;
448
1aa99e6b 449 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
450 if (SvTAIL(check)) {
451 slen = SvCUR(check); /* >= 1 */
cad2e5aa 452
9041c2e3 453 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 454 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 456 goto fail_finish;
cad2e5aa
JH
457 }
458 /* Now should match s[0..slen-2] */
459 slen--;
3f7c398e 460 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 461 || (slen > 1
3f7c398e 462 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 463 report_neq:
a3621e74 464 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
465 goto fail_finish;
466 }
cad2e5aa 467 }
3f7c398e 468 else if (*SvPVX_const(check) != *s
653099ff 469 || ((slen = SvCUR(check)) > 1
3f7c398e 470 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 471 goto report_neq;
c315bfe8 472 check_at = s;
2c2d71f5 473 goto success_at_start;
7e25d62c 474 }
cad2e5aa 475 }
2c2d71f5 476 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 477 s = strpos;
2c2d71f5 478 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 479 end_shift = prog->minlen - start_shift -
653099ff 480 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 481 if (!ml_anch) {
a3b680e6 482 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 483 - (SvTAIL(check) != 0);
a3b680e6 484 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
485
486 if (end_shift < eshift)
487 end_shift = eshift;
488 }
cad2e5aa 489 }
2c2d71f5 490 else { /* Can match at random position */
cad2e5aa
JH
491 ml_anch = 0;
492 s = strpos;
2c2d71f5
JH
493 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
494 /* Should be nonnegative! */
495 end_shift = prog->minlen - start_shift -
653099ff 496 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
497 }
498
2c2d71f5 499#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 500 if (end_shift < 0)
6bbae5e6 501 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
502#endif
503
2c2d71f5
JH
504 restart:
505 /* Find a possible match in the region s..strend by looking for
506 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 507 if (flags & REXEC_SCREAM) {
cad2e5aa 508 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 509 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 510
2c2d71f5
JH
511 if (PL_screamfirst[BmRARE(check)] >= 0
512 || ( BmRARE(check) == '\n'
513 && (BmPREVIOUS(check) == SvCUR(check) - 1)
514 && SvTAIL(check) ))
9041c2e3 515 s = screaminstr(sv, check,
2c2d71f5 516 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 517 else
2c2d71f5 518 goto fail_finish;
4addbd3b
HS
519 /* we may be pointing at the wrong string */
520 if (s && RX_MATCH_COPIED(prog))
3f7c398e 521 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
522 if (data)
523 *data->scream_olds = s;
524 }
f33976b4 525 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
526 s = fbm_instr((U8*)(s + start_shift),
527 (U8*)(strend - end_shift),
7fba1cd6 528 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 529 else
1aa99e6b
IH
530 s = fbm_instr(HOP3(s, start_shift, strend),
531 HOP3(strend, -end_shift, strbeg),
7fba1cd6 532 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
533
534 /* Update the count-of-usability, remove useless subpatterns,
535 unshift s. */
2c2d71f5 536
a0288114 537 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
2c2d71f5 538 (s ? "Found" : "Did not find"),
33b8afdf 539 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 540 PL_colors[0],
7b0972df 541 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
3f7c398e 542 SvPVX_const(check),
2c2d71f5
JH
543 PL_colors[1], (SvTAIL(check) ? "$" : ""),
544 (s ? " at offset " : "...\n") ) );
545
546 if (!s)
547 goto fail_finish;
548
6eb5f6b9
JH
549 check_at = s;
550
2c2d71f5 551 /* Finish the diagnostic message */
a3621e74 552 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
553
554 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
555 Start with the other substr.
556 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 557 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
558 *always* match. Probably should be marked during compile...
559 Probably it is right to do no SCREAM here...
560 */
561
33b8afdf 562 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 563 /* Take into account the "other" substring. */
2c2d71f5
JH
564 /* XXXX May be hopelessly wrong for UTF... */
565 if (!other_last)
6eb5f6b9 566 other_last = strpos;
33b8afdf 567 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
568 do_other_anchored:
569 {
890ce7af
AL
570 char * const last = HOP3c(s, -start_shift, strbeg);
571 char *last1, *last2;
2c2d71f5 572 char *s1 = s;
33b8afdf 573 SV* must;
2c2d71f5 574
2c2d71f5
JH
575 t = s - prog->check_offset_max;
576 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 577 && (!do_utf8
0ce71af7 578 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 579 && t > strpos)))
6f207bd3 580 NOOP;
2c2d71f5
JH
581 else
582 t = strpos;
1aa99e6b 583 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
584 if (t < other_last) /* These positions already checked */
585 t = other_last;
1aa99e6b 586 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
587 if (last < last1)
588 last1 = last;
589 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
590 /* On end-of-str: see comment below. */
33b8afdf
JH
591 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
592 if (must == &PL_sv_undef) {
593 s = (char*)NULL;
a3621e74 594 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
595 }
596 else
597 s = fbm_instr(
598 (unsigned char*)t,
599 HOP3(HOP3(last1, prog->anchored_offset, strend)
600 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
601 must,
7fba1cd6 602 multiline ? FBMrf_MULTILINE : 0
33b8afdf 603 );
a3621e74 604 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a0288114 605 "%s anchored substr \"%s%.*s%s\"%s",
2c2d71f5
JH
606 (s ? "Found" : "Contradicts"),
607 PL_colors[0],
33b8afdf
JH
608 (int)(SvCUR(must)
609 - (SvTAIL(must)!=0)),
3f7c398e 610 SvPVX_const(must),
33b8afdf 611 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
612 if (!s) {
613 if (last1 >= last2) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
615 ", giving up...\n"));
616 goto fail_finish;
617 }
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 619 ", trying floating at offset %ld...\n",
1aa99e6b
IH
620 (long)(HOP3c(s1, 1, strend) - i_strpos)));
621 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
622 s = HOP3c(last, 1, strend);
2c2d71f5
JH
623 goto restart;
624 }
625 else {
a3621e74 626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 627 (long)(s - i_strpos)));
1aa99e6b
IH
628 t = HOP3c(s, -prog->anchored_offset, strbeg);
629 other_last = HOP3c(s, 1, strend);
30944b6d 630 s = s1;
2c2d71f5
JH
631 if (t == strpos)
632 goto try_at_start;
2c2d71f5
JH
633 goto try_at_offset;
634 }
30944b6d 635 }
2c2d71f5
JH
636 }
637 else { /* Take into account the floating substring. */
33b8afdf
JH
638 char *last, *last1;
639 char *s1 = s;
640 SV* must;
641
642 t = HOP3c(s, -start_shift, strbeg);
643 last1 = last =
644 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
645 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
646 last = HOP3c(t, prog->float_max_offset, strend);
647 s = HOP3c(t, prog->float_min_offset, strend);
648 if (s < other_last)
649 s = other_last;
2c2d71f5 650 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
651 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
652 /* fbm_instr() takes into account exact value of end-of-str
653 if the check is SvTAIL(ed). Since false positives are OK,
654 and end-of-str is not later than strend we are OK. */
655 if (must == &PL_sv_undef) {
656 s = (char*)NULL;
a3621e74 657 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
658 }
659 else
2c2d71f5 660 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
661 (unsigned char*)last + SvCUR(must)
662 - (SvTAIL(must)!=0),
7fba1cd6 663 must, multiline ? FBMrf_MULTILINE : 0);
a0288114 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
33b8afdf
JH
665 (s ? "Found" : "Contradicts"),
666 PL_colors[0],
667 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 668 SvPVX_const(must),
33b8afdf
JH
669 PL_colors[1], (SvTAIL(must) ? "$" : "")));
670 if (!s) {
671 if (last1 == last) {
a3621e74 672 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
673 ", giving up...\n"));
674 goto fail_finish;
2c2d71f5 675 }
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
677 ", trying anchored starting at offset %ld...\n",
678 (long)(s1 + 1 - i_strpos)));
679 other_last = last;
680 s = HOP3c(t, 1, strend);
681 goto restart;
682 }
683 else {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
685 (long)(s - i_strpos)));
686 other_last = s; /* Fix this later. --Hugo */
687 s = s1;
688 if (t == strpos)
689 goto try_at_start;
690 goto try_at_offset;
691 }
2c2d71f5 692 }
cad2e5aa 693 }
2c2d71f5
JH
694
695 t = s - prog->check_offset_max;
2c2d71f5 696 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 697 && (!do_utf8
0ce71af7 698 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 699 && t > strpos))) {
2c2d71f5
JH
700 /* Fixed substring is found far enough so that the match
701 cannot start at strpos. */
702 try_at_offset:
cad2e5aa 703 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
704 /* Eventually fbm_*() should handle this, but often
705 anchored_offset is not 0, so this check will not be wasted. */
706 /* XXXX In the code below we prefer to look for "^" even in
707 presence of anchored substrings. And we search even
708 beyond the found float position. These pessimizations
709 are historical artefacts only. */
710 find_anchor:
2c2d71f5 711 while (t < strend - prog->minlen) {
cad2e5aa 712 if (*t == '\n') {
4ee3650e 713 if (t < check_at - prog->check_offset_min) {
33b8afdf 714 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
715 /* Since we moved from the found position,
716 we definitely contradict the found anchored
30944b6d
IZ
717 substr. Due to the above check we do not
718 contradict "check" substr.
719 Thus we can arrive here only if check substr
720 is float. Redo checking for "other"=="fixed".
721 */
9041c2e3 722 strpos = t + 1;
a3621e74 723 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 724 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
725 goto do_other_anchored;
726 }
4ee3650e
GS
727 /* We don't contradict the found floating substring. */
728 /* XXXX Why not check for STCLASS? */
cad2e5aa 729 s = t + 1;
a3621e74 730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 731 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
732 goto set_useful;
733 }
4ee3650e
GS
734 /* Position contradicts check-string */
735 /* XXXX probably better to look for check-string
736 than for "\n", so one should lower the limit for t? */
a3621e74 737 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 738 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 739 other_last = strpos = s = t + 1;
cad2e5aa
JH
740 goto restart;
741 }
742 t++;
743 }
a3621e74 744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 745 PL_colors[0], PL_colors[1]));
2c2d71f5 746 goto fail_finish;
cad2e5aa 747 }
f5952150 748 else {
a3621e74 749 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 750 PL_colors[0], PL_colors[1]));
f5952150 751 }
cad2e5aa
JH
752 s = t;
753 set_useful:
33b8afdf 754 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
755 }
756 else {
f5952150 757 /* The found string does not prohibit matching at strpos,
2c2d71f5 758 - no optimization of calling REx engine can be performed,
f5952150
GS
759 unless it was an MBOL and we are not after MBOL,
760 or a future STCLASS check will fail this. */
2c2d71f5
JH
761 try_at_start:
762 /* Even in this situation we may use MBOL flag if strpos is offset
763 wrt the start of the string. */
05b4157f 764 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 765 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
766 /* May be due to an implicit anchor of m{.*foo} */
767 && !(prog->reganch & ROPT_IMPLICIT))
768 {
cad2e5aa
JH
769 t = strpos;
770 goto find_anchor;
771 }
a3621e74 772 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 773 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 774 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 775 );
2c2d71f5 776 success_at_start:
30944b6d 777 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
778 && (do_utf8 ? (
779 prog->check_utf8 /* Could be deleted already */
780 && --BmUSEFUL(prog->check_utf8) < 0
781 && (prog->check_utf8 == prog->float_utf8)
782 ) : (
783 prog->check_substr /* Could be deleted already */
784 && --BmUSEFUL(prog->check_substr) < 0
785 && (prog->check_substr == prog->float_substr)
786 )))
66e933ab 787 {
cad2e5aa 788 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
790 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
791 if (do_utf8 ? prog->check_substr : prog->check_utf8)
792 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
793 prog->check_substr = prog->check_utf8 = NULL; /* disable */
794 prog->float_substr = prog->float_utf8 = NULL; /* clear */
795 check = NULL; /* abort */
cad2e5aa 796 s = strpos;
3cf5c195
IZ
797 /* XXXX This is a remnant of the old implementation. It
798 looks wasteful, since now INTUIT can use many
6eb5f6b9 799 other heuristics. */
cad2e5aa
JH
800 prog->reganch &= ~RE_USE_INTUIT;
801 }
802 else
803 s = strpos;
804 }
805
6eb5f6b9
JH
806 /* Last resort... */
807 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 808 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
809 /* minlen == 0 is possible if regstclass is \b or \B,
810 and the fixed substr is ''$.
811 Since minlen is already taken into account, s+1 is before strend;
812 accidentally, minlen >= 1 guaranties no false positives at s + 1
813 even for \b or \B. But (minlen? 1 : 0) below assumes that
814 regstclass does not come from lookahead... */
815 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
816 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 817 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 818 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 819 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 820 : 1);
07be1b83 821 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 822 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 823 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
824 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
825 cl_l, strend)
826 : strend);
07be1b83
YO
827 /*if (OP(prog->regstclass) == TRIE)
828 endpos++;*/
6eb5f6b9 829 t = s;
3b0527fe 830 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
831 if (!s) {
832#ifdef DEBUGGING
cbbf8932 833 const char *what = NULL;
6eb5f6b9
JH
834#endif
835 if (endpos == strend) {
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
837 "Could not match STCLASS...\n") );
838 goto fail;
839 }
a3621e74 840 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 841 "This position contradicts STCLASS...\n") );
653099ff
GS
842 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
843 goto fail;
6eb5f6b9 844 /* Contradict one of substrings */
33b8afdf
JH
845 if (prog->anchored_substr || prog->anchored_utf8) {
846 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 847 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 848 hop_and_restart:
1aa99e6b 849 s = HOP3c(t, 1, strend);
66e933ab
GS
850 if (s + start_shift + end_shift > strend) {
851 /* XXXX Should be taken into account earlier? */
a3621e74 852 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
853 "Could not match STCLASS...\n") );
854 goto fail;
855 }
5e39e1e5
HS
856 if (!check)
857 goto giveup;
a3621e74 858 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 859 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
860 what, (long)(s + start_shift - i_strpos)) );
861 goto restart;
862 }
66e933ab 863 /* Have both, check_string is floating */
6eb5f6b9
JH
864 if (t + start_shift >= check_at) /* Contradicts floating=check */
865 goto retry_floating_check;
866 /* Recheck anchored substring, but not floating... */
9041c2e3 867 s = check_at;
5e39e1e5
HS
868 if (!check)
869 goto giveup;
a3621e74 870 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 871 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
872 (long)(other_last - i_strpos)) );
873 goto do_other_anchored;
874 }
60e71179
GS
875 /* Another way we could have checked stclass at the
876 current position only: */
877 if (ml_anch) {
878 s = t = t + 1;
5e39e1e5
HS
879 if (!check)
880 goto giveup;
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 882 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 883 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 884 goto try_at_offset;
66e933ab 885 }
33b8afdf 886 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 887 goto fail;
6eb5f6b9
JH
888 /* Check is floating subtring. */
889 retry_floating_check:
890 t = check_at - start_shift;
a3621e74 891 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
892 goto hop_and_restart;
893 }
b7953727 894 if (t != s) {
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 896 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
897 (long)(t - i_strpos), (long)(s - i_strpos))
898 );
899 }
900 else {
a3621e74 901 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
902 "Does not contradict STCLASS...\n");
903 );
904 }
6eb5f6b9 905 }
5e39e1e5 906 giveup:
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
908 PL_colors[4], (check ? "Guessed" : "Giving up"),
909 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 910 return s;
2c2d71f5
JH
911
912 fail_finish: /* Substring not found */
33b8afdf
JH
913 if (prog->check_substr || prog->check_utf8) /* could be removed already */
914 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 915 fail:
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 917 PL_colors[4], PL_colors[5]));
bd61b366 918 return NULL;
cad2e5aa 919}
9661b544 920
6eb5f6b9 921/* We know what class REx starts with. Try to find this position... */
3b0527fe 922/* if reginfo is NULL, its a dryrun */
07be1b83
YO
923/* annoyingly all the vars in this routine have different names from their counterparts
924 in regmatch. /grrr */
3b0527fe 925
3c3eec57 926STATIC char *
07be1b83
YO
927S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
928 const char *strend, const regmatch_info *reginfo)
a687059c 929{
27da23d5 930 dVAR;
1df70142 931 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 932 char *m;
d8093b23 933 STRLEN ln;
5dab1207 934 STRLEN lnc;
078c425b 935 register STRLEN uskip;
d8093b23
G
936 unsigned int c1;
937 unsigned int c2;
6eb5f6b9
JH
938 char *e;
939 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 940 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 941
6eb5f6b9
JH
942 /* We know what class it must start with. */
943 switch (OP(c)) {
6eb5f6b9 944 case ANYOF:
388cc4de 945 if (do_utf8) {
078c425b 946 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
947 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
948 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a
DM
949 reginclass(prog, c, (U8*)s, 0, do_utf8) :
950 REGINCLASS(prog, c, (U8*)s)) {
3b0527fe 951 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
952 goto got_it;
953 else
954 tmp = doevery;
955 }
956 else
957 tmp = 1;
078c425b 958 s += uskip;
388cc4de
HS
959 }
960 }
961 else {
962 while (s < strend) {
963 STRLEN skip = 1;
964
32fc9b6a 965 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
966 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
967 /* The assignment of 2 is intentional:
968 * for the folded sharp s, the skip is 2. */
969 (skip = SHARP_S_SKIP))) {
3b0527fe 970 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += skip;
978 }
a0d0e21e 979 }
6eb5f6b9 980 break;
f33976b4
DB
981 case CANY:
982 while (s < strend) {
3b0527fe 983 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
984 goto got_it;
985 else
986 tmp = doevery;
987 s++;
988 }
989 break;
6eb5f6b9 990 case EXACTF:
5dab1207
NIS
991 m = STRING(c);
992 ln = STR_LEN(c); /* length to match in octets/bytes */
993 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 994 if (UTF) {
a2a2844f 995 STRLEN ulen1, ulen2;
5dab1207 996 U8 *sm = (U8 *) m;
89ebb4a3
JH
997 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
998 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 999 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1000
1001 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1002 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1003
89ebb4a3 1004 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1005 0, uniflags);
89ebb4a3 1006 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1007 0, uniflags);
5dab1207
NIS
1008 lnc = 0;
1009 while (sm < ((U8 *) m + ln)) {
1010 lnc++;
1011 sm += UTF8SKIP(sm);
1012 }
1aa99e6b
IH
1013 }
1014 else {
1015 c1 = *(U8*)m;
1016 c2 = PL_fold[c1];
1017 }
6eb5f6b9
JH
1018 goto do_exactf;
1019 case EXACTFL:
5dab1207
NIS
1020 m = STRING(c);
1021 ln = STR_LEN(c);
1022 lnc = (I32) ln;
d8093b23 1023 c1 = *(U8*)m;
6eb5f6b9
JH
1024 c2 = PL_fold_locale[c1];
1025 do_exactf:
db12adc6 1026 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1027
3b0527fe 1028 if (!reginfo && e < s)
6eb5f6b9 1029 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1030
60a8b682
JH
1031 /* The idea in the EXACTF* cases is to first find the
1032 * first character of the EXACTF* node and then, if
1033 * necessary, case-insensitively compare the full
1034 * text of the node. The c1 and c2 are the first
1035 * characters (though in Unicode it gets a bit
1036 * more complicated because there are more cases
7f16dd3d
JH
1037 * than just upper and lower: one needs to use
1038 * the so-called folding case for case-insensitive
1039 * matching (called "loose matching" in Unicode).
1040 * ibcmp_utf8() will do just that. */
60a8b682 1041
1aa99e6b 1042 if (do_utf8) {
575cac57 1043 UV c, f;
89ebb4a3 1044 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1045 STRLEN len, foldlen;
4ad0818d 1046 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1047 if (c1 == c2) {
5dab1207
NIS
1048 /* Upper and lower of 1st char are equal -
1049 * probably not a "letter". */
1aa99e6b 1050 while (s <= e) {
89ebb4a3 1051 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1052 uniflags);
80aecb99
JH
1053 if ( c == c1
1054 && (ln == len ||
3dab1dad
YO
1055 ibcmp_utf8(s, NULL, 0, do_utf8,
1056 m, NULL, ln, (bool)UTF))
3b0527fe 1057 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1058 goto got_it;
80aecb99 1059 else {
1df70142 1060 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1061 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1062 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1063 if ( f != c
1064 && (f == c1 || f == c2)
1065 && (ln == foldlen ||
66423254 1066 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1067 NULL, foldlen, do_utf8,
d07ddd77 1068 m,
3dab1dad 1069 NULL, ln, (bool)UTF))
3b0527fe 1070 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1071 goto got_it;
1072 }
1aa99e6b
IH
1073 s += len;
1074 }
09091399
JH
1075 }
1076 else {
1aa99e6b 1077 while (s <= e) {
89ebb4a3 1078 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1079 uniflags);
80aecb99 1080
60a8b682 1081 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1082 * Note that not all the possible combinations
1083 * are handled here: some of them are handled
1084 * by the standard folding rules, and some of
1085 * them (the character class or ANYOF cases)
1086 * are handled during compiletime in
1087 * regexec.c:S_regclass(). */
880bd946
JH
1088 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1089 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1090 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1091
1092 if ( (c == c1 || c == c2)
1093 && (ln == len ||
3dab1dad
YO
1094 ibcmp_utf8(s, NULL, 0, do_utf8,
1095 m, NULL, ln, (bool)UTF))
3b0527fe 1096 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b 1097 goto got_it;
80aecb99 1098 else {
1df70142 1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
80aecb99 1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
a6872d42 1105 !ibcmp_utf8((char *) foldbuf,
3dab1dad 1106 NULL, foldlen, do_utf8,
d07ddd77 1107 m,
3dab1dad 1108 NULL, ln, (bool)UTF))
3b0527fe 1109 && (!reginfo || regtry(reginfo, s)) )
80aecb99
JH
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399 1114 }
1aa99e6b
IH
1115 }
1116 else {
1117 if (c1 == c2)
1118 while (s <= e) {
1119 if ( *(U8*)s == c1
1120 && (ln == 1 || !(OP(c) == EXACTF
1121 ? ibcmp(s, m, ln)
1122 : ibcmp_locale(s, m, ln)))
3b0527fe 1123 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1124 goto got_it;
1125 s++;
1126 }
1127 else
1128 while (s <= e) {
1129 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1130 && (ln == 1 || !(OP(c) == EXACTF
1131 ? ibcmp(s, m, ln)
1132 : ibcmp_locale(s, m, ln)))
3b0527fe 1133 && (!reginfo || regtry(reginfo, s)) )
1aa99e6b
IH
1134 goto got_it;
1135 s++;
1136 }
b3c9acc1
IZ
1137 }
1138 break;
bbce6d69 1139 case BOUNDL:
3280af22 1140 PL_reg_flags |= RF_tainted;
bbce6d69 1141 /* FALL THROUGH */
a0d0e21e 1142 case BOUND:
ffc61ed2 1143 if (do_utf8) {
12d33761 1144 if (s == PL_bostr)
ffc61ed2
JH
1145 tmp = '\n';
1146 else {
6136c704 1147 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1148 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1149 }
1150 tmp = ((OP(c) == BOUND ?
9041c2e3 1151 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1152 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1153 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1154 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1155 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1156 isALNUM_LC_utf8((U8*)s)))
1157 {
1158 tmp = !tmp;
3b0527fe 1159 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1160 goto got_it;
1161 }
078c425b 1162 s += uskip;
a687059c 1163 }
a0d0e21e 1164 }
667bb95a 1165 else {
12d33761 1166 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1167 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1168 while (s < strend) {
1169 if (tmp ==
1170 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1171 tmp = !tmp;
3b0527fe 1172 if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1173 goto got_it;
1174 }
1175 s++;
a0ed51b3 1176 }
a0ed51b3 1177 }
3b0527fe 1178 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1179 goto got_it;
1180 break;
bbce6d69 1181 case NBOUNDL:
3280af22 1182 PL_reg_flags |= RF_tainted;
bbce6d69 1183 /* FALL THROUGH */
a0d0e21e 1184 case NBOUND:
ffc61ed2 1185 if (do_utf8) {
12d33761 1186 if (s == PL_bostr)
ffc61ed2
JH
1187 tmp = '\n';
1188 else {
6136c704 1189 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1190 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1191 }
1192 tmp = ((OP(c) == NBOUND ?
9041c2e3 1193 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1194 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1195 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1196 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1197 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1198 isALNUM_LC_utf8((U8*)s)))
1199 tmp = !tmp;
3b0527fe 1200 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2 1201 goto got_it;
078c425b 1202 s += uskip;
ffc61ed2 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == NBOUND ?
1208 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1210 if (tmp ==
1211 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1212 tmp = !tmp;
3b0527fe 1213 else if ((!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1214 goto got_it;
1215 s++;
1216 }
a0ed51b3 1217 }
3b0527fe 1218 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
a0d0e21e 1221 case ALNUM:
ffc61ed2 1222 if (do_utf8) {
1a4fad37 1223 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1224 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1225 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1226 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1227 goto got_it;
1228 else
1229 tmp = doevery;
1230 }
bbce6d69 1231 else
ffc61ed2 1232 tmp = 1;
078c425b 1233 s += uskip;
bbce6d69 1234 }
bbce6d69 1235 }
ffc61ed2
JH
1236 else {
1237 while (s < strend) {
1238 if (isALNUM(*s)) {
3b0527fe 1239 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
a0ed51b3 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s++;
a0ed51b3 1247 }
a0ed51b3
LW
1248 }
1249 break;
bbce6d69 1250 case ALNUML:
3280af22 1251 PL_reg_flags |= RF_tainted;
ffc61ed2 1252 if (do_utf8) {
078c425b 1253 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1254 if (isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1255 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1256 goto got_it;
1257 else
1258 tmp = doevery;
1259 }
a687059c 1260 else
ffc61ed2 1261 tmp = 1;
078c425b 1262 s += uskip;
a0d0e21e 1263 }
a0d0e21e 1264 }
ffc61ed2
JH
1265 else {
1266 while (s < strend) {
1267 if (isALNUM_LC(*s)) {
3b0527fe 1268 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a0ed51b3 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s++;
a0ed51b3 1276 }
a0ed51b3
LW
1277 }
1278 break;
a0d0e21e 1279 case NALNUM:
ffc61ed2 1280 if (do_utf8) {
1a4fad37 1281 LOAD_UTF8_CHARCLASS_ALNUM();
078c425b 1282 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1283 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
3b0527fe 1284 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1285 goto got_it;
1286 else
1287 tmp = doevery;
1288 }
bbce6d69 1289 else
ffc61ed2 1290 tmp = 1;
078c425b 1291 s += uskip;
bbce6d69 1292 }
bbce6d69 1293 }
ffc61ed2
JH
1294 else {
1295 while (s < strend) {
1296 if (!isALNUM(*s)) {
3b0527fe 1297 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
a0ed51b3 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s++;
a0ed51b3 1305 }
a0ed51b3
LW
1306 }
1307 break;
bbce6d69 1308 case NALNUML:
3280af22 1309 PL_reg_flags |= RF_tainted;
ffc61ed2 1310 if (do_utf8) {
078c425b 1311 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1312 if (!isALNUM_LC_utf8((U8*)s)) {
3b0527fe 1313 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1314 goto got_it;
1315 else
1316 tmp = doevery;
1317 }
a687059c 1318 else
ffc61ed2 1319 tmp = 1;
078c425b 1320 s += uskip;
a687059c 1321 }
a0d0e21e 1322 }
ffc61ed2
JH
1323 else {
1324 while (s < strend) {
1325 if (!isALNUM_LC(*s)) {
3b0527fe 1326 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a0ed51b3 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s++;
a0ed51b3 1334 }
a0ed51b3
LW
1335 }
1336 break;
a0d0e21e 1337 case SPACE:
ffc61ed2 1338 if (do_utf8) {
1a4fad37 1339 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1340 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1341 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
3b0527fe 1342 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1343 goto got_it;
1344 else
1345 tmp = doevery;
1346 }
a0d0e21e 1347 else
ffc61ed2 1348 tmp = 1;
078c425b 1349 s += uskip;
2304df62 1350 }
a0d0e21e 1351 }
ffc61ed2
JH
1352 else {
1353 while (s < strend) {
1354 if (isSPACE(*s)) {
3b0527fe 1355 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0ed51b3 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s++;
a0ed51b3 1363 }
a0ed51b3
LW
1364 }
1365 break;
bbce6d69 1366 case SPACEL:
3280af22 1367 PL_reg_flags |= RF_tainted;
ffc61ed2 1368 if (do_utf8) {
078c425b 1369 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1370 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
3b0527fe 1371 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1372 goto got_it;
1373 else
1374 tmp = doevery;
1375 }
bbce6d69 1376 else
ffc61ed2 1377 tmp = 1;
078c425b 1378 s += uskip;
bbce6d69 1379 }
bbce6d69 1380 }
ffc61ed2
JH
1381 else {
1382 while (s < strend) {
1383 if (isSPACE_LC(*s)) {
3b0527fe 1384 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
a0ed51b3 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s++;
a0ed51b3 1392 }
a0ed51b3
LW
1393 }
1394 break;
a0d0e21e 1395 case NSPACE:
ffc61ed2 1396 if (do_utf8) {
1a4fad37 1397 LOAD_UTF8_CHARCLASS_SPACE();
078c425b 1398 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1399 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
3b0527fe 1400 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1401 goto got_it;
1402 else
1403 tmp = doevery;
1404 }
a0d0e21e 1405 else
ffc61ed2 1406 tmp = 1;
078c425b 1407 s += uskip;
a687059c 1408 }
a0d0e21e 1409 }
ffc61ed2
JH
1410 else {
1411 while (s < strend) {
1412 if (!isSPACE(*s)) {
3b0527fe 1413 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0ed51b3 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s++;
a0ed51b3 1421 }
a0ed51b3
LW
1422 }
1423 break;
bbce6d69 1424 case NSPACEL:
3280af22 1425 PL_reg_flags |= RF_tainted;
ffc61ed2 1426 if (do_utf8) {
078c425b 1427 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1428 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
3b0527fe 1429 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1430 goto got_it;
1431 else
1432 tmp = doevery;
1433 }
bbce6d69 1434 else
ffc61ed2 1435 tmp = 1;
078c425b 1436 s += uskip;
bbce6d69 1437 }
bbce6d69 1438 }
ffc61ed2
JH
1439 else {
1440 while (s < strend) {
1441 if (!isSPACE_LC(*s)) {
3b0527fe 1442 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
a0ed51b3 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s++;
a0ed51b3 1450 }
a0ed51b3
LW
1451 }
1452 break;
a0d0e21e 1453 case DIGIT:
ffc61ed2 1454 if (do_utf8) {
1a4fad37 1455 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1456 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1457 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1458 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1459 goto got_it;
1460 else
1461 tmp = doevery;
1462 }
a0d0e21e 1463 else
ffc61ed2 1464 tmp = 1;
078c425b 1465 s += uskip;
2b69d0c2 1466 }
a0d0e21e 1467 }
ffc61ed2
JH
1468 else {
1469 while (s < strend) {
1470 if (isDIGIT(*s)) {
3b0527fe 1471 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0ed51b3 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s++;
a0ed51b3 1479 }
a0ed51b3
LW
1480 }
1481 break;
b8c5462f
JH
1482 case DIGITL:
1483 PL_reg_flags |= RF_tainted;
ffc61ed2 1484 if (do_utf8) {
078c425b 1485 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1486 if (isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1487 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1488 goto got_it;
1489 else
1490 tmp = doevery;
1491 }
b8c5462f 1492 else
ffc61ed2 1493 tmp = 1;
078c425b 1494 s += uskip;
b8c5462f 1495 }
b8c5462f 1496 }
ffc61ed2
JH
1497 else {
1498 while (s < strend) {
1499 if (isDIGIT_LC(*s)) {
3b0527fe 1500 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s++;
b8c5462f 1508 }
b8c5462f
JH
1509 }
1510 break;
a0d0e21e 1511 case NDIGIT:
ffc61ed2 1512 if (do_utf8) {
1a4fad37 1513 LOAD_UTF8_CHARCLASS_DIGIT();
078c425b 1514 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1515 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
3b0527fe 1516 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1517 goto got_it;
1518 else
1519 tmp = doevery;
1520 }
a0d0e21e 1521 else
ffc61ed2 1522 tmp = 1;
078c425b 1523 s += uskip;
a687059c 1524 }
a0d0e21e 1525 }
ffc61ed2
JH
1526 else {
1527 while (s < strend) {
1528 if (!isDIGIT(*s)) {
3b0527fe 1529 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0ed51b3 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s++;
a0ed51b3 1537 }
a0ed51b3
LW
1538 }
1539 break;
b8c5462f
JH
1540 case NDIGITL:
1541 PL_reg_flags |= RF_tainted;
ffc61ed2 1542 if (do_utf8) {
078c425b 1543 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1544 if (!isDIGIT_LC_utf8((U8*)s)) {
3b0527fe 1545 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1546 goto got_it;
1547 else
1548 tmp = doevery;
1549 }
b8c5462f 1550 else
ffc61ed2 1551 tmp = 1;
078c425b 1552 s += uskip;
b8c5462f 1553 }
a0ed51b3 1554 }
ffc61ed2
JH
1555 else {
1556 while (s < strend) {
1557 if (!isDIGIT_LC(*s)) {
3b0527fe 1558 if (tmp && (!reginfo || regtry(reginfo, s)))
ffc61ed2
JH
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
cf93c79d 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s++;
b8c5462f 1566 }
b8c5462f
JH
1567 }
1568 break;
07be1b83
YO
1569 case TRIE:
1570 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1571 {
1572 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1573 trie_type = do_utf8 ?
1574 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1575 : trie_plain;
1576 /* what trie are we using right now */
1577 reg_ac_data *aho
1578 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1579 reg_trie_data *trie=aho->trie;
1580
1581 const char *last_start = strend - trie->minlen;
1582 const char *real_start = s;
1583 STRLEN maxlen = trie->maxlen;
1584 U8 **points;
1585
1586 GET_RE_DEBUG_FLAGS_DECL;
1587
1588 Newxz(points,maxlen,U8 *);
1589
1590 if (trie->bitmap && trie_type != trie_utf8_fold) {
1591 while (!TRIE_BITMAP_TEST(trie,*s) && s <= last_start ) {
1592 s++;
1593 }
1594 }
1595
1596 while (s <= last_start) {
1597 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1598 U8 *uc = (U8*)s;
1599 U16 charid = 0;
1600 U32 base = 1;
1601 U32 state = 1;
1602 UV uvc = 0;
1603 STRLEN len = 0;
1604 STRLEN foldlen = 0;
1605 U8 *uscan = (U8*)NULL;
1606 U8 *leftmost = NULL;
1607
1608 U32 pointpos = 0;
1609
1610 while ( state && uc <= (U8*)strend ) {
1611 int failed=0;
1612 if (aho->states[ state ].wordnum) {
1613 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1614 if (!leftmost || lpos < leftmost)
1615 leftmost= lpos;
1616 if (base==0) break;
1617 }
1618 points[pointpos++ % maxlen]= uc;
1619 switch (trie_type) {
1620 case trie_utf8_fold:
1621 if ( foldlen>0 ) {
1622 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
1623 foldlen -= len;
1624 uscan += len;
1625 len=0;
1626 } else {
1627 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1628 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
1629 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
1630 foldlen -= UNISKIP( uvc );
1631 uscan = foldbuf + UNISKIP( uvc );
1632 }
1633 break;
1634 case trie_utf8:
1635 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
1636 &len, uniflags );
1637 break;
1638 case trie_plain:
1639 uvc = (UV)*uc;
1640 len = 1;
1641 }
1642
1643 if (uvc < 256) {
1644 charid = trie->charmap[ uvc ];
1645 }
1646 else {
1647 charid = 0;
1648 if (trie->widecharmap) {
1649 SV** const svpp = hv_fetch(trie->widecharmap,
1650 (char*)&uvc, sizeof(UV), 0);
1651 if (svpp)
1652 charid = (U16)SvIV(*svpp);
1653 }
1654 }
1655 DEBUG_TRIE_EXECUTE_r(
1656 PerlIO_printf(Perl_debug_log,
1657 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1658 (int)((const char*)uc - real_start), charid, uvc)
1659 );
1660 uc += len;
1661
1662 do {
1663 U32 word = aho->states[ state ].wordnum;
1664 base = aho->states[ state ].trans.base;
1665
1666 DEBUG_TRIE_EXECUTE_r(
1667 PerlIO_printf( Perl_debug_log,
1668 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1669 failed ? "Fail transition to " : "",
1670 state, base, uvc, word)
1671 );
1672 if ( base ) {
1673 U32 tmp;
1674 if (charid &&
1675 (base + charid > trie->uniquecharcount )
1676 && (base + charid - 1 - trie->uniquecharcount
1677 < trie->lasttrans)
1678 && trie->trans[base + charid - 1 -
1679 trie->uniquecharcount].check == state
1680 && (tmp=trie->trans[base + charid - 1 -
1681 trie->uniquecharcount ].next))
1682 {
1683 state = tmp;
1684 break;
1685 }
1686 else {
1687 failed++;
1688 if ( state == 1 )
1689 break;
1690 else
1691 state = aho->fail[state];
1692 }
1693 }
1694 else {
1695 /* we must be accepting here */
1696 failed++;
1697 break;
1698 }
1699 } while(state);
1700 if (failed) {
1701 if (leftmost)
1702 break;
1703 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
1704 while (!TRIE_BITMAP_TEST(trie,*uc) && uc <= (U8*)last_start ) {
1705 uc++;
1706 }
1707 }
1708 }
1709 }
1710 if ( aho->states[ state ].wordnum ) {
1711 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1712 if (!leftmost || lpos < leftmost)
1713 leftmost = lpos;
1714 }
1715 DEBUG_TRIE_EXECUTE_r(
1716 PerlIO_printf( Perl_debug_log,
1717 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1718 "All done: ",
1719 state, base, uvc)
1720 );
1721 if (leftmost) {
1722 s = (char*)leftmost;
1723 if (!reginfo || regtry(reginfo, s))
1724 goto got_it;
1725 s = HOPc(s,1);
1726 } else {
1727 break;
1728 }
1729 }
1730 }
1731 break;
b3c9acc1 1732 default:
3c3eec57
GS
1733 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1734 break;
d6a28714 1735 }
6eb5f6b9
JH
1736 return 0;
1737 got_it:
1738 return s;
1739}
1740
1741/*
1742 - regexec_flags - match a regexp against a string
1743 */
1744I32
1745Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1746 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1747/* strend: pointer to null at end of string */
1748/* strbeg: real beginning of string */
1749/* minend: end of match must be >=minend after stringarg. */
1750/* data: May be used for some additional optimizations. */
1751/* nosave: For optimizations. */
1752{
97aff369 1753 dVAR;
6eb5f6b9
JH
1754 register char *s;
1755 register regnode *c;
1756 register char *startpos = stringarg;
6eb5f6b9
JH
1757 I32 minlen; /* must match at least this many chars */
1758 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1759 I32 end_shift = 0; /* Same for the end. */ /* CC */
1760 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1761 char *scream_olds = NULL;
3dab1dad 1762 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1763 const bool do_utf8 = DO_UTF8(sv);
2757e526 1764 I32 multiline;
2a782b5b 1765#ifdef DEBUGGING
2757e526
JH
1766 SV* dsv0;
1767 SV* dsv1;
2a782b5b 1768#endif
3b0527fe 1769 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1770
1771 GET_RE_DEBUG_FLAGS_DECL;
1772
9d4ba2ae 1773 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1774
1775 /* Be paranoid... */
1776 if (prog == NULL || startpos == NULL) {
1777 Perl_croak(aTHX_ "NULL regexp parameter");
1778 return 0;
1779 }
1780
2757e526 1781 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1782 reginfo.prog = prog;
2757e526
JH
1783
1784#ifdef DEBUGGING
1785 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1786 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1787#endif
1788
bac06658
JH
1789 RX_MATCH_UTF8_set(prog, do_utf8);
1790
6eb5f6b9 1791 minlen = prog->minlen;
61a36c01 1792 if (strend - startpos < minlen) {
a3621e74 1793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1794 "String too short [regexec_flags]...\n"));
1795 goto phooey;
1aa99e6b 1796 }
6eb5f6b9 1797
6eb5f6b9
JH
1798 /* Check validity of program. */
1799 if (UCHARAT(prog->program) != REG_MAGIC) {
1800 Perl_croak(aTHX_ "corrupted regexp program");
1801 }
1802
1803 PL_reg_flags = 0;
1804 PL_reg_eval_set = 0;
1805 PL_reg_maxiter = 0;
1806
1807 if (prog->reganch & ROPT_UTF8)
1808 PL_reg_flags |= RF_utf8;
1809
1810 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1811 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1812 PL_bostr = strbeg;
3b0527fe 1813 reginfo.sv = sv;
6eb5f6b9
JH
1814
1815 /* Mark end of line for $ (and such) */
1816 PL_regeol = strend;
1817
1818 /* see how far we have to get to not match where we matched before */
3b0527fe 1819 reginfo.till = startpos+minend;
6eb5f6b9 1820
6eb5f6b9
JH
1821 /* If there is a "must appear" string, look for it. */
1822 s = startpos;
1823
3b0527fe 1824 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1825 MAGIC *mg;
1826
1827 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1828 reginfo.ganch = startpos;
6eb5f6b9
JH
1829 else if (sv && SvTYPE(sv) >= SVt_PVMG
1830 && SvMAGIC(sv)
14befaf4
DM
1831 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1832 && mg->mg_len >= 0) {
3b0527fe 1833 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1834 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1835 if (s > reginfo.ganch)
6eb5f6b9 1836 goto phooey;
3b0527fe 1837 s = reginfo.ganch;
6eb5f6b9
JH
1838 }
1839 }
1840 else /* pos() not defined */
3b0527fe 1841 reginfo.ganch = strbeg;
6eb5f6b9
JH
1842 }
1843
a0714e2c 1844 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1845 re_scream_pos_data d;
1846
1847 d.scream_olds = &scream_olds;
1848 d.scream_pos = &scream_pos;
1849 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1850 if (!s) {
a3621e74 1851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1852 goto phooey; /* not present */
3fa9c3d7 1853 }
6eb5f6b9
JH
1854 }
1855
a3621e74 1856 DEBUG_EXECUTE_r({
1df70142
AL
1857 const char * const s0 = UTF
1858 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1859 UNI_DISPLAY_REGEX)
1860 : prog->precomp;
bb7a0f54 1861 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1df70142 1862 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1863 UNI_DISPLAY_REGEX) : startpos;
bb7a0f54 1864 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1865 if (!PL_colorset)
1866 reginitcolors();
1867 PerlIO_printf(Perl_debug_log,
a0288114 1868 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
e4584336 1869 PL_colors[4], PL_colors[5], PL_colors[0],
9e55ce06 1870 len0, len0, s0,
2a782b5b 1871 PL_colors[1],
9e55ce06 1872 len0 > 60 ? "..." : "",
2a782b5b 1873 PL_colors[0],
9e55ce06
JH
1874 (int)(len1 > 60 ? 60 : len1),
1875 s1, PL_colors[1],
1876 (len1 > 60 ? "..." : "")
2a782b5b
JH
1877 );
1878 });
6eb5f6b9
JH
1879
1880 /* Simplest case: anchored match need be tried only once. */
1881 /* [unless only anchor is BOL and multiline is set] */
1882 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1883 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1884 goto got_it;
7fba1cd6 1885 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1886 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1887 {
1888 char *end;
1889
1890 if (minlen)
1891 dontbother = minlen - 1;
1aa99e6b 1892 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1893 /* for multiline we only have to try after newlines */
33b8afdf 1894 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1895 if (s == startpos)
1896 goto after_try;
1897 while (1) {
3b0527fe 1898 if (regtry(&reginfo, s))
6eb5f6b9
JH
1899 goto got_it;
1900 after_try:
1901 if (s >= end)
1902 goto phooey;
1903 if (prog->reganch & RE_USE_INTUIT) {
1904 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1905 if (!s)
1906 goto phooey;
1907 }
1908 else
1909 s++;
1910 }
1911 } else {
1912 if (s > startpos)
1913 s--;
1914 while (s < end) {
1915 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1916 if (regtry(&reginfo, s))
6eb5f6b9
JH
1917 goto got_it;
1918 }
1919 }
1920 }
1921 }
1922 goto phooey;
1923 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1924 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1925 goto got_it;
1926 goto phooey;
1927 }
1928
1929 /* Messy cases: unanchored match. */
33b8afdf 1930 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1931 /* we have /x+whatever/ */
1932 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1933 char ch;
bf93d4cc
GS
1934#ifdef DEBUGGING
1935 int did_match = 0;
1936#endif
33b8afdf
JH
1937 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1938 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1939 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1940
1aa99e6b 1941 if (do_utf8) {
6eb5f6b9
JH
1942 while (s < strend) {
1943 if (*s == ch) {
a3621e74 1944 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1945 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1946 s += UTF8SKIP(s);
1947 while (s < strend && *s == ch)
1948 s += UTF8SKIP(s);
1949 }
1950 s += UTF8SKIP(s);
1951 }
1952 }
1953 else {
1954 while (s < strend) {
1955 if (*s == ch) {
a3621e74 1956 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1957 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1958 s++;
1959 while (s < strend && *s == ch)
1960 s++;
1961 }
1962 s++;
1963 }
1964 }
a3621e74 1965 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1966 PerlIO_printf(Perl_debug_log,
b7953727
JH
1967 "Did not find anchored character...\n")
1968 );
6eb5f6b9 1969 }
a0714e2c
SS
1970 else if (prog->anchored_substr != NULL
1971 || prog->anchored_utf8 != NULL
1972 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1973 && prog->float_max_offset < strend - s)) {
1974 SV *must;
1975 I32 back_max;
1976 I32 back_min;
1977 char *last;
6eb5f6b9 1978 char *last1; /* Last position checked before */
bf93d4cc
GS
1979#ifdef DEBUGGING
1980 int did_match = 0;
1981#endif
33b8afdf
JH
1982 if (prog->anchored_substr || prog->anchored_utf8) {
1983 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1984 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1985 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1986 back_max = back_min = prog->anchored_offset;
1987 } else {
1988 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1989 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1990 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1991 back_max = prog->float_max_offset;
1992 back_min = prog->float_min_offset;
1993 }
1994 if (must == &PL_sv_undef)
1995 /* could not downgrade utf8 check substring, so must fail */
1996 goto phooey;
1997
1998 last = HOP3c(strend, /* Cannot start after this */
1999 -(I32)(CHR_SVLEN(must)
2000 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
2001
2002 if (s > PL_bostr)
2003 last1 = HOPc(s, -1);
2004 else
2005 last1 = s - 1; /* bogus */
2006
a0288114 2007 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2008 check_substr==must. */
2009 scream_pos = -1;
2010 dontbother = end_shift;
2011 strend = HOPc(strend, -dontbother);
2012 while ( (s <= last) &&
9041c2e3 2013 ((flags & REXEC_SCREAM)
1aa99e6b 2014 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 2015 end_shift, &scream_pos, 0))
1aa99e6b 2016 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 2017 (unsigned char*)strend, must,
7fba1cd6 2018 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
2019 /* we may be pointing at the wrong string */
2020 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 2021 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2022 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2023 if (HOPc(s, -back_max) > last1) {
2024 last1 = HOPc(s, -back_min);
2025 s = HOPc(s, -back_max);
2026 }
2027 else {
52657f30 2028 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2029
2030 last1 = HOPc(s, -back_min);
52657f30 2031 s = t;
6eb5f6b9 2032 }
1aa99e6b 2033 if (do_utf8) {
6eb5f6b9 2034 while (s <= last1) {
3b0527fe 2035 if (regtry(&reginfo, s))
6eb5f6b9
JH
2036 goto got_it;
2037 s += UTF8SKIP(s);
2038 }
2039 }
2040 else {
2041 while (s <= last1) {
3b0527fe 2042 if (regtry(&reginfo, s))
6eb5f6b9
JH
2043 goto got_it;
2044 s++;
2045 }
2046 }
2047 }
a3621e74 2048 DEBUG_EXECUTE_r(if (!did_match)
b7953727 2049 PerlIO_printf(Perl_debug_log,
a0288114 2050 "Did not find %s substr \"%s%.*s%s\"%s...\n",
33b8afdf 2051 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
2052 ? "anchored" : "floating"),
2053 PL_colors[0],
2054 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
3f7c398e 2055 SvPVX_const(must),
b7953727
JH
2056 PL_colors[1], (SvTAIL(must) ? "$" : ""))
2057 );
6eb5f6b9
JH
2058 goto phooey;
2059 }
155aba94 2060 else if ((c = prog->regstclass)) {
f14c76ed 2061 if (minlen) {
07be1b83 2062 U8 op = OP(prog->regstclass);
66e933ab 2063 /* don't bother with what can't match */
07be1b83 2064 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
2065 strend = HOPc(strend, -(minlen - 1));
2066 }
a3621e74 2067 DEBUG_EXECUTE_r({
ffc61ed2 2068 SV *prop = sv_newmortal();
cfd0369c
NC
2069 const char *s0;
2070 const char *s1;
9e55ce06
JH
2071 int len0;
2072 int len1;
2073
32fc9b6a 2074 regprop(prog, prop, c);
9e55ce06 2075 s0 = UTF ?
3f7c398e 2076 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
c728cb41 2077 UNI_DISPLAY_REGEX) :
cfd0369c 2078 SvPVX_const(prop);
9e55ce06
JH
2079 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
2080 s1 = UTF ?
c728cb41 2081 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
bb7a0f54 2082 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
9e55ce06 2083 PerlIO_printf(Perl_debug_log,
07be1b83 2084 "Matching stclass \"%*.*s\" against \"%*.*s\" (%d chars)\n",
9e55ce06 2085 len0, len0, s0,
07be1b83 2086 len1, len1, s1, (int)(strend - s));
ffc61ed2 2087 });
3b0527fe 2088 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2089 goto got_it;
07be1b83 2090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2091 }
2092 else {
2093 dontbother = 0;
a0714e2c 2094 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2095 /* Trim the end. */
d6a28714 2096 char *last;
33b8afdf
JH
2097 SV* float_real;
2098
2099 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2100 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2101 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2102
2103 if (flags & REXEC_SCREAM) {
33b8afdf 2104 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2105 end_shift, &scream_pos, 1); /* last one */
2106 if (!last)
ffc61ed2 2107 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
2108 /* we may be pointing at the wrong string */
2109 else if (RX_MATCH_COPIED(prog))
3f7c398e 2110 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2111 }
d6a28714
JH
2112 else {
2113 STRLEN len;
cfd0369c 2114 const char * const little = SvPV_const(float_real, len);
d6a28714 2115
33b8afdf 2116 if (SvTAIL(float_real)) {
d6a28714
JH
2117 if (memEQ(strend - len + 1, little, len - 1))
2118 last = strend - len + 1;
7fba1cd6 2119 else if (!multiline)
9041c2e3 2120 last = memEQ(strend - len, little, len)
bd61b366 2121 ? strend - len : NULL;
b8c5462f 2122 else
d6a28714
JH
2123 goto find_last;
2124 } else {
2125 find_last:
9041c2e3 2126 if (len)
d6a28714 2127 last = rninstr(s, strend, little, little + len);
b8c5462f 2128 else
a0288114 2129 last = strend; /* matching "$" */
b8c5462f 2130 }
b8c5462f 2131 }
bf93d4cc 2132 if (last == NULL) {
a3621e74 2133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 2134 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 2135 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2136 goto phooey; /* Should not happen! */
2137 }
d6a28714
JH
2138 dontbother = strend - last + prog->float_min_offset;
2139 }
2140 if (minlen && (dontbother < minlen))
2141 dontbother = minlen - 1;
2142 strend -= dontbother; /* this one's always in bytes! */
2143 /* We don't know much -- general case. */
1aa99e6b 2144 if (do_utf8) {
d6a28714 2145 for (;;) {
3b0527fe 2146 if (regtry(&reginfo, s))
d6a28714
JH
2147 goto got_it;
2148 if (s >= strend)
2149 break;
b8c5462f 2150 s += UTF8SKIP(s);
d6a28714
JH
2151 };
2152 }
2153 else {
2154 do {
3b0527fe 2155 if (regtry(&reginfo, s))
d6a28714
JH
2156 goto got_it;
2157 } while (s++ < strend);
2158 }
2159 }
2160
2161 /* Failure. */
2162 goto phooey;
2163
2164got_it:
2165 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2166
2167 if (PL_reg_eval_set) {
2168 /* Preserve the current value of $^R */
2169 if (oreplsv != GvSV(PL_replgv))
2170 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2171 restored, the value remains
2172 the same. */
4f639d21 2173 restore_pos(aTHX_ prog);
d6a28714
JH
2174 }
2175
2176 /* make sure $`, $&, $', and $digit will work later */
2177 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2178 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2179 if (flags & REXEC_COPY_STR) {
2180 I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2181#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2182 if ((SvIsCOW(sv)
2183 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2184 if (DEBUG_C_TEST) {
2185 PerlIO_printf(Perl_debug_log,
2186 "Copy on write: regexp capture, type %d\n",
2187 (int) SvTYPE(sv));
2188 }
2189 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2190 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2191 assert (SvPOKp(prog->saved_copy));
2192 } else
2193#endif
2194 {
2195 RX_MATCH_COPIED_on(prog);
2196 s = savepvn(strbeg, i);
2197 prog->subbeg = s;
2198 }
d6a28714 2199 prog->sublen = i;
d6a28714
JH
2200 }
2201 else {
2202 prog->subbeg = strbeg;
2203 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2204 }
2205 }
9041c2e3 2206
d6a28714
JH
2207 return 1;
2208
2209phooey:
a3621e74 2210 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2211 PL_colors[4], PL_colors[5]));
d6a28714 2212 if (PL_reg_eval_set)
4f639d21 2213 restore_pos(aTHX_ prog);
d6a28714
JH
2214 return 0;
2215}
2216
2217/*
2218 - regtry - try match at specific point
2219 */
2220STATIC I32 /* 0 failure, 1 success */
3b0527fe 2221S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 2222{
97aff369 2223 dVAR;
d6a28714
JH
2224 register I32 *sp;
2225 register I32 *ep;
2226 CHECKPOINT lastcp;
3b0527fe 2227 regexp *prog = reginfo->prog;
a3621e74 2228 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2229
02db2b7b
IZ
2230#ifdef DEBUGGING
2231 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2232#endif
d6a28714
JH
2233 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2234 MAGIC *mg;
2235
2236 PL_reg_eval_set = RS_init;
a3621e74 2237 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2238 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2239 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2240 ));
e8347627 2241 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2242 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2243 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2244 SAVETMPS;
2245 /* Apparently this is not needed, judging by wantarray. */
e8347627 2246 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2247 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2248
3b0527fe 2249 if (reginfo->sv) {
d6a28714 2250 /* Make $_ available to executed code. */
3b0527fe 2251 if (reginfo->sv != DEFSV) {
59f00321 2252 SAVE_DEFSV;
3b0527fe 2253 DEFSV = reginfo->sv;
b8c5462f 2254 }
d6a28714 2255
3b0527fe
DM
2256 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2257 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2258 /* prepare for quick setting of pos */
d300d9fa
NC
2259#ifdef PERL_OLD_COPY_ON_WRITE
2260 if (SvIsCOW(sv))
2261 sv_force_normal_flags(sv, 0);
2262#endif
3dab1dad 2263 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2264 &PL_vtbl_mglob, NULL, 0);
d6a28714 2265 mg->mg_len = -1;
b8c5462f 2266 }
d6a28714
JH
2267 PL_reg_magic = mg;
2268 PL_reg_oldpos = mg->mg_len;
4f639d21 2269 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2270 }
09687e5a 2271 if (!PL_reg_curpm) {
a02a5408 2272 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2273#ifdef USE_ITHREADS
2274 {
2275 SV* repointer = newSViv(0);
577e12cc 2276 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2277 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2278 av_push(PL_regex_padav,repointer);
2279 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2280 PL_regex_pad = AvARRAY(PL_regex_padav);
2281 }
2282#endif
2283 }
aaa362c4 2284 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2285 PL_reg_oldcurpm = PL_curpm;
2286 PL_curpm = PL_reg_curpm;
2287 if (RX_MATCH_COPIED(prog)) {
2288 /* Here is a serious problem: we cannot rewrite subbeg,
2289 since it may be needed if this match fails. Thus
2290 $` inside (?{}) could fail... */
2291 PL_reg_oldsaved = prog->subbeg;
2292 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2293#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2294 PL_nrs = prog->saved_copy;
2295#endif
d6a28714
JH
2296 RX_MATCH_COPIED_off(prog);
2297 }
2298 else
bd61b366 2299 PL_reg_oldsaved = NULL;
d6a28714
JH
2300 prog->subbeg = PL_bostr;
2301 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2302 }
973dddac 2303 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2304 PL_reginput = startpos;
2305 PL_regstartp = prog->startp;
2306 PL_regendp = prog->endp;
2307 PL_reglastparen = &prog->lastparen;
a01268b5 2308 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2309 prog->lastparen = 0;
03994de8 2310 prog->lastcloseparen = 0;
d6a28714 2311 PL_regsize = 0;
a3621e74 2312 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2313 if (PL_reg_start_tmpl <= prog->nparens) {
2314 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2315 if(PL_reg_start_tmp)
2316 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2317 else
a02a5408 2318 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2319 }
2320
2321 /* XXXX What this code is doing here?!!! There should be no need
2322 to do this again and again, PL_reglastparen should take care of
3dd2943c 2323 this! --ilya*/
dafc8851
JH
2324
2325 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2326 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2327 * PL_reglastparen), is not needed at all by the test suite
2328 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2329 * enough, for building DynaLoader, or otherwise this
2330 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2331 * will happen. Meanwhile, this code *is* needed for the
2332 * above-mentioned test suite tests to succeed. The common theme
2333 * on those tests seems to be returning null fields from matches.
2334 * --jhi */
dafc8851 2335#if 1
d6a28714
JH
2336 sp = prog->startp;
2337 ep = prog->endp;
2338 if (prog->nparens) {
097eb12c 2339 register I32 i;
eb160463 2340 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2341 *++sp = -1;
2342 *++ep = -1;
2343 }
2344 }
dafc8851 2345#endif
02db2b7b 2346 REGCP_SET(lastcp);
3b0527fe 2347 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2348 prog->endp[0] = PL_reginput - PL_bostr;
2349 return 1;
2350 }
02db2b7b 2351 REGCP_UNWIND(lastcp);
d6a28714
JH
2352 return 0;
2353}
2354
02db2b7b
IZ
2355#define RE_UNWIND_BRANCH 1
2356#define RE_UNWIND_BRANCHJ 2
2357
2358union re_unwind_t;
2359
2360typedef struct { /* XX: makes sense to enlarge it... */
2361 I32 type;
2362 I32 prev;
2363 CHECKPOINT lastcp;
2364} re_unwind_generic_t;
2365
2366typedef struct {
2367 I32 type;
2368 I32 prev;
2369 CHECKPOINT lastcp;
2370 I32 lastparen;
2371 regnode *next;
2372 char *locinput;
2373 I32 nextchr;
3a2830be 2374 int minmod;
02db2b7b
IZ
2375#ifdef DEBUGGING
2376 int regindent;
2377#endif
2378} re_unwind_branch_t;
2379
2380typedef union re_unwind_t {
2381 I32 type;
2382 re_unwind_generic_t generic;
2383 re_unwind_branch_t branch;
2384} re_unwind_t;
2385
8ba1375e
MJD
2386#define sayYES goto yes
2387#define sayNO goto no
e0f9d4a8 2388#define sayNO_ANYOF goto no_anyof
8ba1375e 2389#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2390#define sayNO_FINAL goto no_final
2391#define sayNO_SILENT goto do_no
2392#define saySAME(x) if (x) goto yes; else goto no
2393
3ab3c9b4
HS
2394#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2395#define POSCACHE_SEEN 1 /* we know what we're caching */
2396#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
7409bbd3 2397
3ab3c9b4 2398#define CACHEsayYES STMT_START { \
d8319b27 2399 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3
DM
2400 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
2401 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) | (1<<POSCACHE_SEEN); \
2402 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2403 } \
2404 else if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS)) { \
2405 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2406 } \
2407 else { \
3ab3c9b4
HS
2408 /* cache records failure, but this is success */ \
2409 DEBUG_r( \
2410 PerlIO_printf(Perl_debug_log, \
2411 "%*s (remove success from failure cache)\n", \
2412 REPORT_CODE_OFF+PL_regindent*2, "") \
2413 ); \
d8319b27 2414 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2415 } \
2416 } \
2417 sayYES; \
2418} STMT_END
7409bbd3 2419
3ab3c9b4 2420#define CACHEsayNO STMT_START { \
d8319b27 2421 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) { \
7409bbd3 2422 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) { \
3ab3c9b4 2423 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
7409bbd3
DM
2424 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2425 } \
2426 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2427 PL_reg_poscache[st->u.whilem.cache_offset] |= (1<<st->u.whilem.cache_bit); \
2428 } \
2429 else { \
3ab3c9b4
HS
2430 /* cache records success, but this is failure */ \
2431 DEBUG_r( \
2432 PerlIO_printf(Perl_debug_log, \
2433 "%*s (remove failure from success cache)\n", \
2434 REPORT_CODE_OFF+PL_regindent*2, "") \
2435 ); \
d8319b27 2436 PL_reg_poscache[st->u.whilem.cache_offset] &= ~(1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2437 } \
2438 } \
2439 sayNO; \
2440} STMT_END
2441
a3621e74
YO
2442/* this is used to determine how far from the left messages like
2443 'failed...' are printed. Currently 29 makes these messages line
2444 up with the opcode they refer to. Earlier perls used 25 which
2445 left these messages outdented making reviewing a debug output
2446 quite difficult.
2447*/
2448#define REPORT_CODE_OFF 29
2449
2450
2451/* Make sure there is a test for this +1 options in re_tests */
2452#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2453
9e137952
DM
2454/* this value indiciates that the c1/c2 "next char" test should be skipped */
2455#define CHRTEST_VOID -1000
2456
86545054
DM
2457#define SLAB_FIRST(s) (&(s)->states[0])
2458#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2459
5d9a96ca
DM
2460/* grab a new slab and return the first slot in it */
2461
2462STATIC regmatch_state *
2463S_push_slab(pTHX)
2464{
54df2634
NC
2465#if PERL_VERSION < 9
2466 dMY_CXT;
2467#endif
5d9a96ca
DM
2468 regmatch_slab *s = PL_regmatch_slab->next;
2469 if (!s) {
2470 Newx(s, 1, regmatch_slab);
2471 s->prev = PL_regmatch_slab;
2472 s->next = NULL;
2473 PL_regmatch_slab->next = s;
2474 }
2475 PL_regmatch_slab = s;
86545054 2476 return SLAB_FIRST(s);
5d9a96ca 2477}
5b47454d 2478
95b24440
DM
2479/* simulate a recursive call to regmatch */
2480
2481#define REGMATCH(ns, where) \
5d9a96ca
DM
2482 st->scan = scan; \
2483 scan = (ns); \
2484 st->resume_state = resume_##where; \
95b24440
DM
2485 goto start_recurse; \
2486 resume_point_##where:
2487
aa283a38
DM
2488
2489/* push a new regex state. Set newst to point to it */
2490
2491#define PUSH_STATE(newst, resume) \
2492 depth++; \
2493 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "PUSH STATE(%d)\n", depth)); \
2494 st->scan = scan; \
2495 st->next = next; \
2496 st->n = n; \
2497 st->locinput = locinput; \
2498 st->resume_state = resume; \
2499 newst = st+1; \
86545054 2500 if (newst > SLAB_LAST(PL_regmatch_slab)) \
aa283a38
DM
2501 newst = S_push_slab(aTHX); \
2502 PL_regmatch_state = newst; \
2503 newst->cc = 0; \
2504 newst->minmod = 0; \
2505 newst->sw = 0; \
2506 newst->logical = 0; \
2507 newst->unwind = 0; \
2508 locinput = PL_reginput; \
2509 nextchr = UCHARAT(locinput);
2510
2511#define POP_STATE \
2512 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "POP STATE(%d)\n", depth)); \
2513 depth--; \
2514 st--; \
86545054 2515 if (st < SLAB_FIRST(PL_regmatch_slab)) { \
aa283a38 2516 PL_regmatch_slab = PL_regmatch_slab->prev; \
86545054 2517 st = SLAB_LAST(PL_regmatch_slab); \
aa283a38
DM
2518 } \
2519 PL_regmatch_state = st; \
2520 scan = st->scan; \
2521 next = st->next; \
2522 n = st->n; \
2523 locinput = st->locinput; \
2524 nextchr = UCHARAT(locinput);
2525
d6a28714
JH
2526/*
2527 - regmatch - main matching routine
2528 *
2529 * Conceptually the strategy is simple: check to see whether the current
2530 * node matches, call self recursively to see whether the rest matches,
2531 * and then act accordingly. In practice we make some effort to avoid
2532 * recursion, in particular by going through "ordinary" nodes (that don't
2533 * need to know whether the rest of the match failed) by a loop instead of
2534 * by recursion.
2535 */
2536/* [lwall] I've hoisted the register declarations to the outer block in order to
2537 * maybe save a little bit of pushing and popping on the stack. It also takes
2538 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2539 *
2540 * This function used to be heavily recursive, but since this had the
2541 * effect of blowing the CPU stack on complex regexes, it has been
2542 * restructured to be iterative, and to save state onto the heap rather
2543 * than the stack. Essentially whereever regmatch() used to be called, it
2544 * pushes the current state, notes where to return, then jumps back into
2545 * the main loop.
2546 *
2547 * Originally the structure of this function used to look something like
2548
2549 S_regmatch() {
2550 int a = 1, b = 2;
2551 ...
2552 while (scan != NULL) {
5d9a96ca 2553 a++; // do stuff with a and b
95b24440
DM
2554 ...
2555 switch (OP(scan)) {
2556 case FOO: {
2557 int local = 3;
2558 ...
2559 if (regmatch(...)) // recurse
2560 goto yes;
2561 }
2562 ...
2563 }
2564 }
2565 yes:
2566 return 1;
2567 }
2568
2569 * Now it looks something like this:
2570
5d9a96ca 2571 typedef struct {
95b24440
DM
2572 int a, b, local;
2573 int resume_state;
5d9a96ca 2574 } regmatch_state;
95b24440
DM
2575
2576 S_regmatch() {
5d9a96ca
DM
2577 regmatch_state *st = new();
2578 int depth=0;
2579 st->a++; // do stuff with a and b
95b24440
DM
2580 ...
2581 while (scan != NULL) {
2582 ...
2583 switch (OP(scan)) {
2584 case FOO: {
5d9a96ca 2585 st->local = 3;
95b24440 2586 ...
5d9a96ca
DM
2587 st->scan = scan;
2588 scan = ...;
2589 st->resume_state = resume_FOO;
2590 goto start_recurse; // recurse
95b24440 2591
5d9a96ca
DM
2592 resume_point_FOO:
2593 if (result)
95b24440
DM
2594 goto yes;
2595 }
2596 ...
2597 }
5d9a96ca
DM
2598 start_recurse:
2599 st = new(); push a new state
2600 st->a = 1; st->b = 2;
2601 depth++;
95b24440 2602 }
5d9a96ca 2603 yes:
95b24440 2604 result = 1;
5d9a96ca
DM
2605 if (depth--) {
2606 st = pop();
95b24440
DM
2607 switch (resume_state) {
2608 case resume_FOO:
2609 goto resume_point_FOO;
2610 ...
2611 }
2612 }
2613 return result
2614 }
2615
2616 * WARNING: this means that any line in this function that contains a
2617 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2618 * regmatch() using gotos instead. Thus the values of any local variables
2619 * not saved in the regmatch_state structure will have been lost when
2620 * execution resumes on the next line .
5d9a96ca
DM
2621 *
2622 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2623 * PL_regmatch_state always points to the currently active state, and
2624 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2625 * The first time regmatch is called, the first slab is allocated, and is
2626 * never freed until interpreter desctruction. When the slab is full,
2627 * a new one is allocated chained to the end. At exit from regmatch, slabs
2628 * allocated since entry are freed.
d6a28714 2629 */
95b24440 2630
3dab1dad 2631#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2632
07be1b83
YO
2633#ifdef DEBUGGING
2634STATIC void
2635S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2636{
2637 const int docolor = *PL_colors[0];
2638 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2639 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2640 /* The part of the string before starttry has one color
2641 (pref0_len chars), between starttry and current
2642 position another one (pref_len - pref0_len chars),
2643 after the current position the third one.
2644 We assume that pref0_len <= pref_len, otherwise we
2645 decrease pref0_len. */
2646 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2647 ? (5 + taill) - l : locinput - PL_bostr;
2648 int pref0_len;
2649
2650 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2651 pref_len++;
2652 pref0_len = pref_len - (locinput - PL_reg_starttry);
2653 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2654 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2655 ? (5 + taill) - pref_len : PL_regeol - locinput);
2656 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2657 l--;
2658 if (pref0_len < 0)
2659 pref0_len = 0;
2660 if (pref0_len > pref_len)
2661 pref0_len = pref_len;
2662 {
2663 const char * const s0 =
2664 do_utf8 && OP(scan) != CANY ?
2665 pv_uni_display(PERL_DEBUG_PAD(0), (U8*)(locinput - pref_len),
2666 pref0_len, 60, UNI_DISPLAY_REGEX) :
2667 locinput - pref_len;
2668 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2669 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2670 pv_uni_display(PERL_DEBUG_PAD(1),
2671 (U8*)(locinput - pref_len + pref0_len),
2672 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2673 locinput - pref_len + pref0_len;
2674 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2675 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2676 pv_uni_display(PERL_DEBUG_PAD(2), (U8*)locinput,
2677 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2678 locinput;
2679 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2680 PerlIO_printf(Perl_debug_log,
2681 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|",
2682 (IV)(locinput - PL_bostr),
2683 PL_colors[4],
2684 len0, s0,
2685 PL_colors[5],
2686 PL_colors[2],
2687 len1, s1,
2688 PL_colors[3],
2689 (docolor ? "" : "> <"),
2690 PL_colors[0],
2691 len2, s2,
2692 PL_colors[1],
2693 15 - l - pref_len + 1,
2694 "");
2695 }
2696}
2697#endif
2698
d6a28714 2699STATIC I32 /* 0 failure, 1 success */
3b0527fe 2700S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2701{
54df2634
NC
2702#if PERL_VERSION < 9
2703 dMY_CXT;
2704#endif
27da23d5 2705 dVAR;
95b24440 2706 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2707 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2708
3b0527fe
DM
2709 regexp *rex = reginfo->prog;
2710
5d9a96ca
DM
2711 regmatch_slab *orig_slab;
2712 regmatch_state *orig_state;
a3621e74 2713
5d9a96ca
DM
2714 /* the current state. This is a cached copy of PL_regmatch_state */
2715 register regmatch_state *st;
95b24440 2716
5d9a96ca
DM
2717 /* cache heavy used fields of st in registers */
2718 register regnode *scan;
2719 register regnode *next;
2720 register I32 n = 0; /* initialize to shut up compiler warning */
2721 register char *locinput = PL_reginput;
95b24440 2722
5d9a96ca
DM
2723 /* these variables are NOT saved during a recusive RFEGMATCH: */
2724 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2725 bool result; /* return value of S_regmatch */
2726 regnode *inner; /* Next node in internal branch. */
2727 int depth = 0; /* depth of recursion */
aa283a38 2728 regmatch_state *newst; /* when pushing a state, this is the new one */
77cb431f
DM
2729 regmatch_state *yes_state = NULL; /* state to pop to on success of
2730 subpattern */
95b24440
DM
2731
2732#ifdef DEBUGGING
ab74612d 2733 SV *re_debug_flags = NULL;
a3621e74 2734 GET_RE_DEBUG_FLAGS;
d6a28714
JH
2735 PL_regindent++;
2736#endif
2737
5d9a96ca
DM
2738 /* on first ever call to regmatch, allocate first slab */
2739 if (!PL_regmatch_slab) {
2740 Newx(PL_regmatch_slab, 1, regmatch_slab);
2741 PL_regmatch_slab->prev = NULL;
2742 PL_regmatch_slab->next = NULL;
86545054 2743 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2744 }
2745
2746 /* remember current high-water mark for exit */
2747 /* XXX this should be done with SAVE* instead */
2748 orig_slab = PL_regmatch_slab;
2749 orig_state = PL_regmatch_state;
2750
2751 /* grab next free state slot */
2752 st = ++PL_regmatch_state;
86545054 2753 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2754 st = PL_regmatch_state = S_push_slab(aTHX);
2755
2756 st->minmod = 0;
2757 st->sw = 0;
2758 st->logical = 0;
2759 st->unwind = 0;
2760 st->cc = NULL;
d6a28714
JH
2761 /* Note that nextchr is a byte even in UTF */
2762 nextchr = UCHARAT(locinput);
2763 scan = prog;
2764 while (scan != NULL) {
8ba1375e 2765
a3621e74 2766 DEBUG_EXECUTE_r( {
6136c704 2767 SV * const prop = sv_newmortal();
07be1b83 2768 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2769 regprop(rex, prop, scan);
07be1b83
YO
2770
2771 PerlIO_printf(Perl_debug_log,
2772 "%3"IVdf":%*s%s(%"IVdf")\n",
2773 (IV)(scan - rex->program), PL_regindent*2, "",
2774 SvPVX_const(prop),
2775 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2776 });
d6a28714
JH
2777
2778 next = scan + NEXT_OFF(scan);
2779 if (next == scan)
2780 next = NULL;
2781
2782 switch (OP(scan)) {
2783 case BOL:
7fba1cd6 2784 if (locinput == PL_bostr)
d6a28714 2785 {
3b0527fe 2786 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2787 break;
2788 }
d6a28714
JH
2789 sayNO;
2790 case MBOL:
12d33761
HS
2791 if (locinput == PL_bostr ||
2792 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2793 {
b8c5462f
JH
2794 break;
2795 }
d6a28714
JH
2796 sayNO;
2797 case SBOL:
c2a73568 2798 if (locinput == PL_bostr)
b8c5462f 2799 break;
d6a28714
JH
2800 sayNO;
2801 case GPOS:
3b0527fe 2802 if (locinput == reginfo->ganch)
d6a28714
JH
2803 break;
2804 sayNO;
2805 case EOL:
d6a28714
JH
2806 goto seol;
2807 case MEOL:
d6a28714 2808 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2809 sayNO;
b8c5462f 2810 break;
d6a28714
JH
2811 case SEOL:
2812 seol:
2813 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2814 sayNO;
d6a28714 2815 if (PL_regeol - locinput > 1)
b8c5462f 2816 sayNO;
b8c5462f 2817 break;
d6a28714
JH
2818 case EOS:
2819 if (PL_regeol != locinput)
b8c5462f 2820 sayNO;
d6a28714 2821 break;
ffc61ed2 2822 case SANY:
d6a28714 2823 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2824 sayNO;
f33976b4
DB
2825 if (do_utf8) {
2826 locinput += PL_utf8skip[nextchr];
2827 if (locinput > PL_regeol)
2828 sayNO;
2829 nextchr = UCHARAT(locinput);
2830 }
2831 else
2832 nextchr = UCHARAT(++locinput);
2833 break;
2834 case CANY:
2835 if (!nextchr && locinput >= PL_regeol)
2836 sayNO;
b8c5462f 2837 nextchr = UCHARAT(++locinput);
a0d0e21e 2838 break;
ffc61ed2 2839 case REG_ANY:
1aa99e6b
IH
2840 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2841 sayNO;
2842 if (do_utf8) {
b8c5462f 2843 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2844 if (locinput > PL_regeol)
2845 sayNO;
a0ed51b3 2846 nextchr = UCHARAT(locinput);
a0ed51b3 2847 }
1aa99e6b
IH
2848 else
2849 nextchr = UCHARAT(++locinput);
a0ed51b3 2850 break;
5b47454d 2851 case TRIE:
3dab1dad 2852 {
07be1b83 2853 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2854 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2855 trie_type = do_utf8 ?
2856 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2857 : trie_plain;
2858
2859 /* what trie are we using right now */
2860 reg_trie_data *trie
2861 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2862 U32 state = trie->startstate;
2863
2864 if (trie->bitmap && trie_type != trie_utf8_fold &&
2865 !TRIE_BITMAP_TEST(trie,*locinput)
2866 ) {
2867 if (trie->states[ state ].wordnum) {
2868 DEBUG_EXECUTE_r(
2869 PerlIO_printf(Perl_debug_log,
2870 "%*s %smatched empty string...%s\n",
2871 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2872 );
2873 break;
2874 } else {
2875 DEBUG_EXECUTE_r(
2876 PerlIO_printf(Perl_debug_log,
2877 "%*s %sfailed to match start class...%s\n",
2878 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2879 );
2880 sayNO_SILENT;
2881 }
2882 }
a3621e74 2883 {
07be1b83
YO
2884 /*
2885 traverse the TRIE keeping track of all accepting states
2886 we transition through until we get to a failing node.
2887 */
2888
a3621e74 2889 U8 *uc = ( U8* )locinput;
a3621e74
YO
2890 U16 charid = 0;
2891 U32 base = 0;
2892 UV uvc = 0;
2893 STRLEN len = 0;
2894 STRLEN foldlen = 0;
a3621e74
YO
2895 U8 *uscan = (U8*)NULL;
2896 STRLEN bufflen=0;
95b24440 2897 SV *sv_accept_buff = NULL;
5b47454d 2898
d8319b27 2899 st->u.trie.accepted = 0; /* how many accepting states we have seen */
95b24440 2900 result = 0;
a3621e74
YO
2901
2902 while ( state && uc <= (U8*)PL_regeol ) {
2903
5b47454d 2904 if (trie->states[ state ].wordnum) {
d8319b27 2905 if (!st->u.trie.accepted ) {
5b47454d
DM
2906 ENTER;
2907 SAVETMPS;
2908 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2909 sv_accept_buff=newSV(bufflen *
2910 sizeof(reg_trie_accepted) - 1);
2911 SvCUR_set(sv_accept_buff,
2912 sizeof(reg_trie_accepted));
2913 SvPOK_on(sv_accept_buff);
2914 sv_2mortal(sv_accept_buff);
d8319b27 2915 st->u.trie.accept_buff =
5b47454d
DM
2916 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2917 }
2918 else {
d8319b27 2919 if (st->u.trie.accepted >= bufflen) {
5b47454d 2920 bufflen *= 2;
d8319b27 2921 st->u.trie.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2922 SvGROW(sv_accept_buff,
2923 bufflen * sizeof(reg_trie_accepted));
2924 }
2925 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2926 + sizeof(reg_trie_accepted));
2927 }
d8319b27
DM
2928 st->u.trie.accept_buff[st->u.trie.accepted].wordnum = trie->states[state].wordnum;
2929 st->u.trie.accept_buff[st->u.trie.accepted].endpos = uc;
2930 ++st->u.trie.accepted;
5b47454d 2931 }
a3621e74
YO
2932
2933 base = trie->states[ state ].trans.base;
2934
07be1b83
YO
2935 DEBUG_TRIE_EXECUTE_r({
2936 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2937 PerlIO_printf( Perl_debug_log,
e4584336 2938 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2939 2+PL_regindent * 2, "", PL_colors[4],
d8319b27 2940 (UV)state, (UV)base, (UV)st->u.trie.accepted );
07be1b83 2941 });
a3621e74
YO
2942
2943 if ( base ) {
5b47454d 2944 switch (trie_type) {
3dab1dad 2945 case trie_utf8_fold:
a3621e74
YO
2946 if ( foldlen>0 ) {
2947 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2948 foldlen -= len;
2949 uscan += len;
2950 len=0;
2951 } else {
1df70142 2952 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
a3621e74
YO
2953 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2954 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2955 foldlen -= UNISKIP( uvc );
2956 uscan = foldbuf + UNISKIP( uvc );
2957 }
5b47454d
DM
2958 break;
2959 case trie_utf8:
2960 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN,
2961 &len, uniflags );
2962 break;
2963 case trie_plain:
e4584336 2964 uvc = (UV)*uc;
a3621e74
YO
2965 len = 1;
2966 }
2967
5b47454d
DM
2968 if (uvc < 256) {
2969 charid = trie->charmap[ uvc ];
2970 }
2971 else {
2972 charid = 0;
2973 if (trie->widecharmap) {
3dab1dad 2974 SV** const svpp = hv_fetch(trie->widecharmap,
5b47454d
DM
2975 (char*)&uvc, sizeof(UV), 0);
2976 if (svpp)
2977 charid = (U16)SvIV(*svpp);
2978 }
2979 }
a3621e74 2980
5b47454d
DM
2981 if (charid &&
2982 (base + charid > trie->uniquecharcount )
2983 && (base + charid - 1 - trie->uniquecharcount
2984 < trie->lasttrans)
2985 && trie->trans[base + charid - 1 -
2986 trie->uniquecharcount].check == state)
2987 {
2988 state = trie->trans[base + charid - 1 -
2989 trie->uniquecharcount ].next;
2990 }
2991 else {
2992 state = 0;
2993 }
2994 uc += len;
2995
2996 }
2997 else {
a3621e74
YO
2998 state = 0;
2999 }
3000 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
3001 PerlIO_printf( Perl_debug_log,
3002 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
3003 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3004 );
3005 }
d8319b27 3006 if (!st->u.trie.accepted )
a3621e74 3007 sayNO;
a3621e74
YO
3008
3009 /*
3010 There was at least one accepting state that we
3011 transitioned through. Presumably the number of accepting
3012 states is going to be low, typically one or two. So we
3013 simply scan through to find the one with lowest wordnum.
3014 Once we find it, we swap the last state into its place
3015 and decrement the size. We then try to match the rest of
3016 the pattern at the point where the word ends, if we
3017 succeed then we end the loop, otherwise the loop
3018 eventually terminates once all of the accepting states
3019 have been tried.
3020 */
a3621e74 3021
d8319b27 3022 if ( st->u.trie.accepted == 1 ) {
f2278c82
YO
3023 DEBUG_EXECUTE_r({
3024 SV ** const tmp = RX_DEBUG(reginfo->prog)
3025 ? av_fetch( trie->words, st->u.trie.accept_buff[ 0 ].wordnum-1, 0 )
3026 : NULL;
3027 PerlIO_printf( Perl_debug_log,
3028 "%*s %sonly one match : #%d <%s>%s\n",
3029 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3030 st->u.trie.accept_buff[ 0 ].wordnum,
3031 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
3032 PL_colors[5] );
3033 });
d8319b27 3034 PL_reginput = (char *)st->u.trie.accept_buff[ 0 ].endpos;
a3621e74
YO
3035 /* in this case we free tmps/leave before we call regmatch
3036 as we wont be using accept_buff again. */
3037 FREETMPS;
3038 LEAVE;
07be1b83 3039 /* do we need this? why dont we just do a break? */
95b24440
DM
3040 REGMATCH(scan + NEXT_OFF(scan), TRIE1);
3041 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3042 } else {
3043 DEBUG_EXECUTE_r(
e4584336 3044 PerlIO_printf( Perl_debug_log,"%*s %sgot %"IVdf" possible matches%s\n",
d8319b27 3045 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4], (IV)st->u.trie.accepted,
a3621e74
YO
3046 PL_colors[5] );
3047 );
d8319b27 3048 while ( !result && st->u.trie.accepted-- ) {
a3621e74
YO
3049 U32 best = 0;
3050 U32 cur;
d8319b27 3051 for( cur = 1 ; cur <= st->u.trie.accepted ; cur++ ) {
e4584336
RB
3052 DEBUG_TRIE_EXECUTE_r(
3053 PerlIO_printf( Perl_debug_log,
3054 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3055 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
d8319b27
DM
3056 (IV)best, st->u.trie.accept_buff[ best ].wordnum, (IV)cur,
3057 st->u.trie.accept_buff[ cur ].wordnum, PL_colors[5] );
e4584336 3058 );
a3621e74 3059
d8319b27
DM
3060 if (st->u.trie.accept_buff[cur].wordnum <
3061 st->u.trie.accept_buff[best].wordnum)
e822a8b4 3062 best = cur;
a3621e74 3063 }
f2278c82
YO
3064 DEBUG_EXECUTE_r({
3065 reg_trie_data * const trie = (reg_trie_data*)
3066 rex->data->data[ARG(scan)];
3067 SV ** const tmp = RX_DEBUG(reginfo->prog)
3068 ? av_fetch( trie->words, st->u.trie.accept_buff[ best ].wordnum - 1, 0 )
3069 : NULL;
3070 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
3071 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
3072 st->u.trie.accept_buff[best].wordnum,
3073 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
3074 PL_colors[5] );
3075 });
d8319b27
DM
3076 if ( best<st->u.trie.accepted ) {
3077 reg_trie_accepted tmp = st->u.trie.accept_buff[ best ];
3078 st->u.trie.accept_buff[ best ] = st->u.trie.accept_buff[ st->u.trie.accepted ];
3079 st->u.trie.accept_buff[ st->u.trie.accepted ] = tmp;
3080 best = st->u.trie.accepted;
a3621e74 3081 }
d8319b27 3082 PL_reginput = (char *)st->u.trie.accept_buff[ best ].endpos;
a3621e74
YO
3083
3084 /*
3085 as far as I can tell we only need the SAVETMPS/FREETMPS
3086 for re's with EVAL in them but I'm leaving them in for
3087 all until I can be sure.
3088 */
3089 SAVETMPS;
95b24440
DM
3090 REGMATCH(scan + NEXT_OFF(scan), TRIE2);
3091 /*** all unsaved local vars undefined at this point */
a3621e74
YO
3092 FREETMPS;
3093 }
3094 FREETMPS;
3095 LEAVE;
3096 }
3097
95b24440 3098 if (result) {
a3621e74
YO
3099 sayYES;
3100 } else {
3101 sayNO;
3102 }
3dab1dad 3103 }}
a3621e74 3104 /* unreached codepoint */
95b24440
DM
3105 case EXACT: {
3106 char *s = STRING(scan);
5d9a96ca 3107 st->ln = STR_LEN(scan);
eb160463 3108 if (do_utf8 != UTF) {
bc517b45 3109 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3110 char *l = locinput;
5d9a96ca 3111 const char *e = s + st->ln;
a72c7584 3112
5ff6fc6d
JH
3113 if (do_utf8) {
3114 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3115 while (s < e) {
a3b680e6 3116 STRLEN ulen;
1aa99e6b 3117 if (l >= PL_regeol)
5ff6fc6d
JH
3118 sayNO;
3119 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3120 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3121 uniflags))
5ff6fc6d 3122 sayNO;
bc517b45 3123 l += ulen;
5ff6fc6d 3124 s ++;
1aa99e6b 3125 }
5ff6fc6d
JH
3126 }
3127 else {
3128 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3129 while (s < e) {
a3b680e6 3130 STRLEN ulen;
1aa99e6b
IH
3131 if (l >= PL_regeol)
3132 sayNO;
5ff6fc6d 3133 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3134 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3135 uniflags))
1aa99e6b 3136 sayNO;
bc517b45 3137 s += ulen;
a72c7584 3138 l ++;
1aa99e6b 3139 }
5ff6fc6d 3140 }
1aa99e6b
IH
3141 locinput = l;
3142 nextchr = UCHARAT(locinput);
3143 break;
3144 }
bc517b45 3145 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3146 /* Inline the first character, for speed. */
3147 if (UCHARAT(s) != nextchr)
3148 sayNO;
5d9a96ca 3149 if (PL_regeol - locinput < st->ln)
d6a28714 3150 sayNO;
5d9a96ca 3151 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 3152 sayNO;
5d9a96ca 3153 locinput += st->ln;
d6a28714
JH
3154 nextchr = UCHARAT(locinput);
3155 break;
95b24440 3156 }
d6a28714 3157 case EXACTFL:
b8c5462f
JH
3158 PL_reg_flags |= RF_tainted;
3159 /* FALL THROUGH */
95b24440
DM
3160 case EXACTF: {
3161 char *s = STRING(scan);
5d9a96ca 3162 st->ln = STR_LEN(scan);
d6a28714 3163
d07ddd77
JH
3164 if (do_utf8 || UTF) {
3165 /* Either target or the pattern are utf8. */
d6a28714 3166 char *l = locinput;
d07ddd77 3167 char *e = PL_regeol;
bc517b45 3168
5d9a96ca 3169 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 3170 l, &e, 0, do_utf8)) {
5486206c
JH
3171 /* One more case for the sharp s:
3172 * pack("U0U*", 0xDF) =~ /ss/i,
3173 * the 0xC3 0x9F are the UTF-8
3174 * byte sequence for the U+00DF. */
3175 if (!(do_utf8 &&
3176 toLOWER(s[0]) == 's' &&
5d9a96ca 3177 st->ln >= 2 &&
5486206c
JH
3178 toLOWER(s[1]) == 's' &&
3179 (U8)l[0] == 0xC3 &&
3180 e - l >= 2 &&
3181 (U8)l[1] == 0x9F))
3182 sayNO;
3183 }
d07ddd77
JH
3184 locinput = e;
3185 nextchr = UCHARAT(locinput);
3186 break;
a0ed51b3 3187 }
d6a28714 3188
bc517b45
JH
3189 /* Neither the target and the pattern are utf8. */
3190
d6a28714
JH
3191 /* Inline the first character, for speed. */
3192 if (UCHARAT(s) != nextchr &&
3193 UCHARAT(s) != ((OP(scan) == EXACTF)
3194 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3195 sayNO;
5d9a96ca 3196 if (PL_regeol - locinput < st->ln)
b8c5462f 3197 sayNO;
5d9a96ca
DM
3198 if (st->ln > 1 && (OP(scan) == EXACTF
3199 ? ibcmp(s, locinput, st->ln)
3200 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 3201 sayNO;
5d9a96ca 3202 locinput += st->ln;
d6a28714 3203 nextchr = UCHARAT(locinput);
a0d0e21e 3204 break;
95b24440 3205 }
d6a28714 3206 case ANYOF:
ffc61ed2 3207 if (do_utf8) {
9e55ce06
JH
3208 STRLEN inclasslen = PL_regeol - locinput;
3209
32fc9b6a 3210 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 3211 sayNO_ANYOF;
ffc61ed2
JH
3212 if (locinput >= PL_regeol)
3213 sayNO;
0f0076b4 3214 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3215 nextchr = UCHARAT(locinput);
e0f9d4a8 3216 break;
ffc61ed2
JH
3217 }
3218 else {
3219 if (nextchr < 0)
3220 nextchr = UCHARAT(locinput);
32fc9b6a 3221 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 3222 sayNO_ANYOF;
ffc61ed2
JH
3223 if (!nextchr && locinput >= PL_regeol)
3224 sayNO;
3225 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3226 break;
3227 }
3228 no_anyof:
3229 /* If we might have the case of the German sharp s
3230 * in a casefolding Unicode character class. */
3231
ebc501f0
JH
3232 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3233 locinput += SHARP_S_SKIP;
e0f9d4a8 3234 nextchr = UCHARAT(locinput);
ffc61ed2 3235 }
e0f9d4a8
JH
3236 else
3237 sayNO;
b8c5462f 3238 break;
d6a28714 3239 case ALNUML:
b8c5462f
JH
3240 PL_reg_flags |= RF_tainted;
3241 /* FALL THROUGH */
d6a28714 3242 case ALNUM:
b8c5462f 3243 if (!nextchr)
4633a7c4 3244 sayNO;
ffc61ed2 3245 if (do_utf8) {
1a4fad37 3246 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3247 if (!(OP(scan) == ALNUM
bb7a0f54 3248 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 3249 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
3250 {
3251 sayNO;
a0ed51b3 3252 }
b8c5462f 3253 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3254 nextchr = UCHARAT(locinput);
3255 break;
3256 }
ffc61ed2 3257 if (!(OP(scan) == ALNUM
d6a28714 3258 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 3259 sayNO;
b8c5462f 3260 nextchr = UCHARAT(++locinput);
a0d0e21e 3261 break;
d6a28714 3262 case NALNUML:
b8c5462f
JH
3263 PL_reg_flags |= RF_tainted;
3264 /* FALL THROUGH */
d6a28714
JH
3265 case NALNUM:
3266 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 3267 sayNO;
ffc61ed2 3268 if (do_utf8) {
1a4fad37 3269 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 3270 if (OP(scan) == NALNUM
bb7a0f54 3271 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
3272 : isALNUM_LC_utf8((U8*)locinput))
3273 {
b8c5462f 3274 sayNO;
d6a28714 3275 }
b8c5462f
JH
3276 locinput += PL_utf8skip[nextchr];
3277 nextchr = UCHARAT(locinput);
3278 break;
3279 }
ffc61ed2 3280 if (OP(scan) == NALNUM
d6a28714 3281 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 3282 sayNO;
76e3520e 3283 nextchr = UCHARAT(++locinput);
a0d0e21e 3284 break;
d6a28714
JH
3285 case BOUNDL:
3286 case NBOUNDL:
3280af22 3287 PL_reg_flags |= RF_tainted;
bbce6d69 3288 /* FALL THROUGH */
d6a28714
JH
3289 case BOUND:
3290 case NBOUND:
3291 /* was last char in word? */
ffc61ed2 3292 if (do_utf8) {
12d33761 3293 if (locinput == PL_bostr)
5d9a96ca 3294 st->ln = '\n';
ffc61ed2 3295 else {
a3b680e6 3296 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 3297
4ad0818d 3298 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
3299 }
3300 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3301 st->ln = isALNUM_uni(st->ln);
1a4fad37 3302 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 3303 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
3304 }
3305 else {
5d9a96ca 3306 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
3307 n = isALNUM_LC_utf8((U8*)locinput);
3308 }
a0ed51b3 3309 }
d6a28714 3310 else {
5d9a96ca 3311 st->ln = (locinput != PL_bostr) ?
12d33761 3312 UCHARAT(locinput - 1) : '\n';
ffc61ed2 3313 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 3314 st->ln = isALNUM(st->ln);
ffc61ed2
JH
3315 n = isALNUM(nextchr);
3316 }
3317 else {
5d9a96ca 3318 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
3319 n = isALNUM_LC(nextchr);
3320 }
d6a28714 3321 }
5d9a96ca 3322 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
3323 OP(scan) == BOUNDL))
3324 sayNO;
a0ed51b3 3325 break;
d6a28714 3326 case SPACEL:
3280af22 3327 PL_reg_flags |= RF_tainted;
bbce6d69 3328 /* FALL THROUGH */
d6a28714 3329 case SPACE:
9442cb0e 3330 if (!nextchr)
4633a7c4 3331 sayNO;
1aa99e6b 3332 if (do_utf8) {
fd400ab9 3333 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 3334 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3335 if (!(OP(scan) == SPACE
bb7a0f54 3336 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3337 : isSPACE_LC_utf8((U8*)locinput)))
3338 {
3339 sayNO;
3340 }
3341 locinput += PL_utf8skip[nextchr];
3342 nextchr = UCHARAT(locinput);
3343 break;
d6a28714 3344 }
ffc61ed2
JH
3345 if (!(OP(scan) == SPACE
3346 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3347 sayNO;
3348 nextchr = UCHARAT(++locinput);
3349 }
3350 else {
3351 if (!(OP(scan) == SPACE
3352 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3353 sayNO;
3354 nextchr = UCHARAT(++locinput);
a0ed51b3 3355 }
a0ed51b3 3356 break;
d6a28714 3357 case NSPACEL:
3280af22 3358 PL_reg_flags |= RF_tainted;
bbce6d69 3359 /* FALL THROUGH */
d6a28714 3360 case NSPACE:
9442cb0e 3361 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3362 sayNO;
1aa99e6b 3363 if (do_utf8) {
1a4fad37 3364 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3365 if (OP(scan) == NSPACE
bb7a0f54 3366 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3367 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3368 {
3369 sayNO;
3370 }
3371 locinput += PL_utf8skip[nextchr];
3372 nextchr = UCHARAT(locinput);
3373 break;
a0ed51b3 3374 }
ffc61ed2 3375 if (OP(scan) == NSPACE
d6a28714 3376 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3377 sayNO;
76e3520e 3378 nextchr = UCHARAT(++locinput);
a0d0e21e 3379 break;
d6a28714 3380 case DIGITL:
a0ed51b3
LW
3381 PL_reg_flags |= RF_tainted;
3382 /* FALL THROUGH */
d6a28714 3383 case DIGIT:
9442cb0e 3384 if (!nextchr)
a0ed51b3 3385 sayNO;
1aa99e6b 3386 if (do_utf8) {
1a4fad37 3387 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3388 if (!(OP(scan) == DIGIT
bb7a0f54 3389 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3390 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3391 {
a0ed51b3 3392 sayNO;
dfe13c55 3393 }
6f06b55f 3394 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3395 nextchr = UCHARAT(locinput);
3396 break;
3397 }
ffc61ed2 3398 if (!(OP(scan) == DIGIT
9442cb0e 3399 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3400 sayNO;
3401 nextchr = UCHARAT(++locinput);
3402 break;
d6a28714 3403 case NDIGITL:
b8c5462f
JH
3404 PL_reg_flags |= RF_tainted;
3405 /* FALL THROUGH */
d6a28714 3406 case NDIGIT:
9442cb0e 3407 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3408 sayNO;
1aa99e6b 3409 if (do_utf8) {
1a4fad37 3410 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3411 if (OP(scan) == NDIGIT
bb7a0f54 3412 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3413 : isDIGIT_LC_utf8((U8*)locinput))
3414 {
a0ed51b3 3415 sayNO;
9442cb0e 3416 }
6f06b55f 3417 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3418 nextchr = UCHARAT(locinput);
3419 break;
3420 }
ffc61ed2 3421 if (OP(scan) == NDIGIT
9442cb0e 3422 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3423 sayNO;
3424 nextchr = UCHARAT(++locinput);
3425 break;
3426 case CLUMP:
b7c83a7e 3427 if (locinput >= PL_regeol)
a0ed51b3 3428 sayNO;
b7c83a7e 3429 if (do_utf8) {
1a4fad37 3430 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3431 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3432 sayNO;
3433 locinput += PL_utf8skip[nextchr];
3434 while (locinput < PL_regeol &&
3435 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3436 locinput += UTF8SKIP(locinput);