This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: change variable name to reflect its purpose
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
af364d03
KW
97/* Valid for non-utf8 strings only: avoids the reginclass call if there are no
98 * complications: i.e., if everything matchable is straight forward in the
99 * bitmap */
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
37e2e78e
KW
127
128/* Doesn't do an assert to verify that is correct */
129#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
c269f403 130 if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
37e2e78e 131
1a4fad37
AL
132#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
133#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
134#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 135
37e2e78e 136#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
137 LOAD_UTF8_CHARCLASS(X_begin, " "); \
138 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
139 /* These are utf8 constants, and not utf-ebcdic constants, so the \
140 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 141 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
142 \
143 /* No asserts are done for these, in case called on an early \
144 * Unicode version in which they map to nothing */ \
d275fa5e
CB
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
149 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 151 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 152
d1eb3177
YO
153/*
154 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
155 so that it is possible to override the option here without having to
156 rebuild the entire core. as we are required to do if we change regcomp.h
157 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
158*/
159#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
160#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
161#endif
162
163#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
164#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
165#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
166#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
167#define RE_utf8_perl_word PL_utf8_alnum
168#define RE_utf8_perl_space PL_utf8_space
169#define RE_utf8_posix_digit PL_utf8_digit
170#define perl_word alnum
171#define perl_space space
172#define posix_digit digit
173#else
174#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
175#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
176#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
177#define RE_utf8_perl_word PL_utf8_perl_word
178#define RE_utf8_perl_space PL_utf8_perl_space
179#define RE_utf8_posix_digit PL_utf8_posix_digit
180#endif
181
182
ee9a90b8
KW
183/* The actual code for CCC_TRY, which uses several variables from the routine
184 * it's callable from. It is designed to be the bulk of a case statement.
185 * FUNC is the macro or function to call on non-utf8 targets that indicate if
186 * nextchr matches the class.
187 * UTF8_TEST is the whole test string to use for utf8 targets
188 * LOAD is what to use to test, and if not present to load in the swash for the
189 * class
190 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
191 * UTF8_TEST test.
192 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
193 * utf8 and a variant, load the swash if necessary and test using the utf8
194 * test. Advance to the next character if test is ok, otherwise fail; If not
195 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
196 * fails, or advance to the next character */
197
198#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
199 if (locinput >= PL_regeol) { \
200 sayNO; \
201 } \
202 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
203 LOAD_UTF8_CHARCLASS(CLASS, STR); \
204 if (POS_OR_NEG (UTF8_TEST)) { \
205 sayNO; \
206 } \
207 locinput += PL_utf8skip[nextchr]; \
208 nextchr = UCHARAT(locinput); \
209 break; \
210 } \
211 if (POS_OR_NEG (FUNC(nextchr))) { \
212 sayNO; \
213 } \
214 nextchr = UCHARAT(++locinput); \
980866de
KW
215 break;
216
ee9a90b8
KW
217/* Handle the non-locale cases for a character class and its complement. It
218 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
219 * This is because that code fails when the test succeeds, so we want to have
220 * the test fail so that the code succeeds. The swash is stored in a
221 * predictable PL_ place */
222#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) \
223 case NAME: \
224 _CCC_TRY_CODE( !, FUNC, \
225 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
226 (U8*)locinput, TRUE)), \
227 CLASS, STR) \
228 case NNAME: \
229 _CCC_TRY_CODE( , FUNC, \
230 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
231 (U8*)locinput, TRUE)), \
232 CLASS, STR) \
233
234/* Generate the case statements for both locale and non-locale character
235 * classes in regmatch for classes that don't have special unicode semantics.
236 * Locales don't use an immediate swash, but an intermediary special locale
237 * function that is called on the pointer to the current place in the input
238 * string. That function will resolve to needing the same swash. One might
239 * think that because we don't know what the locale will match, we shouldn't
240 * check with the swash loading function that it loaded properly; ie, that we
241 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
242 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
243 * irrelevant here */
244#define CCC_TRY(NAME, NNAME, FUNC, \
245 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
246 CLASS, STR) \
247 case NAMEL: \
248 PL_reg_flags |= RF_tainted; \
249 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
250 case NNAMEL: \
251 PL_reg_flags |= RF_tainted; \
252 _CCC_TRY_CODE( , LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
253 /* Generate the non-locale cases */ \
254 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
255
256/* This is like CCC_TRY, but has an extra set of parameters for generating case
257 * statements to handle separate Unicode semantics nodes */
258#define CCC_TRY_U(NAME, NNAME, FUNC, \
259 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
260 NAMEU, NNAMEU, FUNCU, \
261 CLASS, STR) \
262 CCC_TRY(NAME, NNAME, FUNC, NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, CLASS, STR) \
263 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 264
3dab1dad
YO
265/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
266
5f80c4cf 267/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
268/* it would be nice to rework regcomp.sym to generate this stuff. sigh
269 *
270 * NOTE that *nothing* that affects backtracking should be in here, specifically
271 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
272 * node that is in between two EXACT like nodes when ascertaining what the required
273 * "follow" character is. This should probably be moved to regex compile time
274 * although it may be done at run time beause of the REF possibility - more
275 * investigation required. -- demerphq
276*/
3e901dc0
YO
277#define JUMPABLE(rn) ( \
278 OP(rn) == OPEN || \
279 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
280 OP(rn) == EVAL || \
cca55fe3
JP
281 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
282 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 283 OP(rn) == KEEPS || \
3dab1dad 284 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 285)
ee9b8eae 286#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 287
ee9b8eae
YO
288#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
289
290#if 0
291/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
292 we don't need this definition. */
293#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
9a5a5549 294#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
295#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
296
297#else
298/* ... so we use this as its faster. */
299#define IS_TEXT(rn) ( OP(rn)==EXACT )
9a5a5549 300#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU )
ee9b8eae
YO
301#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
302#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
303
304#endif
e2d8ce26 305
a84d97b6
HS
306/*
307 Search for mandatory following text node; for lookahead, the text must
308 follow but for lookbehind (rn->flags != 0) we skip to the next step.
309*/
cca55fe3 310#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
311 while (JUMPABLE(rn)) { \
312 const OPCODE type = OP(rn); \
313 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 314 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 315 else if (type == PLUS) \
cca55fe3 316 rn = NEXTOPER(rn); \
3dab1dad 317 else if (type == IFMATCH) \
a84d97b6 318 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 319 else rn += NEXT_OFF(rn); \
3dab1dad 320 } \
5f80c4cf 321} STMT_END
74750237 322
c476f425 323
acfe0abc 324static void restore_pos(pTHX_ void *arg);
51371543 325
620d5b66
NC
326#define REGCP_PAREN_ELEMS 4
327#define REGCP_OTHER_ELEMS 5
e0fa7e2b 328#define REGCP_FRAME_ELEMS 1
620d5b66
NC
329/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
330 * are needed for the regexp context stack bookkeeping. */
331
76e3520e 332STATIC CHECKPOINT
cea2e8a9 333S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 334{
97aff369 335 dVAR;
a3b680e6 336 const int retval = PL_savestack_ix;
a3b680e6 337 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
338 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
339 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 340 int p;
40a82448 341 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 342
e49a9654
IH
343 if (paren_elems_to_push < 0)
344 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
345
e0fa7e2b
NC
346 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
347 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
348 " out of range (%lu-%ld)",
349 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 350
620d5b66 351 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 352
3280af22 353 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 354/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
355 SSPUSHINT(PL_regoffs[p].end);
356 SSPUSHINT(PL_regoffs[p].start);
3280af22 357 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 358 SSPUSHINT(p);
e7707071 359 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 360 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 361 (UV)p, (IV)PL_regoffs[p].start,
40a82448 362 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 363 (IV)PL_regoffs[p].end
40a82448 364 ));
a0d0e21e 365 }
b1ce53c5 366/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 367 SSPUSHPTR(PL_regoffs);
3280af22
NIS
368 SSPUSHINT(PL_regsize);
369 SSPUSHINT(*PL_reglastparen);
a01268b5 370 SSPUSHINT(*PL_reglastcloseparen);
3280af22 371 SSPUSHPTR(PL_reginput);
e0fa7e2b 372 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 373
a0d0e21e
LW
374 return retval;
375}
376
c277df42 377/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
378#define REGCP_SET(cp) \
379 DEBUG_STATE_r( \
ab3bbdeb 380 PerlIO_printf(Perl_debug_log, \
e4f74956 381 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
382 (IV)PL_savestack_ix)); \
383 cp = PL_savestack_ix
c3464db5 384
ab3bbdeb 385#define REGCP_UNWIND(cp) \
e4f74956 386 DEBUG_STATE_r( \
ab3bbdeb 387 if (cp != PL_savestack_ix) \
e4f74956
YO
388 PerlIO_printf(Perl_debug_log, \
389 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
390 (IV)(cp), (IV)PL_savestack_ix)); \
391 regcpblow(cp)
c277df42 392
76e3520e 393STATIC char *
097eb12c 394S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 395{
97aff369 396 dVAR;
e0fa7e2b 397 UV i;
a0d0e21e 398 char *input;
a3621e74
YO
399 GET_RE_DEBUG_FLAGS_DECL;
400
7918f24d
NC
401 PERL_ARGS_ASSERT_REGCPPOP;
402
b1ce53c5 403 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 404 i = SSPOPUV;
e0fa7e2b
NC
405 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
406 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 407 input = (char *) SSPOPPTR;
a01268b5 408 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
409 *PL_reglastparen = SSPOPINT;
410 PL_regsize = SSPOPINT;
f0ab9afb 411 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 412
620d5b66 413 i -= REGCP_OTHER_ELEMS;
b1ce53c5 414 /* Now restore the parentheses context. */
620d5b66 415 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 416 I32 tmps;
097eb12c 417 U32 paren = (U32)SSPOPINT;
3280af22 418 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 419 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 420 tmps = SSPOPINT;
3280af22 421 if (paren <= *PL_reglastparen)
f0ab9afb 422 PL_regoffs[paren].end = tmps;
e7707071 423 DEBUG_BUFFERS_r(
c3464db5 424 PerlIO_printf(Perl_debug_log,
b900a521 425 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 426 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 427 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 428 (IV)PL_regoffs[paren].end,
3280af22 429 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 430 );
a0d0e21e 431 }
e7707071 432 DEBUG_BUFFERS_r(
bb7a0f54 433 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 434 PerlIO_printf(Perl_debug_log,
faccc32b 435 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 436 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
437 }
438 );
daf18116 439#if 1
dafc8851
JH
440 /* It would seem that the similar code in regtry()
441 * already takes care of this, and in fact it is in
442 * a better location to since this code can #if 0-ed out
443 * but the code in regtry() is needed or otherwise tests
444 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
445 * (as of patchlevel 7877) will fail. Then again,
446 * this code seems to be necessary or otherwise
225593e1
DM
447 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
448 * --jhi updated by dapm */
3b6647e0 449 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 450 if (i > PL_regsize)
f0ab9afb
NC
451 PL_regoffs[i].start = -1;
452 PL_regoffs[i].end = -1;
a0d0e21e 453 }
dafc8851 454#endif
a0d0e21e
LW
455 return input;
456}
457
02db2b7b 458#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 459
a687059c 460/*
e50aee73 461 * pregexec and friends
a687059c
LW
462 */
463
76234dfb 464#ifndef PERL_IN_XSUB_RE
a687059c 465/*
c277df42 466 - pregexec - match a regexp against a string
a687059c 467 */
c277df42 468I32
49d7dfbc 469Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 470 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
471/* strend: pointer to null at end of string */
472/* strbeg: real beginning of string */
473/* minend: end of match must be >=minend after stringarg. */
474/* nosave: For optimizations. */
475{
7918f24d
NC
476 PERL_ARGS_ASSERT_PREGEXEC;
477
c277df42 478 return
9041c2e3 479 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
480 nosave ? 0 : REXEC_COPY_STR);
481}
76234dfb 482#endif
22e551b9 483
9041c2e3 484/*
cad2e5aa
JH
485 * Need to implement the following flags for reg_anch:
486 *
487 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
488 * USE_INTUIT_ML
489 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
490 * INTUIT_AUTORITATIVE_ML
491 * INTUIT_ONCE_NOML - Intuit can match in one location only.
492 * INTUIT_ONCE_ML
493 *
494 * Another flag for this function: SECOND_TIME (so that float substrs
495 * with giant delta may be not rechecked).
496 */
497
498/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
499
3f7c398e 500/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
501 Otherwise, only SvCUR(sv) is used to get strbeg. */
502
503/* XXXX We assume that strpos is strbeg unless sv. */
504
6eb5f6b9
JH
505/* XXXX Some places assume that there is a fixed substring.
506 An update may be needed if optimizer marks as "INTUITable"
507 RExen without fixed substrings. Similarly, it is assumed that
508 lengths of all the strings are no more than minlen, thus they
509 cannot come from lookahead.
40d049e4
YO
510 (Or minlen should take into account lookahead.)
511 NOTE: Some of this comment is not correct. minlen does now take account
512 of lookahead/behind. Further research is required. -- demerphq
513
514*/
6eb5f6b9 515
2c2d71f5
JH
516/* A failure to find a constant substring means that there is no need to make
517 an expensive call to REx engine, thus we celebrate a failure. Similarly,
518 finding a substring too deep into the string means that less calls to
30944b6d
IZ
519 regtry() should be needed.
520
521 REx compiler's optimizer found 4 possible hints:
522 a) Anchored substring;
523 b) Fixed substring;
524 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 525 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 526 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
527 string which does not contradict any of them.
528 */
2c2d71f5 529
6eb5f6b9
JH
530/* Most of decisions we do here should have been done at compile time.
531 The nodes of the REx which we used for the search should have been
532 deleted from the finite automaton. */
533
cad2e5aa 534char *
288b8c02 535Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 536 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 537{
97aff369 538 dVAR;
288b8c02 539 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 540 register I32 start_shift = 0;
cad2e5aa 541 /* Should be nonnegative! */
b7953727 542 register I32 end_shift = 0;
2c2d71f5
JH
543 register char *s;
544 register SV *check;
a1933d95 545 char *strbeg;
cad2e5aa 546 char *t;
f2ed9b32 547 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 548 I32 ml_anch;
bd61b366
SS
549 register char *other_last = NULL; /* other substr checked before this */
550 char *check_at = NULL; /* check substr found at this pos */
bbe252da 551 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 552 RXi_GET_DECL(prog,progi);
30944b6d 553#ifdef DEBUGGING
890ce7af 554 const char * const i_strpos = strpos;
30944b6d 555#endif
a3621e74
YO
556 GET_RE_DEBUG_FLAGS_DECL;
557
7918f24d
NC
558 PERL_ARGS_ASSERT_RE_INTUIT_START;
559
f2ed9b32 560 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 561
3c8556c3 562 if (RX_UTF8(rx)) {
b8d68ded
JH
563 PL_reg_flags |= RF_utf8;
564 }
ab3bbdeb 565 DEBUG_EXECUTE_r(
f2ed9b32 566 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
567 sv ? "Guessing start of match in sv for"
568 : "Guessing start of match in string for");
2a782b5b 569 );
cad2e5aa 570
c344f387
JH
571 /* CHR_DIST() would be more correct here but it makes things slow. */
572 if (prog->minlen > strend - strpos) {
a3621e74 573 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 574 "String too short... [re_intuit_start]\n"));
cad2e5aa 575 goto fail;
2c2d71f5 576 }
1de06328 577
a1933d95 578 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 579 PL_regeol = strend;
f2ed9b32 580 if (utf8_target) {
33b8afdf
JH
581 if (!prog->check_utf8 && prog->check_substr)
582 to_utf8_substr(prog);
583 check = prog->check_utf8;
584 } else {
585 if (!prog->check_substr && prog->check_utf8)
586 to_byte_substr(prog);
587 check = prog->check_substr;
588 }
1de06328 589 if (check == &PL_sv_undef) {
a3621e74 590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 591 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
592 goto fail;
593 }
bbe252da
YO
594 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
595 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
596 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 597 && !multiline ) ); /* Check after \n? */
cad2e5aa 598
7e25d62c 599 if (!ml_anch) {
bbe252da
YO
600 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
601 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 602 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
603 && sv && !SvROK(sv)
604 && (strpos != strbeg)) {
a3621e74 605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
606 goto fail;
607 }
608 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 609 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 610 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
611 I32 slen;
612
1aa99e6b 613 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 614
653099ff
GS
615 if (SvTAIL(check)) {
616 slen = SvCUR(check); /* >= 1 */
cad2e5aa 617
9041c2e3 618 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 619 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 620 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 621 goto fail_finish;
cad2e5aa
JH
622 }
623 /* Now should match s[0..slen-2] */
624 slen--;
3f7c398e 625 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 626 || (slen > 1
3f7c398e 627 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 628 report_neq:
a3621e74 629 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
630 goto fail_finish;
631 }
cad2e5aa 632 }
3f7c398e 633 else if (*SvPVX_const(check) != *s
653099ff 634 || ((slen = SvCUR(check)) > 1
3f7c398e 635 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 636 goto report_neq;
c315bfe8 637 check_at = s;
2c2d71f5 638 goto success_at_start;
7e25d62c 639 }
cad2e5aa 640 }
2c2d71f5 641 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 642 s = strpos;
2c2d71f5 643 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
644 end_shift = prog->check_end_shift;
645
2c2d71f5 646 if (!ml_anch) {
a3b680e6 647 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 648 - (SvTAIL(check) != 0);
a3b680e6 649 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
650
651 if (end_shift < eshift)
652 end_shift = eshift;
653 }
cad2e5aa 654 }
2c2d71f5 655 else { /* Can match at random position */
cad2e5aa
JH
656 ml_anch = 0;
657 s = strpos;
1de06328
YO
658 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
659 end_shift = prog->check_end_shift;
660
661 /* end shift should be non negative here */
cad2e5aa
JH
662 }
663
bcdf7404 664#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 665 if (end_shift < 0)
1de06328 666 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 667 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
668#endif
669
2c2d71f5
JH
670 restart:
671 /* Find a possible match in the region s..strend by looking for
672 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
673
674 {
675 I32 srch_start_shift = start_shift;
676 I32 srch_end_shift = end_shift;
677 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
678 srch_end_shift -= ((strbeg - s) - srch_start_shift);
679 srch_start_shift = strbeg - s;
680 }
6bda09f9 681 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
682 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
683 (IV)prog->check_offset_min,
684 (IV)srch_start_shift,
685 (IV)srch_end_shift,
686 (IV)prog->check_end_shift);
687 });
688
cad2e5aa 689 if (flags & REXEC_SCREAM) {
cad2e5aa 690 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 691 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 692
2c2d71f5
JH
693 if (PL_screamfirst[BmRARE(check)] >= 0
694 || ( BmRARE(check) == '\n'
85c508c3 695 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 696 && SvTAIL(check) ))
9041c2e3 697 s = screaminstr(sv, check,
1de06328 698 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 699 else
2c2d71f5 700 goto fail_finish;
4addbd3b 701 /* we may be pointing at the wrong string */
07bc277f 702 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 703 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
704 if (data)
705 *data->scream_olds = s;
706 }
1de06328
YO
707 else {
708 U8* start_point;
709 U8* end_point;
bbe252da 710 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
711 start_point= (U8*)(s + srch_start_shift);
712 end_point= (U8*)(strend - srch_end_shift);
713 } else {
714 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
715 end_point= HOP3(strend, -srch_end_shift, strbeg);
716 }
6bda09f9 717 DEBUG_OPTIMISE_MORE_r({
56570a2c 718 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 719 (int)(end_point - start_point),
fc8cd66c 720 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
721 start_point);
722 });
723
724 s = fbm_instr( start_point, end_point,
7fba1cd6 725 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
726 }
727 }
cad2e5aa
JH
728 /* Update the count-of-usability, remove useless subpatterns,
729 unshift s. */
2c2d71f5 730
ab3bbdeb 731 DEBUG_EXECUTE_r({
f2ed9b32 732 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
733 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
734 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 735 (s ? "Found" : "Did not find"),
f2ed9b32 736 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
737 ? "anchored" : "floating"),
738 quoted,
739 RE_SV_TAIL(check),
740 (s ? " at offset " : "...\n") );
741 });
2c2d71f5
JH
742
743 if (!s)
744 goto fail_finish;
2c2d71f5 745 /* Finish the diagnostic message */
a3621e74 746 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 747
1de06328
YO
748 /* XXX dmq: first branch is for positive lookbehind...
749 Our check string is offset from the beginning of the pattern.
750 So we need to do any stclass tests offset forward from that
751 point. I think. :-(
752 */
753
754
755
756 check_at=s;
757
758
2c2d71f5
JH
759 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
760 Start with the other substr.
761 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 762 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
763 *always* match. Probably should be marked during compile...
764 Probably it is right to do no SCREAM here...
765 */
766
f2ed9b32 767 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
768 : (prog->float_substr && prog->anchored_substr))
769 {
30944b6d 770 /* Take into account the "other" substring. */
2c2d71f5
JH
771 /* XXXX May be hopelessly wrong for UTF... */
772 if (!other_last)
6eb5f6b9 773 other_last = strpos;
f2ed9b32 774 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
775 do_other_anchored:
776 {
890ce7af
AL
777 char * const last = HOP3c(s, -start_shift, strbeg);
778 char *last1, *last2;
be8e71aa 779 char * const saved_s = s;
33b8afdf 780 SV* must;
2c2d71f5 781
2c2d71f5
JH
782 t = s - prog->check_offset_max;
783 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 784 && (!utf8_target
0ce71af7 785 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 786 && t > strpos)))
6f207bd3 787 NOOP;
2c2d71f5
JH
788 else
789 t = strpos;
1aa99e6b 790 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
791 if (t < other_last) /* These positions already checked */
792 t = other_last;
1aa99e6b 793 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
794 if (last < last1)
795 last1 = last;
1de06328
YO
796 /* XXXX It is not documented what units *_offsets are in.
797 We assume bytes, but this is clearly wrong.
798 Meaning this code needs to be carefully reviewed for errors.
799 dmq.
800 */
801
2c2d71f5 802 /* On end-of-str: see comment below. */
f2ed9b32 803 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
804 if (must == &PL_sv_undef) {
805 s = (char*)NULL;
1de06328 806 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
807 }
808 else
809 s = fbm_instr(
810 (unsigned char*)t,
811 HOP3(HOP3(last1, prog->anchored_offset, strend)
812 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
813 must,
7fba1cd6 814 multiline ? FBMrf_MULTILINE : 0
33b8afdf 815 );
ab3bbdeb 816 DEBUG_EXECUTE_r({
f2ed9b32 817 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
818 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
819 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 820 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
821 quoted, RE_SV_TAIL(must));
822 });
823
824
2c2d71f5
JH
825 if (!s) {
826 if (last1 >= last2) {
a3621e74 827 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
828 ", giving up...\n"));
829 goto fail_finish;
830 }
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 832 ", trying floating at offset %ld...\n",
be8e71aa 833 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
834 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
835 s = HOP3c(last, 1, strend);
2c2d71f5
JH
836 goto restart;
837 }
838 else {
a3621e74 839 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 840 (long)(s - i_strpos)));
1aa99e6b
IH
841 t = HOP3c(s, -prog->anchored_offset, strbeg);
842 other_last = HOP3c(s, 1, strend);
be8e71aa 843 s = saved_s;
2c2d71f5
JH
844 if (t == strpos)
845 goto try_at_start;
2c2d71f5
JH
846 goto try_at_offset;
847 }
30944b6d 848 }
2c2d71f5
JH
849 }
850 else { /* Take into account the floating substring. */
33b8afdf 851 char *last, *last1;
be8e71aa 852 char * const saved_s = s;
33b8afdf
JH
853 SV* must;
854
855 t = HOP3c(s, -start_shift, strbeg);
856 last1 = last =
857 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
858 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
859 last = HOP3c(t, prog->float_max_offset, strend);
860 s = HOP3c(t, prog->float_min_offset, strend);
861 if (s < other_last)
862 s = other_last;
2c2d71f5 863 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 864 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
865 /* fbm_instr() takes into account exact value of end-of-str
866 if the check is SvTAIL(ed). Since false positives are OK,
867 and end-of-str is not later than strend we are OK. */
868 if (must == &PL_sv_undef) {
869 s = (char*)NULL;
1de06328 870 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
871 }
872 else
2c2d71f5 873 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
874 (unsigned char*)last + SvCUR(must)
875 - (SvTAIL(must)!=0),
7fba1cd6 876 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 877 DEBUG_EXECUTE_r({
f2ed9b32 878 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
879 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
880 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 881 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
882 quoted, RE_SV_TAIL(must));
883 });
33b8afdf
JH
884 if (!s) {
885 if (last1 == last) {
a3621e74 886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
887 ", giving up...\n"));
888 goto fail_finish;
2c2d71f5 889 }
a3621e74 890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 891 ", trying anchored starting at offset %ld...\n",
be8e71aa 892 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
893 other_last = last;
894 s = HOP3c(t, 1, strend);
895 goto restart;
896 }
897 else {
a3621e74 898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
899 (long)(s - i_strpos)));
900 other_last = s; /* Fix this later. --Hugo */
be8e71aa 901 s = saved_s;
33b8afdf
JH
902 if (t == strpos)
903 goto try_at_start;
904 goto try_at_offset;
905 }
2c2d71f5 906 }
cad2e5aa 907 }
2c2d71f5 908
1de06328 909
9ef43ace 910 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 911
6bda09f9 912 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
913 PerlIO_printf(Perl_debug_log,
914 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
915 (IV)prog->check_offset_min,
916 (IV)prog->check_offset_max,
917 (IV)(s-strpos),
918 (IV)(t-strpos),
919 (IV)(t-s),
920 (IV)(strend-strpos)
921 )
922 );
923
2c2d71f5 924 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 925 && (!utf8_target
9ef43ace 926 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
927 && t > strpos)))
928 {
2c2d71f5
JH
929 /* Fixed substring is found far enough so that the match
930 cannot start at strpos. */
931 try_at_offset:
cad2e5aa 932 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
933 /* Eventually fbm_*() should handle this, but often
934 anchored_offset is not 0, so this check will not be wasted. */
935 /* XXXX In the code below we prefer to look for "^" even in
936 presence of anchored substrings. And we search even
937 beyond the found float position. These pessimizations
938 are historical artefacts only. */
939 find_anchor:
2c2d71f5 940 while (t < strend - prog->minlen) {
cad2e5aa 941 if (*t == '\n') {
4ee3650e 942 if (t < check_at - prog->check_offset_min) {
f2ed9b32 943 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
944 /* Since we moved from the found position,
945 we definitely contradict the found anchored
30944b6d
IZ
946 substr. Due to the above check we do not
947 contradict "check" substr.
948 Thus we can arrive here only if check substr
949 is float. Redo checking for "other"=="fixed".
950 */
9041c2e3 951 strpos = t + 1;
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 953 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
954 goto do_other_anchored;
955 }
4ee3650e
GS
956 /* We don't contradict the found floating substring. */
957 /* XXXX Why not check for STCLASS? */
cad2e5aa 958 s = t + 1;
a3621e74 959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 960 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
961 goto set_useful;
962 }
4ee3650e
GS
963 /* Position contradicts check-string */
964 /* XXXX probably better to look for check-string
965 than for "\n", so one should lower the limit for t? */
a3621e74 966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 967 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 968 other_last = strpos = s = t + 1;
cad2e5aa
JH
969 goto restart;
970 }
971 t++;
972 }
a3621e74 973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 974 PL_colors[0], PL_colors[1]));
2c2d71f5 975 goto fail_finish;
cad2e5aa 976 }
f5952150 977 else {
a3621e74 978 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 979 PL_colors[0], PL_colors[1]));
f5952150 980 }
cad2e5aa
JH
981 s = t;
982 set_useful:
f2ed9b32 983 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
984 }
985 else {
f5952150 986 /* The found string does not prohibit matching at strpos,
2c2d71f5 987 - no optimization of calling REx engine can be performed,
f5952150
GS
988 unless it was an MBOL and we are not after MBOL,
989 or a future STCLASS check will fail this. */
2c2d71f5
JH
990 try_at_start:
991 /* Even in this situation we may use MBOL flag if strpos is offset
992 wrt the start of the string. */
05b4157f 993 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 994 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 995 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 996 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 997 {
cad2e5aa
JH
998 t = strpos;
999 goto find_anchor;
1000 }
a3621e74 1001 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1002 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1003 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1004 );
2c2d71f5 1005 success_at_start:
bbe252da 1006 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1007 && (utf8_target ? (
33b8afdf
JH
1008 prog->check_utf8 /* Could be deleted already */
1009 && --BmUSEFUL(prog->check_utf8) < 0
1010 && (prog->check_utf8 == prog->float_utf8)
1011 ) : (
1012 prog->check_substr /* Could be deleted already */
1013 && --BmUSEFUL(prog->check_substr) < 0
1014 && (prog->check_substr == prog->float_substr)
1015 )))
66e933ab 1016 {
cad2e5aa 1017 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1018 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1019 /* XXX Does the destruction order has to change with utf8_target? */
1020 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1021 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1022 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1023 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1024 check = NULL; /* abort */
cad2e5aa 1025 s = strpos;
486ec47a 1026 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1027 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1028 if (prog->intflags & PREGf_IMPLICIT)
1029 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1030 /* XXXX This is a remnant of the old implementation. It
1031 looks wasteful, since now INTUIT can use many
6eb5f6b9 1032 other heuristics. */
bbe252da 1033 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1034 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1035 }
1036 else
1037 s = strpos;
1038 }
1039
6eb5f6b9
JH
1040 /* Last resort... */
1041 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1042 /* trie stclasses are too expensive to use here, we are better off to
1043 leave it to regmatch itself */
f8fc2ecf 1044 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1045 /* minlen == 0 is possible if regstclass is \b or \B,
1046 and the fixed substr is ''$.
1047 Since minlen is already taken into account, s+1 is before strend;
1048 accidentally, minlen >= 1 guaranties no false positives at s + 1
1049 even for \b or \B. But (minlen? 1 : 0) below assumes that
1050 regstclass does not come from lookahead... */
1051 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
9a5a5549 1052 This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1053 const U8* const str = (U8*)STRING(progi->regstclass);
1054 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1055 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1056 : 1);
1de06328
YO
1057 char * endpos;
1058 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1059 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1060 else if (prog->float_substr || prog->float_utf8)
1061 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1062 else
1063 endpos= strend;
1064
70685ca0
JH
1065 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1066 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1067
6eb5f6b9 1068 t = s;
f8fc2ecf 1069 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1070 if (!s) {
1071#ifdef DEBUGGING
cbbf8932 1072 const char *what = NULL;
6eb5f6b9
JH
1073#endif
1074 if (endpos == strend) {
a3621e74 1075 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1076 "Could not match STCLASS...\n") );
1077 goto fail;
1078 }
a3621e74 1079 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1080 "This position contradicts STCLASS...\n") );
bbe252da 1081 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1082 goto fail;
6eb5f6b9 1083 /* Contradict one of substrings */
33b8afdf 1084 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1085 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1086 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1087 hop_and_restart:
1aa99e6b 1088 s = HOP3c(t, 1, strend);
66e933ab
GS
1089 if (s + start_shift + end_shift > strend) {
1090 /* XXXX Should be taken into account earlier? */
a3621e74 1091 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1092 "Could not match STCLASS...\n") );
1093 goto fail;
1094 }
5e39e1e5
HS
1095 if (!check)
1096 goto giveup;
a3621e74 1097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1098 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1099 what, (long)(s + start_shift - i_strpos)) );
1100 goto restart;
1101 }
66e933ab 1102 /* Have both, check_string is floating */
6eb5f6b9
JH
1103 if (t + start_shift >= check_at) /* Contradicts floating=check */
1104 goto retry_floating_check;
1105 /* Recheck anchored substring, but not floating... */
9041c2e3 1106 s = check_at;
5e39e1e5
HS
1107 if (!check)
1108 goto giveup;
a3621e74 1109 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1110 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1111 (long)(other_last - i_strpos)) );
1112 goto do_other_anchored;
1113 }
60e71179
GS
1114 /* Another way we could have checked stclass at the
1115 current position only: */
1116 if (ml_anch) {
1117 s = t = t + 1;
5e39e1e5
HS
1118 if (!check)
1119 goto giveup;
a3621e74 1120 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1121 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1122 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1123 goto try_at_offset;
66e933ab 1124 }
f2ed9b32 1125 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1126 goto fail;
486ec47a 1127 /* Check is floating substring. */
6eb5f6b9
JH
1128 retry_floating_check:
1129 t = check_at - start_shift;
a3621e74 1130 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1131 goto hop_and_restart;
1132 }
b7953727 1133 if (t != s) {
a3621e74 1134 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1135 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1136 (long)(t - i_strpos), (long)(s - i_strpos))
1137 );
1138 }
1139 else {
a3621e74 1140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1141 "Does not contradict STCLASS...\n");
1142 );
1143 }
6eb5f6b9 1144 }
5e39e1e5 1145 giveup:
a3621e74 1146 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1147 PL_colors[4], (check ? "Guessed" : "Giving up"),
1148 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1149 return s;
2c2d71f5
JH
1150
1151 fail_finish: /* Substring not found */
33b8afdf 1152 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1153 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1154 fail:
a3621e74 1155 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1156 PL_colors[4], PL_colors[5]));
bd61b366 1157 return NULL;
cad2e5aa 1158}
9661b544 1159
a0a388a1
YO
1160#define DECL_TRIE_TYPE(scan) \
1161 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1162 trie_type = (scan->flags != EXACT) \
f2ed9b32
KW
1163 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1164 : (utf8_target ? trie_utf8 : trie_plain)
3b0527fe 1165
55eed653
NC
1166#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1167uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1168 switch (trie_type) { \
1169 case trie_utf8_fold: \
1170 if ( foldlen>0 ) { \
0abd0d78 1171 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1172 foldlen -= len; \
1173 uscan += len; \
1174 len=0; \
1175 } else { \
0abd0d78 1176 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1177 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1178 foldlen -= UNISKIP( uvc ); \
1179 uscan = foldbuf + UNISKIP( uvc ); \
1180 } \
1181 break; \
a0a388a1
YO
1182 case trie_latin_utf8_fold: \
1183 if ( foldlen>0 ) { \
1184 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1185 foldlen -= len; \
1186 uscan += len; \
1187 len=0; \
1188 } else { \
1189 len = 1; \
1190 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1191 foldlen -= UNISKIP( uvc ); \
1192 uscan = foldbuf + UNISKIP( uvc ); \
1193 } \
1194 break; \
4cadc6a9
YO
1195 case trie_utf8: \
1196 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1197 break; \
1198 case trie_plain: \
1199 uvc = (UV)*uc; \
1200 len = 1; \
1201 } \
4cadc6a9
YO
1202 if (uvc < 256) { \
1203 charid = trie->charmap[ uvc ]; \
1204 } \
1205 else { \
1206 charid = 0; \
55eed653
NC
1207 if (widecharmap) { \
1208 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1209 (char*)&uvc, sizeof(UV), 0); \
1210 if (svpp) \
1211 charid = (U16)SvIV(*svpp); \
1212 } \
1213 } \
1214} STMT_END
1215
a0a388a1
YO
1216#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1217{ \
1218 char *my_strend= (char *)strend; \
4cadc6a9
YO
1219 if ( (CoNd) \
1220 && (ln == len || \
f2ed9b32
KW
1221 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1222 m, NULL, ln, cBOOL(UTF_PATTERN))) \
a0a388a1 1223 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1224 goto got_it; \
1225 else { \
1226 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1227 uvchr_to_utf8(tmpbuf, c); \
1228 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1229 if ( f != c \
1230 && (f == c1 || f == c2) \
a0a388a1 1231 && (ln == len || \
f2ed9b32
KW
1232 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1233 m, NULL, ln, cBOOL(UTF_PATTERN)))\
a0a388a1 1234 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1235 goto got_it; \
1236 } \
a0a388a1
YO
1237} \
1238s += len
4cadc6a9
YO
1239
1240#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1241STMT_START { \
a932d541 1242 re_fold_t folder; \
9a5a5549
KW
1243 switch (OP(c)) { \
1244 case EXACTFU: folder = foldEQ_latin1; break; \
1245 case EXACTFL: folder = foldEQ_locale; break; \
1246 case EXACTF: folder = foldEQ; break; \
1247 default: \
1248 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1249 } \
4cadc6a9
YO
1250 while (s <= e) { \
1251 if ( (CoNd) \
9a5a5549
KW
1252 && (ln == 1 || folder(s, m, ln)) \
1253 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1254 goto got_it; \
1255 s++; \
1256 } \
1257} STMT_END
1258
1259#define REXEC_FBC_UTF8_SCAN(CoDe) \
1260STMT_START { \
1261 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1262 CoDe \
1263 s += uskip; \
1264 } \
1265} STMT_END
1266
1267#define REXEC_FBC_SCAN(CoDe) \
1268STMT_START { \
1269 while (s < strend) { \
1270 CoDe \
1271 s++; \
1272 } \
1273} STMT_END
1274
1275#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1276REXEC_FBC_UTF8_SCAN( \
1277 if (CoNd) { \
24b23f37 1278 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1279 goto got_it; \
1280 else \
1281 tmp = doevery; \
1282 } \
1283 else \
1284 tmp = 1; \
1285)
1286
1287#define REXEC_FBC_CLASS_SCAN(CoNd) \
1288REXEC_FBC_SCAN( \
1289 if (CoNd) { \
24b23f37 1290 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1291 goto got_it; \
1292 else \
1293 tmp = doevery; \
1294 } \
1295 else \
1296 tmp = 1; \
1297)
1298
1299#define REXEC_FBC_TRYIT \
24b23f37 1300if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1301 goto got_it
1302
e1d1eefb 1303#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1304 if (utf8_target) { \
e1d1eefb
YO
1305 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1306 } \
1307 else { \
1308 REXEC_FBC_CLASS_SCAN(CoNd); \
1309 } \
1310 break
1311
4cadc6a9 1312#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1313 if (utf8_target) { \
4cadc6a9
YO
1314 UtFpReLoAd; \
1315 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1316 } \
1317 else { \
1318 REXEC_FBC_CLASS_SCAN(CoNd); \
1319 } \
1320 break
1321
1322#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1323 PL_reg_flags |= RF_tainted; \
f2ed9b32 1324 if (utf8_target) { \
4cadc6a9
YO
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1326 } \
1327 else { \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
1329 } \
1330 break
1331
786e8c11
YO
1332#define DUMP_EXEC_POS(li,s,doutf8) \
1333 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1334
1335/* We know what class REx starts with. Try to find this position... */
1336/* if reginfo is NULL, its a dryrun */
1337/* annoyingly all the vars in this routine have different names from their counterparts
1338 in regmatch. /grrr */
1339
3c3eec57 1340STATIC char *
07be1b83 1341S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1342 const char *strend, regmatch_info *reginfo)
a687059c 1343{
27da23d5 1344 dVAR;
bbe252da 1345 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1346 char *m;
d8093b23 1347 STRLEN ln;
5dab1207 1348 STRLEN lnc;
078c425b 1349 register STRLEN uskip;
d8093b23
G
1350 unsigned int c1;
1351 unsigned int c2;
6eb5f6b9
JH
1352 char *e;
1353 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1354 register const bool utf8_target = PL_reg_match_utf8;
f8fc2ecf 1355 RXi_GET_DECL(prog,progi);
7918f24d
NC
1356
1357 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1358
6eb5f6b9
JH
1359 /* We know what class it must start with. */
1360 switch (OP(c)) {
f56b6394 1361 case ANYOFV:
6eb5f6b9 1362 case ANYOF:
f56b6394 1363 if (utf8_target || OP(c) == ANYOFV) {
3ff7ceb3 1364 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
388cc4de 1365 !UTF8_IS_INVARIANT((U8)s[0]) ?
f2ed9b32 1366 reginclass(prog, c, (U8*)s, 0, utf8_target) :
4cadc6a9 1367 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1368 }
1369 else {
1370 while (s < strend) {
1371 STRLEN skip = 1;
1372
32fc9b6a 1373 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1374 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1375 /* The assignment of 2 is intentional:
1376 * for the folded sharp s, the skip is 2. */
1377 (skip = SHARP_S_SKIP))) {
24b23f37 1378 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
1383 else
1384 tmp = 1;
1385 s += skip;
1386 }
a0d0e21e 1387 }
6eb5f6b9 1388 break;
f33976b4 1389 case CANY:
4cadc6a9 1390 REXEC_FBC_SCAN(
24b23f37 1391 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1392 goto got_it;
1393 else
1394 tmp = doevery;
4cadc6a9 1395 );
f33976b4 1396 break;
9a5a5549 1397 case EXACTFU:
6eb5f6b9 1398 case EXACTF:
5dab1207
NIS
1399 m = STRING(c);
1400 ln = STR_LEN(c); /* length to match in octets/bytes */
1401 lnc = (I32) ln; /* length to match in characters */
f2ed9b32 1402 if (UTF_PATTERN) {
a2a2844f 1403 STRLEN ulen1, ulen2;
5dab1207 1404 U8 *sm = (U8 *) m;
89ebb4a3
JH
1405 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1406 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1407 /* used by commented-out code below */
1408 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1409
1410 /* XXX: Since the node will be case folded at compile
1411 time this logic is a little odd, although im not
1412 sure that its actually wrong. --dmq */
1413
1414 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1415 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1416
1417 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1418 codepoint of the first character in the converted
1419 form, yet originally we did the extra step.
1420 No tests fail by commenting this code out however
1421 so Ive left it out. -- dmq.
1422
89ebb4a3 1423 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1424 0, uniflags);
89ebb4a3 1425 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1426 0, uniflags);
a0a388a1
YO
1427 */
1428
5dab1207
NIS
1429 lnc = 0;
1430 while (sm < ((U8 *) m + ln)) {
1431 lnc++;
1432 sm += UTF8SKIP(sm);
1433 }
1aa99e6b
IH
1434 }
1435 else {
1436 c1 = *(U8*)m;
9a5a5549
KW
1437 if (utf8_target || OP(c) == EXACTFU) {
1438
1439 /* Micro sign folds to GREEK SMALL LETTER MU;
1440 LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1441 c2 to the first 's' of the pair, and the code below will
1442 look for others */
1443 c2 = (c1 == MICRO_SIGN)
1444 ? GREEK_SMALL_LETTER_MU
1445 : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1446 ? 's'
1447 : PL_fold_latin1[c1];
1448 } else c2 = PL_fold[c1];
1aa99e6b 1449 }
6eb5f6b9
JH
1450 goto do_exactf;
1451 case EXACTFL:
5dab1207
NIS
1452 m = STRING(c);
1453 ln = STR_LEN(c);
1454 lnc = (I32) ln;
d8093b23 1455 c1 = *(U8*)m;
6eb5f6b9
JH
1456 c2 = PL_fold_locale[c1];
1457 do_exactf:
db12adc6 1458 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1459
3b0527fe 1460 if (!reginfo && e < s)
6eb5f6b9 1461 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1462
60a8b682
JH
1463 /* The idea in the EXACTF* cases is to first find the
1464 * first character of the EXACTF* node and then, if
1465 * necessary, case-insensitively compare the full
1466 * text of the node. The c1 and c2 are the first
1467 * characters (though in Unicode it gets a bit
1468 * more complicated because there are more cases
7f16dd3d
JH
1469 * than just upper and lower: one needs to use
1470 * the so-called folding case for case-insensitive
1471 * matching (called "loose matching" in Unicode).
4c1b470c 1472 * foldEQ_utf8() will do just that. */
60a8b682 1473
f2ed9b32 1474 if (utf8_target || UTF_PATTERN) {
575cac57 1475 UV c, f;
89ebb4a3 1476 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1477 STRLEN len = 1;
1478 STRLEN foldlen;
4ad0818d 1479 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1480 if (c1 == c2) {
5dab1207
NIS
1481 /* Upper and lower of 1st char are equal -
1482 * probably not a "letter". */
1aa99e6b 1483 while (s <= e) {
f2ed9b32 1484 if (utf8_target) {
a0a388a1 1485 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1486 uniflags);
a0a388a1
YO
1487 } else {
1488 c = *((U8*)s);
1489 }
4cadc6a9 1490 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1491 }
09091399
JH
1492 }
1493 else {
1aa99e6b 1494 while (s <= e) {
f2ed9b32 1495 if (utf8_target) {
a0a388a1 1496 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1497 uniflags);
a0a388a1
YO
1498 } else {
1499 c = *((U8*)s);
1500 }
80aecb99 1501
60a8b682 1502 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1503 * Note that not all the possible combinations
1504 * are handled here: some of them are handled
1505 * by the standard folding rules, and some of
1506 * them (the character class or ANYOF cases)
1507 * are handled during compiletime in
1508 * regexec.c:S_regclass(). */
880bd946
JH
1509 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1510 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1511 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1512
4cadc6a9 1513 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1514 }
09091399 1515 }
1aa99e6b
IH
1516 }
1517 else {
a0a388a1 1518 /* Neither pattern nor string are UTF8 */
1aa99e6b 1519 if (c1 == c2)
4cadc6a9 1520 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1521 else
4cadc6a9 1522 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1523 }
1524 break;
bbce6d69 1525 case BOUNDL:
3280af22 1526 PL_reg_flags |= RF_tainted;
bbce6d69 1527 /* FALL THROUGH */
a0d0e21e 1528 case BOUND:
f2ed9b32 1529 if (utf8_target) {
12d33761 1530 if (s == PL_bostr)
ffc61ed2
JH
1531 tmp = '\n';
1532 else {
6136c704 1533 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1534 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1535 }
1536 tmp = ((OP(c) == BOUND ?
9041c2e3 1537 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1538 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1539 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1540 if (tmp == !(OP(c) == BOUND ?
f2ed9b32 1541 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1542 isALNUM_LC_utf8((U8*)s)))
1543 {
1544 tmp = !tmp;
4cadc6a9 1545 REXEC_FBC_TRYIT;
a687059c 1546 }
4cadc6a9 1547 );
a0d0e21e 1548 }
a12cf05f 1549 else { /* Not utf8 */
12d33761 1550 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
a12cf05f
KW
1551 tmp = cBOOL((OP(c) == BOUNDL)
1552 ? isALNUM_LC(tmp)
1553 : (isWORDCHAR_L1(tmp)
b988e673 1554 && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
4cadc6a9 1555 REXEC_FBC_SCAN(
ffc61ed2 1556 if (tmp ==
a12cf05f
KW
1557 !((OP(c) == BOUNDL)
1558 ? isALNUM_LC(*s)
1559 : (isWORDCHAR_L1((U8) *s)
b988e673 1560 && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
a12cf05f 1561 {
ffc61ed2 1562 tmp = !tmp;
4cadc6a9 1563 REXEC_FBC_TRYIT;
a0ed51b3 1564 }
4cadc6a9 1565 );
a0ed51b3 1566 }
24b23f37 1567 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1568 goto got_it;
1569 break;
bbce6d69 1570 case NBOUNDL:
3280af22 1571 PL_reg_flags |= RF_tainted;
bbce6d69 1572 /* FALL THROUGH */
a0d0e21e 1573 case NBOUND:
f2ed9b32 1574 if (utf8_target) {
12d33761 1575 if (s == PL_bostr)
ffc61ed2
JH
1576 tmp = '\n';
1577 else {
6136c704 1578 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1579 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1580 }
1581 tmp = ((OP(c) == NBOUND ?
9041c2e3 1582 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1583 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1584 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1585 if (tmp == !(OP(c) == NBOUND ?
f2ed9b32 1586 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1587 isALNUM_LC_utf8((U8*)s)))
1588 tmp = !tmp;
4cadc6a9
YO
1589 else REXEC_FBC_TRYIT;
1590 );
a0d0e21e 1591 }
667bb95a 1592 else {
12d33761 1593 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
a12cf05f
KW
1594 tmp = cBOOL((OP(c) == NBOUNDL)
1595 ? isALNUM_LC(tmp)
1596 : (isWORDCHAR_L1(tmp)
b988e673 1597 && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET))));
4cadc6a9 1598 REXEC_FBC_SCAN(
a12cf05f
KW
1599 if (tmp == ! cBOOL(
1600 (OP(c) == NBOUNDL)
1601 ? isALNUM_LC(*s)
1602 : (isWORDCHAR_L1((U8) *s)
b988e673 1603 && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))))
a12cf05f 1604 {
ffc61ed2 1605 tmp = !tmp;
a12cf05f 1606 }
4cadc6a9
YO
1607 else REXEC_FBC_TRYIT;
1608 );
a0ed51b3 1609 }
24b23f37 1610 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1611 goto got_it;
1612 break;
bbce6d69 1613 case ALNUML:
4cadc6a9
YO
1614 REXEC_FBC_CSCAN_TAINT(
1615 isALNUM_LC_utf8((U8*)s),
1616 isALNUM_LC(*s)
1617 );
980866de
KW
1618 case ALNUMU:
1619 REXEC_FBC_CSCAN_PRELOAD(
1620 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1621 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1622 isWORDCHAR_L1((U8) *s)
1623 );
1624 case ALNUM:
1625 REXEC_FBC_CSCAN_PRELOAD(
1626 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1627 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1628 isWORDCHAR((U8) *s)
1629 );
1630 case NALNUMU:
1631 REXEC_FBC_CSCAN_PRELOAD(
1632 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1633 swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
1634 ! isWORDCHAR_L1((U8) *s)
1635 );
a0d0e21e 1636 case NALNUM:
4cadc6a9 1637 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1638 LOAD_UTF8_CHARCLASS_PERL_WORD(),
f2ed9b32 1639 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
980866de 1640 ! isALNUM(*s)
4cadc6a9 1641 );
bbce6d69 1642 case NALNUML:
4cadc6a9
YO
1643 REXEC_FBC_CSCAN_TAINT(
1644 !isALNUM_LC_utf8((U8*)s),
1645 !isALNUM_LC(*s)
1646 );
980866de
KW
1647 case SPACEU:
1648 REXEC_FBC_CSCAN_PRELOAD(
1649 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1650 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1651 isSPACE_L1((U8) *s)
1652 );
a0d0e21e 1653 case SPACE:
4cadc6a9 1654 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1655 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1656 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
980866de 1657 isSPACE((U8) *s)
4cadc6a9 1658 );
bbce6d69 1659 case SPACEL:
4cadc6a9 1660 REXEC_FBC_CSCAN_TAINT(
6bbba904 1661 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1662 isSPACE_LC(*s)
1663 );
980866de
KW
1664 case NSPACEU:
1665 REXEC_FBC_CSCAN_PRELOAD(
1666 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1667 !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1668 ! isSPACE_L1((U8) *s)
1669 );
a0d0e21e 1670 case NSPACE:
4cadc6a9 1671 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1672 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1673 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
980866de 1674 ! isSPACE((U8) *s)
4cadc6a9 1675 );
bbce6d69 1676 case NSPACEL:
4cadc6a9 1677 REXEC_FBC_CSCAN_TAINT(
6bbba904 1678 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1679 !isSPACE_LC(*s)
1680 );
a0d0e21e 1681 case DIGIT:
4cadc6a9 1682 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1683 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1684 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1685 isDIGIT(*s)
1686 );
b8c5462f 1687 case DIGITL:
4cadc6a9
YO
1688 REXEC_FBC_CSCAN_TAINT(
1689 isDIGIT_LC_utf8((U8*)s),
1690 isDIGIT_LC(*s)
1691 );
a0d0e21e 1692 case NDIGIT:
4cadc6a9 1693 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1694 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1695 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1696 !isDIGIT(*s)
1697 );
b8c5462f 1698 case NDIGITL:
4cadc6a9
YO
1699 REXEC_FBC_CSCAN_TAINT(
1700 !isDIGIT_LC_utf8((U8*)s),
1701 !isDIGIT_LC(*s)
1702 );
e1d1eefb
YO
1703 case LNBREAK:
1704 REXEC_FBC_CSCAN(
1705 is_LNBREAK_utf8(s),
1706 is_LNBREAK_latin1(s)
1707 );
1708 case VERTWS:
1709 REXEC_FBC_CSCAN(
1710 is_VERTWS_utf8(s),
1711 is_VERTWS_latin1(s)
1712 );
1713 case NVERTWS:
1714 REXEC_FBC_CSCAN(
1715 !is_VERTWS_utf8(s),
1716 !is_VERTWS_latin1(s)
1717 );
1718 case HORIZWS:
1719 REXEC_FBC_CSCAN(
1720 is_HORIZWS_utf8(s),
1721 is_HORIZWS_latin1(s)
1722 );
1723 case NHORIZWS:
1724 REXEC_FBC_CSCAN(
1725 !is_HORIZWS_utf8(s),
1726 !is_HORIZWS_latin1(s)
1727 );
1de06328
YO
1728 case AHOCORASICKC:
1729 case AHOCORASICK:
07be1b83 1730 {
a0a388a1 1731 DECL_TRIE_TYPE(c);
07be1b83
YO
1732 /* what trie are we using right now */
1733 reg_ac_data *aho
f8fc2ecf 1734 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1735 reg_trie_data *trie
1736 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1737 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1738
1739 const char *last_start = strend - trie->minlen;
6148ee25 1740#ifdef DEBUGGING
07be1b83 1741 const char *real_start = s;
6148ee25 1742#endif
07be1b83 1743 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1744 SV *sv_points;
1745 U8 **points; /* map of where we were in the input string
786e8c11 1746 when reading a given char. For ASCII this
be8e71aa 1747 is unnecessary overhead as the relationship
38a44b82
NC
1748 is always 1:1, but for Unicode, especially
1749 case folded Unicode this is not true. */
f9e705e8 1750 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1751 U8 *bitmap=NULL;
1752
07be1b83
YO
1753
1754 GET_RE_DEBUG_FLAGS_DECL;
1755
be8e71aa
YO
1756 /* We can't just allocate points here. We need to wrap it in
1757 * an SV so it gets freed properly if there is a croak while
1758 * running the match */
1759 ENTER;
1760 SAVETMPS;
1761 sv_points=newSV(maxlen * sizeof(U8 *));
1762 SvCUR_set(sv_points,
1763 maxlen * sizeof(U8 *));
1764 SvPOK_on(sv_points);
1765 sv_2mortal(sv_points);
1766 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1767 if ( trie_type != trie_utf8_fold
1768 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1769 {
786e8c11
YO
1770 if (trie->bitmap)
1771 bitmap=(U8*)trie->bitmap;
1772 else
1773 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1774 }
786e8c11
YO
1775 /* this is the Aho-Corasick algorithm modified a touch
1776 to include special handling for long "unknown char"
1777 sequences. The basic idea being that we use AC as long
1778 as we are dealing with a possible matching char, when
1779 we encounter an unknown char (and we have not encountered
1780 an accepting state) we scan forward until we find a legal
1781 starting char.
1782 AC matching is basically that of trie matching, except
1783 that when we encounter a failing transition, we fall back
1784 to the current states "fail state", and try the current char
1785 again, a process we repeat until we reach the root state,
1786 state 1, or a legal transition. If we fail on the root state
1787 then we can either terminate if we have reached an accepting
1788 state previously, or restart the entire process from the beginning
1789 if we have not.
1790
1791 */
07be1b83
YO
1792 while (s <= last_start) {
1793 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1794 U8 *uc = (U8*)s;
1795 U16 charid = 0;
1796 U32 base = 1;
1797 U32 state = 1;
1798 UV uvc = 0;
1799 STRLEN len = 0;
1800 STRLEN foldlen = 0;
1801 U8 *uscan = (U8*)NULL;
1802 U8 *leftmost = NULL;
786e8c11
YO
1803#ifdef DEBUGGING
1804 U32 accepted_word= 0;
1805#endif
07be1b83
YO
1806 U32 pointpos = 0;
1807
1808 while ( state && uc <= (U8*)strend ) {
1809 int failed=0;
786e8c11
YO
1810 U32 word = aho->states[ state ].wordnum;
1811
1de06328
YO
1812 if( state==1 ) {
1813 if ( bitmap ) {
1814 DEBUG_TRIE_EXECUTE_r(
1815 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1816 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1817 (char *)uc, utf8_target );
1de06328
YO
1818 PerlIO_printf( Perl_debug_log,
1819 " Scanning for legal start char...\n");
1820 }
d085b490
YO
1821 );
1822 if (utf8_target) {
1823 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1824 uc += UTF8SKIP(uc);
1825 }
1826 } else {
1827 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1828 uc++;
1829 }
1830 }
1de06328 1831 s= (char *)uc;
786e8c11 1832 }
786e8c11
YO
1833 if (uc >(U8*)last_start) break;
1834 }
1835
1836 if ( word ) {
2e64971a 1837 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1838 if (!leftmost || lpos < leftmost) {
1839 DEBUG_r(accepted_word=word);
07be1b83 1840 leftmost= lpos;
786e8c11 1841 }
07be1b83 1842 if (base==0) break;
786e8c11 1843
07be1b83
YO
1844 }
1845 points[pointpos++ % maxlen]= uc;
55eed653
NC
1846 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1847 uscan, len, uvc, charid, foldlen,
1848 foldbuf, uniflags);
786e8c11
YO
1849 DEBUG_TRIE_EXECUTE_r({
1850 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1851 s, utf8_target );
07be1b83 1852 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1853 " Charid:%3u CP:%4"UVxf" ",
1854 charid, uvc);
1855 });
07be1b83
YO
1856
1857 do {
6148ee25 1858#ifdef DEBUGGING
786e8c11 1859 word = aho->states[ state ].wordnum;
6148ee25 1860#endif
07be1b83
YO
1861 base = aho->states[ state ].trans.base;
1862
786e8c11
YO
1863 DEBUG_TRIE_EXECUTE_r({
1864 if (failed)
1865 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1866 s, utf8_target );
07be1b83 1867 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1868 "%sState: %4"UVxf", word=%"UVxf,
1869 failed ? " Fail transition to " : "",
1870 (UV)state, (UV)word);
1871 });
07be1b83
YO
1872 if ( base ) {
1873 U32 tmp;
6dd2be57 1874 I32 offset;
07be1b83 1875 if (charid &&
6dd2be57
DM
1876 ( ((offset = base + charid
1877 - 1 - trie->uniquecharcount)) >= 0)
1878 && ((U32)offset < trie->lasttrans)
1879 && trie->trans[offset].check == state
1880 && (tmp=trie->trans[offset].next))
07be1b83 1881 {
786e8c11
YO
1882 DEBUG_TRIE_EXECUTE_r(
1883 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1884 state = tmp;
1885 break;
1886 }
1887 else {
786e8c11
YO
1888 DEBUG_TRIE_EXECUTE_r(
1889 PerlIO_printf( Perl_debug_log," - fail\n"));
1890 failed = 1;
1891 state = aho->fail[state];
07be1b83
YO
1892 }
1893 }
1894 else {
1895 /* we must be accepting here */
786e8c11
YO
1896 DEBUG_TRIE_EXECUTE_r(
1897 PerlIO_printf( Perl_debug_log," - accepting\n"));
1898 failed = 1;
07be1b83
YO
1899 break;
1900 }
1901 } while(state);
786e8c11 1902 uc += len;
07be1b83
YO
1903 if (failed) {
1904 if (leftmost)
1905 break;
786e8c11 1906 if (!state) state = 1;
07be1b83
YO
1907 }
1908 }
1909 if ( aho->states[ state ].wordnum ) {
2e64971a 1910 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1911 if (!leftmost || lpos < leftmost) {
1912 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1913 leftmost = lpos;
786e8c11 1914 }
07be1b83 1915 }
07be1b83
YO
1916 if (leftmost) {
1917 s = (char*)leftmost;
786e8c11
YO
1918 DEBUG_TRIE_EXECUTE_r({
1919 PerlIO_printf(
70685ca0
JH
1920 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1921 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1922 );
1923 });
24b23f37 1924 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1925 FREETMPS;
1926 LEAVE;
07be1b83 1927 goto got_it;
be8e71aa 1928 }
07be1b83 1929 s = HOPc(s,1);
786e8c11
YO
1930 DEBUG_TRIE_EXECUTE_r({
1931 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1932 });
07be1b83 1933 } else {
786e8c11
YO
1934 DEBUG_TRIE_EXECUTE_r(
1935 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1936 break;
1937 }
1938 }
be8e71aa
YO
1939 FREETMPS;
1940 LEAVE;
07be1b83
YO
1941 }
1942 break;
b3c9acc1 1943 default:
3c3eec57
GS
1944 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1945 break;
d6a28714 1946 }
6eb5f6b9
JH
1947 return 0;
1948 got_it:
1949 return s;
1950}
1951
fae667d5 1952
6eb5f6b9
JH
1953/*
1954 - regexec_flags - match a regexp against a string
1955 */
1956I32
288b8c02 1957Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1958 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1959/* strend: pointer to null at end of string */
1960/* strbeg: real beginning of string */
1961/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1962/* data: May be used for some additional optimizations.
1963 Currently its only used, with a U32 cast, for transmitting
1964 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1965/* nosave: For optimizations. */
1966{
97aff369 1967 dVAR;
288b8c02 1968 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1969 /*register*/ char *s;
6eb5f6b9 1970 register regnode *c;
24b23f37 1971 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1972 I32 minlen; /* must match at least this many chars */
1973 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1974 I32 end_shift = 0; /* Same for the end. */ /* CC */
1975 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1976 char *scream_olds = NULL;
f2ed9b32 1977 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 1978 I32 multiline;
f8fc2ecf 1979 RXi_GET_DECL(prog,progi);
3b0527fe 1980 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1981 regexp_paren_pair *swap = NULL;
a3621e74
YO
1982 GET_RE_DEBUG_FLAGS_DECL;
1983
7918f24d 1984 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1985 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1986
1987 /* Be paranoid... */
1988 if (prog == NULL || startpos == NULL) {
1989 Perl_croak(aTHX_ "NULL regexp parameter");
1990 return 0;
1991 }
1992
bbe252da 1993 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1994 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1995
f2ed9b32 1996 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 1997 DEBUG_EXECUTE_r(
f2ed9b32 1998 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
1999 "Matching");
2000 );
bac06658 2001
6eb5f6b9 2002 minlen = prog->minlen;
1de06328
YO
2003
2004 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2005 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2006 "String too short [regexec_flags]...\n"));
2007 goto phooey;
1aa99e6b 2008 }
6eb5f6b9 2009
1de06328 2010
6eb5f6b9 2011 /* Check validity of program. */
f8fc2ecf 2012 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2013 Perl_croak(aTHX_ "corrupted regexp program");
2014 }
2015
2016 PL_reg_flags = 0;
2017 PL_reg_eval_set = 0;
2018 PL_reg_maxiter = 0;
2019
3c8556c3 2020 if (RX_UTF8(rx))
6eb5f6b9
JH
2021 PL_reg_flags |= RF_utf8;
2022
2023 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2024 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2025 PL_bostr = strbeg;
3b0527fe 2026 reginfo.sv = sv;
6eb5f6b9
JH
2027
2028 /* Mark end of line for $ (and such) */
2029 PL_regeol = strend;
2030
2031 /* see how far we have to get to not match where we matched before */
3b0527fe 2032 reginfo.till = startpos+minend;
6eb5f6b9 2033
6eb5f6b9
JH
2034 /* If there is a "must appear" string, look for it. */
2035 s = startpos;
2036
bbe252da 2037 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2038 MAGIC *mg;
2c296965 2039 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2040 reginfo.ganch = startpos + prog->gofs;
2c296965 2041 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2042 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2043 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2044 && SvMAGIC(sv)
14befaf4
DM
2045 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2046 && mg->mg_len >= 0) {
3b0527fe 2047 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2048 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2049 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2050
bbe252da 2051 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2052 if (s > reginfo.ganch)
6eb5f6b9 2053 goto phooey;
58e23c8d 2054 s = reginfo.ganch - prog->gofs;
2c296965 2055 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2056 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2057 if (s < strbeg)
2058 goto phooey;
6eb5f6b9
JH
2059 }
2060 }
58e23c8d 2061 else if (data) {
70685ca0 2062 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2063 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2064 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2065
2066 } else { /* pos() not defined */
3b0527fe 2067 reginfo.ganch = strbeg;
2c296965
YO
2068 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2069 "GPOS: reginfo.ganch = strbeg\n"));
2070 }
6eb5f6b9 2071 }
288b8c02 2072 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2073 /* We have to be careful. If the previous successful match
2074 was from this regex we don't want a subsequent partially
2075 successful match to clobber the old results.
2076 So when we detect this possibility we add a swap buffer
2077 to the re, and switch the buffer each match. If we fail
2078 we switch it back, otherwise we leave it swapped.
2079 */
2080 swap = prog->offs;
2081 /* do we need a save destructor here for eval dies? */
2082 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2083 }
a0714e2c 2084 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2085 re_scream_pos_data d;
2086
2087 d.scream_olds = &scream_olds;
2088 d.scream_pos = &scream_pos;
288b8c02 2089 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2090 if (!s) {
a3621e74 2091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2092 goto phooey; /* not present */
3fa9c3d7 2093 }
6eb5f6b9
JH
2094 }
2095
1de06328 2096
6eb5f6b9
JH
2097
2098 /* Simplest case: anchored match need be tried only once. */
2099 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2100 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2101 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2102 goto got_it;
bbe252da
YO
2103 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2104 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2105 {
2106 char *end;
2107
2108 if (minlen)
2109 dontbother = minlen - 1;
1aa99e6b 2110 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2111 /* for multiline we only have to try after newlines */
33b8afdf 2112 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2113 /* because of the goto we can not easily reuse the macros for bifurcating the
2114 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2115 if (utf8_target) {
2116 if (s == startpos)
2117 goto after_try_utf8;
2118 while (1) {
2119 if (regtry(&reginfo, &s)) {
2120 goto got_it;
2121 }
2122 after_try_utf8:
2123 if (s > end) {
2124 goto phooey;
2125 }
2126 if (prog->extflags & RXf_USE_INTUIT) {
2127 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2128 if (!s) {
2129 goto phooey;
2130 }
2131 }
2132 else {
2133 s += UTF8SKIP(s);
2134 }
2135 }
2136 } /* end search for check string in unicode */
2137 else {
2138 if (s == startpos) {
2139 goto after_try_latin;
2140 }
2141 while (1) {
2142 if (regtry(&reginfo, &s)) {
2143 goto got_it;
2144 }
2145 after_try_latin:
2146 if (s > end) {
2147 goto phooey;
2148 }
2149 if (prog->extflags & RXf_USE_INTUIT) {
2150 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2151 if (!s) {
2152 goto phooey;
2153 }
2154 }
2155 else {
2156 s++;
2157 }
2158 }
2159 } /* end search for check string in latin*/
2160 } /* end search for check string */
2161 else { /* search for newline */
2162 if (s > startpos) {
2163 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2164 s--;
92f3d482
YO
2165 }
2166 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
6eb5f6b9
JH
2167 while (s < end) {
2168 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2169 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2170 goto got_it;
2171 }
92f3d482
YO
2172 }
2173 } /* end search for newline */
2174 } /* end anchored/multiline check string search */
6eb5f6b9 2175 goto phooey;
bbe252da 2176 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2177 {
486ec47a 2178 /* the warning about reginfo.ganch being used without initialization
bbe252da 2179 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2180 and we only enter this block when the same bit is set. */
58e23c8d 2181 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2182
2183 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2184 goto got_it;
2185 goto phooey;
2186 }
2187
2188 /* Messy cases: unanchored match. */
bbe252da 2189 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2190 /* we have /x+whatever/ */
f2ed9b32 2191 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2192 char ch;
bf93d4cc
GS
2193#ifdef DEBUGGING
2194 int did_match = 0;
2195#endif
f2ed9b32
KW
2196 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2197 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2198 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2199
f2ed9b32 2200 if (utf8_target) {
4cadc6a9 2201 REXEC_FBC_SCAN(
6eb5f6b9 2202 if (*s == ch) {
a3621e74 2203 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2204 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2205 s += UTF8SKIP(s);
2206 while (s < strend && *s == ch)
2207 s += UTF8SKIP(s);
2208 }
4cadc6a9 2209 );
6eb5f6b9
JH
2210 }
2211 else {
4cadc6a9 2212 REXEC_FBC_SCAN(
6eb5f6b9 2213 if (*s == ch) {
a3621e74 2214 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2215 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2216 s++;
2217 while (s < strend && *s == ch)
2218 s++;
2219 }
4cadc6a9 2220 );
6eb5f6b9 2221 }
a3621e74 2222 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2223 PerlIO_printf(Perl_debug_log,
b7953727
JH
2224 "Did not find anchored character...\n")
2225 );
6eb5f6b9 2226 }
a0714e2c
SS
2227 else if (prog->anchored_substr != NULL
2228 || prog->anchored_utf8 != NULL
2229 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2230 && prog->float_max_offset < strend - s)) {
2231 SV *must;
2232 I32 back_max;
2233 I32 back_min;
2234 char *last;
6eb5f6b9 2235 char *last1; /* Last position checked before */
bf93d4cc
GS
2236#ifdef DEBUGGING
2237 int did_match = 0;
2238#endif
33b8afdf 2239 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2240 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2241 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2242 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2243 back_max = back_min = prog->anchored_offset;
2244 } else {
f2ed9b32
KW
2245 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2246 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2247 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2248 back_max = prog->float_max_offset;
2249 back_min = prog->float_min_offset;
2250 }
1de06328
YO
2251
2252
33b8afdf
JH
2253 if (must == &PL_sv_undef)
2254 /* could not downgrade utf8 check substring, so must fail */
2255 goto phooey;
2256
1de06328
YO
2257 if (back_min<0) {
2258 last = strend;
2259 } else {
2260 last = HOP3c(strend, /* Cannot start after this */
2261 -(I32)(CHR_SVLEN(must)
2262 - (SvTAIL(must) != 0) + back_min), strbeg);
2263 }
6eb5f6b9
JH
2264 if (s > PL_bostr)
2265 last1 = HOPc(s, -1);
2266 else
2267 last1 = s - 1; /* bogus */
2268
a0288114 2269 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2270 check_substr==must. */
2271 scream_pos = -1;
2272 dontbother = end_shift;
2273 strend = HOPc(strend, -dontbother);
2274 while ( (s <= last) &&
9041c2e3 2275 ((flags & REXEC_SCREAM)
1de06328 2276 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2277 end_shift, &scream_pos, 0))
1de06328 2278 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2279 (unsigned char*)strend, must,
7fba1cd6 2280 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2281 /* we may be pointing at the wrong string */
07bc277f 2282 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2283 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2284 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2285 if (HOPc(s, -back_max) > last1) {
2286 last1 = HOPc(s, -back_min);
2287 s = HOPc(s, -back_max);
2288 }
2289 else {
52657f30 2290 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2291
2292 last1 = HOPc(s, -back_min);
52657f30 2293 s = t;
6eb5f6b9 2294 }
f2ed9b32 2295 if (utf8_target) {
6eb5f6b9 2296 while (s <= last1) {
24b23f37 2297 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2298 goto got_it;
2299 s += UTF8SKIP(s);
2300 }
2301 }
2302 else {
2303 while (s <= last1) {
24b23f37 2304 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2305 goto got_it;
2306 s++;
2307 }
2308 }
2309 }
ab3bbdeb 2310 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2311 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2312 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2313 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2314 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2315 ? "anchored" : "floating"),
ab3bbdeb
YO
2316 quoted, RE_SV_TAIL(must));
2317 });
6eb5f6b9
JH
2318 goto phooey;
2319 }
f8fc2ecf 2320 else if ( (c = progi->regstclass) ) {
f14c76ed 2321 if (minlen) {
f8fc2ecf 2322 const OPCODE op = OP(progi->regstclass);
66e933ab 2323 /* don't bother with what can't match */
786e8c11 2324 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2325 strend = HOPc(strend, -(minlen - 1));
2326 }
a3621e74 2327 DEBUG_EXECUTE_r({
be8e71aa 2328 SV * const prop = sv_newmortal();
32fc9b6a 2329 regprop(prog, prop, c);
0df25f3d 2330 {
f2ed9b32 2331 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2332 s,strend-s,60);
0df25f3d 2333 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2334 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2335 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2336 quoted, (int)(strend - s));
0df25f3d 2337 }
ffc61ed2 2338 });
3b0527fe 2339 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2340 goto got_it;
07be1b83 2341 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2342 }
2343 else {
2344 dontbother = 0;
a0714e2c 2345 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2346 /* Trim the end. */
d6a28714 2347 char *last;
33b8afdf
JH
2348 SV* float_real;
2349
f2ed9b32
KW
2350 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2351 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2352 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2353
2354 if (flags & REXEC_SCREAM) {
33b8afdf 2355 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2356 end_shift, &scream_pos, 1); /* last one */
2357 if (!last)
ffc61ed2 2358 last = scream_olds; /* Only one occurrence. */
4addbd3b 2359 /* we may be pointing at the wrong string */
07bc277f 2360 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2361 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2362 }
d6a28714
JH
2363 else {
2364 STRLEN len;
cfd0369c 2365 const char * const little = SvPV_const(float_real, len);
d6a28714 2366
33b8afdf 2367 if (SvTAIL(float_real)) {
d6a28714
JH
2368 if (memEQ(strend - len + 1, little, len - 1))
2369 last = strend - len + 1;
7fba1cd6 2370 else if (!multiline)
9041c2e3 2371 last = memEQ(strend - len, little, len)
bd61b366 2372 ? strend - len : NULL;
b8c5462f 2373 else
d6a28714
JH
2374 goto find_last;
2375 } else {
2376 find_last:
9041c2e3 2377 if (len)
d6a28714 2378 last = rninstr(s, strend, little, little + len);
b8c5462f 2379 else
a0288114 2380 last = strend; /* matching "$" */
b8c5462f 2381 }
b8c5462f 2382 }
bf93d4cc 2383 if (last == NULL) {
6bda09f9
YO
2384 DEBUG_EXECUTE_r(
2385 PerlIO_printf(Perl_debug_log,
2386 "%sCan't trim the tail, match fails (should not happen)%s\n",
2387 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2388 goto phooey; /* Should not happen! */
2389 }
d6a28714
JH
2390 dontbother = strend - last + prog->float_min_offset;
2391 }
2392 if (minlen && (dontbother < minlen))
2393 dontbother = minlen - 1;
2394 strend -= dontbother; /* this one's always in bytes! */
2395 /* We don't know much -- general case. */
f2ed9b32 2396 if (utf8_target) {
d6a28714 2397 for (;;) {
24b23f37 2398 if (regtry(&reginfo, &s))
d6a28714
JH
2399 goto got_it;
2400 if (s >= strend)
2401 break;
b8c5462f 2402 s += UTF8SKIP(s);
d6a28714
JH
2403 };
2404 }
2405 else {
2406 do {
24b23f37 2407 if (regtry(&reginfo, &s))
d6a28714
JH
2408 goto got_it;
2409 } while (s++ < strend);
2410 }
2411 }
2412
2413 /* Failure. */
2414 goto phooey;
2415
2416got_it:
e9105d30 2417 Safefree(swap);
288b8c02 2418 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2419
19b95bf0 2420 if (PL_reg_eval_set)
4f639d21 2421 restore_pos(aTHX_ prog);
5daac39c
NC
2422 if (RXp_PAREN_NAMES(prog))
2423 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2424
2425 /* make sure $`, $&, $', and $digit will work later */
2426 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2427 RX_MATCH_COPY_FREE(rx);
d6a28714 2428 if (flags & REXEC_COPY_STR) {
be8e71aa 2429 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2430#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2431 if ((SvIsCOW(sv)
2432 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2433 if (DEBUG_C_TEST) {
2434 PerlIO_printf(Perl_debug_log,
2435 "Copy on write: regexp capture, type %d\n",
2436 (int) SvTYPE(sv));
2437 }
2438 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2439 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2440 assert (SvPOKp(prog->saved_copy));
2441 } else
2442#endif
2443 {
288b8c02 2444 RX_MATCH_COPIED_on(rx);
ed252734
NC
2445 s = savepvn(strbeg, i);
2446 prog->subbeg = s;
2447 }
d6a28714 2448 prog->sublen = i;
d6a28714
JH
2449 }
2450 else {
2451 prog->subbeg = strbeg;
2452 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2453 }
2454 }
9041c2e3 2455
d6a28714
JH
2456 return 1;
2457
2458phooey:
a3621e74 2459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2460 PL_colors[4], PL_colors[5]));
d6a28714 2461 if (PL_reg_eval_set)
4f639d21 2462 restore_pos(aTHX_ prog);
e9105d30 2463 if (swap) {
c74340f9 2464 /* we failed :-( roll it back */
e9105d30
GG
2465 Safefree(prog->offs);
2466 prog->offs = swap;
2467 }
2468
d6a28714
JH
2469 return 0;
2470}
2471
6bda09f9 2472
d6a28714
JH
2473/*
2474 - regtry - try match at specific point
2475 */
2476STATIC I32 /* 0 failure, 1 success */
24b23f37 2477S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2478{
97aff369 2479 dVAR;
d6a28714 2480 CHECKPOINT lastcp;
288b8c02
NC
2481 REGEXP *const rx = reginfo->prog;
2482 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2483 RXi_GET_DECL(prog,progi);
a3621e74 2484 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2485
2486 PERL_ARGS_ASSERT_REGTRY;
2487
24b23f37 2488 reginfo->cutpoint=NULL;
d6a28714 2489
bbe252da 2490 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2491 MAGIC *mg;
2492
2493 PL_reg_eval_set = RS_init;
a3621e74 2494 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2495 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2496 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2497 ));
ea8d6ae1 2498 SAVESTACK_CXPOS();
d6a28714
JH
2499 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2500 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2501 SAVETMPS;
2502 /* Apparently this is not needed, judging by wantarray. */
e8347627 2503 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2504 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2505
3b0527fe 2506 if (reginfo->sv) {
d6a28714 2507 /* Make $_ available to executed code. */
3b0527fe 2508 if (reginfo->sv != DEFSV) {
59f00321 2509 SAVE_DEFSV;
414bf5ae 2510 DEFSV_set(reginfo->sv);
b8c5462f 2511 }
d6a28714 2512
3b0527fe
DM
2513 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2514 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2515 /* prepare for quick setting of pos */
d300d9fa 2516#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2517 if (SvIsCOW(reginfo->sv))
2518 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2519#endif
3dab1dad 2520 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2521 &PL_vtbl_mglob, NULL, 0);
d6a28714 2522 mg->mg_len = -1;
b8c5462f 2523 }
d6a28714
JH
2524 PL_reg_magic = mg;
2525 PL_reg_oldpos = mg->mg_len;
4f639d21 2526 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2527 }
09687e5a 2528 if (!PL_reg_curpm) {
a02a5408 2529 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2530#ifdef USE_ITHREADS
2531 {
14a49a24 2532 SV* const repointer = &PL_sv_undef;
92313705
NC
2533 /* this regexp is also owned by the new PL_reg_curpm, which
2534 will try to free it. */
d2ece331 2535 av_push(PL_regex_padav, repointer);
09687e5a
AB
2536 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2537 PL_regex_pad = AvARRAY(PL_regex_padav);
2538 }
2539#endif
2540 }
86c29d75
NC
2541#ifdef USE_ITHREADS
2542 /* It seems that non-ithreads works both with and without this code.
2543 So for efficiency reasons it seems best not to have the code
2544 compiled when it is not needed. */
92313705
NC
2545 /* This is safe against NULLs: */
2546 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2547 /* PM_reg_curpm owns a reference to this regexp. */
2548 ReREFCNT_inc(rx);
86c29d75 2549#endif
288b8c02 2550 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2551 PL_reg_oldcurpm = PL_curpm;
2552 PL_curpm = PL_reg_curpm;
07bc277f 2553 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2554 /* Here is a serious problem: we cannot rewrite subbeg,
2555 since it may be needed if this match fails. Thus
2556 $` inside (?{}) could fail... */
2557 PL_reg_oldsaved = prog->subbeg;
2558 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2559#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2560 PL_nrs = prog->saved_copy;
2561#endif
07bc277f 2562 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2563 }
2564 else
bd61b366 2565 PL_reg_oldsaved = NULL;
d6a28714
JH
2566 prog->subbeg = PL_bostr;
2567 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2568 }
24b23f37 2569 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2570 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2571 PL_reginput = *startpos;
d6a28714 2572 PL_reglastparen = &prog->lastparen;
a01268b5 2573 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2574 prog->lastparen = 0;
03994de8 2575 prog->lastcloseparen = 0;
d6a28714 2576 PL_regsize = 0;
f0ab9afb 2577 PL_regoffs = prog->offs;
d6a28714
JH
2578 if (PL_reg_start_tmpl <= prog->nparens) {
2579 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2580 if(PL_reg_start_tmp)
2581 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2582 else
a02a5408 2583 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2584 }
2585
2586 /* XXXX What this code is doing here?!!! There should be no need
2587 to do this again and again, PL_reglastparen should take care of
3dd2943c 2588 this! --ilya*/
dafc8851
JH
2589
2590 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2591 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2592 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2593 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2594 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2595 * Meanwhile, this code *is* needed for the
daf18116
JH
2596 * above-mentioned test suite tests to succeed. The common theme
2597 * on those tests seems to be returning null fields from matches.
225593e1 2598 * --jhi updated by dapm */
dafc8851 2599#if 1
d6a28714 2600 if (prog->nparens) {
f0ab9afb 2601 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2602 register I32 i;
eb160463 2603 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2604 ++pp;
2605 pp->start = -1;
2606 pp->end = -1;
d6a28714
JH
2607 }
2608 }
dafc8851 2609#endif
02db2b7b 2610 REGCP_SET(lastcp);
f8fc2ecf 2611 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2612 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2613 return 1;
2614 }
24b23f37
YO
2615 if (reginfo->cutpoint)
2616 *startpos= reginfo->cutpoint;
02db2b7b 2617 REGCP_UNWIND(lastcp);
d6a28714
JH
2618 return 0;
2619}
2620
02db2b7b 2621
8ba1375e
MJD
2622#define sayYES goto yes
2623#define sayNO goto no
262b90c4 2624#define sayNO_SILENT goto no_silent
8ba1375e 2625
f9f4320a
YO
2626/* we dont use STMT_START/END here because it leads to
2627 "unreachable code" warnings, which are bogus, but distracting. */
2628#define CACHEsayNO \
c476f425
DM
2629 if (ST.cache_mask) \
2630 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2631 sayNO
3298f257 2632
a3621e74 2633/* this is used to determine how far from the left messages like
265c4333
YO
2634 'failed...' are printed. It should be set such that messages
2635 are inline with the regop output that created them.
a3621e74 2636*/
265c4333 2637#define REPORT_CODE_OFF 32
a3621e74
YO
2638
2639
40a82448
DM
2640#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2641#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2642
86545054
DM
2643#define SLAB_FIRST(s) (&(s)->states[0])
2644#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2645
5d9a96ca
DM
2646/* grab a new slab and return the first slot in it */
2647
2648STATIC regmatch_state *
2649S_push_slab(pTHX)
2650{
a35a87e7 2651#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2652 dMY_CXT;
2653#endif
5d9a96ca
DM
2654 regmatch_slab *s = PL_regmatch_slab->next;
2655 if (!s) {
2656 Newx(s, 1, regmatch_slab);
2657 s->prev = PL_regmatch_slab;
2658 s->next = NULL;
2659 PL_regmatch_slab->next = s;
2660 }
2661 PL_regmatch_slab = s;
86545054 2662 return SLAB_FIRST(s);
5d9a96ca 2663}
5b47454d 2664
95b24440 2665
40a82448
DM
2666/* push a new state then goto it */
2667
2668#define PUSH_STATE_GOTO(state, node) \
2669 scan = node; \
2670 st->resume_state = state; \
2671 goto push_state;
2672
2673/* push a new state with success backtracking, then goto it */
2674
2675#define PUSH_YES_STATE_GOTO(state, node) \
2676 scan = node; \
2677 st->resume_state = state; \
2678 goto push_yes_state;
2679
aa283a38 2680
aa283a38 2681
d6a28714 2682/*
95b24440 2683
bf1f174e
DM
2684regmatch() - main matching routine
2685
2686This is basically one big switch statement in a loop. We execute an op,
2687set 'next' to point the next op, and continue. If we come to a point which
2688we may need to backtrack to on failure such as (A|B|C), we push a
2689backtrack state onto the backtrack stack. On failure, we pop the top
2690state, and re-enter the loop at the state indicated. If there are no more
2691states to pop, we return failure.
2692
2693Sometimes we also need to backtrack on success; for example /A+/, where
2694after successfully matching one A, we need to go back and try to
2695match another one; similarly for lookahead assertions: if the assertion
2696completes successfully, we backtrack to the state just before the assertion
2697and then carry on. In these cases, the pushed state is marked as
2698'backtrack on success too'. This marking is in fact done by a chain of
2699pointers, each pointing to the previous 'yes' state. On success, we pop to
2700the nearest yes state, discarding any intermediate failure-only states.
2701Sometimes a yes state is pushed just to force some cleanup code to be
2702called at the end of a successful match or submatch; e.g. (??{$re}) uses
2703it to free the inner regex.
2704
2705Note that failure backtracking rewinds the cursor position, while
2706success backtracking leaves it alone.
2707
2708A pattern is complete when the END op is executed, while a subpattern
2709such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2710ops trigger the "pop to last yes state if any, otherwise return true"
2711behaviour.
2712
2713A common convention in this function is to use A and B to refer to the two
2714subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2715the subpattern to be matched possibly multiple times, while B is the entire
2716rest of the pattern. Variable and state names reflect this convention.
2717
2718The states in the main switch are the union of ops and failure/success of
2719substates associated with with that op. For example, IFMATCH is the op
2720that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2721'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2722successfully matched A and IFMATCH_A_fail is a state saying that we have
2723just failed to match A. Resume states always come in pairs. The backtrack
2724state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2725at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2726on success or failure.
2727
2728The struct that holds a backtracking state is actually a big union, with
2729one variant for each major type of op. The variable st points to the
2730top-most backtrack struct. To make the code clearer, within each
2731block of code we #define ST to alias the relevant union.
2732
2733Here's a concrete example of a (vastly oversimplified) IFMATCH
2734implementation:
2735
2736 switch (state) {
2737 ....
2738
2739#define ST st->u.ifmatch
2740
2741 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2742 ST.foo = ...; // some state we wish to save
95b24440 2743 ...
bf1f174e
DM
2744 // push a yes backtrack state with a resume value of
2745 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2746 // first node of A:
2747 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2748 // NOTREACHED
2749
2750 case IFMATCH_A: // we have successfully executed A; now continue with B
2751 next = B;
2752 bar = ST.foo; // do something with the preserved value
2753 break;
2754
2755 case IFMATCH_A_fail: // A failed, so the assertion failed
2756 ...; // do some housekeeping, then ...
2757 sayNO; // propagate the failure
2758
2759#undef ST
95b24440 2760
bf1f174e
DM
2761 ...
2762 }
95b24440 2763
bf1f174e
DM
2764For any old-timers reading this who are familiar with the old recursive
2765approach, the code above is equivalent to:
95b24440 2766
bf1f174e
DM
2767 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2768 {
2769 int foo = ...
95b24440 2770 ...
bf1f174e
DM
2771 if (regmatch(A)) {
2772 next = B;
2773 bar = foo;
2774 break;
95b24440 2775 }
bf1f174e
DM
2776 ...; // do some housekeeping, then ...
2777 sayNO; // propagate the failure
95b24440 2778 }
bf1f174e
DM
2779
2780The topmost backtrack state, pointed to by st, is usually free. If you
2781want to claim it, populate any ST.foo fields in it with values you wish to
2782save, then do one of
2783
2784 PUSH_STATE_GOTO(resume_state, node);
2785 PUSH_YES_STATE_GOTO(resume_state, node);
2786
2787which sets that backtrack state's resume value to 'resume_state', pushes a
2788new free entry to the top of the backtrack stack, then goes to 'node'.
2789On backtracking, the free slot is popped, and the saved state becomes the
2790new free state. An ST.foo field in this new top state can be temporarily
2791accessed to retrieve values, but once the main loop is re-entered, it
2792becomes available for reuse.
2793
2794Note that the depth of the backtrack stack constantly increases during the
2795left-to-right execution of the pattern, rather than going up and down with
2796the pattern nesting. For example the stack is at its maximum at Z at the
2797end of the pattern, rather than at X in the following:
2798
2799 /(((X)+)+)+....(Y)+....Z/
2800
2801The only exceptions to this are lookahead/behind assertions and the cut,
2802(?>A), which pop all the backtrack states associated with A before
2803continuing.
2804
486ec47a 2805Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2806PL_regmatch_state and st always point to the currently active state,
2807and PL_regmatch_slab points to the slab currently containing
2808PL_regmatch_state. The first time regmatch() is called, the first slab is
2809allocated, and is never freed until interpreter destruction. When the slab
2810is full, a new one is allocated and chained to the end. At exit from
2811regmatch(), slabs allocated since entry are freed.
2812
2813*/
95b24440 2814
40a82448 2815
5bc10b2c 2816#define DEBUG_STATE_pp(pp) \
265c4333 2817 DEBUG_STATE_r({ \
f2ed9b32 2818 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2819 PerlIO_printf(Perl_debug_log, \
5d458dd8 2820 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2821 depth*2, "", \
13d6edb4 2822 PL_reg_name[st->resume_state], \
5d458dd8
YO
2823 ((st==yes_state||st==mark_state) ? "[" : ""), \
2824 ((st==yes_state) ? "Y" : ""), \
2825 ((st==mark_state) ? "M" : ""), \
2826 ((st==yes_state||st==mark_state) ? "]" : "") \
2827 ); \
265c4333 2828 });
5bc10b2c 2829
40a82448 2830
3dab1dad 2831#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2832
3df15adc 2833#ifdef DEBUGGING
5bc10b2c 2834
ab3bbdeb 2835STATIC void
f2ed9b32 2836S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2837 const char *start, const char *end, const char *blurb)
2838{
efd26800 2839 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2840
2841 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2842
ab3bbdeb
YO
2843 if (!PL_colorset)
2844 reginitcolors();
2845 {
2846 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2847 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2848
f2ed9b32 2849 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2850 start, end - start, 60);
2851
2852 PerlIO_printf(Perl_debug_log,
2853 "%s%s REx%s %s against %s\n",
2854 PL_colors[4], blurb, PL_colors[5], s0, s1);
2855
f2ed9b32 2856 if (utf8_target||utf8_pat)
1de06328
YO
2857 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2858 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2859 utf8_pat && utf8_target ? " and " : "",
2860 utf8_target ? "string" : ""
ab3bbdeb
YO
2861 );
2862 }
2863}
3df15adc
YO
2864
2865STATIC void
786e8c11
YO
2866S_dump_exec_pos(pTHX_ const char *locinput,
2867 const regnode *scan,
2868 const char *loc_regeol,
2869 const char *loc_bostr,
2870 const char *loc_reg_starttry,
f2ed9b32 2871 const bool utf8_target)
07be1b83 2872{
786e8c11 2873 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2874 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2875 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2876 /* The part of the string before starttry has one color
2877 (pref0_len chars), between starttry and current
2878 position another one (pref_len - pref0_len chars),
2879 after the current position the third one.
2880 We assume that pref0_len <= pref_len, otherwise we
2881 decrease pref0_len. */
786e8c11
YO
2882 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2883 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2884 int pref0_len;
2885
7918f24d
NC
2886 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2887
f2ed9b32 2888 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2889 pref_len++;
786e8c11
YO
2890 pref0_len = pref_len - (locinput - loc_reg_starttry);
2891 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2892 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2893 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2894 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2895 l--;
2896 if (pref0_len < 0)
2897 pref0_len = 0;
2898 if (pref0_len > pref_len)
2899 pref0_len = pref_len;
2900 {
f2ed9b32 2901 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2902
ab3bbdeb 2903 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2904 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2905
ab3bbdeb 2906 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2907 (locinput - pref_len + pref0_len),
1de06328 2908 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2909
ab3bbdeb 2910 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2911 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2912
1de06328 2913 const STRLEN tlen=len0+len1+len2;
3df15adc 2914 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2915 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2916 (IV)(locinput - loc_bostr),
07be1b83 2917 len0, s0,
07be1b83 2918 len1, s1,
07be1b83 2919 (docolor ? "" : "> <"),
07be1b83 2920 len2, s2,
f9f4320a 2921 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2922 "");
2923 }
2924}
3df15adc 2925
07be1b83
YO
2926#endif
2927
0a4db386
YO
2928/* reg_check_named_buff_matched()
2929 * Checks to see if a named buffer has matched. The data array of
2930 * buffer numbers corresponding to the buffer is expected to reside
2931 * in the regexp->data->data array in the slot stored in the ARG() of
2932 * node involved. Note that this routine doesn't actually care about the
2933 * name, that information is not preserved from compilation to execution.
2934 * Returns the index of the leftmost defined buffer with the given name
2935 * or 0 if non of the buffers matched.
2936 */
2937STATIC I32
7918f24d
NC
2938S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2939{
0a4db386 2940 I32 n;
f8fc2ecf 2941 RXi_GET_DECL(rex,rexi);
ad64d0ec 2942 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2943 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2944
2945 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2946
0a4db386
YO
2947 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2948 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2949 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2950 {
2951 return nums[n];
2952 }
2953 }
2954 return 0;
2955}
2956
2f554ef7
DM
2957
2958/* free all slabs above current one - called during LEAVE_SCOPE */
2959
2960STATIC void
2961S_clear_backtrack_stack(pTHX_ void *p)
2962{
2963 regmatch_slab *s = PL_regmatch_slab->next;
2964 PERL_UNUSED_ARG(p);
2965
2966 if (!s)
2967 return;
2968 PL_regmatch_slab->next = NULL;
2969 while (s) {
2970 regmatch_slab * const osl = s;
2971 s = s->next;
2972 Safefree(osl);
2973 }
2974}
2975
2976
28d8d7f4
YO
2977#define SETREX(Re1,Re2) \
2978 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2979 Re1 = (Re2)
2980
d6a28714 2981STATIC I32 /* 0 failure, 1 success */
24b23f37 2982S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2983{
a35a87e7 2984#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2985 dMY_CXT;
2986#endif
27da23d5 2987 dVAR;
f2ed9b32 2988 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 2989 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2990 REGEXP *rex_sv = reginfo->prog;
2991 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2992 RXi_GET_DECL(rex,rexi);
2f554ef7 2993 I32 oldsave;
5d9a96ca
DM
2994 /* the current state. This is a cached copy of PL_regmatch_state */
2995 register regmatch_state *st;
5d9a96ca
DM
2996 /* cache heavy used fields of st in registers */
2997 register regnode *scan;
2998 register regnode *next;
438e9bae 2999 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3000 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3001 register char *locinput = PL_reginput;
5d9a96ca 3002 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3003
b69b0499 3004 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3005 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3006 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3007 const U32 max_nochange_depth =
3008 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3009 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3010 regmatch_state *yes_state = NULL; /* state to pop to on success of
3011 subpattern */
e2e6a0f1
YO
3012 /* mark_state piggy backs on the yes_state logic so that when we unwind
3013 the stack on success we can update the mark_state as we go */
3014 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3015 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3016 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3017 U32 state_num;
5d458dd8
YO
3018 bool no_final = 0; /* prevent failure from backtracking? */
3019 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3020 char *startpoint = PL_reginput;
5d458dd8
YO
3021 SV *popmark = NULL; /* are we looking for a mark? */
3022 SV *sv_commit = NULL; /* last mark name seen in failure */
3023 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3024 during a successful match */
5d458dd8
YO
3025 U32 lastopen = 0; /* last open we saw */
3026 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3027 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3028 /* these three flags are set by various ops to signal information to
3029 * the very next op. They have a useful lifetime of exactly one loop
3030 * iteration, and are not preserved or restored by state pushes/pops
3031 */
3032 bool sw = 0; /* the condition value in (?(cond)a|b) */
3033 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3034 int logical = 0; /* the following EVAL is:
3035 0: (?{...})
3036 1: (?(?{...})X|Y)
3037 2: (??{...})
3038 or the following IFMATCH/UNLESSM is:
3039 false: plain (?=foo)
3040 true: used as a condition: (?(?=foo))
3041 */
95b24440 3042#ifdef DEBUGGING
e68ec53f 3043 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3044#endif
3045
7918f24d
NC
3046 PERL_ARGS_ASSERT_REGMATCH;
3047
3b57cd43 3048 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3049 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3050 }));
5d9a96ca
DM
3051 /* on first ever call to regmatch, allocate first slab */
3052 if (!PL_regmatch_slab) {
3053 Newx(PL_regmatch_slab, 1, regmatch_slab);
3054 PL_regmatch_slab->prev = NULL;
3055 PL_regmatch_slab->next = NULL;
86545054 3056 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3057 }
3058
2f554ef7
DM
3059 oldsave = PL_savestack_ix;
3060 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3061 SAVEVPTR(PL_regmatch_slab);
3062 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3063
3064 /* grab next free state slot */
3065 st = ++PL_regmatch_state;
86545054 3066 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3067 st = PL_regmatch_state = S_push_slab(aTHX);
3068
d6a28714
JH
3069 /* Note that nextchr is a byte even in UTF */
3070 nextchr = UCHARAT(locinput);
3071 scan = prog;
3072 while (scan != NULL) {
8ba1375e 3073
a3621e74 3074 DEBUG_EXECUTE_r( {
6136c704 3075 SV * const prop = sv_newmortal();
1de06328 3076 regnode *rnext=regnext(scan);
f2ed9b32 3077 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3078 regprop(rex, prop, scan);
07be1b83
YO
3079
3080 PerlIO_printf(Perl_debug_log,
3081 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3082 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3083 SvPVX_const(prop),
1de06328 3084 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3085 0 : (IV)(rnext - rexi->program));
2a782b5b 3086 });
d6a28714
JH
3087
3088 next = scan + NEXT_OFF(scan);
3089 if (next == scan)
3090 next = NULL;
40a82448 3091 state_num = OP(scan);
d6a28714 3092
40a82448 3093 reenter_switch:
34a81e2b
B
3094
3095 assert(PL_reglastparen == &rex->lastparen);
3096 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3097 assert(PL_regoffs == rex->offs);
3098
40a82448 3099 switch (state_num) {
d6a28714 3100 case BOL:
7fba1cd6 3101 if (locinput == PL_bostr)
d6a28714 3102 {
3b0527fe 3103 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3104 break;
3105 }
d6a28714
JH
3106 sayNO;
3107 case MBOL:
12d33761
HS
3108 if (locinput == PL_bostr ||
3109 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3110 {
b8c5462f
JH
3111 break;
3112 }
d6a28714
JH
3113 sayNO;
3114 case SBOL:
c2a73568 3115 if (locinput == PL_bostr)
b8c5462f 3116 break;
d6a28714
JH
3117 sayNO;
3118 case GPOS:
3b0527fe 3119 if (locinput == reginfo->ganch)
d6a28714
JH
3120 break;
3121 sayNO;
ee9b8eae
YO
3122
3123 case KEEPS:
3124 /* update the startpoint */
f0ab9afb 3125 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3126 PL_reginput = locinput;
f0ab9afb 3127 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3128 PUSH_STATE_GOTO(KEEPS_next, next);
3129 /*NOT-REACHED*/
3130 case KEEPS_next_fail:
3131 /* rollback the start point change */
f0ab9afb 3132 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3133 sayNO_SILENT;
3134 /*NOT-REACHED*/
d6a28714 3135 case EOL:
d6a28714
JH
3136 goto seol;
3137 case MEOL:
d6a28714 3138 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3139 sayNO;
b8c5462f 3140 break;
d6a28714
JH
3141 case SEOL:
3142 seol:
3143 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3144 sayNO;
d6a28714 3145 if (PL_regeol - locinput > 1)
b8c5462f 3146 sayNO;
b8c5462f 3147 break;
d6a28714
JH
3148 case EOS:
3149 if (PL_regeol != locinput)
b8c5462f 3150 sayNO;
d6a28714 3151 break;
ffc61ed2 3152 case SANY:
d6a28714 3153 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3154 sayNO;
f2ed9b32 3155 if (utf8_target) {
f33976b4
DB
3156 locinput += PL_utf8skip[nextchr];
3157 if (locinput > PL_regeol)
3158 sayNO;
3159 nextchr = UCHARAT(locinput);
3160 }
3161 else
3162 nextchr = UCHARAT(++locinput);
3163 break;
3164 case CANY:
3165 if (!nextchr && locinput >= PL_regeol)
3166 sayNO;
b8c5462f 3167 nextchr = UCHARAT(++locinput);
a0d0e21e 3168 break;
ffc61ed2 3169 case REG_ANY:
1aa99e6b
IH
3170 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3171 sayNO;
f2ed9b32 3172 if (utf8_target) {
b8c5462f 3173 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3174 if (locinput > PL_regeol)
3175 sayNO;
a0ed51b3 3176 nextchr = UCHARAT(locinput);
a0ed51b3 3177 }
1aa99e6b
IH
3178 else
3179 nextchr = UCHARAT(++locinput);
a0ed51b3 3180 break;
166ba7cd
DM
3181
3182#undef ST
3183#define ST st->u.trie
786e8c11
YO
3184 case TRIEC:
3185 /* In this case the charclass data is available inline so
3186 we can fail fast without a lot of extra overhead.
3187 */
f2ed9b32 3188 if (scan->flags == EXACT || !utf8_target) {
786e8c11
YO
3189 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3190 DEBUG_EXECUTE_r(
3191 PerlIO_printf(Perl_debug_log,
3192 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3193 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3194 );
3195 sayNO_SILENT;
3196 /* NOTREACHED */
3197 }
3198 }
3199 /* FALL THROUGH */
5b47454d 3200 case TRIE:
2e64971a
DM
3201 /* the basic plan of execution of the trie is:
3202 * At the beginning, run though all the states, and
3203 * find the longest-matching word. Also remember the position
3204 * of the shortest matching word. For example, this pattern:
3205 * 1 2 3 4 5
3206 * ab|a|x|abcd|abc
3207 * when matched against the string "abcde", will generate
3208 * accept states for all words except 3, with the longest
3209 * matching word being 4, and the shortest being 1 (with
3210 * the position being after char 1 of the string).
3211 *
3212 * Then for each matching word, in word order (i.e. 1,2,4,5),
3213 * we run the remainder of the pattern; on each try setting
3214 * the current position to the character following the word,
3215 * returning to try the next word on failure.
3216 *
3217 * We avoid having to build a list of words at runtime by
3218 * using a compile-time structure, wordinfo[].prev, which
3219 * gives, for each word, the previous accepting word (if any).
3220 * In the case above it would contain the mappings 1->2, 2->0,
3221 * 3->0, 4->5, 5->1. We can use this table to generate, from
3222 * the longest word (4 above), a list of all words, by
3223 * following the list of prev pointers; this gives us the
3224 * unordered list 4,5,1,2. Then given the current word we have
3225 * just tried, we can go through the list and find the
3226 * next-biggest word to try (so if we just failed on word 2,
3227 * the next in the list is 4).
3228 *
3229 * Since at runtime we don't record the matching position in
3230 * the string for each word, we have to work that out for
3231 * each word we're about to process. The wordinfo table holds
3232 * the character length of each word; given that we recorded
3233 * at the start: the position of the shortest word and its
3234 * length in chars, we just need to move the pointer the
3235 * difference between the two char lengths. Depending on
3236 * Unicode status and folding, that's cheap or expensive.
3237 *
3238 * This algorithm is optimised for the case where are only a
3239 * small number of accept states, i.e. 0,1, or maybe 2.
3240 * With lots of accepts states, and having to try all of them,
3241 * it becomes quadratic on number of accept states to find all
3242 * the next words.
3243 */
3244
3dab1dad 3245 {
07be1b83 3246 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3247 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3248
3249 /* what trie are we using right now */
be8e71aa 3250 reg_trie_data * const trie
f8fc2ecf 3251 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3252 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3253 U32 state = trie->startstate;
166ba7cd 3254
3dab1dad
YO
3255 if (trie->bitmap && trie_type != trie_utf8_fold &&
3256 !TRIE_BITMAP_TEST(trie,*locinput)
3257 ) {
3258 if (trie->states[ state ].wordnum) {
3259 DEBUG_EXECUTE_r(
3260 PerlIO_printf(Perl_debug_log,
3261 "%*s %smatched empty string...%s\n",
5bc10b2c 3262 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3263 );
20dbff7c
YO
3264 if (!trie->jump)
3265 break;
3dab1dad
YO
3266 } else {
3267 DEBUG_EXECUTE_r(
3268 PerlIO_printf(Perl_debug_log,
786e8c11 3269 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3270 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3271 );
3272 sayNO_SILENT;
3273 }
3274 }
166ba7cd 3275
786e8c11
YO
3276 {
3277 U8 *uc = ( U8* )locinput;
3278
3279 STRLEN len = 0;
3280 STRLEN foldlen = 0;
3281 U8 *uscan = (U8*)NULL;
786e8c11 3282 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3283 U32 charcount = 0; /* how many input chars we have matched */
3284 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3285
786e8c11
YO
3286 ST.B = next;
3287 ST.jump = trie->jump;
786e8c11 3288 ST.me = scan;
2e64971a
DM
3289 ST.firstpos = NULL;
3290 ST.longfold = FALSE; /* char longer if folded => it's harder */
3291 ST.nextword = 0;
3292
3293 /* fully traverse the TRIE; note the position of the
3294 shortest accept state and the wordnum of the longest
3295 accept state */
07be1b83 3296
a3621e74 3297 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3298 U32 base = trie->states[ state ].trans.base;
f9f4320a 3299 UV uvc = 0;
acb909b4 3300 U16 charid = 0;
2e64971a
DM
3301 U16 wordnum;
3302 wordnum = trie->states[ state ].wordnum;
3303
3304 if (wordnum) { /* it's an accept state */
3305 if (!accepted) {
3306 accepted = 1;
3307 /* record first match position */
3308 if (ST.longfold) {
3309 ST.firstpos = (U8*)locinput;
3310 ST.firstchars = 0;
5b47454d 3311 }
2e64971a
DM
3312 else {
3313 ST.firstpos = uc;
3314 ST.firstchars = charcount;
3315 }
3316 }
3317 if (!ST.nextword || wordnum < ST.nextword)
3318 ST.nextword = wordnum;
3319 ST.topword = wordnum;
786e8c11 3320 }
a3621e74 3321
07be1b83 3322 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3323 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3324 PerlIO_printf( Perl_debug_log,
2e64971a 3325 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3326 2+depth * 2, "", PL_colors[4],
2e64971a 3327 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3328 });
a3621e74 3329
2e64971a 3330 /* read a char and goto next state */
a3621e74 3331 if ( base ) {
6dd2be57 3332 I32 offset;
55eed653
NC
3333 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3334 uscan, len, uvc, charid, foldlen,
3335 foldbuf, uniflags);
2e64971a
DM
3336 charcount++;
3337 if (foldlen>0)
3338 ST.longfold = TRUE;
5b47454d 3339 if (charid &&
6dd2be57
DM
3340 ( ((offset =
3341 base + charid - 1 - trie->uniquecharcount)) >= 0)
3342
3343 && ((U32)offset < trie->lasttrans)
3344 && trie->trans[offset].check == state)
5b47454d 3345 {
6dd2be57 3346 state = trie->trans[offset].next;
5b47454d
DM
3347 }
3348 else {
3349 state = 0;
3350 }
3351 uc += len;
3352
3353 }
3354 else {
a3621e74
YO
3355 state = 0;
3356 }
3357 DEBUG_TRIE_EXECUTE_r(
e4584336 3358 PerlIO_printf( Perl_debug_log,
786e8c11 3359 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3360 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3361 );
3362 }
2e64971a 3363 if (!accepted)
a3621e74 3364 sayNO;
a3621e74 3365
2e64971a
DM
3366 /* calculate total number of accept states */
3367 {
3368 U16 w = ST.topword;
3369 accepted = 0;
3370 while (w) {
3371 w = trie->wordinfo[w].prev;
3372 accepted++;
3373 }
3374 ST.accepted = accepted;
3375 }