This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make regexp_paren_pair.start_tmp an offset
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
7e2509c1
KW
94/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
95 * call if there are no complications: i.e., if everything matchable is
96 * straight forward in the bitmap */
af364d03
KW
97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
98 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 99
c277df42
IZ
100/*
101 * Forwards.
102 */
103
f2ed9b32 104#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 105#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 106
3dab1dad
YO
107#define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
52657f30 109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
110 : (U8*)(pos + off))
111#define HOPBACKc(pos, off) \
07be1b83
YO
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
8e11feef 115 ? (U8*)pos - off \
3dab1dad 116 : NULL)
efb30f32 117
e7409c1b 118#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 119#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 120
20d0b1e9 121/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
122#ifdef EBCDIC
123 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
124 * skip the check on EBCDIC platforms */
125# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
126#else
127# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
128 if (!CAT2(PL_utf8_,class)) { \
129 bool ok; \
130 ENTER; save_re_context(); \
131 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 133#endif
37e2e78e
KW
134
135/* Doesn't do an assert to verify that is correct */
136#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 137 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 138 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
139 ENTER; save_re_context(); \
140 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 LEAVE; } } STMT_END
37e2e78e 142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
144#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
145#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 146
37e2e78e 147#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
148 LOAD_UTF8_CHARCLASS(X_begin, " "); \
149 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
150 /* These are utf8 constants, and not utf-ebcdic constants, so the \
151 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 152 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
153 \
154 /* No asserts are done for these, in case called on an early \
155 * Unicode version in which they map to nothing */ \
d275fa5e
CB
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 163
1dcf4a1b 164#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 165
ee9a90b8
KW
166/* The actual code for CCC_TRY, which uses several variables from the routine
167 * it's callable from. It is designed to be the bulk of a case statement.
168 * FUNC is the macro or function to call on non-utf8 targets that indicate if
169 * nextchr matches the class.
170 * UTF8_TEST is the whole test string to use for utf8 targets
171 * LOAD is what to use to test, and if not present to load in the swash for the
172 * class
173 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
174 * UTF8_TEST test.
175 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
176 * utf8 and a variant, load the swash if necessary and test using the utf8
177 * test. Advance to the next character if test is ok, otherwise fail; If not
178 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
179 * fails, or advance to the next character */
180
181#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
182 if (locinput >= PL_regeol) { \
183 sayNO; \
184 } \
185 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
186 LOAD_UTF8_CHARCLASS(CLASS, STR); \
187 if (POS_OR_NEG (UTF8_TEST)) { \
188 sayNO; \
189 } \
190 locinput += PL_utf8skip[nextchr]; \
191 nextchr = UCHARAT(locinput); \
192 break; \
193 } \
194 if (POS_OR_NEG (FUNC(nextchr))) { \
195 sayNO; \
196 } \
197 nextchr = UCHARAT(++locinput); \
980866de
KW
198 break;
199
ee9a90b8
KW
200/* Handle the non-locale cases for a character class and its complement. It
201 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
202 * This is because that code fails when the test succeeds, so we want to have
203 * the test fail so that the code succeeds. The swash is stored in a
204 * predictable PL_ place */
cfaf538b
KW
205#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
206 CLASS, STR) \
ee9a90b8
KW
207 case NAME: \
208 _CCC_TRY_CODE( !, FUNC, \
209 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
210 (U8*)locinput, TRUE)), \
211 CLASS, STR) \
212 case NNAME: \
1dcf4a1b 213 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
214 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
215 (U8*)locinput, TRUE)), \
216 CLASS, STR) \
217
218/* Generate the case statements for both locale and non-locale character
219 * classes in regmatch for classes that don't have special unicode semantics.
220 * Locales don't use an immediate swash, but an intermediary special locale
221 * function that is called on the pointer to the current place in the input
222 * string. That function will resolve to needing the same swash. One might
223 * think that because we don't know what the locale will match, we shouldn't
224 * check with the swash loading function that it loaded properly; ie, that we
225 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
226 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 * irrelevant here */
228#define CCC_TRY(NAME, NNAME, FUNC, \
229 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 230 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
231 CLASS, STR) \
232 case NAMEL: \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 case NNAMEL: \
236 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
237 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
238 CLASS, STR) \
cfaf538b
KW
239 case NAMEA: \
240 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
244 nextchr = UCHARAT(++locinput); \
245 break; \
246 case NNAMEA: \
247 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 sayNO; \
249 } \
250 if (utf8_target) { \
251 locinput += PL_utf8skip[nextchr]; \
252 nextchr = UCHARAT(locinput); \
253 } \
254 else { \
255 nextchr = UCHARAT(++locinput); \
256 } \
257 break; \
ee9a90b8
KW
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260
261/* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263#define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
cfaf538b 266 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 267 CLASS, STR) \
cfaf538b
KW
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
271 CLASS, STR) \
ee9a90b8 272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 273
3dab1dad
YO
274/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275
5f80c4cf 276/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
277/* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 *
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
285*/
3e901dc0
YO
286#define JUMPABLE(rn) ( \
287 OP(rn) == OPEN || \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == EVAL || \
cca55fe3
JP
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 292 OP(rn) == KEEPS || \
3dab1dad 293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 294)
ee9b8eae 295#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 296
ee9b8eae
YO
297#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298
299#if 0
300/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 303#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305
306#else
307/* ... so we use this as its faster. */
308#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 309#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
310#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
312
313#endif
e2d8ce26 314
a84d97b6
HS
315/*
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318*/
cca55fe3 319#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 323 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 324 else if (type == PLUS) \
cca55fe3 325 rn = NEXTOPER(rn); \
3dab1dad 326 else if (type == IFMATCH) \
a84d97b6 327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 328 else rn += NEXT_OFF(rn); \
3dab1dad 329 } \
5f80c4cf 330} STMT_END
74750237 331
c476f425 332
acfe0abc 333static void restore_pos(pTHX_ void *arg);
51371543 334
620d5b66 335#define REGCP_PAREN_ELEMS 4
b93070ed 336#define REGCP_OTHER_ELEMS 4
e0fa7e2b 337#define REGCP_FRAME_ELEMS 1
620d5b66
NC
338/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
339 * are needed for the regexp context stack bookkeeping. */
340
76e3520e 341STATIC CHECKPOINT
b93070ed 342S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 343{
97aff369 344 dVAR;
a3b680e6 345 const int retval = PL_savestack_ix;
a3b680e6 346 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
347 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
348 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 349 int p;
40a82448 350 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 351
b93070ed
DM
352 PERL_ARGS_ASSERT_REGCPPUSH;
353
e49a9654 354 if (paren_elems_to_push < 0)
5637ef5b
NC
355 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
356 paren_elems_to_push);
e49a9654 357
e0fa7e2b
NC
358 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
359 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
360 " out of range (%lu-%ld)",
361 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 362
620d5b66 363 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 364
3280af22 365 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 366/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
367 SSPUSHINT(rex->offs[p].end);
368 SSPUSHINT(rex->offs[p].start);
1ca2007e 369 SSPUSHINT(rex->offs[p].start_tmp);
a0d0e21e 370 SSPUSHINT(p);
e7707071 371 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 372 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
1ca2007e
DM
373 (UV)p,
374 (IV)rex->offs[p].start,
375 (IV)rex->offs[p].start_tmp,
b93070ed 376 (IV)rex->offs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 380 SSPUSHINT(PL_regsize);
b93070ed
DM
381 SSPUSHINT(rex->lastparen);
382 SSPUSHINT(rex->lastcloseparen);
3280af22 383 SSPUSHPTR(PL_reginput);
e0fa7e2b 384 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 385
a0d0e21e
LW
386 return retval;
387}
388
c277df42 389/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
390#define REGCP_SET(cp) \
391 DEBUG_STATE_r( \
ab3bbdeb 392 PerlIO_printf(Perl_debug_log, \
e4f74956 393 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
394 (IV)PL_savestack_ix)); \
395 cp = PL_savestack_ix
c3464db5 396
ab3bbdeb 397#define REGCP_UNWIND(cp) \
e4f74956 398 DEBUG_STATE_r( \
ab3bbdeb 399 if (cp != PL_savestack_ix) \
e4f74956
YO
400 PerlIO_printf(Perl_debug_log, \
401 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
402 (IV)(cp), (IV)PL_savestack_ix)); \
403 regcpblow(cp)
c277df42 404
76e3520e 405STATIC char *
b93070ed 406S_regcppop(pTHX_ regexp *rex)
a0d0e21e 407{
97aff369 408 dVAR;
e0fa7e2b 409 UV i;
a0d0e21e 410 char *input;
a3621e74
YO
411 GET_RE_DEBUG_FLAGS_DECL;
412
7918f24d
NC
413 PERL_ARGS_ASSERT_REGCPPOP;
414
b1ce53c5 415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 416 i = SSPOPUV;
e0fa7e2b
NC
417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 419 input = (char *) SSPOPPTR;
b93070ed
DM
420 rex->lastcloseparen = SSPOPINT;
421 rex->lastparen = SSPOPINT;
3280af22 422 PL_regsize = SSPOPINT;
b1ce53c5 423
620d5b66 424 i -= REGCP_OTHER_ELEMS;
b1ce53c5 425 /* Now restore the parentheses context. */
620d5b66 426 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 427 I32 tmps;
097eb12c 428 U32 paren = (U32)SSPOPINT;
1ca2007e 429 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 430 rex->offs[paren].start = SSPOPINT;
cf93c79d 431 tmps = SSPOPINT;
b93070ed
DM
432 if (paren <= rex->lastparen)
433 rex->offs[paren].end = tmps;
e7707071 434 DEBUG_BUFFERS_r(
c3464db5 435 PerlIO_printf(Perl_debug_log,
b900a521 436 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
1ca2007e
DM
437 (UV)paren,
438 (IV)rex->offs[paren].start,
439 (IV)rex->offs[paren].start_tmp,
b93070ed
DM
440 (IV)rex->offs[paren].end,
441 (paren > rex->lastparen ? "(no)" : ""));
c277df42 442 );
a0d0e21e 443 }
e7707071 444 DEBUG_BUFFERS_r(
b93070ed 445 if (rex->lastparen + 1 <= rex->nparens) {
c3464db5 446 PerlIO_printf(Perl_debug_log,
faccc32b 447 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
b93070ed 448 (IV)(rex->lastparen + 1), (IV)rex->nparens);
c277df42
IZ
449 }
450 );
daf18116 451#if 1
dafc8851
JH
452 /* It would seem that the similar code in regtry()
453 * already takes care of this, and in fact it is in
454 * a better location to since this code can #if 0-ed out
455 * but the code in regtry() is needed or otherwise tests
456 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
457 * (as of patchlevel 7877) will fail. Then again,
458 * this code seems to be necessary or otherwise
225593e1
DM
459 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
460 * --jhi updated by dapm */
b93070ed 461 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 462 if (i > PL_regsize)
b93070ed
DM
463 rex->offs[i].start = -1;
464 rex->offs[i].end = -1;
a0d0e21e 465 }
dafc8851 466#endif
a0d0e21e
LW
467 return input;
468}
469
02db2b7b 470#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 471
a687059c 472/*
e50aee73 473 * pregexec and friends
a687059c
LW
474 */
475
76234dfb 476#ifndef PERL_IN_XSUB_RE
a687059c 477/*
c277df42 478 - pregexec - match a regexp against a string
a687059c 479 */
c277df42 480I32
49d7dfbc 481Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 482 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
483/* strend: pointer to null at end of string */
484/* strbeg: real beginning of string */
485/* minend: end of match must be >=minend after stringarg. */
486/* nosave: For optimizations. */
487{
7918f24d
NC
488 PERL_ARGS_ASSERT_PREGEXEC;
489
c277df42 490 return
9041c2e3 491 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
492 nosave ? 0 : REXEC_COPY_STR);
493}
76234dfb 494#endif
22e551b9 495
9041c2e3 496/*
cad2e5aa
JH
497 * Need to implement the following flags for reg_anch:
498 *
499 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
500 * USE_INTUIT_ML
501 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
502 * INTUIT_AUTORITATIVE_ML
503 * INTUIT_ONCE_NOML - Intuit can match in one location only.
504 * INTUIT_ONCE_ML
505 *
506 * Another flag for this function: SECOND_TIME (so that float substrs
507 * with giant delta may be not rechecked).
508 */
509
510/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
511
3f7c398e 512/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
513 Otherwise, only SvCUR(sv) is used to get strbeg. */
514
515/* XXXX We assume that strpos is strbeg unless sv. */
516
6eb5f6b9
JH
517/* XXXX Some places assume that there is a fixed substring.
518 An update may be needed if optimizer marks as "INTUITable"
519 RExen without fixed substrings. Similarly, it is assumed that
520 lengths of all the strings are no more than minlen, thus they
521 cannot come from lookahead.
40d049e4
YO
522 (Or minlen should take into account lookahead.)
523 NOTE: Some of this comment is not correct. minlen does now take account
524 of lookahead/behind. Further research is required. -- demerphq
525
526*/
6eb5f6b9 527
2c2d71f5
JH
528/* A failure to find a constant substring means that there is no need to make
529 an expensive call to REx engine, thus we celebrate a failure. Similarly,
530 finding a substring too deep into the string means that less calls to
30944b6d
IZ
531 regtry() should be needed.
532
533 REx compiler's optimizer found 4 possible hints:
534 a) Anchored substring;
535 b) Fixed substring;
536 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 537 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 538 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
539 string which does not contradict any of them.
540 */
2c2d71f5 541
6eb5f6b9
JH
542/* Most of decisions we do here should have been done at compile time.
543 The nodes of the REx which we used for the search should have been
544 deleted from the finite automaton. */
545
cad2e5aa 546char *
288b8c02 547Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 548 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 549{
97aff369 550 dVAR;
288b8c02 551 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 552 register I32 start_shift = 0;
cad2e5aa 553 /* Should be nonnegative! */
b7953727 554 register I32 end_shift = 0;
2c2d71f5
JH
555 register char *s;
556 register SV *check;
a1933d95 557 char *strbeg;
cad2e5aa 558 char *t;
f2ed9b32 559 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 560 I32 ml_anch;
bd61b366
SS
561 register char *other_last = NULL; /* other substr checked before this */
562 char *check_at = NULL; /* check substr found at this pos */
d8080198 563 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 564 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 565 RXi_GET_DECL(prog,progi);
30944b6d 566#ifdef DEBUGGING
890ce7af 567 const char * const i_strpos = strpos;
30944b6d 568#endif
a3621e74
YO
569 GET_RE_DEBUG_FLAGS_DECL;
570
7918f24d 571 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
572 PERL_UNUSED_ARG(flags);
573 PERL_UNUSED_ARG(data);
7918f24d 574
f2ed9b32 575 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 576
3c8556c3 577 if (RX_UTF8(rx)) {
b8d68ded
JH
578 PL_reg_flags |= RF_utf8;
579 }
ab3bbdeb 580 DEBUG_EXECUTE_r(
f2ed9b32 581 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
582 sv ? "Guessing start of match in sv for"
583 : "Guessing start of match in string for");
2a782b5b 584 );
cad2e5aa 585
c344f387
JH
586 /* CHR_DIST() would be more correct here but it makes things slow. */
587 if (prog->minlen > strend - strpos) {
a3621e74 588 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 589 "String too short... [re_intuit_start]\n"));
cad2e5aa 590 goto fail;
2c2d71f5 591 }
1de06328 592
a1933d95 593 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 594 PL_regeol = strend;
f2ed9b32 595 if (utf8_target) {
33b8afdf
JH
596 if (!prog->check_utf8 && prog->check_substr)
597 to_utf8_substr(prog);
598 check = prog->check_utf8;
599 } else {
600 if (!prog->check_substr && prog->check_utf8)
601 to_byte_substr(prog);
602 check = prog->check_substr;
603 }
1de06328 604 if (check == &PL_sv_undef) {
a3621e74 605 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 606 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
607 goto fail;
608 }
bbe252da
YO
609 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
610 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
611 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 612 && !multiline ) ); /* Check after \n? */
cad2e5aa 613
7e25d62c 614 if (!ml_anch) {
bbe252da
YO
615 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
616 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 617 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
618 && sv && !SvROK(sv)
619 && (strpos != strbeg)) {
a3621e74 620 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
621 goto fail;
622 }
623 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 624 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 625 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
626 I32 slen;
627
1aa99e6b 628 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 629
653099ff
GS
630 if (SvTAIL(check)) {
631 slen = SvCUR(check); /* >= 1 */
cad2e5aa 632
9041c2e3 633 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 634 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 635 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 636 goto fail_finish;
cad2e5aa
JH
637 }
638 /* Now should match s[0..slen-2] */
639 slen--;
3f7c398e 640 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 641 || (slen > 1
3f7c398e 642 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 643 report_neq:
a3621e74 644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
645 goto fail_finish;
646 }
cad2e5aa 647 }
3f7c398e 648 else if (*SvPVX_const(check) != *s
653099ff 649 || ((slen = SvCUR(check)) > 1
3f7c398e 650 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 651 goto report_neq;
c315bfe8 652 check_at = s;
2c2d71f5 653 goto success_at_start;
7e25d62c 654 }
cad2e5aa 655 }
2c2d71f5 656 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 657 s = strpos;
2c2d71f5 658 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
659 end_shift = prog->check_end_shift;
660
2c2d71f5 661 if (!ml_anch) {
a3b680e6 662 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 663 - (SvTAIL(check) != 0);
a3b680e6 664 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
665
666 if (end_shift < eshift)
667 end_shift = eshift;
668 }
cad2e5aa 669 }
2c2d71f5 670 else { /* Can match at random position */
cad2e5aa
JH
671 ml_anch = 0;
672 s = strpos;
1de06328
YO
673 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
674 end_shift = prog->check_end_shift;
675
676 /* end shift should be non negative here */
cad2e5aa
JH
677 }
678
bcdf7404 679#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 680 if (end_shift < 0)
1de06328 681 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 682 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
683#endif
684
2c2d71f5
JH
685 restart:
686 /* Find a possible match in the region s..strend by looking for
687 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
688
689 {
690 I32 srch_start_shift = start_shift;
691 I32 srch_end_shift = end_shift;
c33e64f0
FC
692 U8* start_point;
693 U8* end_point;
1de06328
YO
694 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
695 srch_end_shift -= ((strbeg - s) - srch_start_shift);
696 srch_start_shift = strbeg - s;
697 }
6bda09f9 698 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
699 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
700 (IV)prog->check_offset_min,
701 (IV)srch_start_shift,
702 (IV)srch_end_shift,
703 (IV)prog->check_end_shift);
704 });
705
bbe252da 706 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
707 start_point= (U8*)(s + srch_start_shift);
708 end_point= (U8*)(strend - srch_end_shift);
709 } else {
710 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
711 end_point= HOP3(strend, -srch_end_shift, strbeg);
712 }
6bda09f9 713 DEBUG_OPTIMISE_MORE_r({
56570a2c 714 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 715 (int)(end_point - start_point),
fc8cd66c 716 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
717 start_point);
718 });
719
720 s = fbm_instr( start_point, end_point,
7fba1cd6 721 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 722 }
cad2e5aa
JH
723 /* Update the count-of-usability, remove useless subpatterns,
724 unshift s. */
2c2d71f5 725
ab3bbdeb 726 DEBUG_EXECUTE_r({
f2ed9b32 727 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
728 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
729 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 730 (s ? "Found" : "Did not find"),
f2ed9b32 731 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
732 ? "anchored" : "floating"),
733 quoted,
734 RE_SV_TAIL(check),
735 (s ? " at offset " : "...\n") );
736 });
2c2d71f5
JH
737
738 if (!s)
739 goto fail_finish;
2c2d71f5 740 /* Finish the diagnostic message */
a3621e74 741 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 742
1de06328
YO
743 /* XXX dmq: first branch is for positive lookbehind...
744 Our check string is offset from the beginning of the pattern.
745 So we need to do any stclass tests offset forward from that
746 point. I think. :-(
747 */
748
749
750
751 check_at=s;
752
753
2c2d71f5
JH
754 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
755 Start with the other substr.
756 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 757 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
758 *always* match. Probably should be marked during compile...
759 Probably it is right to do no SCREAM here...
760 */
761
f2ed9b32 762 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
763 : (prog->float_substr && prog->anchored_substr))
764 {
30944b6d 765 /* Take into account the "other" substring. */
2c2d71f5
JH
766 /* XXXX May be hopelessly wrong for UTF... */
767 if (!other_last)
6eb5f6b9 768 other_last = strpos;
f2ed9b32 769 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
770 do_other_anchored:
771 {
890ce7af
AL
772 char * const last = HOP3c(s, -start_shift, strbeg);
773 char *last1, *last2;
be8e71aa 774 char * const saved_s = s;
33b8afdf 775 SV* must;
2c2d71f5 776
2c2d71f5
JH
777 t = s - prog->check_offset_max;
778 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 779 && (!utf8_target
0ce71af7 780 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 781 && t > strpos)))
6f207bd3 782 NOOP;
2c2d71f5
JH
783 else
784 t = strpos;
1aa99e6b 785 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
786 if (t < other_last) /* These positions already checked */
787 t = other_last;
1aa99e6b 788 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
789 if (last < last1)
790 last1 = last;
1de06328
YO
791 /* XXXX It is not documented what units *_offsets are in.
792 We assume bytes, but this is clearly wrong.
793 Meaning this code needs to be carefully reviewed for errors.
794 dmq.
795 */
796
2c2d71f5 797 /* On end-of-str: see comment below. */
f2ed9b32 798 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
799 if (must == &PL_sv_undef) {
800 s = (char*)NULL;
1de06328 801 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
802 }
803 else
804 s = fbm_instr(
805 (unsigned char*)t,
806 HOP3(HOP3(last1, prog->anchored_offset, strend)
807 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
808 must,
7fba1cd6 809 multiline ? FBMrf_MULTILINE : 0
33b8afdf 810 );
ab3bbdeb 811 DEBUG_EXECUTE_r({
f2ed9b32 812 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
813 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
814 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 815 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
816 quoted, RE_SV_TAIL(must));
817 });
818
819
2c2d71f5
JH
820 if (!s) {
821 if (last1 >= last2) {
a3621e74 822 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
823 ", giving up...\n"));
824 goto fail_finish;
825 }
a3621e74 826 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 827 ", trying floating at offset %ld...\n",
be8e71aa 828 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
829 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
830 s = HOP3c(last, 1, strend);
2c2d71f5
JH
831 goto restart;
832 }
833 else {
a3621e74 834 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 835 (long)(s - i_strpos)));
1aa99e6b
IH
836 t = HOP3c(s, -prog->anchored_offset, strbeg);
837 other_last = HOP3c(s, 1, strend);
be8e71aa 838 s = saved_s;
2c2d71f5
JH
839 if (t == strpos)
840 goto try_at_start;
2c2d71f5
JH
841 goto try_at_offset;
842 }
30944b6d 843 }
2c2d71f5
JH
844 }
845 else { /* Take into account the floating substring. */
33b8afdf 846 char *last, *last1;
be8e71aa 847 char * const saved_s = s;
33b8afdf
JH
848 SV* must;
849
850 t = HOP3c(s, -start_shift, strbeg);
851 last1 = last =
852 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
853 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
854 last = HOP3c(t, prog->float_max_offset, strend);
855 s = HOP3c(t, prog->float_min_offset, strend);
856 if (s < other_last)
857 s = other_last;
2c2d71f5 858 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 859 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
860 /* fbm_instr() takes into account exact value of end-of-str
861 if the check is SvTAIL(ed). Since false positives are OK,
862 and end-of-str is not later than strend we are OK. */
863 if (must == &PL_sv_undef) {
864 s = (char*)NULL;
1de06328 865 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
866 }
867 else
2c2d71f5 868 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
869 (unsigned char*)last + SvCUR(must)
870 - (SvTAIL(must)!=0),
7fba1cd6 871 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 872 DEBUG_EXECUTE_r({
f2ed9b32 873 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
874 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
875 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 876 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
877 quoted, RE_SV_TAIL(must));
878 });
33b8afdf
JH
879 if (!s) {
880 if (last1 == last) {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
882 ", giving up...\n"));
883 goto fail_finish;
2c2d71f5 884 }
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 886 ", trying anchored starting at offset %ld...\n",
be8e71aa 887 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
888 other_last = last;
889 s = HOP3c(t, 1, strend);
890 goto restart;
891 }
892 else {
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
894 (long)(s - i_strpos)));
895 other_last = s; /* Fix this later. --Hugo */
be8e71aa 896 s = saved_s;
33b8afdf
JH
897 if (t == strpos)
898 goto try_at_start;
899 goto try_at_offset;
900 }
2c2d71f5 901 }
cad2e5aa 902 }
2c2d71f5 903
1de06328 904
9ef43ace 905 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 906
6bda09f9 907 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
908 PerlIO_printf(Perl_debug_log,
909 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
910 (IV)prog->check_offset_min,
911 (IV)prog->check_offset_max,
912 (IV)(s-strpos),
913 (IV)(t-strpos),
914 (IV)(t-s),
915 (IV)(strend-strpos)
916 )
917 );
918
2c2d71f5 919 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 920 && (!utf8_target
9ef43ace 921 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
922 && t > strpos)))
923 {
2c2d71f5
JH
924 /* Fixed substring is found far enough so that the match
925 cannot start at strpos. */
926 try_at_offset:
cad2e5aa 927 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
928 /* Eventually fbm_*() should handle this, but often
929 anchored_offset is not 0, so this check will not be wasted. */
930 /* XXXX In the code below we prefer to look for "^" even in
931 presence of anchored substrings. And we search even
932 beyond the found float position. These pessimizations
933 are historical artefacts only. */
934 find_anchor:
2c2d71f5 935 while (t < strend - prog->minlen) {
cad2e5aa 936 if (*t == '\n') {
4ee3650e 937 if (t < check_at - prog->check_offset_min) {
f2ed9b32 938 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
939 /* Since we moved from the found position,
940 we definitely contradict the found anchored
30944b6d
IZ
941 substr. Due to the above check we do not
942 contradict "check" substr.
943 Thus we can arrive here only if check substr
944 is float. Redo checking for "other"=="fixed".
945 */
9041c2e3 946 strpos = t + 1;
a3621e74 947 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 948 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
949 goto do_other_anchored;
950 }
4ee3650e
GS
951 /* We don't contradict the found floating substring. */
952 /* XXXX Why not check for STCLASS? */
cad2e5aa 953 s = t + 1;
a3621e74 954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 955 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
956 goto set_useful;
957 }
4ee3650e
GS
958 /* Position contradicts check-string */
959 /* XXXX probably better to look for check-string
960 than for "\n", so one should lower the limit for t? */
a3621e74 961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 962 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 963 other_last = strpos = s = t + 1;
cad2e5aa
JH
964 goto restart;
965 }
966 t++;
967 }
a3621e74 968 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 969 PL_colors[0], PL_colors[1]));
2c2d71f5 970 goto fail_finish;
cad2e5aa 971 }
f5952150 972 else {
a3621e74 973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 974 PL_colors[0], PL_colors[1]));
f5952150 975 }
cad2e5aa
JH
976 s = t;
977 set_useful:
f2ed9b32 978 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
979 }
980 else {
f5952150 981 /* The found string does not prohibit matching at strpos,
2c2d71f5 982 - no optimization of calling REx engine can be performed,
f5952150
GS
983 unless it was an MBOL and we are not after MBOL,
984 or a future STCLASS check will fail this. */
2c2d71f5
JH
985 try_at_start:
986 /* Even in this situation we may use MBOL flag if strpos is offset
987 wrt the start of the string. */
05b4157f 988 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 989 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 990 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 991 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 992 {
cad2e5aa
JH
993 t = strpos;
994 goto find_anchor;
995 }
a3621e74 996 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 997 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 998 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 999 );
2c2d71f5 1000 success_at_start:
bbe252da 1001 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1002 && (utf8_target ? (
33b8afdf
JH
1003 prog->check_utf8 /* Could be deleted already */
1004 && --BmUSEFUL(prog->check_utf8) < 0
1005 && (prog->check_utf8 == prog->float_utf8)
1006 ) : (
1007 prog->check_substr /* Could be deleted already */
1008 && --BmUSEFUL(prog->check_substr) < 0
1009 && (prog->check_substr == prog->float_substr)
1010 )))
66e933ab 1011 {
cad2e5aa 1012 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1014 /* XXX Does the destruction order has to change with utf8_target? */
1015 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1016 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1017 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1018 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1019 check = NULL; /* abort */
cad2e5aa 1020 s = strpos;
486ec47a 1021 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1022 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1023 if (prog->intflags & PREGf_IMPLICIT)
1024 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1025 /* XXXX This is a remnant of the old implementation. It
1026 looks wasteful, since now INTUIT can use many
6eb5f6b9 1027 other heuristics. */
bbe252da 1028 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1029 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1030 }
1031 else
1032 s = strpos;
1033 }
1034
6eb5f6b9
JH
1035 /* Last resort... */
1036 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1037 /* trie stclasses are too expensive to use here, we are better off to
1038 leave it to regmatch itself */
f8fc2ecf 1039 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1040 /* minlen == 0 is possible if regstclass is \b or \B,
1041 and the fixed substr is ''$.
1042 Since minlen is already taken into account, s+1 is before strend;
1043 accidentally, minlen >= 1 guaranties no false positives at s + 1
1044 even for \b or \B. But (minlen? 1 : 0) below assumes that
1045 regstclass does not come from lookahead... */
1046 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1047 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1048 const U8* const str = (U8*)STRING(progi->regstclass);
1049 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1050 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1051 : 1);
1de06328
YO
1052 char * endpos;
1053 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1054 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1055 else if (prog->float_substr || prog->float_utf8)
1056 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1057 else
1058 endpos= strend;
1059
d8080198
YO
1060 if (checked_upto < s)
1061 checked_upto = s;
1062 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1063 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1064
6eb5f6b9 1065 t = s;
d8080198
YO
1066 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1067 if (s) {
1068 checked_upto = s;
1069 } else {
6eb5f6b9 1070#ifdef DEBUGGING
cbbf8932 1071 const char *what = NULL;
6eb5f6b9
JH
1072#endif
1073 if (endpos == strend) {
a3621e74 1074 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1075 "Could not match STCLASS...\n") );
1076 goto fail;
1077 }
a3621e74 1078 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1079 "This position contradicts STCLASS...\n") );
bbe252da 1080 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1081 goto fail;
d8080198
YO
1082 checked_upto = HOPBACKc(endpos, start_shift);
1083 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1084 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1085 /* Contradict one of substrings */
33b8afdf 1086 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1087 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1088 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1089 hop_and_restart:
1aa99e6b 1090 s = HOP3c(t, 1, strend);
66e933ab
GS
1091 if (s + start_shift + end_shift > strend) {
1092 /* XXXX Should be taken into account earlier? */
a3621e74 1093 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1094 "Could not match STCLASS...\n") );
1095 goto fail;
1096 }
5e39e1e5
HS
1097 if (!check)
1098 goto giveup;
a3621e74 1099 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1100 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1101 what, (long)(s + start_shift - i_strpos)) );
1102 goto restart;
1103 }
66e933ab 1104 /* Have both, check_string is floating */
6eb5f6b9
JH
1105 if (t + start_shift >= check_at) /* Contradicts floating=check */
1106 goto retry_floating_check;
1107 /* Recheck anchored substring, but not floating... */
9041c2e3 1108 s = check_at;
5e39e1e5
HS
1109 if (!check)
1110 goto giveup;
a3621e74 1111 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1112 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1113 (long)(other_last - i_strpos)) );
1114 goto do_other_anchored;
1115 }
60e71179
GS
1116 /* Another way we could have checked stclass at the
1117 current position only: */
1118 if (ml_anch) {
1119 s = t = t + 1;
5e39e1e5
HS
1120 if (!check)
1121 goto giveup;
a3621e74 1122 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1123 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1124 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1125 goto try_at_offset;
66e933ab 1126 }
f2ed9b32 1127 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1128 goto fail;
486ec47a 1129 /* Check is floating substring. */
6eb5f6b9
JH
1130 retry_floating_check:
1131 t = check_at - start_shift;
a3621e74 1132 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1133 goto hop_and_restart;
1134 }
b7953727 1135 if (t != s) {
a3621e74 1136 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1137 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1138 (long)(t - i_strpos), (long)(s - i_strpos))
1139 );
1140 }
1141 else {
a3621e74 1142 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1143 "Does not contradict STCLASS...\n");
1144 );
1145 }
6eb5f6b9 1146 }
5e39e1e5 1147 giveup:
a3621e74 1148 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1149 PL_colors[4], (check ? "Guessed" : "Giving up"),
1150 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1151 return s;
2c2d71f5
JH
1152
1153 fail_finish: /* Substring not found */
33b8afdf 1154 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1155 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1156 fail:
a3621e74 1157 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1158 PL_colors[4], PL_colors[5]));
bd61b366 1159 return NULL;
cad2e5aa 1160}
9661b544 1161
a0a388a1
YO
1162#define DECL_TRIE_TYPE(scan) \
1163 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1164 trie_type = ((scan->flags == EXACT) \
1165 ? (utf8_target ? trie_utf8 : trie_plain) \
1166 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1167
1168#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1169uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1170 STRLEN skiplen; \
1171 switch (trie_type) { \
1172 case trie_utf8_fold: \
1173 if ( foldlen>0 ) { \
1174 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1175 foldlen -= len; \
1176 uscan += len; \
1177 len=0; \
1178 } else { \
1179 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1180 len = UTF8SKIP(uc); \
1181 skiplen = UNISKIP( uvc ); \
1182 foldlen -= skiplen; \
1183 uscan = foldbuf + skiplen; \
1184 } \
1185 break; \
1186 case trie_latin_utf8_fold: \
1187 if ( foldlen>0 ) { \
1188 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1189 foldlen -= len; \
1190 uscan += len; \
1191 len=0; \
1192 } else { \
1193 len = 1; \
1194 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1195 skiplen = UNISKIP( uvc ); \
1196 foldlen -= skiplen; \
1197 uscan = foldbuf + skiplen; \
1198 } \
1199 break; \
1200 case trie_utf8: \
1201 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1202 break; \
1203 case trie_plain: \
1204 uvc = (UV)*uc; \
1205 len = 1; \
1206 } \
1207 if (uvc < 256) { \
1208 charid = trie->charmap[ uvc ]; \
1209 } \
1210 else { \
1211 charid = 0; \
1212 if (widecharmap) { \
1213 SV** const svpp = hv_fetch(widecharmap, \
1214 (char*)&uvc, sizeof(UV), 0); \
1215 if (svpp) \
1216 charid = (U16)SvIV(*svpp); \
1217 } \
1218 } \
4cadc6a9
YO
1219} STMT_END
1220
4cadc6a9
YO
1221#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1222STMT_START { \
1223 while (s <= e) { \
1224 if ( (CoNd) \
fac1af77 1225 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1226 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1227 goto got_it; \
1228 s++; \
1229 } \
1230} STMT_END
1231
1232#define REXEC_FBC_UTF8_SCAN(CoDe) \
1233STMT_START { \
1234 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1235 CoDe \
1236 s += uskip; \
1237 } \
1238} STMT_END
1239
1240#define REXEC_FBC_SCAN(CoDe) \
1241STMT_START { \
1242 while (s < strend) { \
1243 CoDe \
1244 s++; \
1245 } \
1246} STMT_END
1247
1248#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1249REXEC_FBC_UTF8_SCAN( \
1250 if (CoNd) { \
24b23f37 1251 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1252 goto got_it; \
1253 else \
1254 tmp = doevery; \
1255 } \
1256 else \
1257 tmp = 1; \
1258)
1259
1260#define REXEC_FBC_CLASS_SCAN(CoNd) \
1261REXEC_FBC_SCAN( \
1262 if (CoNd) { \
24b23f37 1263 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1264 goto got_it; \
1265 else \
1266 tmp = doevery; \
1267 } \
1268 else \
1269 tmp = 1; \
1270)
1271
1272#define REXEC_FBC_TRYIT \
24b23f37 1273if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it
1275
e1d1eefb 1276#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1277 if (utf8_target) { \
e1d1eefb
YO
1278 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1279 } \
1280 else { \
1281 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1282 }
e1d1eefb 1283
4cadc6a9 1284#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1285 if (utf8_target) { \
4cadc6a9
YO
1286 UtFpReLoAd; \
1287 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1288 } \
1289 else { \
1290 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1291 }
4cadc6a9
YO
1292
1293#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1294 PL_reg_flags |= RF_tainted; \
f2ed9b32 1295 if (utf8_target) { \
4cadc6a9
YO
1296 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1297 } \
1298 else { \
1299 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1300 }
4cadc6a9 1301
786e8c11
YO
1302#define DUMP_EXEC_POS(li,s,doutf8) \
1303 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1304
cfaf538b
KW
1305
1306#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1307 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1308 tmp = TEST_NON_UTF8(tmp); \
1309 REXEC_FBC_UTF8_SCAN( \
1310 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1311 tmp = !tmp; \
1312 IF_SUCCESS; \
1313 } \
1314 else { \
1315 IF_FAIL; \
1316 } \
1317 ); \
1318
1319#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1320 if (s == PL_bostr) { \
1321 tmp = '\n'; \
1322 } \
1323 else { \
1324 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1325 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1326 } \
1327 tmp = TeSt1_UtF8; \
1328 LOAD_UTF8_CHARCLASS_ALNUM(); \
1329 REXEC_FBC_UTF8_SCAN( \
1330 if (tmp == ! (TeSt2_UtF8)) { \
1331 tmp = !tmp; \
1332 IF_SUCCESS; \
1333 } \
1334 else { \
1335 IF_FAIL; \
1336 } \
1337 ); \
1338
63ac0dad
KW
1339/* The only difference between the BOUND and NBOUND cases is that
1340 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1341 * NBOUND. This is accomplished by passing it in either the if or else clause,
1342 * with the other one being empty */
1343#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1344 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1345
1346#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1347 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1348
1349#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1350 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1351
1352#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1353 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1354
63ac0dad
KW
1355
1356/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1357 * be passed in completely with the variable name being tested, which isn't
1358 * such a clean interface, but this is easier to read than it was before. We
1359 * are looking for the boundary (or non-boundary between a word and non-word
1360 * character. The utf8 and non-utf8 cases have the same logic, but the details
1361 * must be different. Find the "wordness" of the character just prior to this
1362 * one, and compare it with the wordness of this one. If they differ, we have
1363 * a boundary. At the beginning of the string, pretend that the previous
1364 * character was a new-line */
cfaf538b 1365#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1366 if (utf8_target) { \
cfaf538b 1367 UTF8_CODE \
63ac0dad
KW
1368 } \
1369 else { /* Not utf8 */ \
1370 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1371 tmp = TEST_NON_UTF8(tmp); \
1372 REXEC_FBC_SCAN( \
1373 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1374 tmp = !tmp; \
1375 IF_SUCCESS; \
1376 } \
1377 else { \
1378 IF_FAIL; \
1379 } \
1380 ); \
1381 } \
1382 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1383 goto got_it;
1384
786e8c11
YO
1385/* We know what class REx starts with. Try to find this position... */
1386/* if reginfo is NULL, its a dryrun */
1387/* annoyingly all the vars in this routine have different names from their counterparts
1388 in regmatch. /grrr */
1389
3c3eec57 1390STATIC char *
07be1b83 1391S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1392 const char *strend, regmatch_info *reginfo)
a687059c 1393{
27da23d5 1394 dVAR;
bbe252da 1395 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1396 char *pat_string; /* The pattern's exactish string */
1397 char *pat_end; /* ptr to end char of pat_string */
1398 re_fold_t folder; /* Function for computing non-utf8 folds */
1399 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1400 STRLEN ln;
5dab1207 1401 STRLEN lnc;
078c425b 1402 register STRLEN uskip;
fac1af77
KW
1403 U8 c1;
1404 U8 c2;
6eb5f6b9
JH
1405 char *e;
1406 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1407 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1408 UV utf8_fold_flags = 0;
f8fc2ecf 1409 RXi_GET_DECL(prog,progi);
7918f24d
NC
1410
1411 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1412
6eb5f6b9
JH
1413 /* We know what class it must start with. */
1414 switch (OP(c)) {
f56b6394 1415 case ANYOFV:
6eb5f6b9 1416 case ANYOF:
f56b6394 1417 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1418 STRLEN inclasslen = strend - s;
1419 REXEC_FBC_UTF8_CLASS_SCAN(
1420 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1421 }
1422 else {
6ef69d56 1423 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1424 }
6eb5f6b9 1425 break;
f33976b4 1426 case CANY:
4cadc6a9 1427 REXEC_FBC_SCAN(
24b23f37 1428 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1429 goto got_it;
1430 else
1431 tmp = doevery;
4cadc6a9 1432 );
f33976b4 1433 break;
2f7f8cb1
KW
1434
1435 case EXACTFA:
1436 if (UTF_PATTERN || utf8_target) {
1437 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1438 goto do_exactf_utf8;
1439 }
1440 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1441 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1442 goto do_exactf_non_utf8; /* isn't dealt with by these */
1443
6eb5f6b9 1444 case EXACTF:
62bf7766 1445 if (utf8_target) {
77a6d856
KW
1446
1447 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1448 utf8_fold_flags = 0;
fac1af77
KW
1449 goto do_exactf_utf8;
1450 }
1451 fold_array = PL_fold;
1452 folder = foldEQ;
1453 goto do_exactf_non_utf8;
1454
1455 case EXACTFL:
1456 if (UTF_PATTERN || utf8_target) {
17580e7a 1457 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1458 goto do_exactf_utf8;
1459 }
1460 fold_array = PL_fold_locale;
1461 folder = foldEQ_locale;
16d951b7
KW
1462 goto do_exactf_non_utf8;
1463
3c760661
KW
1464 case EXACTFU_SS:
1465 if (UTF_PATTERN) {
1466 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1467 }
1468 goto do_exactf_utf8;
1469
fab2782b 1470 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1471 case EXACTFU:
1472 if (UTF_PATTERN || utf8_target) {
77a6d856 1473 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1474 goto do_exactf_utf8;
1475 }
1476
1477 /* Any 'ss' in the pattern should have been replaced by regcomp,
1478 * so we don't have to worry here about this single special case
1479 * in the Latin1 range */
1480 fold_array = PL_fold_latin1;
1481 folder = foldEQ_latin1;
fac1af77
KW
1482
1483 /* FALL THROUGH */
1484
62bf7766
KW
1485 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1486 are no glitches with fold-length differences
1487 between the target string and pattern */
fac1af77
KW
1488
1489 /* The idea in the non-utf8 EXACTF* cases is to first find the
1490 * first character of the EXACTF* node and then, if necessary,
1491 * case-insensitively compare the full text of the node. c1 is the
1492 * first character. c2 is its fold. This logic will not work for
1493 * Unicode semantics and the german sharp ss, which hence should
1494 * not be compiled into a node that gets here. */
1495 pat_string = STRING(c);
1496 ln = STR_LEN(c); /* length to match in octets/bytes */
1497
8a90a8fe
KW
1498 /* We know that we have to match at least 'ln' bytes (which is the
1499 * same as characters, since not utf8). If we have to match 3
1500 * characters, and there are only 2 availabe, we know without
1501 * trying that it will fail; so don't start a match past the
1502 * required minimum number from the far end */
fac1af77
KW
1503 e = HOP3c(strend, -((I32)ln), s);
1504
1505 if (!reginfo && e < s) {
1506 e = s; /* Due to minlen logic of intuit() */
1507 }
1508
1509 c1 = *pat_string;
1510 c2 = fold_array[c1];
1511 if (c1 == c2) { /* If char and fold are the same */
1512 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1513 }
1514 else {
1515 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1516 }
1517 break;
1518
1519 do_exactf_utf8:
e067297c
KW
1520 {
1521 unsigned expansion;
1522
fac1af77
KW
1523
1524 /* If one of the operands is in utf8, we can't use the simpler
1525 * folding above, due to the fact that many different characters
1526 * can have the same fold, or portion of a fold, or different-
1527 * length fold */
1528 pat_string = STRING(c);
1529 ln = STR_LEN(c); /* length to match in octets/bytes */
1530 pat_end = pat_string + ln;
1531 lnc = (UTF_PATTERN) /* length to match in characters */
1532 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1533 : ln;
1534
e067297c
KW
1535 /* We have 'lnc' characters to match in the pattern, but because of
1536 * multi-character folding, each character in the target can match
1537 * up to 3 characters (Unicode guarantees it will never exceed
1538 * this) if it is utf8-encoded; and up to 2 if not (based on the
1539 * fact that the Latin 1 folds are already determined, and the
1540 * only multi-char fold in that range is the sharp-s folding to
1541 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1542 * string character. Adjust lnc accordingly, rounding up, so that
1543 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1544 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1545 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1546
1547 /* As in the non-UTF8 case, if we have to match 3 characters, and
1548 * only 2 are left, it's guaranteed to fail, so don't start a
1549 * match that would require us to go beyond the end of the string
1550 */
1551 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1552
1553 if (!reginfo && e < s) {
1554 e = s; /* Due to minlen logic of intuit() */
1555 }
1556
b33105db
KW
1557 /* XXX Note that we could recalculate e to stop the loop earlier,
1558 * as the worst case expansion above will rarely be met, and as we
1559 * go along we would usually find that e moves further to the left.
1560 * This would happen only after we reached the point in the loop
1561 * where if there were no expansion we should fail. Unclear if
1562 * worth the expense */
e067297c 1563
fac1af77
KW
1564 while (s <= e) {
1565 char *my_strend= (char *)strend;
1566 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1567 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1568 && (!reginfo || regtry(reginfo, &s)) )
1569 {
1570 goto got_it;
1571 }
bbdd8bad 1572 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1573 }
1574 break;
e067297c 1575 }
bbce6d69 1576 case BOUNDL:
3280af22 1577 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1578 FBC_BOUND(isALNUM_LC,
1579 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1580 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1581 break;
bbce6d69 1582 case NBOUNDL:
3280af22 1583 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1584 FBC_NBOUND(isALNUM_LC,
1585 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1586 isALNUM_LC_utf8((U8*)s));
1587 break;
1588 case BOUND:
1589 FBC_BOUND(isWORDCHAR,
1590 isALNUM_uni(tmp),
1591 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1592 break;
cfaf538b
KW
1593 case BOUNDA:
1594 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1595 isWORDCHAR_A(tmp),
1596 isWORDCHAR_A((U8*)s));
1597 break;
a0d0e21e 1598 case NBOUND:
63ac0dad
KW
1599 FBC_NBOUND(isWORDCHAR,
1600 isALNUM_uni(tmp),
1601 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1602 break;
cfaf538b
KW
1603 case NBOUNDA:
1604 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1605 isWORDCHAR_A(tmp),
1606 isWORDCHAR_A((U8*)s));
1607 break;
63ac0dad
KW
1608 case BOUNDU:
1609 FBC_BOUND(isWORDCHAR_L1,
1610 isALNUM_uni(tmp),
1611 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1612 break;
1613 case NBOUNDU:
1614 FBC_NBOUND(isWORDCHAR_L1,
1615 isALNUM_uni(tmp),
1616 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1617 break;
bbce6d69 1618 case ALNUML:
4cadc6a9
YO
1619 REXEC_FBC_CSCAN_TAINT(
1620 isALNUM_LC_utf8((U8*)s),
1621 isALNUM_LC(*s)
1622 );
6895a8aa 1623 break;
980866de
KW
1624 case ALNUMU:
1625 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1626 LOAD_UTF8_CHARCLASS_ALNUM(),
1627 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1628 isWORDCHAR_L1((U8) *s)
1629 );
6895a8aa 1630 break;
980866de
KW
1631 case ALNUM:
1632 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1633 LOAD_UTF8_CHARCLASS_ALNUM(),
1634 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1635 isWORDCHAR((U8) *s)
1636 );
6895a8aa 1637 break;
cfaf538b 1638 case ALNUMA:
8e9da4d4
KW
1639 /* Don't need to worry about utf8, as it can match only a single
1640 * byte invariant character */
cfaf538b 1641 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1642 break;
980866de
KW
1643 case NALNUMU:
1644 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1645 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1646 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1647 ! isWORDCHAR_L1((U8) *s)
1648 );
6895a8aa 1649 break;
a0d0e21e 1650 case NALNUM:
4cadc6a9 1651 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1652 LOAD_UTF8_CHARCLASS_ALNUM(),
1653 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1654 ! isALNUM(*s)
4cadc6a9 1655 );
6895a8aa 1656 break;
cfaf538b 1657 case NALNUMA:
8e9da4d4
KW
1658 REXEC_FBC_CSCAN(
1659 !isWORDCHAR_A(*s),
1660 !isWORDCHAR_A(*s)
1661 );
1662 break;
bbce6d69 1663 case NALNUML:
4cadc6a9
YO
1664 REXEC_FBC_CSCAN_TAINT(
1665 !isALNUM_LC_utf8((U8*)s),
1666 !isALNUM_LC(*s)
1667 );
6895a8aa 1668 break;
980866de
KW
1669 case SPACEU:
1670 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1671 LOAD_UTF8_CHARCLASS_SPACE(),
1672 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1673 isSPACE_L1((U8) *s)
1674 );
6895a8aa 1675 break;
a0d0e21e 1676 case SPACE:
4cadc6a9 1677 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1678 LOAD_UTF8_CHARCLASS_SPACE(),
1679 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1680 isSPACE((U8) *s)
4cadc6a9 1681 );
6895a8aa 1682 break;
cfaf538b 1683 case SPACEA:
8e9da4d4
KW
1684 /* Don't need to worry about utf8, as it can match only a single
1685 * byte invariant character */
cfaf538b 1686 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1687 break;
bbce6d69 1688 case SPACEL:
4cadc6a9 1689 REXEC_FBC_CSCAN_TAINT(
6bbba904 1690 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1691 isSPACE_LC(*s)
1692 );
6895a8aa 1693 break;
980866de
KW
1694 case NSPACEU:
1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1698 ! isSPACE_L1((U8) *s)
1699 );
6895a8aa 1700 break;
a0d0e21e 1701 case NSPACE:
4cadc6a9 1702 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1703 LOAD_UTF8_CHARCLASS_SPACE(),
1704 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1705 ! isSPACE((U8) *s)
4cadc6a9 1706 );
6895a8aa 1707 break;
cfaf538b 1708 case NSPACEA:
8e9da4d4
KW
1709 REXEC_FBC_CSCAN(
1710 !isSPACE_A(*s),
1711 !isSPACE_A(*s)
1712 );
1713 break;
bbce6d69 1714 case NSPACEL:
4cadc6a9 1715 REXEC_FBC_CSCAN_TAINT(
6bbba904 1716 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1717 !isSPACE_LC(*s)
1718 );
6895a8aa 1719 break;
a0d0e21e 1720 case DIGIT:
4cadc6a9 1721 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1722 LOAD_UTF8_CHARCLASS_DIGIT(),
1723 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1724 isDIGIT(*s)
1725 );
6895a8aa 1726 break;
cfaf538b 1727 case DIGITA:
8e9da4d4
KW
1728 /* Don't need to worry about utf8, as it can match only a single
1729 * byte invariant character */
cfaf538b 1730 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1731 break;
b8c5462f 1732 case DIGITL:
4cadc6a9
YO
1733 REXEC_FBC_CSCAN_TAINT(
1734 isDIGIT_LC_utf8((U8*)s),
1735 isDIGIT_LC(*s)
1736 );
6895a8aa 1737 break;
a0d0e21e 1738 case NDIGIT:
4cadc6a9 1739 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1740 LOAD_UTF8_CHARCLASS_DIGIT(),
1741 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1742 !isDIGIT(*s)
1743 );
6895a8aa 1744 break;
cfaf538b 1745 case NDIGITA:
8e9da4d4
KW
1746 REXEC_FBC_CSCAN(
1747 !isDIGIT_A(*s),
1748 !isDIGIT_A(*s)
1749 );
1750 break;
b8c5462f 1751 case NDIGITL:
4cadc6a9
YO
1752 REXEC_FBC_CSCAN_TAINT(
1753 !isDIGIT_LC_utf8((U8*)s),
1754 !isDIGIT_LC(*s)
1755 );
6895a8aa 1756 break;
e1d1eefb
YO
1757 case LNBREAK:
1758 REXEC_FBC_CSCAN(
1759 is_LNBREAK_utf8(s),
1760 is_LNBREAK_latin1(s)
1761 );
6895a8aa 1762 break;
e1d1eefb
YO
1763 case VERTWS:
1764 REXEC_FBC_CSCAN(
1765 is_VERTWS_utf8(s),
1766 is_VERTWS_latin1(s)
1767 );
6895a8aa 1768 break;
e1d1eefb
YO
1769 case NVERTWS:
1770 REXEC_FBC_CSCAN(
1771 !is_VERTWS_utf8(s),
1772 !is_VERTWS_latin1(s)
1773 );
6895a8aa 1774 break;
e1d1eefb
YO
1775 case HORIZWS:
1776 REXEC_FBC_CSCAN(
1777 is_HORIZWS_utf8(s),
1778 is_HORIZWS_latin1(s)
1779 );
6895a8aa 1780 break;
e1d1eefb
YO
1781 case NHORIZWS:
1782 REXEC_FBC_CSCAN(
1783 !is_HORIZWS_utf8(s),
1784 !is_HORIZWS_latin1(s)
1785 );
6895a8aa 1786 break;
1de06328
YO
1787 case AHOCORASICKC:
1788 case AHOCORASICK:
07be1b83 1789 {
a0a388a1 1790 DECL_TRIE_TYPE(c);
07be1b83
YO
1791 /* what trie are we using right now */
1792 reg_ac_data *aho
f8fc2ecf 1793 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1794 reg_trie_data *trie
1795 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1796 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1797
1798 const char *last_start = strend - trie->minlen;
6148ee25 1799#ifdef DEBUGGING
07be1b83 1800 const char *real_start = s;
6148ee25 1801#endif
07be1b83 1802 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1803 SV *sv_points;
1804 U8 **points; /* map of where we were in the input string
786e8c11 1805 when reading a given char. For ASCII this
be8e71aa 1806 is unnecessary overhead as the relationship
38a44b82
NC
1807 is always 1:1, but for Unicode, especially
1808 case folded Unicode this is not true. */
f9e705e8 1809 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1810 U8 *bitmap=NULL;
1811
07be1b83
YO
1812
1813 GET_RE_DEBUG_FLAGS_DECL;
1814
be8e71aa
YO
1815 /* We can't just allocate points here. We need to wrap it in
1816 * an SV so it gets freed properly if there is a croak while
1817 * running the match */
1818 ENTER;
1819 SAVETMPS;
1820 sv_points=newSV(maxlen * sizeof(U8 *));
1821 SvCUR_set(sv_points,
1822 maxlen * sizeof(U8 *));
1823 SvPOK_on(sv_points);
1824 sv_2mortal(sv_points);
1825 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1826 if ( trie_type != trie_utf8_fold
1827 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1828 {
786e8c11
YO
1829 if (trie->bitmap)
1830 bitmap=(U8*)trie->bitmap;
1831 else
1832 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1833 }
786e8c11
YO
1834 /* this is the Aho-Corasick algorithm modified a touch
1835 to include special handling for long "unknown char"
1836 sequences. The basic idea being that we use AC as long
1837 as we are dealing with a possible matching char, when
1838 we encounter an unknown char (and we have not encountered
1839 an accepting state) we scan forward until we find a legal
1840 starting char.
1841 AC matching is basically that of trie matching, except
1842 that when we encounter a failing transition, we fall back
1843 to the current states "fail state", and try the current char
1844 again, a process we repeat until we reach the root state,
1845 state 1, or a legal transition. If we fail on the root state
1846 then we can either terminate if we have reached an accepting
1847 state previously, or restart the entire process from the beginning
1848 if we have not.
1849
1850 */
07be1b83
YO
1851 while (s <= last_start) {
1852 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1853 U8 *uc = (U8*)s;
1854 U16 charid = 0;
1855 U32 base = 1;
1856 U32 state = 1;
1857 UV uvc = 0;
1858 STRLEN len = 0;
1859 STRLEN foldlen = 0;
1860 U8 *uscan = (U8*)NULL;
1861 U8 *leftmost = NULL;
786e8c11
YO
1862#ifdef DEBUGGING
1863 U32 accepted_word= 0;
1864#endif
07be1b83
YO
1865 U32 pointpos = 0;
1866
1867 while ( state && uc <= (U8*)strend ) {
1868 int failed=0;
786e8c11
YO
1869 U32 word = aho->states[ state ].wordnum;
1870
1de06328
YO
1871 if( state==1 ) {
1872 if ( bitmap ) {
1873 DEBUG_TRIE_EXECUTE_r(
1874 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1875 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1876 (char *)uc, utf8_target );
1de06328
YO
1877 PerlIO_printf( Perl_debug_log,
1878 " Scanning for legal start char...\n");
1879 }
d085b490
YO
1880 );
1881 if (utf8_target) {
1882 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1883 uc += UTF8SKIP(uc);
1884 }
1885 } else {
1886 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1887 uc++;
1888 }
1889 }
1de06328 1890 s= (char *)uc;
786e8c11 1891 }
786e8c11
YO
1892 if (uc >(U8*)last_start) break;
1893 }
1894
1895 if ( word ) {
2e64971a 1896 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1897 if (!leftmost || lpos < leftmost) {
1898 DEBUG_r(accepted_word=word);
07be1b83 1899 leftmost= lpos;
786e8c11 1900 }
07be1b83 1901 if (base==0) break;
786e8c11 1902
07be1b83
YO
1903 }
1904 points[pointpos++ % maxlen]= uc;
55eed653
NC
1905 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1906 uscan, len, uvc, charid, foldlen,
1907 foldbuf, uniflags);
786e8c11
YO
1908 DEBUG_TRIE_EXECUTE_r({
1909 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1910 s, utf8_target );
07be1b83 1911 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1912 " Charid:%3u CP:%4"UVxf" ",
1913 charid, uvc);
1914 });
07be1b83
YO
1915
1916 do {
6148ee25 1917#ifdef DEBUGGING
786e8c11 1918 word = aho->states[ state ].wordnum;
6148ee25 1919#endif
07be1b83
YO
1920 base = aho->states[ state ].trans.base;
1921
786e8c11
YO
1922 DEBUG_TRIE_EXECUTE_r({
1923 if (failed)
1924 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1925 s, utf8_target );
07be1b83 1926 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1927 "%sState: %4"UVxf", word=%"UVxf,
1928 failed ? " Fail transition to " : "",
1929 (UV)state, (UV)word);
1930 });
07be1b83
YO
1931 if ( base ) {
1932 U32 tmp;
6dd2be57 1933 I32 offset;
07be1b83 1934 if (charid &&
6dd2be57
DM
1935 ( ((offset = base + charid
1936 - 1 - trie->uniquecharcount)) >= 0)
1937 && ((U32)offset < trie->lasttrans)
1938 && trie->trans[offset].check == state
1939 && (tmp=trie->trans[offset].next))
07be1b83 1940 {
786e8c11
YO
1941 DEBUG_TRIE_EXECUTE_r(
1942 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1943 state = tmp;
1944 break;
1945 }
1946 else {
786e8c11
YO
1947 DEBUG_TRIE_EXECUTE_r(
1948 PerlIO_printf( Perl_debug_log," - fail\n"));
1949 failed = 1;
1950 state = aho->fail[state];
07be1b83
YO
1951 }
1952 }
1953 else {
1954 /* we must be accepting here */
786e8c11
YO
1955 DEBUG_TRIE_EXECUTE_r(
1956 PerlIO_printf( Perl_debug_log," - accepting\n"));
1957 failed = 1;
07be1b83
YO
1958 break;
1959 }
1960 } while(state);
786e8c11 1961 uc += len;
07be1b83
YO
1962 if (failed) {
1963 if (leftmost)
1964 break;
786e8c11 1965 if (!state) state = 1;
07be1b83
YO
1966 }
1967 }
1968 if ( aho->states[ state ].wordnum ) {
2e64971a 1969 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1970 if (!leftmost || lpos < leftmost) {
1971 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1972 leftmost = lpos;
786e8c11 1973 }
07be1b83 1974 }
07be1b83
YO
1975 if (leftmost) {
1976 s = (char*)leftmost;
786e8c11
YO
1977 DEBUG_TRIE_EXECUTE_r({
1978 PerlIO_printf(
70685ca0
JH
1979 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1980 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1981 );
1982 });
24b23f37 1983 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1984 FREETMPS;
1985 LEAVE;
07be1b83 1986 goto got_it;
be8e71aa 1987 }
07be1b83 1988 s = HOPc(s,1);
786e8c11
YO
1989 DEBUG_TRIE_EXECUTE_r({
1990 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1991 });
07be1b83 1992 } else {
786e8c11
YO
1993 DEBUG_TRIE_EXECUTE_r(
1994 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1995 break;
1996 }
1997 }
be8e71aa
YO
1998 FREETMPS;
1999 LEAVE;
07be1b83
YO
2000 }
2001 break;
b3c9acc1 2002 default:
3c3eec57
GS
2003 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2004 break;
d6a28714 2005 }
6eb5f6b9
JH
2006 return 0;
2007 got_it:
2008 return s;
2009}
2010
fae667d5 2011
6eb5f6b9
JH
2012/*
2013 - regexec_flags - match a regexp against a string
2014 */
2015I32
288b8c02 2016Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2017 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2018/* strend: pointer to null at end of string */
2019/* strbeg: real beginning of string */
2020/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2021/* data: May be used for some additional optimizations.
2022 Currently its only used, with a U32 cast, for transmitting
2023 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2024/* nosave: For optimizations. */
2025{
97aff369 2026 dVAR;
288b8c02 2027 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2028 /*register*/ char *s;
6eb5f6b9 2029 register regnode *c;
24b23f37 2030 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2031 I32 minlen; /* must match at least this many chars */
2032 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2033 I32 end_shift = 0; /* Same for the end. */ /* CC */
2034 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2035 char *scream_olds = NULL;
f2ed9b32 2036 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2037 I32 multiline;
f8fc2ecf 2038 RXi_GET_DECL(prog,progi);
3b0527fe 2039 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2040 regexp_paren_pair *swap = NULL;
a3621e74
YO
2041 GET_RE_DEBUG_FLAGS_DECL;
2042
7918f24d 2043 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2044 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2045
2046 /* Be paranoid... */
2047 if (prog == NULL || startpos == NULL) {
2048 Perl_croak(aTHX_ "NULL regexp parameter");
2049 return 0;
2050 }
2051
bbe252da 2052 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2053 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2054
f2ed9b32 2055 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2056 DEBUG_EXECUTE_r(
f2ed9b32 2057 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2058 "Matching");
2059 );
bac06658 2060
6eb5f6b9 2061 minlen = prog->minlen;
1de06328
YO
2062
2063 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2064 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2065 "String too short [regexec_flags]...\n"));
2066 goto phooey;
1aa99e6b 2067 }
6eb5f6b9 2068
1de06328 2069
6eb5f6b9 2070 /* Check validity of program. */
f8fc2ecf 2071 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2072 Perl_croak(aTHX_ "corrupted regexp program");
2073 }
2074
2075 PL_reg_flags = 0;
ed301438 2076 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2077 PL_reg_maxiter = 0;
2078
3c8556c3 2079 if (RX_UTF8(rx))
6eb5f6b9
JH
2080 PL_reg_flags |= RF_utf8;
2081
2082 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2083 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2084 PL_bostr = strbeg;
3b0527fe 2085 reginfo.sv = sv;
6eb5f6b9
JH
2086
2087 /* Mark end of line for $ (and such) */
2088 PL_regeol = strend;
2089
2090 /* see how far we have to get to not match where we matched before */
3b0527fe 2091 reginfo.till = startpos+minend;
6eb5f6b9 2092
6eb5f6b9
JH
2093 /* If there is a "must appear" string, look for it. */
2094 s = startpos;
2095
bbe252da 2096 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2097 MAGIC *mg;
2c296965 2098 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2099 reginfo.ganch = startpos + prog->gofs;
2c296965 2100 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2101 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2102 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2103 && SvMAGIC(sv)
14befaf4
DM
2104 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2105 && mg->mg_len >= 0) {
3b0527fe 2106 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2107 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2108 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2109
bbe252da 2110 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2111 if (s > reginfo.ganch)
6eb5f6b9 2112 goto phooey;
58e23c8d 2113 s = reginfo.ganch - prog->gofs;
2c296965 2114 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2115 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2116 if (s < strbeg)
2117 goto phooey;
6eb5f6b9
JH
2118 }
2119 }
58e23c8d 2120 else if (data) {
70685ca0 2121 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2122 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2123 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2124
2125 } else { /* pos() not defined */
3b0527fe 2126 reginfo.ganch = strbeg;
2c296965
YO
2127 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2128 "GPOS: reginfo.ganch = strbeg\n"));
2129 }
6eb5f6b9 2130 }
288b8c02 2131 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2132 /* We have to be careful. If the previous successful match
2133 was from this regex we don't want a subsequent partially
2134 successful match to clobber the old results.
2135 So when we detect this possibility we add a swap buffer
2136 to the re, and switch the buffer each match. If we fail
2137 we switch it back, otherwise we leave it swapped.
2138 */
2139 swap = prog->offs;
2140 /* do we need a save destructor here for eval dies? */
2141 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2142 }
a0714e2c 2143 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2144 re_scream_pos_data d;
2145
2146 d.scream_olds = &scream_olds;
2147 d.scream_pos = &scream_pos;
288b8c02 2148 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2149 if (!s) {
a3621e74 2150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2151 goto phooey; /* not present */
3fa9c3d7 2152 }
6eb5f6b9
JH
2153 }
2154
1de06328 2155
6eb5f6b9
JH
2156
2157 /* Simplest case: anchored match need be tried only once. */
2158 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2159 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2160 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2161 goto got_it;
bbe252da
YO
2162 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2163 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2164 {
2165 char *end;
2166
2167 if (minlen)
2168 dontbother = minlen - 1;
1aa99e6b 2169 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2170 /* for multiline we only have to try after newlines */
33b8afdf 2171 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2172 /* because of the goto we can not easily reuse the macros for bifurcating the
2173 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2174 if (utf8_target) {
2175 if (s == startpos)
2176 goto after_try_utf8;
2177 while (1) {
2178 if (regtry(&reginfo, &s)) {
2179 goto got_it;
2180 }
2181 after_try_utf8:
2182 if (s > end) {
2183 goto phooey;
2184 }
2185 if (prog->extflags & RXf_USE_INTUIT) {
2186 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2187 if (!s) {
2188 goto phooey;
2189 }
2190 }
2191 else {
2192 s += UTF8SKIP(s);
2193 }
2194 }
2195 } /* end search for check string in unicode */
2196 else {
2197 if (s == startpos) {
2198 goto after_try_latin;
2199 }
2200 while (1) {
2201 if (regtry(&reginfo, &s)) {
2202 goto got_it;
2203 }
2204 after_try_latin:
2205 if (s > end) {
2206 goto phooey;
2207 }
2208 if (prog->extflags & RXf_USE_INTUIT) {
2209 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2210 if (!s) {
2211 goto phooey;
2212 }
2213 }
2214 else {
2215 s++;
2216 }
2217 }
2218 } /* end search for check string in latin*/
2219 } /* end search for check string */
2220 else { /* search for newline */
2221 if (s > startpos) {
2222 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2223 s--;
92f3d482 2224 }
21eede78
YO
2225 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2226 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2227 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2228 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2229 goto got_it;
2230 }
92f3d482
YO
2231 }
2232 } /* end search for newline */
2233 } /* end anchored/multiline check string search */
6eb5f6b9 2234 goto phooey;
bbe252da 2235 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2236 {
486ec47a 2237 /* the warning about reginfo.ganch being used without initialization
bbe252da 2238 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2239 and we only enter this block when the same bit is set. */
58e23c8d 2240 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2241
2242 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2243 goto got_it;
2244 goto phooey;
2245 }
2246
2247 /* Messy cases: unanchored match. */
bbe252da 2248 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2249 /* we have /x+whatever/ */
f2ed9b32 2250 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2251 char ch;
bf93d4cc
GS
2252#ifdef DEBUGGING
2253 int did_match = 0;
2254#endif
f2ed9b32
KW
2255 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2256 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2257 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2258
f2ed9b32 2259 if (utf8_target) {
4cadc6a9 2260 REXEC_FBC_SCAN(
6eb5f6b9 2261 if (*s == ch) {
a3621e74 2262 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2263 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2264 s += UTF8SKIP(s);
2265 while (s < strend && *s == ch)
2266 s += UTF8SKIP(s);
2267 }
4cadc6a9 2268 );
6eb5f6b9
JH
2269 }
2270 else {
4cadc6a9 2271 REXEC_FBC_SCAN(
6eb5f6b9 2272 if (*s == ch) {
a3621e74 2273 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2274 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2275 s++;
2276 while (s < strend && *s == ch)
2277 s++;
2278 }
4cadc6a9 2279 );
6eb5f6b9 2280 }
a3621e74 2281 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2282 PerlIO_printf(Perl_debug_log,
b7953727
JH
2283 "Did not find anchored character...\n")
2284 );
6eb5f6b9 2285 }
a0714e2c
SS
2286 else if (prog->anchored_substr != NULL
2287 || prog->anchored_utf8 != NULL
2288 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2289 && prog->float_max_offset < strend - s)) {
2290 SV *must;
2291 I32 back_max;
2292 I32 back_min;
2293 char *last;
6eb5f6b9 2294 char *last1; /* Last position checked before */
bf93d4cc
GS
2295#ifdef DEBUGGING
2296 int did_match = 0;
2297#endif
33b8afdf 2298 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2299 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2300 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2301 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2302 back_max = back_min = prog->anchored_offset;
2303 } else {
f2ed9b32
KW
2304 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2305 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2306 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2307 back_max = prog->float_max_offset;
2308 back_min = prog->float_min_offset;
2309 }
1de06328
YO
2310
2311
33b8afdf
JH
2312 if (must == &PL_sv_undef)
2313 /* could not downgrade utf8 check substring, so must fail */
2314 goto phooey;
2315
1de06328
YO
2316 if (back_min<0) {
2317 last = strend;
2318 } else {
2319 last = HOP3c(strend, /* Cannot start after this */
2320 -(I32)(CHR_SVLEN(must)
2321 - (SvTAIL(must) != 0) + back_min), strbeg);
2322 }
6eb5f6b9
JH
2323 if (s > PL_bostr)
2324 last1 = HOPc(s, -1);
2325 else
2326 last1 = s - 1; /* bogus */
2327
a0288114 2328 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2329 check_substr==must. */
2330 scream_pos = -1;
2331 dontbother = end_shift;
2332 strend = HOPc(strend, -dontbother);
2333 while ( (s <= last) &&
c33e64f0 2334 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2335 (unsigned char*)strend, must,
c33e64f0 2336 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2337 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2338 if (HOPc(s, -back_max) > last1) {
2339 last1 = HOPc(s, -back_min);
2340 s = HOPc(s, -back_max);
2341 }
2342 else {
52657f30 2343 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2344
2345 last1 = HOPc(s, -back_min);
52657f30 2346 s = t;
6eb5f6b9 2347 }
f2ed9b32 2348 if (utf8_target) {
6eb5f6b9 2349 while (s <= last1) {
24b23f37 2350 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2351 goto got_it;
2352 s += UTF8SKIP(s);
2353 }
2354 }
2355 else {
2356 while (s <= last1) {
24b23f37 2357 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2358 goto got_it;
2359 s++;
2360 }
2361 }
2362 }
ab3bbdeb 2363 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2364 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2365 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2366 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2367 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2368 ? "anchored" : "floating"),
ab3bbdeb
YO
2369 quoted, RE_SV_TAIL(must));
2370 });
6eb5f6b9
JH
2371 goto phooey;
2372 }
f8fc2ecf 2373 else if ( (c = progi->regstclass) ) {
f14c76ed 2374 if (minlen) {
f8fc2ecf 2375 const OPCODE op = OP(progi->regstclass);
66e933ab 2376 /* don't bother with what can't match */
786e8c11 2377 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2378 strend = HOPc(strend, -(minlen - 1));
2379 }
a3621e74 2380 DEBUG_EXECUTE_r({
be8e71aa 2381 SV * const prop = sv_newmortal();
32fc9b6a 2382 regprop(prog, prop, c);
0df25f3d 2383 {
f2ed9b32 2384 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2385 s,strend-s,60);
0df25f3d 2386 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2387 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2388 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2389 quoted, (int)(strend - s));
0df25f3d 2390 }
ffc61ed2 2391 });
3b0527fe 2392 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2393 goto got_it;
07be1b83 2394 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2395 }
2396 else {
2397 dontbother = 0;
a0714e2c 2398 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2399 /* Trim the end. */
6af40bd7 2400 char *last= NULL;
33b8afdf 2401 SV* float_real;
c33e64f0
FC
2402 STRLEN len;
2403 const char *little;
33b8afdf 2404
f2ed9b32
KW
2405 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2406 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2407 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2408
c33e64f0
FC
2409 little = SvPV_const(float_real, len);
2410 if (SvTAIL(float_real)) {
1a13b075
YO
2411 /* This means that float_real contains an artificial \n on the end
2412 * due to the presence of something like this: /foo$/
2413 * where we can match both "foo" and "foo\n" at the end of the string.
2414 * So we have to compare the end of the string first against the float_real
2415 * without the \n and then against the full float_real with the string.
2416 * We have to watch out for cases where the string might be smaller
2417 * than the float_real or the float_real without the \n.
2418 */
2419 char *checkpos= strend - len;
2420 DEBUG_OPTIMISE_r(
2421 PerlIO_printf(Perl_debug_log,
2422 "%sChecking for float_real.%s\n",
2423 PL_colors[4], PL_colors[5]));
2424 if (checkpos + 1 < strbeg) {
2425 /* can't match, even if we remove the trailing \n string is too short to match */
2426 DEBUG_EXECUTE_r(
2427 PerlIO_printf(Perl_debug_log,
2428 "%sString shorter than required trailing substring, cannot match.%s\n",
2429 PL_colors[4], PL_colors[5]));
2430 goto phooey;
2431 } else if (memEQ(checkpos + 1, little, len - 1)) {
2432 /* can match, the end of the string matches without the "\n" */
2433 last = checkpos + 1;
2434 } else if (checkpos < strbeg) {
2435 /* cant match, string is too short when the "\n" is included */
2436 DEBUG_EXECUTE_r(
2437 PerlIO_printf(Perl_debug_log,
2438 "%sString does not contain required trailing substring, cannot match.%s\n",
2439 PL_colors[4], PL_colors[5]));
2440 goto phooey;
2441 } else if (!multiline) {
2442 /* non multiline match, so compare with the "\n" at the end of the string */
2443 if (memEQ(checkpos, little, len)) {
2444 last= checkpos;
2445 } else {
2446 DEBUG_EXECUTE_r(
2447 PerlIO_printf(Perl_debug_log,
2448 "%sString does not contain required trailing substring, cannot match.%s\n",
2449 PL_colors[4], PL_colors[5]));
2450 goto phooey;
2451 }
2452 } else {
2453 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2454 goto find_last;
1a13b075 2455 }
c33e64f0 2456 } else {
d6a28714 2457 find_last:
9041c2e3 2458 if (len)
d6a28714 2459 last = rninstr(s, strend, little, little + len);
b8c5462f 2460 else
a0288114 2461 last = strend; /* matching "$" */
b8c5462f 2462 }
6af40bd7
YO
2463 if (!last) {
2464 /* at one point this block contained a comment which was probably
2465 * incorrect, which said that this was a "should not happen" case.
2466 * Even if it was true when it was written I am pretty sure it is
2467 * not anymore, so I have removed the comment and replaced it with
2468 * this one. Yves */
6bda09f9
YO
2469 DEBUG_EXECUTE_r(
2470 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2471 "String does not contain required substring, cannot match.\n"
2472 ));
2473 goto phooey;
bf93d4cc 2474 }
d6a28714
JH
2475 dontbother = strend - last + prog->float_min_offset;
2476 }
2477 if (minlen && (dontbother < minlen))
2478 dontbother = minlen - 1;
2479 strend -= dontbother; /* this one's always in bytes! */
2480 /* We don't know much -- general case. */
f2ed9b32 2481 if (utf8_target) {
d6a28714 2482 for (;;) {
24b23f37 2483 if (regtry(&reginfo, &s))
d6a28714
JH
2484 goto got_it;
2485 if (s >= strend)
2486 break;
b8c5462f 2487 s += UTF8SKIP(s);
d6a28714
JH
2488 };
2489 }
2490 else {
2491 do {
24b23f37 2492 if (regtry(&reginfo, &s))
d6a28714
JH
2493 goto got_it;
2494 } while (s++ < strend);
2495 }
2496 }
2497
2498 /* Failure. */
2499 goto phooey;
2500
2501got_it:
e9105d30 2502 Safefree(swap);
288b8c02 2503 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2504
ed301438 2505 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2506 restore_pos(aTHX_ prog);
5daac39c
NC
2507 if (RXp_PAREN_NAMES(prog))
2508 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2509
2510 /* make sure $`, $&, $', and $digit will work later */
2511 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2512 RX_MATCH_COPY_FREE(rx);
d6a28714 2513 if (flags & REXEC_COPY_STR) {
be8e71aa 2514 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2515#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2516 if ((SvIsCOW(sv)
2517 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2518 if (DEBUG_C_TEST) {
2519 PerlIO_printf(Perl_debug_log,
2520 "Copy on write: regexp capture, type %d\n",
2521 (int) SvTYPE(sv));
2522 }
2523 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2524 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2525 assert (SvPOKp(prog->saved_copy));
2526 } else
2527#endif
2528 {
288b8c02 2529 RX_MATCH_COPIED_on(rx);
ed252734
NC
2530 s = savepvn(strbeg, i);
2531 prog->subbeg = s;
2532 }
d6a28714 2533 prog->sublen = i;
d6a28714
JH
2534 }
2535 else {
2536 prog->subbeg = strbeg;
2537 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2538 }
2539 }
9041c2e3 2540
d6a28714
JH
2541 return 1;
2542
2543phooey:
a3621e74 2544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2545 PL_colors[4], PL_colors[5]));
ed301438 2546 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2547 restore_pos(aTHX_ prog);
e9105d30 2548 if (swap) {
c74340f9 2549 /* we failed :-( roll it back */
e9105d30
GG
2550 Safefree(prog->offs);
2551 prog->offs = swap;
2552 }
2553
d6a28714
JH
2554 return 0;
2555}
2556
6bda09f9 2557
d6a28714
JH
2558/*
2559 - regtry - try match at specific point
2560 */
2561STATIC I32 /* 0 failure, 1 success */
24b23f37 2562S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2563{
97aff369 2564 dVAR;
d6a28714 2565 CHECKPOINT lastcp;
288b8c02
NC
2566 REGEXP *const rx = reginfo->prog;
2567 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2568 RXi_GET_DECL(prog,progi);
a3621e74 2569 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2570
2571 PERL_ARGS_ASSERT_REGTRY;
2572
24b23f37 2573 reginfo->cutpoint=NULL;
d6a28714 2574
ed301438
DM
2575 if ((prog->extflags & RXf_EVAL_SEEN)
2576 && !PL_reg_state.re_state_eval_setup_done)
2577 {
d6a28714
JH
2578 MAGIC *mg;
2579
ed301438 2580 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2581 if (reginfo->sv) {
d6a28714 2582 /* Make $_ available to executed code. */
3b0527fe 2583 if (reginfo->sv != DEFSV) {
59f00321 2584 SAVE_DEFSV;
414bf5ae 2585 DEFSV_set(reginfo->sv);
b8c5462f 2586 }
d6a28714 2587
3b0527fe
DM
2588 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2589 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2590 /* prepare for quick setting of pos */
d300d9fa 2591#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2592 if (SvIsCOW(reginfo->sv))
2593 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2594#endif
3dab1dad 2595 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2596 &PL_vtbl_mglob, NULL, 0);
d6a28714 2597 mg->mg_len = -1;
b8c5462f 2598 }
d6a28714
JH
2599 PL_reg_magic = mg;
2600 PL_reg_oldpos = mg->mg_len;
4f639d21 2601 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2602 }
09687e5a 2603 if (!PL_reg_curpm) {
a02a5408 2604 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2605#ifdef USE_ITHREADS
2606 {
14a49a24 2607 SV* const repointer = &PL_sv_undef;
92313705
NC
2608 /* this regexp is also owned by the new PL_reg_curpm, which
2609 will try to free it. */
d2ece331 2610 av_push(PL_regex_padav, repointer);
09687e5a
AB
2611 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2612 PL_regex_pad = AvARRAY(PL_regex_padav);
2613 }
2614#endif
2615 }
86c29d75
NC
2616#ifdef USE_ITHREADS
2617 /* It seems that non-ithreads works both with and without this code.
2618 So for efficiency reasons it seems best not to have the code
2619 compiled when it is not needed. */
92313705
NC
2620 /* This is safe against NULLs: */
2621 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2622 /* PM_reg_curpm owns a reference to this regexp. */
cd733337 2623 (void)ReREFCNT_inc(rx);
86c29d75 2624#endif
288b8c02 2625 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2626 PL_reg_oldcurpm = PL_curpm;
2627 PL_curpm = PL_reg_curpm;
07bc277f 2628 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2629 /* Here is a serious problem: we cannot rewrite subbeg,
2630 since it may be needed if this match fails. Thus
2631 $` inside (?{}) could fail... */
2632 PL_reg_oldsaved = prog->subbeg;
2633 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2634#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2635 PL_nrs = prog->saved_copy;
2636#endif
07bc277f 2637 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2638 }
2639 else
bd61b366 2640 PL_reg_oldsaved = NULL;
d6a28714
JH
2641 prog->subbeg = PL_bostr;
2642 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2643 }
24b23f37 2644 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2645 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2646 PL_reginput = *startpos;
d6a28714 2647 prog->lastparen = 0;
03994de8 2648 prog->lastcloseparen = 0;
d6a28714 2649 PL_regsize = 0;
d6a28714
JH
2650
2651 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2652 to do this again and again, prog->lastparen should take care of
3dd2943c 2653 this! --ilya*/
dafc8851
JH
2654
2655 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2656 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2657 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2658 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2659 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2660 * Meanwhile, this code *is* needed for the
daf18116
JH
2661 * above-mentioned test suite tests to succeed. The common theme
2662 * on those tests seems to be returning null fields from matches.
225593e1 2663 * --jhi updated by dapm */
dafc8851 2664#if 1
d6a28714 2665 if (prog->nparens) {
b93070ed 2666 regexp_paren_pair *pp = prog->offs;
097eb12c 2667 register I32 i;
b93070ed 2668 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2669 ++pp;
2670 pp->start = -1;
2671 pp->end = -1;
d6a28714
JH
2672 }
2673 }
dafc8851 2674#endif
02db2b7b 2675 REGCP_SET(lastcp);
f8fc2ecf 2676 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2677 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2678 return 1;
2679 }
24b23f37
YO
2680 if (reginfo->cutpoint)
2681 *startpos= reginfo->cutpoint;
02db2b7b 2682 REGCP_UNWIND(lastcp);
d6a28714
JH
2683 return 0;
2684}
2685
02db2b7b 2686
8ba1375e
MJD
2687#define sayYES goto yes
2688#define sayNO goto no
262b90c4 2689#define sayNO_SILENT goto no_silent
8ba1375e 2690
f9f4320a
YO
2691/* we dont use STMT_START/END here because it leads to
2692 "unreachable code" warnings, which are bogus, but distracting. */
2693#define CACHEsayNO \
c476f425
DM
2694 if (ST.cache_mask) \
2695 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2696 sayNO
3298f257 2697
a3621e74 2698/* this is used to determine how far from the left messages like
265c4333
YO
2699 'failed...' are printed. It should be set such that messages
2700 are inline with the regop output that created them.
a3621e74 2701*/
265c4333 2702#define REPORT_CODE_OFF 32
a3621e74
YO
2703
2704
40a82448
DM
2705#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2706#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2707
86545054
DM
2708#define SLAB_FIRST(s) (&(s)->states[0])
2709#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2710
5d9a96ca
DM
2711/* grab a new slab and return the first slot in it */
2712
2713STATIC regmatch_state *
2714S_push_slab(pTHX)
2715{
a35a87e7 2716#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2717 dMY_CXT;
2718#endif
5d9a96ca
DM
2719 regmatch_slab *s = PL_regmatch_slab->next;
2720 if (!s) {
2721 Newx(s, 1, regmatch_slab);
2722 s->prev = PL_regmatch_slab;
2723 s->next = NULL;
2724 PL_regmatch_slab->next = s;
2725 }
2726 PL_regmatch_slab = s;
86545054 2727 return SLAB_FIRST(s);
5d9a96ca 2728}
5b47454d 2729
95b24440 2730
40a82448
DM
2731/* push a new state then goto it */
2732
2733#define PUSH_STATE_GOTO(state, node) \
2734 scan = node; \
2735 st->resume_state = state; \
2736 goto push_state;
2737
2738/* push a new state with success backtracking, then goto it */
2739
2740#define PUSH_YES_STATE_GOTO(state, node) \
2741 scan = node; \
2742 st->resume_state = state; \
2743 goto push_yes_state;
2744
aa283a38 2745
aa283a38 2746
d6a28714 2747/*
95b24440 2748
bf1f174e
DM
2749regmatch() - main matching routine
2750
2751This is basically one big switch statement in a loop. We execute an op,
2752set 'next' to point the next op, and continue. If we come to a point which
2753we may need to backtrack to on failure such as (A|B|C), we push a
2754backtrack state onto the backtrack stack. On failure, we pop the top
2755state, and re-enter the loop at the state indicated. If there are no more
2756states to pop, we return failure.
2757
2758Sometimes we also need to backtrack on success; for example /A+/, where
2759after successfully matching one A, we need to go back and try to
2760match another one; similarly for lookahead assertions: if the assertion
2761completes successfully, we backtrack to the state just before the assertion
2762and then carry on. In these cases, the pushed state is marked as
2763'backtrack on success too'. This marking is in fact done by a chain of
2764pointers, each pointing to the previous 'yes' state. On success, we pop to
2765the nearest yes state, discarding any intermediate failure-only states.
2766Sometimes a yes state is pushed just to force some cleanup code to be
2767called at the end of a successful match or submatch; e.g. (??{$re}) uses
2768it to free the inner regex.
2769
2770Note that failure backtracking rewinds the cursor position, while
2771success backtracking leaves it alone.
2772
2773A pattern is complete when the END op is executed, while a subpattern
2774such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2775ops trigger the "pop to last yes state if any, otherwise return true"
2776behaviour.
2777
2778A common convention in this function is to use A and B to refer to the two
2779subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2780the subpattern to be matched possibly multiple times, while B is the entire
2781rest of the pattern. Variable and state names reflect this convention.
2782
2783The states in the main switch are the union of ops and failure/success of
2784substates associated with with that op. For example, IFMATCH is the op
2785that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2786'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2787successfully matched A and IFMATCH_A_fail is a state saying that we have
2788just failed to match A. Resume states always come in pairs. The backtrack
2789state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2790at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2791on success or failure.
2792
2793The struct that holds a backtracking state is actually a big union, with
2794one variant for each major type of op. The variable st points to the
2795top-most backtrack struct. To make the code clearer, within each
2796block of code we #define ST to alias the relevant union.
2797
2798Here's a concrete example of a (vastly oversimplified) IFMATCH
2799implementation:
2800
2801 switch (state) {
2802 ....
2803
2804#define ST st->u.ifmatch
2805
2806 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2807 ST.foo = ...; // some state we wish to save
95b24440 2808 ...
bf1f174e
DM
2809 // push a yes backtrack state with a resume value of
2810 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2811 // first node of A:
2812 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2813 // NOTREACHED
2814
2815 case IFMATCH_A: // we have successfully executed A; now continue with B
2816 next = B;
2817 bar = ST.foo; // do something with the preserved value
2818 break;
2819
2820 case IFMATCH_A_fail: // A failed, so the assertion failed
2821 ...; // do some housekeeping, then ...
2822 sayNO; // propagate the failure
2823
2824#undef ST
95b24440 2825
bf1f174e
DM
2826 ...
2827 }
95b24440 2828
bf1f174e
DM
2829For any old-timers reading this who are familiar with the old recursive
2830approach, the code above is equivalent to:
95b24440 2831
bf1f174e
DM
2832 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2833 {
2834 int foo = ...
95b24440 2835 ...
bf1f174e
DM
2836 if (regmatch(A)) {
2837 next = B;
2838 bar = foo;
2839 break;
95b24440 2840 }
bf1f174e
DM
2841 ...; // do some housekeeping, then ...
2842 sayNO; // propagate the failure
95b24440 2843 }
bf1f174e
DM
2844
2845The topmost backtrack state, pointed to by st, is usually free. If you
2846want to claim it, populate any ST.foo fields in it with values you wish to
2847save, then do one of
2848
2849 PUSH_STATE_GOTO(resume_state, node);
2850 PUSH_YES_STATE_GOTO(resume_state, node);
2851
2852which sets that backtrack state's resume value to 'resume_state', pushes a
2853new free entry to the top of the backtrack stack, then goes to 'node'.
2854On backtracking, the free slot is popped, and the saved state becomes the
2855new free state. An ST.foo field in this new top state can be temporarily
2856accessed to retrieve values, but once the main loop is re-entered, it
2857becomes available for reuse.
2858
2859Note that the depth of the backtrack stack constantly increases during the
2860left-to-right execution of the pattern, rather than going up and down with
2861the pattern nesting. For example the stack is at its maximum at Z at the
2862end of the pattern, rather than at X in the following:
2863
2864 /(((X)+)+)+....(Y)+....Z/
2865
2866The only exceptions to this are lookahead/behind assertions and the cut,
2867(?>A), which pop all the backtrack states associated with A before
2868continuing.
2869
486ec47a 2870Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2871PL_regmatch_state and st always point to the currently active state,
2872and PL_regmatch_slab points to the slab currently containing
2873PL_regmatch_state. The first time regmatch() is called, the first slab is
2874allocated, and is never freed until interpreter destruction. When the slab
2875is full, a new one is allocated and chained to the end. At exit from
2876regmatch(), slabs allocated since entry are freed.
2877
2878*/
95b24440 2879
40a82448 2880
5bc10b2c 2881#define DEBUG_STATE_pp(pp) \
265c4333 2882 DEBUG_STATE_r({ \
f2ed9b32 2883 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2884 PerlIO_printf(Perl_debug_log, \
5d458dd8 2885 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2886 depth*2, "", \
13d6edb4 2887 PL_reg_name[st->resume_state], \
5d458dd8
YO
2888 ((st==yes_state||st==mark_state) ? "[" : ""), \
2889 ((st==yes_state) ? "Y" : ""), \
2890 ((st==mark_state) ? "M" : ""), \
2891 ((st==yes_state||st==mark_state) ? "]" : "") \
2892 ); \
265c4333 2893 });
5bc10b2c 2894
40a82448 2895
3dab1dad 2896#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2897
3df15adc 2898#ifdef DEBUGGING
5bc10b2c 2899
ab3bbdeb 2900STATIC void
f2ed9b32 2901S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2902 const char *start, const char *end, const char *blurb)
2903{
efd26800 2904 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2905
2906 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2907
ab3bbdeb
YO
2908 if (!PL_colorset)
2909 reginitcolors();
2910 {
2911 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2912 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2913
f2ed9b32 2914 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2915 start, end - start, 60);
2916
2917 PerlIO_printf(Perl_debug_log,
2918 "%s%s REx%s %s against %s\n",
2919 PL_colors[4], blurb, PL_colors[5], s0, s1);
2920
f2ed9b32 2921 if (utf8_target||utf8_pat)
1de06328
YO
2922 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2923 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2924 utf8_pat && utf8_target ? " and " : "",
2925 utf8_target ? "string" : ""
ab3bbdeb
YO
2926 );
2927 }
2928}
3df15adc
YO
2929
2930STATIC void
786e8c11
YO
2931S_dump_exec_pos(pTHX_ const char *locinput,
2932 const regnode *scan,
2933 const char *loc_regeol,
2934 const char *loc_bostr,
2935 const char *loc_reg_starttry,
f2ed9b32 2936 const bool utf8_target)
07be1b83 2937{
786e8c11 2938 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2939 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2940 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2941 /* The part of the string before starttry has one color
2942 (pref0_len chars), between starttry and current
2943 position another one (pref_len - pref0_len chars),
2944 after the current position the third one.
2945 We assume that pref0_len <= pref_len, otherwise we
2946 decrease pref0_len. */
786e8c11
YO
2947 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2948 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2949 int pref0_len;
2950
7918f24d
NC
2951 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2952
f2ed9b32 2953 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2954 pref_len++;
786e8c11
YO
2955 pref0_len = pref_len - (locinput - loc_reg_starttry);
2956 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2957 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2958 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2959 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2960 l--;
2961 if (pref0_len < 0)
2962 pref0_len = 0;
2963 if (pref0_len > pref_len)
2964 pref0_len = pref_len;
2965 {
f2ed9b32 2966 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2967
ab3bbdeb 2968 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2969 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2970
ab3bbdeb 2971 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2972 (locinput - pref_len + pref0_len),
1de06328 2973 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2974
ab3bbdeb 2975 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2976 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2977
1de06328 2978 const STRLEN tlen=len0+len1+len2;
3df15adc 2979 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2980 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2981 (IV)(locinput - loc_bostr),
07be1b83 2982 len0, s0,
07be1b83 2983 len1, s1,
07be1b83 2984 (docolor ? "" : "> <"),
07be1b83 2985 len2, s2,
f9f4320a 2986 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2987 "");
2988 }
2989}
3df15adc 2990
07be1b83
YO
2991#endif
2992
0a4db386
YO
2993/* reg_check_named_buff_matched()
2994 * Checks to see if a named buffer has matched. The data array of
2995 * buffer numbers corresponding to the buffer is expected to reside
2996 * in the regexp->data->data array in the slot stored in the ARG() of
2997 * node involved. Note that this routine doesn't actually care about the
2998 * name, that information is not preserved from compilation to execution.
2999 * Returns the index of the leftmost defined buffer with the given name
3000 * or 0 if non of the buffers matched.
3001 */
3002STATIC I32
7918f24d
NC
3003S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3004{
0a4db386 3005 I32 n;
f8fc2ecf 3006 RXi_GET_DECL(rex,rexi);
ad64d0ec 3007 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3008 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3009
3010 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3011
0a4db386 3012 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3013 if ((I32)rex->lastparen >= nums[n] &&
3014 rex->offs[nums[n]].end != -1)
0a4db386
YO
3015 {
3016 return nums[n];
3017 }
3018 }
3019 return 0;
3020}
3021
2f554ef7
DM
3022
3023/* free all slabs above current one - called during LEAVE_SCOPE */
3024
3025STATIC void
3026S_clear_backtrack_stack(pTHX_ void *p)
3027{
3028 regmatch_slab *s = PL_regmatch_slab->next;
3029 PERL_UNUSED_ARG(p);
3030
3031 if (!s)
3032 return;
3033 PL_regmatch_slab->next = NULL;
3034 while (s) {
3035 regmatch_slab * const osl = s;
3036 s = s->next;
3037 Safefree(osl);
3038 }
3039}
3040
3041
28d8d7f4 3042#define SETREX(Re1,Re2) \
ed301438
DM
3043 if (PL_reg_state.re_state_eval_setup_done) \
3044 PM_SETRE((PL_reg_curpm), (Re2)); \
28d8d7f4
YO
3045 Re1 = (Re2)
3046
d6a28714 3047STATIC I32 /* 0 failure, 1 success */
24b23f37 3048S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3049{
a35a87e7 3050#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3051 dMY_CXT;
3052#endif
27da23d5 3053 dVAR;
f2ed9b32 3054 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3055 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3056 REGEXP *rex_sv = reginfo->prog;
3057 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3058 RXi_GET_DECL(rex,rexi);
2f554ef7 3059 I32 oldsave;
5d9a96ca
DM
3060 /* the current state. This is a cached copy of PL_regmatch_state */
3061 register regmatch_state *st;
5d9a96ca
DM
3062 /* cache heavy used fields of st in registers */
3063 register regnode *scan;
3064 register regnode *next;
438e9bae 3065 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3066 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3067 register char *locinput = PL_reginput;
5d9a96ca 3068 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3069
b69b0499 3070 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3071 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3072 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3073 const U32 max_nochange_depth =
3074 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3075 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3076 regmatch_state *yes_state = NULL; /* state to pop to on success of
3077 subpattern */
e2e6a0f1
YO
3078 /* mark_state piggy backs on the yes_state logic so that when we unwind
3079 the stack on success we can update the mark_state as we go */
3080 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3081 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3082 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3083 U32 state_num;
5d458dd8
YO
3084 bool no_final = 0; /* prevent failure from backtracking? */
3085 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3086 char *startpoint = PL_reginput;
5d458dd8
YO
3087 SV *popmark = NULL; /* are we looking for a mark? */
3088 SV *sv_commit = NULL; /* last mark name seen in failure */
3089 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3090 during a successful match */
5d458dd8
YO
3091 U32 lastopen = 0; /* last open we saw */
3092 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3093 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3094 /* these three flags are set by various ops to signal information to
3095 * the very next op. They have a useful lifetime of exactly one loop
3096 * iteration, and are not preserved or restored by state pushes/pops
3097 */
3098 bool sw = 0; /* the condition value in (?(cond)a|b) */
3099 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3100 int logical = 0; /* the following EVAL is:
3101 0: (?{...})
3102 1: (?(?{...})X|Y)
3103 2: (??{...})
3104 or the following IFMATCH/UNLESSM is:
3105 false: plain (?=foo)
3106 true: used as a condition: (?(?=foo))
3107 */
81ed78b2
DM
3108 PAD* last_pad = NULL;
3109 dMULTICALL;
3110 I32 gimme = G_SCALAR;
3111 CV *caller_cv = NULL; /* who called us */
3112 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3113
95b24440 3114#ifdef DEBUGGING
e68ec53f 3115 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3116#endif
3117
81ed78b2
DM
3118 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3119 multicall_oldcatch = 0;
3120 multicall_cv = NULL;
3121 cx = NULL;
3122
3123
7918f24d
NC
3124 PERL_ARGS_ASSERT_REGMATCH;
3125
3b57cd43 3126 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3127 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3128 }));
5d9a96ca
DM
3129 /* on first ever call to regmatch, allocate first slab */
3130 if (!PL_regmatch_slab) {
3131 Newx(PL_regmatch_slab, 1, regmatch_slab);
3132 PL_regmatch_slab->prev = NULL;
3133 PL_regmatch_slab->next = NULL;
86545054 3134 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3135 }
3136
2f554ef7
DM
3137 oldsave = PL_savestack_ix;
3138 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3139 SAVEVPTR(PL_regmatch_slab);
3140 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3141
3142 /* grab next free state slot */
3143 st = ++PL_regmatch_state;
86545054 3144 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3145 st = PL_regmatch_state = S_push_slab(aTHX);
3146
d6a28714
JH
3147 /* Note that nextchr is a byte even in UTF */
3148 nextchr = UCHARAT(locinput);
3149 scan = prog;
3150 while (scan != NULL) {
8ba1375e 3151
a3621e74 3152 DEBUG_EXECUTE_r( {
6136c704 3153 SV * const prop = sv_newmortal();
1de06328 3154 regnode *rnext=regnext(scan);
f2ed9b32 3155 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3156 regprop(rex, prop, scan);
07be1b83
YO
3157
3158 PerlIO_printf(Perl_debug_log,
3159 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3160 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3161 SvPVX_const(prop),
1de06328 3162 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3163 0 : (IV)(rnext - rexi->program));
2a782b5b 3164 });
d6a28714
JH
3165
3166 next = scan + NEXT_OFF(scan);
3167 if (next == scan)
3168 next = NULL;
40a82448 3169 state_num = OP(scan);
d6a28714 3170
40a82448 3171 reenter_switch:
34a81e2b 3172
40a82448 3173 switch (state_num) {
d6a28714 3174 case BOL:
7fba1cd6 3175 if (locinput == PL_bostr)
d6a28714 3176 {
3b0527fe 3177 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3178 break;
3179 }
d6a28714
JH
3180 sayNO;
3181 case MBOL:
12d33761
HS
3182 if (locinput == PL_bostr ||
3183 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3184 {
b8c5462f
JH
3185 break;
3186 }
d6a28714
JH
3187 sayNO;
3188 case SBOL:
c2a73568 3189 if (locinput == PL_bostr)
b8c5462f 3190 break;
d6a28714
JH
3191 sayNO;
3192 case GPOS:
3b0527fe 3193 if (locinput == reginfo->ganch)
d6a28714
JH
3194 break;
3195 sayNO;
ee9b8eae
YO
3196
3197 case KEEPS:
3198 /* update the startpoint */
b93070ed 3199 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3200 PL_reginput = locinput;
b93070ed 3201 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3202 PUSH_STATE_GOTO(KEEPS_next, next);
3203 /*NOT-REACHED*/
3204 case KEEPS_next_fail:
3205 /* rollback the start point change */
b93070ed 3206 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3207 sayNO_SILENT;
3208 /*NOT-REACHED*/
d6a28714 3209 case EOL:
d6a28714
JH
3210 goto seol;
3211 case MEOL:
d6a28714 3212 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3213 sayNO;
b8c5462f 3214 break;
d6a28714
JH
3215 case SEOL:
3216 seol:
3217 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3218 sayNO;
d6a28714 3219 if (PL_regeol - locinput > 1)
b8c5462f 3220 sayNO;
b8c5462f 3221 break;
d6a28714
JH
3222 case EOS:
3223 if (PL_regeol != locinput)
b8c5462f 3224 sayNO;
d6a28714 3225 break;
ffc61ed2 3226 case SANY:
d6a28714 3227 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3228 sayNO;
f2ed9b32 3229 if (utf8_target) {
f33976b4
DB
3230 locinput += PL_utf8skip[nextchr];
3231 if (locinput > PL_regeol)
3232 sayNO;
3233 nextchr = UCHARAT(locinput);
3234 }
3235 else
3236 nextchr = UCHARAT(++locinput);
3237 break;
3238 case CANY:
3239 if (!nextchr && locinput >= PL_regeol)
3240 sayNO;
b8c5462f 3241 nextchr = UCHARAT(++locinput);
a0d0e21e 3242 break;
ffc61ed2 3243 case REG_ANY:
1aa99e6b
IH
3244 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3245 sayNO;
f2ed9b32 3246 if (utf8_target) {
b8c5462f 3247 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3248 if (locinput > PL_regeol)
3249 sayNO;
a0ed51b3 3250 nextchr = UCHARAT(locinput);
a0ed51b3 3251 }
1aa99e6b
IH
3252 else
3253 nextchr = UCHARAT(++locinput);
a0ed51b3 3254 break;
166ba7cd
DM
3255
3256#undef ST
3257#define ST st->u.trie
786e8c11
YO
3258 case TRIEC:
3259 /* In this case the charclass data is available inline so
3260 we can fail fast without a lot of extra overhead.
3261 */
fab2782b
YO
3262 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3263 DEBUG_EXECUTE_r(
3264 PerlIO_printf(Perl_debug_log,
3265 "%*s %sfailed to match trie start class...%s\n",
3266 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3267 );
3268 sayNO_SILENT;
3269 /* NOTREACHED */
786e8c11
YO
3270 }
3271 /* FALL THROUGH */
5b47454d 3272 case TRIE:
2e64971a
DM
3273 /* the basic plan of execution of the trie is:
3274 * At the beginning, run though all the states, and
3275 * find the longest-matching word. Also remember the position
3276 * of the shortest matching word. For example, this pattern:
3277 * 1 2 3 4 5
3278 * ab|a|x|abcd|abc
3279 * when matched against the string "abcde", will generate
3280 * accept states for all words except 3, with the longest
3281 * matching word being 4, and the shortest being 1 (with
3282 * the position being after char 1 of the string).
3283 *
3284 * Then for each matching word, in word order (i.e. 1,2,4,5),
3285 * we run the remainder of the pattern; on each try setting
3286 * the current position to the character following the word,
3287 * returning to try the next word on failure.
3288 *
3289 * We avoid having to build a list of words at runtime by
3290 * using a compile-time structure, wordinfo[].prev, which
3291 * gives, for each word, the previous accepting word (if any).
3292 * In the case above it would contain the mappings 1->2, 2->0,
3293 * 3->0, 4->5, 5->1. We can use this table to generate, from
3294 * the longest word (4 above), a list of all words, by
3295 * following the list of prev pointers; this gives us the
3296 * unordered list 4,5,1,2. Then given the current word we have
3297 * just tried, we can go through the list and find the
3298 * next-biggest word to try (so if we just failed on word 2,
3299 * the next in the list is 4).
3300 *
3301 * Since at runtime we don't record the matching position in
3302 * the string for each word, we have to work that out for
3303 * each word we're about to process. The wordinfo table holds
3304 * the character length of each word; given that we recorded
3305 * at the start: the position of the shortest word and its
3306 * length in chars, we just need to move the pointer the
3307 * difference between the two char lengths. Depending on
3308 * Unicode status and folding, that's cheap or expensive.
3309 *
3310 * This algorithm is optimised for the case where are only a
3311 * small number of accept states, i.e. 0,1, or maybe 2.
3312 * With lots of accepts states, and having to try all of them,
3313 * it becomes quadratic on number of accept states to find all
3314 * the next words.
3315 */
3316
3dab1dad 3317 {
07be1b83 3318 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3319 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3320
3321 /* what trie are we using right now */
be8e71aa 3322 reg_trie_data * const trie
f8fc2ecf 3323 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3324 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3325 U32 state = trie->startstate;
166ba7cd 3326
fab2782b 3327 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3328 if (trie->states[ state ].wordnum) {
3329 DEBUG_EXECUTE_r(
3330 PerlIO_printf(Perl_debug_log,
3331 "%*s %smatched empty string...%s\n",
5bc10b2c 3332 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3333 );
20dbff7c
YO
3334 if (!trie->jump)
3335 break;
3dab1dad
YO
3336 } else {
3337 DEBUG_EXECUTE_r(
3338 PerlIO_printf(Perl_debug_log,
786e8c11 3339 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3340 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3341 );
3342 sayNO_SILENT;
3343 }
3344 }
166ba7cd 3345
786e8c11
YO
3346 {
3347 U8 *uc = ( U8* )locinput;
3348
3349 STRLEN len = 0;
3350 STRLEN foldlen = 0;
3351 U8 *uscan = (U8*)NULL;
786e8c11 3352 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3353 U32 charcount = 0; /* how many input chars we have matched */
3354 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3355
786e8c11
YO
3356 ST.B = next;
3357 ST.jump = trie->jump;
786e8c11 3358 ST.me = scan;
2e64971a
DM
3359 ST.firstpos = NULL;
3360 ST.longfold = FALSE; /* char longer if folded => it's harder */
3361 ST.nextword = 0;
3362
3363 /* fully traverse the TRIE; note the position of the
3364 shortest accept state and the wordnum of the longest
3365 accept state */
07be1b83 3366
a3621e74 3367 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3368 U32 base = trie->states[ state ].trans.base;
f9f4320a 3369 UV uvc = 0;
acb909b4 3370 U16 charid = 0;
2e64971a
DM
3371 U16 wordnum;
3372 wordnum = trie->states[ state ].wordnum;
3373
3374 if (wordnum) { /* it's an accept state */
3375 if (!accepted) {
3376 accepted = 1;
3377 /* record first match position */
3378 if (ST.longfold) {
3379 ST.firstpos = (U8*)locinput;
3380 ST.firstchars = 0;
5b47454d 3381 }
2e64971a
DM
3382 else {
3383 ST.firstpos = uc;
3384 ST.firstchars = charcount;
3385 }
3386 }
3387 if (!ST.nextword || wordnum < ST.nextword)
3388 ST.nextword = wordnum;
3389 ST.topword = wordnum;
786e8c11 3390 }
a3621e74 3391
07be1b83 3392 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3393 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3394 PerlIO_printf( Perl_debug_log,
2e64971a 3395 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3396 2+depth * 2, "", PL_colors[4],
2e64971a 3397 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3398 });
a3621e74 3399
2e64971a 3400 /* read a char and goto next state */
a3621e74 3401 if ( base ) {
6dd2be57 3402 I32 offset;
55eed653
NC
3403 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3404 uscan, len, uvc, charid, foldlen,
3405 foldbuf, uniflags);
2e64971a
DM