This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make is_bare_re bool. not int in re_op_compile
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
7e2509c1
KW
94/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
95 * call if there are no complications: i.e., if everything matchable is
96 * straight forward in the bitmap */
af364d03
KW
97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
98 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 99
c277df42
IZ
100/*
101 * Forwards.
102 */
103
f2ed9b32 104#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 105#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 106
3dab1dad
YO
107#define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
52657f30 109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
110 : (U8*)(pos + off))
111#define HOPBACKc(pos, off) \
07be1b83
YO
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
8e11feef 115 ? (U8*)pos - off \
3dab1dad 116 : NULL)
efb30f32 117
e7409c1b 118#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 119#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 120
20d0b1e9 121/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
122#ifdef EBCDIC
123 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
124 * skip the check on EBCDIC platforms */
125# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
126#else
127# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
128 if (!CAT2(PL_utf8_,class)) { \
129 bool ok; \
130 ENTER; save_re_context(); \
131 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 133#endif
37e2e78e
KW
134
135/* Doesn't do an assert to verify that is correct */
136#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 137 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 138 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
139 ENTER; save_re_context(); \
140 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 LEAVE; } } STMT_END
37e2e78e 142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
144#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
145#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 146
37e2e78e 147#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
148 LOAD_UTF8_CHARCLASS(X_begin, " "); \
149 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
150 /* These are utf8 constants, and not utf-ebcdic constants, so the \
151 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 152 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
153 \
154 /* No asserts are done for these, in case called on an early \
155 * Unicode version in which they map to nothing */ \
d275fa5e
CB
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 163
1dcf4a1b 164#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 165
ee9a90b8
KW
166/* The actual code for CCC_TRY, which uses several variables from the routine
167 * it's callable from. It is designed to be the bulk of a case statement.
168 * FUNC is the macro or function to call on non-utf8 targets that indicate if
169 * nextchr matches the class.
170 * UTF8_TEST is the whole test string to use for utf8 targets
171 * LOAD is what to use to test, and if not present to load in the swash for the
172 * class
173 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
174 * UTF8_TEST test.
175 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
176 * utf8 and a variant, load the swash if necessary and test using the utf8
177 * test. Advance to the next character if test is ok, otherwise fail; If not
178 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
179 * fails, or advance to the next character */
180
181#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
182 if (locinput >= PL_regeol) { \
183 sayNO; \
184 } \
185 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
186 LOAD_UTF8_CHARCLASS(CLASS, STR); \
187 if (POS_OR_NEG (UTF8_TEST)) { \
188 sayNO; \
189 } \
190 locinput += PL_utf8skip[nextchr]; \
191 nextchr = UCHARAT(locinput); \
192 break; \
193 } \
194 if (POS_OR_NEG (FUNC(nextchr))) { \
195 sayNO; \
196 } \
197 nextchr = UCHARAT(++locinput); \
980866de
KW
198 break;
199
ee9a90b8
KW
200/* Handle the non-locale cases for a character class and its complement. It
201 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
202 * This is because that code fails when the test succeeds, so we want to have
203 * the test fail so that the code succeeds. The swash is stored in a
204 * predictable PL_ place */
cfaf538b
KW
205#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
206 CLASS, STR) \
ee9a90b8
KW
207 case NAME: \
208 _CCC_TRY_CODE( !, FUNC, \
209 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
210 (U8*)locinput, TRUE)), \
211 CLASS, STR) \
212 case NNAME: \
1dcf4a1b 213 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
214 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
215 (U8*)locinput, TRUE)), \
216 CLASS, STR) \
217
218/* Generate the case statements for both locale and non-locale character
219 * classes in regmatch for classes that don't have special unicode semantics.
220 * Locales don't use an immediate swash, but an intermediary special locale
221 * function that is called on the pointer to the current place in the input
222 * string. That function will resolve to needing the same swash. One might
223 * think that because we don't know what the locale will match, we shouldn't
224 * check with the swash loading function that it loaded properly; ie, that we
225 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
226 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 * irrelevant here */
228#define CCC_TRY(NAME, NNAME, FUNC, \
229 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 230 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
231 CLASS, STR) \
232 case NAMEL: \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 case NNAMEL: \
236 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
237 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
238 CLASS, STR) \
cfaf538b
KW
239 case NAMEA: \
240 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
244 nextchr = UCHARAT(++locinput); \
245 break; \
246 case NNAMEA: \
247 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 sayNO; \
249 } \
250 if (utf8_target) { \
251 locinput += PL_utf8skip[nextchr]; \
252 nextchr = UCHARAT(locinput); \
253 } \
254 else { \
255 nextchr = UCHARAT(++locinput); \
256 } \
257 break; \
ee9a90b8
KW
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260
261/* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263#define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
cfaf538b 266 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 267 CLASS, STR) \
cfaf538b
KW
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
271 CLASS, STR) \
ee9a90b8 272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 273
3dab1dad
YO
274/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275
5f80c4cf 276/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
277/* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 *
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
285*/
3e901dc0
YO
286#define JUMPABLE(rn) ( \
287 OP(rn) == OPEN || \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == EVAL || \
cca55fe3
JP
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 292 OP(rn) == KEEPS || \
3dab1dad 293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 294)
ee9b8eae 295#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 296
ee9b8eae
YO
297#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298
299#if 0
300/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 303#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305
306#else
307/* ... so we use this as its faster. */
308#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 309#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
310#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
312
313#endif
e2d8ce26 314
a84d97b6
HS
315/*
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318*/
cca55fe3 319#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 323 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 324 else if (type == PLUS) \
cca55fe3 325 rn = NEXTOPER(rn); \
3dab1dad 326 else if (type == IFMATCH) \
a84d97b6 327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 328 else rn += NEXT_OFF(rn); \
3dab1dad 329 } \
5f80c4cf 330} STMT_END
74750237 331
c476f425 332
acfe0abc 333static void restore_pos(pTHX_ void *arg);
51371543 334
620d5b66
NC
335#define REGCP_PAREN_ELEMS 4
336#define REGCP_OTHER_ELEMS 5
e0fa7e2b 337#define REGCP_FRAME_ELEMS 1
620d5b66
NC
338/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
339 * are needed for the regexp context stack bookkeeping. */
340
76e3520e 341STATIC CHECKPOINT
cea2e8a9 342S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 343{
97aff369 344 dVAR;
a3b680e6 345 const int retval = PL_savestack_ix;
a3b680e6 346 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
347 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
348 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 349 int p;
40a82448 350 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 351
e49a9654 352 if (paren_elems_to_push < 0)
5637ef5b
NC
353 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
354 paren_elems_to_push);
e49a9654 355
e0fa7e2b
NC
356 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
357 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
358 " out of range (%lu-%ld)",
359 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 360
620d5b66 361 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 362
3280af22 363 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 364/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
365 SSPUSHINT(PL_regoffs[p].end);
366 SSPUSHINT(PL_regoffs[p].start);
3280af22 367 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 368 SSPUSHINT(p);
e7707071 369 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 370 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 371 (UV)p, (IV)PL_regoffs[p].start,
40a82448 372 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 373 (IV)PL_regoffs[p].end
40a82448 374 ));
a0d0e21e 375 }
b1ce53c5 376/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 377 SSPUSHPTR(PL_regoffs);
3280af22
NIS
378 SSPUSHINT(PL_regsize);
379 SSPUSHINT(*PL_reglastparen);
a01268b5 380 SSPUSHINT(*PL_reglastcloseparen);
3280af22 381 SSPUSHPTR(PL_reginput);
e0fa7e2b 382 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 383
a0d0e21e
LW
384 return retval;
385}
386
c277df42 387/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
388#define REGCP_SET(cp) \
389 DEBUG_STATE_r( \
ab3bbdeb 390 PerlIO_printf(Perl_debug_log, \
e4f74956 391 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
392 (IV)PL_savestack_ix)); \
393 cp = PL_savestack_ix
c3464db5 394
ab3bbdeb 395#define REGCP_UNWIND(cp) \
e4f74956 396 DEBUG_STATE_r( \
ab3bbdeb 397 if (cp != PL_savestack_ix) \
e4f74956
YO
398 PerlIO_printf(Perl_debug_log, \
399 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
400 (IV)(cp), (IV)PL_savestack_ix)); \
401 regcpblow(cp)
c277df42 402
76e3520e 403STATIC char *
097eb12c 404S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 405{
97aff369 406 dVAR;
e0fa7e2b 407 UV i;
a0d0e21e 408 char *input;
a3621e74
YO
409 GET_RE_DEBUG_FLAGS_DECL;
410
7918f24d
NC
411 PERL_ARGS_ASSERT_REGCPPOP;
412
b1ce53c5 413 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 414 i = SSPOPUV;
e0fa7e2b
NC
415 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
416 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 417 input = (char *) SSPOPPTR;
a01268b5 418 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
419 *PL_reglastparen = SSPOPINT;
420 PL_regsize = SSPOPINT;
f0ab9afb 421 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 422
620d5b66 423 i -= REGCP_OTHER_ELEMS;
b1ce53c5 424 /* Now restore the parentheses context. */
620d5b66 425 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 426 I32 tmps;
097eb12c 427 U32 paren = (U32)SSPOPINT;
3280af22 428 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 429 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 430 tmps = SSPOPINT;
3280af22 431 if (paren <= *PL_reglastparen)
f0ab9afb 432 PL_regoffs[paren].end = tmps;
e7707071 433 DEBUG_BUFFERS_r(
c3464db5 434 PerlIO_printf(Perl_debug_log,
b900a521 435 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 436 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 437 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 438 (IV)PL_regoffs[paren].end,
3280af22 439 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 440 );
a0d0e21e 441 }
e7707071 442 DEBUG_BUFFERS_r(
bb7a0f54 443 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 444 PerlIO_printf(Perl_debug_log,
faccc32b 445 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 446 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
447 }
448 );
daf18116 449#if 1
dafc8851
JH
450 /* It would seem that the similar code in regtry()
451 * already takes care of this, and in fact it is in
452 * a better location to since this code can #if 0-ed out
453 * but the code in regtry() is needed or otherwise tests
454 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
455 * (as of patchlevel 7877) will fail. Then again,
456 * this code seems to be necessary or otherwise
225593e1
DM
457 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
458 * --jhi updated by dapm */
3b6647e0 459 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 460 if (i > PL_regsize)
f0ab9afb
NC
461 PL_regoffs[i].start = -1;
462 PL_regoffs[i].end = -1;
a0d0e21e 463 }
dafc8851 464#endif
a0d0e21e
LW
465 return input;
466}
467
02db2b7b 468#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 469
a687059c 470/*
e50aee73 471 * pregexec and friends
a687059c
LW
472 */
473
76234dfb 474#ifndef PERL_IN_XSUB_RE
a687059c 475/*
c277df42 476 - pregexec - match a regexp against a string
a687059c 477 */
c277df42 478I32
49d7dfbc 479Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 480 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
481/* strend: pointer to null at end of string */
482/* strbeg: real beginning of string */
483/* minend: end of match must be >=minend after stringarg. */
484/* nosave: For optimizations. */
485{
7918f24d
NC
486 PERL_ARGS_ASSERT_PREGEXEC;
487
c277df42 488 return
9041c2e3 489 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
490 nosave ? 0 : REXEC_COPY_STR);
491}
76234dfb 492#endif
22e551b9 493
9041c2e3 494/*
cad2e5aa
JH
495 * Need to implement the following flags for reg_anch:
496 *
497 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
498 * USE_INTUIT_ML
499 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
500 * INTUIT_AUTORITATIVE_ML
501 * INTUIT_ONCE_NOML - Intuit can match in one location only.
502 * INTUIT_ONCE_ML
503 *
504 * Another flag for this function: SECOND_TIME (so that float substrs
505 * with giant delta may be not rechecked).
506 */
507
508/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
509
3f7c398e 510/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
511 Otherwise, only SvCUR(sv) is used to get strbeg. */
512
513/* XXXX We assume that strpos is strbeg unless sv. */
514
6eb5f6b9
JH
515/* XXXX Some places assume that there is a fixed substring.
516 An update may be needed if optimizer marks as "INTUITable"
517 RExen without fixed substrings. Similarly, it is assumed that
518 lengths of all the strings are no more than minlen, thus they
519 cannot come from lookahead.
40d049e4
YO
520 (Or minlen should take into account lookahead.)
521 NOTE: Some of this comment is not correct. minlen does now take account
522 of lookahead/behind. Further research is required. -- demerphq
523
524*/
6eb5f6b9 525
2c2d71f5
JH
526/* A failure to find a constant substring means that there is no need to make
527 an expensive call to REx engine, thus we celebrate a failure. Similarly,
528 finding a substring too deep into the string means that less calls to
30944b6d
IZ
529 regtry() should be needed.
530
531 REx compiler's optimizer found 4 possible hints:
532 a) Anchored substring;
533 b) Fixed substring;
534 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 535 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 536 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
537 string which does not contradict any of them.
538 */
2c2d71f5 539
6eb5f6b9
JH
540/* Most of decisions we do here should have been done at compile time.
541 The nodes of the REx which we used for the search should have been
542 deleted from the finite automaton. */
543
cad2e5aa 544char *
288b8c02 545Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 546 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 547{
97aff369 548 dVAR;
288b8c02 549 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 550 register I32 start_shift = 0;
cad2e5aa 551 /* Should be nonnegative! */
b7953727 552 register I32 end_shift = 0;
2c2d71f5
JH
553 register char *s;
554 register SV *check;
a1933d95 555 char *strbeg;
cad2e5aa 556 char *t;
f2ed9b32 557 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 558 I32 ml_anch;
bd61b366
SS
559 register char *other_last = NULL; /* other substr checked before this */
560 char *check_at = NULL; /* check substr found at this pos */
d8080198 561 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 562 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 563 RXi_GET_DECL(prog,progi);
30944b6d 564#ifdef DEBUGGING
890ce7af 565 const char * const i_strpos = strpos;
30944b6d 566#endif
a3621e74
YO
567 GET_RE_DEBUG_FLAGS_DECL;
568
7918f24d 569 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
570 PERL_UNUSED_ARG(flags);
571 PERL_UNUSED_ARG(data);
7918f24d 572
f2ed9b32 573 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 574
3c8556c3 575 if (RX_UTF8(rx)) {
b8d68ded
JH
576 PL_reg_flags |= RF_utf8;
577 }
ab3bbdeb 578 DEBUG_EXECUTE_r(
f2ed9b32 579 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
580 sv ? "Guessing start of match in sv for"
581 : "Guessing start of match in string for");
2a782b5b 582 );
cad2e5aa 583
c344f387
JH
584 /* CHR_DIST() would be more correct here but it makes things slow. */
585 if (prog->minlen > strend - strpos) {
a3621e74 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 587 "String too short... [re_intuit_start]\n"));
cad2e5aa 588 goto fail;
2c2d71f5 589 }
1de06328 590
a1933d95 591 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 592 PL_regeol = strend;
f2ed9b32 593 if (utf8_target) {
33b8afdf
JH
594 if (!prog->check_utf8 && prog->check_substr)
595 to_utf8_substr(prog);
596 check = prog->check_utf8;
597 } else {
598 if (!prog->check_substr && prog->check_utf8)
599 to_byte_substr(prog);
600 check = prog->check_substr;
601 }
1de06328 602 if (check == &PL_sv_undef) {
a3621e74 603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 604 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
605 goto fail;
606 }
bbe252da
YO
607 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
608 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
609 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 610 && !multiline ) ); /* Check after \n? */
cad2e5aa 611
7e25d62c 612 if (!ml_anch) {
bbe252da
YO
613 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
614 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 615 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
616 && sv && !SvROK(sv)
617 && (strpos != strbeg)) {
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
619 goto fail;
620 }
621 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 622 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 623 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
624 I32 slen;
625
1aa99e6b 626 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 627
653099ff
GS
628 if (SvTAIL(check)) {
629 slen = SvCUR(check); /* >= 1 */
cad2e5aa 630
9041c2e3 631 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 632 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 634 goto fail_finish;
cad2e5aa
JH
635 }
636 /* Now should match s[0..slen-2] */
637 slen--;
3f7c398e 638 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 639 || (slen > 1
3f7c398e 640 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 641 report_neq:
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
643 goto fail_finish;
644 }
cad2e5aa 645 }
3f7c398e 646 else if (*SvPVX_const(check) != *s
653099ff 647 || ((slen = SvCUR(check)) > 1
3f7c398e 648 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 649 goto report_neq;
c315bfe8 650 check_at = s;
2c2d71f5 651 goto success_at_start;
7e25d62c 652 }
cad2e5aa 653 }
2c2d71f5 654 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 655 s = strpos;
2c2d71f5 656 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
657 end_shift = prog->check_end_shift;
658
2c2d71f5 659 if (!ml_anch) {
a3b680e6 660 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 661 - (SvTAIL(check) != 0);
a3b680e6 662 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
663
664 if (end_shift < eshift)
665 end_shift = eshift;
666 }
cad2e5aa 667 }
2c2d71f5 668 else { /* Can match at random position */
cad2e5aa
JH
669 ml_anch = 0;
670 s = strpos;
1de06328
YO
671 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
672 end_shift = prog->check_end_shift;
673
674 /* end shift should be non negative here */
cad2e5aa
JH
675 }
676
bcdf7404 677#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 678 if (end_shift < 0)
1de06328 679 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 680 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
681#endif
682
2c2d71f5
JH
683 restart:
684 /* Find a possible match in the region s..strend by looking for
685 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
686
687 {
688 I32 srch_start_shift = start_shift;
689 I32 srch_end_shift = end_shift;
c33e64f0
FC
690 U8* start_point;
691 U8* end_point;
1de06328
YO
692 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
693 srch_end_shift -= ((strbeg - s) - srch_start_shift);
694 srch_start_shift = strbeg - s;
695 }
6bda09f9 696 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
697 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
698 (IV)prog->check_offset_min,
699 (IV)srch_start_shift,
700 (IV)srch_end_shift,
701 (IV)prog->check_end_shift);
702 });
703
bbe252da 704 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
705 start_point= (U8*)(s + srch_start_shift);
706 end_point= (U8*)(strend - srch_end_shift);
707 } else {
708 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
709 end_point= HOP3(strend, -srch_end_shift, strbeg);
710 }
6bda09f9 711 DEBUG_OPTIMISE_MORE_r({
56570a2c 712 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 713 (int)(end_point - start_point),
fc8cd66c 714 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
715 start_point);
716 });
717
718 s = fbm_instr( start_point, end_point,
7fba1cd6 719 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 720 }
cad2e5aa
JH
721 /* Update the count-of-usability, remove useless subpatterns,
722 unshift s. */
2c2d71f5 723
ab3bbdeb 724 DEBUG_EXECUTE_r({
f2ed9b32 725 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
726 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
727 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 728 (s ? "Found" : "Did not find"),
f2ed9b32 729 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
730 ? "anchored" : "floating"),
731 quoted,
732 RE_SV_TAIL(check),
733 (s ? " at offset " : "...\n") );
734 });
2c2d71f5
JH
735
736 if (!s)
737 goto fail_finish;
2c2d71f5 738 /* Finish the diagnostic message */
a3621e74 739 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 740
1de06328
YO
741 /* XXX dmq: first branch is for positive lookbehind...
742 Our check string is offset from the beginning of the pattern.
743 So we need to do any stclass tests offset forward from that
744 point. I think. :-(
745 */
746
747
748
749 check_at=s;
750
751
2c2d71f5
JH
752 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
753 Start with the other substr.
754 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 755 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
756 *always* match. Probably should be marked during compile...
757 Probably it is right to do no SCREAM here...
758 */
759
f2ed9b32 760 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
761 : (prog->float_substr && prog->anchored_substr))
762 {
30944b6d 763 /* Take into account the "other" substring. */
2c2d71f5
JH
764 /* XXXX May be hopelessly wrong for UTF... */
765 if (!other_last)
6eb5f6b9 766 other_last = strpos;
f2ed9b32 767 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
768 do_other_anchored:
769 {
890ce7af
AL
770 char * const last = HOP3c(s, -start_shift, strbeg);
771 char *last1, *last2;
be8e71aa 772 char * const saved_s = s;
33b8afdf 773 SV* must;
2c2d71f5 774
2c2d71f5
JH
775 t = s - prog->check_offset_max;
776 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 777 && (!utf8_target
0ce71af7 778 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 779 && t > strpos)))
6f207bd3 780 NOOP;
2c2d71f5
JH
781 else
782 t = strpos;
1aa99e6b 783 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
784 if (t < other_last) /* These positions already checked */
785 t = other_last;
1aa99e6b 786 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
787 if (last < last1)
788 last1 = last;
1de06328
YO
789 /* XXXX It is not documented what units *_offsets are in.
790 We assume bytes, but this is clearly wrong.
791 Meaning this code needs to be carefully reviewed for errors.
792 dmq.
793 */
794
2c2d71f5 795 /* On end-of-str: see comment below. */
f2ed9b32 796 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
797 if (must == &PL_sv_undef) {
798 s = (char*)NULL;
1de06328 799 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
800 }
801 else
802 s = fbm_instr(
803 (unsigned char*)t,
804 HOP3(HOP3(last1, prog->anchored_offset, strend)
805 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
806 must,
7fba1cd6 807 multiline ? FBMrf_MULTILINE : 0
33b8afdf 808 );
ab3bbdeb 809 DEBUG_EXECUTE_r({
f2ed9b32 810 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
811 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
812 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 813 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
814 quoted, RE_SV_TAIL(must));
815 });
816
817
2c2d71f5
JH
818 if (!s) {
819 if (last1 >= last2) {
a3621e74 820 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
821 ", giving up...\n"));
822 goto fail_finish;
823 }
a3621e74 824 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 825 ", trying floating at offset %ld...\n",
be8e71aa 826 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
827 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
828 s = HOP3c(last, 1, strend);
2c2d71f5
JH
829 goto restart;
830 }
831 else {
a3621e74 832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 833 (long)(s - i_strpos)));
1aa99e6b
IH
834 t = HOP3c(s, -prog->anchored_offset, strbeg);
835 other_last = HOP3c(s, 1, strend);
be8e71aa 836 s = saved_s;
2c2d71f5
JH
837 if (t == strpos)
838 goto try_at_start;
2c2d71f5
JH
839 goto try_at_offset;
840 }
30944b6d 841 }
2c2d71f5
JH
842 }
843 else { /* Take into account the floating substring. */
33b8afdf 844 char *last, *last1;
be8e71aa 845 char * const saved_s = s;
33b8afdf
JH
846 SV* must;
847
848 t = HOP3c(s, -start_shift, strbeg);
849 last1 = last =
850 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
851 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
852 last = HOP3c(t, prog->float_max_offset, strend);
853 s = HOP3c(t, prog->float_min_offset, strend);
854 if (s < other_last)
855 s = other_last;
2c2d71f5 856 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 857 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
858 /* fbm_instr() takes into account exact value of end-of-str
859 if the check is SvTAIL(ed). Since false positives are OK,
860 and end-of-str is not later than strend we are OK. */
861 if (must == &PL_sv_undef) {
862 s = (char*)NULL;
1de06328 863 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
864 }
865 else
2c2d71f5 866 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
867 (unsigned char*)last + SvCUR(must)
868 - (SvTAIL(must)!=0),
7fba1cd6 869 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 870 DEBUG_EXECUTE_r({
f2ed9b32 871 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
872 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
873 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 874 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
875 quoted, RE_SV_TAIL(must));
876 });
33b8afdf
JH
877 if (!s) {
878 if (last1 == last) {
a3621e74 879 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
880 ", giving up...\n"));
881 goto fail_finish;
2c2d71f5 882 }
a3621e74 883 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 884 ", trying anchored starting at offset %ld...\n",
be8e71aa 885 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
886 other_last = last;
887 s = HOP3c(t, 1, strend);
888 goto restart;
889 }
890 else {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
892 (long)(s - i_strpos)));
893 other_last = s; /* Fix this later. --Hugo */
be8e71aa 894 s = saved_s;
33b8afdf
JH
895 if (t == strpos)
896 goto try_at_start;
897 goto try_at_offset;
898 }
2c2d71f5 899 }
cad2e5aa 900 }
2c2d71f5 901
1de06328 902
9ef43ace 903 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 904
6bda09f9 905 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
906 PerlIO_printf(Perl_debug_log,
907 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
908 (IV)prog->check_offset_min,
909 (IV)prog->check_offset_max,
910 (IV)(s-strpos),
911 (IV)(t-strpos),
912 (IV)(t-s),
913 (IV)(strend-strpos)
914 )
915 );
916
2c2d71f5 917 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 918 && (!utf8_target
9ef43ace 919 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
920 && t > strpos)))
921 {
2c2d71f5
JH
922 /* Fixed substring is found far enough so that the match
923 cannot start at strpos. */
924 try_at_offset:
cad2e5aa 925 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
926 /* Eventually fbm_*() should handle this, but often
927 anchored_offset is not 0, so this check will not be wasted. */
928 /* XXXX In the code below we prefer to look for "^" even in
929 presence of anchored substrings. And we search even
930 beyond the found float position. These pessimizations
931 are historical artefacts only. */
932 find_anchor:
2c2d71f5 933 while (t < strend - prog->minlen) {
cad2e5aa 934 if (*t == '\n') {
4ee3650e 935 if (t < check_at - prog->check_offset_min) {
f2ed9b32 936 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
937 /* Since we moved from the found position,
938 we definitely contradict the found anchored
30944b6d
IZ
939 substr. Due to the above check we do not
940 contradict "check" substr.
941 Thus we can arrive here only if check substr
942 is float. Redo checking for "other"=="fixed".
943 */
9041c2e3 944 strpos = t + 1;
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 946 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
947 goto do_other_anchored;
948 }
4ee3650e
GS
949 /* We don't contradict the found floating substring. */
950 /* XXXX Why not check for STCLASS? */
cad2e5aa 951 s = t + 1;
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 953 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
954 goto set_useful;
955 }
4ee3650e
GS
956 /* Position contradicts check-string */
957 /* XXXX probably better to look for check-string
958 than for "\n", so one should lower the limit for t? */
a3621e74 959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 960 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 961 other_last = strpos = s = t + 1;
cad2e5aa
JH
962 goto restart;
963 }
964 t++;
965 }
a3621e74 966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 967 PL_colors[0], PL_colors[1]));
2c2d71f5 968 goto fail_finish;
cad2e5aa 969 }
f5952150 970 else {
a3621e74 971 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 972 PL_colors[0], PL_colors[1]));
f5952150 973 }
cad2e5aa
JH
974 s = t;
975 set_useful:
f2ed9b32 976 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
977 }
978 else {
f5952150 979 /* The found string does not prohibit matching at strpos,
2c2d71f5 980 - no optimization of calling REx engine can be performed,
f5952150
GS
981 unless it was an MBOL and we are not after MBOL,
982 or a future STCLASS check will fail this. */
2c2d71f5
JH
983 try_at_start:
984 /* Even in this situation we may use MBOL flag if strpos is offset
985 wrt the start of the string. */
05b4157f 986 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 987 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 988 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 989 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 990 {
cad2e5aa
JH
991 t = strpos;
992 goto find_anchor;
993 }
a3621e74 994 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 995 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 996 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 997 );
2c2d71f5 998 success_at_start:
bbe252da 999 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1000 && (utf8_target ? (
33b8afdf
JH
1001 prog->check_utf8 /* Could be deleted already */
1002 && --BmUSEFUL(prog->check_utf8) < 0
1003 && (prog->check_utf8 == prog->float_utf8)
1004 ) : (
1005 prog->check_substr /* Could be deleted already */
1006 && --BmUSEFUL(prog->check_substr) < 0
1007 && (prog->check_substr == prog->float_substr)
1008 )))
66e933ab 1009 {
cad2e5aa 1010 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1011 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1012 /* XXX Does the destruction order has to change with utf8_target? */
1013 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1014 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1015 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1016 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1017 check = NULL; /* abort */
cad2e5aa 1018 s = strpos;
486ec47a 1019 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1020 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1021 if (prog->intflags & PREGf_IMPLICIT)
1022 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1023 /* XXXX This is a remnant of the old implementation. It
1024 looks wasteful, since now INTUIT can use many
6eb5f6b9 1025 other heuristics. */
bbe252da 1026 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1027 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1028 }
1029 else
1030 s = strpos;
1031 }
1032
6eb5f6b9
JH
1033 /* Last resort... */
1034 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1035 /* trie stclasses are too expensive to use here, we are better off to
1036 leave it to regmatch itself */
f8fc2ecf 1037 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1038 /* minlen == 0 is possible if regstclass is \b or \B,
1039 and the fixed substr is ''$.
1040 Since minlen is already taken into account, s+1 is before strend;
1041 accidentally, minlen >= 1 guaranties no false positives at s + 1
1042 even for \b or \B. But (minlen? 1 : 0) below assumes that
1043 regstclass does not come from lookahead... */
1044 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1045 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1046 const U8* const str = (U8*)STRING(progi->regstclass);
1047 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1048 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1049 : 1);
1de06328
YO
1050 char * endpos;
1051 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1052 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1053 else if (prog->float_substr || prog->float_utf8)
1054 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1055 else
1056 endpos= strend;
1057
d8080198
YO
1058 if (checked_upto < s)
1059 checked_upto = s;
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1061 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1062
6eb5f6b9 1063 t = s;
d8080198
YO
1064 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1065 if (s) {
1066 checked_upto = s;
1067 } else {
6eb5f6b9 1068#ifdef DEBUGGING
cbbf8932 1069 const char *what = NULL;
6eb5f6b9
JH
1070#endif
1071 if (endpos == strend) {
a3621e74 1072 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1073 "Could not match STCLASS...\n") );
1074 goto fail;
1075 }
a3621e74 1076 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1077 "This position contradicts STCLASS...\n") );
bbe252da 1078 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1079 goto fail;
d8080198
YO
1080 checked_upto = HOPBACKc(endpos, start_shift);
1081 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1082 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1083 /* Contradict one of substrings */
33b8afdf 1084 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1085 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1086 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1087 hop_and_restart:
1aa99e6b 1088 s = HOP3c(t, 1, strend);
66e933ab
GS
1089 if (s + start_shift + end_shift > strend) {
1090 /* XXXX Should be taken into account earlier? */
a3621e74 1091 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1092 "Could not match STCLASS...\n") );
1093 goto fail;
1094 }
5e39e1e5
HS
1095 if (!check)
1096 goto giveup;
a3621e74 1097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1098 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1099 what, (long)(s + start_shift - i_strpos)) );
1100 goto restart;
1101 }
66e933ab 1102 /* Have both, check_string is floating */
6eb5f6b9
JH
1103 if (t + start_shift >= check_at) /* Contradicts floating=check */
1104 goto retry_floating_check;
1105 /* Recheck anchored substring, but not floating... */
9041c2e3 1106 s = check_at;
5e39e1e5
HS
1107 if (!check)
1108 goto giveup;
a3621e74 1109 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1110 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1111 (long)(other_last - i_strpos)) );
1112 goto do_other_anchored;
1113 }
60e71179
GS
1114 /* Another way we could have checked stclass at the
1115 current position only: */
1116 if (ml_anch) {
1117 s = t = t + 1;
5e39e1e5
HS
1118 if (!check)
1119 goto giveup;
a3621e74 1120 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1121 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1122 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1123 goto try_at_offset;
66e933ab 1124 }
f2ed9b32 1125 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1126 goto fail;
486ec47a 1127 /* Check is floating substring. */
6eb5f6b9
JH
1128 retry_floating_check:
1129 t = check_at - start_shift;
a3621e74 1130 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1131 goto hop_and_restart;
1132 }
b7953727 1133 if (t != s) {
a3621e74 1134 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1135 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1136 (long)(t - i_strpos), (long)(s - i_strpos))
1137 );
1138 }
1139 else {
a3621e74 1140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1141 "Does not contradict STCLASS...\n");
1142 );
1143 }
6eb5f6b9 1144 }
5e39e1e5 1145 giveup:
a3621e74 1146 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1147 PL_colors[4], (check ? "Guessed" : "Giving up"),
1148 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1149 return s;
2c2d71f5
JH
1150
1151 fail_finish: /* Substring not found */
33b8afdf 1152 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1153 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1154 fail:
a3621e74 1155 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1156 PL_colors[4], PL_colors[5]));
bd61b366 1157 return NULL;
cad2e5aa 1158}
9661b544 1159
a0a388a1
YO
1160#define DECL_TRIE_TYPE(scan) \
1161 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1162 trie_type = ((scan->flags == EXACT) \
1163 ? (utf8_target ? trie_utf8 : trie_plain) \
1164 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1165
1166#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1167uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1168 STRLEN skiplen; \
1169 switch (trie_type) { \
1170 case trie_utf8_fold: \
1171 if ( foldlen>0 ) { \
1172 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1173 foldlen -= len; \
1174 uscan += len; \
1175 len=0; \
1176 } else { \
1177 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1178 len = UTF8SKIP(uc); \
1179 skiplen = UNISKIP( uvc ); \
1180 foldlen -= skiplen; \
1181 uscan = foldbuf + skiplen; \
1182 } \
1183 break; \
1184 case trie_latin_utf8_fold: \
1185 if ( foldlen>0 ) { \
1186 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1187 foldlen -= len; \
1188 uscan += len; \
1189 len=0; \
1190 } else { \
1191 len = 1; \
1192 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1193 skiplen = UNISKIP( uvc ); \
1194 foldlen -= skiplen; \
1195 uscan = foldbuf + skiplen; \
1196 } \
1197 break; \
1198 case trie_utf8: \
1199 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1200 break; \
1201 case trie_plain: \
1202 uvc = (UV)*uc; \
1203 len = 1; \
1204 } \
1205 if (uvc < 256) { \
1206 charid = trie->charmap[ uvc ]; \
1207 } \
1208 else { \
1209 charid = 0; \
1210 if (widecharmap) { \
1211 SV** const svpp = hv_fetch(widecharmap, \
1212 (char*)&uvc, sizeof(UV), 0); \
1213 if (svpp) \
1214 charid = (U16)SvIV(*svpp); \
1215 } \
1216 } \
4cadc6a9
YO
1217} STMT_END
1218
4cadc6a9
YO
1219#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1220STMT_START { \
1221 while (s <= e) { \
1222 if ( (CoNd) \
fac1af77 1223 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1224 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1225 goto got_it; \
1226 s++; \
1227 } \
1228} STMT_END
1229
1230#define REXEC_FBC_UTF8_SCAN(CoDe) \
1231STMT_START { \
1232 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1233 CoDe \
1234 s += uskip; \
1235 } \
1236} STMT_END
1237
1238#define REXEC_FBC_SCAN(CoDe) \
1239STMT_START { \
1240 while (s < strend) { \
1241 CoDe \
1242 s++; \
1243 } \
1244} STMT_END
1245
1246#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1247REXEC_FBC_UTF8_SCAN( \
1248 if (CoNd) { \
24b23f37 1249 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1250 goto got_it; \
1251 else \
1252 tmp = doevery; \
1253 } \
1254 else \
1255 tmp = 1; \
1256)
1257
1258#define REXEC_FBC_CLASS_SCAN(CoNd) \
1259REXEC_FBC_SCAN( \
1260 if (CoNd) { \
24b23f37 1261 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1262 goto got_it; \
1263 else \
1264 tmp = doevery; \
1265 } \
1266 else \
1267 tmp = 1; \
1268)
1269
1270#define REXEC_FBC_TRYIT \
24b23f37 1271if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1272 goto got_it
1273
e1d1eefb 1274#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1275 if (utf8_target) { \
e1d1eefb
YO
1276 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1277 } \
1278 else { \
1279 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1280 }
e1d1eefb 1281
4cadc6a9 1282#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1283 if (utf8_target) { \
4cadc6a9
YO
1284 UtFpReLoAd; \
1285 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1286 } \
1287 else { \
1288 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1289 }
4cadc6a9
YO
1290
1291#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1292 PL_reg_flags |= RF_tainted; \
f2ed9b32 1293 if (utf8_target) { \
4cadc6a9
YO
1294 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1295 } \
1296 else { \
1297 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1298 }
4cadc6a9 1299
786e8c11
YO
1300#define DUMP_EXEC_POS(li,s,doutf8) \
1301 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1302
cfaf538b
KW
1303
1304#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1305 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1306 tmp = TEST_NON_UTF8(tmp); \
1307 REXEC_FBC_UTF8_SCAN( \
1308 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1309 tmp = !tmp; \
1310 IF_SUCCESS; \
1311 } \
1312 else { \
1313 IF_FAIL; \
1314 } \
1315 ); \
1316
1317#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1318 if (s == PL_bostr) { \
1319 tmp = '\n'; \
1320 } \
1321 else { \
1322 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1323 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1324 } \
1325 tmp = TeSt1_UtF8; \
1326 LOAD_UTF8_CHARCLASS_ALNUM(); \
1327 REXEC_FBC_UTF8_SCAN( \
1328 if (tmp == ! (TeSt2_UtF8)) { \
1329 tmp = !tmp; \
1330 IF_SUCCESS; \
1331 } \
1332 else { \
1333 IF_FAIL; \
1334 } \
1335 ); \
1336
63ac0dad
KW
1337/* The only difference between the BOUND and NBOUND cases is that
1338 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1339 * NBOUND. This is accomplished by passing it in either the if or else clause,
1340 * with the other one being empty */
1341#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1342 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1343
1344#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1345 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1346
1347#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1348 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1349
1350#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1351 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1352
63ac0dad
KW
1353
1354/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1355 * be passed in completely with the variable name being tested, which isn't
1356 * such a clean interface, but this is easier to read than it was before. We
1357 * are looking for the boundary (or non-boundary between a word and non-word
1358 * character. The utf8 and non-utf8 cases have the same logic, but the details
1359 * must be different. Find the "wordness" of the character just prior to this
1360 * one, and compare it with the wordness of this one. If they differ, we have
1361 * a boundary. At the beginning of the string, pretend that the previous
1362 * character was a new-line */
cfaf538b 1363#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1364 if (utf8_target) { \
cfaf538b 1365 UTF8_CODE \
63ac0dad
KW
1366 } \
1367 else { /* Not utf8 */ \
1368 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1369 tmp = TEST_NON_UTF8(tmp); \
1370 REXEC_FBC_SCAN( \
1371 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1372 tmp = !tmp; \
1373 IF_SUCCESS; \
1374 } \
1375 else { \
1376 IF_FAIL; \
1377 } \
1378 ); \
1379 } \
1380 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1381 goto got_it;
1382
786e8c11
YO
1383/* We know what class REx starts with. Try to find this position... */
1384/* if reginfo is NULL, its a dryrun */
1385/* annoyingly all the vars in this routine have different names from their counterparts
1386 in regmatch. /grrr */
1387
3c3eec57 1388STATIC char *
07be1b83 1389S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1390 const char *strend, regmatch_info *reginfo)
a687059c 1391{
27da23d5 1392 dVAR;
bbe252da 1393 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1394 char *pat_string; /* The pattern's exactish string */
1395 char *pat_end; /* ptr to end char of pat_string */
1396 re_fold_t folder; /* Function for computing non-utf8 folds */
1397 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1398 STRLEN ln;
5dab1207 1399 STRLEN lnc;
078c425b 1400 register STRLEN uskip;
fac1af77
KW
1401 U8 c1;
1402 U8 c2;
6eb5f6b9
JH
1403 char *e;
1404 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1405 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1406 UV utf8_fold_flags = 0;
f8fc2ecf 1407 RXi_GET_DECL(prog,progi);
7918f24d
NC
1408
1409 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1410
6eb5f6b9
JH
1411 /* We know what class it must start with. */
1412 switch (OP(c)) {
f56b6394 1413 case ANYOFV:
6eb5f6b9 1414 case ANYOF:
f56b6394 1415 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1416 STRLEN inclasslen = strend - s;
1417 REXEC_FBC_UTF8_CLASS_SCAN(
1418 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1419 }
1420 else {
6ef69d56 1421 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1422 }
6eb5f6b9 1423 break;
f33976b4 1424 case CANY:
4cadc6a9 1425 REXEC_FBC_SCAN(
24b23f37 1426 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1427 goto got_it;
1428 else
1429 tmp = doevery;
4cadc6a9 1430 );
f33976b4 1431 break;
2f7f8cb1
KW
1432
1433 case EXACTFA:
1434 if (UTF_PATTERN || utf8_target) {
1435 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1436 goto do_exactf_utf8;
1437 }
1438 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1439 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1440 goto do_exactf_non_utf8; /* isn't dealt with by these */
1441
6eb5f6b9 1442 case EXACTF:
62bf7766 1443 if (utf8_target) {
77a6d856
KW
1444
1445 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1446 utf8_fold_flags = 0;
fac1af77
KW
1447 goto do_exactf_utf8;
1448 }
1449 fold_array = PL_fold;
1450 folder = foldEQ;
1451 goto do_exactf_non_utf8;
1452
1453 case EXACTFL:
1454 if (UTF_PATTERN || utf8_target) {
17580e7a 1455 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1456 goto do_exactf_utf8;
1457 }
1458 fold_array = PL_fold_locale;
1459 folder = foldEQ_locale;
16d951b7
KW
1460 goto do_exactf_non_utf8;
1461
3c760661
KW
1462 case EXACTFU_SS:
1463 if (UTF_PATTERN) {
1464 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1465 }
1466 goto do_exactf_utf8;
1467
fab2782b 1468 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1469 case EXACTFU:
1470 if (UTF_PATTERN || utf8_target) {
77a6d856 1471 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1472 goto do_exactf_utf8;
1473 }
1474
1475 /* Any 'ss' in the pattern should have been replaced by regcomp,
1476 * so we don't have to worry here about this single special case
1477 * in the Latin1 range */
1478 fold_array = PL_fold_latin1;
1479 folder = foldEQ_latin1;
fac1af77
KW
1480
1481 /* FALL THROUGH */
1482
62bf7766
KW
1483 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1484 are no glitches with fold-length differences
1485 between the target string and pattern */
fac1af77
KW
1486
1487 /* The idea in the non-utf8 EXACTF* cases is to first find the
1488 * first character of the EXACTF* node and then, if necessary,
1489 * case-insensitively compare the full text of the node. c1 is the
1490 * first character. c2 is its fold. This logic will not work for
1491 * Unicode semantics and the german sharp ss, which hence should
1492 * not be compiled into a node that gets here. */
1493 pat_string = STRING(c);
1494 ln = STR_LEN(c); /* length to match in octets/bytes */
1495
8a90a8fe
KW
1496 /* We know that we have to match at least 'ln' bytes (which is the
1497 * same as characters, since not utf8). If we have to match 3
1498 * characters, and there are only 2 availabe, we know without
1499 * trying that it will fail; so don't start a match past the
1500 * required minimum number from the far end */
fac1af77
KW
1501 e = HOP3c(strend, -((I32)ln), s);
1502
1503 if (!reginfo && e < s) {
1504 e = s; /* Due to minlen logic of intuit() */
1505 }
1506
1507 c1 = *pat_string;
1508 c2 = fold_array[c1];
1509 if (c1 == c2) { /* If char and fold are the same */
1510 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1511 }
1512 else {
1513 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1514 }
1515 break;
1516
1517 do_exactf_utf8:
e067297c
KW
1518 {
1519 unsigned expansion;
1520
fac1af77
KW
1521
1522 /* If one of the operands is in utf8, we can't use the simpler
1523 * folding above, due to the fact that many different characters
1524 * can have the same fold, or portion of a fold, or different-
1525 * length fold */
1526 pat_string = STRING(c);
1527 ln = STR_LEN(c); /* length to match in octets/bytes */
1528 pat_end = pat_string + ln;
1529 lnc = (UTF_PATTERN) /* length to match in characters */
1530 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1531 : ln;
1532
e067297c
KW
1533 /* We have 'lnc' characters to match in the pattern, but because of
1534 * multi-character folding, each character in the target can match
1535 * up to 3 characters (Unicode guarantees it will never exceed
1536 * this) if it is utf8-encoded; and up to 2 if not (based on the
1537 * fact that the Latin 1 folds are already determined, and the
1538 * only multi-char fold in that range is the sharp-s folding to
1539 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1540 * string character. Adjust lnc accordingly, rounding up, so that
1541 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1542 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1543 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1544
1545 /* As in the non-UTF8 case, if we have to match 3 characters, and
1546 * only 2 are left, it's guaranteed to fail, so don't start a
1547 * match that would require us to go beyond the end of the string
1548 */
1549 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1550
1551 if (!reginfo && e < s) {
1552 e = s; /* Due to minlen logic of intuit() */
1553 }
1554
b33105db
KW
1555 /* XXX Note that we could recalculate e to stop the loop earlier,
1556 * as the worst case expansion above will rarely be met, and as we
1557 * go along we would usually find that e moves further to the left.
1558 * This would happen only after we reached the point in the loop
1559 * where if there were no expansion we should fail. Unclear if
1560 * worth the expense */
e067297c 1561
fac1af77
KW
1562 while (s <= e) {
1563 char *my_strend= (char *)strend;
1564 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1565 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1566 && (!reginfo || regtry(reginfo, &s)) )
1567 {
1568 goto got_it;
1569 }
bbdd8bad 1570 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1571 }
1572 break;
e067297c 1573 }
bbce6d69 1574 case BOUNDL:
3280af22 1575 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1576 FBC_BOUND(isALNUM_LC,
1577 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1578 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1579 break;
bbce6d69 1580 case NBOUNDL:
3280af22 1581 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1582 FBC_NBOUND(isALNUM_LC,
1583 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1584 isALNUM_LC_utf8((U8*)s));
1585 break;
1586 case BOUND:
1587 FBC_BOUND(isWORDCHAR,
1588 isALNUM_uni(tmp),
1589 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1590 break;
cfaf538b
KW
1591 case BOUNDA:
1592 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1593 isWORDCHAR_A(tmp),
1594 isWORDCHAR_A((U8*)s));
1595 break;
a0d0e21e 1596 case NBOUND:
63ac0dad
KW
1597 FBC_NBOUND(isWORDCHAR,
1598 isALNUM_uni(tmp),
1599 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1600 break;
cfaf538b
KW
1601 case NBOUNDA:
1602 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1603 isWORDCHAR_A(tmp),
1604 isWORDCHAR_A((U8*)s));
1605 break;
63ac0dad
KW
1606 case BOUNDU:
1607 FBC_BOUND(isWORDCHAR_L1,
1608 isALNUM_uni(tmp),
1609 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1610 break;
1611 case NBOUNDU:
1612 FBC_NBOUND(isWORDCHAR_L1,
1613 isALNUM_uni(tmp),
1614 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1615 break;
bbce6d69 1616 case ALNUML:
4cadc6a9
YO
1617 REXEC_FBC_CSCAN_TAINT(
1618 isALNUM_LC_utf8((U8*)s),
1619 isALNUM_LC(*s)
1620 );
6895a8aa 1621 break;
980866de
KW
1622 case ALNUMU:
1623 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1624 LOAD_UTF8_CHARCLASS_ALNUM(),
1625 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1626 isWORDCHAR_L1((U8) *s)
1627 );
6895a8aa 1628 break;
980866de
KW
1629 case ALNUM:
1630 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1631 LOAD_UTF8_CHARCLASS_ALNUM(),
1632 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1633 isWORDCHAR((U8) *s)
1634 );
6895a8aa 1635 break;
cfaf538b 1636 case ALNUMA:
8e9da4d4
KW
1637 /* Don't need to worry about utf8, as it can match only a single
1638 * byte invariant character */
cfaf538b 1639 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1640 break;
980866de
KW
1641 case NALNUMU:
1642 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1643 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1644 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1645 ! isWORDCHAR_L1((U8) *s)
1646 );
6895a8aa 1647 break;
a0d0e21e 1648 case NALNUM:
4cadc6a9 1649 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1650 LOAD_UTF8_CHARCLASS_ALNUM(),
1651 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1652 ! isALNUM(*s)
4cadc6a9 1653 );
6895a8aa 1654 break;
cfaf538b 1655 case NALNUMA:
8e9da4d4
KW
1656 REXEC_FBC_CSCAN(
1657 !isWORDCHAR_A(*s),
1658 !isWORDCHAR_A(*s)
1659 );
1660 break;
bbce6d69 1661 case NALNUML:
4cadc6a9
YO
1662 REXEC_FBC_CSCAN_TAINT(
1663 !isALNUM_LC_utf8((U8*)s),
1664 !isALNUM_LC(*s)
1665 );
6895a8aa 1666 break;
980866de
KW
1667 case SPACEU:
1668 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1669 LOAD_UTF8_CHARCLASS_SPACE(),
1670 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1671 isSPACE_L1((U8) *s)
1672 );
6895a8aa 1673 break;
a0d0e21e 1674 case SPACE:
4cadc6a9 1675 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1676 LOAD_UTF8_CHARCLASS_SPACE(),
1677 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1678 isSPACE((U8) *s)
4cadc6a9 1679 );
6895a8aa 1680 break;
cfaf538b 1681 case SPACEA:
8e9da4d4
KW
1682 /* Don't need to worry about utf8, as it can match only a single
1683 * byte invariant character */
cfaf538b 1684 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1685 break;
bbce6d69 1686 case SPACEL:
4cadc6a9 1687 REXEC_FBC_CSCAN_TAINT(
6bbba904 1688 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1689 isSPACE_LC(*s)
1690 );
6895a8aa 1691 break;
980866de
KW
1692 case NSPACEU:
1693 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1694 LOAD_UTF8_CHARCLASS_SPACE(),
1695 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1696 ! isSPACE_L1((U8) *s)
1697 );
6895a8aa 1698 break;
a0d0e21e 1699 case NSPACE:
4cadc6a9 1700 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1701 LOAD_UTF8_CHARCLASS_SPACE(),
1702 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1703 ! isSPACE((U8) *s)
4cadc6a9 1704 );
6895a8aa 1705 break;
cfaf538b 1706 case NSPACEA:
8e9da4d4
KW
1707 REXEC_FBC_CSCAN(
1708 !isSPACE_A(*s),
1709 !isSPACE_A(*s)
1710 );
1711 break;
bbce6d69 1712 case NSPACEL:
4cadc6a9 1713 REXEC_FBC_CSCAN_TAINT(
6bbba904 1714 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1715 !isSPACE_LC(*s)
1716 );
6895a8aa 1717 break;
a0d0e21e 1718 case DIGIT:
4cadc6a9 1719 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1720 LOAD_UTF8_CHARCLASS_DIGIT(),
1721 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1722 isDIGIT(*s)
1723 );
6895a8aa 1724 break;
cfaf538b 1725 case DIGITA:
8e9da4d4
KW
1726 /* Don't need to worry about utf8, as it can match only a single
1727 * byte invariant character */
cfaf538b 1728 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1729 break;
b8c5462f 1730 case DIGITL:
4cadc6a9
YO
1731 REXEC_FBC_CSCAN_TAINT(
1732 isDIGIT_LC_utf8((U8*)s),
1733 isDIGIT_LC(*s)
1734 );
6895a8aa 1735 break;
a0d0e21e 1736 case NDIGIT:
4cadc6a9 1737 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1738 LOAD_UTF8_CHARCLASS_DIGIT(),
1739 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1740 !isDIGIT(*s)
1741 );
6895a8aa 1742 break;
cfaf538b 1743 case NDIGITA:
8e9da4d4
KW
1744 REXEC_FBC_CSCAN(
1745 !isDIGIT_A(*s),
1746 !isDIGIT_A(*s)
1747 );
1748 break;
b8c5462f 1749 case NDIGITL:
4cadc6a9
YO
1750 REXEC_FBC_CSCAN_TAINT(
1751 !isDIGIT_LC_utf8((U8*)s),
1752 !isDIGIT_LC(*s)
1753 );
6895a8aa 1754 break;
e1d1eefb
YO
1755 case LNBREAK:
1756 REXEC_FBC_CSCAN(
1757 is_LNBREAK_utf8(s),
1758 is_LNBREAK_latin1(s)
1759 );
6895a8aa 1760 break;
e1d1eefb
YO
1761 case VERTWS:
1762 REXEC_FBC_CSCAN(
1763 is_VERTWS_utf8(s),
1764 is_VERTWS_latin1(s)
1765 );
6895a8aa 1766 break;
e1d1eefb
YO
1767 case NVERTWS:
1768 REXEC_FBC_CSCAN(
1769 !is_VERTWS_utf8(s),
1770 !is_VERTWS_latin1(s)
1771 );
6895a8aa 1772 break;
e1d1eefb
YO
1773 case HORIZWS:
1774 REXEC_FBC_CSCAN(
1775 is_HORIZWS_utf8(s),
1776 is_HORIZWS_latin1(s)
1777 );
6895a8aa 1778 break;
e1d1eefb
YO
1779 case NHORIZWS:
1780 REXEC_FBC_CSCAN(
1781 !is_HORIZWS_utf8(s),
1782 !is_HORIZWS_latin1(s)
1783 );
6895a8aa 1784 break;
1de06328
YO
1785 case AHOCORASICKC:
1786 case AHOCORASICK:
07be1b83 1787 {
a0a388a1 1788 DECL_TRIE_TYPE(c);
07be1b83
YO
1789 /* what trie are we using right now */
1790 reg_ac_data *aho
f8fc2ecf 1791 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1792 reg_trie_data *trie
1793 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1794 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1795
1796 const char *last_start = strend - trie->minlen;
6148ee25 1797#ifdef DEBUGGING
07be1b83 1798 const char *real_start = s;
6148ee25 1799#endif
07be1b83 1800 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1801 SV *sv_points;
1802 U8 **points; /* map of where we were in the input string
786e8c11 1803 when reading a given char. For ASCII this
be8e71aa 1804 is unnecessary overhead as the relationship
38a44b82
NC
1805 is always 1:1, but for Unicode, especially
1806 case folded Unicode this is not true. */
f9e705e8 1807 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1808 U8 *bitmap=NULL;
1809
07be1b83
YO
1810
1811 GET_RE_DEBUG_FLAGS_DECL;
1812
be8e71aa
YO
1813 /* We can't just allocate points here. We need to wrap it in
1814 * an SV so it gets freed properly if there is a croak while
1815 * running the match */
1816 ENTER;
1817 SAVETMPS;
1818 sv_points=newSV(maxlen * sizeof(U8 *));
1819 SvCUR_set(sv_points,
1820 maxlen * sizeof(U8 *));
1821 SvPOK_on(sv_points);
1822 sv_2mortal(sv_points);
1823 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1824 if ( trie_type != trie_utf8_fold
1825 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1826 {
786e8c11
YO
1827 if (trie->bitmap)
1828 bitmap=(U8*)trie->bitmap;
1829 else
1830 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1831 }
786e8c11
YO
1832 /* this is the Aho-Corasick algorithm modified a touch
1833 to include special handling for long "unknown char"
1834 sequences. The basic idea being that we use AC as long
1835 as we are dealing with a possible matching char, when
1836 we encounter an unknown char (and we have not encountered
1837 an accepting state) we scan forward until we find a legal
1838 starting char.
1839 AC matching is basically that of trie matching, except
1840 that when we encounter a failing transition, we fall back
1841 to the current states "fail state", and try the current char
1842 again, a process we repeat until we reach the root state,
1843 state 1, or a legal transition. If we fail on the root state
1844 then we can either terminate if we have reached an accepting
1845 state previously, or restart the entire process from the beginning
1846 if we have not.
1847
1848 */
07be1b83
YO
1849 while (s <= last_start) {
1850 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1851 U8 *uc = (U8*)s;
1852 U16 charid = 0;
1853 U32 base = 1;
1854 U32 state = 1;
1855 UV uvc = 0;
1856 STRLEN len = 0;
1857 STRLEN foldlen = 0;
1858 U8 *uscan = (U8*)NULL;
1859 U8 *leftmost = NULL;
786e8c11
YO
1860#ifdef DEBUGGING
1861 U32 accepted_word= 0;
1862#endif
07be1b83
YO
1863 U32 pointpos = 0;
1864
1865 while ( state && uc <= (U8*)strend ) {
1866 int failed=0;
786e8c11
YO
1867 U32 word = aho->states[ state ].wordnum;
1868
1de06328
YO
1869 if( state==1 ) {
1870 if ( bitmap ) {
1871 DEBUG_TRIE_EXECUTE_r(
1872 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1873 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1874 (char *)uc, utf8_target );
1de06328
YO
1875 PerlIO_printf( Perl_debug_log,
1876 " Scanning for legal start char...\n");
1877 }
d085b490
YO
1878 );
1879 if (utf8_target) {
1880 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1881 uc += UTF8SKIP(uc);
1882 }
1883 } else {
1884 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1885 uc++;
1886 }
1887 }
1de06328 1888 s= (char *)uc;
786e8c11 1889 }
786e8c11
YO
1890 if (uc >(U8*)last_start) break;
1891 }
1892
1893 if ( word ) {
2e64971a 1894 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1895 if (!leftmost || lpos < leftmost) {
1896 DEBUG_r(accepted_word=word);
07be1b83 1897 leftmost= lpos;
786e8c11 1898 }
07be1b83 1899 if (base==0) break;
786e8c11 1900
07be1b83
YO
1901 }
1902 points[pointpos++ % maxlen]= uc;
55eed653
NC
1903 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1904 uscan, len, uvc, charid, foldlen,
1905 foldbuf, uniflags);
786e8c11
YO
1906 DEBUG_TRIE_EXECUTE_r({
1907 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1908 s, utf8_target );
07be1b83 1909 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1910 " Charid:%3u CP:%4"UVxf" ",
1911 charid, uvc);
1912 });
07be1b83
YO
1913
1914 do {
6148ee25 1915#ifdef DEBUGGING
786e8c11 1916 word = aho->states[ state ].wordnum;
6148ee25 1917#endif
07be1b83
YO
1918 base = aho->states[ state ].trans.base;
1919
786e8c11
YO
1920 DEBUG_TRIE_EXECUTE_r({
1921 if (failed)
1922 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1923 s, utf8_target );
07be1b83 1924 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1925 "%sState: %4"UVxf", word=%"UVxf,
1926 failed ? " Fail transition to " : "",
1927 (UV)state, (UV)word);
1928 });
07be1b83
YO
1929 if ( base ) {
1930 U32 tmp;
6dd2be57 1931 I32 offset;
07be1b83 1932 if (charid &&
6dd2be57
DM
1933 ( ((offset = base + charid
1934 - 1 - trie->uniquecharcount)) >= 0)
1935 && ((U32)offset < trie->lasttrans)
1936 && trie->trans[offset].check == state
1937 && (tmp=trie->trans[offset].next))
07be1b83 1938 {
786e8c11
YO
1939 DEBUG_TRIE_EXECUTE_r(
1940 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1941 state = tmp;
1942 break;
1943 }
1944 else {
786e8c11
YO
1945 DEBUG_TRIE_EXECUTE_r(
1946 PerlIO_printf( Perl_debug_log," - fail\n"));
1947 failed = 1;
1948 state = aho->fail[state];
07be1b83
YO
1949 }
1950 }
1951 else {
1952 /* we must be accepting here */
786e8c11
YO
1953 DEBUG_TRIE_EXECUTE_r(
1954 PerlIO_printf( Perl_debug_log," - accepting\n"));
1955 failed = 1;
07be1b83
YO
1956 break;
1957 }
1958 } while(state);
786e8c11 1959 uc += len;
07be1b83
YO
1960 if (failed) {
1961 if (leftmost)
1962 break;
786e8c11 1963 if (!state) state = 1;
07be1b83
YO
1964 }
1965 }
1966 if ( aho->states[ state ].wordnum ) {
2e64971a 1967 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1968 if (!leftmost || lpos < leftmost) {
1969 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1970 leftmost = lpos;
786e8c11 1971 }
07be1b83 1972 }
07be1b83
YO
1973 if (leftmost) {
1974 s = (char*)leftmost;
786e8c11
YO
1975 DEBUG_TRIE_EXECUTE_r({
1976 PerlIO_printf(
70685ca0
JH
1977 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1978 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1979 );
1980 });
24b23f37 1981 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1982 FREETMPS;
1983 LEAVE;
07be1b83 1984 goto got_it;
be8e71aa 1985 }
07be1b83 1986 s = HOPc(s,1);
786e8c11
YO
1987 DEBUG_TRIE_EXECUTE_r({
1988 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1989 });
07be1b83 1990 } else {
786e8c11
YO
1991 DEBUG_TRIE_EXECUTE_r(
1992 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1993 break;
1994 }
1995 }
be8e71aa
YO
1996 FREETMPS;
1997 LEAVE;
07be1b83
YO
1998 }
1999 break;
b3c9acc1 2000 default:
3c3eec57
GS
2001 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2002 break;
d6a28714 2003 }
6eb5f6b9
JH
2004 return 0;
2005 got_it:
2006 return s;
2007}
2008
fae667d5 2009
6eb5f6b9
JH
2010/*
2011 - regexec_flags - match a regexp against a string
2012 */
2013I32
288b8c02 2014Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2015 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2016/* strend: pointer to null at end of string */
2017/* strbeg: real beginning of string */
2018/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2019/* data: May be used for some additional optimizations.
2020 Currently its only used, with a U32 cast, for transmitting
2021 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2022/* nosave: For optimizations. */
2023{
97aff369 2024 dVAR;
288b8c02 2025 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2026 /*register*/ char *s;
6eb5f6b9 2027 register regnode *c;
24b23f37 2028 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2029 I32 minlen; /* must match at least this many chars */
2030 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2031 I32 end_shift = 0; /* Same for the end. */ /* CC */
2032 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2033 char *scream_olds = NULL;
f2ed9b32 2034 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2035 I32 multiline;
f8fc2ecf 2036 RXi_GET_DECL(prog,progi);
3b0527fe 2037 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2038 regexp_paren_pair *swap = NULL;
a3621e74
YO
2039 GET_RE_DEBUG_FLAGS_DECL;
2040
7918f24d 2041 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2042 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2043
2044 /* Be paranoid... */
2045 if (prog == NULL || startpos == NULL) {
2046 Perl_croak(aTHX_ "NULL regexp parameter");
2047 return 0;
2048 }
2049
bbe252da 2050 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2051 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2052
f2ed9b32 2053 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2054 DEBUG_EXECUTE_r(
f2ed9b32 2055 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2056 "Matching");
2057 );
bac06658 2058
6eb5f6b9 2059 minlen = prog->minlen;
1de06328
YO
2060
2061 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2062 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2063 "String too short [regexec_flags]...\n"));
2064 goto phooey;
1aa99e6b 2065 }
6eb5f6b9 2066
1de06328 2067
6eb5f6b9 2068 /* Check validity of program. */
f8fc2ecf 2069 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2070 Perl_croak(aTHX_ "corrupted regexp program");
2071 }
2072
2073 PL_reg_flags = 0;
ed301438 2074 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2075 PL_reg_maxiter = 0;
2076
3c8556c3 2077 if (RX_UTF8(rx))
6eb5f6b9
JH
2078 PL_reg_flags |= RF_utf8;
2079
2080 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2081 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2082 PL_bostr = strbeg;
3b0527fe 2083 reginfo.sv = sv;
6eb5f6b9
JH
2084
2085 /* Mark end of line for $ (and such) */
2086 PL_regeol = strend;
2087
2088 /* see how far we have to get to not match where we matched before */
3b0527fe 2089 reginfo.till = startpos+minend;
6eb5f6b9 2090
6eb5f6b9
JH
2091 /* If there is a "must appear" string, look for it. */
2092 s = startpos;
2093
bbe252da 2094 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2095 MAGIC *mg;
2c296965 2096 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2097 reginfo.ganch = startpos + prog->gofs;
2c296965 2098 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2099 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2100 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2101 && SvMAGIC(sv)
14befaf4
DM
2102 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2103 && mg->mg_len >= 0) {
3b0527fe 2104 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2105 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2106 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2107
bbe252da 2108 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2109 if (s > reginfo.ganch)
6eb5f6b9 2110 goto phooey;
58e23c8d 2111 s = reginfo.ganch - prog->gofs;
2c296965 2112 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2113 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2114 if (s < strbeg)
2115 goto phooey;
6eb5f6b9
JH
2116 }
2117 }
58e23c8d 2118 else if (data) {
70685ca0 2119 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2120 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2121 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2122
2123 } else { /* pos() not defined */
3b0527fe 2124 reginfo.ganch = strbeg;
2c296965
YO
2125 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2126 "GPOS: reginfo.ganch = strbeg\n"));
2127 }
6eb5f6b9 2128 }
288b8c02 2129 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2130 /* We have to be careful. If the previous successful match
2131 was from this regex we don't want a subsequent partially
2132 successful match to clobber the old results.
2133 So when we detect this possibility we add a swap buffer
2134 to the re, and switch the buffer each match. If we fail
2135 we switch it back, otherwise we leave it swapped.
2136 */
2137 swap = prog->offs;
2138 /* do we need a save destructor here for eval dies? */
2139 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2140 }
a0714e2c 2141 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2142 re_scream_pos_data d;
2143
2144 d.scream_olds = &scream_olds;
2145 d.scream_pos = &scream_pos;
288b8c02 2146 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2147 if (!s) {
a3621e74 2148 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2149 goto phooey; /* not present */
3fa9c3d7 2150 }
6eb5f6b9
JH
2151 }
2152
1de06328 2153
6eb5f6b9
JH
2154
2155 /* Simplest case: anchored match need be tried only once. */
2156 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2157 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2158 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2159 goto got_it;
bbe252da
YO
2160 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2161 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2162 {
2163 char *end;
2164
2165 if (minlen)
2166 dontbother = minlen - 1;
1aa99e6b 2167 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2168 /* for multiline we only have to try after newlines */
33b8afdf 2169 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2170 /* because of the goto we can not easily reuse the macros for bifurcating the
2171 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2172 if (utf8_target) {
2173 if (s == startpos)
2174 goto after_try_utf8;
2175 while (1) {
2176 if (regtry(&reginfo, &s)) {
2177 goto got_it;
2178 }
2179 after_try_utf8:
2180 if (s > end) {
2181 goto phooey;
2182 }
2183 if (prog->extflags & RXf_USE_INTUIT) {
2184 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2185 if (!s) {
2186 goto phooey;
2187 }
2188 }
2189 else {
2190 s += UTF8SKIP(s);
2191 }
2192 }
2193 } /* end search for check string in unicode */
2194 else {
2195 if (s == startpos) {
2196 goto after_try_latin;
2197 }
2198 while (1) {
2199 if (regtry(&reginfo, &s)) {
2200 goto got_it;
2201 }
2202 after_try_latin:
2203 if (s > end) {
2204 goto phooey;
2205 }
2206 if (prog->extflags & RXf_USE_INTUIT) {
2207 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2208 if (!s) {
2209 goto phooey;
2210 }
2211 }
2212 else {
2213 s++;
2214 }
2215 }
2216 } /* end search for check string in latin*/
2217 } /* end search for check string */
2218 else { /* search for newline */
2219 if (s > startpos) {
2220 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2221 s--;
92f3d482 2222 }
21eede78
YO
2223 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2224 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2225 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2226 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2227 goto got_it;
2228 }
92f3d482
YO
2229 }
2230 } /* end search for newline */
2231 } /* end anchored/multiline check string search */
6eb5f6b9 2232 goto phooey;
bbe252da 2233 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2234 {
486ec47a 2235 /* the warning about reginfo.ganch being used without initialization
bbe252da 2236 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2237 and we only enter this block when the same bit is set. */
58e23c8d 2238 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2239
2240 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2241 goto got_it;
2242 goto phooey;
2243 }
2244
2245 /* Messy cases: unanchored match. */
bbe252da 2246 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2247 /* we have /x+whatever/ */
f2ed9b32 2248 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2249 char ch;
bf93d4cc
GS
2250#ifdef DEBUGGING
2251 int did_match = 0;
2252#endif
f2ed9b32
KW
2253 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2254 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2255 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2256
f2ed9b32 2257 if (utf8_target) {
4cadc6a9 2258 REXEC_FBC_SCAN(
6eb5f6b9 2259 if (*s == ch) {
a3621e74 2260 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2261 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2262 s += UTF8SKIP(s);
2263 while (s < strend && *s == ch)
2264 s += UTF8SKIP(s);
2265 }
4cadc6a9 2266 );
6eb5f6b9
JH
2267 }
2268 else {
4cadc6a9 2269 REXEC_FBC_SCAN(
6eb5f6b9 2270 if (*s == ch) {
a3621e74 2271 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2272 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2273 s++;
2274 while (s < strend && *s == ch)
2275 s++;
2276 }
4cadc6a9 2277 );
6eb5f6b9 2278 }
a3621e74 2279 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2280 PerlIO_printf(Perl_debug_log,
b7953727
JH
2281 "Did not find anchored character...\n")
2282 );
6eb5f6b9 2283 }
a0714e2c
SS
2284 else if (prog->anchored_substr != NULL
2285 || prog->anchored_utf8 != NULL
2286 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2287 && prog->float_max_offset < strend - s)) {
2288 SV *must;
2289 I32 back_max;
2290 I32 back_min;
2291 char *last;
6eb5f6b9 2292 char *last1; /* Last position checked before */
bf93d4cc
GS
2293#ifdef DEBUGGING
2294 int did_match = 0;
2295#endif
33b8afdf 2296 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2297 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2300 back_max = back_min = prog->anchored_offset;
2301 } else {
f2ed9b32
KW
2302 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2303 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2304 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2305 back_max = prog->float_max_offset;
2306 back_min = prog->float_min_offset;
2307 }
1de06328
YO
2308
2309
33b8afdf
JH
2310 if (must == &PL_sv_undef)
2311 /* could not downgrade utf8 check substring, so must fail */
2312 goto phooey;
2313
1de06328
YO
2314 if (back_min<0) {
2315 last = strend;
2316 } else {
2317 last = HOP3c(strend, /* Cannot start after this */
2318 -(I32)(CHR_SVLEN(must)
2319 - (SvTAIL(must) != 0) + back_min), strbeg);
2320 }
6eb5f6b9
JH
2321 if (s > PL_bostr)
2322 last1 = HOPc(s, -1);
2323 else
2324 last1 = s - 1; /* bogus */
2325
a0288114 2326 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2327 check_substr==must. */
2328 scream_pos = -1;
2329 dontbother = end_shift;
2330 strend = HOPc(strend, -dontbother);
2331 while ( (s <= last) &&
c33e64f0 2332 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2333 (unsigned char*)strend, must,
c33e64f0 2334 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2335 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2336 if (HOPc(s, -back_max) > last1) {
2337 last1 = HOPc(s, -back_min);
2338 s = HOPc(s, -back_max);
2339 }
2340 else {
52657f30 2341 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2342
2343 last1 = HOPc(s, -back_min);
52657f30 2344 s = t;
6eb5f6b9 2345 }
f2ed9b32 2346 if (utf8_target) {
6eb5f6b9 2347 while (s <= last1) {
24b23f37 2348 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2349 goto got_it;
2350 s += UTF8SKIP(s);
2351 }
2352 }
2353 else {
2354 while (s <= last1) {
24b23f37 2355 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2356 goto got_it;
2357 s++;
2358 }
2359 }
2360 }
ab3bbdeb 2361 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2362 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2363 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2364 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2365 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2366 ? "anchored" : "floating"),
ab3bbdeb
YO
2367 quoted, RE_SV_TAIL(must));
2368 });
6eb5f6b9
JH
2369 goto phooey;
2370 }
f8fc2ecf 2371 else if ( (c = progi->regstclass) ) {
f14c76ed 2372 if (minlen) {
f8fc2ecf 2373 const OPCODE op = OP(progi->regstclass);
66e933ab 2374 /* don't bother with what can't match */
786e8c11 2375 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2376 strend = HOPc(strend, -(minlen - 1));
2377 }
a3621e74 2378 DEBUG_EXECUTE_r({
be8e71aa 2379 SV * const prop = sv_newmortal();
32fc9b6a 2380 regprop(prog, prop, c);
0df25f3d 2381 {
f2ed9b32 2382 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2383 s,strend-s,60);
0df25f3d 2384 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2385 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2386 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2387 quoted, (int)(strend - s));
0df25f3d 2388 }
ffc61ed2 2389 });
3b0527fe 2390 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2391 goto got_it;
07be1b83 2392 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2393 }
2394 else {
2395 dontbother = 0;
a0714e2c 2396 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2397 /* Trim the end. */
6af40bd7 2398 char *last= NULL;
33b8afdf 2399 SV* float_real;
c33e64f0
FC
2400 STRLEN len;
2401 const char *little;
33b8afdf 2402
f2ed9b32
KW
2403 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2404 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2405 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2406
c33e64f0
FC
2407 little = SvPV_const(float_real, len);
2408 if (SvTAIL(float_real)) {
1a13b075
YO
2409 /* This means that float_real contains an artificial \n on the end
2410 * due to the presence of something like this: /foo$/
2411 * where we can match both "foo" and "foo\n" at the end of the string.
2412 * So we have to compare the end of the string first against the float_real
2413 * without the \n and then against the full float_real with the string.
2414 * We have to watch out for cases where the string might be smaller
2415 * than the float_real or the float_real without the \n.
2416 */
2417 char *checkpos= strend - len;
2418 DEBUG_OPTIMISE_r(
2419 PerlIO_printf(Perl_debug_log,
2420 "%sChecking for float_real.%s\n",
2421 PL_colors[4], PL_colors[5]));
2422 if (checkpos + 1 < strbeg) {
2423 /* can't match, even if we remove the trailing \n string is too short to match */
2424 DEBUG_EXECUTE_r(
2425 PerlIO_printf(Perl_debug_log,
2426 "%sString shorter than required trailing substring, cannot match.%s\n",
2427 PL_colors[4], PL_colors[5]));
2428 goto phooey;
2429 } else if (memEQ(checkpos + 1, little, len - 1)) {
2430 /* can match, the end of the string matches without the "\n" */
2431 last = checkpos + 1;
2432 } else if (checkpos < strbeg) {
2433 /* cant match, string is too short when the "\n" is included */
2434 DEBUG_EXECUTE_r(
2435 PerlIO_printf(Perl_debug_log,
2436 "%sString does not contain required trailing substring, cannot match.%s\n",
2437 PL_colors[4], PL_colors[5]));
2438 goto phooey;
2439 } else if (!multiline) {
2440 /* non multiline match, so compare with the "\n" at the end of the string */
2441 if (memEQ(checkpos, little, len)) {
2442 last= checkpos;
2443 } else {
2444 DEBUG_EXECUTE_r(
2445 PerlIO_printf(Perl_debug_log,
2446 "%sString does not contain required trailing substring, cannot match.%s\n",
2447 PL_colors[4], PL_colors[5]));
2448 goto phooey;
2449 }
2450 } else {
2451 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2452 goto find_last;
1a13b075 2453 }
c33e64f0 2454 } else {
d6a28714 2455 find_last:
9041c2e3 2456 if (len)
d6a28714 2457 last = rninstr(s, strend, little, little + len);
b8c5462f 2458 else
a0288114 2459 last = strend; /* matching "$" */
b8c5462f 2460 }
6af40bd7
YO
2461 if (!last) {
2462 /* at one point this block contained a comment which was probably
2463 * incorrect, which said that this was a "should not happen" case.
2464 * Even if it was true when it was written I am pretty sure it is
2465 * not anymore, so I have removed the comment and replaced it with
2466 * this one. Yves */
6bda09f9
YO
2467 DEBUG_EXECUTE_r(
2468 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2469 "String does not contain required substring, cannot match.\n"
2470 ));
2471 goto phooey;
bf93d4cc 2472 }
d6a28714
JH
2473 dontbother = strend - last + prog->float_min_offset;
2474 }
2475 if (minlen && (dontbother < minlen))
2476 dontbother = minlen - 1;
2477 strend -= dontbother; /* this one's always in bytes! */
2478 /* We don't know much -- general case. */
f2ed9b32 2479 if (utf8_target) {
d6a28714 2480 for (;;) {
24b23f37 2481 if (regtry(&reginfo, &s))
d6a28714
JH
2482 goto got_it;
2483 if (s >= strend)
2484 break;
b8c5462f 2485 s += UTF8SKIP(s);
d6a28714
JH
2486 };
2487 }
2488 else {
2489 do {
24b23f37 2490 if (regtry(&reginfo, &s))
d6a28714
JH
2491 goto got_it;
2492 } while (s++ < strend);
2493 }
2494 }
2495
2496 /* Failure. */
2497 goto phooey;
2498
2499got_it:
e9105d30 2500 Safefree(swap);
288b8c02 2501 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2502
ed301438 2503 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2504 restore_pos(aTHX_ prog);
5daac39c
NC
2505 if (RXp_PAREN_NAMES(prog))
2506 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2507
2508 /* make sure $`, $&, $', and $digit will work later */
2509 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2510 RX_MATCH_COPY_FREE(rx);
d6a28714 2511 if (flags & REXEC_COPY_STR) {
be8e71aa 2512 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2513#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2514 if ((SvIsCOW(sv)
2515 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2516 if (DEBUG_C_TEST) {
2517 PerlIO_printf(Perl_debug_log,
2518 "Copy on write: regexp capture, type %d\n",
2519 (int) SvTYPE(sv));
2520 }
2521 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2522 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2523 assert (SvPOKp(prog->saved_copy));
2524 } else
2525#endif
2526 {
288b8c02 2527 RX_MATCH_COPIED_on(rx);
ed252734
NC
2528 s = savepvn(strbeg, i);
2529 prog->subbeg = s;
2530 }
d6a28714 2531 prog->sublen = i;
d6a28714
JH
2532 }
2533 else {
2534 prog->subbeg = strbeg;
2535 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2536 }
2537 }
9041c2e3 2538
d6a28714
JH
2539 return 1;
2540
2541phooey:
a3621e74 2542 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2543 PL_colors[4], PL_colors[5]));
ed301438 2544 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2545 restore_pos(aTHX_ prog);
e9105d30 2546 if (swap) {
c74340f9 2547 /* we failed :-( roll it back */
e9105d30
GG
2548 Safefree(prog->offs);
2549 prog->offs = swap;
2550 }
2551
d6a28714
JH
2552 return 0;
2553}
2554
6bda09f9 2555
d6a28714
JH
2556/*
2557 - regtry - try match at specific point
2558 */
2559STATIC I32 /* 0 failure, 1 success */
24b23f37 2560S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2561{
97aff369 2562 dVAR;
d6a28714 2563 CHECKPOINT lastcp;
288b8c02
NC
2564 REGEXP *const rx = reginfo->prog;
2565 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2566 RXi_GET_DECL(prog,progi);
a3621e74 2567 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2568
2569 PERL_ARGS_ASSERT_REGTRY;
2570
24b23f37 2571 reginfo->cutpoint=NULL;
d6a28714 2572
ed301438
DM
2573 if ((prog->extflags & RXf_EVAL_SEEN)
2574 && !PL_reg_state.re_state_eval_setup_done)
2575 {
d6a28714
JH
2576 MAGIC *mg;
2577
ed301438 2578 PL_reg_state.re_state_eval_setup_done = TRUE;
a3621e74 2579 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2580 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2581 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2582 ));
ea8d6ae1 2583 SAVESTACK_CXPOS();
d6a28714
JH
2584 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2585 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2586 SAVETMPS;
2587 /* Apparently this is not needed, judging by wantarray. */
e8347627 2588 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2589 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2590
3b0527fe 2591 if (reginfo->sv) {
d6a28714 2592 /* Make $_ available to executed code. */
3b0527fe 2593 if (reginfo->sv != DEFSV) {
59f00321 2594 SAVE_DEFSV;
414bf5ae 2595 DEFSV_set(reginfo->sv);
b8c5462f 2596 }
d6a28714 2597
3b0527fe
DM
2598 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2599 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2600 /* prepare for quick setting of pos */
d300d9fa 2601#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2602 if (SvIsCOW(reginfo->sv))
2603 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2604#endif
3dab1dad 2605 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2606 &PL_vtbl_mglob, NULL, 0);
d6a28714 2607 mg->mg_len = -1;
b8c5462f 2608 }
d6a28714
JH
2609 PL_reg_magic = mg;
2610 PL_reg_oldpos = mg->mg_len;
4f639d21 2611 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2612 }
09687e5a 2613 if (!PL_reg_curpm) {
a02a5408 2614 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2615#ifdef USE_ITHREADS
2616 {
14a49a24 2617 SV* const repointer = &PL_sv_undef;
92313705
NC
2618 /* this regexp is also owned by the new PL_reg_curpm, which
2619 will try to free it. */
d2ece331 2620 av_push(PL_regex_padav, repointer);
09687e5a
AB
2621 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2622 PL_regex_pad = AvARRAY(PL_regex_padav);
2623 }
2624#endif
2625 }
86c29d75
NC
2626#ifdef USE_ITHREADS
2627 /* It seems that non-ithreads works both with and without this code.
2628 So for efficiency reasons it seems best not to have the code
2629 compiled when it is not needed. */
92313705
NC
2630 /* This is safe against NULLs: */
2631 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2632 /* PM_reg_curpm owns a reference to this regexp. */
cd733337 2633 (void)ReREFCNT_inc(rx);
86c29d75 2634#endif
288b8c02 2635 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2636 PL_reg_oldcurpm = PL_curpm;
2637 PL_curpm = PL_reg_curpm;
07bc277f 2638 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2639 /* Here is a serious problem: we cannot rewrite subbeg,
2640 since it may be needed if this match fails. Thus
2641 $` inside (?{}) could fail... */
2642 PL_reg_oldsaved = prog->subbeg;
2643 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2644#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2645 PL_nrs = prog->saved_copy;
2646#endif
07bc277f 2647 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2648 }
2649 else
bd61b366 2650 PL_reg_oldsaved = NULL;
d6a28714
JH
2651 prog->subbeg = PL_bostr;
2652 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2653 }
24b23f37 2654 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2655 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2656 PL_reginput = *startpos;
d6a28714 2657 PL_reglastparen = &prog->lastparen;
a01268b5 2658 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2659 prog->lastparen = 0;
03994de8 2660 prog->lastcloseparen = 0;
d6a28714 2661 PL_regsize = 0;
f0ab9afb 2662 PL_regoffs = prog->offs;
d6a28714
JH
2663 if (PL_reg_start_tmpl <= prog->nparens) {
2664 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2665 if(PL_reg_start_tmp)
2666 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2667 else
a02a5408 2668 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2669 }
2670
2671 /* XXXX What this code is doing here?!!! There should be no need
2672 to do this again and again, PL_reglastparen should take care of
3dd2943c 2673 this! --ilya*/
dafc8851
JH
2674
2675 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2676 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2677 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2678 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2679 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2680 * Meanwhile, this code *is* needed for the
daf18116
JH
2681 * above-mentioned test suite tests to succeed. The common theme
2682 * on those tests seems to be returning null fields from matches.
225593e1 2683 * --jhi updated by dapm */
dafc8851 2684#if 1
d6a28714 2685 if (prog->nparens) {
f0ab9afb 2686 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2687 register I32 i;
eb160463 2688 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2689 ++pp;
2690 pp->start = -1;
2691 pp->end = -1;
d6a28714
JH
2692 }
2693 }
dafc8851 2694#endif
02db2b7b 2695 REGCP_SET(lastcp);
f8fc2ecf 2696 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2697 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2698 return 1;
2699 }
24b23f37
YO
2700 if (reginfo->cutpoint)
2701 *startpos= reginfo->cutpoint;
02db2b7b 2702 REGCP_UNWIND(lastcp);
d6a28714
JH
2703 return 0;
2704}
2705
02db2b7b 2706
8ba1375e
MJD
2707#define sayYES goto yes
2708#define sayNO goto no
262b90c4 2709#define sayNO_SILENT goto no_silent
8ba1375e 2710
f9f4320a
YO
2711/* we dont use STMT_START/END here because it leads to
2712 "unreachable code" warnings, which are bogus, but distracting. */
2713#define CACHEsayNO \
c476f425
DM
2714 if (ST.cache_mask) \
2715 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2716 sayNO
3298f257 2717
a3621e74 2718/* this is used to determine how far from the left messages like
265c4333
YO
2719 'failed...' are printed. It should be set such that messages
2720 are inline with the regop output that created them.
a3621e74 2721*/
265c4333 2722#define REPORT_CODE_OFF 32
a3621e74
YO
2723
2724
40a82448
DM
2725#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2726#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2727
86545054
DM
2728#define SLAB_FIRST(s) (&(s)->states[0])
2729#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2730
5d9a96ca
DM
2731/* grab a new slab and return the first slot in it */
2732
2733STATIC regmatch_state *
2734S_push_slab(pTHX)
2735{
a35a87e7 2736#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2737 dMY_CXT;
2738#endif
5d9a96ca
DM
2739 regmatch_slab *s = PL_regmatch_slab->next;
2740 if (!s) {
2741 Newx(s, 1, regmatch_slab);
2742 s->prev = PL_regmatch_slab;
2743 s->next = NULL;
2744 PL_regmatch_slab->next = s;
2745 }
2746 PL_regmatch_slab = s;
86545054 2747 return SLAB_FIRST(s);
5d9a96ca 2748}
5b47454d 2749
95b24440 2750
40a82448
DM
2751/* push a new state then goto it */
2752
2753#define PUSH_STATE_GOTO(state, node) \
2754 scan = node; \
2755 st->resume_state = state; \
2756 goto push_state;
2757
2758/* push a new state with success backtracking, then goto it */
2759
2760#define PUSH_YES_STATE_GOTO(state, node) \
2761 scan = node; \
2762 st->resume_state = state; \
2763 goto push_yes_state;
2764
aa283a38 2765
aa283a38 2766
d6a28714 2767/*
95b24440 2768
bf1f174e
DM
2769regmatch() - main matching routine
2770
2771This is basically one big switch statement in a loop. We execute an op,
2772set 'next' to point the next op, and continue. If we come to a point which
2773we may need to backtrack to on failure such as (A|B|C), we push a
2774backtrack state onto the backtrack stack. On failure, we pop the top
2775state, and re-enter the loop at the state indicated. If there are no more
2776states to pop, we return failure.
2777
2778Sometimes we also need to backtrack on success; for example /A+/, where
2779after successfully matching one A, we need to go back and try to
2780match another one; similarly for lookahead assertions: if the assertion
2781completes successfully, we backtrack to the state just before the assertion
2782and then carry on. In these cases, the pushed state is marked as
2783'backtrack on success too'. This marking is in fact done by a chain of
2784pointers, each pointing to the previous 'yes' state. On success, we pop to
2785the nearest yes state, discarding any intermediate failure-only states.
2786Sometimes a yes state is pushed just to force some cleanup code to be
2787called at the end of a successful match or submatch; e.g. (??{$re}) uses
2788it to free the inner regex.
2789
2790Note that failure backtracking rewinds the cursor position, while
2791success backtracking leaves it alone.
2792
2793A pattern is complete when the END op is executed, while a subpattern
2794such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2795ops trigger the "pop to last yes state if any, otherwise return true"
2796behaviour.
2797
2798A common convention in this function is to use A and B to refer to the two
2799subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2800the subpattern to be matched possibly multiple times, while B is the entire
2801rest of the pattern. Variable and state names reflect this convention.
2802
2803The states in the main switch are the union of ops and failure/success of
2804substates associated with with that op. For example, IFMATCH is the op
2805that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2806'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2807successfully matched A and IFMATCH_A_fail is a state saying that we have
2808just failed to match A. Resume states always come in pairs. The backtrack
2809state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2810at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2811on success or failure.
2812
2813The struct that holds a backtracking state is actually a big union, with
2814one variant for each major type of op. The variable st points to the
2815top-most backtrack struct. To make the code clearer, within each
2816block of code we #define ST to alias the relevant union.
2817
2818Here's a concrete example of a (vastly oversimplified) IFMATCH
2819implementation:
2820
2821 switch (state) {
2822 ....
2823
2824#define ST st->u.ifmatch
2825
2826 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2827 ST.foo = ...; // some state we wish to save
95b24440 2828 ...
bf1f174e
DM
2829 // push a yes backtrack state with a resume value of
2830 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2831 // first node of A:
2832 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2833 // NOTREACHED
2834
2835 case IFMATCH_A: // we have successfully executed A; now continue with B
2836 next = B;
2837 bar = ST.foo; // do something with the preserved value
2838 break;
2839
2840 case IFMATCH_A_fail: // A failed, so the assertion failed
2841 ...; // do some housekeeping, then ...
2842 sayNO; // propagate the failure
2843
2844#undef ST
95b24440 2845
bf1f174e
DM
2846 ...
2847 }
95b24440 2848
bf1f174e
DM
2849For any old-timers reading this who are familiar with the old recursive
2850approach, the code above is equivalent to:
95b24440 2851
bf1f174e
DM
2852 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2853 {
2854 int foo = ...
95b24440 2855 ...
bf1f174e
DM
2856 if (regmatch(A)) {
2857 next = B;
2858 bar = foo;
2859 break;
95b24440 2860 }
bf1f174e
DM
2861 ...; // do some housekeeping, then ...
2862 sayNO; // propagate the failure
95b24440 2863 }
bf1f174e
DM
2864
2865The topmost backtrack state, pointed to by st, is usually free. If you
2866want to claim it, populate any ST.foo fields in it with values you wish to
2867save, then do one of
2868
2869 PUSH_STATE_GOTO(resume_state, node);
2870 PUSH_YES_STATE_GOTO(resume_state, node);
2871
2872which sets that backtrack state's resume value to 'resume_state', pushes a
2873new free entry to the top of the backtrack stack, then goes to 'node'.
2874On backtracking, the free slot is popped, and the saved state becomes the
2875new free state. An ST.foo field in this new top state can be temporarily
2876accessed to retrieve values, but once the main loop is re-entered, it
2877becomes available for reuse.
2878
2879Note that the depth of the backtrack stack constantly increases during the
2880left-to-right execution of the pattern, rather than going up and down with
2881the pattern nesting. For example the stack is at its maximum at Z at the
2882end of the pattern, rather than at X in the following:
2883
2884 /(((X)+)+)+....(Y)+....Z/
2885
2886The only exceptions to this are lookahead/behind assertions and the cut,
2887(?>A), which pop all the backtrack states associated with A before
2888continuing.
2889
486ec47a 2890Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2891PL_regmatch_state and st always point to the currently active state,
2892and PL_regmatch_slab points to the slab currently containing
2893PL_regmatch_state. The first time regmatch() is called, the first slab is
2894allocated, and is never freed until interpreter destruction. When the slab
2895is full, a new one is allocated and chained to the end. At exit from
2896regmatch(), slabs allocated since entry are freed.
2897
2898*/
95b24440 2899
40a82448 2900
5bc10b2c 2901#define DEBUG_STATE_pp(pp) \
265c4333 2902 DEBUG_STATE_r({ \
f2ed9b32 2903 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2904 PerlIO_printf(Perl_debug_log, \
5d458dd8 2905 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2906 depth*2, "", \
13d6edb4 2907 PL_reg_name[st->resume_state], \
5d458dd8
YO
2908 ((st==yes_state||st==mark_state) ? "[" : ""), \
2909 ((st==yes_state) ? "Y" : ""), \
2910 ((st==mark_state) ? "M" : ""), \
2911 ((st==yes_state||st==mark_state) ? "]" : "") \
2912 ); \
265c4333 2913 });
5bc10b2c 2914
40a82448 2915
3dab1dad 2916#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2917
3df15adc 2918#ifdef DEBUGGING
5bc10b2c 2919
ab3bbdeb 2920STATIC void
f2ed9b32 2921S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2922 const char *start, const char *end, const char *blurb)
2923{
efd26800 2924 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2925
2926 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2927
ab3bbdeb
YO
2928 if (!PL_colorset)
2929 reginitcolors();
2930 {
2931 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2932 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2933
f2ed9b32 2934 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2935 start, end - start, 60);
2936
2937 PerlIO_printf(Perl_debug_log,
2938 "%s%s REx%s %s against %s\n",
2939 PL_colors[4], blurb, PL_colors[5], s0, s1);
2940
f2ed9b32 2941 if (utf8_target||utf8_pat)
1de06328
YO
2942 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2943 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2944 utf8_pat && utf8_target ? " and " : "",
2945 utf8_target ? "string" : ""
ab3bbdeb
YO
2946 );
2947 }
2948}
3df15adc
YO
2949
2950STATIC void
786e8c11
YO
2951S_dump_exec_pos(pTHX_ const char *locinput,
2952 const regnode *scan,
2953 const char *loc_regeol,
2954 const char *loc_bostr,
2955 const char *loc_reg_starttry,
f2ed9b32 2956 const bool utf8_target)
07be1b83 2957{
786e8c11 2958 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2959 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2960 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2961 /* The part of the string before starttry has one color
2962 (pref0_len chars), between starttry and current
2963 position another one (pref_len - pref0_len chars),
2964 after the current position the third one.
2965 We assume that pref0_len <= pref_len, otherwise we
2966 decrease pref0_len. */
786e8c11
YO
2967 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2968 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2969 int pref0_len;
2970
7918f24d
NC
2971 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2972
f2ed9b32 2973 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2974 pref_len++;
786e8c11
YO
2975 pref0_len = pref_len - (locinput - loc_reg_starttry);
2976 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2977 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2978 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2979 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2980 l--;
2981 if (pref0_len < 0)
2982 pref0_len = 0;
2983 if (pref0_len > pref_len)
2984 pref0_len = pref_len;
2985 {
f2ed9b32 2986 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2987
ab3bbdeb 2988 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2989 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2990
ab3bbdeb 2991 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2992 (locinput - pref_len + pref0_len),
1de06328 2993 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2994
ab3bbdeb 2995 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2996 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2997
1de06328 2998 const STRLEN tlen=len0+len1+len2;
3df15adc 2999 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3000 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3001 (IV)(locinput - loc_bostr),
07be1b83 3002 len0, s0,
07be1b83 3003 len1, s1,
07be1b83 3004 (docolor ? "" : "> <"),
07be1b83 3005 len2, s2,
f9f4320a 3006 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3007 "");
3008 }
3009}
3df15adc 3010
07be1b83
YO
3011#endif
3012
0a4db386
YO
3013/* reg_check_named_buff_matched()
3014 * Checks to see if a named buffer has matched. The data array of
3015 * buffer numbers corresponding to the buffer is expected to reside
3016 * in the regexp->data->data array in the slot stored in the ARG() of
3017 * node involved. Note that this routine doesn't actually care about the
3018 * name, that information is not preserved from compilation to execution.
3019 * Returns the index of the leftmost defined buffer with the given name
3020 * or 0 if non of the buffers matched.
3021 */
3022STATIC I32
7918f24d
NC
3023S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3024{
0a4db386 3025 I32 n;
f8fc2ecf 3026 RXi_GET_DECL(rex,rexi);
ad64d0ec 3027 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3028 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3029
3030 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3031
0a4db386
YO
3032 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3033 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 3034 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
3035 {
3036 return nums[n];
3037 }
3038 }
3039 return 0;
3040}
3041
2f554ef7
DM
3042
3043/* free all slabs above current one - called during LEAVE_SCOPE */
3044
3045STATIC void
3046S_clear_backtrack_stack(pTHX_ void *p)
3047{
3048 regmatch_slab *s = PL_regmatch_slab->next;
3049 PERL_UNUSED_ARG(p);
3050
3051 if (!s)
3052 return;
3053 PL_regmatch_slab->next = NULL;
3054 while (s) {
3055 regmatch_slab * const osl = s;
3056 s = s->next;
3057 Safefree(osl);
3058 }
3059}
3060
3061
28d8d7f4 3062#define SETREX(Re1,Re2) \
ed301438
DM
3063 if (PL_reg_state.re_state_eval_setup_done) \
3064 PM_SETRE((PL_reg_curpm), (Re2)); \
28d8d7f4
YO
3065 Re1 = (Re2)
3066
d6a28714 3067STATIC I32 /* 0 failure, 1 success */
24b23f37 3068S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3069{
a35a87e7 3070#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3071 dMY_CXT;
3072#endif
27da23d5 3073 dVAR;
f2ed9b32 3074 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3075 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3076 REGEXP *rex_sv = reginfo->prog;
3077 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3078 RXi_GET_DECL(rex,rexi);
2f554ef7 3079 I32 oldsave;
5d9a96ca
DM
3080 /* the current state. This is a cached copy of PL_regmatch_state */
3081 register regmatch_state *st;
5d9a96ca
DM
3082 /* cache heavy used fields of st in registers */
3083 register regnode *scan;
3084 register regnode *next;
438e9bae 3085 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3086 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3087 register char *locinput = PL_reginput;
5d9a96ca 3088 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3089
b69b0499 3090 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3091 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3092 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3093 const U32 max_nochange_depth =
3094 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3095 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3096 regmatch_state *yes_state = NULL; /* state to pop to on success of
3097 subpattern */
e2e6a0f1
YO
3098 /* mark_state piggy backs on the yes_state logic so that when we unwind
3099 the stack on success we can update the mark_state as we go */
3100 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3101 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3102 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3103 U32 state_num;
5d458dd8
YO
3104 bool no_final = 0; /* prevent failure from backtracking? */
3105 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3106 char *startpoint = PL_reginput;
5d458dd8
YO
3107 SV *popmark = NULL; /* are we looking for a mark? */
3108 SV *sv_commit = NULL; /* last mark name seen in failure */
3109 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3110 during a successful match */
5d458dd8
YO
3111 U32 lastopen = 0; /* last open we saw */
3112 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3113 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3114 /* these three flags are set by various ops to signal information to
3115 * the very next op. They have a useful lifetime of exactly one loop
3116 * iteration, and are not preserved or restored by state pushes/pops
3117 */
3118 bool sw = 0; /* the condition value in (?(cond)a|b) */
3119 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3120 int logical = 0; /* the following EVAL is:
3121 0: (?{...})
3122 1: (?(?{...})X|Y)
3123 2: (??{...})
3124 or the following IFMATCH/UNLESSM is:
3125 false: plain (?=foo)
3126 true: used as a condition: (?(?=foo))
3127 */
68e2671b 3128 PAD* const initial_pad = PL_comppad;
95b24440 3129#ifdef DEBUGGING
e68ec53f 3130 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3131#endif
3132
7918f24d
NC
3133 PERL_ARGS_ASSERT_REGMATCH;
3134
3b57cd43 3135 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3136 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3137 }));
5d9a96ca
DM
3138 /* on first ever call to regmatch, allocate first slab */
3139 if (!PL_regmatch_slab) {
3140 Newx(PL_regmatch_slab, 1, regmatch_slab);
3141 PL_regmatch_slab->prev = NULL;
3142 PL_regmatch_slab->next = NULL;
86545054 3143 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3144 }
3145
2f554ef7
DM
3146 oldsave = PL_savestack_ix;
3147 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3148 SAVEVPTR(PL_regmatch_slab);
3149 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3150
3151 /* grab next free state slot */
3152 st = ++PL_regmatch_state;
86545054 3153 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3154 st = PL_regmatch_state = S_push_slab(aTHX);
3155
d6a28714
JH
3156 /* Note that nextchr is a byte even in UTF */
3157 nextchr = UCHARAT(locinput);
3158 scan = prog;
3159 while (scan != NULL) {
8ba1375e 3160
a3621e74 3161 DEBUG_EXECUTE_r( {
6136c704 3162 SV * const prop = sv_newmortal();
1de06328 3163 regnode *rnext=regnext(scan);
f2ed9b32 3164 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3165 regprop(rex, prop, scan);
07be1b83
YO
3166
3167 PerlIO_printf(Perl_debug_log,
3168 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3169 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3170 SvPVX_const(prop),
1de06328 3171 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3172 0 : (IV)(rnext - rexi->program));
2a782b5b 3173 });
d6a28714
JH
3174
3175 next = scan + NEXT_OFF(scan);
3176 if (next == scan)
3177 next = NULL;
40a82448 3178 state_num = OP(scan);
d6a28714 3179
40a82448 3180 reenter_switch:
34a81e2b
B
3181
3182 assert(PL_reglastparen == &rex->lastparen);
3183 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3184 assert(PL_regoffs == rex->offs);
3185
40a82448 3186 switch (state_num) {
d6a28714 3187 case BOL:
7fba1cd6 3188 if (locinput == PL_bostr)
d6a28714 3189 {
3b0527fe 3190 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3191 break;
3192 }
d6a28714
JH
3193 sayNO;
3194 case MBOL:
12d33761
HS
3195 if (locinput == PL_bostr ||
3196 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3197 {
b8c5462f
JH
3198 break;
3199 }
d6a28714
JH
3200 sayNO;
3201 case SBOL:
c2a73568 3202 if (locinput == PL_bostr)
b8c5462f 3203 break;
d6a28714
JH
3204 sayNO;
3205 case GPOS:
3b0527fe 3206 if (locinput == reginfo->ganch)
d6a28714
JH
3207 break;
3208 sayNO;
ee9b8eae
YO
3209
3210 case KEEPS:
3211 /* update the startpoint */
f0ab9afb 3212 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3213 PL_reginput = locinput;
f0ab9afb 3214 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3215 PUSH_STATE_GOTO(KEEPS_next, next);
3216 /*NOT-REACHED*/
3217 case KEEPS_next_fail:
3218 /* rollback the start point change */
f0ab9afb 3219 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3220 sayNO_SILENT;
3221 /*NOT-REACHED*/
d6a28714 3222 case EOL:
d6a28714
JH
3223 goto seol;
3224 case MEOL:
d6a28714 3225 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3226 sayNO;
b8c5462f 3227 break;
d6a28714
JH
3228 case SEOL:
3229 seol:
3230 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3231 sayNO;
d6a28714 3232 if (PL_regeol - locinput > 1)
b8c5462f 3233 sayNO;
b8c5462f 3234 break;
d6a28714
JH
3235 case EOS:
3236 if (PL_regeol != locinput)
b8c5462f 3237 sayNO;
d6a28714 3238 break;
ffc61ed2 3239 case SANY:
d6a28714 3240 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3241 sayNO;
f2ed9b32 3242 if (utf8_target) {
f33976b4
DB
3243 locinput += PL_utf8skip[nextchr];
3244 if (locinput > PL_regeol)
3245 sayNO;
3246 nextchr = UCHARAT(locinput);
3247 }
3248 else
3249 nextchr = UCHARAT(++locinput);
3250 break;
3251 case CANY:
3252 if (!nextchr && locinput >= PL_regeol)
3253 sayNO;
b8c5462f 3254 nextchr = UCHARAT(++locinput);
a0d0e21e 3255 break;
ffc61ed2 3256 case REG_ANY:
1aa99e6b
IH
3257 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3258 sayNO;
f2ed9b32 3259 if (utf8_target) {
b8c5462f 3260 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3261 if (locinput > PL_regeol)
3262 sayNO;
a0ed51b3 3263 nextchr = UCHARAT(locinput);
a0ed51b3 3264 }
1aa99e6b
IH
3265 else
3266 nextchr = UCHARAT(++locinput);
a0ed51b3 3267 break;
166ba7cd
DM
3268
3269#undef ST
3270#define ST st->u.trie
786e8c11
YO
3271 case TRIEC:
3272 /* In this case the charclass data is available inline so
3273 we can fail fast without a lot of extra overhead.
3274 */
fab2782b
YO
3275 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3276 DEBUG_EXECUTE_r(
3277 PerlIO_printf(Perl_debug_log,
3278 "%*s %sfailed to match trie start class...%s\n",
3279 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3280 );
3281 sayNO_SILENT;
3282 /* NOTREACHED */
786e8c11
YO
3283 }
3284 /* FALL THROUGH */
5b47454d 3285 case TRIE:
2e64971a
DM
3286 /* the basic plan of execution of the trie is:
3287 * At the beginning, run though all the states, and
3288 * find the longest-matching word. Also remember the position
3289 * of the shortest matching word. For example, this pattern:
3290 * 1 2 3 4 5
3291 * ab|a|x|abcd|abc
3292 * when matched against the string "abcde", will generate
3293 * accept states for all words except 3, with the longest
3294 * matching word being 4, and the shortest being 1 (with
3295 * the position being after char 1 of the string).
3296 *
3297 * Then for each matching word, in word order (i.e. 1,2,4,5),
3298 * we run the remainder of the pattern; on each try setting
3299 * the current position to the character following the word,
3300 * returning to try the next word on failure.
3301 *
3302 * We avoid having to build a list of words at runtime by
3303 * using a compile-time structure, wordinfo[].prev, which
3304 * gives, for each word, the previous accepting word (if any).
3305 * In the case above it would contain the mappings 1->2, 2->0,
3306 * 3->0, 4->5, 5->1. We can use this table to generate, from
3307 * the longest word (4 above), a list of all words, by
3308 * following the list of prev pointers; this gives us the
3309 * unordered list 4,5,1,2. Then given the current word we have
3310 * just tried, we can go through the list and find the
3311 * next-biggest word to try (so if we just failed on word 2,
3312 * the next in the list is 4).
3313 *
3314 * Since at runtime we don't record the matching position in
3315 * the string for each word, we have to work that out for
3316 * each word we're about to process. The wordinfo table holds
3317 * the character length of each word; given that we recorded
3318 * at the start: the position of the shortest word and its
3319 * length in chars, we just need to move the pointer the
3320 * difference between the two char lengths. Depending on
3321 * Unicode status and folding, that's cheap or expensive.
3322 *
3323 * This algorithm is optimised for the case where are only a
3324 * small number of accept states, i.e. 0,1, or maybe 2.
3325 * With lots of accepts states, and having to try all of them,
3326 * it becomes quadratic on number of accept states to find all
3327 * the next words.
3328 */
3329
3dab1dad 3330 {
07be1b83 3331 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3332 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3333
3334 /* what trie are we using right now */
be8e71aa 3335 reg_trie_data * const trie
f8fc2ecf 3336 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3337 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3338 U32 state = trie->startstate;
166ba7cd 3339
fab2782b 3340 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3341 if (trie->states[ state ].wordnum) {
3342 DEBUG_EXECUTE_r(
3343 PerlIO_printf(Perl_debug_log,
3344 "%*s %smatched empty string...%s\n",
5bc10b2c 3345 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3346 );
20dbff7c
YO
3347 if (!trie->jump)
3348 break;
3dab1dad
YO
3349 } else {
3350 DEBUG_EXECUTE_r(
3351 PerlIO_printf(Perl_debug_log,
786e8c11 3352 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3353 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3354 );
3355 sayNO_SILENT;
3356 }
3357 }
166ba7cd 3358
786e8c11
YO
3359 {
3360 U8 *uc = ( U8* )locinput;
3361
3362 STRLEN len = 0;
3363 STRLEN foldlen = 0;
3364 U8 *uscan = (U8*)NULL;
786e8c11 3365 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3366 U32 charcount = 0; /* how many input chars we have matched */
3367 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3368
786e8c11
YO
3369 ST.B = next;
3370 ST.jump = trie->jump;
786e8c11 3371 ST.me = scan;
2e64971a
DM
3372 ST.firstpos = NULL;
3373 ST.longfold = FALSE; /* char longer if folded => it's harder */
3374 ST.nextword = 0;
3375
3376 /* fully traverse the TRIE; note the position of the
3377 shortest accept state and the wordnum of the longest
3378 accept state */
07be1b83 3379
a3621e74 3380 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3381 U32 base = trie->states[ state ].trans.base;
f9f4320a 3382 UV uvc = 0;
acb909b4 3383 U16 charid = 0;
2e64971a
DM
3384 U16 wordnum;
3385 wordnum = trie->states[ state ].wordnum;
3386
3387 if (wordnum) { /* it's an accept state */
3388 if (!accepted) {
3389 accepted = 1;
3390 /* record first match position */
3391 if (ST.longfold) {
3392 ST.firstpos = (U8*)locinput;
3393 ST.firstchars = 0;
5b47454d 3394 }
2e64971a
DM
3395 else {
3396 ST.firstpos = uc;
3397 ST.firstchars = charcount;
3398 }
3399 }
3400 if (!ST.nextword || wordnum < ST.nextword)
3401 ST.nextword = wordnum;
3402 ST.topword = wordnum;
786e8c11 3403 }