This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
propagate /msix and (?msix) etc flags into (??{})
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
7e2509c1
KW
94/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
95 * call if there are no complications: i.e., if everything matchable is
96 * straight forward in the bitmap */
af364d03
KW
97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
98 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 99
c277df42
IZ
100/*
101 * Forwards.
102 */
103
f2ed9b32 104#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 105#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 106
3dab1dad
YO
107#define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
52657f30 109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
110 : (U8*)(pos + off))
111#define HOPBACKc(pos, off) \
07be1b83
YO
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
8e11feef 115 ? (U8*)pos - off \
3dab1dad 116 : NULL)
efb30f32 117
e7409c1b 118#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 119#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 120
20d0b1e9 121/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
122#ifdef EBCDIC
123 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
124 * skip the check on EBCDIC platforms */
125# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
126#else
127# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
128 if (!CAT2(PL_utf8_,class)) { \
129 bool ok; \
130 ENTER; save_re_context(); \
131 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 133#endif
37e2e78e
KW
134
135/* Doesn't do an assert to verify that is correct */
136#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 137 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 138 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
139 ENTER; save_re_context(); \
140 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 LEAVE; } } STMT_END
37e2e78e 142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
144#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
145#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 146
37e2e78e 147#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
148 LOAD_UTF8_CHARCLASS(X_begin, " "); \
149 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
150 /* These are utf8 constants, and not utf-ebcdic constants, so the \
151 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 152 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
153 \
154 /* No asserts are done for these, in case called on an early \
155 * Unicode version in which they map to nothing */ \
d275fa5e
CB
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 163
1dcf4a1b 164#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 165
ee9a90b8
KW
166/* The actual code for CCC_TRY, which uses several variables from the routine
167 * it's callable from. It is designed to be the bulk of a case statement.
168 * FUNC is the macro or function to call on non-utf8 targets that indicate if
169 * nextchr matches the class.
170 * UTF8_TEST is the whole test string to use for utf8 targets
171 * LOAD is what to use to test, and if not present to load in the swash for the
172 * class
173 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
174 * UTF8_TEST test.
175 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
176 * utf8 and a variant, load the swash if necessary and test using the utf8
177 * test. Advance to the next character if test is ok, otherwise fail; If not
178 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
179 * fails, or advance to the next character */
180
181#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
182 if (locinput >= PL_regeol) { \
183 sayNO; \
184 } \
185 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
186 LOAD_UTF8_CHARCLASS(CLASS, STR); \
187 if (POS_OR_NEG (UTF8_TEST)) { \
188 sayNO; \
189 } \
190 locinput += PL_utf8skip[nextchr]; \
191 nextchr = UCHARAT(locinput); \
192 break; \
193 } \
194 if (POS_OR_NEG (FUNC(nextchr))) { \
195 sayNO; \
196 } \
197 nextchr = UCHARAT(++locinput); \
980866de
KW
198 break;
199
ee9a90b8
KW
200/* Handle the non-locale cases for a character class and its complement. It
201 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
202 * This is because that code fails when the test succeeds, so we want to have
203 * the test fail so that the code succeeds. The swash is stored in a
204 * predictable PL_ place */
cfaf538b
KW
205#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
206 CLASS, STR) \
ee9a90b8
KW
207 case NAME: \
208 _CCC_TRY_CODE( !, FUNC, \
209 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
210 (U8*)locinput, TRUE)), \
211 CLASS, STR) \
212 case NNAME: \
1dcf4a1b 213 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
214 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
215 (U8*)locinput, TRUE)), \
216 CLASS, STR) \
217
218/* Generate the case statements for both locale and non-locale character
219 * classes in regmatch for classes that don't have special unicode semantics.
220 * Locales don't use an immediate swash, but an intermediary special locale
221 * function that is called on the pointer to the current place in the input
222 * string. That function will resolve to needing the same swash. One might
223 * think that because we don't know what the locale will match, we shouldn't
224 * check with the swash loading function that it loaded properly; ie, that we
225 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
226 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 * irrelevant here */
228#define CCC_TRY(NAME, NNAME, FUNC, \
229 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 230 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
231 CLASS, STR) \
232 case NAMEL: \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 case NNAMEL: \
236 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
237 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
238 CLASS, STR) \
cfaf538b
KW
239 case NAMEA: \
240 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
244 nextchr = UCHARAT(++locinput); \
245 break; \
246 case NNAMEA: \
247 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 sayNO; \
249 } \
250 if (utf8_target) { \
251 locinput += PL_utf8skip[nextchr]; \
252 nextchr = UCHARAT(locinput); \
253 } \
254 else { \
255 nextchr = UCHARAT(++locinput); \
256 } \
257 break; \
ee9a90b8
KW
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260
261/* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263#define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
cfaf538b 266 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 267 CLASS, STR) \
cfaf538b
KW
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
271 CLASS, STR) \
ee9a90b8 272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 273
3dab1dad
YO
274/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275
5f80c4cf 276/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
277/* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 *
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
285*/
3e901dc0
YO
286#define JUMPABLE(rn) ( \
287 OP(rn) == OPEN || \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == EVAL || \
cca55fe3
JP
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 292 OP(rn) == KEEPS || \
3dab1dad 293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 294)
ee9b8eae 295#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 296
ee9b8eae
YO
297#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298
299#if 0
300/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 303#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305
306#else
307/* ... so we use this as its faster. */
308#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 309#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
310#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
312
313#endif
e2d8ce26 314
a84d97b6
HS
315/*
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318*/
cca55fe3 319#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 323 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 324 else if (type == PLUS) \
cca55fe3 325 rn = NEXTOPER(rn); \
3dab1dad 326 else if (type == IFMATCH) \
a84d97b6 327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 328 else rn += NEXT_OFF(rn); \
3dab1dad 329 } \
5f80c4cf 330} STMT_END
74750237 331
c476f425 332
acfe0abc 333static void restore_pos(pTHX_ void *arg);
51371543 334
87c0511b 335#define REGCP_PAREN_ELEMS 3
f067efbf 336#define REGCP_OTHER_ELEMS 3
e0fa7e2b 337#define REGCP_FRAME_ELEMS 1
620d5b66
NC
338/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
339 * are needed for the regexp context stack bookkeeping. */
340
76e3520e 341STATIC CHECKPOINT
b93070ed 342S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 343{
97aff369 344 dVAR;
a3b680e6 345 const int retval = PL_savestack_ix;
a3b680e6 346 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
347 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
348 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 349 I32 p;
40a82448 350 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 351
b93070ed
DM
352 PERL_ARGS_ASSERT_REGCPPUSH;
353
e49a9654 354 if (paren_elems_to_push < 0)
5637ef5b
NC
355 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
356 paren_elems_to_push);
e49a9654 357
e0fa7e2b
NC
358 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
359 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
360 " out of range (%lu-%ld)",
361 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 362
620d5b66 363 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 364
495f47a5
DM
365 DEBUG_BUFFERS_r(
366 if ((int)PL_regsize > (int)parenfloor)
367 PerlIO_printf(Perl_debug_log,
368 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
369 PTR2UV(rex),
370 PTR2UV(rex->offs)
371 );
372 );
87c0511b 373 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 374/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
375 SSPUSHINT(rex->offs[p].end);
376 SSPUSHINT(rex->offs[p].start);
1ca2007e 377 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 378 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
379 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
380 (UV)p,
381 (IV)rex->offs[p].start,
382 (IV)rex->offs[p].start_tmp,
383 (IV)rex->offs[p].end
40a82448 384 ));
a0d0e21e 385 }
b1ce53c5 386/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 387 SSPUSHINT(PL_regsize);
b93070ed
DM
388 SSPUSHINT(rex->lastparen);
389 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 390 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 391
a0d0e21e
LW
392 return retval;
393}
394
c277df42 395/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
396#define REGCP_SET(cp) \
397 DEBUG_STATE_r( \
ab3bbdeb 398 PerlIO_printf(Perl_debug_log, \
e4f74956 399 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
400 (IV)PL_savestack_ix)); \
401 cp = PL_savestack_ix
c3464db5 402
ab3bbdeb 403#define REGCP_UNWIND(cp) \
e4f74956 404 DEBUG_STATE_r( \
ab3bbdeb 405 if (cp != PL_savestack_ix) \
e4f74956
YO
406 PerlIO_printf(Perl_debug_log, \
407 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
408 (IV)(cp), (IV)PL_savestack_ix)); \
409 regcpblow(cp)
c277df42 410
f067efbf 411STATIC void
b93070ed 412S_regcppop(pTHX_ regexp *rex)
a0d0e21e 413{
97aff369 414 dVAR;
e0fa7e2b 415 UV i;
87c0511b 416 U32 paren;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
87c0511b 439 paren = PL_regsize;
620d5b66 440 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 441 I32 tmps;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
87c0511b 455 paren--;
a0d0e21e 456 }
daf18116 457#if 1
dafc8851
JH
458 /* It would seem that the similar code in regtry()
459 * already takes care of this, and in fact it is in
460 * a better location to since this code can #if 0-ed out
461 * but the code in regtry() is needed or otherwise tests
462 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
463 * (as of patchlevel 7877) will fail. Then again,
464 * this code seems to be necessary or otherwise
225593e1
DM
465 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
466 * --jhi updated by dapm */
b93070ed 467 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 468 if (i > PL_regsize)
b93070ed
DM
469 rex->offs[i].start = -1;
470 rex->offs[i].end = -1;
495f47a5
DM
471 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
472 " \\%"UVuf": %s ..-1 undeffing\n",
473 (UV)i,
474 (i > PL_regsize) ? "-1" : " "
475 ));
a0d0e21e 476 }
dafc8851 477#endif
a0d0e21e
LW
478}
479
74088413
DM
480/* restore the parens and associated vars at savestack position ix,
481 * but without popping the stack */
482
483STATIC void
484S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485{
486 I32 tmpix = PL_savestack_ix;
487 PL_savestack_ix = ix;
488 regcppop(rex);
489 PL_savestack_ix = tmpix;
490}
491
02db2b7b 492#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 493
a687059c 494/*
e50aee73 495 * pregexec and friends
a687059c
LW
496 */
497
76234dfb 498#ifndef PERL_IN_XSUB_RE
a687059c 499/*
c277df42 500 - pregexec - match a regexp against a string
a687059c 501 */
c277df42 502I32
49d7dfbc 503Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 504 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
505/* strend: pointer to null at end of string */
506/* strbeg: real beginning of string */
507/* minend: end of match must be >=minend after stringarg. */
508/* nosave: For optimizations. */
509{
7918f24d
NC
510 PERL_ARGS_ASSERT_PREGEXEC;
511
c277df42 512 return
9041c2e3 513 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
514 nosave ? 0 : REXEC_COPY_STR);
515}
76234dfb 516#endif
22e551b9 517
9041c2e3 518/*
cad2e5aa
JH
519 * Need to implement the following flags for reg_anch:
520 *
521 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
522 * USE_INTUIT_ML
523 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
524 * INTUIT_AUTORITATIVE_ML
525 * INTUIT_ONCE_NOML - Intuit can match in one location only.
526 * INTUIT_ONCE_ML
527 *
528 * Another flag for this function: SECOND_TIME (so that float substrs
529 * with giant delta may be not rechecked).
530 */
531
532/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
533
3f7c398e 534/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
535 Otherwise, only SvCUR(sv) is used to get strbeg. */
536
537/* XXXX We assume that strpos is strbeg unless sv. */
538
6eb5f6b9
JH
539/* XXXX Some places assume that there is a fixed substring.
540 An update may be needed if optimizer marks as "INTUITable"
541 RExen without fixed substrings. Similarly, it is assumed that
542 lengths of all the strings are no more than minlen, thus they
543 cannot come from lookahead.
40d049e4
YO
544 (Or minlen should take into account lookahead.)
545 NOTE: Some of this comment is not correct. minlen does now take account
546 of lookahead/behind. Further research is required. -- demerphq
547
548*/
6eb5f6b9 549
2c2d71f5
JH
550/* A failure to find a constant substring means that there is no need to make
551 an expensive call to REx engine, thus we celebrate a failure. Similarly,
552 finding a substring too deep into the string means that less calls to
30944b6d
IZ
553 regtry() should be needed.
554
555 REx compiler's optimizer found 4 possible hints:
556 a) Anchored substring;
557 b) Fixed substring;
558 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 559 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 560 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
561 string which does not contradict any of them.
562 */
2c2d71f5 563
6eb5f6b9
JH
564/* Most of decisions we do here should have been done at compile time.
565 The nodes of the REx which we used for the search should have been
566 deleted from the finite automaton. */
567
cad2e5aa 568char *
288b8c02 569Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 570 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 571{
97aff369 572 dVAR;
288b8c02 573 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 574 register I32 start_shift = 0;
cad2e5aa 575 /* Should be nonnegative! */
b7953727 576 register I32 end_shift = 0;
2c2d71f5
JH
577 register char *s;
578 register SV *check;
a1933d95 579 char *strbeg;
cad2e5aa 580 char *t;
f2ed9b32 581 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 582 I32 ml_anch;
bd61b366
SS
583 register char *other_last = NULL; /* other substr checked before this */
584 char *check_at = NULL; /* check substr found at this pos */
d8080198 585 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 586 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 587 RXi_GET_DECL(prog,progi);
30944b6d 588#ifdef DEBUGGING
890ce7af 589 const char * const i_strpos = strpos;
30944b6d 590#endif
a3621e74
YO
591 GET_RE_DEBUG_FLAGS_DECL;
592
7918f24d 593 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
594 PERL_UNUSED_ARG(flags);
595 PERL_UNUSED_ARG(data);
7918f24d 596
f2ed9b32 597 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 598
3c8556c3 599 if (RX_UTF8(rx)) {
b8d68ded
JH
600 PL_reg_flags |= RF_utf8;
601 }
ab3bbdeb 602 DEBUG_EXECUTE_r(
f2ed9b32 603 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
604 sv ? "Guessing start of match in sv for"
605 : "Guessing start of match in string for");
2a782b5b 606 );
cad2e5aa 607
c344f387
JH
608 /* CHR_DIST() would be more correct here but it makes things slow. */
609 if (prog->minlen > strend - strpos) {
a3621e74 610 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 611 "String too short... [re_intuit_start]\n"));
cad2e5aa 612 goto fail;
2c2d71f5 613 }
1de06328 614
a1933d95 615 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 616 PL_regeol = strend;
f2ed9b32 617 if (utf8_target) {
33b8afdf
JH
618 if (!prog->check_utf8 && prog->check_substr)
619 to_utf8_substr(prog);
620 check = prog->check_utf8;
621 } else {
622 if (!prog->check_substr && prog->check_utf8)
623 to_byte_substr(prog);
624 check = prog->check_substr;
625 }
1de06328 626 if (check == &PL_sv_undef) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 628 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
629 goto fail;
630 }
bbe252da
YO
631 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
632 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
633 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 634 && !multiline ) ); /* Check after \n? */
cad2e5aa 635
7e25d62c 636 if (!ml_anch) {
bbe252da
YO
637 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
638 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 639 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
640 && sv && !SvROK(sv)
641 && (strpos != strbeg)) {
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
643 goto fail;
644 }
645 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 646 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 647 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
648 I32 slen;
649
1aa99e6b 650 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 651
653099ff
GS
652 if (SvTAIL(check)) {
653 slen = SvCUR(check); /* >= 1 */
cad2e5aa 654
9041c2e3 655 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 656 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 658 goto fail_finish;
cad2e5aa
JH
659 }
660 /* Now should match s[0..slen-2] */
661 slen--;
3f7c398e 662 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 663 || (slen > 1
3f7c398e 664 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 665 report_neq:
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
667 goto fail_finish;
668 }
cad2e5aa 669 }
3f7c398e 670 else if (*SvPVX_const(check) != *s
653099ff 671 || ((slen = SvCUR(check)) > 1
3f7c398e 672 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 673 goto report_neq;
c315bfe8 674 check_at = s;
2c2d71f5 675 goto success_at_start;
7e25d62c 676 }
cad2e5aa 677 }
2c2d71f5 678 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 679 s = strpos;
2c2d71f5 680 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
681 end_shift = prog->check_end_shift;
682
2c2d71f5 683 if (!ml_anch) {
a3b680e6 684 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 685 - (SvTAIL(check) != 0);
a3b680e6 686 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
687
688 if (end_shift < eshift)
689 end_shift = eshift;
690 }
cad2e5aa 691 }
2c2d71f5 692 else { /* Can match at random position */
cad2e5aa
JH
693 ml_anch = 0;
694 s = strpos;
1de06328
YO
695 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
696 end_shift = prog->check_end_shift;
697
698 /* end shift should be non negative here */
cad2e5aa
JH
699 }
700
bcdf7404 701#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 702 if (end_shift < 0)
1de06328 703 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 704 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
705#endif
706
2c2d71f5
JH
707 restart:
708 /* Find a possible match in the region s..strend by looking for
709 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
710
711 {
712 I32 srch_start_shift = start_shift;
713 I32 srch_end_shift = end_shift;
c33e64f0
FC
714 U8* start_point;
715 U8* end_point;
1de06328
YO
716 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
717 srch_end_shift -= ((strbeg - s) - srch_start_shift);
718 srch_start_shift = strbeg - s;
719 }
6bda09f9 720 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
721 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
722 (IV)prog->check_offset_min,
723 (IV)srch_start_shift,
724 (IV)srch_end_shift,
725 (IV)prog->check_end_shift);
726 });
727
bbe252da 728 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
729 start_point= (U8*)(s + srch_start_shift);
730 end_point= (U8*)(strend - srch_end_shift);
731 } else {
732 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
733 end_point= HOP3(strend, -srch_end_shift, strbeg);
734 }
6bda09f9 735 DEBUG_OPTIMISE_MORE_r({
56570a2c 736 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 737 (int)(end_point - start_point),
fc8cd66c 738 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
739 start_point);
740 });
741
742 s = fbm_instr( start_point, end_point,
7fba1cd6 743 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 744 }
cad2e5aa
JH
745 /* Update the count-of-usability, remove useless subpatterns,
746 unshift s. */
2c2d71f5 747
ab3bbdeb 748 DEBUG_EXECUTE_r({
f2ed9b32 749 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
750 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
751 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 752 (s ? "Found" : "Did not find"),
f2ed9b32 753 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
754 ? "anchored" : "floating"),
755 quoted,
756 RE_SV_TAIL(check),
757 (s ? " at offset " : "...\n") );
758 });
2c2d71f5
JH
759
760 if (!s)
761 goto fail_finish;
2c2d71f5 762 /* Finish the diagnostic message */
a3621e74 763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 764
1de06328
YO
765 /* XXX dmq: first branch is for positive lookbehind...
766 Our check string is offset from the beginning of the pattern.
767 So we need to do any stclass tests offset forward from that
768 point. I think. :-(
769 */
770
771
772
773 check_at=s;
774
775
2c2d71f5
JH
776 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
777 Start with the other substr.
778 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 779 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
780 *always* match. Probably should be marked during compile...
781 Probably it is right to do no SCREAM here...
782 */
783
f2ed9b32 784 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
785 : (prog->float_substr && prog->anchored_substr))
786 {
30944b6d 787 /* Take into account the "other" substring. */
2c2d71f5
JH
788 /* XXXX May be hopelessly wrong for UTF... */
789 if (!other_last)
6eb5f6b9 790 other_last = strpos;
f2ed9b32 791 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
792 do_other_anchored:
793 {
890ce7af
AL
794 char * const last = HOP3c(s, -start_shift, strbeg);
795 char *last1, *last2;
be8e71aa 796 char * const saved_s = s;
33b8afdf 797 SV* must;
2c2d71f5 798
2c2d71f5
JH
799 t = s - prog->check_offset_max;
800 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 801 && (!utf8_target
0ce71af7 802 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 803 && t > strpos)))
6f207bd3 804 NOOP;
2c2d71f5
JH
805 else
806 t = strpos;
1aa99e6b 807 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
808 if (t < other_last) /* These positions already checked */
809 t = other_last;
1aa99e6b 810 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
811 if (last < last1)
812 last1 = last;
1de06328
YO
813 /* XXXX It is not documented what units *_offsets are in.
814 We assume bytes, but this is clearly wrong.
815 Meaning this code needs to be carefully reviewed for errors.
816 dmq.
817 */
818
2c2d71f5 819 /* On end-of-str: see comment below. */
f2ed9b32 820 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
821 if (must == &PL_sv_undef) {
822 s = (char*)NULL;
1de06328 823 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
824 }
825 else
826 s = fbm_instr(
827 (unsigned char*)t,
828 HOP3(HOP3(last1, prog->anchored_offset, strend)
829 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
830 must,
7fba1cd6 831 multiline ? FBMrf_MULTILINE : 0
33b8afdf 832 );
ab3bbdeb 833 DEBUG_EXECUTE_r({
f2ed9b32 834 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
835 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
836 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 837 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
838 quoted, RE_SV_TAIL(must));
839 });
840
841
2c2d71f5
JH
842 if (!s) {
843 if (last1 >= last2) {
a3621e74 844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
845 ", giving up...\n"));
846 goto fail_finish;
847 }
a3621e74 848 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 849 ", trying floating at offset %ld...\n",
be8e71aa 850 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
851 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
852 s = HOP3c(last, 1, strend);
2c2d71f5
JH
853 goto restart;
854 }
855 else {
a3621e74 856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 857 (long)(s - i_strpos)));
1aa99e6b
IH
858 t = HOP3c(s, -prog->anchored_offset, strbeg);
859 other_last = HOP3c(s, 1, strend);
be8e71aa 860 s = saved_s;
2c2d71f5
JH
861 if (t == strpos)
862 goto try_at_start;
2c2d71f5
JH
863 goto try_at_offset;
864 }
30944b6d 865 }
2c2d71f5
JH
866 }
867 else { /* Take into account the floating substring. */
33b8afdf 868 char *last, *last1;
be8e71aa 869 char * const saved_s = s;
33b8afdf
JH
870 SV* must;
871
872 t = HOP3c(s, -start_shift, strbeg);
873 last1 = last =
874 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
875 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
876 last = HOP3c(t, prog->float_max_offset, strend);
877 s = HOP3c(t, prog->float_min_offset, strend);
878 if (s < other_last)
879 s = other_last;
2c2d71f5 880 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 881 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
882 /* fbm_instr() takes into account exact value of end-of-str
883 if the check is SvTAIL(ed). Since false positives are OK,
884 and end-of-str is not later than strend we are OK. */
885 if (must == &PL_sv_undef) {
886 s = (char*)NULL;
1de06328 887 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
888 }
889 else
2c2d71f5 890 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
891 (unsigned char*)last + SvCUR(must)
892 - (SvTAIL(must)!=0),
7fba1cd6 893 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 894 DEBUG_EXECUTE_r({
f2ed9b32 895 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
896 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
897 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 898 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
899 quoted, RE_SV_TAIL(must));
900 });
33b8afdf
JH
901 if (!s) {
902 if (last1 == last) {
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
904 ", giving up...\n"));
905 goto fail_finish;
2c2d71f5 906 }
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 908 ", trying anchored starting at offset %ld...\n",
be8e71aa 909 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
910 other_last = last;
911 s = HOP3c(t, 1, strend);
912 goto restart;
913 }
914 else {
a3621e74 915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
916 (long)(s - i_strpos)));
917 other_last = s; /* Fix this later. --Hugo */
be8e71aa 918 s = saved_s;
33b8afdf
JH
919 if (t == strpos)
920 goto try_at_start;
921 goto try_at_offset;
922 }
2c2d71f5 923 }
cad2e5aa 924 }
2c2d71f5 925
1de06328 926
9ef43ace 927 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 928
6bda09f9 929 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
930 PerlIO_printf(Perl_debug_log,
931 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
932 (IV)prog->check_offset_min,
933 (IV)prog->check_offset_max,
934 (IV)(s-strpos),
935 (IV)(t-strpos),
936 (IV)(t-s),
937 (IV)(strend-strpos)
938 )
939 );
940
2c2d71f5 941 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 942 && (!utf8_target
9ef43ace 943 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
944 && t > strpos)))
945 {
2c2d71f5
JH
946 /* Fixed substring is found far enough so that the match
947 cannot start at strpos. */
948 try_at_offset:
cad2e5aa 949 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
950 /* Eventually fbm_*() should handle this, but often
951 anchored_offset is not 0, so this check will not be wasted. */
952 /* XXXX In the code below we prefer to look for "^" even in
953 presence of anchored substrings. And we search even
954 beyond the found float position. These pessimizations
955 are historical artefacts only. */
956 find_anchor:
2c2d71f5 957 while (t < strend - prog->minlen) {
cad2e5aa 958 if (*t == '\n') {
4ee3650e 959 if (t < check_at - prog->check_offset_min) {
f2ed9b32 960 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
961 /* Since we moved from the found position,
962 we definitely contradict the found anchored
30944b6d
IZ
963 substr. Due to the above check we do not
964 contradict "check" substr.
965 Thus we can arrive here only if check substr
966 is float. Redo checking for "other"=="fixed".
967 */
9041c2e3 968 strpos = t + 1;
a3621e74 969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 970 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
971 goto do_other_anchored;
972 }
4ee3650e
GS
973 /* We don't contradict the found floating substring. */
974 /* XXXX Why not check for STCLASS? */
cad2e5aa 975 s = t + 1;
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 977 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
978 goto set_useful;
979 }
4ee3650e
GS
980 /* Position contradicts check-string */
981 /* XXXX probably better to look for check-string
982 than for "\n", so one should lower the limit for t? */
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 984 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 985 other_last = strpos = s = t + 1;
cad2e5aa
JH
986 goto restart;
987 }
988 t++;
989 }
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 991 PL_colors[0], PL_colors[1]));
2c2d71f5 992 goto fail_finish;
cad2e5aa 993 }
f5952150 994 else {
a3621e74 995 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 996 PL_colors[0], PL_colors[1]));
f5952150 997 }
cad2e5aa
JH
998 s = t;
999 set_useful:
f2ed9b32 1000 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1001 }
1002 else {
f5952150 1003 /* The found string does not prohibit matching at strpos,
2c2d71f5 1004 - no optimization of calling REx engine can be performed,
f5952150
GS
1005 unless it was an MBOL and we are not after MBOL,
1006 or a future STCLASS check will fail this. */
2c2d71f5
JH
1007 try_at_start:
1008 /* Even in this situation we may use MBOL flag if strpos is offset
1009 wrt the start of the string. */
05b4157f 1010 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1011 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1012 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1013 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1014 {
cad2e5aa
JH
1015 t = strpos;
1016 goto find_anchor;
1017 }
a3621e74 1018 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1019 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1020 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1021 );
2c2d71f5 1022 success_at_start:
bbe252da 1023 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1024 && (utf8_target ? (
33b8afdf
JH
1025 prog->check_utf8 /* Could be deleted already */
1026 && --BmUSEFUL(prog->check_utf8) < 0
1027 && (prog->check_utf8 == prog->float_utf8)
1028 ) : (
1029 prog->check_substr /* Could be deleted already */
1030 && --BmUSEFUL(prog->check_substr) < 0
1031 && (prog->check_substr == prog->float_substr)
1032 )))
66e933ab 1033 {
cad2e5aa 1034 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1035 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1036 /* XXX Does the destruction order has to change with utf8_target? */
1037 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1038 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1039 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1040 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1041 check = NULL; /* abort */
cad2e5aa 1042 s = strpos;
486ec47a 1043 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1044 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1045 if (prog->intflags & PREGf_IMPLICIT)
1046 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1047 /* XXXX This is a remnant of the old implementation. It
1048 looks wasteful, since now INTUIT can use many
6eb5f6b9 1049 other heuristics. */
bbe252da 1050 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1051 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1052 }
1053 else
1054 s = strpos;
1055 }
1056
6eb5f6b9
JH
1057 /* Last resort... */
1058 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1059 /* trie stclasses are too expensive to use here, we are better off to
1060 leave it to regmatch itself */
f8fc2ecf 1061 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1062 /* minlen == 0 is possible if regstclass is \b or \B,
1063 and the fixed substr is ''$.
1064 Since minlen is already taken into account, s+1 is before strend;
1065 accidentally, minlen >= 1 guaranties no false positives at s + 1
1066 even for \b or \B. But (minlen? 1 : 0) below assumes that
1067 regstclass does not come from lookahead... */
1068 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1069 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1070 const U8* const str = (U8*)STRING(progi->regstclass);
1071 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1072 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1073 : 1);
1de06328
YO
1074 char * endpos;
1075 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1076 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1077 else if (prog->float_substr || prog->float_utf8)
1078 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1079 else
1080 endpos= strend;
1081
d8080198
YO
1082 if (checked_upto < s)
1083 checked_upto = s;
1084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1085 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1086
6eb5f6b9 1087 t = s;
d8080198
YO
1088 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1089 if (s) {
1090 checked_upto = s;
1091 } else {
6eb5f6b9 1092#ifdef DEBUGGING
cbbf8932 1093 const char *what = NULL;
6eb5f6b9
JH
1094#endif
1095 if (endpos == strend) {
a3621e74 1096 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1097 "Could not match STCLASS...\n") );
1098 goto fail;
1099 }
a3621e74 1100 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1101 "This position contradicts STCLASS...\n") );
bbe252da 1102 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1103 goto fail;
d8080198
YO
1104 checked_upto = HOPBACKc(endpos, start_shift);
1105 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1106 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1107 /* Contradict one of substrings */
33b8afdf 1108 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1109 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1110 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1111 hop_and_restart:
1aa99e6b 1112 s = HOP3c(t, 1, strend);
66e933ab
GS
1113 if (s + start_shift + end_shift > strend) {
1114 /* XXXX Should be taken into account earlier? */
a3621e74 1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1116 "Could not match STCLASS...\n") );
1117 goto fail;
1118 }
5e39e1e5
HS
1119 if (!check)
1120 goto giveup;
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1122 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1123 what, (long)(s + start_shift - i_strpos)) );
1124 goto restart;
1125 }
66e933ab 1126 /* Have both, check_string is floating */
6eb5f6b9
JH
1127 if (t + start_shift >= check_at) /* Contradicts floating=check */
1128 goto retry_floating_check;
1129 /* Recheck anchored substring, but not floating... */
9041c2e3 1130 s = check_at;
5e39e1e5
HS
1131 if (!check)
1132 goto giveup;
a3621e74 1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1134 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1135 (long)(other_last - i_strpos)) );
1136 goto do_other_anchored;
1137 }
60e71179
GS
1138 /* Another way we could have checked stclass at the
1139 current position only: */
1140 if (ml_anch) {
1141 s = t = t + 1;
5e39e1e5
HS
1142 if (!check)
1143 goto giveup;
a3621e74 1144 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1145 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1146 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1147 goto try_at_offset;
66e933ab 1148 }
f2ed9b32 1149 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1150 goto fail;
486ec47a 1151 /* Check is floating substring. */
6eb5f6b9
JH
1152 retry_floating_check:
1153 t = check_at - start_shift;
a3621e74 1154 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1155 goto hop_and_restart;
1156 }
b7953727 1157 if (t != s) {
a3621e74 1158 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1159 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1160 (long)(t - i_strpos), (long)(s - i_strpos))
1161 );
1162 }
1163 else {
a3621e74 1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1165 "Does not contradict STCLASS...\n");
1166 );
1167 }
6eb5f6b9 1168 }
5e39e1e5 1169 giveup:
a3621e74 1170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1171 PL_colors[4], (check ? "Guessed" : "Giving up"),
1172 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1173 return s;
2c2d71f5
JH
1174
1175 fail_finish: /* Substring not found */
33b8afdf 1176 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1177 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1178 fail:
a3621e74 1179 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1180 PL_colors[4], PL_colors[5]));
bd61b366 1181 return NULL;
cad2e5aa 1182}
9661b544 1183
a0a388a1
YO
1184#define DECL_TRIE_TYPE(scan) \
1185 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1186 trie_type = ((scan->flags == EXACT) \
1187 ? (utf8_target ? trie_utf8 : trie_plain) \
1188 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1189
1190#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1191uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1192 STRLEN skiplen; \
1193 switch (trie_type) { \
1194 case trie_utf8_fold: \
1195 if ( foldlen>0 ) { \
1196 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1197 foldlen -= len; \
1198 uscan += len; \
1199 len=0; \
1200 } else { \
1201 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1202 len = UTF8SKIP(uc); \
1203 skiplen = UNISKIP( uvc ); \
1204 foldlen -= skiplen; \
1205 uscan = foldbuf + skiplen; \
1206 } \
1207 break; \
1208 case trie_latin_utf8_fold: \
1209 if ( foldlen>0 ) { \
1210 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1211 foldlen -= len; \
1212 uscan += len; \
1213 len=0; \
1214 } else { \
1215 len = 1; \
1216 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1217 skiplen = UNISKIP( uvc ); \
1218 foldlen -= skiplen; \
1219 uscan = foldbuf + skiplen; \
1220 } \
1221 break; \
1222 case trie_utf8: \
1223 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1224 break; \
1225 case trie_plain: \
1226 uvc = (UV)*uc; \
1227 len = 1; \
1228 } \
1229 if (uvc < 256) { \
1230 charid = trie->charmap[ uvc ]; \
1231 } \
1232 else { \
1233 charid = 0; \
1234 if (widecharmap) { \
1235 SV** const svpp = hv_fetch(widecharmap, \
1236 (char*)&uvc, sizeof(UV), 0); \
1237 if (svpp) \
1238 charid = (U16)SvIV(*svpp); \
1239 } \
1240 } \
4cadc6a9
YO
1241} STMT_END
1242
4cadc6a9
YO
1243#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1244STMT_START { \
1245 while (s <= e) { \
1246 if ( (CoNd) \
fac1af77 1247 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1248 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1249 goto got_it; \
1250 s++; \
1251 } \
1252} STMT_END
1253
1254#define REXEC_FBC_UTF8_SCAN(CoDe) \
1255STMT_START { \
1256 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1257 CoDe \
1258 s += uskip; \
1259 } \
1260} STMT_END
1261
1262#define REXEC_FBC_SCAN(CoDe) \
1263STMT_START { \
1264 while (s < strend) { \
1265 CoDe \
1266 s++; \
1267 } \
1268} STMT_END
1269
1270#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1271REXEC_FBC_UTF8_SCAN( \
1272 if (CoNd) { \
24b23f37 1273 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it; \
1275 else \
1276 tmp = doevery; \
1277 } \
1278 else \
1279 tmp = 1; \
1280)
1281
1282#define REXEC_FBC_CLASS_SCAN(CoNd) \
1283REXEC_FBC_SCAN( \
1284 if (CoNd) { \
24b23f37 1285 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1286 goto got_it; \
1287 else \
1288 tmp = doevery; \
1289 } \
1290 else \
1291 tmp = 1; \
1292)
1293
1294#define REXEC_FBC_TRYIT \
24b23f37 1295if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1296 goto got_it
1297
e1d1eefb 1298#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1299 if (utf8_target) { \
e1d1eefb
YO
1300 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1301 } \
1302 else { \
1303 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1304 }
e1d1eefb 1305
4cadc6a9 1306#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1307 if (utf8_target) { \
4cadc6a9
YO
1308 UtFpReLoAd; \
1309 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1310 } \
1311 else { \
1312 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1313 }
4cadc6a9
YO
1314
1315#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1316 PL_reg_flags |= RF_tainted; \
f2ed9b32 1317 if (utf8_target) { \
4cadc6a9
YO
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 } \
1320 else { \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1322 }
4cadc6a9 1323
786e8c11
YO
1324#define DUMP_EXEC_POS(li,s,doutf8) \
1325 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
cfaf538b
KW
1327
1328#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1330 tmp = TEST_NON_UTF8(tmp); \
1331 REXEC_FBC_UTF8_SCAN( \
1332 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333 tmp = !tmp; \
1334 IF_SUCCESS; \
1335 } \
1336 else { \
1337 IF_FAIL; \
1338 } \
1339 ); \
1340
1341#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342 if (s == PL_bostr) { \
1343 tmp = '\n'; \
1344 } \
1345 else { \
1346 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1347 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1348 } \
1349 tmp = TeSt1_UtF8; \
1350 LOAD_UTF8_CHARCLASS_ALNUM(); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! (TeSt2_UtF8)) { \
1353 tmp = !tmp; \
1354 IF_SUCCESS; \
1355 } \
1356 else { \
1357 IF_FAIL; \
1358 } \
1359 ); \
1360
63ac0dad
KW
1361/* The only difference between the BOUND and NBOUND cases is that
1362 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363 * NBOUND. This is accomplished by passing it in either the if or else clause,
1364 * with the other one being empty */
1365#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1366 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1367
1368#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1369 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1370
1371#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1372 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1373
1374#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1375 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1376
63ac0dad
KW
1377
1378/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1379 * be passed in completely with the variable name being tested, which isn't
1380 * such a clean interface, but this is easier to read than it was before. We
1381 * are looking for the boundary (or non-boundary between a word and non-word
1382 * character. The utf8 and non-utf8 cases have the same logic, but the details
1383 * must be different. Find the "wordness" of the character just prior to this
1384 * one, and compare it with the wordness of this one. If they differ, we have
1385 * a boundary. At the beginning of the string, pretend that the previous
1386 * character was a new-line */
cfaf538b 1387#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1388 if (utf8_target) { \
cfaf538b 1389 UTF8_CODE \
63ac0dad
KW
1390 } \
1391 else { /* Not utf8 */ \
1392 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1393 tmp = TEST_NON_UTF8(tmp); \
1394 REXEC_FBC_SCAN( \
1395 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1396 tmp = !tmp; \
1397 IF_SUCCESS; \
1398 } \
1399 else { \
1400 IF_FAIL; \
1401 } \
1402 ); \
1403 } \
1404 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1405 goto got_it;
1406
786e8c11
YO
1407/* We know what class REx starts with. Try to find this position... */
1408/* if reginfo is NULL, its a dryrun */
1409/* annoyingly all the vars in this routine have different names from their counterparts
1410 in regmatch. /grrr */
1411
3c3eec57 1412STATIC char *
07be1b83 1413S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1414 const char *strend, regmatch_info *reginfo)
a687059c 1415{
27da23d5 1416 dVAR;
bbe252da 1417 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1418 char *pat_string; /* The pattern's exactish string */
1419 char *pat_end; /* ptr to end char of pat_string */
1420 re_fold_t folder; /* Function for computing non-utf8 folds */
1421 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1422 STRLEN ln;
5dab1207 1423 STRLEN lnc;
078c425b 1424 register STRLEN uskip;
fac1af77
KW
1425 U8 c1;
1426 U8 c2;
6eb5f6b9
JH
1427 char *e;
1428 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1429 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1430 UV utf8_fold_flags = 0;
f8fc2ecf 1431 RXi_GET_DECL(prog,progi);
7918f24d
NC
1432
1433 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1434
6eb5f6b9
JH
1435 /* We know what class it must start with. */
1436 switch (OP(c)) {
f56b6394 1437 case ANYOFV:
6eb5f6b9 1438 case ANYOF:
f56b6394 1439 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1440 STRLEN inclasslen = strend - s;
1441 REXEC_FBC_UTF8_CLASS_SCAN(
1442 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1443 }
1444 else {
6ef69d56 1445 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1446 }
6eb5f6b9 1447 break;
f33976b4 1448 case CANY:
4cadc6a9 1449 REXEC_FBC_SCAN(
24b23f37 1450 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1451 goto got_it;
1452 else
1453 tmp = doevery;
4cadc6a9 1454 );
f33976b4 1455 break;
2f7f8cb1
KW
1456
1457 case EXACTFA:
1458 if (UTF_PATTERN || utf8_target) {
1459 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1460 goto do_exactf_utf8;
1461 }
1462 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1463 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1464 goto do_exactf_non_utf8; /* isn't dealt with by these */
1465
6eb5f6b9 1466 case EXACTF:
62bf7766 1467 if (utf8_target) {
77a6d856
KW
1468
1469 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1470 utf8_fold_flags = 0;
fac1af77
KW
1471 goto do_exactf_utf8;
1472 }
1473 fold_array = PL_fold;
1474 folder = foldEQ;
1475 goto do_exactf_non_utf8;
1476
1477 case EXACTFL:
1478 if (UTF_PATTERN || utf8_target) {
17580e7a 1479 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1480 goto do_exactf_utf8;
1481 }
1482 fold_array = PL_fold_locale;
1483 folder = foldEQ_locale;
16d951b7
KW
1484 goto do_exactf_non_utf8;
1485
3c760661
KW
1486 case EXACTFU_SS:
1487 if (UTF_PATTERN) {
1488 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1489 }
1490 goto do_exactf_utf8;
1491
fab2782b 1492 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1493 case EXACTFU:
1494 if (UTF_PATTERN || utf8_target) {
77a6d856 1495 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1496 goto do_exactf_utf8;
1497 }
1498
1499 /* Any 'ss' in the pattern should have been replaced by regcomp,
1500 * so we don't have to worry here about this single special case
1501 * in the Latin1 range */
1502 fold_array = PL_fold_latin1;
1503 folder = foldEQ_latin1;
fac1af77
KW
1504
1505 /* FALL THROUGH */
1506
62bf7766
KW
1507 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1508 are no glitches with fold-length differences
1509 between the target string and pattern */
fac1af77
KW
1510
1511 /* The idea in the non-utf8 EXACTF* cases is to first find the
1512 * first character of the EXACTF* node and then, if necessary,
1513 * case-insensitively compare the full text of the node. c1 is the
1514 * first character. c2 is its fold. This logic will not work for
1515 * Unicode semantics and the german sharp ss, which hence should
1516 * not be compiled into a node that gets here. */
1517 pat_string = STRING(c);
1518 ln = STR_LEN(c); /* length to match in octets/bytes */
1519
8a90a8fe
KW
1520 /* We know that we have to match at least 'ln' bytes (which is the
1521 * same as characters, since not utf8). If we have to match 3
1522 * characters, and there are only 2 availabe, we know without
1523 * trying that it will fail; so don't start a match past the
1524 * required minimum number from the far end */
fac1af77
KW
1525 e = HOP3c(strend, -((I32)ln), s);
1526
1527 if (!reginfo && e < s) {
1528 e = s; /* Due to minlen logic of intuit() */
1529 }
1530
1531 c1 = *pat_string;
1532 c2 = fold_array[c1];
1533 if (c1 == c2) { /* If char and fold are the same */
1534 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1535 }
1536 else {
1537 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1538 }
1539 break;
1540
1541 do_exactf_utf8:
e067297c
KW
1542 {
1543 unsigned expansion;
1544
fac1af77
KW
1545
1546 /* If one of the operands is in utf8, we can't use the simpler
1547 * folding above, due to the fact that many different characters
1548 * can have the same fold, or portion of a fold, or different-
1549 * length fold */
1550 pat_string = STRING(c);
1551 ln = STR_LEN(c); /* length to match in octets/bytes */
1552 pat_end = pat_string + ln;
1553 lnc = (UTF_PATTERN) /* length to match in characters */
1554 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1555 : ln;
1556
e067297c
KW
1557 /* We have 'lnc' characters to match in the pattern, but because of
1558 * multi-character folding, each character in the target can match
1559 * up to 3 characters (Unicode guarantees it will never exceed
1560 * this) if it is utf8-encoded; and up to 2 if not (based on the
1561 * fact that the Latin 1 folds are already determined, and the
1562 * only multi-char fold in that range is the sharp-s folding to
1563 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1564 * string character. Adjust lnc accordingly, rounding up, so that
1565 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1566 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1567 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1568
1569 /* As in the non-UTF8 case, if we have to match 3 characters, and
1570 * only 2 are left, it's guaranteed to fail, so don't start a
1571 * match that would require us to go beyond the end of the string
1572 */
1573 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1574
1575 if (!reginfo && e < s) {
1576 e = s; /* Due to minlen logic of intuit() */
1577 }
1578
b33105db
KW
1579 /* XXX Note that we could recalculate e to stop the loop earlier,
1580 * as the worst case expansion above will rarely be met, and as we
1581 * go along we would usually find that e moves further to the left.
1582 * This would happen only after we reached the point in the loop
1583 * where if there were no expansion we should fail. Unclear if
1584 * worth the expense */
e067297c 1585
fac1af77
KW
1586 while (s <= e) {
1587 char *my_strend= (char *)strend;
1588 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1589 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1590 && (!reginfo || regtry(reginfo, &s)) )
1591 {
1592 goto got_it;
1593 }
bbdd8bad 1594 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1595 }
1596 break;
e067297c 1597 }
bbce6d69 1598 case BOUNDL:
3280af22 1599 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1600 FBC_BOUND(isALNUM_LC,
1601 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1602 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1603 break;
bbce6d69 1604 case NBOUNDL:
3280af22 1605 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1606 FBC_NBOUND(isALNUM_LC,
1607 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1608 isALNUM_LC_utf8((U8*)s));
1609 break;
1610 case BOUND:
1611 FBC_BOUND(isWORDCHAR,
1612 isALNUM_uni(tmp),
1613 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614 break;
cfaf538b
KW
1615 case BOUNDA:
1616 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1617 isWORDCHAR_A(tmp),
1618 isWORDCHAR_A((U8*)s));
1619 break;
a0d0e21e 1620 case NBOUND:
63ac0dad
KW
1621 FBC_NBOUND(isWORDCHAR,
1622 isALNUM_uni(tmp),
1623 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1624 break;
cfaf538b
KW
1625 case NBOUNDA:
1626 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1627 isWORDCHAR_A(tmp),
1628 isWORDCHAR_A((U8*)s));
1629 break;
63ac0dad
KW
1630 case BOUNDU:
1631 FBC_BOUND(isWORDCHAR_L1,
1632 isALNUM_uni(tmp),
1633 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1634 break;
1635 case NBOUNDU:
1636 FBC_NBOUND(isWORDCHAR_L1,
1637 isALNUM_uni(tmp),
1638 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1639 break;
bbce6d69 1640 case ALNUML:
4cadc6a9
YO
1641 REXEC_FBC_CSCAN_TAINT(
1642 isALNUM_LC_utf8((U8*)s),
1643 isALNUM_LC(*s)
1644 );
6895a8aa 1645 break;
980866de
KW
1646 case ALNUMU:
1647 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1648 LOAD_UTF8_CHARCLASS_ALNUM(),
1649 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1650 isWORDCHAR_L1((U8) *s)
1651 );
6895a8aa 1652 break;
980866de
KW
1653 case ALNUM:
1654 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1655 LOAD_UTF8_CHARCLASS_ALNUM(),
1656 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1657 isWORDCHAR((U8) *s)
1658 );
6895a8aa 1659 break;
cfaf538b 1660 case ALNUMA:
8e9da4d4
KW
1661 /* Don't need to worry about utf8, as it can match only a single
1662 * byte invariant character */
cfaf538b 1663 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1664 break;
980866de
KW
1665 case NALNUMU:
1666 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1667 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1668 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1669 ! isWORDCHAR_L1((U8) *s)
1670 );
6895a8aa 1671 break;
a0d0e21e 1672 case NALNUM:
4cadc6a9 1673 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1674 LOAD_UTF8_CHARCLASS_ALNUM(),
1675 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1676 ! isALNUM(*s)
4cadc6a9 1677 );
6895a8aa 1678 break;
cfaf538b 1679 case NALNUMA:
8e9da4d4
KW
1680 REXEC_FBC_CSCAN(
1681 !isWORDCHAR_A(*s),
1682 !isWORDCHAR_A(*s)
1683 );
1684 break;
bbce6d69 1685 case NALNUML:
4cadc6a9
YO
1686 REXEC_FBC_CSCAN_TAINT(
1687 !isALNUM_LC_utf8((U8*)s),
1688 !isALNUM_LC(*s)
1689 );
6895a8aa 1690 break;
980866de
KW
1691 case SPACEU:
1692 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1693 LOAD_UTF8_CHARCLASS_SPACE(),
1694 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1695 isSPACE_L1((U8) *s)
1696 );
6895a8aa 1697 break;
a0d0e21e 1698 case SPACE:
4cadc6a9 1699 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1700 LOAD_UTF8_CHARCLASS_SPACE(),
1701 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1702 isSPACE((U8) *s)
4cadc6a9 1703 );
6895a8aa 1704 break;
cfaf538b 1705 case SPACEA:
8e9da4d4
KW
1706 /* Don't need to worry about utf8, as it can match only a single
1707 * byte invariant character */
cfaf538b 1708 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1709 break;
bbce6d69 1710 case SPACEL:
4cadc6a9 1711 REXEC_FBC_CSCAN_TAINT(
6bbba904 1712 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1713 isSPACE_LC(*s)
1714 );
6895a8aa 1715 break;
980866de
KW
1716 case NSPACEU:
1717 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1718 LOAD_UTF8_CHARCLASS_SPACE(),
1719 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1720 ! isSPACE_L1((U8) *s)
1721 );
6895a8aa 1722 break;
a0d0e21e 1723 case NSPACE:
4cadc6a9 1724 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1725 LOAD_UTF8_CHARCLASS_SPACE(),
1726 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1727 ! isSPACE((U8) *s)
4cadc6a9 1728 );
6895a8aa 1729 break;
cfaf538b 1730 case NSPACEA:
8e9da4d4
KW
1731 REXEC_FBC_CSCAN(
1732 !isSPACE_A(*s),
1733 !isSPACE_A(*s)
1734 );
1735 break;
bbce6d69 1736 case NSPACEL:
4cadc6a9 1737 REXEC_FBC_CSCAN_TAINT(
6bbba904 1738 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1739 !isSPACE_LC(*s)
1740 );
6895a8aa 1741 break;
a0d0e21e 1742 case DIGIT:
4cadc6a9 1743 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1744 LOAD_UTF8_CHARCLASS_DIGIT(),
1745 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1746 isDIGIT(*s)
1747 );
6895a8aa 1748 break;
cfaf538b 1749 case DIGITA:
8e9da4d4
KW
1750 /* Don't need to worry about utf8, as it can match only a single
1751 * byte invariant character */
cfaf538b 1752 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1753 break;
b8c5462f 1754 case DIGITL:
4cadc6a9
YO
1755 REXEC_FBC_CSCAN_TAINT(
1756 isDIGIT_LC_utf8((U8*)s),
1757 isDIGIT_LC(*s)
1758 );
6895a8aa 1759 break;
a0d0e21e 1760 case NDIGIT:
4cadc6a9 1761 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1762 LOAD_UTF8_CHARCLASS_DIGIT(),
1763 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1764 !isDIGIT(*s)
1765 );
6895a8aa 1766 break;
cfaf538b 1767 case NDIGITA:
8e9da4d4
KW
1768 REXEC_FBC_CSCAN(
1769 !isDIGIT_A(*s),
1770 !isDIGIT_A(*s)
1771 );
1772 break;
b8c5462f 1773 case NDIGITL:
4cadc6a9
YO
1774 REXEC_FBC_CSCAN_TAINT(
1775 !isDIGIT_LC_utf8((U8*)s),
1776 !isDIGIT_LC(*s)
1777 );
6895a8aa 1778 break;
e1d1eefb
YO
1779 case LNBREAK:
1780 REXEC_FBC_CSCAN(
1781 is_LNBREAK_utf8(s),
1782 is_LNBREAK_latin1(s)
1783 );
6895a8aa 1784 break;
e1d1eefb
YO
1785 case VERTWS:
1786 REXEC_FBC_CSCAN(
1787 is_VERTWS_utf8(s),
1788 is_VERTWS_latin1(s)
1789 );
6895a8aa 1790 break;
e1d1eefb
YO
1791 case NVERTWS:
1792 REXEC_FBC_CSCAN(
1793 !is_VERTWS_utf8(s),
1794 !is_VERTWS_latin1(s)
1795 );
6895a8aa 1796 break;
e1d1eefb
YO
1797 case HORIZWS:
1798 REXEC_FBC_CSCAN(
1799 is_HORIZWS_utf8(s),
1800 is_HORIZWS_latin1(s)
1801 );
6895a8aa 1802 break;
e1d1eefb
YO
1803 case NHORIZWS:
1804 REXEC_FBC_CSCAN(
1805 !is_HORIZWS_utf8(s),
1806 !is_HORIZWS_latin1(s)
1807 );
6895a8aa 1808 break;
1de06328
YO
1809 case AHOCORASICKC:
1810 case AHOCORASICK:
07be1b83 1811 {
a0a388a1 1812 DECL_TRIE_TYPE(c);
07be1b83
YO
1813 /* what trie are we using right now */
1814 reg_ac_data *aho
f8fc2ecf 1815 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1816 reg_trie_data *trie
1817 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1818 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1819
1820 const char *last_start = strend - trie->minlen;
6148ee25 1821#ifdef DEBUGGING
07be1b83 1822 const char *real_start = s;
6148ee25 1823#endif
07be1b83 1824 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1825 SV *sv_points;
1826 U8 **points; /* map of where we were in the input string
786e8c11 1827 when reading a given char. For ASCII this
be8e71aa 1828 is unnecessary overhead as the relationship
38a44b82
NC
1829 is always 1:1, but for Unicode, especially
1830 case folded Unicode this is not true. */
f9e705e8 1831 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1832 U8 *bitmap=NULL;
1833
07be1b83
YO
1834
1835 GET_RE_DEBUG_FLAGS_DECL;
1836
be8e71aa
YO
1837 /* We can't just allocate points here. We need to wrap it in
1838 * an SV so it gets freed properly if there is a croak while
1839 * running the match */
1840 ENTER;
1841 SAVETMPS;
1842 sv_points=newSV(maxlen * sizeof(U8 *));
1843 SvCUR_set(sv_points,
1844 maxlen * sizeof(U8 *));
1845 SvPOK_on(sv_points);
1846 sv_2mortal(sv_points);
1847 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1848 if ( trie_type != trie_utf8_fold
1849 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1850 {
786e8c11
YO
1851 if (trie->bitmap)
1852 bitmap=(U8*)trie->bitmap;
1853 else
1854 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1855 }
786e8c11
YO
1856 /* this is the Aho-Corasick algorithm modified a touch
1857 to include special handling for long "unknown char"
1858 sequences. The basic idea being that we use AC as long
1859 as we are dealing with a possible matching char, when
1860 we encounter an unknown char (and we have not encountered
1861 an accepting state) we scan forward until we find a legal
1862 starting char.
1863 AC matching is basically that of trie matching, except
1864 that when we encounter a failing transition, we fall back
1865 to the current states "fail state", and try the current char
1866 again, a process we repeat until we reach the root state,
1867 state 1, or a legal transition. If we fail on the root state
1868 then we can either terminate if we have reached an accepting
1869 state previously, or restart the entire process from the beginning
1870 if we have not.
1871
1872 */
07be1b83
YO
1873 while (s <= last_start) {
1874 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1875 U8 *uc = (U8*)s;
1876 U16 charid = 0;
1877 U32 base = 1;
1878 U32 state = 1;
1879 UV uvc = 0;
1880 STRLEN len = 0;
1881 STRLEN foldlen = 0;
1882 U8 *uscan = (U8*)NULL;
1883 U8 *leftmost = NULL;
786e8c11
YO
1884#ifdef DEBUGGING
1885 U32 accepted_word= 0;
1886#endif
07be1b83
YO
1887 U32 pointpos = 0;
1888
1889 while ( state && uc <= (U8*)strend ) {
1890 int failed=0;
786e8c11
YO
1891 U32 word = aho->states[ state ].wordnum;
1892
1de06328
YO
1893 if( state==1 ) {
1894 if ( bitmap ) {
1895 DEBUG_TRIE_EXECUTE_r(
1896 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1897 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1898 (char *)uc, utf8_target );
1de06328
YO
1899 PerlIO_printf( Perl_debug_log,
1900 " Scanning for legal start char...\n");
1901 }
d085b490
YO
1902 );
1903 if (utf8_target) {
1904 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1905 uc += UTF8SKIP(uc);
1906 }
1907 } else {
1908 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1909 uc++;
1910 }
1911 }
1de06328 1912 s= (char *)uc;
786e8c11 1913 }
786e8c11
YO
1914 if (uc >(U8*)last_start) break;
1915 }
1916
1917 if ( word ) {
2e64971a 1918 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1919 if (!leftmost || lpos < leftmost) {
1920 DEBUG_r(accepted_word=word);
07be1b83 1921 leftmost= lpos;
786e8c11 1922 }
07be1b83 1923 if (base==0) break;
786e8c11 1924
07be1b83
YO
1925 }
1926 points[pointpos++ % maxlen]= uc;
55eed653
NC
1927 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1928 uscan, len, uvc, charid, foldlen,
1929 foldbuf, uniflags);
786e8c11
YO
1930 DEBUG_TRIE_EXECUTE_r({
1931 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1932 s, utf8_target );
07be1b83 1933 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1934 " Charid:%3u CP:%4"UVxf" ",
1935 charid, uvc);
1936 });
07be1b83
YO
1937
1938 do {
6148ee25 1939#ifdef DEBUGGING
786e8c11 1940 word = aho->states[ state ].wordnum;
6148ee25 1941#endif
07be1b83
YO
1942 base = aho->states[ state ].trans.base;
1943
786e8c11
YO
1944 DEBUG_TRIE_EXECUTE_r({
1945 if (failed)
1946 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1947 s, utf8_target );
07be1b83 1948 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1949 "%sState: %4"UVxf", word=%"UVxf,
1950 failed ? " Fail transition to " : "",
1951 (UV)state, (UV)word);
1952 });
07be1b83
YO
1953 if ( base ) {
1954 U32 tmp;
6dd2be57 1955 I32 offset;
07be1b83 1956 if (charid &&
6dd2be57
DM
1957 ( ((offset = base + charid
1958 - 1 - trie->uniquecharcount)) >= 0)
1959 && ((U32)offset < trie->lasttrans)
1960 && trie->trans[offset].check == state
1961 && (tmp=trie->trans[offset].next))
07be1b83 1962 {
786e8c11
YO
1963 DEBUG_TRIE_EXECUTE_r(
1964 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1965 state = tmp;
1966 break;
1967 }
1968 else {
786e8c11
YO
1969 DEBUG_TRIE_EXECUTE_r(
1970 PerlIO_printf( Perl_debug_log," - fail\n"));
1971 failed = 1;
1972 state = aho->fail[state];
07be1b83
YO
1973 }
1974 }
1975 else {
1976 /* we must be accepting here */
786e8c11
YO
1977 DEBUG_TRIE_EXECUTE_r(
1978 PerlIO_printf( Perl_debug_log," - accepting\n"));
1979 failed = 1;
07be1b83
YO
1980 break;
1981 }
1982 } while(state);
786e8c11 1983 uc += len;
07be1b83
YO
1984 if (failed) {
1985 if (leftmost)
1986 break;
786e8c11 1987 if (!state) state = 1;
07be1b83
YO
1988 }
1989 }
1990 if ( aho->states[ state ].wordnum ) {
2e64971a 1991 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1992 if (!leftmost || lpos < leftmost) {
1993 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1994 leftmost = lpos;
786e8c11 1995 }
07be1b83 1996 }
07be1b83
YO
1997 if (leftmost) {
1998 s = (char*)leftmost;
786e8c11
YO
1999 DEBUG_TRIE_EXECUTE_r({
2000 PerlIO_printf(
70685ca0
JH
2001 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2002 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2003 );
2004 });
24b23f37 2005 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2006 FREETMPS;
2007 LEAVE;
07be1b83 2008 goto got_it;
be8e71aa 2009 }
07be1b83 2010 s = HOPc(s,1);
786e8c11
YO
2011 DEBUG_TRIE_EXECUTE_r({
2012 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2013 });
07be1b83 2014 } else {
786e8c11
YO
2015 DEBUG_TRIE_EXECUTE_r(
2016 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2017 break;
2018 }
2019 }
be8e71aa
YO
2020 FREETMPS;
2021 LEAVE;
07be1b83
YO
2022 }
2023 break;
b3c9acc1 2024 default:
3c3eec57
GS
2025 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2026 break;
d6a28714 2027 }
6eb5f6b9
JH
2028 return 0;
2029 got_it:
2030 return s;
2031}
2032
fae667d5 2033
6eb5f6b9
JH
2034/*
2035 - regexec_flags - match a regexp against a string
2036 */
2037I32
288b8c02 2038Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2039 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2040/* strend: pointer to null at end of string */
2041/* strbeg: real beginning of string */
2042/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2043/* data: May be used for some additional optimizations.
2044 Currently its only used, with a U32 cast, for transmitting
2045 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2046/* nosave: For optimizations. */
2047{
97aff369 2048 dVAR;
288b8c02 2049 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2050 /*register*/ char *s;
6eb5f6b9 2051 register regnode *c;
24b23f37 2052 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2053 I32 minlen; /* must match at least this many chars */
2054 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2055 I32 end_shift = 0; /* Same for the end. */ /* CC */
2056 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2057 char *scream_olds = NULL;
f2ed9b32 2058 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2059 I32 multiline;
f8fc2ecf 2060 RXi_GET_DECL(prog,progi);
3b0527fe 2061 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2062 regexp_paren_pair *swap = NULL;
a3621e74
YO
2063 GET_RE_DEBUG_FLAGS_DECL;
2064
7918f24d 2065 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2066 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2067
2068 /* Be paranoid... */
2069 if (prog == NULL || startpos == NULL) {
2070 Perl_croak(aTHX_ "NULL regexp parameter");
2071 return 0;
2072 }
2073
bbe252da 2074 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2075 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2076
f2ed9b32 2077 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2078 DEBUG_EXECUTE_r(
f2ed9b32 2079 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2080 "Matching");
2081 );
bac06658 2082
6eb5f6b9 2083 minlen = prog->minlen;
1de06328
YO
2084
2085 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2087 "String too short [regexec_flags]...\n"));
2088 goto phooey;
1aa99e6b 2089 }
6eb5f6b9 2090
1de06328 2091
6eb5f6b9 2092 /* Check validity of program. */
f8fc2ecf 2093 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2094 Perl_croak(aTHX_ "corrupted regexp program");
2095 }
2096
2097 PL_reg_flags = 0;
ed301438 2098 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2099 PL_reg_maxiter = 0;
2100
3c8556c3 2101 if (RX_UTF8(rx))
6eb5f6b9
JH
2102 PL_reg_flags |= RF_utf8;
2103
2104 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2105 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2106 PL_bostr = strbeg;
3b0527fe 2107 reginfo.sv = sv;
6eb5f6b9
JH
2108
2109 /* Mark end of line for $ (and such) */
2110 PL_regeol = strend;
2111
2112 /* see how far we have to get to not match where we matched before */
3b0527fe 2113 reginfo.till = startpos+minend;
6eb5f6b9 2114
6eb5f6b9
JH
2115 /* If there is a "must appear" string, look for it. */
2116 s = startpos;
2117
bbe252da 2118 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2119 MAGIC *mg;
2c296965 2120 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2121 reginfo.ganch = startpos + prog->gofs;
2c296965 2122 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2123 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2124 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2125 && SvMAGIC(sv)
14befaf4
DM
2126 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2127 && mg->mg_len >= 0) {
3b0527fe 2128 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2129 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2130 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2131
bbe252da 2132 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2133 if (s > reginfo.ganch)
6eb5f6b9 2134 goto phooey;
58e23c8d 2135 s = reginfo.ganch - prog->gofs;
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2138 if (s < strbeg)
2139 goto phooey;
6eb5f6b9
JH
2140 }
2141 }
58e23c8d 2142 else if (data) {
70685ca0 2143 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2144 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2146
2147 } else { /* pos() not defined */
3b0527fe 2148 reginfo.ganch = strbeg;
2c296965
YO
2149 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2150 "GPOS: reginfo.ganch = strbeg\n"));
2151 }
6eb5f6b9 2152 }
288b8c02 2153 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2154 /* We have to be careful. If the previous successful match
2155 was from this regex we don't want a subsequent partially
2156 successful match to clobber the old results.
2157 So when we detect this possibility we add a swap buffer
2158 to the re, and switch the buffer each match. If we fail
2159 we switch it back, otherwise we leave it swapped.
2160 */
2161 swap = prog->offs;
2162 /* do we need a save destructor here for eval dies? */
2163 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2164 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2165 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2166 PTR2UV(prog),
2167 PTR2UV(swap),
2168 PTR2UV(prog->offs)
2169 ));
c74340f9 2170 }
a0714e2c 2171 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2172 re_scream_pos_data d;
2173
2174 d.scream_olds = &scream_olds;
2175 d.scream_pos = &scream_pos;
288b8c02 2176 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2177 if (!s) {
a3621e74 2178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2179 goto phooey; /* not present */
3fa9c3d7 2180 }
6eb5f6b9
JH
2181 }
2182
1de06328 2183
6eb5f6b9
JH
2184
2185 /* Simplest case: anchored match need be tried only once. */
2186 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2187 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2188 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2189 goto got_it;
bbe252da
YO
2190 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2191 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2192 {
2193 char *end;
2194
2195 if (minlen)
2196 dontbother = minlen - 1;
1aa99e6b 2197 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2198 /* for multiline we only have to try after newlines */
33b8afdf 2199 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2200 /* because of the goto we can not easily reuse the macros for bifurcating the
2201 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2202 if (utf8_target) {
2203 if (s == startpos)
2204 goto after_try_utf8;
2205 while (1) {
2206 if (regtry(&reginfo, &s)) {
2207 goto got_it;
2208 }
2209 after_try_utf8:
2210 if (s > end) {
2211 goto phooey;
2212 }
2213 if (prog->extflags & RXf_USE_INTUIT) {
2214 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2215 if (!s) {
2216 goto phooey;
2217 }
2218 }
2219 else {
2220 s += UTF8SKIP(s);
2221 }
2222 }
2223 } /* end search for check string in unicode */
2224 else {
2225 if (s == startpos) {
2226 goto after_try_latin;
2227 }
2228 while (1) {
2229 if (regtry(&reginfo, &s)) {
2230 goto got_it;
2231 }
2232 after_try_latin:
2233 if (s > end) {
2234 goto phooey;
2235 }
2236 if (prog->extflags & RXf_USE_INTUIT) {
2237 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2238 if (!s) {
2239 goto phooey;
2240 }
2241 }
2242 else {
2243 s++;
2244 }
2245 }
2246 } /* end search for check string in latin*/
2247 } /* end search for check string */
2248 else { /* search for newline */
2249 if (s > startpos) {
2250 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2251 s--;
92f3d482 2252 }
21eede78
YO
2253 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2254 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2255 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2256 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2257 goto got_it;
2258 }
92f3d482
YO
2259 }
2260 } /* end search for newline */
2261 } /* end anchored/multiline check string search */
6eb5f6b9 2262 goto phooey;
bbe252da 2263 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2264 {
486ec47a 2265 /* the warning about reginfo.ganch being used without initialization
bbe252da 2266 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2267 and we only enter this block when the same bit is set. */
58e23c8d 2268 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2269
2270 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2271 goto got_it;
2272 goto phooey;
2273 }
2274
2275 /* Messy cases: unanchored match. */
bbe252da 2276 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2277 /* we have /x+whatever/ */
f2ed9b32 2278 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2279 char ch;
bf93d4cc
GS
2280#ifdef DEBUGGING
2281 int did_match = 0;
2282#endif
f2ed9b32
KW
2283 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2284 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2285 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2286
f2ed9b32 2287 if (utf8_target) {
4cadc6a9 2288 REXEC_FBC_SCAN(
6eb5f6b9 2289 if (*s == ch) {
a3621e74 2290 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2291 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2292 s += UTF8SKIP(s);
2293 while (s < strend && *s == ch)
2294 s += UTF8SKIP(s);
2295 }
4cadc6a9 2296 );
6eb5f6b9
JH
2297 }
2298 else {
4cadc6a9 2299 REXEC_FBC_SCAN(
6eb5f6b9 2300 if (*s == ch) {
a3621e74 2301 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2302 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2303 s++;
2304 while (s < strend && *s == ch)
2305 s++;
2306 }
4cadc6a9 2307 );
6eb5f6b9 2308 }
a3621e74 2309 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2310 PerlIO_printf(Perl_debug_log,
b7953727
JH
2311 "Did not find anchored character...\n")
2312 );
6eb5f6b9 2313 }
a0714e2c
SS
2314 else if (prog->anchored_substr != NULL
2315 || prog->anchored_utf8 != NULL
2316 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2317 && prog->float_max_offset < strend - s)) {
2318 SV *must;
2319 I32 back_max;
2320 I32 back_min;
2321 char *last;
6eb5f6b9 2322 char *last1; /* Last position checked before */
bf93d4cc
GS
2323#ifdef DEBUGGING
2324 int did_match = 0;
2325#endif
33b8afdf 2326 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2327 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2328 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2329 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2330 back_max = back_min = prog->anchored_offset;
2331 } else {
f2ed9b32
KW
2332 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2333 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2334 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2335 back_max = prog->float_max_offset;
2336 back_min = prog->float_min_offset;
2337 }
1de06328
YO
2338
2339
33b8afdf
JH
2340 if (must == &PL_sv_undef)
2341 /* could not downgrade utf8 check substring, so must fail */
2342 goto phooey;
2343
1de06328
YO
2344 if (back_min<0) {
2345 last = strend;
2346 } else {
2347 last = HOP3c(strend, /* Cannot start after this */
2348 -(I32)(CHR_SVLEN(must)
2349 - (SvTAIL(must) != 0) + back_min), strbeg);
2350 }
6eb5f6b9
JH
2351 if (s > PL_bostr)
2352 last1 = HOPc(s, -1);
2353 else
2354 last1 = s - 1; /* bogus */
2355
a0288114 2356 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2357 check_substr==must. */
2358 scream_pos = -1;
2359 dontbother = end_shift;
2360 strend = HOPc(strend, -dontbother);
2361 while ( (s <= last) &&
c33e64f0 2362 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2363 (unsigned char*)strend, must,
c33e64f0 2364 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2365 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2366 if (HOPc(s, -back_max) > last1) {
2367 last1 = HOPc(s, -back_min);
2368 s = HOPc(s, -back_max);
2369 }
2370 else {
52657f30 2371 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2372
2373 last1 = HOPc(s, -back_min);
52657f30 2374 s = t;
6eb5f6b9 2375 }
f2ed9b32 2376 if (utf8_target) {
6eb5f6b9 2377 while (s <= last1) {
24b23f37 2378 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2379 goto got_it;
2380 s += UTF8SKIP(s);
2381 }
2382 }
2383 else {
2384 while (s <= last1) {
24b23f37 2385 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2386 goto got_it;
2387 s++;
2388 }
2389 }
2390 }
ab3bbdeb 2391 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2392 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2393 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2394 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2395 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2396 ? "anchored" : "floating"),
ab3bbdeb
YO
2397 quoted, RE_SV_TAIL(must));
2398 });
6eb5f6b9
JH
2399 goto phooey;
2400 }
f8fc2ecf 2401 else if ( (c = progi->regstclass) ) {
f14c76ed 2402 if (minlen) {
f8fc2ecf 2403 const OPCODE op = OP(progi->regstclass);
66e933ab 2404 /* don't bother with what can't match */
786e8c11 2405 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2406 strend = HOPc(strend, -(minlen - 1));
2407 }
a3621e74 2408 DEBUG_EXECUTE_r({
be8e71aa 2409 SV * const prop = sv_newmortal();
32fc9b6a 2410 regprop(prog, prop, c);
0df25f3d 2411 {
f2ed9b32 2412 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2413 s,strend-s,60);
0df25f3d 2414 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2415 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2416 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2417 quoted, (int)(strend - s));
0df25f3d 2418 }
ffc61ed2 2419 });
3b0527fe 2420 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2421 goto got_it;
07be1b83 2422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2423 }
2424 else {
2425 dontbother = 0;
a0714e2c 2426 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2427 /* Trim the end. */
6af40bd7 2428 char *last= NULL;
33b8afdf 2429 SV* float_real;
c33e64f0
FC
2430 STRLEN len;
2431 const char *little;
33b8afdf 2432
f2ed9b32
KW
2433 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2434 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2435 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2436
c33e64f0
FC
2437 little = SvPV_const(float_real, len);
2438 if (SvTAIL(float_real)) {
1a13b075
YO
2439 /* This means that float_real contains an artificial \n on the end
2440 * due to the presence of something like this: /foo$/
2441 * where we can match both "foo" and "foo\n" at the end of the string.
2442 * So we have to compare the end of the string first against the float_real
2443 * without the \n and then against the full float_real with the string.
2444 * We have to watch out for cases where the string might be smaller
2445 * than the float_real or the float_real without the \n.
2446 */
2447 char *checkpos= strend - len;
2448 DEBUG_OPTIMISE_r(
2449 PerlIO_printf(Perl_debug_log,
2450 "%sChecking for float_real.%s\n",
2451 PL_colors[4], PL_colors[5]));
2452 if (checkpos + 1 < strbeg) {
2453 /* can't match, even if we remove the trailing \n string is too short to match */
2454 DEBUG_EXECUTE_r(
2455 PerlIO_printf(Perl_debug_log,
2456 "%sString shorter than required trailing substring, cannot match.%s\n",
2457 PL_colors[4], PL_colors[5]));
2458 goto phooey;
2459 } else if (memEQ(checkpos + 1, little, len - 1)) {
2460 /* can match, the end of the string matches without the "\n" */
2461 last = checkpos + 1;
2462 } else if (checkpos < strbeg) {
2463 /* cant match, string is too short when the "\n" is included */
2464 DEBUG_EXECUTE_r(
2465 PerlIO_printf(Perl_debug_log,
2466 "%sString does not contain required trailing substring, cannot match.%s\n",
2467 PL_colors[4], PL_colors[5]));
2468 goto phooey;
2469 } else if (!multiline) {
2470 /* non multiline match, so compare with the "\n" at the end of the string */
2471 if (memEQ(checkpos, little, len)) {
2472 last= checkpos;
2473 } else {
2474 DEBUG_EXECUTE_r(
2475 PerlIO_printf(Perl_debug_log,
2476 "%sString does not contain required trailing substring, cannot match.%s\n",
2477 PL_colors[4], PL_colors[5]));
2478 goto phooey;
2479 }
2480 } else {
2481 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2482 goto find_last;
1a13b075 2483 }
c33e64f0 2484 } else {
d6a28714 2485 find_last:
9041c2e3 2486 if (len)
d6a28714 2487 last = rninstr(s, strend, little, little + len);
b8c5462f 2488 else
a0288114 2489 last = strend; /* matching "$" */
b8c5462f 2490 }
6af40bd7
YO
2491 if (!last) {
2492 /* at one point this block contained a comment which was probably
2493 * incorrect, which said that this was a "should not happen" case.
2494 * Even if it was true when it was written I am pretty sure it is
2495 * not anymore, so I have removed the comment and replaced it with
2496 * this one. Yves */
6bda09f9
YO
2497 DEBUG_EXECUTE_r(
2498 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2499 "String does not contain required substring, cannot match.\n"
2500 ));
2501 goto phooey;
bf93d4cc 2502 }
d6a28714
JH
2503 dontbother = strend - last + prog->float_min_offset;
2504 }
2505 if (minlen && (dontbother < minlen))
2506 dontbother = minlen - 1;
2507 strend -= dontbother; /* this one's always in bytes! */
2508 /* We don't know much -- general case. */
f2ed9b32 2509 if (utf8_target) {
d6a28714 2510 for (;;) {
24b23f37 2511 if (regtry(&reginfo, &s))
d6a28714
JH
2512 goto got_it;
2513 if (s >= strend)
2514 break;
b8c5462f 2515 s += UTF8SKIP(s);
d6a28714
JH
2516 };
2517 }
2518 else {
2519 do {
24b23f37 2520 if (regtry(&reginfo, &s))
d6a28714
JH
2521 goto got_it;
2522 } while (s++ < strend);
2523 }
2524 }
2525
2526 /* Failure. */
2527 goto phooey;
2528
2529got_it:
495f47a5
DM
2530 DEBUG_BUFFERS_r(
2531 if (swap)
2532 PerlIO_printf(Perl_debug_log,
2533 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2534 PTR2UV(prog),
2535 PTR2UV(swap)
2536 );
2537 );
e9105d30 2538 Safefree(swap);
288b8c02 2539 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2540
ed301438 2541 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2542 restore_pos(aTHX_ prog);
5daac39c
NC
2543 if (RXp_PAREN_NAMES(prog))
2544 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2545
2546 /* make sure $`, $&, $', and $digit will work later */
2547 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2548 RX_MATCH_COPY_FREE(rx);
d6a28714 2549 if (flags & REXEC_COPY_STR) {
be8e71aa 2550 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2551#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2552 if ((SvIsCOW(sv)
2553 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2554 if (DEBUG_C_TEST) {
2555 PerlIO_printf(Perl_debug_log,
2556 "Copy on write: regexp capture, type %d\n",
2557 (int) SvTYPE(sv));
2558 }
2559 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2560 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2561 assert (SvPOKp(prog->saved_copy));
2562 } else
2563#endif
2564 {
288b8c02 2565 RX_MATCH_COPIED_on(rx);
ed252734
NC
2566 s = savepvn(strbeg, i);
2567 prog->subbeg = s;
2568 }
d6a28714 2569 prog->sublen = i;
d6a28714
JH
2570 }
2571 else {
2572 prog->subbeg = strbeg;
2573 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2574 }
2575 }
9041c2e3 2576
d6a28714
JH
2577 return 1;
2578
2579phooey:
a3621e74 2580 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2581 PL_colors[4], PL_colors[5]));
ed301438 2582 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2583 restore_pos(aTHX_ prog);
e9105d30 2584 if (swap) {
c74340f9 2585 /* we failed :-( roll it back */
495f47a5
DM
2586 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2587 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2588 PTR2UV(prog),
2589 PTR2UV(prog->offs),
2590 PTR2UV(swap)
2591 ));
e9105d30
GG
2592 Safefree(prog->offs);
2593 prog->offs = swap;
2594 }
2595
d6a28714
JH
2596 return 0;
2597}
2598
6bda09f9 2599
ec43f78b
DM
2600/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2601 * Do inc before dec, in case old and new rex are the same */
2602#define SET_reg_curpm(Re2) \
2603 if (PL_reg_state.re_state_eval_setup_done) { \
2604 (void)ReREFCNT_inc(Re2); \
2605 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2606 PM_SETRE((PL_reg_curpm), (Re2)); \
2607 }
2608
2609
d6a28714
JH
2610/*
2611 - regtry - try match at specific point
2612 */
2613STATIC I32 /* 0 failure, 1 success */
24b23f37 2614S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2615{
97aff369 2616 dVAR;
d6a28714 2617 CHECKPOINT lastcp;
288b8c02
NC
2618 REGEXP *const rx = reginfo->prog;
2619 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2620 RXi_GET_DECL(prog,progi);
a3621e74 2621 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2622
2623 PERL_ARGS_ASSERT_REGTRY;
2624
24b23f37 2625 reginfo->cutpoint=NULL;
d6a28714 2626
ed301438
DM
2627 if ((prog->extflags & RXf_EVAL_SEEN)
2628 && !PL_reg_state.re_state_eval_setup_done)
2629 {
d6a28714
JH
2630 MAGIC *mg;
2631
ed301438 2632 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2633 if (reginfo->sv) {
d6a28714 2634 /* Make $_ available to executed code. */
3b0527fe 2635 if (reginfo->sv != DEFSV) {
59f00321 2636 SAVE_DEFSV;
414bf5ae 2637 DEFSV_set(reginfo->sv);
b8c5462f 2638 }
d6a28714 2639
3b0527fe
DM
2640 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2641 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2642 /* prepare for quick setting of pos */
d300d9fa 2643#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2644 if (SvIsCOW(reginfo->sv))
2645 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2646#endif
3dab1dad 2647 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2648 &PL_vtbl_mglob, NULL, 0);
d6a28714 2649 mg->mg_len = -1;
b8c5462f 2650 }
d6a28714
JH
2651 PL_reg_magic = mg;
2652 PL_reg_oldpos = mg->mg_len;
4f639d21 2653 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2654 }
09687e5a 2655 if (!PL_reg_curpm) {
a02a5408 2656 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2657#ifdef USE_ITHREADS
2658 {
14a49a24 2659 SV* const repointer = &PL_sv_undef;
92313705
NC
2660 /* this regexp is also owned by the new PL_reg_curpm, which
2661 will try to free it. */
d2ece331 2662 av_push(PL_regex_padav, repointer);
09687e5a
AB
2663 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2664 PL_regex_pad = AvARRAY(PL_regex_padav);
2665 }
2666#endif
2667 }
ec43f78b 2668 SET_reg_curpm(rx);
d6a28714
JH
2669 PL_reg_oldcurpm = PL_curpm;
2670 PL_curpm = PL_reg_curpm;
07bc277f 2671 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2672 /* Here is a serious problem: we cannot rewrite subbeg,
2673 since it may be needed if this match fails. Thus
2674 $` inside (?{}) could fail... */
2675 PL_reg_oldsaved = prog->subbeg;
2676 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2677#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2678 PL_nrs = prog->saved_copy;
2679#endif
07bc277f 2680 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2681 }
2682 else
bd61b366 2683 PL_reg_oldsaved = NULL;
d6a28714
JH
2684 prog->subbeg = PL_bostr;
2685 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2686 }
97ca13b7
DM
2687#ifdef DEBUGGING
2688 PL_reg_starttry = *startpos;
2689#endif
f0ab9afb 2690 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2691 PL_reginput = *startpos;
d6a28714 2692 prog->lastparen = 0;
03994de8 2693 prog->lastcloseparen = 0;
d6a28714 2694 PL_regsize = 0;
d6a28714
JH
2695
2696 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2697 to do this again and again, prog->lastparen should take care of
3dd2943c 2698 this! --ilya*/
dafc8851
JH
2699
2700 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2701 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2702 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2703 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2704 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2705 * Meanwhile, this code *is* needed for the
daf18116
JH
2706 * above-mentioned test suite tests to succeed. The common theme
2707 * on those tests seems to be returning null fields from matches.
225593e1 2708 * --jhi updated by dapm */
dafc8851 2709#if 1
d6a28714 2710 if (prog->nparens) {
b93070ed 2711 regexp_paren_pair *pp = prog->offs;
097eb12c 2712 register I32 i;
b93070ed 2713 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2714 ++pp;
2715 pp->start = -1;
2716 pp->end = -1;
d6a28714
JH
2717 }
2718 }
dafc8851 2719#endif
02db2b7b 2720 REGCP_SET(lastcp);
f8fc2ecf 2721 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2722 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2723 return 1;
2724 }
24b23f37
YO
2725 if (reginfo->cutpoint)
2726 *startpos= reginfo->cutpoint;
02db2b7b 2727 REGCP_UNWIND(lastcp);
d6a28714
JH
2728 return 0;
2729}
2730
02db2b7b 2731
8ba1375e
MJD
2732#define sayYES goto yes
2733#define sayNO goto no
262b90c4 2734#define sayNO_SILENT goto no_silent
8ba1375e 2735
f9f4320a
YO
2736/* we dont use STMT_START/END here because it leads to
2737 "unreachable code" warnings, which are bogus, but distracting. */
2738#define CACHEsayNO \
c476f425
DM
2739 if (ST.cache_mask) \
2740 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2741 sayNO
3298f257 2742
a3621e74 2743/* this is used to determine how far from the left messages like
265c4333
YO
2744 'failed...' are printed. It should be set such that messages
2745 are inline with the regop output that created them.
a3621e74 2746*/
265c4333 2747#define REPORT_CODE_OFF 32
a3621e74
YO
2748
2749
40a82448
DM
2750#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2751#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2752
86545054
DM
2753#define SLAB_FIRST(s) (&(s)->states[0])
2754#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2755
5d9a96ca
DM
2756/* grab a new slab and return the first slot in it */
2757
2758STATIC regmatch_state *
2759S_push_slab(pTHX)
2760{
a35a87e7 2761#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2762 dMY_CXT;
2763#endif
5d9a96ca
DM
2764 regmatch_slab *s = PL_regmatch_slab->next;
2765 if (!s) {
2766 Newx(s, 1, regmatch_slab);
2767 s->prev = PL_regmatch_slab;
2768 s->next = NULL;
2769 PL_regmatch_slab->next = s;
2770 }
2771 PL_regmatch_slab = s;
86545054 2772 return SLAB_FIRST(s);
5d9a96ca 2773}
5b47454d 2774
95b24440 2775
40a82448
DM
2776/* push a new state then goto it */
2777
2778#define PUSH_STATE_GOTO(state, node) \
2779 scan = node; \
2780 st->resume_state = state; \
2781 goto push_state;
2782
2783/* push a new state with success backtracking, then goto it */
2784
2785#define PUSH_YES_STATE_GOTO(state, node) \
2786 scan = node; \
2787 st->resume_state = state; \
2788 goto push_yes_state;
2789
aa283a38 2790
aa283a38 2791
d6a28714 2792/*
95b24440 2793
bf1f174e
DM
2794regmatch() - main matching routine
2795
2796This is basically one big switch statement in a loop. We execute an op,
2797set 'next' to point the next op, and continue. If we come to a point which
2798we may need to backtrack to on failure such as (A|B|C), we push a
2799backtrack state onto the backtrack stack. On failure, we pop the top
2800state, and re-enter the loop at the state indicated. If there are no more
2801states to pop, we return failure.
2802
2803Sometimes we also need to backtrack on success; for example /A+/, where
2804after successfully matching one A, we need to go back and try to
2805match another one; similarly for lookahead assertions: if the assertion
2806completes successfully, we backtrack to the state just before the assertion
2807and then carry on. In these cases, the pushed state is marked as
2808'backtrack on success too'. This marking is in fact done by a chain of
2809pointers, each pointing to the previous 'yes' state. On success, we pop to
2810the nearest yes state, discarding any intermediate failure-only states.
2811Sometimes a yes state is pushed just to force some cleanup code to be
2812called at the end of a successful match or submatch; e.g. (??{$re}) uses
2813it to free the inner regex.
2814
2815Note that failure backtracking rewinds the cursor position, while
2816success backtracking leaves it alone.
2817
2818A pattern is complete when the END op is executed, while a subpattern
2819such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2820ops trigger the "pop to last yes state if any, otherwise return true"
2821behaviour.
2822
2823A common convention in this function is to use A and B to refer to the two
2824subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2825the subpattern to be matched possibly multiple times, while B is the entire
2826rest of the pattern. Variable and state names reflect this convention.
2827
2828The states in the main switch are the union of ops and failure/success of
2829substates associated with with that op. For example, IFMATCH is the op
2830that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2831'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2832successfully matched A and IFMATCH_A_fail is a state saying that we have
2833just failed to match A. Resume states always come in pairs. The backtrack
2834state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2835at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2836on success or failure.
2837
2838The struct that holds a backtracking state is actually a big union, with
2839one variant for each major type of op. The variable st points to the
2840top-most backtrack struct. To make the code clearer, within each
2841block of code we #define ST to alias the relevant union.
2842
2843Here's a concrete example of a (vastly oversimplified) IFMATCH
2844implementation:
2845
2846 switch (state) {
2847 ....
2848
2849#define ST st->u.ifmatch
2850
2851 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2852 ST.foo = ...; // some state we wish to save
95b24440 2853 ...
bf1f174e
DM
2854 // push a yes backtrack state with a resume value of
2855 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2856 // first node of A:
2857 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2858 // NOTREACHED
2859
2860 case IFMATCH_A: // we have successfully executed A; now continue with B
2861 next = B;
2862 bar = ST.foo; // do something with the preserved value
2863 break;
2864
2865 case IFMATCH_A_fail: // A failed, so the assertion failed
2866 ...; // do some housekeeping, then ...
2867 sayNO; // propagate the failure
2868
2869#undef ST
95b24440 2870
bf1f174e
DM
2871 ...
2872 }
95b24440 2873
bf1f174e
DM
2874For any old-timers reading this who are familiar with the old recursive
2875approach, the code above is equivalent to:
95b24440 2876
bf1f174e
DM
2877 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2878 {
2879 int foo = ...
95b24440 2880 ...
bf1f174e
DM
2881 if (regmatch(A)) {
2882 next = B;
2883 bar = foo;
2884 break;
95b24440 2885 }
bf1f174e
DM
2886 ...; // do some housekeeping, then ...
2887 sayNO; // propagate the failure
95b24440 2888 }
bf1f174e
DM
2889
2890The topmost backtrack state, pointed to by st, is usually free. If you
2891want to claim it, populate any ST.foo fields in it with values you wish to
2892save, then do one of
2893
2894 PUSH_STATE_GOTO(resume_state, node);
2895 PUSH_YES_STATE_GOTO(resume_state, node);
2896
2897which sets that backtrack state's resume value to 'resume_state', pushes a
2898new free entry to the top of the backtrack stack, then goes to 'node'.
2899On backtracking, the free slot is popped, and the saved state becomes the
2900new free state. An ST.foo field in this new top state can be temporarily
2901accessed to retrieve values, but once the main loop is re-entered, it
2902becomes available for reuse.
2903
2904Note that the depth of the backtrack stack constantly increases during the
2905left-to-right execution of the pattern, rather than going up and down with
2906the pattern nesting. For example the stack is at its maximum at Z at the
2907end of the pattern, rather than at X in the following:
2908
2909 /(((X)+)+)+....(Y)+....Z/
2910
2911The only exceptions to this are lookahead/behind assertions and the cut,
2912(?>A), which pop all the backtrack states associated with A before
2913continuing.
2914
486ec47a 2915Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2916PL_regmatch_state and st always point to the currently active state,
2917and PL_regmatch_slab points to the slab currently containing
2918PL_regmatch_state. The first time regmatch() is called, the first slab is
2919allocated, and is never freed until interpreter destruction. When the slab
2920is full, a new one is allocated and chained to the end. At exit from
2921regmatch(), slabs allocated since entry are freed.
2922
2923*/
95b24440 2924
40a82448 2925
5bc10b2c 2926#define DEBUG_STATE_pp(pp) \
265c4333 2927 DEBUG_STATE_r({ \
f2ed9b32 2928 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2929 PerlIO_printf(Perl_debug_log, \
5d458dd8 2930 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2931 depth*2, "", \
13d6edb4 2932 PL_reg_name[st->resume_state], \
5d458dd8
YO
2933 ((st==yes_state||st==mark_state) ? "[" : ""), \
2934 ((st==yes_state) ? "Y" : ""), \
2935 ((st==mark_state) ? "M" : ""), \
2936 ((st==yes_state||st==mark_state) ? "]" : "") \
2937 ); \
265c4333 2938 });
5bc10b2c 2939
40a82448 2940
3dab1dad 2941#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2942
3df15adc 2943#ifdef DEBUGGING
5bc10b2c 2944
ab3bbdeb 2945STATIC void
f2ed9b32 2946S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2947 const char *start, const char *end, const char *blurb)
2948{
efd26800 2949 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2950
2951 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2952
ab3bbdeb
YO
2953 if (!PL_colorset)
2954 reginitcolors();
2955 {
2956 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2957 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2958
f2ed9b32 2959 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2960 start, end - start, 60);
2961
2962 PerlIO_printf(Perl_debug_log,
2963 "%s%s REx%s %s against %s\n",
2964 PL_colors[4], blurb, PL_colors[5], s0, s1);
2965
f2ed9b32 2966 if (utf8_target||utf8_pat)
1de06328
YO
2967 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2968 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2969 utf8_pat && utf8_target ? " and " : "",
2970 utf8_target ? "string" : ""
ab3bbdeb
YO
2971 );
2972 }
2973}
3df15adc
YO
2974
2975STATIC void
786e8c11
YO
2976S_dump_exec_pos(pTHX_ const char *locinput,
2977 const regnode *scan,
2978 const char *loc_regeol,
2979 const char *loc_bostr,
2980 const char *loc_reg_starttry,
f2ed9b32 2981 const bool utf8_target)
07be1b83 2982{
786e8c11 2983 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2984 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2985 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2986 /* The part of the string before starttry has one color
2987 (pref0_len chars), between starttry and current
2988 position another one (pref_len - pref0_len chars),
2989 after the current position the third one.
2990 We assume that pref0_len <= pref_len, otherwise we
2991 decrease pref0_len. */
786e8c11
YO
2992 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2993 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2994 int pref0_len;
2995
7918f24d
NC
2996 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2997
f2ed9b32 2998 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2999 pref_len++;
786e8c11
YO
3000 pref0_len = pref_len - (locinput - loc_reg_starttry);
3001 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3002 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3003 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3004 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3005 l--;
3006 if (pref0_len < 0)
3007 pref0_len = 0;
3008 if (pref0_len > pref_len)
3009 pref0_len = pref_len;
3010 {
f2ed9b32 3011 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3012
ab3bbdeb 3013 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3014 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3015
ab3bbdeb 3016 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3017 (locinput - pref_len + pref0_len),
1de06328 3018 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3019
ab3bbdeb 3020 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3021 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3022
1de06328 3023 const STRLEN tlen=len0+len1+len2;
3df15adc 3024 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3025 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3026 (IV)(locinput - loc_bostr),
07be1b83 3027 len0, s0,
07be1b83 3028 len1, s1,
07be1b83 3029 (docolor ? "" : "> <"),
07be1b83 3030 len2, s2,
f9f4320a 3031 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3032 "");
3033 }
3034}
3df15adc 3035
07be1b83
YO
3036#endif
3037
0a4db386
YO
3038/* reg_check_named_buff_matched()
3039 * Checks to see if a named buffer has matched. The data array of
3040 * buffer numbers corresponding to the buffer is expected to reside
3041 * in the regexp->data->data array in the slot stored in the ARG() of
3042 * node involved. Note that this routine doesn't actually care about the
3043 * name, that information is not preserved from compilation to execution.
3044 * Returns the index of the leftmost defined buffer with the given name
3045 * or 0 if non of the buffers matched.
3046 */
3047STATIC I32
7918f24d
NC
3048S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3049{
0a4db386 3050 I32 n;
f8fc2ecf 3051 RXi_GET_DECL(rex,rexi);
ad64d0ec 3052 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3053 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3054
3055 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3056
0a4db386 3057 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3058 if ((I32)rex->lastparen >= nums[n] &&
3059 rex->offs[nums[n]].end != -1)
0a4db386
YO
3060 {
3061 return nums[n];
3062 }
3063 }
3064 return 0;
3065}
3066
2f554ef7
DM
3067
3068/* free all slabs above current one - called during LEAVE_SCOPE */
3069
3070STATIC void
3071S_clear_backtrack_stack(pTHX_ void *p)
3072{
3073 regmatch_slab *s = PL_regmatch_slab->next;
3074 PERL_UNUSED_ARG(p);
3075
3076 if (!s)
3077 return;
3078 PL_regmatch_slab->next = NULL;
3079 while (s) {
3080 regmatch_slab * const osl = s;
3081 s = s->next;
3082 Safefree(osl);
3083 }
3084}
3085
3086
d6a28714 3087STATIC I32 /* 0 failure, 1 success */
24b23f37 3088S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3089{
a35a87e7 3090#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3091 dMY_CXT;
3092#endif
27da23d5 3093 dVAR;
f2ed9b32 3094 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3095 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3096 REGEXP *rex_sv = reginfo->prog;
3097 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3098 RXi_GET_DECL(rex,rexi);
2f554ef7 3099 I32 oldsave;
5d9a96ca
DM
3100 /* the current state. This is a cached copy of PL_regmatch_state */
3101 register regmatch_state *st;
5d9a96ca
DM
3102 /* cache heavy used fields of st in registers */
3103 register regnode *scan;
3104 register regnode *next;
438e9bae 3105 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3106 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3107 register char *locinput = PL_reginput;
5d9a96ca 3108 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3109
b69b0499 3110 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3111 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3112 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3113 const U32 max_nochange_depth =
3114 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3115 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3116 regmatch_state *yes_state = NULL; /* state to pop to on success of
3117 subpattern */
e2e6a0f1
YO
3118 /* mark_state piggy backs on the yes_state logic so that when we unwind
3119 the stack on success we can update the mark_state as we go */
3120 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3121 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3122 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3123 U32 state_num;
5d458dd8
YO
3124 bool no_final = 0; /* prevent failure from backtracking? */
3125 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3126 char *startpoint = PL_reginput;
5d458dd8
YO
3127 SV *popmark = NULL; /* are we looking for a mark? */
3128 SV *sv_commit = NULL; /* last mark name seen in failure */
3129 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3130 during a successful match */
5d458dd8
YO
3131 U32 lastopen = 0; /* last open we saw */
3132 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3133 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3134 /* these three flags are set by various ops to signal information to
3135 * the very next op. They have a useful lifetime of exactly one loop
3136 * iteration, and are not preserved or restored by state pushes/pops
3137 */
3138 bool sw = 0; /* the condition value in (?(cond)a|b) */
3139 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3140 int logical = 0; /* the following EVAL is:
3141 0: (?{...})
3142 1: (?(?{...})X|Y)
3143 2: (??{...})
3144 or the following IFMATCH/UNLESSM is:
3145 false: plain (?=foo)
3146 true: used as a condition: (?(?=foo))
3147 */
81ed78b2
DM
3148 PAD* last_pad = NULL;
3149 dMULTICALL;
3150 I32 gimme = G_SCALAR;
3151 CV *caller_cv = NULL; /* who called us */
3152 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3153 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3154
95b24440 3155#ifdef DEBUGGING
e68ec53f 3156 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3157#endif
3158
81ed78b2
DM
3159 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3160 multicall_oldcatch = 0;
3161 multicall_cv = NULL;
3162 cx = NULL;
3163
3164
7918f24d
NC
3165 PERL_ARGS_ASSERT_REGMATCH;
3166
3b57cd43 3167 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3168 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3169 }));
5d9a96ca
DM
3170 /* on first ever call to regmatch, allocate first slab */
3171 if (!PL_regmatch_slab) {
3172 Newx(PL_regmatch_slab, 1, regmatch_slab);
3173 PL_regmatch_slab->prev = NULL;
3174 PL_regmatch_slab->next = NULL;
86545054 3175 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3176 }
3177
2f554ef7
DM
3178 oldsave = PL_savestack_ix;
3179 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3180 SAVEVPTR(PL_regmatch_slab);
3181 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3182
3183 /* grab next free state slot */
3184 st = ++PL_regmatch_state;
86545054 3185 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3186 st = PL_regmatch_state = S_push_slab(aTHX);
3187
d6a28714
JH
3188 /* Note that nextchr is a byte even in UTF */
3189 nextchr = UCHARAT(locinput);
3190 scan = prog;
3191 while (scan != NULL) {
8ba1375e 3192
a3621e74 3193 DEBUG_EXECUTE_r( {
6136c704 3194 SV * const prop = sv_newmortal();
1de06328 3195 regnode *rnext=regnext(scan);
f2ed9b32 3196 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3197 regprop(rex, prop, scan);
07be1b83
YO
3198
3199 PerlIO_printf(Perl_debug_log,
3200 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3201 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3202 SvPVX_const(prop),
1de06328 3203 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3204 0 : (IV)(rnext - rexi->program));
2a782b5b 3205 });
d6a28714
JH
3206
3207 next = scan + NEXT_OFF(scan);
3208 if (next == scan)
3209 next = NULL;
40a82448 3210 state_num = OP(scan);
d6a28714 3211
40a82448 3212 reenter_switch:
34a81e2b 3213
40a82448 3214 switch (state_num) {
d6a28714 3215 case BOL:
7fba1cd6 3216 if (locinput == PL_bostr)
d6a28714 3217 {
3b0527fe 3218 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3219 break;
3220 }
d6a28714
JH
3221 sayNO;
3222 case MBOL:
12d33761
HS
3223 if (locinput == PL_bostr ||
3224 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3225 {
b8c5462f
JH
3226 break;
3227 }
d6a28714
JH
3228 sayNO;
3229 case SBOL:
c2a73568 3230 if (locinput == PL_bostr)
b8c5462f 3231 break;
d6a28714
JH
3232 sayNO;
3233 case GPOS:
3b0527fe 3234 if (locinput == reginfo->ganch)
d6a28714
JH
3235 break;
3236 sayNO;
ee9b8eae
YO
3237
3238 case KEEPS:
3239 /* update the startpoint */
b93070ed 3240 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3241 PL_reginput = locinput;
b93070ed 3242 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3243 PUSH_STATE_GOTO(KEEPS_next, next);
3244 /*NOT-REACHED*/
3245 case KEEPS_next_fail:
3246 /* rollback the start point change */
b93070ed 3247 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3248 sayNO_SILENT;
3249 /*NOT-REACHED*/
d6a28714 3250 case EOL:
d6a28714
JH
3251 goto seol;
3252 case MEOL:
d6a28714 3253 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3254 sayNO;
b8c5462f 3255 break;
d6a28714
JH
3256 case SEOL:
3257 seol:
3258 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3259 sayNO;
d6a28714 3260 if (PL_regeol - locinput > 1)
b8c5462f 3261 sayNO;
b8c5462f 3262 break;
d6a28714
JH
3263 case EOS:
3264 if (PL_regeol != locinput)
b8c5462f 3265 sayNO;
d6a28714 3266 break;
ffc61ed2 3267 case SANY:
d6a28714 3268 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3269 sayNO;
f2ed9b32 3270 if (utf8_target) {
f33976b4
DB
3271 locinput += PL_utf8skip[nextchr];
3272 if (locinput > PL_regeol)
3273 sayNO;
3274 nextchr = UCHARAT(locinput);
3275 }
3276 else
3277 nextchr = UCHARAT(++locinput);
3278 break;
3279 case CANY:
3280 if (!nextchr && locinput >= PL_regeol)
3281 sayNO;
b8c5462f 3282 nextchr = UCHARAT(++locinput);
a0d0e21e 3283 break;
ffc61ed2 3284 case REG_ANY:
1aa99e6b
IH
3285 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3286 sayNO;
f2ed9b32 3287 if (utf8_target) {
b8c5462f 3288 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3289 if (locinput > PL_regeol)
3290 sayNO;
a0ed51b3 3291 nextchr = UCHARAT(locinput);
a0ed51b3 3292 }
1aa99e6b
IH
3293 else
3294 nextchr = UCHARAT(++locinput);
a0ed51b3 3295 break;
166ba7cd
DM
3296
3297#undef ST
3298#define ST st->u.trie
786e8c11
YO
3299 case TRIEC:
3300 /* In this case the charclass data is available inline so
3301 we can fail fast without a lot of extra overhead.
3302 */
fab2782b
YO
3303 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3304 DEBUG_EXECUTE_r(
3305 PerlIO_printf(Perl_debug_log,
3306 "%*s %sfailed to match trie start class...%s\n",
3307 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3308 );
3309 sayNO_SILENT;
3310 /* NOTREACHED */
786e8c11
YO
3311 }
3312 /* FALL THROUGH */
5b47454d 3313 case TRIE:
2e64971a
DM
3314 /* the basic plan of execution of the trie is:
3315 * At the beginning, run though all the states, and
3316 * find the longest-matching word. Also remember the position
3317 * of the shortest matching word. For example, this pattern:
3318 * 1 2 3 4 5
3319 * ab|a|x|abcd|abc
3320 * when matched against the string "abcde", will generate
3321 * accept states for all words except 3, with the longest
3322 * matching word being 4, and the shortest being 1 (with
3323 * the position being after char 1 of the string).
3324 *
3325 * Then for each matching word, in word order (i.e. 1,2,4,5),
3326 * we run the remainder of the pattern; on each try setting
3327 * the current position to the character following the word,
3328 * returning to try the next word on failure.
3329 *
3330 * We avoid having to build a list of words at runtime by
3331 * using a compile-time structure, wordinfo[].prev, which
3332 * gives, for each word, the previous accepting word (if any).
3333 * In the case above it would contain the mappings 1->2, 2->0,
3334 * 3->0, 4->5, 5->1. We can use this table to generate, from
3335 * the longest word (4 above), a list of all words, by
3336 * following the list of prev pointers; this gives us the
3337 * unordered list 4,5,1,2. Then given the current word we have
3338 * just tried, we can go through the list and find the
3339 * next-biggest word to try (so if we just failed on word 2,
3340 * the next in the list is 4).
3341 *
3342 * Since at runtime we don't record the matching position in
3343 * the string for each word, we have to work that out for
3344 * each word we're about to process. The wordinfo table holds
3345 * the character length of each word; given that we recorded
3346 * at the start: the position of the shortest word and its
3347 * length in chars, we just need to move the pointer the
3348 * difference between the two char lengths. Depending on
3349 * Unicode status and folding, that's cheap or expensive.
3350 *
3351 * This algorithm is optimised for the case where are only a
3352 * small number of accept states, i.e. 0,1, or maybe 2.
3353 * With lots of accepts states, and having to try all of them,
3354 * it becomes quadratic on number of accept states to find all
3355 * the next words.
3356 */
3357
3dab1dad 3358 {
07be1b83 3359 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3360 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3361
3362 /* what trie are we using right now */
be8e71aa 3363 reg_trie_data * const trie
f8fc2ecf 3364 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3365 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3366 U32 state = trie->startstate;
166ba7cd 3367
fab2782b 3368 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3369 if (trie->states[ state ].wordnum) {
3370 DEBUG_EXECUTE_r(
3371 PerlIO_printf(Perl_debug_log,
3372 "%*s %smatched empty string...%s\n",
5bc10b2c 3373 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3374 );
20dbff7c
YO
3375 if (!trie->jump)
3376 break;
3dab1dad
YO
3377 } else {
3378 DEBUG_EXECUTE_r(
3379 PerlIO_printf(Perl_debug_log,
786e8c11 3380 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3381 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3382 );
3383 sayNO_SILENT;
3384 }
3385 }
166ba7cd 3386
786e8c11
YO
3387 {
3388 U8 *uc = ( U8* )locinput;
3389
3390 STRLEN len = 0;
3391 STRLEN foldlen = 0;
3392 U8 *uscan = (U8*)NULL;
786e8c11 3393 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3394 U32 charcount = 0; /* how many input chars we have matched */
3395 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3396
786e8c11 3397 ST.jump = trie->jump;
786e8c11 3398 ST.me = scan;
2e64971a
DM
3399 ST.firstpos = NULL;
3400 ST.longfold = FALSE; /* char longer if folded => it's harder */
3401 ST.nextword = 0;
3402
3403 /* fully traverse the TRIE; note the position of the
3404 shortest accept state and the wordnum of the longest
3405 accept state */
07be1b83 3406
a3621e74 3407 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3408 U32 base = trie->states[ state ].trans.base;
f9f4320a 3409 UV uvc = 0;
acb909b4 3410 U16 charid = 0;
2e64971a
DM
3411 U16 wordnum;
3412 wordnum = trie->states[ state ].wordnum;
3413
3414 if (wordnum) { /* it's an accept state */
3415 if (!accepted) {
3416 accepted = 1;
3417 /* record first match position */
3418 if (ST.longfold) {
3419 ST.firstpos = (U8*)locinput;
3420 ST.firstchars = 0;
5b47454d 3421 }
2e64971a
DM
3422 else {
3423 ST.firstpos = uc;
3424 ST.firstchars = charcount;