This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec_flags(): simplify length calculation
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
61dad979 84#include "utf8_strings.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
27d4fc33 147 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979
KW
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
149 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
151 LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \
152 LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \
153 LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \
154 LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \
155 LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
20d0b1e9 156
1dcf4a1b 157#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 158
ee9a90b8
KW
159/* The actual code for CCC_TRY, which uses several variables from the routine
160 * it's callable from. It is designed to be the bulk of a case statement.
161 * FUNC is the macro or function to call on non-utf8 targets that indicate if
162 * nextchr matches the class.
163 * UTF8_TEST is the whole test string to use for utf8 targets
164 * LOAD is what to use to test, and if not present to load in the swash for the
165 * class
166 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
167 * UTF8_TEST test.
168 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
169 * utf8 and a variant, load the swash if necessary and test using the utf8
170 * test. Advance to the next character if test is ok, otherwise fail; If not
171 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
172 * fails, or advance to the next character */
173
174#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
175 if (locinput >= PL_regeol) { \
176 sayNO; \
177 } \
178 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
179 LOAD_UTF8_CHARCLASS(CLASS, STR); \
180 if (POS_OR_NEG (UTF8_TEST)) { \
181 sayNO; \
182 } \
183 locinput += PL_utf8skip[nextchr]; \
184 nextchr = UCHARAT(locinput); \
185 break; \
186 } \
187 if (POS_OR_NEG (FUNC(nextchr))) { \
188 sayNO; \
189 } \
190 nextchr = UCHARAT(++locinput); \
980866de
KW
191 break;
192
ee9a90b8
KW
193/* Handle the non-locale cases for a character class and its complement. It
194 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
195 * This is because that code fails when the test succeeds, so we want to have
196 * the test fail so that the code succeeds. The swash is stored in a
197 * predictable PL_ place */
cfaf538b
KW
198#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
199 CLASS, STR) \
ee9a90b8
KW
200 case NAME: \
201 _CCC_TRY_CODE( !, FUNC, \
202 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
203 (U8*)locinput, TRUE)), \
204 CLASS, STR) \
205 case NNAME: \
1dcf4a1b 206 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
207 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
208 (U8*)locinput, TRUE)), \
209 CLASS, STR) \
210
211/* Generate the case statements for both locale and non-locale character
212 * classes in regmatch for classes that don't have special unicode semantics.
213 * Locales don't use an immediate swash, but an intermediary special locale
214 * function that is called on the pointer to the current place in the input
215 * string. That function will resolve to needing the same swash. One might
216 * think that because we don't know what the locale will match, we shouldn't
217 * check with the swash loading function that it loaded properly; ie, that we
218 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
219 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
220 * irrelevant here */
221#define CCC_TRY(NAME, NNAME, FUNC, \
222 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 223 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
224 CLASS, STR) \
225 case NAMEL: \
226 PL_reg_flags |= RF_tainted; \
227 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
228 case NNAMEL: \
229 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
230 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
231 CLASS, STR) \
cfaf538b
KW
232 case NAMEA: \
233 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
237 nextchr = UCHARAT(++locinput); \
238 break; \
239 case NNAMEA: \
240 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 if (utf8_target) { \
244 locinput += PL_utf8skip[nextchr]; \
245 nextchr = UCHARAT(locinput); \
246 } \
247 else { \
248 nextchr = UCHARAT(++locinput); \
249 } \
250 break; \
ee9a90b8
KW
251 /* Generate the non-locale cases */ \
252 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
253
254/* This is like CCC_TRY, but has an extra set of parameters for generating case
255 * statements to handle separate Unicode semantics nodes */
256#define CCC_TRY_U(NAME, NNAME, FUNC, \
257 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
258 NAMEU, NNAMEU, FUNCU, \
cfaf538b 259 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 260 CLASS, STR) \
cfaf538b
KW
261 CCC_TRY(NAME, NNAME, FUNC, \
262 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
263 NAMEA, NNAMEA, FUNCA, \
264 CLASS, STR) \
ee9a90b8 265 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 296#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 302#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
303#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
305
306#endif
e2d8ce26 307
a84d97b6
HS
308/*
309 Search for mandatory following text node; for lookahead, the text must
310 follow but for lookbehind (rn->flags != 0) we skip to the next step.
311*/
cca55fe3 312#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
313 while (JUMPABLE(rn)) { \
314 const OPCODE type = OP(rn); \
315 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 316 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 317 else if (type == PLUS) \
cca55fe3 318 rn = NEXTOPER(rn); \
3dab1dad 319 else if (type == IFMATCH) \
a84d97b6 320 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 321 else rn += NEXT_OFF(rn); \
3dab1dad 322 } \
5f80c4cf 323} STMT_END
74750237 324
c476f425 325
acfe0abc 326static void restore_pos(pTHX_ void *arg);
51371543 327
87c0511b 328#define REGCP_PAREN_ELEMS 3
f067efbf 329#define REGCP_OTHER_ELEMS 3
e0fa7e2b 330#define REGCP_FRAME_ELEMS 1
620d5b66
NC
331/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
332 * are needed for the regexp context stack bookkeeping. */
333
76e3520e 334STATIC CHECKPOINT
b93070ed 335S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 336{
97aff369 337 dVAR;
a3b680e6 338 const int retval = PL_savestack_ix;
a3b680e6 339 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
340 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
341 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 342 I32 p;
40a82448 343 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 344
b93070ed
DM
345 PERL_ARGS_ASSERT_REGCPPUSH;
346
e49a9654 347 if (paren_elems_to_push < 0)
5637ef5b
NC
348 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
349 paren_elems_to_push);
e49a9654 350
e0fa7e2b
NC
351 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
352 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
353 " out of range (%lu-%ld)",
354 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 355
620d5b66 356 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 357
495f47a5
DM
358 DEBUG_BUFFERS_r(
359 if ((int)PL_regsize > (int)parenfloor)
360 PerlIO_printf(Perl_debug_log,
361 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
362 PTR2UV(rex),
363 PTR2UV(rex->offs)
364 );
365 );
87c0511b 366 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
368 SSPUSHINT(rex->offs[p].end);
369 SSPUSHINT(rex->offs[p].start);
1ca2007e 370 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 371 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
372 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
373 (UV)p,
374 (IV)rex->offs[p].start,
375 (IV)rex->offs[p].start_tmp,
376 (IV)rex->offs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 380 SSPUSHINT(PL_regsize);
b93070ed
DM
381 SSPUSHINT(rex->lastparen);
382 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 383 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 384
a0d0e21e
LW
385 return retval;
386}
387
c277df42 388/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
389#define REGCP_SET(cp) \
390 DEBUG_STATE_r( \
ab3bbdeb 391 PerlIO_printf(Perl_debug_log, \
e4f74956 392 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
393 (IV)PL_savestack_ix)); \
394 cp = PL_savestack_ix
c3464db5 395
ab3bbdeb 396#define REGCP_UNWIND(cp) \
e4f74956 397 DEBUG_STATE_r( \
ab3bbdeb 398 if (cp != PL_savestack_ix) \
e4f74956
YO
399 PerlIO_printf(Perl_debug_log, \
400 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
401 (IV)(cp), (IV)PL_savestack_ix)); \
402 regcpblow(cp)
c277df42 403
a8d1f4b4
DM
404#define UNWIND_PAREN(lp, lcp) \
405 for (n = rex->lastparen; n > lp; n--) \
406 rex->offs[n].end = -1; \
407 rex->lastparen = n; \
408 rex->lastcloseparen = lcp;
409
410
f067efbf 411STATIC void
b93070ed 412S_regcppop(pTHX_ regexp *rex)
a0d0e21e 413{
97aff369 414 dVAR;
e0fa7e2b 415 UV i;
87c0511b 416 U32 paren;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
87c0511b 439 paren = PL_regsize;
620d5b66 440 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 441 I32 tmps;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
87c0511b 455 paren--;
a0d0e21e 456 }
daf18116 457#if 1
dafc8851
JH
458 /* It would seem that the similar code in regtry()
459 * already takes care of this, and in fact it is in
460 * a better location to since this code can #if 0-ed out
461 * but the code in regtry() is needed or otherwise tests
462 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
463 * (as of patchlevel 7877) will fail. Then again,
464 * this code seems to be necessary or otherwise
225593e1
DM
465 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
466 * --jhi updated by dapm */
b93070ed 467 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 468 if (i > PL_regsize)
b93070ed
DM
469 rex->offs[i].start = -1;
470 rex->offs[i].end = -1;
495f47a5
DM
471 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
472 " \\%"UVuf": %s ..-1 undeffing\n",
473 (UV)i,
474 (i > PL_regsize) ? "-1" : " "
475 ));
a0d0e21e 476 }
dafc8851 477#endif
a0d0e21e
LW
478}
479
74088413
DM
480/* restore the parens and associated vars at savestack position ix,
481 * but without popping the stack */
482
483STATIC void
484S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485{
486 I32 tmpix = PL_savestack_ix;
487 PL_savestack_ix = ix;
488 regcppop(rex);
489 PL_savestack_ix = tmpix;
490}
491
02db2b7b 492#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 493
a687059c 494/*
e50aee73 495 * pregexec and friends
a687059c
LW
496 */
497
76234dfb 498#ifndef PERL_IN_XSUB_RE
a687059c 499/*
c277df42 500 - pregexec - match a regexp against a string
a687059c 501 */
c277df42 502I32
49d7dfbc 503Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 504 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
505/* stringarg: the point in the string at which to begin matching */
506/* strend: pointer to null at end of string */
507/* strbeg: real beginning of string */
508/* minend: end of match must be >= minend bytes after stringarg. */
509/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
510 * itself is accessed via the pointers above */
511/* nosave: For optimizations. */
c277df42 512{
7918f24d
NC
513 PERL_ARGS_ASSERT_PREGEXEC;
514
c277df42 515 return
9041c2e3 516 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
517 nosave ? 0 : REXEC_COPY_STR);
518}
76234dfb 519#endif
22e551b9 520
9041c2e3 521/*
cad2e5aa
JH
522 * Need to implement the following flags for reg_anch:
523 *
524 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
525 * USE_INTUIT_ML
526 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
527 * INTUIT_AUTORITATIVE_ML
528 * INTUIT_ONCE_NOML - Intuit can match in one location only.
529 * INTUIT_ONCE_ML
530 *
531 * Another flag for this function: SECOND_TIME (so that float substrs
532 * with giant delta may be not rechecked).
533 */
534
535/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
536
3f7c398e 537/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
538 Otherwise, only SvCUR(sv) is used to get strbeg. */
539
540/* XXXX We assume that strpos is strbeg unless sv. */
541
6eb5f6b9
JH
542/* XXXX Some places assume that there is a fixed substring.
543 An update may be needed if optimizer marks as "INTUITable"
544 RExen without fixed substrings. Similarly, it is assumed that
545 lengths of all the strings are no more than minlen, thus they
546 cannot come from lookahead.
40d049e4
YO
547 (Or minlen should take into account lookahead.)
548 NOTE: Some of this comment is not correct. minlen does now take account
549 of lookahead/behind. Further research is required. -- demerphq
550
551*/
6eb5f6b9 552
2c2d71f5
JH
553/* A failure to find a constant substring means that there is no need to make
554 an expensive call to REx engine, thus we celebrate a failure. Similarly,
555 finding a substring too deep into the string means that less calls to
30944b6d
IZ
556 regtry() should be needed.
557
558 REx compiler's optimizer found 4 possible hints:
559 a) Anchored substring;
560 b) Fixed substring;
561 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 562 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 563 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
564 string which does not contradict any of them.
565 */
2c2d71f5 566
6eb5f6b9
JH
567/* Most of decisions we do here should have been done at compile time.
568 The nodes of the REx which we used for the search should have been
569 deleted from the finite automaton. */
570
cad2e5aa 571char *
288b8c02 572Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 573 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 574{
97aff369 575 dVAR;
288b8c02 576 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 577 I32 start_shift = 0;
cad2e5aa 578 /* Should be nonnegative! */
eb578fdb
KW
579 I32 end_shift = 0;
580 char *s;
581 SV *check;
a1933d95 582 char *strbeg;
cad2e5aa 583 char *t;
f2ed9b32 584 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 585 I32 ml_anch;
eb578fdb 586 char *other_last = NULL; /* other substr checked before this */
bd61b366 587 char *check_at = NULL; /* check substr found at this pos */
d8080198 588 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 589 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 590 RXi_GET_DECL(prog,progi);
30944b6d 591#ifdef DEBUGGING
890ce7af 592 const char * const i_strpos = strpos;
30944b6d 593#endif
a3621e74
YO
594 GET_RE_DEBUG_FLAGS_DECL;
595
7918f24d 596 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
597 PERL_UNUSED_ARG(flags);
598 PERL_UNUSED_ARG(data);
7918f24d 599
f2ed9b32 600 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 601
3c8556c3 602 if (RX_UTF8(rx)) {
b8d68ded
JH
603 PL_reg_flags |= RF_utf8;
604 }
ab3bbdeb 605 DEBUG_EXECUTE_r(
f2ed9b32 606 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
607 sv ? "Guessing start of match in sv for"
608 : "Guessing start of match in string for");
2a782b5b 609 );
cad2e5aa 610
c344f387
JH
611 /* CHR_DIST() would be more correct here but it makes things slow. */
612 if (prog->minlen > strend - strpos) {
a3621e74 613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 614 "String too short... [re_intuit_start]\n"));
cad2e5aa 615 goto fail;
2c2d71f5 616 }
1de06328 617
a1933d95 618 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 619 PL_regeol = strend;
f2ed9b32 620 if (utf8_target) {
33b8afdf
JH
621 if (!prog->check_utf8 && prog->check_substr)
622 to_utf8_substr(prog);
623 check = prog->check_utf8;
624 } else {
625 if (!prog->check_substr && prog->check_utf8)
626 to_byte_substr(prog);
627 check = prog->check_substr;
628 }
1de06328 629 if (check == &PL_sv_undef) {
a3621e74 630 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 631 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
632 goto fail;
633 }
bbe252da
YO
634 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
635 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
636 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 637 && !multiline ) ); /* Check after \n? */
cad2e5aa 638
7e25d62c 639 if (!ml_anch) {
bbe252da
YO
640 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
641 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 642 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
643 && sv && !SvROK(sv)
644 && (strpos != strbeg)) {
a3621e74 645 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
646 goto fail;
647 }
648 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 649 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 650 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
651 I32 slen;
652
1aa99e6b 653 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 654
653099ff
GS
655 if (SvTAIL(check)) {
656 slen = SvCUR(check); /* >= 1 */
cad2e5aa 657
9041c2e3 658 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 659 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 661 goto fail_finish;
cad2e5aa
JH
662 }
663 /* Now should match s[0..slen-2] */
664 slen--;
3f7c398e 665 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 666 || (slen > 1
3f7c398e 667 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 668 report_neq:
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
670 goto fail_finish;
671 }
cad2e5aa 672 }
3f7c398e 673 else if (*SvPVX_const(check) != *s
653099ff 674 || ((slen = SvCUR(check)) > 1
3f7c398e 675 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 676 goto report_neq;
c315bfe8 677 check_at = s;
2c2d71f5 678 goto success_at_start;
7e25d62c 679 }
cad2e5aa 680 }
2c2d71f5 681 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 682 s = strpos;
2c2d71f5 683 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
684 end_shift = prog->check_end_shift;
685
2c2d71f5 686 if (!ml_anch) {
a3b680e6 687 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 688 - (SvTAIL(check) != 0);
a3b680e6 689 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
690
691 if (end_shift < eshift)
692 end_shift = eshift;
693 }
cad2e5aa 694 }
2c2d71f5 695 else { /* Can match at random position */
cad2e5aa
JH
696 ml_anch = 0;
697 s = strpos;
1de06328
YO
698 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
699 end_shift = prog->check_end_shift;
700
701 /* end shift should be non negative here */
cad2e5aa
JH
702 }
703
bcdf7404 704#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 705 if (end_shift < 0)
1de06328 706 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 707 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
708#endif
709
2c2d71f5
JH
710 restart:
711 /* Find a possible match in the region s..strend by looking for
712 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
713
714 {
715 I32 srch_start_shift = start_shift;
716 I32 srch_end_shift = end_shift;
c33e64f0
FC
717 U8* start_point;
718 U8* end_point;
1de06328
YO
719 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
720 srch_end_shift -= ((strbeg - s) - srch_start_shift);
721 srch_start_shift = strbeg - s;
722 }
6bda09f9 723 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
724 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
725 (IV)prog->check_offset_min,
726 (IV)srch_start_shift,
727 (IV)srch_end_shift,
728 (IV)prog->check_end_shift);
729 });
730
bbe252da 731 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
732 start_point= (U8*)(s + srch_start_shift);
733 end_point= (U8*)(strend - srch_end_shift);
734 } else {
735 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
736 end_point= HOP3(strend, -srch_end_shift, strbeg);
737 }
6bda09f9 738 DEBUG_OPTIMISE_MORE_r({
56570a2c 739 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 740 (int)(end_point - start_point),
fc8cd66c 741 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
742 start_point);
743 });
744
745 s = fbm_instr( start_point, end_point,
7fba1cd6 746 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 747 }
cad2e5aa
JH
748 /* Update the count-of-usability, remove useless subpatterns,
749 unshift s. */
2c2d71f5 750
ab3bbdeb 751 DEBUG_EXECUTE_r({
f2ed9b32 752 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
753 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
754 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 755 (s ? "Found" : "Did not find"),
f2ed9b32 756 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
757 ? "anchored" : "floating"),
758 quoted,
759 RE_SV_TAIL(check),
760 (s ? " at offset " : "...\n") );
761 });
2c2d71f5
JH
762
763 if (!s)
764 goto fail_finish;
2c2d71f5 765 /* Finish the diagnostic message */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 767
1de06328
YO
768 /* XXX dmq: first branch is for positive lookbehind...
769 Our check string is offset from the beginning of the pattern.
770 So we need to do any stclass tests offset forward from that
771 point. I think. :-(
772 */
773
774
775
776 check_at=s;
777
778
2c2d71f5
JH
779 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
780 Start with the other substr.
781 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 782 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
783 *always* match. Probably should be marked during compile...
784 Probably it is right to do no SCREAM here...
785 */
786
f2ed9b32 787 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
788 : (prog->float_substr && prog->anchored_substr))
789 {
30944b6d 790 /* Take into account the "other" substring. */
2c2d71f5
JH
791 /* XXXX May be hopelessly wrong for UTF... */
792 if (!other_last)
6eb5f6b9 793 other_last = strpos;
f2ed9b32 794 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
795 do_other_anchored:
796 {
890ce7af
AL
797 char * const last = HOP3c(s, -start_shift, strbeg);
798 char *last1, *last2;
be8e71aa 799 char * const saved_s = s;
33b8afdf 800 SV* must;
2c2d71f5 801
2c2d71f5
JH
802 t = s - prog->check_offset_max;
803 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 804 && (!utf8_target
0ce71af7 805 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 806 && t > strpos)))
6f207bd3 807 NOOP;
2c2d71f5
JH
808 else
809 t = strpos;
1aa99e6b 810 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
811 if (t < other_last) /* These positions already checked */
812 t = other_last;
1aa99e6b 813 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
814 if (last < last1)
815 last1 = last;
1de06328
YO
816 /* XXXX It is not documented what units *_offsets are in.
817 We assume bytes, but this is clearly wrong.
818 Meaning this code needs to be carefully reviewed for errors.
819 dmq.
820 */
821
2c2d71f5 822 /* On end-of-str: see comment below. */
f2ed9b32 823 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
824 if (must == &PL_sv_undef) {
825 s = (char*)NULL;
1de06328 826 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
827 }
828 else
829 s = fbm_instr(
830 (unsigned char*)t,
831 HOP3(HOP3(last1, prog->anchored_offset, strend)
832 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
833 must,
7fba1cd6 834 multiline ? FBMrf_MULTILINE : 0
33b8afdf 835 );
ab3bbdeb 836 DEBUG_EXECUTE_r({
f2ed9b32 837 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
838 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
839 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 840 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
841 quoted, RE_SV_TAIL(must));
842 });
843
844
2c2d71f5
JH
845 if (!s) {
846 if (last1 >= last2) {
a3621e74 847 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
848 ", giving up...\n"));
849 goto fail_finish;
850 }
a3621e74 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 852 ", trying floating at offset %ld...\n",
be8e71aa 853 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
854 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
855 s = HOP3c(last, 1, strend);
2c2d71f5
JH
856 goto restart;
857 }
858 else {
a3621e74 859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 860 (long)(s - i_strpos)));
1aa99e6b
IH
861 t = HOP3c(s, -prog->anchored_offset, strbeg);
862 other_last = HOP3c(s, 1, strend);
be8e71aa 863 s = saved_s;
2c2d71f5
JH
864 if (t == strpos)
865 goto try_at_start;
2c2d71f5
JH
866 goto try_at_offset;
867 }
30944b6d 868 }
2c2d71f5
JH
869 }
870 else { /* Take into account the floating substring. */
33b8afdf 871 char *last, *last1;
be8e71aa 872 char * const saved_s = s;
33b8afdf
JH
873 SV* must;
874
875 t = HOP3c(s, -start_shift, strbeg);
876 last1 = last =
877 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
878 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
879 last = HOP3c(t, prog->float_max_offset, strend);
880 s = HOP3c(t, prog->float_min_offset, strend);
881 if (s < other_last)
882 s = other_last;
2c2d71f5 883 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 884 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
885 /* fbm_instr() takes into account exact value of end-of-str
886 if the check is SvTAIL(ed). Since false positives are OK,
887 and end-of-str is not later than strend we are OK. */
888 if (must == &PL_sv_undef) {
889 s = (char*)NULL;
1de06328 890 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
891 }
892 else
2c2d71f5 893 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
894 (unsigned char*)last + SvCUR(must)
895 - (SvTAIL(must)!=0),
7fba1cd6 896 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 897 DEBUG_EXECUTE_r({
f2ed9b32 898 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
899 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
900 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 901 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
902 quoted, RE_SV_TAIL(must));
903 });
33b8afdf
JH
904 if (!s) {
905 if (last1 == last) {
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
907 ", giving up...\n"));
908 goto fail_finish;
2c2d71f5 909 }
a3621e74 910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 911 ", trying anchored starting at offset %ld...\n",
be8e71aa 912 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
913 other_last = last;
914 s = HOP3c(t, 1, strend);
915 goto restart;
916 }
917 else {
a3621e74 918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
919 (long)(s - i_strpos)));
920 other_last = s; /* Fix this later. --Hugo */
be8e71aa 921 s = saved_s;
33b8afdf
JH
922 if (t == strpos)
923 goto try_at_start;
924 goto try_at_offset;
925 }
2c2d71f5 926 }
cad2e5aa 927 }
2c2d71f5 928
1de06328 929
9ef43ace 930 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 931
6bda09f9 932 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
933 PerlIO_printf(Perl_debug_log,
934 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
935 (IV)prog->check_offset_min,
936 (IV)prog->check_offset_max,
937 (IV)(s-strpos),
938 (IV)(t-strpos),
939 (IV)(t-s),
940 (IV)(strend-strpos)
941 )
942 );
943
2c2d71f5 944 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 945 && (!utf8_target
9ef43ace 946 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
947 && t > strpos)))
948 {
2c2d71f5
JH
949 /* Fixed substring is found far enough so that the match
950 cannot start at strpos. */
951 try_at_offset:
cad2e5aa 952 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
953 /* Eventually fbm_*() should handle this, but often
954 anchored_offset is not 0, so this check will not be wasted. */
955 /* XXXX In the code below we prefer to look for "^" even in
956 presence of anchored substrings. And we search even
957 beyond the found float position. These pessimizations
958 are historical artefacts only. */
959 find_anchor:
2c2d71f5 960 while (t < strend - prog->minlen) {
cad2e5aa 961 if (*t == '\n') {
4ee3650e 962 if (t < check_at - prog->check_offset_min) {
f2ed9b32 963 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
964 /* Since we moved from the found position,
965 we definitely contradict the found anchored
30944b6d
IZ
966 substr. Due to the above check we do not
967 contradict "check" substr.
968 Thus we can arrive here only if check substr
969 is float. Redo checking for "other"=="fixed".
970 */
9041c2e3 971 strpos = t + 1;
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 973 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
974 goto do_other_anchored;
975 }
4ee3650e
GS
976 /* We don't contradict the found floating substring. */
977 /* XXXX Why not check for STCLASS? */
cad2e5aa 978 s = t + 1;
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 980 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
981 goto set_useful;
982 }
4ee3650e
GS
983 /* Position contradicts check-string */
984 /* XXXX probably better to look for check-string
985 than for "\n", so one should lower the limit for t? */
a3621e74 986 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 987 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 988 other_last = strpos = s = t + 1;
cad2e5aa
JH
989 goto restart;
990 }
991 t++;
992 }
a3621e74 993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 994 PL_colors[0], PL_colors[1]));
2c2d71f5 995 goto fail_finish;
cad2e5aa 996 }
f5952150 997 else {
a3621e74 998 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 999 PL_colors[0], PL_colors[1]));
f5952150 1000 }
cad2e5aa
JH
1001 s = t;
1002 set_useful:
f2ed9b32 1003 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1004 }
1005 else {
f5952150 1006 /* The found string does not prohibit matching at strpos,
2c2d71f5 1007 - no optimization of calling REx engine can be performed,
f5952150
GS
1008 unless it was an MBOL and we are not after MBOL,
1009 or a future STCLASS check will fail this. */
2c2d71f5
JH
1010 try_at_start:
1011 /* Even in this situation we may use MBOL flag if strpos is offset
1012 wrt the start of the string. */
05b4157f 1013 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1014 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1015 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1016 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1017 {
cad2e5aa
JH
1018 t = strpos;
1019 goto find_anchor;
1020 }
a3621e74 1021 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1022 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1023 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1024 );
2c2d71f5 1025 success_at_start:
bbe252da 1026 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1027 && (utf8_target ? (
33b8afdf
JH
1028 prog->check_utf8 /* Could be deleted already */
1029 && --BmUSEFUL(prog->check_utf8) < 0
1030 && (prog->check_utf8 == prog->float_utf8)
1031 ) : (
1032 prog->check_substr /* Could be deleted already */
1033 && --BmUSEFUL(prog->check_substr) < 0
1034 && (prog->check_substr == prog->float_substr)
1035 )))
66e933ab 1036 {
cad2e5aa 1037 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1038 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1039 /* XXX Does the destruction order has to change with utf8_target? */
1040 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1041 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1042 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1043 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1044 check = NULL; /* abort */
cad2e5aa 1045 s = strpos;
486ec47a 1046 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1047 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1048 if (prog->intflags & PREGf_IMPLICIT)
1049 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1050 /* XXXX This is a remnant of the old implementation. It
1051 looks wasteful, since now INTUIT can use many
6eb5f6b9 1052 other heuristics. */
bbe252da 1053 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1054 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1055 }
1056 else
1057 s = strpos;
1058 }
1059
6eb5f6b9
JH
1060 /* Last resort... */
1061 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1062 /* trie stclasses are too expensive to use here, we are better off to
1063 leave it to regmatch itself */
f8fc2ecf 1064 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1065 /* minlen == 0 is possible if regstclass is \b or \B,
1066 and the fixed substr is ''$.
1067 Since minlen is already taken into account, s+1 is before strend;
1068 accidentally, minlen >= 1 guaranties no false positives at s + 1
1069 even for \b or \B. But (minlen? 1 : 0) below assumes that
1070 regstclass does not come from lookahead... */
1071 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1072 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1073 const U8* const str = (U8*)STRING(progi->regstclass);
1074 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1075 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1076 : 1);
1de06328
YO
1077 char * endpos;
1078 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1079 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1080 else if (prog->float_substr || prog->float_utf8)
1081 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1082 else
1083 endpos= strend;
1084
d8080198
YO
1085 if (checked_upto < s)
1086 checked_upto = s;
1087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1088 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1089
6eb5f6b9 1090 t = s;
d8080198
YO
1091 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1092 if (s) {
1093 checked_upto = s;
1094 } else {
6eb5f6b9 1095#ifdef DEBUGGING
cbbf8932 1096 const char *what = NULL;
6eb5f6b9
JH
1097#endif
1098 if (endpos == strend) {
a3621e74 1099 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1100 "Could not match STCLASS...\n") );
1101 goto fail;
1102 }
a3621e74 1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1104 "This position contradicts STCLASS...\n") );
bbe252da 1105 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1106 goto fail;
d8080198
YO
1107 checked_upto = HOPBACKc(endpos, start_shift);
1108 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1109 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1110 /* Contradict one of substrings */
33b8afdf 1111 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1112 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1113 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1114 hop_and_restart:
1aa99e6b 1115 s = HOP3c(t, 1, strend);
66e933ab
GS
1116 if (s + start_shift + end_shift > strend) {
1117 /* XXXX Should be taken into account earlier? */
a3621e74 1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1119 "Could not match STCLASS...\n") );
1120 goto fail;
1121 }
5e39e1e5
HS
1122 if (!check)
1123 goto giveup;
a3621e74 1124 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1125 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1126 what, (long)(s + start_shift - i_strpos)) );
1127 goto restart;
1128 }
66e933ab 1129 /* Have both, check_string is floating */
6eb5f6b9
JH
1130 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131 goto retry_floating_check;
1132 /* Recheck anchored substring, but not floating... */
9041c2e3 1133 s = check_at;
5e39e1e5
HS
1134 if (!check)
1135 goto giveup;
a3621e74 1136 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1137 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1138 (long)(other_last - i_strpos)) );
1139 goto do_other_anchored;
1140 }
60e71179
GS
1141 /* Another way we could have checked stclass at the
1142 current position only: */
1143 if (ml_anch) {
1144 s = t = t + 1;
5e39e1e5
HS
1145 if (!check)
1146 goto giveup;
a3621e74 1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1148 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1149 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1150 goto try_at_offset;
66e933ab 1151 }
f2ed9b32 1152 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1153 goto fail;
486ec47a 1154 /* Check is floating substring. */
6eb5f6b9
JH
1155 retry_floating_check:
1156 t = check_at - start_shift;
a3621e74 1157 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1158 goto hop_and_restart;
1159 }
b7953727 1160 if (t != s) {
a3621e74 1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1162 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1163 (long)(t - i_strpos), (long)(s - i_strpos))
1164 );
1165 }
1166 else {
a3621e74 1167 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1168 "Does not contradict STCLASS...\n");
1169 );
1170 }
6eb5f6b9 1171 }
5e39e1e5 1172 giveup:
a3621e74 1173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1174 PL_colors[4], (check ? "Guessed" : "Giving up"),
1175 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1176 return s;
2c2d71f5
JH
1177
1178 fail_finish: /* Substring not found */
33b8afdf 1179 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1180 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1181 fail:
a3621e74 1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1183 PL_colors[4], PL_colors[5]));
bd61b366 1184 return NULL;
cad2e5aa 1185}
9661b544 1186
a0a388a1
YO
1187#define DECL_TRIE_TYPE(scan) \
1188 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1189 trie_type = ((scan->flags == EXACT) \
1190 ? (utf8_target ? trie_utf8 : trie_plain) \
1191 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1192
1193#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1194uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1195 STRLEN skiplen; \
1196 switch (trie_type) { \
1197 case trie_utf8_fold: \
1198 if ( foldlen>0 ) { \
1199 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1200 foldlen -= len; \
1201 uscan += len; \
1202 len=0; \
1203 } else { \
1204 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1205 len = UTF8SKIP(uc); \
1206 skiplen = UNISKIP( uvc ); \
1207 foldlen -= skiplen; \
1208 uscan = foldbuf + skiplen; \
1209 } \
1210 break; \
1211 case trie_latin_utf8_fold: \
1212 if ( foldlen>0 ) { \
1213 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1214 foldlen -= len; \
1215 uscan += len; \
1216 len=0; \
1217 } else { \
1218 len = 1; \
1219 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1220 skiplen = UNISKIP( uvc ); \
1221 foldlen -= skiplen; \
1222 uscan = foldbuf + skiplen; \
1223 } \
1224 break; \
1225 case trie_utf8: \
1226 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1227 break; \
1228 case trie_plain: \
1229 uvc = (UV)*uc; \
1230 len = 1; \
1231 } \
1232 if (uvc < 256) { \
1233 charid = trie->charmap[ uvc ]; \
1234 } \
1235 else { \
1236 charid = 0; \
1237 if (widecharmap) { \
1238 SV** const svpp = hv_fetch(widecharmap, \
1239 (char*)&uvc, sizeof(UV), 0); \
1240 if (svpp) \
1241 charid = (U16)SvIV(*svpp); \
1242 } \
1243 } \
4cadc6a9
YO
1244} STMT_END
1245
4cadc6a9
YO
1246#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1247STMT_START { \
1248 while (s <= e) { \
1249 if ( (CoNd) \
fac1af77 1250 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1251 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1252 goto got_it; \
1253 s++; \
1254 } \
1255} STMT_END
1256
1257#define REXEC_FBC_UTF8_SCAN(CoDe) \
1258STMT_START { \
1259 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1260 CoDe \
1261 s += uskip; \
1262 } \
1263} STMT_END
1264
1265#define REXEC_FBC_SCAN(CoDe) \
1266STMT_START { \
1267 while (s < strend) { \
1268 CoDe \
1269 s++; \
1270 } \
1271} STMT_END
1272
1273#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1274REXEC_FBC_UTF8_SCAN( \
1275 if (CoNd) { \
24b23f37 1276 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1277 goto got_it; \
1278 else \
1279 tmp = doevery; \
1280 } \
1281 else \
1282 tmp = 1; \
1283)
1284
1285#define REXEC_FBC_CLASS_SCAN(CoNd) \
1286REXEC_FBC_SCAN( \
1287 if (CoNd) { \
24b23f37 1288 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1289 goto got_it; \
1290 else \
1291 tmp = doevery; \
1292 } \
1293 else \
1294 tmp = 1; \
1295)
1296
1297#define REXEC_FBC_TRYIT \
24b23f37 1298if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1299 goto got_it
1300
e1d1eefb 1301#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1302 if (utf8_target) { \
e1d1eefb
YO
1303 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1304 } \
1305 else { \
1306 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1307 }
e1d1eefb 1308
4cadc6a9 1309#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1310 if (utf8_target) { \
4cadc6a9
YO
1311 UtFpReLoAd; \
1312 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1313 } \
1314 else { \
1315 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1316 }
4cadc6a9
YO
1317
1318#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1319 PL_reg_flags |= RF_tainted; \
f2ed9b32 1320 if (utf8_target) { \
4cadc6a9
YO
1321 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1322 } \
1323 else { \
1324 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1325 }
4cadc6a9 1326
786e8c11
YO
1327#define DUMP_EXEC_POS(li,s,doutf8) \
1328 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1329
cfaf538b
KW
1330
1331#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1332 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1333 tmp = TEST_NON_UTF8(tmp); \
1334 REXEC_FBC_UTF8_SCAN( \
1335 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1336 tmp = !tmp; \
1337 IF_SUCCESS; \
1338 } \
1339 else { \
1340 IF_FAIL; \
1341 } \
1342 ); \
1343
1344#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1345 if (s == PL_bostr) { \
1346 tmp = '\n'; \
1347 } \
1348 else { \
1349 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1350 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1351 } \
1352 tmp = TeSt1_UtF8; \
1353 LOAD_UTF8_CHARCLASS_ALNUM(); \
1354 REXEC_FBC_UTF8_SCAN( \
1355 if (tmp == ! (TeSt2_UtF8)) { \
1356 tmp = !tmp; \
1357 IF_SUCCESS; \
1358 } \
1359 else { \
1360 IF_FAIL; \
1361 } \
1362 ); \
1363
63ac0dad
KW
1364/* The only difference between the BOUND and NBOUND cases is that
1365 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1366 * NBOUND. This is accomplished by passing it in either the if or else clause,
1367 * with the other one being empty */
1368#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1369 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1370
1371#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1372 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1373
1374#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1375 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1376
1377#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1378 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1379
63ac0dad
KW
1380
1381/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1382 * be passed in completely with the variable name being tested, which isn't
1383 * such a clean interface, but this is easier to read than it was before. We
1384 * are looking for the boundary (or non-boundary between a word and non-word
1385 * character. The utf8 and non-utf8 cases have the same logic, but the details
1386 * must be different. Find the "wordness" of the character just prior to this
1387 * one, and compare it with the wordness of this one. If they differ, we have
1388 * a boundary. At the beginning of the string, pretend that the previous
1389 * character was a new-line */
cfaf538b 1390#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1391 if (utf8_target) { \
cfaf538b 1392 UTF8_CODE \
63ac0dad
KW
1393 } \
1394 else { /* Not utf8 */ \
1395 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1396 tmp = TEST_NON_UTF8(tmp); \
1397 REXEC_FBC_SCAN( \
1398 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1399 tmp = !tmp; \
1400 IF_SUCCESS; \
1401 } \
1402 else { \
1403 IF_FAIL; \
1404 } \
1405 ); \
1406 } \
1407 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1408 goto got_it;
1409
786e8c11
YO
1410/* We know what class REx starts with. Try to find this position... */
1411/* if reginfo is NULL, its a dryrun */
1412/* annoyingly all the vars in this routine have different names from their counterparts
1413 in regmatch. /grrr */
1414
3c3eec57 1415STATIC char *
07be1b83 1416S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1417 const char *strend, regmatch_info *reginfo)
a687059c 1418{
27da23d5 1419 dVAR;
bbe252da 1420 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1421 char *pat_string; /* The pattern's exactish string */
1422 char *pat_end; /* ptr to end char of pat_string */
1423 re_fold_t folder; /* Function for computing non-utf8 folds */
1424 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1425 STRLEN ln;
5dab1207 1426 STRLEN lnc;
eb578fdb 1427 STRLEN uskip;
fac1af77
KW
1428 U8 c1;
1429 U8 c2;
6eb5f6b9 1430 char *e;
eb578fdb
KW
1431 I32 tmp = 1; /* Scratch variable? */
1432 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1433 UV utf8_fold_flags = 0;
f8fc2ecf 1434 RXi_GET_DECL(prog,progi);
7918f24d
NC
1435
1436 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1437
6eb5f6b9
JH
1438 /* We know what class it must start with. */
1439 switch (OP(c)) {
f56b6394 1440 case ANYOFV:
6eb5f6b9 1441 case ANYOF:
f56b6394 1442 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1443 STRLEN inclasslen = strend - s;
1444 REXEC_FBC_UTF8_CLASS_SCAN(
1445 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1446 }
1447 else {
6ef69d56 1448 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1449 }
6eb5f6b9 1450 break;
f33976b4 1451 case CANY:
4cadc6a9 1452 REXEC_FBC_SCAN(
24b23f37 1453 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1454 goto got_it;
1455 else
1456 tmp = doevery;
4cadc6a9 1457 );
f33976b4 1458 break;
2f7f8cb1
KW
1459
1460 case EXACTFA:
1461 if (UTF_PATTERN || utf8_target) {
1462 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1463 goto do_exactf_utf8;
1464 }
1465 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1466 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1467 goto do_exactf_non_utf8; /* isn't dealt with by these */
1468
6eb5f6b9 1469 case EXACTF:
62bf7766 1470 if (utf8_target) {
77a6d856
KW
1471
1472 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1473 utf8_fold_flags = 0;
fac1af77
KW
1474 goto do_exactf_utf8;
1475 }
1476 fold_array = PL_fold;
1477 folder = foldEQ;
1478 goto do_exactf_non_utf8;
1479
1480 case EXACTFL:
1481 if (UTF_PATTERN || utf8_target) {
17580e7a 1482 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1483 goto do_exactf_utf8;
1484 }
1485 fold_array = PL_fold_locale;
1486 folder = foldEQ_locale;
16d951b7
KW
1487 goto do_exactf_non_utf8;
1488
3c760661
KW
1489 case EXACTFU_SS:
1490 if (UTF_PATTERN) {
1491 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1492 }
1493 goto do_exactf_utf8;
1494
fab2782b 1495 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1496 case EXACTFU:
1497 if (UTF_PATTERN || utf8_target) {
77a6d856 1498 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1499 goto do_exactf_utf8;
1500 }
1501
1502 /* Any 'ss' in the pattern should have been replaced by regcomp,
1503 * so we don't have to worry here about this single special case
1504 * in the Latin1 range */
1505 fold_array = PL_fold_latin1;
1506 folder = foldEQ_latin1;
fac1af77
KW
1507
1508 /* FALL THROUGH */
1509
62bf7766
KW
1510 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1511 are no glitches with fold-length differences
1512 between the target string and pattern */
fac1af77
KW
1513
1514 /* The idea in the non-utf8 EXACTF* cases is to first find the
1515 * first character of the EXACTF* node and then, if necessary,
1516 * case-insensitively compare the full text of the node. c1 is the
1517 * first character. c2 is its fold. This logic will not work for
1518 * Unicode semantics and the german sharp ss, which hence should
1519 * not be compiled into a node that gets here. */
1520 pat_string = STRING(c);
1521 ln = STR_LEN(c); /* length to match in octets/bytes */
1522
8a90a8fe
KW
1523 /* We know that we have to match at least 'ln' bytes (which is the
1524 * same as characters, since not utf8). If we have to match 3
1525 * characters, and there are only 2 availabe, we know without
1526 * trying that it will fail; so don't start a match past the
1527 * required minimum number from the far end */
fac1af77
KW
1528 e = HOP3c(strend, -((I32)ln), s);
1529
1530 if (!reginfo && e < s) {
1531 e = s; /* Due to minlen logic of intuit() */
1532 }
1533
1534 c1 = *pat_string;
1535 c2 = fold_array[c1];
1536 if (c1 == c2) { /* If char and fold are the same */
1537 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1538 }
1539 else {
1540 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1541 }
1542 break;
1543
1544 do_exactf_utf8:
e067297c
KW
1545 {
1546 unsigned expansion;
1547
fac1af77
KW
1548
1549 /* If one of the operands is in utf8, we can't use the simpler
1550 * folding above, due to the fact that many different characters
1551 * can have the same fold, or portion of a fold, or different-
1552 * length fold */
1553 pat_string = STRING(c);
1554 ln = STR_LEN(c); /* length to match in octets/bytes */
1555 pat_end = pat_string + ln;
1556 lnc = (UTF_PATTERN) /* length to match in characters */
1557 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1558 : ln;
1559
e067297c
KW
1560 /* We have 'lnc' characters to match in the pattern, but because of
1561 * multi-character folding, each character in the target can match
1562 * up to 3 characters (Unicode guarantees it will never exceed
1563 * this) if it is utf8-encoded; and up to 2 if not (based on the
1564 * fact that the Latin 1 folds are already determined, and the
1565 * only multi-char fold in that range is the sharp-s folding to
1566 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1567 * string character. Adjust lnc accordingly, rounding up, so that
1568 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1569 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1570 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1571
1572 /* As in the non-UTF8 case, if we have to match 3 characters, and
1573 * only 2 are left, it's guaranteed to fail, so don't start a
1574 * match that would require us to go beyond the end of the string
1575 */
1576 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1577
1578 if (!reginfo && e < s) {
1579 e = s; /* Due to minlen logic of intuit() */
1580 }
1581
b33105db
KW
1582 /* XXX Note that we could recalculate e to stop the loop earlier,
1583 * as the worst case expansion above will rarely be met, and as we
1584 * go along we would usually find that e moves further to the left.
1585 * This would happen only after we reached the point in the loop
1586 * where if there were no expansion we should fail. Unclear if
1587 * worth the expense */
e067297c 1588
fac1af77
KW
1589 while (s <= e) {
1590 char *my_strend= (char *)strend;
1591 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1592 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1593 && (!reginfo || regtry(reginfo, &s)) )
1594 {
1595 goto got_it;
1596 }
bbdd8bad 1597 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1598 }
1599 break;
e067297c 1600 }
bbce6d69 1601 case BOUNDL:
3280af22 1602 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1603 FBC_BOUND(isALNUM_LC,
1604 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1605 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1606 break;
bbce6d69 1607 case NBOUNDL:
3280af22 1608 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1609 FBC_NBOUND(isALNUM_LC,
1610 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1611 isALNUM_LC_utf8((U8*)s));
1612 break;
1613 case BOUND:
1614 FBC_BOUND(isWORDCHAR,
1615 isALNUM_uni(tmp),
1616 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1617 break;
cfaf538b
KW
1618 case BOUNDA:
1619 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1620 isWORDCHAR_A(tmp),
1621 isWORDCHAR_A((U8*)s));
1622 break;
a0d0e21e 1623 case NBOUND:
63ac0dad
KW
1624 FBC_NBOUND(isWORDCHAR,
1625 isALNUM_uni(tmp),
1626 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1627 break;
cfaf538b
KW
1628 case NBOUNDA:
1629 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1630 isWORDCHAR_A(tmp),
1631 isWORDCHAR_A((U8*)s));
1632 break;
63ac0dad
KW
1633 case BOUNDU:
1634 FBC_BOUND(isWORDCHAR_L1,
1635 isALNUM_uni(tmp),
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637 break;
1638 case NBOUNDU:
1639 FBC_NBOUND(isWORDCHAR_L1,
1640 isALNUM_uni(tmp),
1641 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1642 break;
bbce6d69 1643 case ALNUML:
4cadc6a9
YO
1644 REXEC_FBC_CSCAN_TAINT(
1645 isALNUM_LC_utf8((U8*)s),
1646 isALNUM_LC(*s)
1647 );
6895a8aa 1648 break;
980866de
KW
1649 case ALNUMU:
1650 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1651 LOAD_UTF8_CHARCLASS_ALNUM(),
1652 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1653 isWORDCHAR_L1((U8) *s)
1654 );
6895a8aa 1655 break;
980866de
KW
1656 case ALNUM:
1657 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1658 LOAD_UTF8_CHARCLASS_ALNUM(),
1659 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1660 isWORDCHAR((U8) *s)
1661 );
6895a8aa 1662 break;
cfaf538b 1663 case ALNUMA:
8e9da4d4
KW
1664 /* Don't need to worry about utf8, as it can match only a single
1665 * byte invariant character */
cfaf538b 1666 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1667 break;
980866de
KW
1668 case NALNUMU:
1669 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1670 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1671 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1672 ! isWORDCHAR_L1((U8) *s)
1673 );
6895a8aa 1674 break;
a0d0e21e 1675 case NALNUM:
4cadc6a9 1676 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1677 LOAD_UTF8_CHARCLASS_ALNUM(),
1678 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1679 ! isALNUM(*s)
4cadc6a9 1680 );
6895a8aa 1681 break;
cfaf538b 1682 case NALNUMA:
8e9da4d4
KW
1683 REXEC_FBC_CSCAN(
1684 !isWORDCHAR_A(*s),
1685 !isWORDCHAR_A(*s)
1686 );
1687 break;
bbce6d69 1688 case NALNUML:
4cadc6a9
YO
1689 REXEC_FBC_CSCAN_TAINT(
1690 !isALNUM_LC_utf8((U8*)s),
1691 !isALNUM_LC(*s)
1692 );
6895a8aa 1693 break;
980866de
KW
1694 case SPACEU:
1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1698 isSPACE_L1((U8) *s)
1699 );
6895a8aa 1700 break;
a0d0e21e 1701 case SPACE:
4cadc6a9 1702 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1703 LOAD_UTF8_CHARCLASS_SPACE(),
1704 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1705 isSPACE((U8) *s)
4cadc6a9 1706 );
6895a8aa 1707 break;
cfaf538b 1708 case SPACEA:
8e9da4d4
KW
1709 /* Don't need to worry about utf8, as it can match only a single
1710 * byte invariant character */
cfaf538b 1711 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1712 break;
bbce6d69 1713 case SPACEL:
4cadc6a9 1714 REXEC_FBC_CSCAN_TAINT(
6bbba904 1715 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1716 isSPACE_LC(*s)
1717 );
6895a8aa 1718 break;
980866de
KW
1719 case NSPACEU:
1720 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1721 LOAD_UTF8_CHARCLASS_SPACE(),
1722 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1723 ! isSPACE_L1((U8) *s)
1724 );
6895a8aa 1725 break;
a0d0e21e 1726 case NSPACE:
4cadc6a9 1727 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1728 LOAD_UTF8_CHARCLASS_SPACE(),
1729 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1730 ! isSPACE((U8) *s)
4cadc6a9 1731 );
6895a8aa 1732 break;
cfaf538b 1733 case NSPACEA:
8e9da4d4
KW
1734 REXEC_FBC_CSCAN(
1735 !isSPACE_A(*s),
1736 !isSPACE_A(*s)
1737 );
1738 break;
bbce6d69 1739 case NSPACEL:
4cadc6a9 1740 REXEC_FBC_CSCAN_TAINT(
6bbba904 1741 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1742 !isSPACE_LC(*s)
1743 );
6895a8aa 1744 break;
a0d0e21e 1745 case DIGIT:
4cadc6a9 1746 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1747 LOAD_UTF8_CHARCLASS_DIGIT(),
1748 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1749 isDIGIT(*s)
1750 );
6895a8aa 1751 break;
cfaf538b 1752 case DIGITA:
8e9da4d4
KW
1753 /* Don't need to worry about utf8, as it can match only a single
1754 * byte invariant character */
cfaf538b 1755 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1756 break;
b8c5462f 1757 case DIGITL:
4cadc6a9
YO
1758 REXEC_FBC_CSCAN_TAINT(
1759 isDIGIT_LC_utf8((U8*)s),
1760 isDIGIT_LC(*s)
1761 );
6895a8aa 1762 break;
a0d0e21e 1763 case NDIGIT:
4cadc6a9 1764 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1765 LOAD_UTF8_CHARCLASS_DIGIT(),
1766 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1767 !isDIGIT(*s)
1768 );
6895a8aa 1769 break;
cfaf538b 1770 case NDIGITA:
8e9da4d4
KW
1771 REXEC_FBC_CSCAN(
1772 !isDIGIT_A(*s),
1773 !isDIGIT_A(*s)
1774 );
1775 break;
b8c5462f 1776 case NDIGITL:
4cadc6a9
YO
1777 REXEC_FBC_CSCAN_TAINT(
1778 !isDIGIT_LC_utf8((U8*)s),
1779 !isDIGIT_LC(*s)
1780 );
6895a8aa 1781 break;
e1d1eefb
YO
1782 case LNBREAK:
1783 REXEC_FBC_CSCAN(
1784 is_LNBREAK_utf8(s),
1785 is_LNBREAK_latin1(s)
1786 );
6895a8aa 1787 break;
e1d1eefb
YO
1788 case VERTWS:
1789 REXEC_FBC_CSCAN(
1790 is_VERTWS_utf8(s),
1791 is_VERTWS_latin1(s)
1792 );
6895a8aa 1793 break;
e1d1eefb
YO
1794 case NVERTWS:
1795 REXEC_FBC_CSCAN(
1796 !is_VERTWS_utf8(s),
1797 !is_VERTWS_latin1(s)
1798 );
6895a8aa 1799 break;
e1d1eefb
YO
1800 case HORIZWS:
1801 REXEC_FBC_CSCAN(
1802 is_HORIZWS_utf8(s),
1803 is_HORIZWS_latin1(s)
1804 );
6895a8aa 1805 break;
e1d1eefb
YO
1806 case NHORIZWS:
1807 REXEC_FBC_CSCAN(
1808 !is_HORIZWS_utf8(s),
1809 !is_HORIZWS_latin1(s)
1810 );
6895a8aa 1811 break;
0658cdde
KW
1812 case POSIXA:
1813 /* Don't need to worry about utf8, as it can match only a single
1814 * byte invariant character. The flag in this node type is the
1815 * class number to pass to _generic_isCC() to build a mask for
1816 * searching in PL_charclass[] */
1817 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1818 break;
1819 case NPOSIXA:
1820 REXEC_FBC_CSCAN(
1821 !_generic_isCC_A(*s, FLAGS(c)),
1822 !_generic_isCC_A(*s, FLAGS(c))
1823 );
1824 break;
1825
1de06328
YO
1826 case AHOCORASICKC:
1827 case AHOCORASICK:
07be1b83 1828 {
a0a388a1 1829 DECL_TRIE_TYPE(c);
07be1b83
YO
1830 /* what trie are we using right now */
1831 reg_ac_data *aho
f8fc2ecf 1832 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1833 reg_trie_data *trie
1834 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1835 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1836
1837 const char *last_start = strend - trie->minlen;
6148ee25 1838#ifdef DEBUGGING
07be1b83 1839 const char *real_start = s;
6148ee25 1840#endif
07be1b83 1841 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1842 SV *sv_points;
1843 U8 **points; /* map of where we were in the input string
786e8c11 1844 when reading a given char. For ASCII this
be8e71aa 1845 is unnecessary overhead as the relationship
38a44b82
NC
1846 is always 1:1, but for Unicode, especially
1847 case folded Unicode this is not true. */
f9e705e8 1848 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1849 U8 *bitmap=NULL;
1850
07be1b83
YO
1851
1852 GET_RE_DEBUG_FLAGS_DECL;
1853
be8e71aa
YO
1854 /* We can't just allocate points here. We need to wrap it in
1855 * an SV so it gets freed properly if there is a croak while
1856 * running the match */
1857 ENTER;
1858 SAVETMPS;
1859 sv_points=newSV(maxlen * sizeof(U8 *));
1860 SvCUR_set(sv_points,
1861 maxlen * sizeof(U8 *));
1862 SvPOK_on(sv_points);
1863 sv_2mortal(sv_points);
1864 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1865 if ( trie_type != trie_utf8_fold
1866 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1867 {
786e8c11
YO
1868 if (trie->bitmap)
1869 bitmap=(U8*)trie->bitmap;
1870 else
1871 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1872 }
786e8c11
YO
1873 /* this is the Aho-Corasick algorithm modified a touch
1874 to include special handling for long "unknown char"
1875 sequences. The basic idea being that we use AC as long
1876 as we are dealing with a possible matching char, when
1877 we encounter an unknown char (and we have not encountered
1878 an accepting state) we scan forward until we find a legal
1879 starting char.
1880 AC matching is basically that of trie matching, except
1881 that when we encounter a failing transition, we fall back
1882 to the current states "fail state", and try the current char
1883 again, a process we repeat until we reach the root state,
1884 state 1, or a legal transition. If we fail on the root state
1885 then we can either terminate if we have reached an accepting
1886 state previously, or restart the entire process from the beginning
1887 if we have not.
1888
1889 */
07be1b83
YO
1890 while (s <= last_start) {
1891 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1892 U8 *uc = (U8*)s;
1893 U16 charid = 0;
1894 U32 base = 1;
1895 U32 state = 1;
1896 UV uvc = 0;
1897 STRLEN len = 0;
1898 STRLEN foldlen = 0;
1899 U8 *uscan = (U8*)NULL;
1900 U8 *leftmost = NULL;
786e8c11
YO
1901#ifdef DEBUGGING
1902 U32 accepted_word= 0;
1903#endif
07be1b83
YO
1904 U32 pointpos = 0;
1905
1906 while ( state && uc <= (U8*)strend ) {
1907 int failed=0;
786e8c11
YO
1908 U32 word = aho->states[ state ].wordnum;
1909
1de06328
YO
1910 if( state==1 ) {
1911 if ( bitmap ) {
1912 DEBUG_TRIE_EXECUTE_r(
1913 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1914 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1915 (char *)uc, utf8_target );
1de06328
YO
1916 PerlIO_printf( Perl_debug_log,
1917 " Scanning for legal start char...\n");
1918 }
d085b490
YO
1919 );
1920 if (utf8_target) {
1921 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1922 uc += UTF8SKIP(uc);
1923 }
1924 } else {
1925 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926 uc++;
1927 }
1928 }
1de06328 1929 s= (char *)uc;
786e8c11 1930 }
786e8c11
YO
1931 if (uc >(U8*)last_start) break;
1932 }
1933
1934 if ( word ) {
2e64971a 1935 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1936 if (!leftmost || lpos < leftmost) {
1937 DEBUG_r(accepted_word=word);
07be1b83 1938 leftmost= lpos;
786e8c11 1939 }
07be1b83 1940 if (base==0) break;
786e8c11 1941
07be1b83
YO
1942 }
1943 points[pointpos++ % maxlen]= uc;
55eed653
NC
1944 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1945 uscan, len, uvc, charid, foldlen,
1946 foldbuf, uniflags);
786e8c11
YO
1947 DEBUG_TRIE_EXECUTE_r({
1948 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1949 s, utf8_target );
07be1b83 1950 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1951 " Charid:%3u CP:%4"UVxf" ",
1952 charid, uvc);
1953 });
07be1b83
YO
1954
1955 do {
6148ee25 1956#ifdef DEBUGGING
786e8c11 1957 word = aho->states[ state ].wordnum;
6148ee25 1958#endif
07be1b83
YO
1959 base = aho->states[ state ].trans.base;
1960
786e8c11
YO
1961 DEBUG_TRIE_EXECUTE_r({
1962 if (failed)
1963 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1964 s, utf8_target );
07be1b83 1965 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1966 "%sState: %4"UVxf", word=%"UVxf,
1967 failed ? " Fail transition to " : "",
1968 (UV)state, (UV)word);
1969 });
07be1b83
YO
1970 if ( base ) {
1971 U32 tmp;
6dd2be57 1972 I32 offset;
07be1b83 1973 if (charid &&
6dd2be57
DM
1974 ( ((offset = base + charid
1975 - 1 - trie->uniquecharcount)) >= 0)
1976 && ((U32)offset < trie->lasttrans)
1977 && trie->trans[offset].check == state
1978 && (tmp=trie->trans[offset].next))
07be1b83 1979 {
786e8c11
YO
1980 DEBUG_TRIE_EXECUTE_r(
1981 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1982 state = tmp;
1983 break;
1984 }
1985 else {
786e8c11
YO
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log," - fail\n"));
1988 failed = 1;
1989 state = aho->fail[state];
07be1b83
YO
1990 }
1991 }
1992 else {
1993 /* we must be accepting here */
786e8c11
YO
1994 DEBUG_TRIE_EXECUTE_r(
1995 PerlIO_printf( Perl_debug_log," - accepting\n"));
1996 failed = 1;
07be1b83
YO
1997 break;
1998 }
1999 } while(state);
786e8c11 2000 uc += len;
07be1b83
YO
2001 if (failed) {
2002 if (leftmost)
2003 break;
786e8c11 2004 if (!state) state = 1;
07be1b83
YO
2005 }
2006 }
2007 if ( aho->states[ state ].wordnum ) {
2e64971a 2008 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2009 if (!leftmost || lpos < leftmost) {
2010 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2011 leftmost = lpos;
786e8c11 2012 }
07be1b83 2013 }
07be1b83
YO
2014 if (leftmost) {
2015 s = (char*)leftmost;
786e8c11
YO
2016 DEBUG_TRIE_EXECUTE_r({
2017 PerlIO_printf(
70685ca0
JH
2018 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2019 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2020 );
2021 });
24b23f37 2022 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2023 FREETMPS;
2024 LEAVE;
07be1b83 2025 goto got_it;
be8e71aa 2026 }
07be1b83 2027 s = HOPc(s,1);
786e8c11
YO
2028 DEBUG_TRIE_EXECUTE_r({
2029 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2030 });
07be1b83 2031 } else {
786e8c11
YO
2032 DEBUG_TRIE_EXECUTE_r(
2033 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2034 break;
2035 }
2036 }
be8e71aa
YO
2037 FREETMPS;
2038 LEAVE;
07be1b83
YO
2039 }
2040 break;
b3c9acc1 2041 default:
3c3eec57
GS
2042 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2043 break;
d6a28714 2044 }
6eb5f6b9
JH
2045 return 0;
2046 got_it:
2047 return s;
2048}
2049
fae667d5 2050
6eb5f6b9
JH
2051/*
2052 - regexec_flags - match a regexp against a string
2053 */
2054I32
288b8c02 2055Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2056 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2057/* stringarg: the point in the string at which to begin matching */
2058/* strend: pointer to null at end of string */
2059/* strbeg: real beginning of string */
2060/* minend: end of match must be >= minend bytes after stringarg. */
2061/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2062 * itself is accessed via the pointers above */
2063/* data: May be used for some additional optimizations.
2064 Currently its only used, with a U32 cast, for transmitting
2065 the ganch offset when doing a /g match. This will change */
2066/* nosave: For optimizations. */
2067
6eb5f6b9 2068{
97aff369 2069 dVAR;
288b8c02 2070 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2071 /*register*/ char *s;
eb578fdb 2072 regnode *c;
24b23f37 2073 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2074 I32 minlen; /* must match at least this many chars */
2075 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2076 I32 end_shift = 0; /* Same for the end. */ /* CC */
2077 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2078 char *scream_olds = NULL;
f2ed9b32 2079 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2080 I32 multiline;
f8fc2ecf 2081 RXi_GET_DECL(prog,progi);
3b0527fe 2082 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2083 regexp_paren_pair *swap = NULL;
a3621e74
YO
2084 GET_RE_DEBUG_FLAGS_DECL;
2085
7918f24d 2086 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2087 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2088
2089 /* Be paranoid... */
2090 if (prog == NULL || startpos == NULL) {
2091 Perl_croak(aTHX_ "NULL regexp parameter");
2092 return 0;
2093 }
2094
bbe252da 2095 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2096 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2097
f2ed9b32 2098 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2099 DEBUG_EXECUTE_r(
f2ed9b32 2100 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2101 "Matching");
2102 );
bac06658 2103
6eb5f6b9 2104 minlen = prog->minlen;
1de06328
YO
2105
2106 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2107 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2108 "String too short [regexec_flags]...\n"));
2109 goto phooey;
1aa99e6b 2110 }
6eb5f6b9 2111
1de06328 2112
6eb5f6b9 2113 /* Check validity of program. */
f8fc2ecf 2114 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2115 Perl_croak(aTHX_ "corrupted regexp program");
2116 }
2117
2118 PL_reg_flags = 0;
ed301438 2119 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2120 PL_reg_maxiter = 0;
2121
3c8556c3 2122 if (RX_UTF8(rx))
6eb5f6b9
JH
2123 PL_reg_flags |= RF_utf8;
2124
2125 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2126 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2127 PL_bostr = strbeg;
3b0527fe 2128 reginfo.sv = sv;
6eb5f6b9
JH
2129
2130 /* Mark end of line for $ (and such) */
2131 PL_regeol = strend;
2132
2133 /* see how far we have to get to not match where we matched before */
3b0527fe 2134 reginfo.till = startpos+minend;
6eb5f6b9 2135
6eb5f6b9
JH
2136 /* If there is a "must appear" string, look for it. */
2137 s = startpos;
2138
bbe252da 2139 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2140 MAGIC *mg;
2c296965 2141 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2142 reginfo.ganch = startpos + prog->gofs;
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2145 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2146 && SvMAGIC(sv)
14befaf4
DM
2147 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2148 && mg->mg_len >= 0) {
3b0527fe 2149 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2152
bbe252da 2153 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2154 if (s > reginfo.ganch)
6eb5f6b9 2155 goto phooey;
58e23c8d 2156 s = reginfo.ganch - prog->gofs;
2c296965 2157 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2158 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2159 if (s < strbeg)
2160 goto phooey;
6eb5f6b9
JH
2161 }
2162 }
58e23c8d 2163 else if (data) {
70685ca0 2164 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2165 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2166 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2167
2168 } else { /* pos() not defined */
3b0527fe 2169 reginfo.ganch = strbeg;
2c296965
YO
2170 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2171 "GPOS: reginfo.ganch = strbeg\n"));
2172 }
6eb5f6b9 2173 }
288b8c02 2174 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2175 /* We have to be careful. If the previous successful match
2176 was from this regex we don't want a subsequent partially
2177 successful match to clobber the old results.
2178 So when we detect this possibility we add a swap buffer
2179 to the re, and switch the buffer each match. If we fail
2180 we switch it back, otherwise we leave it swapped.
2181 */
2182 swap = prog->offs;
2183 /* do we need a save destructor here for eval dies? */
2184 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2185 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2186 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2187 PTR2UV(prog),
2188 PTR2UV(swap),
2189 PTR2UV(prog->offs)
2190 ));
c74340f9 2191 }
a0714e2c 2192 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2193 re_scream_pos_data d;
2194
2195 d.scream_olds = &scream_olds;
2196 d.scream_pos = &scream_pos;
288b8c02 2197 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2198 if (!s) {
a3621e74 2199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2200 goto phooey; /* not present */
3fa9c3d7 2201 }
6eb5f6b9
JH
2202 }
2203
1de06328 2204
6eb5f6b9
JH
2205
2206 /* Simplest case: anchored match need be tried only once. */
2207 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2208 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2209 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2210 goto got_it;
bbe252da
YO
2211 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2212 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2213 {
2214 char *end;
2215
2216 if (minlen)
2217 dontbother = minlen - 1;
1aa99e6b 2218 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2219 /* for multiline we only have to try after newlines */
33b8afdf 2220 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2221 /* because of the goto we can not easily reuse the macros for bifurcating the
2222 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2223 if (utf8_target) {
2224 if (s == startpos)
2225 goto after_try_utf8;
2226 while (1) {
2227 if (regtry(&reginfo, &s)) {
2228 goto got_it;
2229 }
2230 after_try_utf8:
2231 if (s > end) {
2232 goto phooey;
2233 }
2234 if (prog->extflags & RXf_USE_INTUIT) {
2235 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2236 if (!s) {
2237 goto phooey;
2238 }
2239 }
2240 else {
2241 s += UTF8SKIP(s);
2242 }
2243 }
2244 } /* end search for check string in unicode */
2245 else {
2246 if (s == startpos) {
2247 goto after_try_latin;
2248 }
2249 while (1) {
2250 if (regtry(&reginfo, &s)) {
2251 goto got_it;
2252 }
2253 after_try_latin:
2254 if (s > end) {
2255 goto phooey;
2256 }
2257 if (prog->extflags & RXf_USE_INTUIT) {
2258 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2259 if (!s) {
2260 goto phooey;
2261 }
2262 }
2263 else {
2264 s++;
2265 }
2266 }
2267 } /* end search for check string in latin*/
2268 } /* end search for check string */
2269 else { /* search for newline */
2270 if (s > startpos) {
2271 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2272 s--;
92f3d482 2273 }
21eede78
YO
2274 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2275 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2276 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2277 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2278 goto got_it;
2279 }
92f3d482
YO
2280 }
2281 } /* end search for newline */
2282 } /* end anchored/multiline check string search */
6eb5f6b9 2283 goto phooey;
bbe252da 2284 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2285 {
486ec47a 2286 /* the warning about reginfo.ganch being used without initialization
bbe252da 2287 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2288 and we only enter this block when the same bit is set. */
58e23c8d 2289 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2290
2291 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2292 goto got_it;
2293 goto phooey;
2294 }
2295
2296 /* Messy cases: unanchored match. */
bbe252da 2297 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2298 /* we have /x+whatever/ */
f2ed9b32 2299 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2300 char ch;
bf93d4cc
GS
2301#ifdef DEBUGGING
2302 int did_match = 0;
2303#endif
f2ed9b32
KW
2304 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2305 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2306 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2307
f2ed9b32 2308 if (utf8_target) {
4cadc6a9 2309 REXEC_FBC_SCAN(
6eb5f6b9 2310 if (*s == ch) {
a3621e74 2311 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2312 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2313 s += UTF8SKIP(s);
2314 while (s < strend && *s == ch)
2315 s += UTF8SKIP(s);
2316 }
4cadc6a9 2317 );
6eb5f6b9
JH
2318 }
2319 else {
4cadc6a9 2320 REXEC_FBC_SCAN(
6eb5f6b9 2321 if (*s == ch) {
a3621e74 2322 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2323 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2324 s++;
2325 while (s < strend && *s == ch)
2326 s++;
2327 }
4cadc6a9 2328 );
6eb5f6b9 2329 }
a3621e74 2330 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2331 PerlIO_printf(Perl_debug_log,
b7953727
JH
2332 "Did not find anchored character...\n")
2333 );
6eb5f6b9 2334 }
a0714e2c
SS
2335 else if (prog->anchored_substr != NULL
2336 || prog->anchored_utf8 != NULL
2337 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2338 && prog->float_max_offset < strend - s)) {
2339 SV *must;
2340 I32 back_max;
2341 I32 back_min;
2342 char *last;
6eb5f6b9 2343 char *last1; /* Last position checked before */
bf93d4cc
GS
2344#ifdef DEBUGGING
2345 int did_match = 0;
2346#endif
33b8afdf 2347 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2348 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2349 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2350 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2351 back_max = back_min = prog->anchored_offset;
2352 } else {
f2ed9b32
KW
2353 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2354 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2355 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2356 back_max = prog->float_max_offset;
2357 back_min = prog->float_min_offset;
2358 }
1de06328
YO
2359
2360
33b8afdf
JH
2361 if (must == &PL_sv_undef)
2362 /* could not downgrade utf8 check substring, so must fail */
2363 goto phooey;
2364
1de06328
YO
2365 if (back_min<0) {
2366 last = strend;
2367 } else {
2368 last = HOP3c(strend, /* Cannot start after this */
2369 -(I32)(CHR_SVLEN(must)
2370 - (SvTAIL(must) != 0) + back_min), strbeg);
2371 }
6eb5f6b9
JH
2372 if (s > PL_bostr)
2373 last1 = HOPc(s, -1);
2374 else
2375 last1 = s - 1; /* bogus */
2376
a0288114 2377 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2378 check_substr==must. */
2379 scream_pos = -1;
2380 dontbother = end_shift;
2381 strend = HOPc(strend, -dontbother);
2382 while ( (s <= last) &&
c33e64f0 2383 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2384 (unsigned char*)strend, must,
c33e64f0 2385 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2386 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2387 if (HOPc(s, -back_max) > last1) {
2388 last1 = HOPc(s, -back_min);
2389 s = HOPc(s, -back_max);
2390 }
2391 else {
52657f30 2392 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2393
2394 last1 = HOPc(s, -back_min);
52657f30 2395 s = t;
6eb5f6b9 2396 }
f2ed9b32 2397 if (utf8_target) {
6eb5f6b9 2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s += UTF8SKIP(s);
2402 }
2403 }
2404 else {
2405 while (s <= last1) {
24b23f37 2406 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2407 goto got_it;
2408 s++;
2409 }
2410 }
2411 }
ab3bbdeb 2412 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2413 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2414 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2415 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2416 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2417 ? "anchored" : "floating"),
ab3bbdeb
YO
2418 quoted, RE_SV_TAIL(must));
2419 });
6eb5f6b9
JH
2420 goto phooey;
2421 }
f8fc2ecf 2422 else if ( (c = progi->regstclass) ) {
f14c76ed 2423 if (minlen) {
f8fc2ecf 2424 const OPCODE op = OP(progi->regstclass);
66e933ab 2425 /* don't bother with what can't match */
786e8c11 2426 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2427 strend = HOPc(strend, -(minlen - 1));
2428 }
a3621e74 2429 DEBUG_EXECUTE_r({
be8e71aa 2430 SV * const prop = sv_newmortal();
32fc9b6a 2431 regprop(prog, prop, c);
0df25f3d 2432 {
f2ed9b32 2433 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2434 s,strend-s,60);
0df25f3d 2435 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2436 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2437 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2438 quoted, (int)(strend - s));
0df25f3d 2439 }
ffc61ed2 2440 });
3b0527fe 2441 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2442 goto got_it;
07be1b83 2443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2444 }
2445 else {
2446 dontbother = 0;
a0714e2c 2447 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2448 /* Trim the end. */
6af40bd7 2449 char *last= NULL;
33b8afdf 2450 SV* float_real;
c33e64f0
FC
2451 STRLEN len;
2452 const char *little;
33b8afdf 2453
f2ed9b32
KW
2454 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2455 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2456 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2457
c33e64f0
FC
2458 little = SvPV_const(float_real, len);
2459 if (SvTAIL(float_real)) {
1a13b075
YO
2460 /* This means that float_real contains an artificial \n on the end
2461 * due to the presence of something like this: /foo$/
2462 * where we can match both "foo" and "foo\n" at the end of the string.
2463 * So we have to compare the end of the string first against the float_real
2464 * without the \n and then against the full float_real with the string.
2465 * We have to watch out for cases where the string might be smaller
2466 * than the float_real or the float_real without the \n.
2467 */
2468 char *checkpos= strend - len;
2469 DEBUG_OPTIMISE_r(
2470 PerlIO_printf(Perl_debug_log,
2471 "%sChecking for float_real.%s\n",
2472 PL_colors[4], PL_colors[5]));
2473 if (checkpos + 1 < strbeg) {
2474 /* can't match, even if we remove the trailing \n string is too short to match */
2475 DEBUG_EXECUTE_r(
2476 PerlIO_printf(Perl_debug_log,
2477 "%sString shorter than required trailing substring, cannot match.%s\n",
2478 PL_colors[4], PL_colors[5]));
2479 goto phooey;
2480 } else if (memEQ(checkpos + 1, little, len - 1)) {
2481 /* can match, the end of the string matches without the "\n" */
2482 last = checkpos + 1;
2483 } else if (checkpos < strbeg) {
2484 /* cant match, string is too short when the "\n" is included */
2485 DEBUG_EXECUTE_r(
2486 PerlIO_printf(Perl_debug_log,
2487 "%sString does not contain required trailing substring, cannot match.%s\n",
2488 PL_colors[4], PL_colors[5]));
2489 goto phooey;
2490 } else if (!multiline) {
2491 /* non multiline match, so compare with the "\n" at the end of the string */
2492 if (memEQ(checkpos, little, len)) {
2493 last= checkpos;
2494 } else {
2495 DEBUG_EXECUTE_r(
2496 PerlIO_printf(Perl_debug_log,
2497 "%sString does not contain required trailing substring, cannot match.%s\n",
2498 PL_colors[4], PL_colors[5]));
2499 goto phooey;
2500 }
2501 } else {
2502 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2503 goto find_last;
1a13b075 2504 }
c33e64f0 2505 } else {
d6a28714 2506 find_last:
9041c2e3 2507 if (len)
d6a28714 2508 last = rninstr(s, strend, little, little + len);
b8c5462f 2509 else
a0288114 2510 last = strend; /* matching "$" */
b8c5462f 2511 }
6af40bd7
YO
2512 if (!last) {
2513 /* at one point this block contained a comment which was probably
2514 * incorrect, which said that this was a "should not happen" case.
2515 * Even if it was true when it was written I am pretty sure it is
2516 * not anymore, so I have removed the comment and replaced it with
2517 * this one. Yves */
6bda09f9
YO
2518 DEBUG_EXECUTE_r(
2519 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2520 "String does not contain required substring, cannot match.\n"
2521 ));
2522 goto phooey;
bf93d4cc 2523 }
d6a28714
JH
2524 dontbother = strend - last + prog->float_min_offset;
2525 }
2526 if (minlen && (dontbother < minlen))
2527 dontbother = minlen - 1;
2528 strend -= dontbother; /* this one's always in bytes! */
2529 /* We don't know much -- general case. */
f2ed9b32 2530 if (utf8_target) {
d6a28714 2531 for (;;) {
24b23f37 2532 if (regtry(&reginfo, &s))
d6a28714
JH
2533 goto got_it;
2534 if (s >= strend)
2535 break;
b8c5462f 2536 s += UTF8SKIP(s);
d6a28714
JH
2537 };
2538 }
2539 else {
2540 do {
24b23f37 2541 if (regtry(&reginfo, &s))
d6a28714
JH
2542 goto got_it;
2543 } while (s++ < strend);
2544 }
2545 }
2546
2547 /* Failure. */
2548 goto phooey;
2549
2550got_it:
495f47a5
DM
2551 DEBUG_BUFFERS_r(
2552 if (swap)
2553 PerlIO_printf(Perl_debug_log,
2554 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2555 PTR2UV(prog),
2556 PTR2UV(swap)
2557 );
2558 );
e9105d30 2559 Safefree(swap);
288b8c02 2560 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2561
ed301438 2562 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2563 restore_pos(aTHX_ prog);
5daac39c
NC
2564 if (RXp_PAREN_NAMES(prog))
2565 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2566
2567 /* make sure $`, $&, $', and $digit will work later */
2568 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2569 RX_MATCH_COPY_FREE(rx);
d6a28714 2570 if (flags & REXEC_COPY_STR) {
ac0ba89f 2571 const I32 i = PL_regeol - strbeg;
f8c7b90f 2572#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2573 if ((SvIsCOW(sv)
2574 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2575 if (DEBUG_C_TEST) {
2576 PerlIO_printf(Perl_debug_log,
2577 "Copy on write: regexp capture, type %d\n",
2578 (int) SvTYPE(sv));
2579 }
2580 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2581 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2582 assert (SvPOKp(prog->saved_copy));
2583 } else
2584#endif
2585 {
288b8c02 2586 RX_MATCH_COPIED_on(rx);
ed252734
NC
2587 s = savepvn(strbeg, i);
2588 prog->subbeg = s;
2589 }
d6a28714 2590 prog->sublen = i;
d6a28714
JH
2591 }
2592 else {
2593 prog->subbeg = strbeg;
2594 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2595 }
2596 }
9041c2e3 2597
d6a28714
JH
2598 return 1;
2599
2600phooey:
a3621e74 2601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2602 PL_colors[4], PL_colors[5]));
ed301438 2603 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2604 restore_pos(aTHX_ prog);
e9105d30 2605 if (swap) {
c74340f9 2606 /* we failed :-( roll it back */
495f47a5
DM
2607 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2608 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2609 PTR2UV(prog),
2610 PTR2UV(prog->offs),
2611 PTR2UV(swap)
2612 ));
e9105d30
GG
2613 Safefree(prog->offs);
2614 prog->offs = swap;
2615 }
2616
d6a28714
JH
2617 return 0;
2618}
2619
6bda09f9 2620
ec43f78b
DM
2621/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2622 * Do inc before dec, in case old and new rex are the same */
2623#define SET_reg_curpm(Re2) \
2624 if (PL_reg_state.re_state_eval_setup_done) { \
2625 (void)ReREFCNT_inc(Re2); \
2626 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2627 PM_SETRE((PL_reg_curpm), (Re2)); \
2628 }
2629
2630
d6a28714
JH
2631/*
2632 - regtry - try match at specific point
2633 */
2634STATIC I32 /* 0 failure, 1 success */
24b23f37 2635S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2636{
97aff369 2637 dVAR;
d6a28714 2638 CHECKPOINT lastcp;
288b8c02
NC
2639 REGEXP *const rx = reginfo->prog;
2640 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2641 RXi_GET_DECL(prog,progi);
a3621e74 2642 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2643
2644 PERL_ARGS_ASSERT_REGTRY;
2645
24b23f37 2646 reginfo->cutpoint=NULL;
d6a28714 2647
ed301438
DM
2648 if ((prog->extflags & RXf_EVAL_SEEN)
2649 && !PL_reg_state.re_state_eval_setup_done)
2650 {
d6a28714
JH
2651 MAGIC *mg;
2652
ed301438 2653 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2654 if (reginfo->sv) {
d6a28714 2655 /* Make $_ available to executed code. */
3b0527fe 2656 if (reginfo->sv != DEFSV) {
59f00321 2657 SAVE_DEFSV;
414bf5ae 2658 DEFSV_set(reginfo->sv);
b8c5462f 2659 }
d6a28714 2660
3b0527fe
DM
2661 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2662 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2663 /* prepare for quick setting of pos */
d300d9fa 2664#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2665 if (SvIsCOW(reginfo->sv))
2666 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2667#endif
3dab1dad 2668 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2669 &PL_vtbl_mglob, NULL, 0);
d6a28714 2670 mg->mg_len = -1;
b8c5462f 2671 }
d6a28714
JH
2672 PL_reg_magic = mg;
2673 PL_reg_oldpos = mg->mg_len;
4f639d21 2674 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2675 }
09687e5a 2676 if (!PL_reg_curpm) {
a02a5408 2677 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2678#ifdef USE_ITHREADS
2679 {
14a49a24 2680 SV* const repointer = &PL_sv_undef;
92313705
NC
2681 /* this regexp is also owned by the new PL_reg_curpm, which
2682 will try to free it. */
d2ece331 2683 av_push(PL_regex_padav, repointer);
09687e5a
AB
2684 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2685 PL_regex_pad = AvARRAY(PL_regex_padav);
2686 }
2687#endif
2688 }
ec43f78b 2689 SET_reg_curpm(rx);
d6a28714
JH
2690 PL_reg_oldcurpm = PL_curpm;
2691 PL_curpm = PL_reg_curpm;
07bc277f 2692 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2693 /* Here is a serious problem: we cannot rewrite subbeg,
2694 since it may be needed if this match fails. Thus
2695 $` inside (?{}) could fail... */
2696 PL_reg_oldsaved = prog->subbeg;
2697 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2698#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2699 PL_nrs = prog->saved_copy;
2700#endif
07bc277f 2701 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2702 }
2703 else
bd61b366 2704 PL_reg_oldsaved = NULL;
d6a28714
JH
2705 prog->subbeg = PL_bostr;
2706 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2707 }
97ca13b7
DM
2708#ifdef DEBUGGING
2709 PL_reg_starttry = *startpos;
2710#endif
f0ab9afb 2711 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2712 PL_reginput = *startpos;
d6a28714 2713 prog->lastparen = 0;
03994de8 2714 prog->lastcloseparen = 0;
d6a28714 2715 PL_regsize = 0;
d6a28714
JH
2716
2717 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2718 to do this again and again, prog->lastparen should take care of
3dd2943c 2719 this! --ilya*/
dafc8851
JH
2720
2721 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2722 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2723 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2724 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2725 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2726 * Meanwhile, this code *is* needed for the
daf18116
JH
2727 * above-mentioned test suite tests to succeed. The common theme
2728 * on those tests seems to be returning null fields from matches.
225593e1 2729 * --jhi updated by dapm */
dafc8851 2730#if 1
d6a28714 2731 if (prog->nparens) {
b93070ed 2732 regexp_paren_pair *pp = prog->offs;
eb578fdb 2733 I32 i;
b93070ed 2734 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2735 ++pp;
2736 pp->start = -1;
2737 pp->end = -1;
d6a28714
JH
2738 }
2739 }
dafc8851 2740#endif
02db2b7b 2741 REGCP_SET(lastcp);
f8fc2ecf 2742 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2743 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2744 return 1;
2745 }
24b23f37
YO
2746 if (reginfo->cutpoint)
2747 *startpos= reginfo->cutpoint;
02db2b7b 2748 REGCP_UNWIND(lastcp);
d6a28714
JH
2749 return 0;
2750}
2751
02db2b7b 2752
8ba1375e
MJD
2753#define sayYES goto yes
2754#define sayNO goto no
262b90c4 2755#define sayNO_SILENT goto no_silent
8ba1375e 2756
f9f4320a
YO
2757/* we dont use STMT_START/END here because it leads to
2758 "unreachable code" warnings, which are bogus, but distracting. */
2759#define CACHEsayNO \
c476f425
DM
2760 if (ST.cache_mask) \
2761 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2762 sayNO
3298f257 2763
a3621e74 2764/* this is used to determine how far from the left messages like
265c4333
YO
2765 'failed...' are printed. It should be set such that messages
2766 are inline with the regop output that created them.
a3621e74 2767*/
265c4333 2768#define REPORT_CODE_OFF 32
a3621e74
YO
2769
2770
40a82448
DM
2771#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2772#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2773
86545054
DM
2774#define SLAB_FIRST(s) (&(s)->states[0])
2775#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2776
5d9a96ca
DM
2777/* grab a new slab and return the first slot in it */
2778
2779STATIC regmatch_state *
2780S_push_slab(pTHX)
2781{
a35a87e7 2782#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2783 dMY_CXT;
2784#endif
5d9a96ca
DM
2785 regmatch_slab *s = PL_regmatch_slab->next;
2786 if (!s) {
2787 Newx(s, 1, regmatch_slab);
2788 s->prev = PL_regmatch_slab;
2789 s->next = NULL;
2790 PL_regmatch_slab->next = s;
2791 }
2792 PL_regmatch_slab = s;
86545054 2793 return SLAB_FIRST(s);
5d9a96ca 2794}
5b47454d 2795
95b24440 2796
40a82448
DM
2797/* push a new state then goto it */
2798
2799#define PUSH_STATE_GOTO(state, node) \
2800 scan = node; \
2801 st->resume_state = state; \
2802 goto push_state;
2803
2804/* push a new state with success backtracking, then goto it */
2805
2806#define PUSH_YES_STATE_GOTO(state, node) \
2807 scan = node; \
2808 st->resume_state = state; \
2809 goto push_yes_state;
2810
aa283a38 2811
aa283a38 2812
d6a28714 2813/*
95b24440 2814
bf1f174e
DM
2815regmatch() - main matching routine
2816
2817This is basically one big switch statement in a loop. We execute an op,
2818set 'next' to point the next op, and continue. If we come to a point which
2819we may need to backtrack to on failure such as (A|B|C), we push a
2820backtrack state onto the backtrack stack. On failure, we pop the top
2821state, and re-enter the loop at the state indicated. If there are no more
2822states to pop, we return failure.
2823
2824Sometimes we also need to backtrack on success; for example /A+/, where
2825after successfully matching one A, we need to go back and try to
2826match another one; similarly for lookahead assertions: if the assertion
2827completes successfully, we backtrack to the state just before the assertion
2828and then carry on. In these cases, the pushed state is marked as
2829'backtrack on success too'. This marking is in fact done by a chain of
2830pointers, each pointing to the previous 'yes' state. On success, we pop to
2831the nearest yes state, discarding any intermediate failure-only states.
2832Sometimes a yes state is pushed just to force some cleanup code to be
2833called at the end of a successful match or submatch; e.g. (??{$re}) uses
2834it to free the inner regex.
2835
2836Note that failure backtracking rewinds the cursor position, while
2837success backtracking leaves it alone.
2838
2839A pattern is complete when the END op is executed, while a subpattern
2840such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2841ops trigger the "pop to last yes state if any, otherwise return true"
2842behaviour.
2843
2844A common convention in this function is to use A and B to refer to the two
2845subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2846the subpattern to be matched possibly multiple times, while B is the entire
2847rest of the pattern. Variable and state names reflect this convention.
2848
2849The states in the main switch are the union of ops and failure/success of
2850substates associated with with that op. For example, IFMATCH is the op
2851that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2852'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2853successfully matched A and IFMATCH_A_fail is a state saying that we have
2854just failed to match A. Resume states always come in pairs. The backtrack
2855state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2856at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2857on success or failure.
2858
2859The struct that holds a backtracking state is actually a big union, with
2860one variant for each major type of op. The variable st points to the
2861top-most backtrack struct. To make the code clearer, within each
2862block of code we #define ST to alias the relevant union.
2863
2864Here's a concrete example of a (vastly oversimplified) IFMATCH
2865implementation:
2866
2867 switch (state) {
2868 ....
2869
2870#define ST st->u.ifmatch
2871
2872 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2873 ST.foo = ...; // some state we wish to save
95b24440 2874 ...
bf1f174e
DM
2875 // push a yes backtrack state with a resume value of
2876 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2877 // first node of A:
2878 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2879 // NOTREACHED
2880
2881 case IFMATCH_A: // we have successfully executed A; now continue with B
2882 next = B;
2883 bar = ST.foo; // do something with the preserved value
2884 break;
2885
2886 case IFMATCH_A_fail: // A failed, so the assertion failed
2887 ...; // do some housekeeping, then ...
2888 sayNO; // propagate the failure
2889
2890#undef ST
95b24440 2891
bf1f174e
DM
2892 ...
2893 }
95b24440 2894
bf1f174e
DM
2895For any old-timers reading this who are familiar with the old recursive
2896approach, the code above is equivalent to:
95b24440 2897
bf1f174e
DM
2898 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2899 {
2900 int foo = ...
95b24440 2901 ...
bf1f174e
DM
2902 if (regmatch(A)) {
2903 next = B;
2904 bar = foo;
2905 break;
95b24440 2906 }
bf1f174e
DM
2907 ...; // do some housekeeping, then ...
2908 sayNO; // propagate the failure
95b24440 2909 }
bf1f174e
DM
2910
2911The topmost backtrack state, pointed to by st, is usually free. If you
2912want to claim it, populate any ST.foo fields in it with values you wish to
2913save, then do one of
2914
2915 PUSH_STATE_GOTO(resume_state, node);
2916 PUSH_YES_STATE_GOTO(resume_state, node);
2917
2918which sets that backtrack state's resume value to 'resume_state', pushes a
2919new free entry to the top of the backtrack stack, then goes to 'node'.
2920On backtracking, the free slot is popped, and the saved state becomes the
2921new free state. An ST.foo field in this new top state can be temporarily
2922accessed to retrieve values, but once the main loop is re-entered, it
2923becomes available for reuse.
2924
2925Note that the depth of the backtrack stack constantly increases during the
2926left-to-right execution of the pattern, rather than going up and down with
2927the pattern nesting. For example the stack is at its maximum at Z at the
2928end of the pattern, rather than at X in the following:
2929
2930 /(((X)+)+)+....(Y)+....Z/
2931
2932The only exceptions to this are lookahead/behind assertions and the cut,
2933(?>A), which pop all the backtrack states associated with A before
2934continuing.
2935
486ec47a 2936Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2937PL_regmatch_state and st always point to the currently active state,
2938and PL_regmatch_slab points to the slab currently containing
2939PL_regmatch_state. The first time regmatch() is called, the first slab is
2940allocated, and is never freed until interpreter destruction. When the slab
2941is full, a new one is allocated and chained to the end. At exit from
2942regmatch(), slabs allocated since entry are freed.
2943
2944*/
95b24440 2945
40a82448 2946
5bc10b2c 2947#define DEBUG_STATE_pp(pp) \
265c4333 2948 DEBUG_STATE_r({ \
f2ed9b32 2949 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2950 PerlIO_printf(Perl_debug_log, \
5d458dd8 2951 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2952 depth*2, "", \
13d6edb4 2953 PL_reg_name[st->resume_state], \
5d458dd8
YO
2954 ((st==yes_state||st==mark_state) ? "[" : ""), \
2955 ((st==yes_state) ? "Y" : ""), \
2956 ((st==mark_state) ? "M" : ""), \
2957 ((st==yes_state||st==mark_state) ? "]" : "") \
2958 ); \
265c4333 2959 });
5bc10b2c 2960
40a82448 2961
3dab1dad 2962#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2963
3df15adc 2964#ifdef DEBUGGING
5bc10b2c 2965
ab3bbdeb 2966STATIC void
f2ed9b32 2967S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2968 const char *start, const char *end, const char *blurb)
2969{
efd26800 2970 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2971
2972 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2973
ab3bbdeb
YO
2974 if (!PL_colorset)
2975 reginitcolors();
2976 {
2977 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2978 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2979
f2ed9b32 2980 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2981 start, end - start, 60);
2982
2983 PerlIO_printf(Perl_debug_log,
2984 "%s%s REx%s %s against %s\n",
2985 PL_colors[4], blurb, PL_colors[5], s0, s1);
2986
f2ed9b32 2987 if (utf8_target||utf8_pat)
1de06328
YO
2988 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2989 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2990 utf8_pat && utf8_target ? " and " : "",
2991 utf8_target ? "string" : ""
ab3bbdeb
YO
2992 );
2993 }
2994}
3df15adc
YO
2995
2996STATIC void
786e8c11
YO
2997S_dump_exec_pos(pTHX_ const char *locinput,
2998 const regnode *scan,
2999 const char *loc_regeol,
3000 const char *loc_bostr,
3001 const char *loc_reg_starttry,
f2ed9b32 3002 const bool utf8_target)
07be1b83 3003{
786e8c11 3004 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3005 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3006 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3007 /* The part of the string before starttry has one color
3008 (pref0_len chars), between starttry and current
3009 position another one (pref_len - pref0_len chars),
3010 after the current position the third one.
3011 We assume that pref0_len <= pref_len, otherwise we
3012 decrease pref0_len. */
786e8c11
YO
3013 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3014 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3015 int pref0_len;
3016
7918f24d
NC
3017 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3018
f2ed9b32 3019 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3020 pref_len++;
786e8c11
YO
3021 pref0_len = pref_len - (locinput - loc_reg_starttry);
3022 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3023 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3024 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3025 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3026 l--;
3027 if (pref0_len < 0)
3028 pref0_len = 0;
3029 if (pref0_len > pref_len)
3030 pref0_len = pref_len;
3031 {
f2ed9b32 3032 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3033
ab3bbdeb 3034 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3035 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3036
ab3bbdeb 3037 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3038 (locinput - pref_len + pref0_len),
1de06328 3039 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3040
ab3bbdeb 3041 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3042 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3043
1de06328 3044 const STRLEN tlen=len0+len1+len2;
3df15adc 3045 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3046 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3047 (IV)(locinput - loc_bostr),
07be1b83 3048 len0, s0,
07be1b83 3049 len1, s1,
07be1b83 3050 (docolor ? "" : "> <"),
07be1b83 3051 len2, s2,
f9f4320a 3052 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3053 "");
3054 }
3055}
3df15adc 3056
07be1b83
YO
3057#endif
3058
0a4db386
YO
3059/* reg_check_named_buff_matched()
3060 * Checks to see if a named buffer has matched. The data array of
3061 * buffer numbers corresponding to the buffer is expected to reside
3062 * in the regexp->data->data array in the slot stored in the ARG() of
3063 * node involved. Note that this routine doesn't actually care about the
3064 * name, that information is not preserved from compilation to execution.
3065 * Returns the index of the leftmost defined buffer with the given name
3066 * or 0 if non of the buffers matched.
3067 */
3068STATIC I32
7918f24d
NC
3069S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3070{
0a4db386 3071 I32 n;
f8fc2ecf 3072 RXi_GET_DECL(rex,rexi);
ad64d0ec 3073 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3074 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3075
3076 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3077
0a4db386 3078 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3079 if ((I32)rex->lastparen >= nums[n] &&
3080 rex->offs[nums[n]].end != -1)
0a4db386
YO
3081 {
3082 return nums[n];
3083 }
3084 }
3085 return 0;
3086}
3087
2f554ef7
DM
3088
3089/* free all slabs above current one - called during LEAVE_SCOPE */
3090
3091STATIC void
3092S_clear_backtrack_stack(pTHX_ void *p)
3093{
3094 regmatch_slab *s = PL_regmatch_slab->next;
3095 PERL_UNUSED_ARG(p);
3096
3097 if (!s)
3098 return;
3099 PL_regmatch_slab->next = NULL;
3100 while (s) {
3101 regmatch_slab * const osl = s;
3102 s = s->next;
3103 Safefree(osl);
3104 }
3105}
3106
3107
d6a28714 3108STATIC I32 /* 0 failure, 1 success */
24b23f37 3109S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3110{
a35a87e7 3111#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3112 dMY_CXT;
3113#endif
27da23d5 3114 dVAR;
eb578fdb 3115 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3116 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3117 REGEXP *rex_sv = reginfo->prog;
3118 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3119 RXi_GET_DECL(rex,rexi);
2f554ef7 3120 I32 oldsave;
5d9a96ca 3121 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3122 regmatch_state *st;
5d9a96ca 3123 /* cache heavy used fields of st in registers */
eb578fdb
KW
3124 regnode *scan;
3125 regnode *next;
3126 U32 n = 0; /* general value; init to avoid compiler warning */
3127 I32 ln = 0; /* len or last; init to avoid compiler warning */
3128 char *locinput = PL_reginput;
3129 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3130
b69b0499 3131 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3132 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3133 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3134 const U32 max_nochange_depth =
3135 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3136 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3137 regmatch_state *yes_state = NULL; /* state to pop to on success of
3138 subpattern */
e2e6a0f1
YO
3139 /* mark_state piggy backs on the yes_state logic so that when we unwind
3140 the stack on success we can update the mark_state as we go */
3141 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3142 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3143 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3144 U32 state_num;
5d458dd8
YO
3145 bool no_final = 0; /* prevent failure from backtracking? */
3146 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3147 char *startpoint = PL_reginput;
5d458dd8
YO
3148 SV *popmark = NULL; /* are we looking for a mark? */
3149 SV *sv_commit = NULL; /* last mark name seen in failure */
3150 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3151 during a successful match */
5d458dd8
YO
3152 U32 lastopen = 0; /* last open we saw */
3153 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3154 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3155 /* these three flags are set by various ops to signal information to
3156 * the very next op. They have a useful lifetime of exactly one loop
3157 * iteration, and are not preserved or restored by state pushes/pops
3158 */
3159 bool sw = 0; /* the condition value in (?(cond)a|b) */
3160 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3161 int logical = 0; /* the following EVAL is:
3162 0: (?{...})
3163 1: (?(?{...})X|Y)
3164 2: (??{...})
3165 or the following IFMATCH/UNLESSM is:
3166 false: plain (?=foo)
3167 true: used as a condition: (?(?=foo))
3168 */
81ed78b2
DM
3169 PAD* last_pad = NULL;
3170 dMULTICALL;
3171 I32 gimme = G_SCALAR;
3172 CV *caller_cv = NULL; /* who called us */
3173 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3174 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3175
95b24440 3176#ifdef DEBUGGING
e68ec53f 3177 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3178#endif
3179
81ed78b2
DM
3180 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3181 multicall_oldcatch = 0;
3182 multicall_cv = NULL;
3183 cx = NULL;
4f8dbb2d
JL
3184 PERL_UNUSED_VAR(multicall_cop);
3185 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3186
3187
7918f24d
NC
3188 PERL_ARGS_ASSERT_REGMATCH;
3189
3b57cd43 3190 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3191 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3192 }));
5d9a96ca
DM
3193 /* on first ever call to regmatch, allocate first slab */
3194 if (!PL_regmatch_slab) {
3195 Newx(PL_regmatch_slab, 1, regmatch_slab);
3196 PL_regmatch_slab->prev = NULL;
3197 PL_regmatch_slab->next = NULL;
86545054 3198 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3199 }
3200
2f554ef7
DM
3201 oldsave = PL_savestack_ix;
3202 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3203 SAVEVPTR(PL_regmatch_slab);
3204 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3205
3206 /* grab next free state slot */
3207 st = ++PL_regmatch_state;
86545054 3208 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3209 st = PL_regmatch_state = S_push_slab(aTHX);
3210
d6a28714
JH
3211 /* Note that nextchr is a byte even in UTF */
3212 nextchr = UCHARAT(locinput);
3213 scan = prog;
3214 while (scan != NULL) {
8ba1375e 3215
a3621e74 3216 DEBUG_EXECUTE_r( {
6136c704 3217 SV * const prop = sv_newmortal();
1de06328 3218 regnode *rnext=regnext(scan);
f2ed9b32 3219 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3220 regprop(rex, prop, scan);
07be1b83
YO
3221
3222 PerlIO_printf(Perl_debug_log,
3223 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3224 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3225 SvPVX_const(prop),
1de06328 3226 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3227 0 : (IV)(rnext - rexi->program));
2a782b5b 3228 });
d6a28714
JH
3229
3230 next = scan + NEXT_OFF(scan);
3231 if (next == scan)
3232 next = NULL;
40a82448 3233 state_num = OP(scan);
d6a28714 3234
40a82448 3235 reenter_switch:
34a81e2b 3236
40a82448 3237 switch (state_num) {
d6a28714 3238 case BOL:
7fba1cd6 3239 if (locinput == PL_bostr)
d6a28714 3240 {
3b0527fe 3241 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3242 break;
3243 }
d6a28714
JH
3244 sayNO;
3245 case MBOL:
12d33761
HS
3246 if (locinput == PL_bostr ||
3247 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3248 {
b8c5462f
JH
3249 break;
3250 }
d6a28714
JH
3251 sayNO;
3252 case SBOL:
c2a73568 3253 if (locinput == PL_bostr)
b8c5462f 3254 break;
d6a28714
JH
3255 sayNO;
3256 case GPOS:
3b0527fe 3257 if (locinput == reginfo->ganch)
d6a28714
JH
3258 break;
3259 sayNO;
ee9b8eae
YO
3260
3261 case KEEPS:
3262 /* update the startpoint */
b93070ed 3263 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3264 PL_reginput = locinput;
b93070ed 3265 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3266 PUSH_STATE_GOTO(KEEPS_next, next);
3267 /*NOT-REACHED*/
3268 case KEEPS_next_fail:
3269 /* rollback the start point change */
b93070ed 3270 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3271 sayNO_SILENT;
3272 /*NOT-REACHED*/
d6a28714 3273 case EOL:
d6a28714
JH
3274 goto seol;
3275 case MEOL:
d6a28714 3276 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3277 sayNO;
b8c5462f 3278 break;
d6a28714
JH
3279 case SEOL:
3280 seol:
3281 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3282 sayNO;
d6a28714 3283 if (PL_regeol - locinput > 1)
b8c5462f 3284 sayNO;
b8c5462f 3285 break;
d6a28714
JH
3286 case EOS:
3287 if (PL_regeol != locinput)
b8c5462f 3288 sayNO;
d6a28714 3289 break;
ffc61ed2 3290 case SANY:
d6a28714 3291 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3292 sayNO;
f2ed9b32 3293 if (utf8_target) {
f33976b4
DB
3294 locinput += PL_utf8skip[nextchr];
3295 if (locinput > PL_regeol)
3296 sayNO;
3297 nextchr = UCHARAT(locinput);
3298 }
3299 else
3300 nextchr = UCHARAT(++locinput);
3301 break;
3302 case CANY:
3303 if (!nextchr && locinput >= PL_regeol)
3304 sayNO;
b8c5462f 3305 nextchr = UCHARAT(++locinput);
a0d0e21e 3306 break;
ffc61ed2 3307 case REG_ANY:
1aa99e6b
IH
3308 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3309 sayNO;
f2ed9b32 3310 if (utf8_target) {
b8c5462f 3311 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3312 if (locinput > PL_regeol)
3313 sayNO;
a0ed51b3 3314 nextchr = UCHARAT(locinput);
a0ed51b3 3315 }
1aa99e6b
IH
3316 else
3317 nextchr = UCHARAT(++locinput);
a0ed51b3 3318 break;
166ba7cd
DM
3319
3320#undef ST
3321#define ST st->u.trie
786e8c11
YO
3322 case TRIEC:
3323 /* In this case the charclass data is available inline so
3324 we can fail fast without a lot of extra overhead.
3325 */
fab2782b
YO
3326 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3327 DEBUG_EXECUTE_r(
3328 PerlIO_printf(Perl_debug_log,
3329 "%*s %sfailed to match trie start class...%s\n",
3330 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3331 );
3332 sayNO_SILENT;
118e2215 3333 assert(0); /* NOTREACHED */
786e8c11
YO
3334 }
3335 /* FALL THROUGH */
5b47454d 3336 case TRIE:
2e64971a
DM
3337 /* the basic plan of execution of the trie is:
3338 * At the beginning, run though all the states, and
3339 * find the longest-matching word. Also remember the position
3340 * of the shortest matching word. For example, this pattern:
3341 * 1 2 3 4 5
3342 * ab|a|x|abcd|abc
3343 * when matched against the string "abcde", will generate
3344 * accept states for all words except 3, with the longest
3345 * matching word being 4, and the shortest being 1 (with
3346 * the position being after char 1 of the string).
3347 *
3348 * Then for each matching word, in word order (i.e. 1,2,4,5),
3349 * we run the remainder of the pattern; on each try setting
3350 * the current position to the character following the word,
3351 * returning to try the next word on failure.
3352 *
3353 * We avoid having to build a list of words at runtime by
3354 * using a compile-time structure, wordinfo[].prev, which
3355 * gives, for each word, the previous accepting word (if any).
3356 * In the case above it would contain the mappings 1->2, 2->0,
3357 * 3->0, 4->5, 5->1. We can use this table to generate, from
3358 * the longest word (4 above), a list of all words, by
3359 * following the list of prev pointers; this gives us the
3360 * unordered list 4,5,1,2. Then given the current word we have
3361 * just tried, we can go through the list and find the
3362 * next-biggest word to try (so if we just failed on word 2,
3363 * the next in the list is 4).
3364 *
3365 * Since at runtime we don't record the matching position in
3366 * the string for each word, we have to work that out for
3367 * each word we're about to process. The wordinfo table holds
3368 * the character length of each word; given that we recorded
3369 * at the start: the position of the shortest word and its
3370 * length in chars, we just need to move the pointer the
3371 * difference between the two char lengths. Depending on
3372 * Unicode status and folding, that's cheap or expensive.
3373 *
3374 * This algorithm is optimised for the case where are only a
3375 * small number of accept states, i.e. 0,1, or maybe 2.
3376 * With lots of accepts states, and having to try all of them,
3377 * it becomes quadratic on number of accept states to find all
3378 * the next words.
3379 */
3380
3dab1dad 3381 {
07be1b83 3382 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3383 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3384
3385 /* what trie are we using right now */
be8e71aa 3386 reg_trie_data * const trie
f8fc2ecf 3387 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3388 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3389 U32 state = trie->startstate;
166ba7cd 3390
fab2782b 3391 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3392 if (trie->states[ state ].wordnum) {
3393 DEBUG_EXECUTE_r(
3394 PerlIO_printf(Perl_debug_log,
3395 "%*s %smatched empty string...%s\n",
5bc10b2c 3396 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3397 );
20dbff7c
YO
3398 if (!trie->jump)
3399 break;
3dab1dad
YO
3400 } else {
3401 DEBUG_EXECUTE_r(
3402 PerlIO_printf(Perl_debug_log,
786e8c11 3403 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3404 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3405 );
3406 sayNO_SILENT;
3407 }
3408 }
166ba7cd 3409
786e8c11
YO
3410 {
3411 U8 *uc = ( U8* )locinput;
3412
3413 STRLEN len = 0;
3414 STRLEN foldlen = 0;
3415 U8 *uscan = (U8*)NULL;
786e8c11 3416 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3417 U32 charcount = 0; /* how many input chars we have matched */
3418 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3419
786e8c11 3420 ST.jump = trie->jump;
786e8c11 3421 ST.me = scan;
2e64971a
DM
3422 ST.firstpos = NULL;
3423 ST.longfold = FALSE; /* char longer if folded => it's harder */
3424 ST.nextword = 0;
3425
3426 /* fully traverse the TRIE; note the position of the
3427 shortest accept state and the wordnum of the longest
3428