This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regmatch(): use nextchar where available
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
27d4fc33 147 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 148 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 149
1dcf4a1b 150#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 151
ee9a90b8
KW
152/* The actual code for CCC_TRY, which uses several variables from the routine
153 * it's callable from. It is designed to be the bulk of a case statement.
154 * FUNC is the macro or function to call on non-utf8 targets that indicate if
155 * nextchr matches the class.
156 * UTF8_TEST is the whole test string to use for utf8 targets
157 * LOAD is what to use to test, and if not present to load in the swash for the
158 * class
159 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
160 * UTF8_TEST test.
161 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
162 * utf8 and a variant, load the swash if necessary and test using the utf8
163 * test. Advance to the next character if test is ok, otherwise fail; If not
164 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
165 * fails, or advance to the next character */
166
167#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
168 if (locinput >= PL_regeol) { \
169 sayNO; \
170 } \
171 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
172 LOAD_UTF8_CHARCLASS(CLASS, STR); \
173 if (POS_OR_NEG (UTF8_TEST)) { \
174 sayNO; \
175 } \
176 locinput += PL_utf8skip[nextchr]; \
177 nextchr = UCHARAT(locinput); \
178 break; \
179 } \
180 if (POS_OR_NEG (FUNC(nextchr))) { \
181 sayNO; \
182 } \
183 nextchr = UCHARAT(++locinput); \
980866de
KW
184 break;
185
ee9a90b8
KW
186/* Handle the non-locale cases for a character class and its complement. It
187 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
188 * This is because that code fails when the test succeeds, so we want to have
189 * the test fail so that the code succeeds. The swash is stored in a
190 * predictable PL_ place */
cfaf538b
KW
191#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
192 CLASS, STR) \
ee9a90b8
KW
193 case NAME: \
194 _CCC_TRY_CODE( !, FUNC, \
195 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
196 (U8*)locinput, TRUE)), \
197 CLASS, STR) \
198 case NNAME: \
1dcf4a1b 199 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
200 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
201 (U8*)locinput, TRUE)), \
202 CLASS, STR) \
203
204/* Generate the case statements for both locale and non-locale character
205 * classes in regmatch for classes that don't have special unicode semantics.
206 * Locales don't use an immediate swash, but an intermediary special locale
207 * function that is called on the pointer to the current place in the input
208 * string. That function will resolve to needing the same swash. One might
209 * think that because we don't know what the locale will match, we shouldn't
210 * check with the swash loading function that it loaded properly; ie, that we
211 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
212 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
213 * irrelevant here */
214#define CCC_TRY(NAME, NNAME, FUNC, \
215 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 216 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
217 CLASS, STR) \
218 case NAMEL: \
219 PL_reg_flags |= RF_tainted; \
220 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
221 case NNAMEL: \
222 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
223 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
224 CLASS, STR) \
cfaf538b
KW
225 case NAMEA: \
226 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
227 sayNO; \
228 } \
229 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
230 nextchr = UCHARAT(++locinput); \
231 break; \
232 case NNAMEA: \
233 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 if (utf8_target) { \
237 locinput += PL_utf8skip[nextchr]; \
238 nextchr = UCHARAT(locinput); \
239 } \
240 else { \
241 nextchr = UCHARAT(++locinput); \
242 } \
243 break; \
ee9a90b8
KW
244 /* Generate the non-locale cases */ \
245 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
246
247/* This is like CCC_TRY, but has an extra set of parameters for generating case
248 * statements to handle separate Unicode semantics nodes */
249#define CCC_TRY_U(NAME, NNAME, FUNC, \
250 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
251 NAMEU, NNAMEU, FUNCU, \
cfaf538b 252 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 253 CLASS, STR) \
cfaf538b
KW
254 CCC_TRY(NAME, NNAME, FUNC, \
255 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
256 NAMEA, NNAMEA, FUNCA, \
257 CLASS, STR) \
ee9a90b8 258 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 259
3dab1dad
YO
260/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
261
5f80c4cf 262/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
263/* it would be nice to rework regcomp.sym to generate this stuff. sigh
264 *
265 * NOTE that *nothing* that affects backtracking should be in here, specifically
266 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
267 * node that is in between two EXACT like nodes when ascertaining what the required
268 * "follow" character is. This should probably be moved to regex compile time
269 * although it may be done at run time beause of the REF possibility - more
270 * investigation required. -- demerphq
271*/
3e901dc0
YO
272#define JUMPABLE(rn) ( \
273 OP(rn) == OPEN || \
274 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
275 OP(rn) == EVAL || \
cca55fe3
JP
276 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
277 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 278 OP(rn) == KEEPS || \
3dab1dad 279 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 280)
ee9b8eae 281#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 282
ee9b8eae
YO
283#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
284
285#if 0
286/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
287 we don't need this definition. */
288#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 289#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
290#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
291
292#else
293/* ... so we use this as its faster. */
294#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 295#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
296#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
298
299#endif
e2d8ce26 300
a84d97b6
HS
301/*
302 Search for mandatory following text node; for lookahead, the text must
303 follow but for lookbehind (rn->flags != 0) we skip to the next step.
304*/
cca55fe3 305#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
306 while (JUMPABLE(rn)) { \
307 const OPCODE type = OP(rn); \
308 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 309 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 310 else if (type == PLUS) \
cca55fe3 311 rn = NEXTOPER(rn); \
3dab1dad 312 else if (type == IFMATCH) \
a84d97b6 313 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 314 else rn += NEXT_OFF(rn); \
3dab1dad 315 } \
5f80c4cf 316} STMT_END
74750237 317
c476f425 318
acfe0abc 319static void restore_pos(pTHX_ void *arg);
51371543 320
87c0511b 321#define REGCP_PAREN_ELEMS 3
f067efbf 322#define REGCP_OTHER_ELEMS 3
e0fa7e2b 323#define REGCP_FRAME_ELEMS 1
620d5b66
NC
324/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
325 * are needed for the regexp context stack bookkeeping. */
326
76e3520e 327STATIC CHECKPOINT
b93070ed 328S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 329{
97aff369 330 dVAR;
a3b680e6 331 const int retval = PL_savestack_ix;
a3b680e6 332 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
333 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
334 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 335 I32 p;
40a82448 336 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 337
b93070ed
DM
338 PERL_ARGS_ASSERT_REGCPPUSH;
339
e49a9654 340 if (paren_elems_to_push < 0)
5637ef5b
NC
341 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
342 paren_elems_to_push);
e49a9654 343
e0fa7e2b
NC
344 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
345 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
346 " out of range (%lu-%ld)",
347 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 348
620d5b66 349 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 350
495f47a5
DM
351 DEBUG_BUFFERS_r(
352 if ((int)PL_regsize > (int)parenfloor)
353 PerlIO_printf(Perl_debug_log,
354 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
355 PTR2UV(rex),
356 PTR2UV(rex->offs)
357 );
358 );
87c0511b 359 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 360/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
361 SSPUSHINT(rex->offs[p].end);
362 SSPUSHINT(rex->offs[p].start);
1ca2007e 363 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 364 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
365 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
366 (UV)p,
367 (IV)rex->offs[p].start,
368 (IV)rex->offs[p].start_tmp,
369 (IV)rex->offs[p].end
40a82448 370 ));
a0d0e21e 371 }
b1ce53c5 372/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 373 SSPUSHINT(PL_regsize);
b93070ed
DM
374 SSPUSHINT(rex->lastparen);
375 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 376 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 377
a0d0e21e
LW
378 return retval;
379}
380
c277df42 381/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
382#define REGCP_SET(cp) \
383 DEBUG_STATE_r( \
ab3bbdeb 384 PerlIO_printf(Perl_debug_log, \
e4f74956 385 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
386 (IV)PL_savestack_ix)); \
387 cp = PL_savestack_ix
c3464db5 388
ab3bbdeb 389#define REGCP_UNWIND(cp) \
e4f74956 390 DEBUG_STATE_r( \
ab3bbdeb 391 if (cp != PL_savestack_ix) \
e4f74956
YO
392 PerlIO_printf(Perl_debug_log, \
393 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
394 (IV)(cp), (IV)PL_savestack_ix)); \
395 regcpblow(cp)
c277df42 396
a8d1f4b4
DM
397#define UNWIND_PAREN(lp, lcp) \
398 for (n = rex->lastparen; n > lp; n--) \
399 rex->offs[n].end = -1; \
400 rex->lastparen = n; \
401 rex->lastcloseparen = lcp;
402
403
f067efbf 404STATIC void
b93070ed 405S_regcppop(pTHX_ regexp *rex)
a0d0e21e 406{
97aff369 407 dVAR;
e0fa7e2b 408 UV i;
87c0511b 409 U32 paren;
a3621e74
YO
410 GET_RE_DEBUG_FLAGS_DECL;
411
7918f24d
NC
412 PERL_ARGS_ASSERT_REGCPPOP;
413
b1ce53c5 414 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 415 i = SSPOPUV;
e0fa7e2b
NC
416 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
417 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
418 rex->lastcloseparen = SSPOPINT;
419 rex->lastparen = SSPOPINT;
3280af22 420 PL_regsize = SSPOPINT;
b1ce53c5 421
620d5b66 422 i -= REGCP_OTHER_ELEMS;
b1ce53c5 423 /* Now restore the parentheses context. */
495f47a5
DM
424 DEBUG_BUFFERS_r(
425 if (i || rex->lastparen + 1 <= rex->nparens)
426 PerlIO_printf(Perl_debug_log,
427 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
428 PTR2UV(rex),
429 PTR2UV(rex->offs)
430 );
431 );
87c0511b 432 paren = PL_regsize;
620d5b66 433 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 434 I32 tmps;
1ca2007e 435 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 436 rex->offs[paren].start = SSPOPINT;
cf93c79d 437 tmps = SSPOPINT;
b93070ed
DM
438 if (paren <= rex->lastparen)
439 rex->offs[paren].end = tmps;
495f47a5
DM
440 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
441 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
442 (UV)paren,
443 (IV)rex->offs[paren].start,
444 (IV)rex->offs[paren].start_tmp,
445 (IV)rex->offs[paren].end,
446 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 447 );
87c0511b 448 paren--;
a0d0e21e 449 }
daf18116 450#if 1
dafc8851
JH
451 /* It would seem that the similar code in regtry()
452 * already takes care of this, and in fact it is in
453 * a better location to since this code can #if 0-ed out
454 * but the code in regtry() is needed or otherwise tests
455 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
456 * (as of patchlevel 7877) will fail. Then again,
457 * this code seems to be necessary or otherwise
225593e1
DM
458 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
459 * --jhi updated by dapm */
b93070ed 460 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 461 if (i > PL_regsize)
b93070ed
DM
462 rex->offs[i].start = -1;
463 rex->offs[i].end = -1;
495f47a5
DM
464 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
465 " \\%"UVuf": %s ..-1 undeffing\n",
466 (UV)i,
467 (i > PL_regsize) ? "-1" : " "
468 ));
a0d0e21e 469 }
dafc8851 470#endif
a0d0e21e
LW
471}
472
74088413
DM
473/* restore the parens and associated vars at savestack position ix,
474 * but without popping the stack */
475
476STATIC void
477S_regcp_restore(pTHX_ regexp *rex, I32 ix)
478{
479 I32 tmpix = PL_savestack_ix;
480 PL_savestack_ix = ix;
481 regcppop(rex);
482 PL_savestack_ix = tmpix;
483}
484
02db2b7b 485#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 486
a687059c 487/*
e50aee73 488 * pregexec and friends
a687059c
LW
489 */
490
76234dfb 491#ifndef PERL_IN_XSUB_RE
a687059c 492/*
c277df42 493 - pregexec - match a regexp against a string
a687059c 494 */
c277df42 495I32
49d7dfbc 496Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 497 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
498/* stringarg: the point in the string at which to begin matching */
499/* strend: pointer to null at end of string */
500/* strbeg: real beginning of string */
501/* minend: end of match must be >= minend bytes after stringarg. */
502/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
503 * itself is accessed via the pointers above */
504/* nosave: For optimizations. */
c277df42 505{
7918f24d
NC
506 PERL_ARGS_ASSERT_PREGEXEC;
507
c277df42 508 return
9041c2e3 509 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
510 nosave ? 0 : REXEC_COPY_STR);
511}
76234dfb 512#endif
22e551b9 513
9041c2e3 514/*
cad2e5aa
JH
515 * Need to implement the following flags for reg_anch:
516 *
517 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
518 * USE_INTUIT_ML
519 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
520 * INTUIT_AUTORITATIVE_ML
521 * INTUIT_ONCE_NOML - Intuit can match in one location only.
522 * INTUIT_ONCE_ML
523 *
524 * Another flag for this function: SECOND_TIME (so that float substrs
525 * with giant delta may be not rechecked).
526 */
527
528/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
529
3f7c398e 530/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
531 Otherwise, only SvCUR(sv) is used to get strbeg. */
532
533/* XXXX We assume that strpos is strbeg unless sv. */
534
6eb5f6b9
JH
535/* XXXX Some places assume that there is a fixed substring.
536 An update may be needed if optimizer marks as "INTUITable"
537 RExen without fixed substrings. Similarly, it is assumed that
538 lengths of all the strings are no more than minlen, thus they
539 cannot come from lookahead.
40d049e4
YO
540 (Or minlen should take into account lookahead.)
541 NOTE: Some of this comment is not correct. minlen does now take account
542 of lookahead/behind. Further research is required. -- demerphq
543
544*/
6eb5f6b9 545
2c2d71f5
JH
546/* A failure to find a constant substring means that there is no need to make
547 an expensive call to REx engine, thus we celebrate a failure. Similarly,
548 finding a substring too deep into the string means that less calls to
30944b6d
IZ
549 regtry() should be needed.
550
551 REx compiler's optimizer found 4 possible hints:
552 a) Anchored substring;
553 b) Fixed substring;
554 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 555 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 556 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
557 string which does not contradict any of them.
558 */
2c2d71f5 559
6eb5f6b9
JH
560/* Most of decisions we do here should have been done at compile time.
561 The nodes of the REx which we used for the search should have been
562 deleted from the finite automaton. */
563
cad2e5aa 564char *
288b8c02 565Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 566 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 567{
97aff369 568 dVAR;
288b8c02 569 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 570 I32 start_shift = 0;
cad2e5aa 571 /* Should be nonnegative! */
eb578fdb
KW
572 I32 end_shift = 0;
573 char *s;
574 SV *check;
a1933d95 575 char *strbeg;
cad2e5aa 576 char *t;
f2ed9b32 577 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 578 I32 ml_anch;
eb578fdb 579 char *other_last = NULL; /* other substr checked before this */
bd61b366 580 char *check_at = NULL; /* check substr found at this pos */
d8080198 581 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 582 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 583 RXi_GET_DECL(prog,progi);
30944b6d 584#ifdef DEBUGGING
890ce7af 585 const char * const i_strpos = strpos;
30944b6d 586#endif
a3621e74
YO
587 GET_RE_DEBUG_FLAGS_DECL;
588
7918f24d 589 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
590 PERL_UNUSED_ARG(flags);
591 PERL_UNUSED_ARG(data);
7918f24d 592
f2ed9b32 593 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 594
3c8556c3 595 if (RX_UTF8(rx)) {
b8d68ded
JH
596 PL_reg_flags |= RF_utf8;
597 }
ab3bbdeb 598 DEBUG_EXECUTE_r(
f2ed9b32 599 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
600 sv ? "Guessing start of match in sv for"
601 : "Guessing start of match in string for");
2a782b5b 602 );
cad2e5aa 603
c344f387
JH
604 /* CHR_DIST() would be more correct here but it makes things slow. */
605 if (prog->minlen > strend - strpos) {
a3621e74 606 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 607 "String too short... [re_intuit_start]\n"));
cad2e5aa 608 goto fail;
2c2d71f5 609 }
1de06328 610
a1933d95 611 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 612 PL_regeol = strend;
f2ed9b32 613 if (utf8_target) {
33b8afdf
JH
614 if (!prog->check_utf8 && prog->check_substr)
615 to_utf8_substr(prog);
616 check = prog->check_utf8;
617 } else {
618 if (!prog->check_substr && prog->check_utf8)
619 to_byte_substr(prog);
620 check = prog->check_substr;
621 }
1de06328 622 if (check == &PL_sv_undef) {
a3621e74 623 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 624 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
625 goto fail;
626 }
bbe252da
YO
627 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
628 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
629 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 630 && !multiline ) ); /* Check after \n? */
cad2e5aa 631
7e25d62c 632 if (!ml_anch) {
bbe252da
YO
633 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
634 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 635 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
636 && sv && !SvROK(sv)
637 && (strpos != strbeg)) {
a3621e74 638 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
639 goto fail;
640 }
641 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 642 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 643 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
644 I32 slen;
645
1aa99e6b 646 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 647
653099ff
GS
648 if (SvTAIL(check)) {
649 slen = SvCUR(check); /* >= 1 */
cad2e5aa 650
9041c2e3 651 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 652 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 654 goto fail_finish;
cad2e5aa
JH
655 }
656 /* Now should match s[0..slen-2] */
657 slen--;
3f7c398e 658 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 659 || (slen > 1
3f7c398e 660 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 661 report_neq:
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
663 goto fail_finish;
664 }
cad2e5aa 665 }
3f7c398e 666 else if (*SvPVX_const(check) != *s
653099ff 667 || ((slen = SvCUR(check)) > 1
3f7c398e 668 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 669 goto report_neq;
c315bfe8 670 check_at = s;
2c2d71f5 671 goto success_at_start;
7e25d62c 672 }
cad2e5aa 673 }
2c2d71f5 674 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 675 s = strpos;
2c2d71f5 676 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
677 end_shift = prog->check_end_shift;
678
2c2d71f5 679 if (!ml_anch) {
a3b680e6 680 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 681 - (SvTAIL(check) != 0);
a3b680e6 682 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
683
684 if (end_shift < eshift)
685 end_shift = eshift;
686 }
cad2e5aa 687 }
2c2d71f5 688 else { /* Can match at random position */
cad2e5aa
JH
689 ml_anch = 0;
690 s = strpos;
1de06328
YO
691 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
692 end_shift = prog->check_end_shift;
693
694 /* end shift should be non negative here */
cad2e5aa
JH
695 }
696
bcdf7404 697#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 698 if (end_shift < 0)
1de06328 699 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 700 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
701#endif
702
2c2d71f5
JH
703 restart:
704 /* Find a possible match in the region s..strend by looking for
705 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
706
707 {
708 I32 srch_start_shift = start_shift;
709 I32 srch_end_shift = end_shift;
c33e64f0
FC
710 U8* start_point;
711 U8* end_point;
1de06328
YO
712 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
713 srch_end_shift -= ((strbeg - s) - srch_start_shift);
714 srch_start_shift = strbeg - s;
715 }
6bda09f9 716 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
717 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
718 (IV)prog->check_offset_min,
719 (IV)srch_start_shift,
720 (IV)srch_end_shift,
721 (IV)prog->check_end_shift);
722 });
723
bbe252da 724 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
725 start_point= (U8*)(s + srch_start_shift);
726 end_point= (U8*)(strend - srch_end_shift);
727 } else {
728 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
729 end_point= HOP3(strend, -srch_end_shift, strbeg);
730 }
6bda09f9 731 DEBUG_OPTIMISE_MORE_r({
56570a2c 732 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 733 (int)(end_point - start_point),
fc8cd66c 734 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
735 start_point);
736 });
737
738 s = fbm_instr( start_point, end_point,
7fba1cd6 739 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 740 }
cad2e5aa
JH
741 /* Update the count-of-usability, remove useless subpatterns,
742 unshift s. */
2c2d71f5 743
ab3bbdeb 744 DEBUG_EXECUTE_r({
f2ed9b32 745 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
746 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
747 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 748 (s ? "Found" : "Did not find"),
f2ed9b32 749 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
750 ? "anchored" : "floating"),
751 quoted,
752 RE_SV_TAIL(check),
753 (s ? " at offset " : "...\n") );
754 });
2c2d71f5
JH
755
756 if (!s)
757 goto fail_finish;
2c2d71f5 758 /* Finish the diagnostic message */
a3621e74 759 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 760
1de06328
YO
761 /* XXX dmq: first branch is for positive lookbehind...
762 Our check string is offset from the beginning of the pattern.
763 So we need to do any stclass tests offset forward from that
764 point. I think. :-(
765 */
766
767
768
769 check_at=s;
770
771
2c2d71f5
JH
772 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
773 Start with the other substr.
774 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 775 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
776 *always* match. Probably should be marked during compile...
777 Probably it is right to do no SCREAM here...
778 */
779
f2ed9b32 780 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
781 : (prog->float_substr && prog->anchored_substr))
782 {
30944b6d 783 /* Take into account the "other" substring. */
2c2d71f5
JH
784 /* XXXX May be hopelessly wrong for UTF... */
785 if (!other_last)
6eb5f6b9 786 other_last = strpos;
f2ed9b32 787 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
788 do_other_anchored:
789 {
890ce7af
AL
790 char * const last = HOP3c(s, -start_shift, strbeg);
791 char *last1, *last2;
be8e71aa 792 char * const saved_s = s;
33b8afdf 793 SV* must;
2c2d71f5 794
2c2d71f5
JH
795 t = s - prog->check_offset_max;
796 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 797 && (!utf8_target
0ce71af7 798 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 799 && t > strpos)))
6f207bd3 800 NOOP;
2c2d71f5
JH
801 else
802 t = strpos;
1aa99e6b 803 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
804 if (t < other_last) /* These positions already checked */
805 t = other_last;
1aa99e6b 806 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
807 if (last < last1)
808 last1 = last;
1de06328
YO
809 /* XXXX It is not documented what units *_offsets are in.
810 We assume bytes, but this is clearly wrong.
811 Meaning this code needs to be carefully reviewed for errors.
812 dmq.
813 */
814
2c2d71f5 815 /* On end-of-str: see comment below. */
f2ed9b32 816 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
817 if (must == &PL_sv_undef) {
818 s = (char*)NULL;
1de06328 819 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
820 }
821 else
822 s = fbm_instr(
823 (unsigned char*)t,
824 HOP3(HOP3(last1, prog->anchored_offset, strend)
825 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
826 must,
7fba1cd6 827 multiline ? FBMrf_MULTILINE : 0
33b8afdf 828 );
ab3bbdeb 829 DEBUG_EXECUTE_r({
f2ed9b32 830 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
831 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
832 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 833 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
834 quoted, RE_SV_TAIL(must));
835 });
836
837
2c2d71f5
JH
838 if (!s) {
839 if (last1 >= last2) {
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
841 ", giving up...\n"));
842 goto fail_finish;
843 }
a3621e74 844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 845 ", trying floating at offset %ld...\n",
be8e71aa 846 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
847 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
848 s = HOP3c(last, 1, strend);
2c2d71f5
JH
849 goto restart;
850 }
851 else {
a3621e74 852 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 853 (long)(s - i_strpos)));
1aa99e6b
IH
854 t = HOP3c(s, -prog->anchored_offset, strbeg);
855 other_last = HOP3c(s, 1, strend);
be8e71aa 856 s = saved_s;
2c2d71f5
JH
857 if (t == strpos)
858 goto try_at_start;
2c2d71f5
JH
859 goto try_at_offset;
860 }
30944b6d 861 }
2c2d71f5
JH
862 }
863 else { /* Take into account the floating substring. */
33b8afdf 864 char *last, *last1;
be8e71aa 865 char * const saved_s = s;
33b8afdf
JH
866 SV* must;
867
868 t = HOP3c(s, -start_shift, strbeg);
869 last1 = last =
870 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
871 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
872 last = HOP3c(t, prog->float_max_offset, strend);
873 s = HOP3c(t, prog->float_min_offset, strend);
874 if (s < other_last)
875 s = other_last;
2c2d71f5 876 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 877 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
878 /* fbm_instr() takes into account exact value of end-of-str
879 if the check is SvTAIL(ed). Since false positives are OK,
880 and end-of-str is not later than strend we are OK. */
881 if (must == &PL_sv_undef) {
882 s = (char*)NULL;
1de06328 883 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
884 }
885 else
2c2d71f5 886 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
887 (unsigned char*)last + SvCUR(must)
888 - (SvTAIL(must)!=0),
7fba1cd6 889 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 890 DEBUG_EXECUTE_r({
f2ed9b32 891 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
892 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
893 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 894 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
895 quoted, RE_SV_TAIL(must));
896 });
33b8afdf
JH
897 if (!s) {
898 if (last1 == last) {
a3621e74 899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
900 ", giving up...\n"));
901 goto fail_finish;
2c2d71f5 902 }
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 904 ", trying anchored starting at offset %ld...\n",
be8e71aa 905 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
906 other_last = last;
907 s = HOP3c(t, 1, strend);
908 goto restart;
909 }
910 else {
a3621e74 911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
912 (long)(s - i_strpos)));
913 other_last = s; /* Fix this later. --Hugo */
be8e71aa 914 s = saved_s;
33b8afdf
JH
915 if (t == strpos)
916 goto try_at_start;
917 goto try_at_offset;
918 }
2c2d71f5 919 }
cad2e5aa 920 }
2c2d71f5 921
1de06328 922
9ef43ace 923 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 924
6bda09f9 925 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
926 PerlIO_printf(Perl_debug_log,
927 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
928 (IV)prog->check_offset_min,
929 (IV)prog->check_offset_max,
930 (IV)(s-strpos),
931 (IV)(t-strpos),
932 (IV)(t-s),
933 (IV)(strend-strpos)
934 )
935 );
936
2c2d71f5 937 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 938 && (!utf8_target
9ef43ace 939 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
940 && t > strpos)))
941 {
2c2d71f5
JH
942 /* Fixed substring is found far enough so that the match
943 cannot start at strpos. */
944 try_at_offset:
cad2e5aa 945 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
946 /* Eventually fbm_*() should handle this, but often
947 anchored_offset is not 0, so this check will not be wasted. */
948 /* XXXX In the code below we prefer to look for "^" even in
949 presence of anchored substrings. And we search even
950 beyond the found float position. These pessimizations
951 are historical artefacts only. */
952 find_anchor:
2c2d71f5 953 while (t < strend - prog->minlen) {
cad2e5aa 954 if (*t == '\n') {
4ee3650e 955 if (t < check_at - prog->check_offset_min) {
f2ed9b32 956 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
957 /* Since we moved from the found position,
958 we definitely contradict the found anchored
30944b6d
IZ
959 substr. Due to the above check we do not
960 contradict "check" substr.
961 Thus we can arrive here only if check substr
962 is float. Redo checking for "other"=="fixed".
963 */
9041c2e3 964 strpos = t + 1;
a3621e74 965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 966 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
967 goto do_other_anchored;
968 }
4ee3650e
GS
969 /* We don't contradict the found floating substring. */
970 /* XXXX Why not check for STCLASS? */
cad2e5aa 971 s = t + 1;
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 973 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
974 goto set_useful;
975 }
4ee3650e
GS
976 /* Position contradicts check-string */
977 /* XXXX probably better to look for check-string
978 than for "\n", so one should lower the limit for t? */
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 980 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 981 other_last = strpos = s = t + 1;
cad2e5aa
JH
982 goto restart;
983 }
984 t++;
985 }
a3621e74 986 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 987 PL_colors[0], PL_colors[1]));
2c2d71f5 988 goto fail_finish;
cad2e5aa 989 }
f5952150 990 else {
a3621e74 991 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 992 PL_colors[0], PL_colors[1]));
f5952150 993 }
cad2e5aa
JH
994 s = t;
995 set_useful:
f2ed9b32 996 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
997 }
998 else {
f5952150 999 /* The found string does not prohibit matching at strpos,
2c2d71f5 1000 - no optimization of calling REx engine can be performed,
f5952150
GS
1001 unless it was an MBOL and we are not after MBOL,
1002 or a future STCLASS check will fail this. */
2c2d71f5
JH
1003 try_at_start:
1004 /* Even in this situation we may use MBOL flag if strpos is offset
1005 wrt the start of the string. */
05b4157f 1006 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1007 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1008 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1009 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1010 {
cad2e5aa
JH
1011 t = strpos;
1012 goto find_anchor;
1013 }
a3621e74 1014 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1015 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1016 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1017 );
2c2d71f5 1018 success_at_start:
bbe252da 1019 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1020 && (utf8_target ? (
33b8afdf
JH
1021 prog->check_utf8 /* Could be deleted already */
1022 && --BmUSEFUL(prog->check_utf8) < 0
1023 && (prog->check_utf8 == prog->float_utf8)
1024 ) : (
1025 prog->check_substr /* Could be deleted already */
1026 && --BmUSEFUL(prog->check_substr) < 0
1027 && (prog->check_substr == prog->float_substr)
1028 )))
66e933ab 1029 {
cad2e5aa 1030 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1031 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1032 /* XXX Does the destruction order has to change with utf8_target? */
1033 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1034 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1035 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1036 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1037 check = NULL; /* abort */
cad2e5aa 1038 s = strpos;
486ec47a 1039 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1040 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1041 if (prog->intflags & PREGf_IMPLICIT)
1042 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1043 /* XXXX This is a remnant of the old implementation. It
1044 looks wasteful, since now INTUIT can use many
6eb5f6b9 1045 other heuristics. */
bbe252da 1046 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1047 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1048 }
1049 else
1050 s = strpos;
1051 }
1052
6eb5f6b9
JH
1053 /* Last resort... */
1054 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1055 /* trie stclasses are too expensive to use here, we are better off to
1056 leave it to regmatch itself */
f8fc2ecf 1057 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1058 /* minlen == 0 is possible if regstclass is \b or \B,
1059 and the fixed substr is ''$.
1060 Since minlen is already taken into account, s+1 is before strend;
1061 accidentally, minlen >= 1 guaranties no false positives at s + 1
1062 even for \b or \B. But (minlen? 1 : 0) below assumes that
1063 regstclass does not come from lookahead... */
1064 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1065 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1066 const U8* const str = (U8*)STRING(progi->regstclass);
1067 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1068 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1069 : 1);
1de06328
YO
1070 char * endpos;
1071 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1072 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1073 else if (prog->float_substr || prog->float_utf8)
1074 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1075 else
1076 endpos= strend;
1077
d8080198
YO
1078 if (checked_upto < s)
1079 checked_upto = s;
1080 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1081 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1082
6eb5f6b9 1083 t = s;
d8080198
YO
1084 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1085 if (s) {
1086 checked_upto = s;
1087 } else {
6eb5f6b9 1088#ifdef DEBUGGING
cbbf8932 1089 const char *what = NULL;
6eb5f6b9
JH
1090#endif
1091 if (endpos == strend) {
a3621e74 1092 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1093 "Could not match STCLASS...\n") );
1094 goto fail;
1095 }
a3621e74 1096 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1097 "This position contradicts STCLASS...\n") );
bbe252da 1098 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1099 goto fail;
d8080198
YO
1100 checked_upto = HOPBACKc(endpos, start_shift);
1101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1102 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1103 /* Contradict one of substrings */
33b8afdf 1104 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1105 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1106 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1107 hop_and_restart:
1aa99e6b 1108 s = HOP3c(t, 1, strend);
66e933ab
GS
1109 if (s + start_shift + end_shift > strend) {
1110 /* XXXX Should be taken into account earlier? */
a3621e74 1111 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1112 "Could not match STCLASS...\n") );
1113 goto fail;
1114 }
5e39e1e5
HS
1115 if (!check)
1116 goto giveup;
a3621e74 1117 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1118 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1119 what, (long)(s + start_shift - i_strpos)) );
1120 goto restart;
1121 }
66e933ab 1122 /* Have both, check_string is floating */
6eb5f6b9
JH
1123 if (t + start_shift >= check_at) /* Contradicts floating=check */
1124 goto retry_floating_check;
1125 /* Recheck anchored substring, but not floating... */
9041c2e3 1126 s = check_at;
5e39e1e5
HS
1127 if (!check)
1128 goto giveup;
a3621e74 1129 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1130 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1131 (long)(other_last - i_strpos)) );
1132 goto do_other_anchored;
1133 }
60e71179
GS
1134 /* Another way we could have checked stclass at the
1135 current position only: */
1136 if (ml_anch) {
1137 s = t = t + 1;
5e39e1e5
HS
1138 if (!check)
1139 goto giveup;
a3621e74 1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1141 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1142 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1143 goto try_at_offset;
66e933ab 1144 }
f2ed9b32 1145 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1146 goto fail;
486ec47a 1147 /* Check is floating substring. */
6eb5f6b9
JH
1148 retry_floating_check:
1149 t = check_at - start_shift;
a3621e74 1150 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1151 goto hop_and_restart;
1152 }
b7953727 1153 if (t != s) {
a3621e74 1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1155 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1156 (long)(t - i_strpos), (long)(s - i_strpos))
1157 );
1158 }
1159 else {
a3621e74 1160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1161 "Does not contradict STCLASS...\n");
1162 );
1163 }
6eb5f6b9 1164 }
5e39e1e5 1165 giveup:
a3621e74 1166 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1167 PL_colors[4], (check ? "Guessed" : "Giving up"),
1168 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1169 return s;
2c2d71f5
JH
1170
1171 fail_finish: /* Substring not found */
33b8afdf 1172 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1173 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1174 fail:
a3621e74 1175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1176 PL_colors[4], PL_colors[5]));
bd61b366 1177 return NULL;
cad2e5aa 1178}
9661b544 1179
a0a388a1
YO
1180#define DECL_TRIE_TYPE(scan) \
1181 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1182 trie_type = ((scan->flags == EXACT) \
1183 ? (utf8_target ? trie_utf8 : trie_plain) \
1184 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1185
1186#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1187uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1188 STRLEN skiplen; \
1189 switch (trie_type) { \
1190 case trie_utf8_fold: \
1191 if ( foldlen>0 ) { \
1192 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1193 foldlen -= len; \
1194 uscan += len; \
1195 len=0; \
1196 } else { \
1197 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1198 len = UTF8SKIP(uc); \
1199 skiplen = UNISKIP( uvc ); \
1200 foldlen -= skiplen; \
1201 uscan = foldbuf + skiplen; \
1202 } \
1203 break; \
1204 case trie_latin_utf8_fold: \
1205 if ( foldlen>0 ) { \
1206 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1207 foldlen -= len; \
1208 uscan += len; \
1209 len=0; \
1210 } else { \
1211 len = 1; \
1212 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1213 skiplen = UNISKIP( uvc ); \
1214 foldlen -= skiplen; \
1215 uscan = foldbuf + skiplen; \
1216 } \
1217 break; \
1218 case trie_utf8: \
1219 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1220 break; \
1221 case trie_plain: \
1222 uvc = (UV)*uc; \
1223 len = 1; \
1224 } \
1225 if (uvc < 256) { \
1226 charid = trie->charmap[ uvc ]; \
1227 } \
1228 else { \
1229 charid = 0; \
1230 if (widecharmap) { \
1231 SV** const svpp = hv_fetch(widecharmap, \
1232 (char*)&uvc, sizeof(UV), 0); \
1233 if (svpp) \
1234 charid = (U16)SvIV(*svpp); \
1235 } \
1236 } \
4cadc6a9
YO
1237} STMT_END
1238
4cadc6a9
YO
1239#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1240STMT_START { \
1241 while (s <= e) { \
1242 if ( (CoNd) \
fac1af77 1243 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1244 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1245 goto got_it; \
1246 s++; \
1247 } \
1248} STMT_END
1249
1250#define REXEC_FBC_UTF8_SCAN(CoDe) \
1251STMT_START { \
1252 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1253 CoDe \
1254 s += uskip; \
1255 } \
1256} STMT_END
1257
1258#define REXEC_FBC_SCAN(CoDe) \
1259STMT_START { \
1260 while (s < strend) { \
1261 CoDe \
1262 s++; \
1263 } \
1264} STMT_END
1265
1266#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1267REXEC_FBC_UTF8_SCAN( \
1268 if (CoNd) { \
24b23f37 1269 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1270 goto got_it; \
1271 else \
1272 tmp = doevery; \
1273 } \
1274 else \
1275 tmp = 1; \
1276)
1277
1278#define REXEC_FBC_CLASS_SCAN(CoNd) \
1279REXEC_FBC_SCAN( \
1280 if (CoNd) { \
24b23f37 1281 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1282 goto got_it; \
1283 else \
1284 tmp = doevery; \
1285 } \
1286 else \
1287 tmp = 1; \
1288)
1289
1290#define REXEC_FBC_TRYIT \
24b23f37 1291if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1292 goto got_it
1293
e1d1eefb 1294#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1295 if (utf8_target) { \
e1d1eefb
YO
1296 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1297 } \
1298 else { \
1299 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1300 }
e1d1eefb 1301
4cadc6a9 1302#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1303 if (utf8_target) { \
4cadc6a9
YO
1304 UtFpReLoAd; \
1305 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1306 } \
1307 else { \
1308 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1309 }
4cadc6a9
YO
1310
1311#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1312 PL_reg_flags |= RF_tainted; \
f2ed9b32 1313 if (utf8_target) { \
4cadc6a9
YO
1314 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1315 } \
1316 else { \
1317 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1318 }
4cadc6a9 1319
786e8c11
YO
1320#define DUMP_EXEC_POS(li,s,doutf8) \
1321 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1322
cfaf538b
KW
1323
1324#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1325 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1326 tmp = TEST_NON_UTF8(tmp); \
1327 REXEC_FBC_UTF8_SCAN( \
1328 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1329 tmp = !tmp; \
1330 IF_SUCCESS; \
1331 } \
1332 else { \
1333 IF_FAIL; \
1334 } \
1335 ); \
1336
1337#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1338 if (s == PL_bostr) { \
1339 tmp = '\n'; \
1340 } \
1341 else { \
1342 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1343 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1344 } \
1345 tmp = TeSt1_UtF8; \
1346 LOAD_UTF8_CHARCLASS_ALNUM(); \
1347 REXEC_FBC_UTF8_SCAN( \
1348 if (tmp == ! (TeSt2_UtF8)) { \
1349 tmp = !tmp; \
1350 IF_SUCCESS; \
1351 } \
1352 else { \
1353 IF_FAIL; \
1354 } \
1355 ); \
1356
63ac0dad
KW
1357/* The only difference between the BOUND and NBOUND cases is that
1358 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1359 * NBOUND. This is accomplished by passing it in either the if or else clause,
1360 * with the other one being empty */
1361#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1362 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1363
1364#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1365 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1366
1367#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1368 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1369
1370#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1371 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1372
63ac0dad
KW
1373
1374/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1375 * be passed in completely with the variable name being tested, which isn't
1376 * such a clean interface, but this is easier to read than it was before. We
1377 * are looking for the boundary (or non-boundary between a word and non-word
1378 * character. The utf8 and non-utf8 cases have the same logic, but the details
1379 * must be different. Find the "wordness" of the character just prior to this
1380 * one, and compare it with the wordness of this one. If they differ, we have
1381 * a boundary. At the beginning of the string, pretend that the previous
1382 * character was a new-line */
cfaf538b 1383#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1384 if (utf8_target) { \
cfaf538b 1385 UTF8_CODE \
63ac0dad
KW
1386 } \
1387 else { /* Not utf8 */ \
1388 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1389 tmp = TEST_NON_UTF8(tmp); \
1390 REXEC_FBC_SCAN( \
1391 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1392 tmp = !tmp; \
1393 IF_SUCCESS; \
1394 } \
1395 else { \
1396 IF_FAIL; \
1397 } \
1398 ); \
1399 } \
1400 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1401 goto got_it;
1402
786e8c11
YO
1403/* We know what class REx starts with. Try to find this position... */
1404/* if reginfo is NULL, its a dryrun */
1405/* annoyingly all the vars in this routine have different names from their counterparts
1406 in regmatch. /grrr */
1407
3c3eec57 1408STATIC char *
07be1b83 1409S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1410 const char *strend, regmatch_info *reginfo)
a687059c 1411{
27da23d5 1412 dVAR;
bbe252da 1413 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1414 char *pat_string; /* The pattern's exactish string */
1415 char *pat_end; /* ptr to end char of pat_string */
1416 re_fold_t folder; /* Function for computing non-utf8 folds */
1417 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1418 STRLEN ln;
5dab1207 1419 STRLEN lnc;
eb578fdb 1420 STRLEN uskip;
fac1af77
KW
1421 U8 c1;
1422 U8 c2;
6eb5f6b9 1423 char *e;
eb578fdb
KW
1424 I32 tmp = 1; /* Scratch variable? */
1425 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1426 UV utf8_fold_flags = 0;
f8fc2ecf 1427 RXi_GET_DECL(prog,progi);
7918f24d
NC
1428
1429 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1430
6eb5f6b9
JH
1431 /* We know what class it must start with. */
1432 switch (OP(c)) {
f56b6394 1433 case ANYOFV:
6eb5f6b9 1434 case ANYOF:
f56b6394 1435 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1436 STRLEN inclasslen = strend - s;
1437 REXEC_FBC_UTF8_CLASS_SCAN(
1438 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1439 }
1440 else {
6ef69d56 1441 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1442 }
6eb5f6b9 1443 break;
f33976b4 1444 case CANY:
4cadc6a9 1445 REXEC_FBC_SCAN(
24b23f37 1446 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1447 goto got_it;
1448 else
1449 tmp = doevery;
4cadc6a9 1450 );
f33976b4 1451 break;
2f7f8cb1
KW
1452
1453 case EXACTFA:
1454 if (UTF_PATTERN || utf8_target) {
1455 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1456 goto do_exactf_utf8;
1457 }
1458 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1459 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1460 goto do_exactf_non_utf8; /* isn't dealt with by these */
1461
6eb5f6b9 1462 case EXACTF:
62bf7766 1463 if (utf8_target) {
77a6d856
KW
1464
1465 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1466 utf8_fold_flags = 0;
fac1af77
KW
1467 goto do_exactf_utf8;
1468 }
1469 fold_array = PL_fold;
1470 folder = foldEQ;
1471 goto do_exactf_non_utf8;
1472
1473 case EXACTFL:
1474 if (UTF_PATTERN || utf8_target) {
17580e7a 1475 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1476 goto do_exactf_utf8;
1477 }
1478 fold_array = PL_fold_locale;
1479 folder = foldEQ_locale;
16d951b7
KW
1480 goto do_exactf_non_utf8;
1481
3c760661
KW
1482 case EXACTFU_SS:
1483 if (UTF_PATTERN) {
1484 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1485 }
1486 goto do_exactf_utf8;
1487
fab2782b 1488 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1489 case EXACTFU:
1490 if (UTF_PATTERN || utf8_target) {
77a6d856 1491 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1492 goto do_exactf_utf8;
1493 }
1494
1495 /* Any 'ss' in the pattern should have been replaced by regcomp,
1496 * so we don't have to worry here about this single special case
1497 * in the Latin1 range */
1498 fold_array = PL_fold_latin1;
1499 folder = foldEQ_latin1;
fac1af77
KW
1500
1501 /* FALL THROUGH */
1502
62bf7766
KW
1503 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1504 are no glitches with fold-length differences
1505 between the target string and pattern */
fac1af77
KW
1506
1507 /* The idea in the non-utf8 EXACTF* cases is to first find the
1508 * first character of the EXACTF* node and then, if necessary,
1509 * case-insensitively compare the full text of the node. c1 is the
1510 * first character. c2 is its fold. This logic will not work for
1511 * Unicode semantics and the german sharp ss, which hence should
1512 * not be compiled into a node that gets here. */
1513 pat_string = STRING(c);
1514 ln = STR_LEN(c); /* length to match in octets/bytes */
1515
8a90a8fe
KW
1516 /* We know that we have to match at least 'ln' bytes (which is the
1517 * same as characters, since not utf8). If we have to match 3
1518 * characters, and there are only 2 availabe, we know without
1519 * trying that it will fail; so don't start a match past the
1520 * required minimum number from the far end */
fac1af77
KW
1521 e = HOP3c(strend, -((I32)ln), s);
1522
1523 if (!reginfo && e < s) {
1524 e = s; /* Due to minlen logic of intuit() */
1525 }
1526
1527 c1 = *pat_string;
1528 c2 = fold_array[c1];
1529 if (c1 == c2) { /* If char and fold are the same */
1530 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1531 }
1532 else {
1533 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1534 }
1535 break;
1536
1537 do_exactf_utf8:
e067297c
KW
1538 {
1539 unsigned expansion;
1540
fac1af77
KW
1541
1542 /* If one of the operands is in utf8, we can't use the simpler
1543 * folding above, due to the fact that many different characters
1544 * can have the same fold, or portion of a fold, or different-
1545 * length fold */
1546 pat_string = STRING(c);
1547 ln = STR_LEN(c); /* length to match in octets/bytes */
1548 pat_end = pat_string + ln;
1549 lnc = (UTF_PATTERN) /* length to match in characters */
1550 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1551 : ln;
1552
e067297c
KW
1553 /* We have 'lnc' characters to match in the pattern, but because of
1554 * multi-character folding, each character in the target can match
1555 * up to 3 characters (Unicode guarantees it will never exceed
1556 * this) if it is utf8-encoded; and up to 2 if not (based on the
1557 * fact that the Latin 1 folds are already determined, and the
1558 * only multi-char fold in that range is the sharp-s folding to
1559 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1560 * string character. Adjust lnc accordingly, rounding up, so that
1561 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1562 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1563 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1564
1565 /* As in the non-UTF8 case, if we have to match 3 characters, and
1566 * only 2 are left, it's guaranteed to fail, so don't start a
1567 * match that would require us to go beyond the end of the string
1568 */
1569 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1570
1571 if (!reginfo && e < s) {
1572 e = s; /* Due to minlen logic of intuit() */
1573 }
1574
b33105db
KW
1575 /* XXX Note that we could recalculate e to stop the loop earlier,
1576 * as the worst case expansion above will rarely be met, and as we
1577 * go along we would usually find that e moves further to the left.
1578 * This would happen only after we reached the point in the loop
1579 * where if there were no expansion we should fail. Unclear if
1580 * worth the expense */
e067297c 1581
fac1af77
KW
1582 while (s <= e) {
1583 char *my_strend= (char *)strend;
1584 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1585 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1586 && (!reginfo || regtry(reginfo, &s)) )
1587 {
1588 goto got_it;
1589 }
bbdd8bad 1590 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1591 }
1592 break;
e067297c 1593 }
bbce6d69 1594 case BOUNDL:
3280af22 1595 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1596 FBC_BOUND(isALNUM_LC,
1597 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1598 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1599 break;
bbce6d69 1600 case NBOUNDL:
3280af22 1601 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1602 FBC_NBOUND(isALNUM_LC,
1603 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1604 isALNUM_LC_utf8((U8*)s));
1605 break;
1606 case BOUND:
1607 FBC_BOUND(isWORDCHAR,
1608 isALNUM_uni(tmp),
1609 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1610 break;
cfaf538b
KW
1611 case BOUNDA:
1612 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1613 isWORDCHAR_A(tmp),
1614 isWORDCHAR_A((U8*)s));
1615 break;
a0d0e21e 1616 case NBOUND:
63ac0dad
KW
1617 FBC_NBOUND(isWORDCHAR,
1618 isALNUM_uni(tmp),
1619 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1620 break;
cfaf538b
KW
1621 case NBOUNDA:
1622 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1623 isWORDCHAR_A(tmp),
1624 isWORDCHAR_A((U8*)s));
1625 break;
63ac0dad
KW
1626 case BOUNDU:
1627 FBC_BOUND(isWORDCHAR_L1,
1628 isALNUM_uni(tmp),
1629 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1630 break;
1631 case NBOUNDU:
1632 FBC_NBOUND(isWORDCHAR_L1,
1633 isALNUM_uni(tmp),
1634 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1635 break;
bbce6d69 1636 case ALNUML:
4cadc6a9
YO
1637 REXEC_FBC_CSCAN_TAINT(
1638 isALNUM_LC_utf8((U8*)s),
1639 isALNUM_LC(*s)
1640 );
6895a8aa 1641 break;
980866de
KW
1642 case ALNUMU:
1643 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1644 LOAD_UTF8_CHARCLASS_ALNUM(),
1645 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1646 isWORDCHAR_L1((U8) *s)
1647 );
6895a8aa 1648 break;
980866de
KW
1649 case ALNUM:
1650 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1651 LOAD_UTF8_CHARCLASS_ALNUM(),
1652 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1653 isWORDCHAR((U8) *s)
1654 );
6895a8aa 1655 break;
cfaf538b 1656 case ALNUMA:
8e9da4d4
KW
1657 /* Don't need to worry about utf8, as it can match only a single
1658 * byte invariant character */
cfaf538b 1659 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1660 break;
980866de
KW
1661 case NALNUMU:
1662 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1663 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1664 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1665 ! isWORDCHAR_L1((U8) *s)
1666 );
6895a8aa 1667 break;
a0d0e21e 1668 case NALNUM:
4cadc6a9 1669 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1670 LOAD_UTF8_CHARCLASS_ALNUM(),
1671 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1672 ! isALNUM(*s)
4cadc6a9 1673 );
6895a8aa 1674 break;
cfaf538b 1675 case NALNUMA:
8e9da4d4
KW
1676 REXEC_FBC_CSCAN(
1677 !isWORDCHAR_A(*s),
1678 !isWORDCHAR_A(*s)
1679 );
1680 break;
bbce6d69 1681 case NALNUML:
4cadc6a9
YO
1682 REXEC_FBC_CSCAN_TAINT(
1683 !isALNUM_LC_utf8((U8*)s),
1684 !isALNUM_LC(*s)
1685 );
6895a8aa 1686 break;
980866de
KW
1687 case SPACEU:
1688 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1689 LOAD_UTF8_CHARCLASS_SPACE(),
1690 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1691 isSPACE_L1((U8) *s)
1692 );
6895a8aa 1693 break;
a0d0e21e 1694 case SPACE:
4cadc6a9 1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1698 isSPACE((U8) *s)
4cadc6a9 1699 );
6895a8aa 1700 break;
cfaf538b 1701 case SPACEA:
8e9da4d4
KW
1702 /* Don't need to worry about utf8, as it can match only a single
1703 * byte invariant character */
cfaf538b 1704 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1705 break;
bbce6d69 1706 case SPACEL:
4cadc6a9 1707 REXEC_FBC_CSCAN_TAINT(
6bbba904 1708 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1709 isSPACE_LC(*s)
1710 );
6895a8aa 1711 break;
980866de
KW
1712 case NSPACEU:
1713 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1714 LOAD_UTF8_CHARCLASS_SPACE(),
1715 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1716 ! isSPACE_L1((U8) *s)
1717 );
6895a8aa 1718 break;
a0d0e21e 1719 case NSPACE:
4cadc6a9 1720 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1721 LOAD_UTF8_CHARCLASS_SPACE(),
1722 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1723 ! isSPACE((U8) *s)
4cadc6a9 1724 );
6895a8aa 1725 break;
cfaf538b 1726 case NSPACEA:
8e9da4d4
KW
1727 REXEC_FBC_CSCAN(
1728 !isSPACE_A(*s),
1729 !isSPACE_A(*s)
1730 );
1731 break;
bbce6d69 1732 case NSPACEL:
4cadc6a9 1733 REXEC_FBC_CSCAN_TAINT(
6bbba904 1734 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1735 !isSPACE_LC(*s)
1736 );
6895a8aa 1737 break;
a0d0e21e 1738 case DIGIT:
4cadc6a9 1739 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1740 LOAD_UTF8_CHARCLASS_DIGIT(),
1741 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1742 isDIGIT(*s)
1743 );
6895a8aa 1744 break;
cfaf538b 1745 case DIGITA:
8e9da4d4
KW
1746 /* Don't need to worry about utf8, as it can match only a single
1747 * byte invariant character */
cfaf538b 1748 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1749 break;
b8c5462f 1750 case DIGITL:
4cadc6a9
YO
1751 REXEC_FBC_CSCAN_TAINT(
1752 isDIGIT_LC_utf8((U8*)s),
1753 isDIGIT_LC(*s)
1754 );
6895a8aa 1755 break;
a0d0e21e 1756 case NDIGIT:
4cadc6a9 1757 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1758 LOAD_UTF8_CHARCLASS_DIGIT(),
1759 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1760 !isDIGIT(*s)
1761 );
6895a8aa 1762 break;
cfaf538b 1763 case NDIGITA:
8e9da4d4
KW
1764 REXEC_FBC_CSCAN(
1765 !isDIGIT_A(*s),
1766 !isDIGIT_A(*s)
1767 );
1768 break;
b8c5462f 1769 case NDIGITL:
4cadc6a9
YO
1770 REXEC_FBC_CSCAN_TAINT(
1771 !isDIGIT_LC_utf8((U8*)s),
1772 !isDIGIT_LC(*s)
1773 );
6895a8aa 1774 break;
e1d1eefb
YO
1775 case LNBREAK:
1776 REXEC_FBC_CSCAN(
1777 is_LNBREAK_utf8(s),
1778 is_LNBREAK_latin1(s)
1779 );
6895a8aa 1780 break;
e1d1eefb
YO
1781 case VERTWS:
1782 REXEC_FBC_CSCAN(
1783 is_VERTWS_utf8(s),
1784 is_VERTWS_latin1(s)
1785 );
6895a8aa 1786 break;
e1d1eefb
YO
1787 case NVERTWS:
1788 REXEC_FBC_CSCAN(
1789 !is_VERTWS_utf8(s),
1790 !is_VERTWS_latin1(s)
1791 );
6895a8aa 1792 break;
e1d1eefb
YO
1793 case HORIZWS:
1794 REXEC_FBC_CSCAN(
1795 is_HORIZWS_utf8(s),
1796 is_HORIZWS_latin1(s)
1797 );
6895a8aa 1798 break;
e1d1eefb
YO
1799 case NHORIZWS:
1800 REXEC_FBC_CSCAN(
1801 !is_HORIZWS_utf8(s),
1802 !is_HORIZWS_latin1(s)
1803 );
6895a8aa 1804 break;
0658cdde
KW
1805 case POSIXA:
1806 /* Don't need to worry about utf8, as it can match only a single
1807 * byte invariant character. The flag in this node type is the
1808 * class number to pass to _generic_isCC() to build a mask for
1809 * searching in PL_charclass[] */
1810 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1811 break;
1812 case NPOSIXA:
1813 REXEC_FBC_CSCAN(
1814 !_generic_isCC_A(*s, FLAGS(c)),
1815 !_generic_isCC_A(*s, FLAGS(c))
1816 );
1817 break;
1818
1de06328
YO
1819 case AHOCORASICKC:
1820 case AHOCORASICK:
07be1b83 1821 {
a0a388a1 1822 DECL_TRIE_TYPE(c);
07be1b83
YO
1823 /* what trie are we using right now */
1824 reg_ac_data *aho
f8fc2ecf 1825 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1826 reg_trie_data *trie
1827 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1828 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1829
1830 const char *last_start = strend - trie->minlen;
6148ee25 1831#ifdef DEBUGGING
07be1b83 1832 const char *real_start = s;
6148ee25 1833#endif
07be1b83 1834 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1835 SV *sv_points;
1836 U8 **points; /* map of where we were in the input string
786e8c11 1837 when reading a given char. For ASCII this
be8e71aa 1838 is unnecessary overhead as the relationship
38a44b82
NC
1839 is always 1:1, but for Unicode, especially
1840 case folded Unicode this is not true. */
f9e705e8 1841 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1842 U8 *bitmap=NULL;
1843
07be1b83
YO
1844
1845 GET_RE_DEBUG_FLAGS_DECL;
1846
be8e71aa
YO
1847 /* We can't just allocate points here. We need to wrap it in
1848 * an SV so it gets freed properly if there is a croak while
1849 * running the match */
1850 ENTER;
1851 SAVETMPS;
1852 sv_points=newSV(maxlen * sizeof(U8 *));
1853 SvCUR_set(sv_points,
1854 maxlen * sizeof(U8 *));
1855 SvPOK_on(sv_points);
1856 sv_2mortal(sv_points);
1857 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1858 if ( trie_type != trie_utf8_fold
1859 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1860 {
786e8c11
YO
1861 if (trie->bitmap)
1862 bitmap=(U8*)trie->bitmap;
1863 else
1864 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1865 }
786e8c11
YO
1866 /* this is the Aho-Corasick algorithm modified a touch
1867 to include special handling for long "unknown char"
1868 sequences. The basic idea being that we use AC as long
1869 as we are dealing with a possible matching char, when
1870 we encounter an unknown char (and we have not encountered
1871 an accepting state) we scan forward until we find a legal
1872 starting char.
1873 AC matching is basically that of trie matching, except
1874 that when we encounter a failing transition, we fall back
1875 to the current states "fail state", and try the current char
1876 again, a process we repeat until we reach the root state,
1877 state 1, or a legal transition. If we fail on the root state
1878 then we can either terminate if we have reached an accepting
1879 state previously, or restart the entire process from the beginning
1880 if we have not.
1881
1882 */
07be1b83
YO
1883 while (s <= last_start) {
1884 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885 U8 *uc = (U8*)s;
1886 U16 charid = 0;
1887 U32 base = 1;
1888 U32 state = 1;
1889 UV uvc = 0;
1890 STRLEN len = 0;
1891 STRLEN foldlen = 0;
1892 U8 *uscan = (U8*)NULL;
1893 U8 *leftmost = NULL;
786e8c11
YO
1894#ifdef DEBUGGING
1895 U32 accepted_word= 0;
1896#endif
07be1b83
YO
1897 U32 pointpos = 0;
1898
1899 while ( state && uc <= (U8*)strend ) {
1900 int failed=0;
786e8c11
YO
1901 U32 word = aho->states[ state ].wordnum;
1902
1de06328
YO
1903 if( state==1 ) {
1904 if ( bitmap ) {
1905 DEBUG_TRIE_EXECUTE_r(
1906 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1908 (char *)uc, utf8_target );
1de06328
YO
1909 PerlIO_printf( Perl_debug_log,
1910 " Scanning for legal start char...\n");
1911 }
d085b490
YO
1912 );
1913 if (utf8_target) {
1914 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915 uc += UTF8SKIP(uc);
1916 }
1917 } else {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 uc++;
1920 }
1921 }
1de06328 1922 s= (char *)uc;
786e8c11 1923 }
786e8c11
YO
1924 if (uc >(U8*)last_start) break;
1925 }
1926
1927 if ( word ) {
2e64971a 1928 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1929 if (!leftmost || lpos < leftmost) {
1930 DEBUG_r(accepted_word=word);
07be1b83 1931 leftmost= lpos;
786e8c11 1932 }
07be1b83 1933 if (base==0) break;
786e8c11 1934
07be1b83
YO
1935 }
1936 points[pointpos++ % maxlen]= uc;
55eed653
NC
1937 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1938 uscan, len, uvc, charid, foldlen,
1939 foldbuf, uniflags);
786e8c11
YO
1940 DEBUG_TRIE_EXECUTE_r({
1941 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1942 s, utf8_target );
07be1b83 1943 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1944 " Charid:%3u CP:%4"UVxf" ",
1945 charid, uvc);
1946 });
07be1b83
YO
1947
1948 do {
6148ee25 1949#ifdef DEBUGGING
786e8c11 1950 word = aho->states[ state ].wordnum;
6148ee25 1951#endif
07be1b83
YO
1952 base = aho->states[ state ].trans.base;
1953
786e8c11
YO
1954 DEBUG_TRIE_EXECUTE_r({
1955 if (failed)
1956 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1957 s, utf8_target );
07be1b83 1958 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1959 "%sState: %4"UVxf", word=%"UVxf,
1960 failed ? " Fail transition to " : "",
1961 (UV)state, (UV)word);
1962 });
07be1b83
YO
1963 if ( base ) {
1964 U32 tmp;
6dd2be57 1965 I32 offset;
07be1b83 1966 if (charid &&
6dd2be57
DM
1967 ( ((offset = base + charid
1968 - 1 - trie->uniquecharcount)) >= 0)
1969 && ((U32)offset < trie->lasttrans)
1970 && trie->trans[offset].check == state
1971 && (tmp=trie->trans[offset].next))
07be1b83 1972 {
786e8c11
YO
1973 DEBUG_TRIE_EXECUTE_r(
1974 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1975 state = tmp;
1976 break;
1977 }
1978 else {
786e8c11
YO
1979 DEBUG_TRIE_EXECUTE_r(
1980 PerlIO_printf( Perl_debug_log," - fail\n"));
1981 failed = 1;
1982 state = aho->fail[state];
07be1b83
YO
1983 }
1984 }
1985 else {
1986 /* we must be accepting here */
786e8c11
YO
1987 DEBUG_TRIE_EXECUTE_r(
1988 PerlIO_printf( Perl_debug_log," - accepting\n"));
1989 failed = 1;
07be1b83
YO
1990 break;
1991 }
1992 } while(state);
786e8c11 1993 uc += len;
07be1b83
YO
1994 if (failed) {
1995 if (leftmost)
1996 break;
786e8c11 1997 if (!state) state = 1;
07be1b83
YO
1998 }
1999 }
2000 if ( aho->states[ state ].wordnum ) {
2e64971a 2001 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2002 if (!leftmost || lpos < leftmost) {
2003 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2004 leftmost = lpos;
786e8c11 2005 }
07be1b83 2006 }
07be1b83
YO
2007 if (leftmost) {
2008 s = (char*)leftmost;
786e8c11
YO
2009 DEBUG_TRIE_EXECUTE_r({
2010 PerlIO_printf(
70685ca0
JH
2011 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2012 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2013 );
2014 });
24b23f37 2015 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2016 FREETMPS;
2017 LEAVE;
07be1b83 2018 goto got_it;
be8e71aa 2019 }
07be1b83 2020 s = HOPc(s,1);
786e8c11
YO
2021 DEBUG_TRIE_EXECUTE_r({
2022 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2023 });
07be1b83 2024 } else {
786e8c11
YO
2025 DEBUG_TRIE_EXECUTE_r(
2026 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2027 break;
2028 }
2029 }
be8e71aa
YO
2030 FREETMPS;
2031 LEAVE;
07be1b83
YO
2032 }
2033 break;
b3c9acc1 2034 default:
3c3eec57
GS
2035 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2036 break;
d6a28714 2037 }
6eb5f6b9
JH
2038 return 0;
2039 got_it:
2040 return s;
2041}
2042
fae667d5 2043
6eb5f6b9
JH
2044/*
2045 - regexec_flags - match a regexp against a string
2046 */
2047I32
288b8c02 2048Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2049 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2050/* stringarg: the point in the string at which to begin matching */
2051/* strend: pointer to null at end of string */
2052/* strbeg: real beginning of string */
2053/* minend: end of match must be >= minend bytes after stringarg. */
2054/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2055 * itself is accessed via the pointers above */
2056/* data: May be used for some additional optimizations.
2057 Currently its only used, with a U32 cast, for transmitting
2058 the ganch offset when doing a /g match. This will change */
2059/* nosave: For optimizations. */
2060
6eb5f6b9 2061{
97aff369 2062 dVAR;
288b8c02 2063 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2064 /*register*/ char *s;
eb578fdb 2065 regnode *c;
24b23f37 2066 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2067 I32 minlen; /* must match at least this many chars */
2068 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2069 I32 end_shift = 0; /* Same for the end. */ /* CC */
2070 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2071 char *scream_olds = NULL;
f2ed9b32 2072 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2073 I32 multiline;
f8fc2ecf 2074 RXi_GET_DECL(prog,progi);
3b0527fe 2075 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2076 regexp_paren_pair *swap = NULL;
a3621e74
YO
2077 GET_RE_DEBUG_FLAGS_DECL;
2078
7918f24d 2079 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2080 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2081
2082 /* Be paranoid... */
2083 if (prog == NULL || startpos == NULL) {
2084 Perl_croak(aTHX_ "NULL regexp parameter");
2085 return 0;
2086 }
2087
bbe252da 2088 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2089 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2090
f2ed9b32 2091 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2092 DEBUG_EXECUTE_r(
f2ed9b32 2093 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2094 "Matching");
2095 );
bac06658 2096
6eb5f6b9 2097 minlen = prog->minlen;
1de06328
YO
2098
2099 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2101 "String too short [regexec_flags]...\n"));
2102 goto phooey;
1aa99e6b 2103 }
6eb5f6b9 2104
1de06328 2105
6eb5f6b9 2106 /* Check validity of program. */
f8fc2ecf 2107 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2108 Perl_croak(aTHX_ "corrupted regexp program");
2109 }
2110
2111 PL_reg_flags = 0;
ed301438 2112 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2113 PL_reg_maxiter = 0;
2114
3c8556c3 2115 if (RX_UTF8(rx))
6eb5f6b9
JH
2116 PL_reg_flags |= RF_utf8;
2117
2118 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2119 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2120 PL_bostr = strbeg;
3b0527fe 2121 reginfo.sv = sv;
6eb5f6b9
JH
2122
2123 /* Mark end of line for $ (and such) */
2124 PL_regeol = strend;
2125
2126 /* see how far we have to get to not match where we matched before */
3b0527fe 2127 reginfo.till = startpos+minend;
6eb5f6b9 2128
6eb5f6b9
JH
2129 /* If there is a "must appear" string, look for it. */
2130 s = startpos;
2131
bbe252da 2132 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2133 MAGIC *mg;
2c296965 2134 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2135 reginfo.ganch = startpos + prog->gofs;
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2138 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2139 && SvMAGIC(sv)
14befaf4
DM
2140 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141 && mg->mg_len >= 0) {
3b0527fe 2142 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2145
bbe252da 2146 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2147 if (s > reginfo.ganch)
6eb5f6b9 2148 goto phooey;
58e23c8d 2149 s = reginfo.ganch - prog->gofs;
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2152 if (s < strbeg)
2153 goto phooey;
6eb5f6b9
JH
2154 }
2155 }
58e23c8d 2156 else if (data) {
70685ca0 2157 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161 } else { /* pos() not defined */
3b0527fe 2162 reginfo.ganch = strbeg;
2c296965
YO
2163 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164 "GPOS: reginfo.ganch = strbeg\n"));
2165 }
6eb5f6b9 2166 }
288b8c02 2167 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2168 /* We have to be careful. If the previous successful match
2169 was from this regex we don't want a subsequent partially
2170 successful match to clobber the old results.
2171 So when we detect this possibility we add a swap buffer
2172 to the re, and switch the buffer each match. If we fail
2173 we switch it back, otherwise we leave it swapped.
2174 */
2175 swap = prog->offs;
2176 /* do we need a save destructor here for eval dies? */
2177 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2178 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180 PTR2UV(prog),
2181 PTR2UV(swap),
2182 PTR2UV(prog->offs)
2183 ));
c74340f9 2184 }
a0714e2c 2185 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2186 re_scream_pos_data d;
2187
2188 d.scream_olds = &scream_olds;
2189 d.scream_pos = &scream_pos;
288b8c02 2190 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2191 if (!s) {
a3621e74 2192 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2193 goto phooey; /* not present */
3fa9c3d7 2194 }
6eb5f6b9
JH
2195 }
2196
1de06328 2197
6eb5f6b9
JH
2198
2199 /* Simplest case: anchored match need be tried only once. */
2200 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2201 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2202 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2203 goto got_it;
bbe252da
YO
2204 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2206 {
2207 char *end;
2208
2209 if (minlen)
2210 dontbother = minlen - 1;
1aa99e6b 2211 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2212 /* for multiline we only have to try after newlines */
33b8afdf 2213 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2214 /* because of the goto we can not easily reuse the macros for bifurcating the
2215 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216 if (utf8_target) {
2217 if (s == startpos)
2218 goto after_try_utf8;
2219 while (1) {
2220 if (regtry(&reginfo, &s)) {
2221 goto got_it;
2222 }
2223 after_try_utf8:
2224 if (s > end) {
2225 goto phooey;
2226 }
2227 if (prog->extflags & RXf_USE_INTUIT) {
2228 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229 if (!s) {
2230 goto phooey;
2231 }
2232 }
2233 else {
2234 s += UTF8SKIP(s);
2235 }
2236 }
2237 } /* end search for check string in unicode */
2238 else {
2239 if (s == startpos) {
2240 goto after_try_latin;
2241 }
2242 while (1) {
2243 if (regtry(&reginfo, &s)) {
2244 goto got_it;
2245 }
2246 after_try_latin:
2247 if (s > end) {
2248 goto phooey;
2249 }
2250 if (prog->extflags & RXf_USE_INTUIT) {
2251 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252 if (!s) {
2253 goto phooey;
2254 }
2255 }
2256 else {
2257 s++;
2258 }
2259 }
2260 } /* end search for check string in latin*/
2261 } /* end search for check string */
2262 else { /* search for newline */
2263 if (s > startpos) {
2264 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2265 s--;
92f3d482 2266 }
21eede78
YO
2267 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2269 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2270 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2271 goto got_it;
2272 }
92f3d482
YO
2273 }
2274 } /* end search for newline */
2275 } /* end anchored/multiline check string search */
6eb5f6b9 2276 goto phooey;
bbe252da 2277 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2278 {
486ec47a 2279 /* the warning about reginfo.ganch being used without initialization
bbe252da 2280 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2281 and we only enter this block when the same bit is set. */
58e23c8d 2282 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2283
2284 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2285 goto got_it;
2286 goto phooey;
2287 }
2288
2289 /* Messy cases: unanchored match. */
bbe252da 2290 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2291 /* we have /x+whatever/ */
f2ed9b32 2292 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2293 char ch;
bf93d4cc
GS
2294#ifdef DEBUGGING
2295 int did_match = 0;
2296#endif
f2ed9b32
KW
2297 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2300
f2ed9b32 2301 if (utf8_target) {
4cadc6a9 2302 REXEC_FBC_SCAN(
6eb5f6b9 2303 if (*s == ch) {
a3621e74 2304 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2305 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2306 s += UTF8SKIP(s);
2307 while (s < strend && *s == ch)
2308 s += UTF8SKIP(s);
2309 }
4cadc6a9 2310 );
6eb5f6b9
JH
2311 }
2312 else {
4cadc6a9 2313 REXEC_FBC_SCAN(
6eb5f6b9 2314 if (*s == ch) {
a3621e74 2315 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2316 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2317 s++;
2318 while (s < strend && *s == ch)
2319 s++;
2320 }
4cadc6a9 2321 );
6eb5f6b9 2322 }
a3621e74 2323 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2324 PerlIO_printf(Perl_debug_log,
b7953727
JH
2325 "Did not find anchored character...\n")
2326 );
6eb5f6b9 2327 }
a0714e2c
SS
2328 else if (prog->anchored_substr != NULL
2329 || prog->anchored_utf8 != NULL
2330 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2331 && prog->float_max_offset < strend - s)) {
2332 SV *must;
2333 I32 back_max;
2334 I32 back_min;
2335 char *last;
6eb5f6b9 2336 char *last1; /* Last position checked before */
bf93d4cc
GS
2337#ifdef DEBUGGING
2338 int did_match = 0;
2339#endif
33b8afdf 2340 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2341 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2344 back_max = back_min = prog->anchored_offset;
2345 } else {
f2ed9b32
KW
2346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2349 back_max = prog->float_max_offset;
2350 back_min = prog->float_min_offset;
2351 }
1de06328
YO
2352
2353
33b8afdf
JH
2354 if (must == &PL_sv_undef)
2355 /* could not downgrade utf8 check substring, so must fail */
2356 goto phooey;
2357
1de06328
YO
2358 if (back_min<0) {
2359 last = strend;
2360 } else {
2361 last = HOP3c(strend, /* Cannot start after this */
2362 -(I32)(CHR_SVLEN(must)
2363 - (SvTAIL(must) != 0) + back_min), strbeg);
2364 }
6eb5f6b9
JH
2365 if (s > PL_bostr)
2366 last1 = HOPc(s, -1);
2367 else
2368 last1 = s - 1; /* bogus */
2369
a0288114 2370 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2371 check_substr==must. */
2372 scream_pos = -1;
2373 dontbother = end_shift;
2374 strend = HOPc(strend, -dontbother);
2375 while ( (s <= last) &&
c33e64f0 2376 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2377 (unsigned char*)strend, must,
c33e64f0 2378 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2379 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2380 if (HOPc(s, -back_max) > last1) {
2381 last1 = HOPc(s, -back_min);
2382 s = HOPc(s, -back_max);
2383 }
2384 else {
52657f30 2385 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2386
2387 last1 = HOPc(s, -back_min);
52657f30 2388 s = t;
6eb5f6b9 2389 }
f2ed9b32 2390 if (utf8_target) {
6eb5f6b9 2391 while (s <= last1) {
24b23f37 2392 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2393 goto got_it;
2394 s += UTF8SKIP(s);
2395 }
2396 }
2397 else {
2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s++;
2402 }
2403 }
2404 }
ab3bbdeb 2405 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2406 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2407 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2409 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2410 ? "anchored" : "floating"),
ab3bbdeb
YO
2411 quoted, RE_SV_TAIL(must));
2412 });
6eb5f6b9
JH
2413 goto phooey;
2414 }
f8fc2ecf 2415 else if ( (c = progi->regstclass) ) {
f14c76ed 2416 if (minlen) {
f8fc2ecf 2417 const OPCODE op = OP(progi->regstclass);
66e933ab 2418 /* don't bother with what can't match */
786e8c11 2419 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2420 strend = HOPc(strend, -(minlen - 1));
2421 }
a3621e74 2422 DEBUG_EXECUTE_r({
be8e71aa 2423 SV * const prop = sv_newmortal();
32fc9b6a 2424 regprop(prog, prop, c);
0df25f3d 2425 {
f2ed9b32 2426 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2427 s,strend-s,60);
0df25f3d 2428 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2429 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2430 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2431 quoted, (int)(strend - s));
0df25f3d 2432 }
ffc61ed2 2433 });
3b0527fe 2434 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2435 goto got_it;
07be1b83 2436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2437 }
2438 else {
2439 dontbother = 0;
a0714e2c 2440 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2441 /* Trim the end. */
6af40bd7 2442 char *last= NULL;
33b8afdf 2443 SV* float_real;
c33e64f0
FC
2444 STRLEN len;
2445 const char *little;
33b8afdf 2446
f2ed9b32
KW
2447 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2450
c33e64f0
FC
2451 little = SvPV_const(float_real, len);
2452 if (SvTAIL(float_real)) {
1a13b075
YO
2453 /* This means that float_real contains an artificial \n on the end
2454 * due to the presence of something like this: /foo$/
2455 * where we can match both "foo" and "foo\n" at the end of the string.
2456 * So we have to compare the end of the string first against the float_real
2457 * without the \n and then against the full float_real with the string.
2458 * We have to watch out for cases where the string might be smaller
2459 * than the float_real or the float_real without the \n.
2460 */
2461 char *checkpos= strend - len;
2462 DEBUG_OPTIMISE_r(
2463 PerlIO_printf(Perl_debug_log,
2464 "%sChecking for float_real.%s\n",
2465 PL_colors[4], PL_colors[5]));
2466 if (checkpos + 1 < strbeg) {
2467 /* can't match, even if we remove the trailing \n string is too short to match */
2468 DEBUG_EXECUTE_r(
2469 PerlIO_printf(Perl_debug_log,
2470 "%sString shorter than required trailing substring, cannot match.%s\n",
2471 PL_colors[4], PL_colors[5]));
2472 goto phooey;
2473 } else if (memEQ(checkpos + 1, little, len - 1)) {
2474 /* can match, the end of the string matches without the "\n" */
2475 last = checkpos + 1;
2476 } else if (checkpos < strbeg) {
2477 /* cant match, string is too short when the "\n" is included */
2478 DEBUG_EXECUTE_r(
2479 PerlIO_printf(Perl_debug_log,
2480 "%sString does not contain required trailing substring, cannot match.%s\n",
2481 PL_colors[4], PL_colors[5]));
2482 goto phooey;
2483 } else if (!multiline) {
2484 /* non multiline match, so compare with the "\n" at the end of the string */
2485 if (memEQ(checkpos, little, len)) {
2486 last= checkpos;
2487 } else {
2488 DEBUG_EXECUTE_r(
2489 PerlIO_printf(Perl_debug_log,
2490 "%sString does not contain required trailing substring, cannot match.%s\n",
2491 PL_colors[4], PL_colors[5]));
2492 goto phooey;
2493 }
2494 } else {
2495 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2496 goto find_last;
1a13b075 2497 }
c33e64f0 2498 } else {
d6a28714 2499 find_last:
9041c2e3 2500 if (len)
d6a28714 2501 last = rninstr(s, strend, little, little + len);
b8c5462f 2502 else
a0288114 2503 last = strend; /* matching "$" */
b8c5462f 2504 }
6af40bd7
YO
2505 if (!last) {
2506 /* at one point this block contained a comment which was probably
2507 * incorrect, which said that this was a "should not happen" case.
2508 * Even if it was true when it was written I am pretty sure it is
2509 * not anymore, so I have removed the comment and replaced it with
2510 * this one. Yves */
6bda09f9
YO
2511 DEBUG_EXECUTE_r(
2512 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2513 "String does not contain required substring, cannot match.\n"
2514 ));
2515 goto phooey;
bf93d4cc 2516 }
d6a28714
JH
2517 dontbother = strend - last + prog->float_min_offset;
2518 }
2519 if (minlen && (dontbother < minlen))
2520 dontbother = minlen - 1;
2521 strend -= dontbother; /* this one's always in bytes! */
2522 /* We don't know much -- general case. */
f2ed9b32 2523 if (utf8_target) {
d6a28714 2524 for (;;) {
24b23f37 2525 if (regtry(&reginfo, &s))
d6a28714
JH
2526 goto got_it;
2527 if (s >= strend)
2528 break;
b8c5462f 2529 s += UTF8SKIP(s);
d6a28714
JH
2530 };
2531 }
2532 else {
2533 do {
24b23f37 2534 if (regtry(&reginfo, &s))
d6a28714
JH
2535 goto got_it;
2536 } while (s++ < strend);
2537 }
2538 }
2539
2540 /* Failure. */
2541 goto phooey;
2542
2543got_it:
495f47a5
DM
2544 DEBUG_BUFFERS_r(
2545 if (swap)
2546 PerlIO_printf(Perl_debug_log,
2547 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548 PTR2UV(prog),
2549 PTR2UV(swap)
2550 );
2551 );
e9105d30 2552 Safefree(swap);
288b8c02 2553 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2554
ed301438 2555 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2556 restore_pos(aTHX_ prog);
5daac39c
NC
2557 if (RXp_PAREN_NAMES(prog))
2558 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2559
2560 /* make sure $`, $&, $', and $digit will work later */
2561 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2562 if (flags & REXEC_COPY_STR) {
f8c7b90f 2563#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2564 if ((SvIsCOW(sv)
2565 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2566 if (DEBUG_C_TEST) {
2567 PerlIO_printf(Perl_debug_log,
2568 "Copy on write: regexp capture, type %d\n",
2569 (int) SvTYPE(sv));
2570 }
77f8f7c1 2571 RX_MATCH_COPY_FREE(rx);
ed252734 2572 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2573 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2574 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2575 prog->sublen = PL_regeol - strbeg;
2576 prog->suboffset = 0;
2577 prog->subcoffset = 0;
ed252734
NC
2578 } else
2579#endif
2580 {
6502e081
DM
2581 I32 min = 0;
2582 I32 max = PL_regeol - strbeg;
2583 I32 sublen;
2584
2585 if ( (flags & REXEC_COPY_SKIP_POST)
2586 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2587 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2588 ) { /* don't copy $' part of string */
3de645a8 2589 U32 n = 0;
6502e081
DM
2590 max = -1;
2591 /* calculate the right-most part of the string covered
2592 * by a capture. Due to look-ahead, this may be to
2593 * the right of $&, so we have to scan all captures */
2594 while (n <= prog->lastparen) {
2595 if (prog->offs[n].end > max)
2596 max = prog->offs[n].end;
2597 n++;
2598 }
2599 if (max == -1)
2600 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2601 ? prog->offs[0].start
2602 : 0;
2603 assert(max >= 0 && max <= PL_regeol - strbeg);
2604 }
2605
2606 if ( (flags & REXEC_COPY_SKIP_PRE)
2607 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2608 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2609 ) { /* don't copy $` part of string */
3de645a8 2610 U32 n = 0;
6502e081
DM
2611 min = max;
2612 /* calculate the left-most part of the string covered
2613 * by a capture. Due to look-behind, this may be to
2614 * the left of $&, so we have to scan all captures */
2615 while (min && n <= prog->lastparen) {
2616 if ( prog->offs[n].start != -1
2617 && prog->offs[n].start < min)
2618 {
2619 min = prog->offs[n].start;
2620 }
2621 n++;
2622 }
2623 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2624 && min > prog->offs[0].end
2625 )
2626 min = prog->offs[0].end;
2627
2628 }
2629
2630 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2631 sublen = max - min;
2632
2633 if (RX_MATCH_COPIED(rx)) {
2634 if (sublen > prog->sublen)
2635 prog->subbeg =
2636 (char*)saferealloc(prog->subbeg, sublen+1);
2637 }
2638 else
2639 prog->subbeg = (char*)safemalloc(sublen+1);
2640 Copy(strbeg + min, prog->subbeg, sublen, char);
2641 prog->subbeg[sublen] = '\0';
2642 prog->suboffset = min;
2643 prog->sublen = sublen;
77f8f7c1 2644 RX_MATCH_COPIED_on(rx);
6502e081 2645 }
6502e081
DM
2646 prog->subcoffset = prog->suboffset;
2647 if (prog->suboffset && utf8_target) {
2648 /* Convert byte offset to chars.
2649 * XXX ideally should only compute this if @-/@+
2650 * has been seen, a la PL_sawampersand ??? */
2651
2652 /* If there's a direct correspondence between the
2653 * string which we're matching and the original SV,
2654 * then we can use the utf8 len cache associated with
2655 * the SV. In particular, it means that under //g,
2656 * sv_pos_b2u() will use the previously cached
2657 * position to speed up working out the new length of
2658 * subcoffset, rather than counting from the start of
2659 * the string each time. This stops
2660 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2661 * from going quadratic */
2662 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2663 sv_pos_b2u(sv, &(prog->subcoffset));
2664 else
2665 prog->subcoffset = utf8_length((U8*)strbeg,
2666 (U8*)(strbeg+prog->suboffset));
2667 }
d6a28714
JH
2668 }
2669 else {
6502e081 2670 RX_MATCH_COPY_FREE(rx);
d6a28714 2671 prog->subbeg = strbeg;
6502e081
DM
2672 prog->suboffset = 0;
2673 prog->subcoffset = 0;
d6a28714
JH
2674 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2675 }
2676 }
9041c2e3 2677
d6a28714
JH
2678 return 1;
2679
2680phooey:
a3621e74 2681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2682 PL_colors[4], PL_colors[5]));
ed301438 2683 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2684 restore_pos(aTHX_ prog);
e9105d30 2685 if (swap) {
c74340f9 2686 /* we failed :-( roll it back */
495f47a5
DM
2687 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2688 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2689 PTR2UV(prog),
2690 PTR2UV(prog->offs),
2691 PTR2UV(swap)
2692 ));
e9105d30
GG
2693 Safefree(prog->offs);
2694 prog->offs = swap;
2695 }
2696
d6a28714
JH
2697 return 0;
2698}
2699
6bda09f9 2700
ec43f78b
DM
2701/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2702 * Do inc before dec, in case old and new rex are the same */
2703#define SET_reg_curpm(Re2) \
2704 if (PL_reg_state.re_state_eval_setup_done) { \
2705 (void)ReREFCNT_inc(Re2); \
2706 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2707 PM_SETRE((PL_reg_curpm), (Re2)); \
2708 }
2709
2710
d6a28714
JH
2711/*
2712 - regtry - try match at specific point
2713 */
2714STATIC I32 /* 0 failure, 1 success */
f73aaa43 2715S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2716{
97aff369 2717 dVAR;
d6a28714 2718 CHECKPOINT lastcp;
288b8c02
NC
2719 REGEXP *const rx = reginfo->prog;
2720 regexp *const prog = (struct regexp *)SvANY(rx);
f73aaa43 2721 I32 result;
f8fc2ecf 2722 RXi_GET_DECL(prog,progi);
a3621e74 2723 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2724
2725 PERL_ARGS_ASSERT_REGTRY;
2726
24b23f37 2727 reginfo->cutpoint=NULL;
d6a28714 2728
ed301438
DM
2729 if ((prog->extflags & RXf_EVAL_SEEN)
2730 && !PL_reg_state.re_state_eval_setup_done)
2731 {
d6a28714
JH
2732 MAGIC *mg;
2733
ed301438 2734 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2735 if (reginfo->sv) {
d6a28714 2736 /* Make $_ available to executed code. */
3b0527fe 2737 if (reginfo->sv != DEFSV) {
59f00321 2738 SAVE_DEFSV;
414bf5ae 2739 DEFSV_set(reginfo->sv);
b8c5462f 2740 }
d6a28714 2741
3b0527fe
DM
2742 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2743 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2744 /* prepare for quick setting of pos */
d300d9fa 2745#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2746 if (SvIsCOW(reginfo->sv))
2747 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2748#endif
3dab1dad 2749 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2750 &PL_vtbl_mglob, NULL, 0);
d6a28714 2751 mg->mg_len = -1;
b8c5462f 2752 }
d6a28714
JH
2753 PL_reg_magic = mg;
2754 PL_reg_oldpos = mg->mg_len;
4f639d21 2755 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2756 }
09687e5a 2757 if (!PL_reg_curpm) {
a02a5408 2758 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2759#ifdef USE_ITHREADS
2760 {
14a49a24 2761 SV* const repointer = &PL_sv_undef;
92313705
NC
2762 /* this regexp is also owned by the new PL_reg_curpm, which
2763 will try to free it. */
d2ece331 2764 av_push(PL_regex_padav, repointer);
09687e5a
AB
2765 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2766 PL_regex_pad = AvARRAY(PL_regex_padav);
2767 }
2768#endif
2769 }
ec43f78b 2770 SET_reg_curpm(rx);
d6a28714
JH
2771 PL_reg_oldcurpm = PL_curpm;
2772 PL_curpm = PL_reg_curpm;
07bc277f 2773 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2774 /* Here is a serious problem: we cannot rewrite subbeg,
2775 since it may be needed if this match fails. Thus
2776 $` inside (?{}) could fail... */
2777 PL_reg_oldsaved = prog->subbeg;
2778 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2779 PL_reg_oldsavedoffset = prog->suboffset;
2780 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2781#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2782 PL_nrs = prog->saved_copy;
2783#endif
07bc277f 2784 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2785 }
2786 else
bd61b366 2787 PL_reg_oldsaved = NULL;
d6a28714 2788 prog->subbeg = PL_bostr;
6502e081
DM
2789 prog->suboffset = 0;
2790 prog->subcoffset = 0;
d6a28714
JH
2791 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2792 }
97ca13b7 2793#ifdef DEBUGGING
f73aaa43 2794 PL_reg_starttry = *startposp;
97ca13b7 2795#endif
f73aaa43 2796 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2797 prog->lastparen = 0;
03994de8 2798 prog->lastcloseparen = 0;
d6a28714 2799 PL_regsize = 0;
d6a28714
JH
2800
2801 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2802 to do this again and again, prog->lastparen should take care of
3dd2943c 2803 this! --ilya*/
dafc8851
JH
2804
2805 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2806 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2807 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2808 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2809 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2810 * Meanwhile, this code *is* needed for the
daf18116
JH
2811 * above-mentioned test suite tests to succeed. The common theme
2812 * on those tests seems to be returning null fields from matches.
225593e1 2813 * --jhi updated by dapm */
dafc8851 2814#if 1
d6a28714 2815 if (prog->nparens) {
b93070ed 2816 regexp_paren_pair *pp = prog->offs;
eb578fdb 2817 I32 i;
b93070ed 2818 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2819 ++pp;
2820 pp->start = -1;
2821 pp->end = -1;
d6a28714
JH
2822 }
2823 }
dafc8851 2824#endif
02db2b7b 2825 REGCP_SET(lastcp);
f73aaa43
DM
2826 result = regmatch(reginfo, *startposp, progi->program + 1);
2827 if (result != -1) {
2828 prog->offs[0].end = result;
d6a28714
JH
2829 return 1;
2830 }
24b23f37 2831 if (reginfo->cutpoint)
f73aaa43 2832 *startposp= reginfo->cutpoint;
02db2b7b 2833 REGCP_UNWIND(lastcp);
d6a28714
JH
2834 return 0;
2835}
2836
02db2b7b 2837
8ba1375e
MJD
2838#define sayYES goto yes
2839#define sayNO goto no
262b90c4 2840#define sayNO_SILENT goto no_silent
8ba1375e 2841
f9f4320a
YO
2842/* we dont use STMT_START/END here because it leads to
2843 "unreachable code" warnings, which are bogus, but distracting. */
2844#define CACHEsayNO \
c476f425
DM
2845 if (ST.cache_mask) \
2846 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2847 sayNO
3298f257 2848
a3621e74 2849/* this is used to determine how far from the left messages like
265c4333
YO
2850 'failed...' are printed. It should be set such that messages
2851 are inline with the regop output that created them.
a3621e74 2852*/
265c4333 2853#define REPORT_CODE_OFF 32
a3621e74
YO
2854
2855
40a82448
DM
2856#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2857#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2858
86545054
DM
2859#define SLAB_FIRST(s) (&(s)->states[0])
2860#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2861
5d9a96ca
DM
2862/* grab a new slab and return the first slot in it */
2863
2864STATIC regmatch_state *
2865S_push_slab(pTHX)
2866{
a35a87e7 2867#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2868 dMY_CXT;
2869#endif
5d9a96ca
DM
2870 regmatch_slab *s = PL_regmatch_slab->next;
2871 if (!s) {
2872 Newx(s, 1, regmatch_slab);
2873 s->prev = PL_regmatch_slab;
2874 s->next = NULL;
2875 PL_regmatch_slab->next = s;
2876 }
2877 PL_regmatch_slab = s;
86545054 2878 return SLAB_FIRST(s);
5d9a96ca 2879}
5b47454d 2880
95b24440 2881
40a82448
DM
2882/* push a new state then goto it */
2883
4d5016e5
DM
2884#define PUSH_STATE_GOTO(state, node, input) \
2885 pushinput = input; \
40a82448
DM
2886 scan = node; \
2887 st->resume_state = state; \
2888 goto push_state;
2889
2890/* push a new state with success backtracking, then goto it */
2891
4d5016e5
DM
2892#define PUSH_YES_STATE_GOTO(state, node, input) \
2893 pushinput = input; \
40a82448
DM
2894 scan = node; \
2895 st->resume_state = state; \
2896 goto push_yes_state;
2897
aa283a38 2898
aa283a38 2899
4d5016e5 2900
d6a28714 2901/*
95b24440 2902
bf1f174e
DM
2903regmatch() - main matching routine
2904
2905This is basically one big switch statement in a loop. We execute an op,
2906set 'next' to point the next op, and continue. If we come to a point which
2907we may need to backtrack to on failure such as (A|B|C), we push a
2908backtrack state onto the backtrack stack. On failure, we pop the top
2909state, and re-enter the loop at the state indicated. If there are no more
2910states to pop, we return failure.
2911
2912Sometimes we also need to backtrack on success; for example /A+/, where
2913after successfully matching one A, we need to go back and try to
2914match another one; similarly for lookahead assertions: if the assertion
2915completes successfully, we backtrack to the state just before the assertion
2916and then carry on. In these cases, the pushed state is marked as
2917'backtrack on success too'. This marking is in fact done by a chain of
2918pointers, each pointing to the previous 'yes' state. On success, we pop to
2919the nearest yes state, discarding any intermediate failure-only states.
2920Sometimes a yes state is pushed just to force some cleanup code to be
2921called at the end of a successful match or submatch; e.g. (??{$re}) uses
2922it to free the inner regex.
2923
2924Note that failure backtracking rewinds the cursor position, while
2925success backtracking leaves it alone.
2926
2927A pattern is complete when the END op is executed, while a subpattern
2928such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2929ops trigger the "pop to last yes state if any, otherwise return true"
2930behaviour.
2931
2932A common convention in this function is to use A and B to refer to the two
2933subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2934the subpattern to be matched possibly multiple times, while B is the entire
2935rest of the pattern. Variable and state names reflect this convention.
2936
2937The states in the main switch are the union of ops and failure/success of
2938substates associated with with that op. For example, IFMATCH is the op
2939that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2940'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2941successfully matched A and IFMATCH_A_fail is a state saying that we have
2942just failed to match A. Resume states always come in pairs. The backtrack
2943state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2944at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2945on success or failure.
2946
2947The struct that holds a backtracking state is actually a big union, with
2948one variant for each major type of op. The variable st points to the
2949top-most backtrack struct. To make the code clearer, within each
2950block of code we #define ST to alias the relevant union.
2951
2952Here's a concrete example of a (vastly oversimplified) IFMATCH
2953implementation:
2954
2955 switch (state) {
2956 ....
2957
2958#define ST st->u.ifmatch
2959
2960 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2961 ST.foo = ...; // some state we wish to save
95b24440 2962 ...
bf1f174e
DM
2963 // push a yes backtrack state with a resume value of
2964 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2965 // first node of A:
4d5016e5 2966 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
2967 // NOTREACHED
2968
2969 case IFMATCH_A: // we have successfully executed A; now continue with B
2970 next = B;
2971 bar = ST.foo; // do something with the preserved value
2972 break;
2973
2974 case IFMATCH_A_fail: // A failed, so the assertion failed
2975 ...; // do some housekeeping, then ...
2976 sayNO; // propagate the failure
2977
2978#undef ST
95b24440 2979
bf1f174e
DM
2980 ...
2981 }
95b24440 2982
bf1f174e
DM
2983For any old-timers reading this who are familiar with the old recursive
2984approach, the code above is equivalent to:
95b24440 2985
bf1f174e
DM
2986 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2987 {
2988 int foo = ...
95b24440 2989 ...
bf1f174e
DM
2990 if (regmatch(A)) {
2991 next = B;
2992 bar = foo;
2993 break;
95b24440 2994 }
bf1f174e
DM
2995 ...; // do some housekeeping, then ...
2996 sayNO; // propagate the failure
95b24440 2997 }
bf1f174e
DM
2998
2999The topmost backtrack state, pointed to by st, is usually free. If you
3000want to claim it, populate any ST.foo fields in it with values you wish to
3001save, then do one of
3002
4d5016e5
DM
3003 PUSH_STATE_GOTO(resume_state, node, newinput);
3004 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3005
3006which sets that backtrack state's resume value to 'resume_state', pushes a
3007new free entry to the top of the backtrack stack, then goes to 'node'.
3008On backtracking, the free slot is popped, and the saved state becomes the
3009new free state. An ST.foo field in this new top state can be temporarily
3010accessed to retrieve values, but once the main loop is re-entered, it
3011becomes available for reuse.
3012
3013Note that the depth of the backtrack stack constantly increases during the
3014left-to-right execution of the pattern, rather than going up and down with
3015the pattern nesting. For example the stack is at its maximum at Z at the
3016end of the pattern, rather than at X in the following:
3017
3018 /(((X)+)+)+....(Y)+....Z/
3019
3020The only exceptions to this are lookahead/behind assertions and the cut,
3021(?>A), which pop all the backtrack states associated with A before
3022continuing.
3023
486ec47a 3024Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3025PL_regmatch_state and st always point to the currently active state,
3026and PL_regmatch_slab points to the slab currently containing
3027PL_regmatch_state. The first time regmatch() is called, the first slab is
3028allocated, and is never freed until interpreter destruction. When the slab
3029is full, a new one is allocated and chained to the end. At exit from
3030regmatch(), slabs allocated since entry are freed.
3031
3032*/
95b24440 3033
40a82448 3034
5bc10b2c 3035#define DEBUG_STATE_pp(pp) \
265c4333 3036 DEBUG_STATE_r({ \
f2ed9b32 3037 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3038 PerlIO_printf(Perl_debug_log, \
5d458dd8 3039 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3040 depth*2, "", \
13d6edb4 3041 PL_reg_name[st->resume_state], \
5d458dd8
YO
3042 ((st==yes_state||st==mark_state) ? "[" : ""), \
3043 ((st==yes_state) ? "Y" : ""), \
3044 ((st==mark_state) ? "M" : ""), \
3045 ((st==yes_state||st==mark_state) ? "]" : "") \
3046 ); \
265c4333 3047 });
5bc10b2c 3048
40a82448 3049
3dab1dad 3050#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3051
3df15adc 3052#ifdef DEBUGGING
5bc10b2c 3053
ab3bbdeb 3054STATIC void
f2ed9b32 3055S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3056 const char *start, const char *end, const char *blurb)
3057{
efd26800 3058 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3059
3060 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3061
ab3bbdeb
YO
3062 if (!PL_colorset)
3063 reginitcolors();
3064 {
3065 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3066 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3067
f2ed9b32 3068 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3069 start, end - start, 60);
3070
3071 PerlIO_printf(Perl_debug_log,
3072 "%s%s REx%s %s against %s\n",
3073 PL_colors[4], blurb, PL_colors[5], s0, s1);
3074
f2ed9b32 3075 if (utf8_target||utf8_pat)
1de06328
YO
3076 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3077 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3078 utf8_pat && utf8_target ? " and " : "",
3079 utf8_target ? "string" : ""
ab3bbdeb
YO
3080 );
3081 }
3082}
3df15adc
YO
3083
3084STATIC void
786e8c11
YO
3085S_dump_exec_pos(pTHX_ const char *locinput,
3086 const regnode *scan,
3087 const char *loc_regeol,
3088 const char *loc_bostr,
3089 const char *loc_reg_starttry,
f2ed9b32 3090 const bool utf8_target)
07be1b83 3091{
786e8c11 3092 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3093 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3094 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3095 /* The part of the string before starttry has one color
3096 (pref0_len chars), between starttry and current
3097 position another one (pref_len - pref0_len chars),
3098 after the current position the third one.
3099 We assume that pref0_len <= pref_len, otherwise we
3100 decrease pref0_len. */
786e8c11
YO
3101 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3102 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3103 int pref0_len;
3104
7918f24d
NC
3105 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3106
f2ed9b32 3107 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3108 pref_len++;
786e8c11
YO
3109 pref0_len = pref_len - (locinput - loc_reg_starttry);
3110 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3111 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3112 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3113 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3114 l--;
3115 if (pref0_len < 0)
3116 pref0_len = 0;
3117 if (pref0_len > pref_len)
3118 pref0_len = pref_len;
3119 {
f2ed9b32 3120 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3121
ab3bbdeb 3122 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3123 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3124
ab3bbdeb 3125 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3126 (locinput - pref_len + pref0_len),
1de06328 3127 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3128
ab3bbdeb 3129 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3130 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3131
1de06328 3132 const STRLEN tlen=len0+len1+len2;
3df15adc 3133 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3134 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3135 (IV)(locinput - loc_bostr),
07be1b83 3136 len0, s0,
07be1b83 3137 len1, s1,
07be1b83 3138 (docolor ? "" : "> <"),
07be1b83 3139 len2, s2,
f9f4320a 3140 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3141 "");
3142 }
3143}
3df15adc 3144
07be1b83
YO
3145#endif
3146
0a4db386
YO
3147/* reg_check_named_buff_matched()
3148 * Checks to see if a named buffer has matched. The data array of
3149 * buffer numbers corresponding to the buffer is expected to reside
3150 * in the regexp->data->data array in the slot stored in the ARG() of
3151 * node involved. Note that this routine doesn't actually care about the
3152 * name, that information is not preserved from compilation to execution.
3153 * Returns the index of the leftmost defined buffer with the given name
3154 * or 0 if non of the buffers matched.
3155 */
3156STATIC I32
7918f24d
NC
3157S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3158{
0a4db386 3159 I32 n;
f8fc2ecf 3160 RXi_GET_DECL(rex,rexi);
ad64d0ec 3161 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3162 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3163
3164 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3165
0a4db386 3166 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3167 if ((I32)rex->lastparen >= nums[n] &&
3168 rex->offs[nums[n]].end != -1)
0a4db386
YO
3169 {
3170 return nums[n];
3171 }
3172 }
3173 return 0;
3174}
3175
2f554ef7
DM
3176
3177/* free all slabs above current one - called during LEAVE_SCOPE */
3178
3179STATIC void
3180S_clear_backtrack_stack(pTHX_ void *p)
3181{
3182 regmatch_slab *s = PL_regmatch_slab->next;
3183 PERL_UNUSED_ARG(p);
3184
3185 if (!s)
3186 return;
3187 PL_regmatch_slab->next = NULL;
3188 while (s) {
3189 regmatch_slab * const osl = s;
3190 s = s->next;
3191 Safefree(osl);
3192 }
3193}
3194
3195
f73aaa43
DM
3196/* returns -1 on failure, $+[0] on success */
3197STATIC I32
3198S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3199{
a35a87e7 3200#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3201 dMY_CXT;
3202#endif
27da23d5 3203 dVAR;
eb578fdb 3204 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3205 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3206 REGEXP *rex_sv = reginfo->prog;
3207 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3208 RXi_GET_DECL(rex,rexi);
2f554ef7 3209 I32 oldsave;
5d9a96ca 3210 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3211 regmatch_state *st;
5d9a96ca 3212 /* cache heavy used fields of st in registers */
eb578fdb
KW
3213 regnode *scan;
3214 regnode *next;
3215 U32 n = 0; /* general value; init to avoid compiler warning */
3216 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3217 char *locinput = startpos;
4d5016e5 3218 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3219 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3220
b69b0499 3221 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3222 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3223 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3224 const U32 max_nochange_depth =
3225 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3226 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3227 regmatch_state *yes_state = NULL; /* state to pop to on success of
3228 subpattern */
e2e6a0f1
YO
3229 /* mark_state piggy backs on the yes_state logic so that when we unwind
3230 the stack on success we can update the mark_state as we go */
3231 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3232 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3233 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3234 U32 state_num;
5d458dd8
YO
3235 bool no_final = 0; /* prevent failure from backtracking? */
3236 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3237 char *startpoint = locinput;
5d458dd8
YO
3238 SV *popmark = NULL; /* are we looking for a mark? */
3239 SV *sv_commit = NULL; /* last mark name seen in failure */
3240 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3241 during a successful match */
5d458dd8
YO
3242 U32 lastopen = 0; /* last open we saw */
3243 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3244 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3245 /* these three flags are set by various ops to signal information to
3246 * the very next op. They have a useful lifetime of exactly one loop
3247 * iteration, and are not preserved or restored by state pushes/pops
3248 */
3249 bool sw = 0; /* the condition value in (?(cond)a|b) */
3250 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3251 int logical = 0; /* the following EVAL is:
3252 0: (?{...})
3253 1: (?(?{...})X|Y)
3254 2: (??{...})
3255 or the following IFMATCH/UNLESSM is:
3256 false: plain (?=foo)
3257 true: used as a condition: (?(?=foo))
3258 */
81ed78b2
DM
3259 PAD* last_pad = NULL;
3260 dMULTICALL;
3261 I32 gimme = G_SCALAR;
3262 CV *caller_cv = NULL; /* who called us */
3263 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3264 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3265
95b24440 3266#ifdef DEBUGGING
e68ec53f 3267 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3268#endif
3269
81ed78b2
DM
3270 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3271 multicall_oldcatch = 0;
3272 multicall_cv = NULL;
3273 cx = NULL;
4f8dbb2d
JL
3274 PERL_UNUSED_VAR(multicall_cop);
3275 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3276
3277
7918f24d
NC
3278 PERL_ARGS_ASSERT_REGMATCH;
3279
3b57cd43 3280 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3281 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3282 }));
5d9a96ca
DM
3283 /* on first ever call to regmatch, allocate first slab */
3284 if (!PL_regmatch_slab) {
3285 Newx(PL_regmatch_slab, 1, regmatch_slab);
3286 PL_regmatch_slab->prev = NULL;
3287 PL_regmatch_slab->next = NULL;
86545054 3288 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3289 }
3290
2f554ef7
DM
3291 oldsave = PL_savestack_ix;
3292 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3293 SAVEVPTR(PL_regmatch_slab);
3294 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3295
3296 /* grab next free state slot */
3297 st = ++PL_regmatch_state;
86545054 3298 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3299 st = PL_regmatch_state = S_push_slab(aTHX);
3300
d6a28714
JH
3301 /* Note that nextchr is a byte even in UTF */
3302 nextchr = UCHARAT(locinput);
3303 scan = prog;
3304 while (scan != NULL) {
8ba1375e 3305
a3621e74 3306 DEBUG_EXECUTE_r( {
6136c704 3307 SV * const prop = sv_newmortal();
1de06328 3308 regnode *rnext=regnext(scan);
f2ed9b32 3309 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3310 regprop(rex, prop, scan);
07be1b83
YO
3311
3312 PerlIO_printf(Perl_debug_log,
3313 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3314 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3315 SvPVX_const(prop),
1de06328 3316 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3317 0 : (IV)(rnext - rexi->program));
2a782b5b 3318 });
d6a28714
JH
3319
3320 next = scan + NEXT_OFF(scan);
3321 if (next == scan)
3322 next = NULL;
40a82448 3323 state_num = OP(scan);
d6a28714 3324
40a82448 3325 reenter_switch:
34a81e2b 3326
40a82448 3327 switch (state_num) {
d6a28714 3328 case BOL:
7fba1cd6 3329 if (locinput == PL_bostr)
d6a28714 3330 {
3b0527fe 3331 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3332 break;
3333 }
d6a28714
JH
3334 sayNO;
3335 case MBOL:
12d33761
HS
3336 if (locinput == PL_bostr ||
3337 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3338 {
b8c5462f
JH
3339 break;
3340 }
d6a28714
JH
3341 sayNO;
3342 case SBOL:
c2a73568 3343 if (locinput == PL_bostr)
b8c5462f 3344 break;
d6a28714
JH
3345 sayNO;
3346 case GPOS:
3b0527fe 3347 if (locinput == reginfo->ganch)
d6a28714
JH
3348 break;
3349 sayNO;
ee9b8eae
YO
3350
3351 case KEEPS:
3352 /* update the startpoint */
b93070ed 3353 st->u.keeper.val = rex->offs[0].start;
b93070ed 3354 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3355 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
ee9b8eae
YO
3356 /*NOT-REACHED*/
3357 case KEEPS_next_fail:
3358 /* rollback the start point change */
b93070ed 3359 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3360 sayNO_SILENT;
3361 /*NOT-REACHED*/
d6a28714 3362 case EOL:
d6a28714
JH
3363 goto seol;
3364 case MEOL:
d6a28714 3365 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3366 sayNO;
b8c5462f 3367 break;
d6a28714
JH
3368 case SEOL:
3369 seol:
3370 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3371 sayNO;
d6a28714 3372 if (PL_regeol - locinput > 1)
b8c5462f 3373 sayNO;
b8c5462f 3374 break;
d6a28714
JH
3375 case EOS:
3376 if (PL_regeol != locinput)
b8c5462f 3377 sayNO;
d6a28714 3378 break;
ffc61ed2 3379 case SANY:
d6a28714 3380 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3381 sayNO;
f2ed9b32 3382 if (utf8_target) {
f33976b4
DB
3383 locinput += PL_utf8skip[nextchr];
3384 if (locinput > PL_regeol)
3385 sayNO;
3386 nextchr = UCHARAT(locinput);
3387 }
3388 else
3389 nextchr = UCHARAT(++locinput);
3390 break;
3391 case CANY:
3392 if (!nextchr && locinput >= PL_regeol)
3393 sayNO;
b8c5462f 3394 nextchr = UCHARAT(++locinput);
a0d0e21e 3395 break;
ffc61ed2 3396 case REG_ANY:
1aa99e6b
IH
3397 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3398 sayNO;
f2ed9b32 3399 if (utf8_target) {
b8c5462f 3400 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3401 if (locinput > PL_regeol)
3402 sayNO;
a0ed51b3 3403 nextchr = UCHARAT(locinput);
a0ed51b3 3404 }
1aa99e6b
IH
3405 else
3406 nextchr = UCHARAT(++locinput);
a0ed51b3 3407 break;
166ba7cd
DM
3408
3409#undef ST
3410#define ST st->u.trie
786e8c11
YO
3411 case TRIEC:
3412 /* In this case the charclass data is available inline so
3413 we can fail fast without a lot of extra overhead.
3414 */
b9b31e9d 3415 if(!ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3416 DEBUG_EXECUTE_r(
3417 PerlIO_printf(Perl_debug_log,
3418 "%*s %sfailed to match trie start class...%s\n",
3419 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3420 );
3421 sayNO_SILENT;
118e2215 3422 assert(0); /* NOTREACHED */
786e8c11
YO
3423 }
3424 /* FALL THROUGH */
5b47454d 3425 case TRIE: