This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
set PL_reg_starttry correctly
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
7e2509c1
KW
94/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
95 * call if there are no complications: i.e., if everything matchable is
96 * straight forward in the bitmap */
af364d03
KW
97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
98 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 99
c277df42
IZ
100/*
101 * Forwards.
102 */
103
f2ed9b32 104#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 105#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 106
3dab1dad
YO
107#define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
52657f30 109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
110 : (U8*)(pos + off))
111#define HOPBACKc(pos, off) \
07be1b83
YO
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
8e11feef 115 ? (U8*)pos - off \
3dab1dad 116 : NULL)
efb30f32 117
e7409c1b 118#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 119#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 120
20d0b1e9 121/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
122#ifdef EBCDIC
123 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
124 * skip the check on EBCDIC platforms */
125# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
126#else
127# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
128 if (!CAT2(PL_utf8_,class)) { \
129 bool ok; \
130 ENTER; save_re_context(); \
131 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 133#endif
37e2e78e
KW
134
135/* Doesn't do an assert to verify that is correct */
136#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 137 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 138 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
139 ENTER; save_re_context(); \
140 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 LEAVE; } } STMT_END
37e2e78e 142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
144#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
145#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 146
37e2e78e 147#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
148 LOAD_UTF8_CHARCLASS(X_begin, " "); \
149 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
150 /* These are utf8 constants, and not utf-ebcdic constants, so the \
151 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 152 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
153 \
154 /* No asserts are done for these, in case called on an early \
155 * Unicode version in which they map to nothing */ \
d275fa5e
CB
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 163
1dcf4a1b 164#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 165
ee9a90b8
KW
166/* The actual code for CCC_TRY, which uses several variables from the routine
167 * it's callable from. It is designed to be the bulk of a case statement.
168 * FUNC is the macro or function to call on non-utf8 targets that indicate if
169 * nextchr matches the class.
170 * UTF8_TEST is the whole test string to use for utf8 targets
171 * LOAD is what to use to test, and if not present to load in the swash for the
172 * class
173 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
174 * UTF8_TEST test.
175 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
176 * utf8 and a variant, load the swash if necessary and test using the utf8
177 * test. Advance to the next character if test is ok, otherwise fail; If not
178 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
179 * fails, or advance to the next character */
180
181#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
182 if (locinput >= PL_regeol) { \
183 sayNO; \
184 } \
185 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
186 LOAD_UTF8_CHARCLASS(CLASS, STR); \
187 if (POS_OR_NEG (UTF8_TEST)) { \
188 sayNO; \
189 } \
190 locinput += PL_utf8skip[nextchr]; \
191 nextchr = UCHARAT(locinput); \
192 break; \
193 } \
194 if (POS_OR_NEG (FUNC(nextchr))) { \
195 sayNO; \
196 } \
197 nextchr = UCHARAT(++locinput); \
980866de
KW
198 break;
199
ee9a90b8
KW
200/* Handle the non-locale cases for a character class and its complement. It
201 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
202 * This is because that code fails when the test succeeds, so we want to have
203 * the test fail so that the code succeeds. The swash is stored in a
204 * predictable PL_ place */
cfaf538b
KW
205#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
206 CLASS, STR) \
ee9a90b8
KW
207 case NAME: \
208 _CCC_TRY_CODE( !, FUNC, \
209 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
210 (U8*)locinput, TRUE)), \
211 CLASS, STR) \
212 case NNAME: \
1dcf4a1b 213 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
214 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
215 (U8*)locinput, TRUE)), \
216 CLASS, STR) \
217
218/* Generate the case statements for both locale and non-locale character
219 * classes in regmatch for classes that don't have special unicode semantics.
220 * Locales don't use an immediate swash, but an intermediary special locale
221 * function that is called on the pointer to the current place in the input
222 * string. That function will resolve to needing the same swash. One might
223 * think that because we don't know what the locale will match, we shouldn't
224 * check with the swash loading function that it loaded properly; ie, that we
225 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
226 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 * irrelevant here */
228#define CCC_TRY(NAME, NNAME, FUNC, \
229 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 230 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
231 CLASS, STR) \
232 case NAMEL: \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 case NNAMEL: \
236 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
237 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
238 CLASS, STR) \
cfaf538b
KW
239 case NAMEA: \
240 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
244 nextchr = UCHARAT(++locinput); \
245 break; \
246 case NNAMEA: \
247 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 sayNO; \
249 } \
250 if (utf8_target) { \
251 locinput += PL_utf8skip[nextchr]; \
252 nextchr = UCHARAT(locinput); \
253 } \
254 else { \
255 nextchr = UCHARAT(++locinput); \
256 } \
257 break; \
ee9a90b8
KW
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260
261/* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263#define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
cfaf538b 266 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 267 CLASS, STR) \
cfaf538b
KW
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
271 CLASS, STR) \
ee9a90b8 272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 273
3dab1dad
YO
274/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275
5f80c4cf 276/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
277/* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 *
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
285*/
3e901dc0
YO
286#define JUMPABLE(rn) ( \
287 OP(rn) == OPEN || \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == EVAL || \
cca55fe3
JP
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 292 OP(rn) == KEEPS || \
3dab1dad 293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 294)
ee9b8eae 295#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 296
ee9b8eae
YO
297#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298
299#if 0
300/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 303#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305
306#else
307/* ... so we use this as its faster. */
308#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 309#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
310#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
312
313#endif
e2d8ce26 314
a84d97b6
HS
315/*
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318*/
cca55fe3 319#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 323 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 324 else if (type == PLUS) \
cca55fe3 325 rn = NEXTOPER(rn); \
3dab1dad 326 else if (type == IFMATCH) \
a84d97b6 327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 328 else rn += NEXT_OFF(rn); \
3dab1dad 329 } \
5f80c4cf 330} STMT_END
74750237 331
c476f425 332
acfe0abc 333static void restore_pos(pTHX_ void *arg);
51371543 334
620d5b66 335#define REGCP_PAREN_ELEMS 4
f067efbf 336#define REGCP_OTHER_ELEMS 3
e0fa7e2b 337#define REGCP_FRAME_ELEMS 1
620d5b66
NC
338/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
339 * are needed for the regexp context stack bookkeeping. */
340
76e3520e 341STATIC CHECKPOINT
b93070ed 342S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 343{
97aff369 344 dVAR;
a3b680e6 345 const int retval = PL_savestack_ix;
a3b680e6 346 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
347 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
348 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 349 int p;
40a82448 350 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 351
b93070ed
DM
352 PERL_ARGS_ASSERT_REGCPPUSH;
353
e49a9654 354 if (paren_elems_to_push < 0)
5637ef5b
NC
355 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
356 paren_elems_to_push);
e49a9654 357
e0fa7e2b
NC
358 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
359 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
360 " out of range (%lu-%ld)",
361 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 362
620d5b66 363 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 364
495f47a5
DM
365 DEBUG_BUFFERS_r(
366 if ((int)PL_regsize > (int)parenfloor)
367 PerlIO_printf(Perl_debug_log,
368 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
369 PTR2UV(rex),
370 PTR2UV(rex->offs)
371 );
372 );
3280af22 373 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 374/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
375 SSPUSHINT(rex->offs[p].end);
376 SSPUSHINT(rex->offs[p].start);
1ca2007e 377 SSPUSHINT(rex->offs[p].start_tmp);
a0d0e21e 378 SSPUSHINT(p);
e7707071 379 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
380 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
381 (UV)p,
382 (IV)rex->offs[p].start,
383 (IV)rex->offs[p].start_tmp,
384 (IV)rex->offs[p].end
40a82448 385 ));
a0d0e21e 386 }
b1ce53c5 387/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 388 SSPUSHINT(PL_regsize);
b93070ed
DM
389 SSPUSHINT(rex->lastparen);
390 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 391 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 392
a0d0e21e
LW
393 return retval;
394}
395
c277df42 396/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
397#define REGCP_SET(cp) \
398 DEBUG_STATE_r( \
ab3bbdeb 399 PerlIO_printf(Perl_debug_log, \
e4f74956 400 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
401 (IV)PL_savestack_ix)); \
402 cp = PL_savestack_ix
c3464db5 403
ab3bbdeb 404#define REGCP_UNWIND(cp) \
e4f74956 405 DEBUG_STATE_r( \
ab3bbdeb 406 if (cp != PL_savestack_ix) \
e4f74956
YO
407 PerlIO_printf(Perl_debug_log, \
408 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
409 (IV)(cp), (IV)PL_savestack_ix)); \
410 regcpblow(cp)
c277df42 411
f067efbf 412STATIC void
b93070ed 413S_regcppop(pTHX_ regexp *rex)
a0d0e21e 414{
97aff369 415 dVAR;
e0fa7e2b 416 UV i;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
620d5b66 439 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 440 I32 tmps;
097eb12c 441 U32 paren = (U32)SSPOPINT;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
a0d0e21e 455 }
daf18116 456#if 1
dafc8851
JH
457 /* It would seem that the similar code in regtry()
458 * already takes care of this, and in fact it is in
459 * a better location to since this code can #if 0-ed out
460 * but the code in regtry() is needed or otherwise tests
461 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
462 * (as of patchlevel 7877) will fail. Then again,
463 * this code seems to be necessary or otherwise
225593e1
DM
464 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
465 * --jhi updated by dapm */
b93070ed 466 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 467 if (i > PL_regsize)
b93070ed
DM
468 rex->offs[i].start = -1;
469 rex->offs[i].end = -1;
495f47a5
DM
470 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
471 " \\%"UVuf": %s ..-1 undeffing\n",
472 (UV)i,
473 (i > PL_regsize) ? "-1" : " "
474 ));
a0d0e21e 475 }
dafc8851 476#endif
a0d0e21e
LW
477}
478
02db2b7b 479#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 480
a687059c 481/*
e50aee73 482 * pregexec and friends
a687059c
LW
483 */
484
76234dfb 485#ifndef PERL_IN_XSUB_RE
a687059c 486/*
c277df42 487 - pregexec - match a regexp against a string
a687059c 488 */
c277df42 489I32
49d7dfbc 490Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 491 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
492/* strend: pointer to null at end of string */
493/* strbeg: real beginning of string */
494/* minend: end of match must be >=minend after stringarg. */
495/* nosave: For optimizations. */
496{
7918f24d
NC
497 PERL_ARGS_ASSERT_PREGEXEC;
498
c277df42 499 return
9041c2e3 500 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
501 nosave ? 0 : REXEC_COPY_STR);
502}
76234dfb 503#endif
22e551b9 504
9041c2e3 505/*
cad2e5aa
JH
506 * Need to implement the following flags for reg_anch:
507 *
508 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
509 * USE_INTUIT_ML
510 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
511 * INTUIT_AUTORITATIVE_ML
512 * INTUIT_ONCE_NOML - Intuit can match in one location only.
513 * INTUIT_ONCE_ML
514 *
515 * Another flag for this function: SECOND_TIME (so that float substrs
516 * with giant delta may be not rechecked).
517 */
518
519/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
520
3f7c398e 521/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
522 Otherwise, only SvCUR(sv) is used to get strbeg. */
523
524/* XXXX We assume that strpos is strbeg unless sv. */
525
6eb5f6b9
JH
526/* XXXX Some places assume that there is a fixed substring.
527 An update may be needed if optimizer marks as "INTUITable"
528 RExen without fixed substrings. Similarly, it is assumed that
529 lengths of all the strings are no more than minlen, thus they
530 cannot come from lookahead.
40d049e4
YO
531 (Or minlen should take into account lookahead.)
532 NOTE: Some of this comment is not correct. minlen does now take account
533 of lookahead/behind. Further research is required. -- demerphq
534
535*/
6eb5f6b9 536
2c2d71f5
JH
537/* A failure to find a constant substring means that there is no need to make
538 an expensive call to REx engine, thus we celebrate a failure. Similarly,
539 finding a substring too deep into the string means that less calls to
30944b6d
IZ
540 regtry() should be needed.
541
542 REx compiler's optimizer found 4 possible hints:
543 a) Anchored substring;
544 b) Fixed substring;
545 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 546 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 547 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
548 string which does not contradict any of them.
549 */
2c2d71f5 550
6eb5f6b9
JH
551/* Most of decisions we do here should have been done at compile time.
552 The nodes of the REx which we used for the search should have been
553 deleted from the finite automaton. */
554
cad2e5aa 555char *
288b8c02 556Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 557 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 558{
97aff369 559 dVAR;
288b8c02 560 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 561 register I32 start_shift = 0;
cad2e5aa 562 /* Should be nonnegative! */
b7953727 563 register I32 end_shift = 0;
2c2d71f5
JH
564 register char *s;
565 register SV *check;
a1933d95 566 char *strbeg;
cad2e5aa 567 char *t;
f2ed9b32 568 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 569 I32 ml_anch;
bd61b366
SS
570 register char *other_last = NULL; /* other substr checked before this */
571 char *check_at = NULL; /* check substr found at this pos */
d8080198 572 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 573 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 574 RXi_GET_DECL(prog,progi);
30944b6d 575#ifdef DEBUGGING
890ce7af 576 const char * const i_strpos = strpos;
30944b6d 577#endif
a3621e74
YO
578 GET_RE_DEBUG_FLAGS_DECL;
579
7918f24d 580 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
581 PERL_UNUSED_ARG(flags);
582 PERL_UNUSED_ARG(data);
7918f24d 583
f2ed9b32 584 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 585
3c8556c3 586 if (RX_UTF8(rx)) {
b8d68ded
JH
587 PL_reg_flags |= RF_utf8;
588 }
ab3bbdeb 589 DEBUG_EXECUTE_r(
f2ed9b32 590 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
591 sv ? "Guessing start of match in sv for"
592 : "Guessing start of match in string for");
2a782b5b 593 );
cad2e5aa 594
c344f387
JH
595 /* CHR_DIST() would be more correct here but it makes things slow. */
596 if (prog->minlen > strend - strpos) {
a3621e74 597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 598 "String too short... [re_intuit_start]\n"));
cad2e5aa 599 goto fail;
2c2d71f5 600 }
1de06328 601
a1933d95 602 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 603 PL_regeol = strend;
f2ed9b32 604 if (utf8_target) {
33b8afdf
JH
605 if (!prog->check_utf8 && prog->check_substr)
606 to_utf8_substr(prog);
607 check = prog->check_utf8;
608 } else {
609 if (!prog->check_substr && prog->check_utf8)
610 to_byte_substr(prog);
611 check = prog->check_substr;
612 }
1de06328 613 if (check == &PL_sv_undef) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 615 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
616 goto fail;
617 }
bbe252da
YO
618 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
619 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
620 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 621 && !multiline ) ); /* Check after \n? */
cad2e5aa 622
7e25d62c 623 if (!ml_anch) {
bbe252da
YO
624 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
625 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 626 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
627 && sv && !SvROK(sv)
628 && (strpos != strbeg)) {
a3621e74 629 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
630 goto fail;
631 }
632 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 633 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 634 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
635 I32 slen;
636
1aa99e6b 637 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 638
653099ff
GS
639 if (SvTAIL(check)) {
640 slen = SvCUR(check); /* >= 1 */
cad2e5aa 641
9041c2e3 642 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 643 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 645 goto fail_finish;
cad2e5aa
JH
646 }
647 /* Now should match s[0..slen-2] */
648 slen--;
3f7c398e 649 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 650 || (slen > 1
3f7c398e 651 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 652 report_neq:
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
654 goto fail_finish;
655 }
cad2e5aa 656 }
3f7c398e 657 else if (*SvPVX_const(check) != *s
653099ff 658 || ((slen = SvCUR(check)) > 1
3f7c398e 659 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 660 goto report_neq;
c315bfe8 661 check_at = s;
2c2d71f5 662 goto success_at_start;
7e25d62c 663 }
cad2e5aa 664 }
2c2d71f5 665 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 666 s = strpos;
2c2d71f5 667 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
668 end_shift = prog->check_end_shift;
669
2c2d71f5 670 if (!ml_anch) {
a3b680e6 671 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 672 - (SvTAIL(check) != 0);
a3b680e6 673 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
674
675 if (end_shift < eshift)
676 end_shift = eshift;
677 }
cad2e5aa 678 }
2c2d71f5 679 else { /* Can match at random position */
cad2e5aa
JH
680 ml_anch = 0;
681 s = strpos;
1de06328
YO
682 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
683 end_shift = prog->check_end_shift;
684
685 /* end shift should be non negative here */
cad2e5aa
JH
686 }
687
bcdf7404 688#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 689 if (end_shift < 0)
1de06328 690 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 691 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
692#endif
693
2c2d71f5
JH
694 restart:
695 /* Find a possible match in the region s..strend by looking for
696 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
697
698 {
699 I32 srch_start_shift = start_shift;
700 I32 srch_end_shift = end_shift;
c33e64f0
FC
701 U8* start_point;
702 U8* end_point;
1de06328
YO
703 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
704 srch_end_shift -= ((strbeg - s) - srch_start_shift);
705 srch_start_shift = strbeg - s;
706 }
6bda09f9 707 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
708 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
709 (IV)prog->check_offset_min,
710 (IV)srch_start_shift,
711 (IV)srch_end_shift,
712 (IV)prog->check_end_shift);
713 });
714
bbe252da 715 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
716 start_point= (U8*)(s + srch_start_shift);
717 end_point= (U8*)(strend - srch_end_shift);
718 } else {
719 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
720 end_point= HOP3(strend, -srch_end_shift, strbeg);
721 }
6bda09f9 722 DEBUG_OPTIMISE_MORE_r({
56570a2c 723 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 724 (int)(end_point - start_point),
fc8cd66c 725 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
726 start_point);
727 });
728
729 s = fbm_instr( start_point, end_point,
7fba1cd6 730 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 731 }
cad2e5aa
JH
732 /* Update the count-of-usability, remove useless subpatterns,
733 unshift s. */
2c2d71f5 734
ab3bbdeb 735 DEBUG_EXECUTE_r({
f2ed9b32 736 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
737 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
738 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 739 (s ? "Found" : "Did not find"),
f2ed9b32 740 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
741 ? "anchored" : "floating"),
742 quoted,
743 RE_SV_TAIL(check),
744 (s ? " at offset " : "...\n") );
745 });
2c2d71f5
JH
746
747 if (!s)
748 goto fail_finish;
2c2d71f5 749 /* Finish the diagnostic message */
a3621e74 750 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 751
1de06328
YO
752 /* XXX dmq: first branch is for positive lookbehind...
753 Our check string is offset from the beginning of the pattern.
754 So we need to do any stclass tests offset forward from that
755 point. I think. :-(
756 */
757
758
759
760 check_at=s;
761
762
2c2d71f5
JH
763 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
764 Start with the other substr.
765 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 766 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
767 *always* match. Probably should be marked during compile...
768 Probably it is right to do no SCREAM here...
769 */
770
f2ed9b32 771 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
772 : (prog->float_substr && prog->anchored_substr))
773 {
30944b6d 774 /* Take into account the "other" substring. */
2c2d71f5
JH
775 /* XXXX May be hopelessly wrong for UTF... */
776 if (!other_last)
6eb5f6b9 777 other_last = strpos;
f2ed9b32 778 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
779 do_other_anchored:
780 {
890ce7af
AL
781 char * const last = HOP3c(s, -start_shift, strbeg);
782 char *last1, *last2;
be8e71aa 783 char * const saved_s = s;
33b8afdf 784 SV* must;
2c2d71f5 785
2c2d71f5
JH
786 t = s - prog->check_offset_max;
787 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 788 && (!utf8_target
0ce71af7 789 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 790 && t > strpos)))
6f207bd3 791 NOOP;
2c2d71f5
JH
792 else
793 t = strpos;
1aa99e6b 794 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
795 if (t < other_last) /* These positions already checked */
796 t = other_last;
1aa99e6b 797 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
798 if (last < last1)
799 last1 = last;
1de06328
YO
800 /* XXXX It is not documented what units *_offsets are in.
801 We assume bytes, but this is clearly wrong.
802 Meaning this code needs to be carefully reviewed for errors.
803 dmq.
804 */
805
2c2d71f5 806 /* On end-of-str: see comment below. */
f2ed9b32 807 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
808 if (must == &PL_sv_undef) {
809 s = (char*)NULL;
1de06328 810 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
811 }
812 else
813 s = fbm_instr(
814 (unsigned char*)t,
815 HOP3(HOP3(last1, prog->anchored_offset, strend)
816 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
817 must,
7fba1cd6 818 multiline ? FBMrf_MULTILINE : 0
33b8afdf 819 );
ab3bbdeb 820 DEBUG_EXECUTE_r({
f2ed9b32 821 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
822 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
823 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 824 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
825 quoted, RE_SV_TAIL(must));
826 });
827
828
2c2d71f5
JH
829 if (!s) {
830 if (last1 >= last2) {
a3621e74 831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
832 ", giving up...\n"));
833 goto fail_finish;
834 }
a3621e74 835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 836 ", trying floating at offset %ld...\n",
be8e71aa 837 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
838 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
839 s = HOP3c(last, 1, strend);
2c2d71f5
JH
840 goto restart;
841 }
842 else {
a3621e74 843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 844 (long)(s - i_strpos)));
1aa99e6b
IH
845 t = HOP3c(s, -prog->anchored_offset, strbeg);
846 other_last = HOP3c(s, 1, strend);
be8e71aa 847 s = saved_s;
2c2d71f5
JH
848 if (t == strpos)
849 goto try_at_start;
2c2d71f5
JH
850 goto try_at_offset;
851 }
30944b6d 852 }
2c2d71f5
JH
853 }
854 else { /* Take into account the floating substring. */
33b8afdf 855 char *last, *last1;
be8e71aa 856 char * const saved_s = s;
33b8afdf
JH
857 SV* must;
858
859 t = HOP3c(s, -start_shift, strbeg);
860 last1 = last =
861 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
862 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
863 last = HOP3c(t, prog->float_max_offset, strend);
864 s = HOP3c(t, prog->float_min_offset, strend);
865 if (s < other_last)
866 s = other_last;
2c2d71f5 867 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 868 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
869 /* fbm_instr() takes into account exact value of end-of-str
870 if the check is SvTAIL(ed). Since false positives are OK,
871 and end-of-str is not later than strend we are OK. */
872 if (must == &PL_sv_undef) {
873 s = (char*)NULL;
1de06328 874 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
875 }
876 else
2c2d71f5 877 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
878 (unsigned char*)last + SvCUR(must)
879 - (SvTAIL(must)!=0),
7fba1cd6 880 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 881 DEBUG_EXECUTE_r({
f2ed9b32 882 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
883 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
884 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 885 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
886 quoted, RE_SV_TAIL(must));
887 });
33b8afdf
JH
888 if (!s) {
889 if (last1 == last) {
a3621e74 890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
891 ", giving up...\n"));
892 goto fail_finish;
2c2d71f5 893 }
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 895 ", trying anchored starting at offset %ld...\n",
be8e71aa 896 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
897 other_last = last;
898 s = HOP3c(t, 1, strend);
899 goto restart;
900 }
901 else {
a3621e74 902 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
903 (long)(s - i_strpos)));
904 other_last = s; /* Fix this later. --Hugo */
be8e71aa 905 s = saved_s;
33b8afdf
JH
906 if (t == strpos)
907 goto try_at_start;
908 goto try_at_offset;
909 }
2c2d71f5 910 }
cad2e5aa 911 }
2c2d71f5 912
1de06328 913
9ef43ace 914 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 915
6bda09f9 916 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
917 PerlIO_printf(Perl_debug_log,
918 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
919 (IV)prog->check_offset_min,
920 (IV)prog->check_offset_max,
921 (IV)(s-strpos),
922 (IV)(t-strpos),
923 (IV)(t-s),
924 (IV)(strend-strpos)
925 )
926 );
927
2c2d71f5 928 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 929 && (!utf8_target
9ef43ace 930 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
931 && t > strpos)))
932 {
2c2d71f5
JH
933 /* Fixed substring is found far enough so that the match
934 cannot start at strpos. */
935 try_at_offset:
cad2e5aa 936 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
937 /* Eventually fbm_*() should handle this, but often
938 anchored_offset is not 0, so this check will not be wasted. */
939 /* XXXX In the code below we prefer to look for "^" even in
940 presence of anchored substrings. And we search even
941 beyond the found float position. These pessimizations
942 are historical artefacts only. */
943 find_anchor:
2c2d71f5 944 while (t < strend - prog->minlen) {
cad2e5aa 945 if (*t == '\n') {
4ee3650e 946 if (t < check_at - prog->check_offset_min) {
f2ed9b32 947 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
948 /* Since we moved from the found position,
949 we definitely contradict the found anchored
30944b6d
IZ
950 substr. Due to the above check we do not
951 contradict "check" substr.
952 Thus we can arrive here only if check substr
953 is float. Redo checking for "other"=="fixed".
954 */
9041c2e3 955 strpos = t + 1;
a3621e74 956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 957 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
958 goto do_other_anchored;
959 }
4ee3650e
GS
960 /* We don't contradict the found floating substring. */
961 /* XXXX Why not check for STCLASS? */
cad2e5aa 962 s = t + 1;
a3621e74 963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 964 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
965 goto set_useful;
966 }
4ee3650e
GS
967 /* Position contradicts check-string */
968 /* XXXX probably better to look for check-string
969 than for "\n", so one should lower the limit for t? */
a3621e74 970 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 971 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 972 other_last = strpos = s = t + 1;
cad2e5aa
JH
973 goto restart;
974 }
975 t++;
976 }
a3621e74 977 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 978 PL_colors[0], PL_colors[1]));
2c2d71f5 979 goto fail_finish;
cad2e5aa 980 }
f5952150 981 else {
a3621e74 982 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 983 PL_colors[0], PL_colors[1]));
f5952150 984 }
cad2e5aa
JH
985 s = t;
986 set_useful:
f2ed9b32 987 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
988 }
989 else {
f5952150 990 /* The found string does not prohibit matching at strpos,
2c2d71f5 991 - no optimization of calling REx engine can be performed,
f5952150
GS
992 unless it was an MBOL and we are not after MBOL,
993 or a future STCLASS check will fail this. */
2c2d71f5
JH
994 try_at_start:
995 /* Even in this situation we may use MBOL flag if strpos is offset
996 wrt the start of the string. */
05b4157f 997 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 998 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 999 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1000 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1001 {
cad2e5aa
JH
1002 t = strpos;
1003 goto find_anchor;
1004 }
a3621e74 1005 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1006 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1007 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1008 );
2c2d71f5 1009 success_at_start:
bbe252da 1010 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1011 && (utf8_target ? (
33b8afdf
JH
1012 prog->check_utf8 /* Could be deleted already */
1013 && --BmUSEFUL(prog->check_utf8) < 0
1014 && (prog->check_utf8 == prog->float_utf8)
1015 ) : (
1016 prog->check_substr /* Could be deleted already */
1017 && --BmUSEFUL(prog->check_substr) < 0
1018 && (prog->check_substr == prog->float_substr)
1019 )))
66e933ab 1020 {
cad2e5aa 1021 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1022 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1023 /* XXX Does the destruction order has to change with utf8_target? */
1024 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1025 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1026 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1027 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1028 check = NULL; /* abort */
cad2e5aa 1029 s = strpos;
486ec47a 1030 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1031 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1032 if (prog->intflags & PREGf_IMPLICIT)
1033 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1034 /* XXXX This is a remnant of the old implementation. It
1035 looks wasteful, since now INTUIT can use many
6eb5f6b9 1036 other heuristics. */
bbe252da 1037 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1038 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1039 }
1040 else
1041 s = strpos;
1042 }
1043
6eb5f6b9
JH
1044 /* Last resort... */
1045 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1046 /* trie stclasses are too expensive to use here, we are better off to
1047 leave it to regmatch itself */
f8fc2ecf 1048 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1049 /* minlen == 0 is possible if regstclass is \b or \B,
1050 and the fixed substr is ''$.
1051 Since minlen is already taken into account, s+1 is before strend;
1052 accidentally, minlen >= 1 guaranties no false positives at s + 1
1053 even for \b or \B. But (minlen? 1 : 0) below assumes that
1054 regstclass does not come from lookahead... */
1055 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1056 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1057 const U8* const str = (U8*)STRING(progi->regstclass);
1058 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1059 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1060 : 1);
1de06328
YO
1061 char * endpos;
1062 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1063 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1064 else if (prog->float_substr || prog->float_utf8)
1065 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1066 else
1067 endpos= strend;
1068
d8080198
YO
1069 if (checked_upto < s)
1070 checked_upto = s;
1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1072 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1073
6eb5f6b9 1074 t = s;
d8080198
YO
1075 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1076 if (s) {
1077 checked_upto = s;
1078 } else {
6eb5f6b9 1079#ifdef DEBUGGING
cbbf8932 1080 const char *what = NULL;
6eb5f6b9
JH
1081#endif
1082 if (endpos == strend) {
a3621e74 1083 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1084 "Could not match STCLASS...\n") );
1085 goto fail;
1086 }
a3621e74 1087 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1088 "This position contradicts STCLASS...\n") );
bbe252da 1089 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1090 goto fail;
d8080198
YO
1091 checked_upto = HOPBACKc(endpos, start_shift);
1092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1093 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1094 /* Contradict one of substrings */
33b8afdf 1095 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1096 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1097 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1098 hop_and_restart:
1aa99e6b 1099 s = HOP3c(t, 1, strend);
66e933ab
GS
1100 if (s + start_shift + end_shift > strend) {
1101 /* XXXX Should be taken into account earlier? */
a3621e74 1102 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1103 "Could not match STCLASS...\n") );
1104 goto fail;
1105 }
5e39e1e5
HS
1106 if (!check)
1107 goto giveup;
a3621e74 1108 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1109 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1110 what, (long)(s + start_shift - i_strpos)) );
1111 goto restart;
1112 }
66e933ab 1113 /* Have both, check_string is floating */
6eb5f6b9
JH
1114 if (t + start_shift >= check_at) /* Contradicts floating=check */
1115 goto retry_floating_check;
1116 /* Recheck anchored substring, but not floating... */
9041c2e3 1117 s = check_at;
5e39e1e5
HS
1118 if (!check)
1119 goto giveup;
a3621e74 1120 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1121 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1122 (long)(other_last - i_strpos)) );
1123 goto do_other_anchored;
1124 }
60e71179
GS
1125 /* Another way we could have checked stclass at the
1126 current position only: */
1127 if (ml_anch) {
1128 s = t = t + 1;
5e39e1e5
HS
1129 if (!check)
1130 goto giveup;
a3621e74 1131 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1132 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1133 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1134 goto try_at_offset;
66e933ab 1135 }
f2ed9b32 1136 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1137 goto fail;
486ec47a 1138 /* Check is floating substring. */
6eb5f6b9
JH
1139 retry_floating_check:
1140 t = check_at - start_shift;
a3621e74 1141 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1142 goto hop_and_restart;
1143 }
b7953727 1144 if (t != s) {
a3621e74 1145 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1146 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1147 (long)(t - i_strpos), (long)(s - i_strpos))
1148 );
1149 }
1150 else {
a3621e74 1151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1152 "Does not contradict STCLASS...\n");
1153 );
1154 }
6eb5f6b9 1155 }
5e39e1e5 1156 giveup:
a3621e74 1157 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1158 PL_colors[4], (check ? "Guessed" : "Giving up"),
1159 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1160 return s;
2c2d71f5
JH
1161
1162 fail_finish: /* Substring not found */
33b8afdf 1163 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1164 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1165 fail:
a3621e74 1166 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1167 PL_colors[4], PL_colors[5]));
bd61b366 1168 return NULL;
cad2e5aa 1169}
9661b544 1170
a0a388a1
YO
1171#define DECL_TRIE_TYPE(scan) \
1172 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1173 trie_type = ((scan->flags == EXACT) \
1174 ? (utf8_target ? trie_utf8 : trie_plain) \
1175 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1176
1177#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1178uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1179 STRLEN skiplen; \
1180 switch (trie_type) { \
1181 case trie_utf8_fold: \
1182 if ( foldlen>0 ) { \
1183 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1184 foldlen -= len; \
1185 uscan += len; \
1186 len=0; \
1187 } else { \
1188 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1189 len = UTF8SKIP(uc); \
1190 skiplen = UNISKIP( uvc ); \
1191 foldlen -= skiplen; \
1192 uscan = foldbuf + skiplen; \
1193 } \
1194 break; \
1195 case trie_latin_utf8_fold: \
1196 if ( foldlen>0 ) { \
1197 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1198 foldlen -= len; \
1199 uscan += len; \
1200 len=0; \
1201 } else { \
1202 len = 1; \
1203 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1204 skiplen = UNISKIP( uvc ); \
1205 foldlen -= skiplen; \
1206 uscan = foldbuf + skiplen; \
1207 } \
1208 break; \
1209 case trie_utf8: \
1210 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1211 break; \
1212 case trie_plain: \
1213 uvc = (UV)*uc; \
1214 len = 1; \
1215 } \
1216 if (uvc < 256) { \
1217 charid = trie->charmap[ uvc ]; \
1218 } \
1219 else { \
1220 charid = 0; \
1221 if (widecharmap) { \
1222 SV** const svpp = hv_fetch(widecharmap, \
1223 (char*)&uvc, sizeof(UV), 0); \
1224 if (svpp) \
1225 charid = (U16)SvIV(*svpp); \
1226 } \
1227 } \
4cadc6a9
YO
1228} STMT_END
1229
4cadc6a9
YO
1230#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1231STMT_START { \
1232 while (s <= e) { \
1233 if ( (CoNd) \
fac1af77 1234 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1235 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1236 goto got_it; \
1237 s++; \
1238 } \
1239} STMT_END
1240
1241#define REXEC_FBC_UTF8_SCAN(CoDe) \
1242STMT_START { \
1243 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1244 CoDe \
1245 s += uskip; \
1246 } \
1247} STMT_END
1248
1249#define REXEC_FBC_SCAN(CoDe) \
1250STMT_START { \
1251 while (s < strend) { \
1252 CoDe \
1253 s++; \
1254 } \
1255} STMT_END
1256
1257#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1258REXEC_FBC_UTF8_SCAN( \
1259 if (CoNd) { \
24b23f37 1260 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1261 goto got_it; \
1262 else \
1263 tmp = doevery; \
1264 } \
1265 else \
1266 tmp = 1; \
1267)
1268
1269#define REXEC_FBC_CLASS_SCAN(CoNd) \
1270REXEC_FBC_SCAN( \
1271 if (CoNd) { \
24b23f37 1272 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1273 goto got_it; \
1274 else \
1275 tmp = doevery; \
1276 } \
1277 else \
1278 tmp = 1; \
1279)
1280
1281#define REXEC_FBC_TRYIT \
24b23f37 1282if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1283 goto got_it
1284
e1d1eefb 1285#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1286 if (utf8_target) { \
e1d1eefb
YO
1287 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1288 } \
1289 else { \
1290 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1291 }
e1d1eefb 1292
4cadc6a9 1293#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1294 if (utf8_target) { \
4cadc6a9
YO
1295 UtFpReLoAd; \
1296 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1297 } \
1298 else { \
1299 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1300 }
4cadc6a9
YO
1301
1302#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1303 PL_reg_flags |= RF_tainted; \
f2ed9b32 1304 if (utf8_target) { \
4cadc6a9
YO
1305 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1306 } \
1307 else { \
1308 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1309 }
4cadc6a9 1310
786e8c11
YO
1311#define DUMP_EXEC_POS(li,s,doutf8) \
1312 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1313
cfaf538b
KW
1314
1315#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1316 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1317 tmp = TEST_NON_UTF8(tmp); \
1318 REXEC_FBC_UTF8_SCAN( \
1319 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1320 tmp = !tmp; \
1321 IF_SUCCESS; \
1322 } \
1323 else { \
1324 IF_FAIL; \
1325 } \
1326 ); \
1327
1328#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1329 if (s == PL_bostr) { \
1330 tmp = '\n'; \
1331 } \
1332 else { \
1333 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1334 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1335 } \
1336 tmp = TeSt1_UtF8; \
1337 LOAD_UTF8_CHARCLASS_ALNUM(); \
1338 REXEC_FBC_UTF8_SCAN( \
1339 if (tmp == ! (TeSt2_UtF8)) { \
1340 tmp = !tmp; \
1341 IF_SUCCESS; \
1342 } \
1343 else { \
1344 IF_FAIL; \
1345 } \
1346 ); \
1347
63ac0dad
KW
1348/* The only difference between the BOUND and NBOUND cases is that
1349 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1350 * NBOUND. This is accomplished by passing it in either the if or else clause,
1351 * with the other one being empty */
1352#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1353 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1354
1355#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1356 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1357
1358#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1359 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1360
1361#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1362 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1363
63ac0dad
KW
1364
1365/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1366 * be passed in completely with the variable name being tested, which isn't
1367 * such a clean interface, but this is easier to read than it was before. We
1368 * are looking for the boundary (or non-boundary between a word and non-word
1369 * character. The utf8 and non-utf8 cases have the same logic, but the details
1370 * must be different. Find the "wordness" of the character just prior to this
1371 * one, and compare it with the wordness of this one. If they differ, we have
1372 * a boundary. At the beginning of the string, pretend that the previous
1373 * character was a new-line */
cfaf538b 1374#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1375 if (utf8_target) { \
cfaf538b 1376 UTF8_CODE \
63ac0dad
KW
1377 } \
1378 else { /* Not utf8 */ \
1379 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1380 tmp = TEST_NON_UTF8(tmp); \
1381 REXEC_FBC_SCAN( \
1382 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1383 tmp = !tmp; \
1384 IF_SUCCESS; \
1385 } \
1386 else { \
1387 IF_FAIL; \
1388 } \
1389 ); \
1390 } \
1391 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1392 goto got_it;
1393
786e8c11
YO
1394/* We know what class REx starts with. Try to find this position... */
1395/* if reginfo is NULL, its a dryrun */
1396/* annoyingly all the vars in this routine have different names from their counterparts
1397 in regmatch. /grrr */
1398
3c3eec57 1399STATIC char *
07be1b83 1400S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1401 const char *strend, regmatch_info *reginfo)
a687059c 1402{
27da23d5 1403 dVAR;
bbe252da 1404 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1405 char *pat_string; /* The pattern's exactish string */
1406 char *pat_end; /* ptr to end char of pat_string */
1407 re_fold_t folder; /* Function for computing non-utf8 folds */
1408 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1409 STRLEN ln;
5dab1207 1410 STRLEN lnc;
078c425b 1411 register STRLEN uskip;
fac1af77
KW
1412 U8 c1;
1413 U8 c2;
6eb5f6b9
JH
1414 char *e;
1415 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1416 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1417 UV utf8_fold_flags = 0;
f8fc2ecf 1418 RXi_GET_DECL(prog,progi);
7918f24d
NC
1419
1420 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1421
6eb5f6b9
JH
1422 /* We know what class it must start with. */
1423 switch (OP(c)) {
f56b6394 1424 case ANYOFV:
6eb5f6b9 1425 case ANYOF:
f56b6394 1426 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1427 STRLEN inclasslen = strend - s;
1428 REXEC_FBC_UTF8_CLASS_SCAN(
1429 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1430 }
1431 else {
6ef69d56 1432 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1433 }
6eb5f6b9 1434 break;
f33976b4 1435 case CANY:
4cadc6a9 1436 REXEC_FBC_SCAN(
24b23f37 1437 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1438 goto got_it;
1439 else
1440 tmp = doevery;
4cadc6a9 1441 );
f33976b4 1442 break;
2f7f8cb1
KW
1443
1444 case EXACTFA:
1445 if (UTF_PATTERN || utf8_target) {
1446 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1447 goto do_exactf_utf8;
1448 }
1449 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1450 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1451 goto do_exactf_non_utf8; /* isn't dealt with by these */
1452
6eb5f6b9 1453 case EXACTF:
62bf7766 1454 if (utf8_target) {
77a6d856
KW
1455
1456 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1457 utf8_fold_flags = 0;
fac1af77
KW
1458 goto do_exactf_utf8;
1459 }
1460 fold_array = PL_fold;
1461 folder = foldEQ;
1462 goto do_exactf_non_utf8;
1463
1464 case EXACTFL:
1465 if (UTF_PATTERN || utf8_target) {
17580e7a 1466 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1467 goto do_exactf_utf8;
1468 }
1469 fold_array = PL_fold_locale;
1470 folder = foldEQ_locale;
16d951b7
KW
1471 goto do_exactf_non_utf8;
1472
3c760661
KW
1473 case EXACTFU_SS:
1474 if (UTF_PATTERN) {
1475 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1476 }
1477 goto do_exactf_utf8;
1478
fab2782b 1479 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1480 case EXACTFU:
1481 if (UTF_PATTERN || utf8_target) {
77a6d856 1482 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1483 goto do_exactf_utf8;
1484 }
1485
1486 /* Any 'ss' in the pattern should have been replaced by regcomp,
1487 * so we don't have to worry here about this single special case
1488 * in the Latin1 range */
1489 fold_array = PL_fold_latin1;
1490 folder = foldEQ_latin1;
fac1af77
KW
1491
1492 /* FALL THROUGH */
1493
62bf7766
KW
1494 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1495 are no glitches with fold-length differences
1496 between the target string and pattern */
fac1af77
KW
1497
1498 /* The idea in the non-utf8 EXACTF* cases is to first find the
1499 * first character of the EXACTF* node and then, if necessary,
1500 * case-insensitively compare the full text of the node. c1 is the
1501 * first character. c2 is its fold. This logic will not work for
1502 * Unicode semantics and the german sharp ss, which hence should
1503 * not be compiled into a node that gets here. */
1504 pat_string = STRING(c);
1505 ln = STR_LEN(c); /* length to match in octets/bytes */
1506
8a90a8fe
KW
1507 /* We know that we have to match at least 'ln' bytes (which is the
1508 * same as characters, since not utf8). If we have to match 3
1509 * characters, and there are only 2 availabe, we know without
1510 * trying that it will fail; so don't start a match past the
1511 * required minimum number from the far end */
fac1af77
KW
1512 e = HOP3c(strend, -((I32)ln), s);
1513
1514 if (!reginfo && e < s) {
1515 e = s; /* Due to minlen logic of intuit() */
1516 }
1517
1518 c1 = *pat_string;
1519 c2 = fold_array[c1];
1520 if (c1 == c2) { /* If char and fold are the same */
1521 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1522 }
1523 else {
1524 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1525 }
1526 break;
1527
1528 do_exactf_utf8:
e067297c
KW
1529 {
1530 unsigned expansion;
1531
fac1af77
KW
1532
1533 /* If one of the operands is in utf8, we can't use the simpler
1534 * folding above, due to the fact that many different characters
1535 * can have the same fold, or portion of a fold, or different-
1536 * length fold */
1537 pat_string = STRING(c);
1538 ln = STR_LEN(c); /* length to match in octets/bytes */
1539 pat_end = pat_string + ln;
1540 lnc = (UTF_PATTERN) /* length to match in characters */
1541 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1542 : ln;
1543
e067297c
KW
1544 /* We have 'lnc' characters to match in the pattern, but because of
1545 * multi-character folding, each character in the target can match
1546 * up to 3 characters (Unicode guarantees it will never exceed
1547 * this) if it is utf8-encoded; and up to 2 if not (based on the
1548 * fact that the Latin 1 folds are already determined, and the
1549 * only multi-char fold in that range is the sharp-s folding to
1550 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1551 * string character. Adjust lnc accordingly, rounding up, so that
1552 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1553 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1554 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1555
1556 /* As in the non-UTF8 case, if we have to match 3 characters, and
1557 * only 2 are left, it's guaranteed to fail, so don't start a
1558 * match that would require us to go beyond the end of the string
1559 */
1560 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1561
1562 if (!reginfo && e < s) {
1563 e = s; /* Due to minlen logic of intuit() */
1564 }
1565
b33105db
KW
1566 /* XXX Note that we could recalculate e to stop the loop earlier,
1567 * as the worst case expansion above will rarely be met, and as we
1568 * go along we would usually find that e moves further to the left.
1569 * This would happen only after we reached the point in the loop
1570 * where if there were no expansion we should fail. Unclear if
1571 * worth the expense */
e067297c 1572
fac1af77
KW
1573 while (s <= e) {
1574 char *my_strend= (char *)strend;
1575 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1576 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1577 && (!reginfo || regtry(reginfo, &s)) )
1578 {
1579 goto got_it;
1580 }
bbdd8bad 1581 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1582 }
1583 break;
e067297c 1584 }
bbce6d69 1585 case BOUNDL:
3280af22 1586 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1587 FBC_BOUND(isALNUM_LC,
1588 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1589 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1590 break;
bbce6d69 1591 case NBOUNDL:
3280af22 1592 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1593 FBC_NBOUND(isALNUM_LC,
1594 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1595 isALNUM_LC_utf8((U8*)s));
1596 break;
1597 case BOUND:
1598 FBC_BOUND(isWORDCHAR,
1599 isALNUM_uni(tmp),
1600 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1601 break;
cfaf538b
KW
1602 case BOUNDA:
1603 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1604 isWORDCHAR_A(tmp),
1605 isWORDCHAR_A((U8*)s));
1606 break;
a0d0e21e 1607 case NBOUND:
63ac0dad
KW
1608 FBC_NBOUND(isWORDCHAR,
1609 isALNUM_uni(tmp),
1610 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1611 break;
cfaf538b
KW
1612 case NBOUNDA:
1613 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1614 isWORDCHAR_A(tmp),
1615 isWORDCHAR_A((U8*)s));
1616 break;
63ac0dad
KW
1617 case BOUNDU:
1618 FBC_BOUND(isWORDCHAR_L1,
1619 isALNUM_uni(tmp),
1620 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1621 break;
1622 case NBOUNDU:
1623 FBC_NBOUND(isWORDCHAR_L1,
1624 isALNUM_uni(tmp),
1625 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1626 break;
bbce6d69 1627 case ALNUML:
4cadc6a9
YO
1628 REXEC_FBC_CSCAN_TAINT(
1629 isALNUM_LC_utf8((U8*)s),
1630 isALNUM_LC(*s)
1631 );
6895a8aa 1632 break;
980866de
KW
1633 case ALNUMU:
1634 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1635 LOAD_UTF8_CHARCLASS_ALNUM(),
1636 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1637 isWORDCHAR_L1((U8) *s)
1638 );
6895a8aa 1639 break;
980866de
KW
1640 case ALNUM:
1641 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1642 LOAD_UTF8_CHARCLASS_ALNUM(),
1643 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1644 isWORDCHAR((U8) *s)
1645 );
6895a8aa 1646 break;
cfaf538b 1647 case ALNUMA:
8e9da4d4
KW
1648 /* Don't need to worry about utf8, as it can match only a single
1649 * byte invariant character */
cfaf538b 1650 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1651 break;
980866de
KW
1652 case NALNUMU:
1653 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1654 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1655 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1656 ! isWORDCHAR_L1((U8) *s)
1657 );
6895a8aa 1658 break;
a0d0e21e 1659 case NALNUM:
4cadc6a9 1660 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1661 LOAD_UTF8_CHARCLASS_ALNUM(),
1662 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1663 ! isALNUM(*s)
4cadc6a9 1664 );
6895a8aa 1665 break;
cfaf538b 1666 case NALNUMA:
8e9da4d4
KW
1667 REXEC_FBC_CSCAN(
1668 !isWORDCHAR_A(*s),
1669 !isWORDCHAR_A(*s)
1670 );
1671 break;
bbce6d69 1672 case NALNUML:
4cadc6a9
YO
1673 REXEC_FBC_CSCAN_TAINT(
1674 !isALNUM_LC_utf8((U8*)s),
1675 !isALNUM_LC(*s)
1676 );
6895a8aa 1677 break;
980866de
KW
1678 case SPACEU:
1679 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1680 LOAD_UTF8_CHARCLASS_SPACE(),
1681 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1682 isSPACE_L1((U8) *s)
1683 );
6895a8aa 1684 break;
a0d0e21e 1685 case SPACE:
4cadc6a9 1686 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1687 LOAD_UTF8_CHARCLASS_SPACE(),
1688 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1689 isSPACE((U8) *s)
4cadc6a9 1690 );
6895a8aa 1691 break;
cfaf538b 1692 case SPACEA:
8e9da4d4
KW
1693 /* Don't need to worry about utf8, as it can match only a single
1694 * byte invariant character */
cfaf538b 1695 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1696 break;
bbce6d69 1697 case SPACEL:
4cadc6a9 1698 REXEC_FBC_CSCAN_TAINT(
6bbba904 1699 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1700 isSPACE_LC(*s)
1701 );
6895a8aa 1702 break;
980866de
KW
1703 case NSPACEU:
1704 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1705 LOAD_UTF8_CHARCLASS_SPACE(),
1706 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1707 ! isSPACE_L1((U8) *s)
1708 );
6895a8aa 1709 break;
a0d0e21e 1710 case NSPACE:
4cadc6a9 1711 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1712 LOAD_UTF8_CHARCLASS_SPACE(),
1713 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1714 ! isSPACE((U8) *s)
4cadc6a9 1715 );
6895a8aa 1716 break;
cfaf538b 1717 case NSPACEA:
8e9da4d4
KW
1718 REXEC_FBC_CSCAN(
1719 !isSPACE_A(*s),
1720 !isSPACE_A(*s)
1721 );
1722 break;
bbce6d69 1723 case NSPACEL:
4cadc6a9 1724 REXEC_FBC_CSCAN_TAINT(
6bbba904 1725 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1726 !isSPACE_LC(*s)
1727 );
6895a8aa 1728 break;
a0d0e21e 1729 case DIGIT:
4cadc6a9 1730 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1731 LOAD_UTF8_CHARCLASS_DIGIT(),
1732 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1733 isDIGIT(*s)
1734 );
6895a8aa 1735 break;
cfaf538b 1736 case DIGITA:
8e9da4d4
KW
1737 /* Don't need to worry about utf8, as it can match only a single
1738 * byte invariant character */
cfaf538b 1739 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1740 break;
b8c5462f 1741 case DIGITL:
4cadc6a9
YO
1742 REXEC_FBC_CSCAN_TAINT(
1743 isDIGIT_LC_utf8((U8*)s),
1744 isDIGIT_LC(*s)
1745 );
6895a8aa 1746 break;
a0d0e21e 1747 case NDIGIT:
4cadc6a9 1748 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1749 LOAD_UTF8_CHARCLASS_DIGIT(),
1750 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1751 !isDIGIT(*s)
1752 );
6895a8aa 1753 break;
cfaf538b 1754 case NDIGITA:
8e9da4d4
KW
1755 REXEC_FBC_CSCAN(
1756 !isDIGIT_A(*s),
1757 !isDIGIT_A(*s)
1758 );
1759 break;
b8c5462f 1760 case NDIGITL:
4cadc6a9
YO
1761 REXEC_FBC_CSCAN_TAINT(
1762 !isDIGIT_LC_utf8((U8*)s),
1763 !isDIGIT_LC(*s)
1764 );
6895a8aa 1765 break;
e1d1eefb
YO
1766 case LNBREAK:
1767 REXEC_FBC_CSCAN(
1768 is_LNBREAK_utf8(s),
1769 is_LNBREAK_latin1(s)
1770 );
6895a8aa 1771 break;
e1d1eefb
YO
1772 case VERTWS:
1773 REXEC_FBC_CSCAN(
1774 is_VERTWS_utf8(s),
1775 is_VERTWS_latin1(s)
1776 );
6895a8aa 1777 break;
e1d1eefb
YO
1778 case NVERTWS:
1779 REXEC_FBC_CSCAN(
1780 !is_VERTWS_utf8(s),
1781 !is_VERTWS_latin1(s)
1782 );
6895a8aa 1783 break;
e1d1eefb
YO
1784 case HORIZWS:
1785 REXEC_FBC_CSCAN(
1786 is_HORIZWS_utf8(s),
1787 is_HORIZWS_latin1(s)
1788 );
6895a8aa 1789 break;
e1d1eefb
YO
1790 case NHORIZWS:
1791 REXEC_FBC_CSCAN(
1792 !is_HORIZWS_utf8(s),
1793 !is_HORIZWS_latin1(s)
1794 );
6895a8aa 1795 break;
1de06328
YO
1796 case AHOCORASICKC:
1797 case AHOCORASICK:
07be1b83 1798 {
a0a388a1 1799 DECL_TRIE_TYPE(c);
07be1b83
YO
1800 /* what trie are we using right now */
1801 reg_ac_data *aho
f8fc2ecf 1802 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1803 reg_trie_data *trie
1804 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1805 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1806
1807 const char *last_start = strend - trie->minlen;
6148ee25 1808#ifdef DEBUGGING
07be1b83 1809 const char *real_start = s;
6148ee25 1810#endif
07be1b83 1811 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1812 SV *sv_points;
1813 U8 **points; /* map of where we were in the input string
786e8c11 1814 when reading a given char. For ASCII this
be8e71aa 1815 is unnecessary overhead as the relationship
38a44b82
NC
1816 is always 1:1, but for Unicode, especially
1817 case folded Unicode this is not true. */
f9e705e8 1818 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1819 U8 *bitmap=NULL;
1820
07be1b83
YO
1821
1822 GET_RE_DEBUG_FLAGS_DECL;
1823
be8e71aa
YO
1824 /* We can't just allocate points here. We need to wrap it in
1825 * an SV so it gets freed properly if there is a croak while
1826 * running the match */
1827 ENTER;
1828 SAVETMPS;
1829 sv_points=newSV(maxlen * sizeof(U8 *));
1830 SvCUR_set(sv_points,
1831 maxlen * sizeof(U8 *));
1832 SvPOK_on(sv_points);
1833 sv_2mortal(sv_points);
1834 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1835 if ( trie_type != trie_utf8_fold
1836 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1837 {
786e8c11
YO
1838 if (trie->bitmap)
1839 bitmap=(U8*)trie->bitmap;
1840 else
1841 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1842 }
786e8c11
YO
1843 /* this is the Aho-Corasick algorithm modified a touch
1844 to include special handling for long "unknown char"
1845 sequences. The basic idea being that we use AC as long
1846 as we are dealing with a possible matching char, when
1847 we encounter an unknown char (and we have not encountered
1848 an accepting state) we scan forward until we find a legal
1849 starting char.
1850 AC matching is basically that of trie matching, except
1851 that when we encounter a failing transition, we fall back
1852 to the current states "fail state", and try the current char
1853 again, a process we repeat until we reach the root state,
1854 state 1, or a legal transition. If we fail on the root state
1855 then we can either terminate if we have reached an accepting
1856 state previously, or restart the entire process from the beginning
1857 if we have not.
1858
1859 */
07be1b83
YO
1860 while (s <= last_start) {
1861 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1862 U8 *uc = (U8*)s;
1863 U16 charid = 0;
1864 U32 base = 1;
1865 U32 state = 1;
1866 UV uvc = 0;
1867 STRLEN len = 0;
1868 STRLEN foldlen = 0;
1869 U8 *uscan = (U8*)NULL;
1870 U8 *leftmost = NULL;
786e8c11
YO
1871#ifdef DEBUGGING
1872 U32 accepted_word= 0;
1873#endif
07be1b83
YO
1874 U32 pointpos = 0;
1875
1876 while ( state && uc <= (U8*)strend ) {
1877 int failed=0;
786e8c11
YO
1878 U32 word = aho->states[ state ].wordnum;
1879
1de06328
YO
1880 if( state==1 ) {
1881 if ( bitmap ) {
1882 DEBUG_TRIE_EXECUTE_r(
1883 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1884 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1885 (char *)uc, utf8_target );
1de06328
YO
1886 PerlIO_printf( Perl_debug_log,
1887 " Scanning for legal start char...\n");
1888 }
d085b490
YO
1889 );
1890 if (utf8_target) {
1891 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1892 uc += UTF8SKIP(uc);
1893 }
1894 } else {
1895 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1896 uc++;
1897 }
1898 }
1de06328 1899 s= (char *)uc;
786e8c11 1900 }
786e8c11
YO
1901 if (uc >(U8*)last_start) break;
1902 }
1903
1904 if ( word ) {
2e64971a 1905 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1906 if (!leftmost || lpos < leftmost) {
1907 DEBUG_r(accepted_word=word);
07be1b83 1908 leftmost= lpos;
786e8c11 1909 }
07be1b83 1910 if (base==0) break;
786e8c11 1911
07be1b83
YO
1912 }
1913 points[pointpos++ % maxlen]= uc;
55eed653
NC
1914 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1915 uscan, len, uvc, charid, foldlen,
1916 foldbuf, uniflags);
786e8c11
YO
1917 DEBUG_TRIE_EXECUTE_r({
1918 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1919 s, utf8_target );
07be1b83 1920 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1921 " Charid:%3u CP:%4"UVxf" ",
1922 charid, uvc);
1923 });
07be1b83
YO
1924
1925 do {
6148ee25 1926#ifdef DEBUGGING
786e8c11 1927 word = aho->states[ state ].wordnum;
6148ee25 1928#endif
07be1b83
YO
1929 base = aho->states[ state ].trans.base;
1930
786e8c11
YO
1931 DEBUG_TRIE_EXECUTE_r({
1932 if (failed)
1933 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1934 s, utf8_target );
07be1b83 1935 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1936 "%sState: %4"UVxf", word=%"UVxf,
1937 failed ? " Fail transition to " : "",
1938 (UV)state, (UV)word);
1939 });
07be1b83
YO
1940 if ( base ) {
1941 U32 tmp;
6dd2be57 1942 I32 offset;
07be1b83 1943 if (charid &&
6dd2be57
DM
1944 ( ((offset = base + charid
1945 - 1 - trie->uniquecharcount)) >= 0)
1946 && ((U32)offset < trie->lasttrans)
1947 && trie->trans[offset].check == state
1948 && (tmp=trie->trans[offset].next))
07be1b83 1949 {
786e8c11
YO
1950 DEBUG_TRIE_EXECUTE_r(
1951 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1952 state = tmp;
1953 break;
1954 }
1955 else {
786e8c11
YO
1956 DEBUG_TRIE_EXECUTE_r(
1957 PerlIO_printf( Perl_debug_log," - fail\n"));
1958 failed = 1;
1959 state = aho->fail[state];
07be1b83
YO
1960 }
1961 }
1962 else {
1963 /* we must be accepting here */
786e8c11
YO
1964 DEBUG_TRIE_EXECUTE_r(
1965 PerlIO_printf( Perl_debug_log," - accepting\n"));
1966 failed = 1;
07be1b83
YO
1967 break;
1968 }
1969 } while(state);
786e8c11 1970 uc += len;
07be1b83
YO
1971 if (failed) {
1972 if (leftmost)
1973 break;
786e8c11 1974 if (!state) state = 1;
07be1b83
YO
1975 }
1976 }
1977 if ( aho->states[ state ].wordnum ) {
2e64971a 1978 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1979 if (!leftmost || lpos < leftmost) {
1980 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1981 leftmost = lpos;
786e8c11 1982 }
07be1b83 1983 }
07be1b83
YO
1984 if (leftmost) {
1985 s = (char*)leftmost;
786e8c11
YO
1986 DEBUG_TRIE_EXECUTE_r({
1987 PerlIO_printf(
70685ca0
JH
1988 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1989 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1990 );
1991 });
24b23f37 1992 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1993 FREETMPS;
1994 LEAVE;
07be1b83 1995 goto got_it;
be8e71aa 1996 }
07be1b83 1997 s = HOPc(s,1);
786e8c11
YO
1998 DEBUG_TRIE_EXECUTE_r({
1999 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2000 });
07be1b83 2001 } else {
786e8c11
YO
2002 DEBUG_TRIE_EXECUTE_r(
2003 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2004 break;
2005 }
2006 }
be8e71aa
YO
2007 FREETMPS;
2008 LEAVE;
07be1b83
YO
2009 }
2010 break;
b3c9acc1 2011 default:
3c3eec57
GS
2012 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2013 break;
d6a28714 2014 }
6eb5f6b9
JH
2015 return 0;
2016 got_it:
2017 return s;
2018}
2019
fae667d5 2020
6eb5f6b9
JH
2021/*
2022 - regexec_flags - match a regexp against a string
2023 */
2024I32
288b8c02 2025Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2026 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2027/* strend: pointer to null at end of string */
2028/* strbeg: real beginning of string */
2029/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2030/* data: May be used for some additional optimizations.
2031 Currently its only used, with a U32 cast, for transmitting
2032 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2033/* nosave: For optimizations. */
2034{
97aff369 2035 dVAR;
288b8c02 2036 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2037 /*register*/ char *s;
6eb5f6b9 2038 register regnode *c;
24b23f37 2039 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2040 I32 minlen; /* must match at least this many chars */
2041 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2042 I32 end_shift = 0; /* Same for the end. */ /* CC */
2043 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2044 char *scream_olds = NULL;
f2ed9b32 2045 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2046 I32 multiline;
f8fc2ecf 2047 RXi_GET_DECL(prog,progi);
3b0527fe 2048 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2049 regexp_paren_pair *swap = NULL;
a3621e74
YO
2050 GET_RE_DEBUG_FLAGS_DECL;
2051
7918f24d 2052 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2053 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2054
2055 /* Be paranoid... */
2056 if (prog == NULL || startpos == NULL) {
2057 Perl_croak(aTHX_ "NULL regexp parameter");
2058 return 0;
2059 }
2060
bbe252da 2061 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2062 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2063
f2ed9b32 2064 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2065 DEBUG_EXECUTE_r(
f2ed9b32 2066 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2067 "Matching");
2068 );
bac06658 2069
6eb5f6b9 2070 minlen = prog->minlen;
1de06328
YO
2071
2072 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2074 "String too short [regexec_flags]...\n"));
2075 goto phooey;
1aa99e6b 2076 }
6eb5f6b9 2077
1de06328 2078
6eb5f6b9 2079 /* Check validity of program. */
f8fc2ecf 2080 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2081 Perl_croak(aTHX_ "corrupted regexp program");
2082 }
2083
2084 PL_reg_flags = 0;
ed301438 2085 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2086 PL_reg_maxiter = 0;
2087
3c8556c3 2088 if (RX_UTF8(rx))
6eb5f6b9
JH
2089 PL_reg_flags |= RF_utf8;
2090
2091 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2092 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2093 PL_bostr = strbeg;
3b0527fe 2094 reginfo.sv = sv;
6eb5f6b9
JH
2095
2096 /* Mark end of line for $ (and such) */
2097 PL_regeol = strend;
2098
2099 /* see how far we have to get to not match where we matched before */
3b0527fe 2100 reginfo.till = startpos+minend;
6eb5f6b9 2101
6eb5f6b9
JH
2102 /* If there is a "must appear" string, look for it. */
2103 s = startpos;
2104
bbe252da 2105 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2106 MAGIC *mg;
2c296965 2107 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2108 reginfo.ganch = startpos + prog->gofs;
2c296965 2109 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2110 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2111 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2112 && SvMAGIC(sv)
14befaf4
DM
2113 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2114 && mg->mg_len >= 0) {
3b0527fe 2115 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2116 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2117 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2118
bbe252da 2119 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2120 if (s > reginfo.ganch)
6eb5f6b9 2121 goto phooey;
58e23c8d 2122 s = reginfo.ganch - prog->gofs;
2c296965 2123 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2124 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2125 if (s < strbeg)
2126 goto phooey;
6eb5f6b9
JH
2127 }
2128 }
58e23c8d 2129 else if (data) {
70685ca0 2130 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2131 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2132 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2133
2134 } else { /* pos() not defined */
3b0527fe 2135 reginfo.ganch = strbeg;
2c296965
YO
2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2137 "GPOS: reginfo.ganch = strbeg\n"));
2138 }
6eb5f6b9 2139 }
288b8c02 2140 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2141 /* We have to be careful. If the previous successful match
2142 was from this regex we don't want a subsequent partially
2143 successful match to clobber the old results.
2144 So when we detect this possibility we add a swap buffer
2145 to the re, and switch the buffer each match. If we fail
2146 we switch it back, otherwise we leave it swapped.
2147 */
2148 swap = prog->offs;
2149 /* do we need a save destructor here for eval dies? */
2150 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2151 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2152 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2153 PTR2UV(prog),
2154 PTR2UV(swap),
2155 PTR2UV(prog->offs)
2156 ));
c74340f9 2157 }
a0714e2c 2158 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2159 re_scream_pos_data d;
2160
2161 d.scream_olds = &scream_olds;
2162 d.scream_pos = &scream_pos;
288b8c02 2163 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2164 if (!s) {
a3621e74 2165 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2166 goto phooey; /* not present */
3fa9c3d7 2167 }
6eb5f6b9
JH
2168 }
2169
1de06328 2170
6eb5f6b9
JH
2171
2172 /* Simplest case: anchored match need be tried only once. */
2173 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2174 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2175 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2176 goto got_it;
bbe252da
YO
2177 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2178 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2179 {
2180 char *end;
2181
2182 if (minlen)
2183 dontbother = minlen - 1;
1aa99e6b 2184 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2185 /* for multiline we only have to try after newlines */
33b8afdf 2186 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2187 /* because of the goto we can not easily reuse the macros for bifurcating the
2188 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2189 if (utf8_target) {
2190 if (s == startpos)
2191 goto after_try_utf8;
2192 while (1) {
2193 if (regtry(&reginfo, &s)) {
2194 goto got_it;
2195 }
2196 after_try_utf8:
2197 if (s > end) {
2198 goto phooey;
2199 }
2200 if (prog->extflags & RXf_USE_INTUIT) {
2201 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2202 if (!s) {
2203 goto phooey;
2204 }
2205 }
2206 else {
2207 s += UTF8SKIP(s);
2208 }
2209 }
2210 } /* end search for check string in unicode */
2211 else {
2212 if (s == startpos) {
2213 goto after_try_latin;
2214 }
2215 while (1) {
2216 if (regtry(&reginfo, &s)) {
2217 goto got_it;
2218 }
2219 after_try_latin:
2220 if (s > end) {
2221 goto phooey;
2222 }
2223 if (prog->extflags & RXf_USE_INTUIT) {
2224 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2225 if (!s) {
2226 goto phooey;
2227 }
2228 }
2229 else {
2230 s++;
2231 }
2232 }
2233 } /* end search for check string in latin*/
2234 } /* end search for check string */
2235 else { /* search for newline */
2236 if (s > startpos) {
2237 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2238 s--;
92f3d482 2239 }
21eede78
YO
2240 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2241 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2242 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2243 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2244 goto got_it;
2245 }
92f3d482
YO
2246 }
2247 } /* end search for newline */
2248 } /* end anchored/multiline check string search */
6eb5f6b9 2249 goto phooey;
bbe252da 2250 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2251 {
486ec47a 2252 /* the warning about reginfo.ganch being used without initialization
bbe252da 2253 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2254 and we only enter this block when the same bit is set. */
58e23c8d 2255 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2256
2257 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2258 goto got_it;
2259 goto phooey;
2260 }
2261
2262 /* Messy cases: unanchored match. */
bbe252da 2263 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2264 /* we have /x+whatever/ */
f2ed9b32 2265 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2266 char ch;
bf93d4cc
GS
2267#ifdef DEBUGGING
2268 int did_match = 0;
2269#endif
f2ed9b32
KW
2270 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2271 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2272 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2273
f2ed9b32 2274 if (utf8_target) {
4cadc6a9 2275 REXEC_FBC_SCAN(
6eb5f6b9 2276 if (*s == ch) {
a3621e74 2277 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2278 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2279 s += UTF8SKIP(s);
2280 while (s < strend && *s == ch)
2281 s += UTF8SKIP(s);
2282 }
4cadc6a9 2283 );
6eb5f6b9
JH
2284 }
2285 else {
4cadc6a9 2286 REXEC_FBC_SCAN(
6eb5f6b9 2287 if (*s == ch) {
a3621e74 2288 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2289 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2290 s++;
2291 while (s < strend && *s == ch)
2292 s++;
2293 }
4cadc6a9 2294 );
6eb5f6b9 2295 }
a3621e74 2296 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2297 PerlIO_printf(Perl_debug_log,
b7953727
JH
2298 "Did not find anchored character...\n")
2299 );
6eb5f6b9 2300 }
a0714e2c
SS
2301 else if (prog->anchored_substr != NULL
2302 || prog->anchored_utf8 != NULL
2303 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2304 && prog->float_max_offset < strend - s)) {
2305 SV *must;
2306 I32 back_max;
2307 I32 back_min;
2308 char *last;
6eb5f6b9 2309 char *last1; /* Last position checked before */
bf93d4cc
GS
2310#ifdef DEBUGGING
2311 int did_match = 0;
2312#endif
33b8afdf 2313 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2314 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2315 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2316 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2317 back_max = back_min = prog->anchored_offset;
2318 } else {
f2ed9b32
KW
2319 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2320 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2321 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2322 back_max = prog->float_max_offset;
2323 back_min = prog->float_min_offset;
2324 }
1de06328
YO
2325
2326
33b8afdf
JH
2327 if (must == &PL_sv_undef)
2328 /* could not downgrade utf8 check substring, so must fail */
2329 goto phooey;
2330
1de06328
YO
2331 if (back_min<0) {
2332 last = strend;
2333 } else {
2334 last = HOP3c(strend, /* Cannot start after this */
2335 -(I32)(CHR_SVLEN(must)
2336 - (SvTAIL(must) != 0) + back_min), strbeg);
2337 }
6eb5f6b9
JH
2338 if (s > PL_bostr)
2339 last1 = HOPc(s, -1);
2340 else
2341 last1 = s - 1; /* bogus */
2342
a0288114 2343 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2344 check_substr==must. */
2345 scream_pos = -1;
2346 dontbother = end_shift;
2347 strend = HOPc(strend, -dontbother);
2348 while ( (s <= last) &&
c33e64f0 2349 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2350 (unsigned char*)strend, must,
c33e64f0 2351 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2352 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2353 if (HOPc(s, -back_max) > last1) {
2354 last1 = HOPc(s, -back_min);
2355 s = HOPc(s, -back_max);
2356 }
2357 else {
52657f30 2358 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2359
2360 last1 = HOPc(s, -back_min);
52657f30 2361 s = t;
6eb5f6b9 2362 }
f2ed9b32 2363 if (utf8_target) {
6eb5f6b9 2364 while (s <= last1) {
24b23f37 2365 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2366 goto got_it;
2367 s += UTF8SKIP(s);
2368 }
2369 }
2370 else {
2371 while (s <= last1) {
24b23f37 2372 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2373 goto got_it;
2374 s++;
2375 }
2376 }
2377 }
ab3bbdeb 2378 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2379 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2380 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2381 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2382 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2383 ? "anchored" : "floating"),
ab3bbdeb
YO
2384 quoted, RE_SV_TAIL(must));
2385 });
6eb5f6b9
JH
2386 goto phooey;
2387 }
f8fc2ecf 2388 else if ( (c = progi->regstclass) ) {
f14c76ed 2389 if (minlen) {
f8fc2ecf 2390 const OPCODE op = OP(progi->regstclass);
66e933ab 2391 /* don't bother with what can't match */
786e8c11 2392 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2393 strend = HOPc(strend, -(minlen - 1));
2394 }
a3621e74 2395 DEBUG_EXECUTE_r({
be8e71aa 2396 SV * const prop = sv_newmortal();
32fc9b6a 2397 regprop(prog, prop, c);
0df25f3d 2398 {
f2ed9b32 2399 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2400 s,strend-s,60);
0df25f3d 2401 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2402 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2403 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2404 quoted, (int)(strend - s));
0df25f3d 2405 }
ffc61ed2 2406 });
3b0527fe 2407 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2408 goto got_it;
07be1b83 2409 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2410 }
2411 else {
2412 dontbother = 0;
a0714e2c 2413 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2414 /* Trim the end. */
6af40bd7 2415 char *last= NULL;
33b8afdf 2416 SV* float_real;
c33e64f0
FC
2417 STRLEN len;
2418 const char *little;
33b8afdf 2419
f2ed9b32
KW
2420 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2421 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2422 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2423
c33e64f0
FC
2424 little = SvPV_const(float_real, len);
2425 if (SvTAIL(float_real)) {
1a13b075
YO
2426 /* This means that float_real contains an artificial \n on the end
2427 * due to the presence of something like this: /foo$/
2428 * where we can match both "foo" and "foo\n" at the end of the string.
2429 * So we have to compare the end of the string first against the float_real
2430 * without the \n and then against the full float_real with the string.
2431 * We have to watch out for cases where the string might be smaller
2432 * than the float_real or the float_real without the \n.
2433 */
2434 char *checkpos= strend - len;
2435 DEBUG_OPTIMISE_r(
2436 PerlIO_printf(Perl_debug_log,
2437 "%sChecking for float_real.%s\n",
2438 PL_colors[4], PL_colors[5]));
2439 if (checkpos + 1 < strbeg) {
2440 /* can't match, even if we remove the trailing \n string is too short to match */
2441 DEBUG_EXECUTE_r(
2442 PerlIO_printf(Perl_debug_log,
2443 "%sString shorter than required trailing substring, cannot match.%s\n",
2444 PL_colors[4], PL_colors[5]));
2445 goto phooey;
2446 } else if (memEQ(checkpos + 1, little, len - 1)) {
2447 /* can match, the end of the string matches without the "\n" */
2448 last = checkpos + 1;
2449 } else if (checkpos < strbeg) {
2450 /* cant match, string is too short when the "\n" is included */
2451 DEBUG_EXECUTE_r(
2452 PerlIO_printf(Perl_debug_log,
2453 "%sString does not contain required trailing substring, cannot match.%s\n",
2454 PL_colors[4], PL_colors[5]));
2455 goto phooey;
2456 } else if (!multiline) {
2457 /* non multiline match, so compare with the "\n" at the end of the string */
2458 if (memEQ(checkpos, little, len)) {
2459 last= checkpos;
2460 } else {
2461 DEBUG_EXECUTE_r(
2462 PerlIO_printf(Perl_debug_log,
2463 "%sString does not contain required trailing substring, cannot match.%s\n",
2464 PL_colors[4], PL_colors[5]));
2465 goto phooey;
2466 }
2467 } else {
2468 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2469 goto find_last;
1a13b075 2470 }
c33e64f0 2471 } else {
d6a28714 2472 find_last:
9041c2e3 2473 if (len)
d6a28714 2474 last = rninstr(s, strend, little, little + len);
b8c5462f 2475 else
a0288114 2476 last = strend; /* matching "$" */
b8c5462f 2477 }
6af40bd7
YO
2478 if (!last) {
2479 /* at one point this block contained a comment which was probably
2480 * incorrect, which said that this was a "should not happen" case.
2481 * Even if it was true when it was written I am pretty sure it is
2482 * not anymore, so I have removed the comment and replaced it with
2483 * this one. Yves */
6bda09f9
YO
2484 DEBUG_EXECUTE_r(
2485 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2486 "String does not contain required substring, cannot match.\n"
2487 ));
2488 goto phooey;
bf93d4cc 2489 }
d6a28714
JH
2490 dontbother = strend - last + prog->float_min_offset;
2491 }
2492 if (minlen && (dontbother < minlen))
2493 dontbother = minlen - 1;
2494 strend -= dontbother; /* this one's always in bytes! */
2495 /* We don't know much -- general case. */
f2ed9b32 2496 if (utf8_target) {
d6a28714 2497 for (;;) {
24b23f37 2498 if (regtry(&reginfo, &s))
d6a28714
JH
2499 goto got_it;
2500 if (s >= strend)
2501 break;
b8c5462f 2502 s += UTF8SKIP(s);
d6a28714
JH
2503 };
2504 }
2505 else {
2506 do {
24b23f37 2507 if (regtry(&reginfo, &s))
d6a28714
JH
2508 goto got_it;
2509 } while (s++ < strend);
2510 }
2511 }
2512
2513 /* Failure. */
2514 goto phooey;
2515
2516got_it:
495f47a5
DM
2517 DEBUG_BUFFERS_r(
2518 if (swap)
2519 PerlIO_printf(Perl_debug_log,
2520 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2521 PTR2UV(prog),
2522 PTR2UV(swap)
2523 );
2524 );
e9105d30 2525 Safefree(swap);
288b8c02 2526 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2527
ed301438 2528 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2529 restore_pos(aTHX_ prog);
5daac39c
NC
2530 if (RXp_PAREN_NAMES(prog))
2531 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2532
2533 /* make sure $`, $&, $', and $digit will work later */
2534 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2535 RX_MATCH_COPY_FREE(rx);
d6a28714 2536 if (flags & REXEC_COPY_STR) {
be8e71aa 2537 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2538#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2539 if ((SvIsCOW(sv)
2540 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2541 if (DEBUG_C_TEST) {
2542 PerlIO_printf(Perl_debug_log,
2543 "Copy on write: regexp capture, type %d\n",
2544 (int) SvTYPE(sv));
2545 }
2546 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2547 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2548 assert (SvPOKp(prog->saved_copy));
2549 } else
2550#endif
2551 {
288b8c02 2552 RX_MATCH_COPIED_on(rx);
ed252734
NC
2553 s = savepvn(strbeg, i);
2554 prog->subbeg = s;
2555 }
d6a28714 2556 prog->sublen = i;
d6a28714
JH
2557 }
2558 else {
2559 prog->subbeg = strbeg;
2560 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2561 }
2562 }
9041c2e3 2563
d6a28714
JH
2564 return 1;
2565
2566phooey:
a3621e74 2567 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2568 PL_colors[4], PL_colors[5]));
ed301438 2569 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2570 restore_pos(aTHX_ prog);
e9105d30 2571 if (swap) {
c74340f9 2572 /* we failed :-( roll it back */
495f47a5
DM
2573 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2574 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2575 PTR2UV(prog),
2576 PTR2UV(prog->offs),
2577 PTR2UV(swap)
2578 ));
e9105d30
GG
2579 Safefree(prog->offs);
2580 prog->offs = swap;
2581 }
2582
d6a28714
JH
2583 return 0;
2584}
2585
6bda09f9 2586
d6a28714
JH
2587/*
2588 - regtry - try match at specific point
2589 */
2590STATIC I32 /* 0 failure, 1 success */
24b23f37 2591S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2592{
97aff369 2593 dVAR;
d6a28714 2594 CHECKPOINT lastcp;
288b8c02
NC
2595 REGEXP *const rx = reginfo->prog;
2596 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2597 RXi_GET_DECL(prog,progi);
a3621e74 2598 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2599
2600 PERL_ARGS_ASSERT_REGTRY;
2601
24b23f37 2602 reginfo->cutpoint=NULL;
d6a28714 2603
ed301438
DM
2604 if ((prog->extflags & RXf_EVAL_SEEN)
2605 && !PL_reg_state.re_state_eval_setup_done)
2606 {
d6a28714
JH
2607 MAGIC *mg;
2608
ed301438 2609 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2610 if (reginfo->sv) {
d6a28714 2611 /* Make $_ available to executed code. */
3b0527fe 2612 if (reginfo->sv != DEFSV) {
59f00321 2613 SAVE_DEFSV;
414bf5ae 2614 DEFSV_set(reginfo->sv);
b8c5462f 2615 }
d6a28714 2616
3b0527fe
DM
2617 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2618 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2619 /* prepare for quick setting of pos */
d300d9fa 2620#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2621 if (SvIsCOW(reginfo->sv))
2622 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2623#endif
3dab1dad 2624 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2625 &PL_vtbl_mglob, NULL, 0);
d6a28714 2626 mg->mg_len = -1;
b8c5462f 2627 }
d6a28714
JH
2628 PL_reg_magic = mg;
2629 PL_reg_oldpos = mg->mg_len;
4f639d21 2630 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2631 }
09687e5a 2632 if (!PL_reg_curpm) {
a02a5408 2633 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2634#ifdef USE_ITHREADS
2635 {
14a49a24 2636 SV* const repointer = &PL_sv_undef;
92313705
NC
2637 /* this regexp is also owned by the new PL_reg_curpm, which
2638 will try to free it. */
d2ece331 2639 av_push(PL_regex_padav, repointer);
09687e5a
AB
2640 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2641 PL_regex_pad = AvARRAY(PL_regex_padav);
2642 }
2643#endif
2644 }
86c29d75
NC
2645#ifdef USE_ITHREADS
2646 /* It seems that non-ithreads works both with and without this code.
2647 So for efficiency reasons it seems best not to have the code
2648 compiled when it is not needed. */
92313705
NC
2649 /* This is safe against NULLs: */
2650 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2651 /* PM_reg_curpm owns a reference to this regexp. */
cd733337 2652 (void)ReREFCNT_inc(rx);
86c29d75 2653#endif
288b8c02 2654 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2655 PL_reg_oldcurpm = PL_curpm;
2656 PL_curpm = PL_reg_curpm;
07bc277f 2657 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2658 /* Here is a serious problem: we cannot rewrite subbeg,
2659 since it may be needed if this match fails. Thus
2660 $` inside (?{}) could fail... */
2661 PL_reg_oldsaved = prog->subbeg;
2662 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2663#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2664 PL_nrs = prog->saved_copy;
2665#endif
07bc277f 2666 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2667 }
2668 else
bd61b366 2669 PL_reg_oldsaved = NULL;
d6a28714
JH
2670 prog->subbeg = PL_bostr;
2671 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2672 }
97ca13b7
DM
2673#ifdef DEBUGGING
2674 PL_reg_starttry = *startpos;
2675#endif
f0ab9afb 2676 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2677 PL_reginput = *startpos;
d6a28714 2678 prog->lastparen = 0;
03994de8 2679 prog->lastcloseparen = 0;
d6a28714 2680 PL_regsize = 0;
d6a28714
JH
2681
2682 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2683 to do this again and again, prog->lastparen should take care of
3dd2943c 2684 this! --ilya*/
dafc8851
JH
2685
2686 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2687 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2688 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2689 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2690 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2691 * Meanwhile, this code *is* needed for the
daf18116
JH
2692 * above-mentioned test suite tests to succeed. The common theme
2693 * on those tests seems to be returning null fields from matches.
225593e1 2694 * --jhi updated by dapm */
dafc8851 2695#if 1
d6a28714 2696 if (prog->nparens) {
b93070ed 2697 regexp_paren_pair *pp = prog->offs;
097eb12c 2698 register I32 i;
b93070ed 2699 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2700 ++pp;
2701 pp->start = -1;
2702 pp->end = -1;
d6a28714
JH
2703 }
2704 }
dafc8851 2705#endif
02db2b7b 2706 REGCP_SET(lastcp);
f8fc2ecf 2707 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2708 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2709 return 1;
2710 }
24b23f37
YO
2711 if (reginfo->cutpoint)
2712 *startpos= reginfo->cutpoint;
02db2b7b 2713 REGCP_UNWIND(lastcp);
d6a28714
JH
2714 return 0;
2715}
2716
02db2b7b 2717
8ba1375e
MJD
2718#define sayYES goto yes
2719#define sayNO goto no
262b90c4 2720#define sayNO_SILENT goto no_silent
8ba1375e 2721
f9f4320a
YO
2722/* we dont use STMT_START/END here because it leads to
2723 "unreachable code" warnings, which are bogus, but distracting. */
2724#define CACHEsayNO \
c476f425
DM
2725 if (ST.cache_mask) \
2726 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2727 sayNO
3298f257 2728
a3621e74 2729/* this is used to determine how far from the left messages like
265c4333
YO
2730 'failed...' are printed. It should be set such that messages
2731 are inline with the regop output that created them.
a3621e74 2732*/
265c4333 2733#define REPORT_CODE_OFF 32
a3621e74
YO
2734
2735
40a82448
DM
2736#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2737#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2738
86545054
DM
2739#define SLAB_FIRST(s) (&(s)->states[0])
2740#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2741
5d9a96ca
DM
2742/* grab a new slab and return the first slot in it */
2743
2744STATIC regmatch_state *
2745S_push_slab(pTHX)
2746{
a35a87e7 2747#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2748 dMY_CXT;
2749#endif
5d9a96ca
DM
2750 regmatch_slab *s = PL_regmatch_slab->next;
2751 if (!s) {
2752 Newx(s, 1, regmatch_slab);
2753 s->prev = PL_regmatch_slab;
2754 s->next = NULL;
2755 PL_regmatch_slab->next = s;
2756 }
2757 PL_regmatch_slab = s;
86545054 2758 return SLAB_FIRST(s);
5d9a96ca 2759}
5b47454d 2760
95b24440 2761
40a82448
DM
2762/* push a new state then goto it */
2763
2764#define PUSH_STATE_GOTO(state, node) \
2765 scan = node; \
2766 st->resume_state = state; \
2767 goto push_state;
2768
2769/* push a new state with success backtracking, then goto it */
2770
2771#define PUSH_YES_STATE_GOTO(state, node) \
2772 scan = node; \
2773 st->resume_state = state; \
2774 goto push_yes_state;
2775
aa283a38 2776
aa283a38 2777
d6a28714 2778/*
95b24440 2779
bf1f174e
DM
2780regmatch() - main matching routine
2781
2782This is basically one big switch statement in a loop. We execute an op,
2783set 'next' to point the next op, and continue. If we come to a point which
2784we may need to backtrack to on failure such as (A|B|C), we push a
2785backtrack state onto the backtrack stack. On failure, we pop the top
2786state, and re-enter the loop at the state indicated. If there are no more
2787states to pop, we return failure.
2788
2789Sometimes we also need to backtrack on success; for example /A+/, where
2790after successfully matching one A, we need to go back and try to
2791match another one; similarly for lookahead assertions: if the assertion
2792completes successfully, we backtrack to the state just before the assertion
2793and then carry on. In these cases, the pushed state is marked as
2794'backtrack on success too'. This marking is in fact done by a chain of
2795pointers, each pointing to the previous 'yes' state. On success, we pop to
2796the nearest yes state, discarding any intermediate failure-only states.
2797Sometimes a yes state is pushed just to force some cleanup code to be
2798called at the end of a successful match or submatch; e.g. (??{$re}) uses
2799it to free the inner regex.
2800
2801Note that failure backtracking rewinds the cursor position, while
2802success backtracking leaves it alone.
2803
2804A pattern is complete when the END op is executed, while a subpattern
2805such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2806ops trigger the "pop to last yes state if any, otherwise return true"
2807behaviour.
2808
2809A common convention in this function is to use A and B to refer to the two
2810subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2811the subpattern to be matched possibly multiple times, while B is the entire
2812rest of the pattern. Variable and state names reflect this convention.
2813
2814The states in the main switch are the union of ops and failure/success of
2815substates associated with with that op. For example, IFMATCH is the op
2816that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2817'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2818successfully matched A and IFMATCH_A_fail is a state saying that we have
2819just failed to match A. Resume states always come in pairs. The backtrack
2820state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2821at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2822on success or failure.
2823
2824The struct that holds a backtracking state is actually a big union, with
2825one variant for each major type of op. The variable st points to the
2826top-most backtrack struct. To make the code clearer, within each
2827block of code we #define ST to alias the relevant union.
2828
2829Here's a concrete example of a (vastly oversimplified) IFMATCH
2830implementation:
2831
2832 switch (state) {
2833 ....
2834
2835#define ST st->u.ifmatch
2836
2837 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2838 ST.foo = ...; // some state we wish to save
95b24440 2839 ...
bf1f174e
DM
2840 // push a yes backtrack state with a resume value of
2841 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2842 // first node of A:
2843 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2844 // NOTREACHED
2845
2846 case IFMATCH_A: // we have successfully executed A; now continue with B
2847 next = B;
2848 bar = ST.foo; // do something with the preserved value
2849 break;
2850
2851 case IFMATCH_A_fail: // A failed, so the assertion failed
2852 ...; // do some housekeeping, then ...
2853 sayNO; // propagate the failure
2854
2855#undef ST
95b24440 2856
bf1f174e
DM
2857 ...
2858 }
95b24440 2859
bf1f174e
DM
2860For any old-timers reading this who are familiar with the old recursive
2861approach, the code above is equivalent to:
95b24440 2862
bf1f174e
DM
2863 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2864 {
2865 int foo = ...
95b24440 2866 ...
bf1f174e
DM
2867 if (regmatch(A)) {
2868 next = B;
2869 bar = foo;
2870 break;
95b24440 2871 }
bf1f174e
DM
2872 ...; // do some housekeeping, then ...
2873 sayNO; // propagate the failure
95b24440 2874 }
bf1f174e
DM
2875
2876The topmost backtrack state, pointed to by st, is usually free. If you
2877want to claim it, populate any ST.foo fields in it with values you wish to
2878save, then do one of
2879
2880 PUSH_STATE_GOTO(resume_state, node);
2881 PUSH_YES_STATE_GOTO(resume_state, node);
2882
2883which sets that backtrack state's resume value to 'resume_state', pushes a
2884new free entry to the top of the backtrack stack, then goes to 'node'.
2885On backtracking, the free slot is popped, and the saved state becomes the
2886new free state. An ST.foo field in this new top state can be temporarily
2887accessed to retrieve values, but once the main loop is re-entered, it
2888becomes available for reuse.
2889
2890Note that the depth of the backtrack stack constantly increases during the
2891left-to-right execution of the pattern, rather than going up and down with
2892the pattern nesting. For example the stack is at its maximum at Z at the
2893end of the pattern, rather than at X in the following:
2894
2895 /(((X)+)+)+....(Y)+....Z/
2896
2897The only exceptions to this are lookahead/behind assertions and the cut,
2898(?>A), which pop all the backtrack states associated with A before
2899continuing.
2900
486ec47a 2901Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2902PL_regmatch_state and st always point to the currently active state,
2903and PL_regmatch_slab points to the slab currently containing
2904PL_regmatch_state. The first time regmatch() is called, the first slab is
2905allocated, and is never freed until interpreter destruction. When the slab
2906is full, a new one is allocated and chained to the end. At exit from
2907regmatch(), slabs allocated since entry are freed.
2908
2909*/
95b24440 2910
40a82448 2911
5bc10b2c 2912#define DEBUG_STATE_pp(pp) \
265c4333 2913 DEBUG_STATE_r({ \
f2ed9b32 2914 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2915 PerlIO_printf(Perl_debug_log, \
5d458dd8 2916 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2917 depth*2, "", \
13d6edb4 2918 PL_reg_name[st->resume_state], \
5d458dd8
YO
2919 ((st==yes_state||st==mark_state) ? "[" : ""), \
2920 ((st==yes_state) ? "Y" : ""), \
2921 ((st==mark_state) ? "M" : ""), \
2922 ((st==yes_state||st==mark_state) ? "]" : "") \
2923 ); \
265c4333 2924 });
5bc10b2c 2925
40a82448 2926
3dab1dad 2927#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2928
3df15adc 2929#ifdef DEBUGGING
5bc10b2c 2930
ab3bbdeb 2931STATIC void
f2ed9b32 2932S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2933 const char *start, const char *end, const char *blurb)
2934{
efd26800 2935 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2936
2937 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2938
ab3bbdeb
YO
2939 if (!PL_colorset)
2940 reginitcolors();
2941 {
2942 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2943 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2944
f2ed9b32 2945 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2946 start, end - start, 60);
2947
2948 PerlIO_printf(Perl_debug_log,
2949 "%s%s REx%s %s against %s\n",
2950 PL_colors[4], blurb, PL_colors[5], s0, s1);
2951
f2ed9b32 2952 if (utf8_target||utf8_pat)
1de06328
YO
2953 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2954 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2955 utf8_pat && utf8_target ? " and " : "",
2956 utf8_target ? "string" : ""
ab3bbdeb
YO
2957 );
2958 }
2959}
3df15adc
YO
2960
2961STATIC void
786e8c11
YO
2962S_dump_exec_pos(pTHX_ const char *locinput,
2963 const regnode *scan,
2964 const char *loc_regeol,
2965 const char *loc_bostr,
2966 const char *loc_reg_starttry,
f2ed9b32 2967 const bool utf8_target)
07be1b83 2968{
786e8c11 2969 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2970 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2971 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2972 /* The part of the string before starttry has one color
2973 (pref0_len chars), between starttry and current
2974 position another one (pref_len - pref0_len chars),
2975 after the current position the third one.
2976 We assume that pref0_len <= pref_len, otherwise we
2977 decrease pref0_len. */
786e8c11
YO
2978 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2979 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2980 int pref0_len;
2981
7918f24d
NC
2982 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2983
f2ed9b32 2984 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2985 pref_len++;
786e8c11
YO
2986 pref0_len = pref_len - (locinput - loc_reg_starttry);
2987 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2988 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2989 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2990 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2991 l--;
2992 if (pref0_len < 0)
2993 pref0_len = 0;
2994 if (pref0_len > pref_len)
2995 pref0_len = pref_len;
2996 {
f2ed9b32 2997 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2998
ab3bbdeb 2999 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3000 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3001
ab3bbdeb 3002 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3003 (locinput - pref_len + pref0_len),
1de06328 3004 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3005
ab3bbdeb 3006 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3007 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3008
1de06328 3009 const STRLEN tlen=len0+len1+len2;
3df15adc 3010 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3011 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3012 (IV)(locinput - loc_bostr),
07be1b83 3013 len0, s0,
07be1b83 3014 len1, s1,
07be1b83 3015 (docolor ? "" : "> <"),
07be1b83 3016 len2, s2,
f9f4320a 3017 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3018 "");
3019 }
3020}
3df15adc 3021
07be1b83
YO
3022#endif
3023
0a4db386
YO
3024/* reg_check_named_buff_matched()
3025 * Checks to see if a named buffer has matched. The data array of
3026 * buffer numbers corresponding to the buffer is expected to reside
3027 * in the regexp->data->data array in the slot stored in the ARG() of
3028 * node involved. Note that this routine doesn't actually care about the
3029 * name, that information is not preserved from compilation to execution.
3030 * Returns the index of the leftmost defined buffer with the given name
3031 * or 0 if non of the buffers matched.
3032 */
3033STATIC I32
7918f24d
NC
3034S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3035{
0a4db386 3036 I32 n;
f8fc2ecf 3037 RXi_GET_DECL(rex,rexi);
ad64d0ec 3038 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3039 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3040
3041 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3042
0a4db386 3043 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3044 if ((I32)rex->lastparen >= nums[n] &&
3045 rex->offs[nums[n]].end != -1)
0a4db386
YO
3046 {
3047 return nums[n];
3048 }
3049 }
3050 return 0;
3051}
3052
2f554ef7
DM
3053
3054/* free all slabs above current one - called during LEAVE_SCOPE */
3055
3056STATIC void
3057S_clear_backtrack_stack(pTHX_ void *p)
3058{
3059 regmatch_slab *s = PL_regmatch_slab->next;
3060 PERL_UNUSED_ARG(p);
3061
3062 if (!s)
3063 return;
3064 PL_regmatch_slab->next = NULL;
3065 while (s) {
3066 regmatch_slab * const osl = s;
3067 s = s->next;
3068 Safefree(osl);
3069 }
3070}
3071
3072
28d8d7f4 3073#define SETREX(Re1,Re2) \
ed301438
DM
3074 if (PL_reg_state.re_state_eval_setup_done) \
3075 PM_SETRE((PL_reg_curpm), (Re2)); \
28d8d7f4
YO
3076 Re1 = (Re2)
3077
d6a28714 3078STATIC I32 /* 0 failure, 1 success */
24b23f37 3079S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3080{
a35a87e7 3081#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3082 dMY_CXT;
3083#endif
27da23d5 3084 dVAR;
f2ed9b32 3085 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3086 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3087 REGEXP *rex_sv = reginfo->prog;
3088 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3089 RXi_GET_DECL(rex,rexi);
2f554ef7 3090 I32 oldsave;
5d9a96ca
DM
3091 /* the current state. This is a cached copy of PL_regmatch_state */
3092 register regmatch_state *st;
5d9a96ca
DM
3093 /* cache heavy used fields of st in registers */
3094 register regnode *scan;
3095 register regnode *next;
438e9bae 3096 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3097 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3098 register char *locinput = PL_reginput;
5d9a96ca 3099 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3100
b69b0499 3101 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3102 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3103 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3104 const U32 max_nochange_depth =
3105 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3106 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3107 regmatch_state *yes_state = NULL; /* state to pop to on success of
3108 subpattern */
e2e6a0f1
YO
3109 /* mark_state piggy backs on the yes_state logic so that when we unwind
3110 the stack on success we can update the mark_state as we go */
3111 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3112 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3113 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3114 U32 state_num;
5d458dd8
YO
3115 bool no_final = 0; /* prevent failure from backtracking? */
3116 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3117 char *startpoint = PL_reginput;
5d458dd8
YO
3118 SV *popmark = NULL; /* are we looking for a mark? */
3119 SV *sv_commit = NULL; /* last mark name seen in failure */
3120 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3121 during a successful match */
5d458dd8
YO
3122 U32 lastopen = 0; /* last open we saw */
3123 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3124 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3125 /* these three flags are set by various ops to signal information to
3126 * the very next op. They have a useful lifetime of exactly one loop
3127 * iteration, and are not preserved or restored by state pushes/pops
3128 */
3129 bool sw = 0; /* the condition value in (?(cond)a|b) */
3130 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3131 int logical = 0; /* the following EVAL is:
3132 0: (?{...})
3133 1: (?(?{...})X|Y)
3134 2: (??{...})
3135 or the following IFMATCH/UNLESSM is:
3136 false: plain (?=foo)
3137 true: used as a condition: (?(?=foo))
3138 */
81ed78b2
DM
3139 PAD* last_pad = NULL;
3140 dMULTICALL;
3141 I32 gimme = G_SCALAR;
3142 CV *caller_cv = NULL; /* who called us */
3143 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3144
95b24440 3145#ifdef DEBUGGING
e68ec53f 3146 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3147#endif
3148
81ed78b2
DM
3149 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3150 multicall_oldcatch = 0;
3151 multicall_cv = NULL;
3152 cx = NULL;
3153
3154
7918f24d
NC
3155 PERL_ARGS_ASSERT_REGMATCH;
3156
3b57cd43 3157 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3158 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3159 }));
5d9a96ca
DM
3160 /* on first ever call to regmatch, allocate first slab */
3161 if (!PL_regmatch_slab) {
3162 Newx(PL_regmatch_slab, 1, regmatch_slab);
3163 PL_regmatch_slab->prev = NULL;
3164 PL_regmatch_slab->next = NULL;
86545054 3165 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3166 }
3167
2f554ef7
DM
3168 oldsave = PL_savestack_ix;
3169 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3170 SAVEVPTR(PL_regmatch_slab);
3171 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3172
3173 /* grab next free state slot */
3174 st = ++PL_regmatch_state;
86545054 3175 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3176 st = PL_regmatch_state = S_push_slab(aTHX);
3177
d6a28714
JH
3178 /* Note that nextchr is a byte even in UTF */
3179 nextchr = UCHARAT(locinput);
3180 scan = prog;
3181 while (scan != NULL) {
8ba1375e 3182
a3621e74 3183 DEBUG_EXECUTE_r( {
6136c704 3184 SV * const prop = sv_newmortal();
1de06328 3185 regnode *rnext=regnext(scan);
f2ed9b32 3186 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3187 regprop(rex, prop, scan);
07be1b83
YO
3188
3189 PerlIO_printf(Perl_debug_log,
3190 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3191 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3192 SvPVX_const(prop),
1de06328 3193 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3194 0 : (IV)(rnext - rexi->program));
2a782b5b 3195 });
d6a28714
JH
3196
3197 next = scan + NEXT_OFF(scan);
3198 if (next == scan)
3199 next = NULL;
40a82448 3200 state_num = OP(scan);
d6a28714 3201
40a82448 3202 reenter_switch:
34a81e2b 3203
40a82448 3204 switch (state_num) {
d6a28714 3205 case BOL:
7fba1cd6 3206 if (locinput == PL_bostr)
d6a28714 3207 {
3b0527fe 3208 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3209 break;
3210 }
d6a28714
JH
3211 sayNO;
3212 case MBOL:
12d33761
HS
3213 if (locinput == PL_bostr ||
3214 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3215 {
b8c5462f
JH
3216 break;
3217 }
d6a28714
JH
3218 sayNO;
3219 case SBOL:
c2a73568 3220 if (locinput == PL_bostr)
b8c5462f 3221 break;
d6a28714
JH
3222 sayNO;
3223 case GPOS:
3b0527fe 3224 if (locinput == reginfo->ganch)
d6a28714
JH
3225 break;
3226 sayNO;
ee9b8eae
YO
3227
3228 case KEEPS:
3229 /* update the startpoint */
b93070ed 3230 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3231 PL_reginput = locinput;
b93070ed 3232 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3233 PUSH_STATE_GOTO(KEEPS_next, next);
3234 /*NOT-REACHED*/
3235 case KEEPS_next_fail:
3236 /* rollback the start point change */
b93070ed 3237 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3238 sayNO_SILENT;
3239 /*NOT-REACHED*/
d6a28714 3240 case EOL:
d6a28714
JH
3241 goto seol;
3242 case MEOL:
d6a28714 3243 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3244 sayNO;
b8c5462f 3245 break;
d6a28714
JH
3246 case SEOL:
3247 seol:
3248 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3249 sayNO;
d6a28714 3250 if (PL_regeol - locinput > 1)
b8c5462f 3251 sayNO;
b8c5462f 3252 break;
d6a28714
JH
3253 case EOS:
3254 if (PL_regeol != locinput)
b8c5462f 3255 sayNO;
d6a28714 3256 break;
ffc61ed2 3257 case SANY:
d6a28714 3258 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3259 sayNO;
f2ed9b32 3260 if (utf8_target) {
f33976b4
DB
3261 locinput += PL_utf8skip[nextchr];
3262 if (locinput > PL_regeol)
3263 sayNO;
3264 nextchr = UCHARAT(locinput);
3265 }
3266 else
3267 nextchr = UCHARAT(++locinput);
3268 break;
3269 case CANY:
3270 if (!nextchr && locinput >= PL_regeol)
3271 sayNO;
b8c5462f 3272 nextchr = UCHARAT(++locinput);
a0d0e21e 3273 break;
ffc61ed2 3274 case REG_ANY:
1aa99e6b
IH
3275 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3276 sayNO;
f2ed9b32 3277 if (utf8_target) {
b8c5462f 3278 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3279 if (locinput > PL_regeol)
3280 sayNO;
a0ed51b3 3281 nextchr = UCHARAT(locinput);
a0ed51b3 3282 }
1aa99e6b
IH
3283 else
3284 nextchr = UCHARAT(++locinput);
a0ed51b3 3285 break;
166ba7cd
DM
3286
3287#undef ST
3288#define ST st->u.trie
786e8c11
YO
3289 case TRIEC:
3290 /* In this case the charclass data is available inline so
3291 we can fail fast without a lot of extra overhead.
3292 */
fab2782b
YO
3293 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3294 DEBUG_EXECUTE_r(
3295 PerlIO_printf(Perl_debug_log,
3296 "%*s %sfailed to match trie start class...%s\n",
3297 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3298 );
3299 sayNO_SILENT;
3300 /* NOTREACHED */
786e8c11
YO
3301 }
3302 /* FALL THROUGH */
5b47454d 3303 case TRIE:
2e64971a
DM
3304 /* the basic plan of execution of the trie is:
3305 * At the beginning, run though all the states, and
3306 * find the longest-matching word. Also remember the position
3307 * of the shortest matching word. For example, this pattern:
3308 * 1 2 3 4 5
3309 * ab|a|x|abcd|abc
3310 * when matched against the string "abcde", will generate
3311 * accept states for all words except 3, with the longest
3312 * matching word being 4, and the shortest being 1 (with
3313 * the position being after char 1 of the string).
3314 *
3315 * Then for each matching word, in word order (i.e. 1,2,4,5),
3316 * we run the remainder of the pattern; on each try setting
3317 * the current position to the character following the word,
3318 * returning to try the next word on failure.
3319 *
3320 * We avoid having to build a list of words at runtime by
3321 * using a compile-time structure, wordinfo[].prev, which
3322 * gives, for each word, the previous accepting word (if any).
3323 * In the case above it would contain the mappings 1->2, 2->0,
3324 * 3->0, 4->5, 5->1. We can use this table to generate, from
3325 * the longest word (4 above), a list of all words, by
3326 * following the list of prev pointers; this gives us the
3327 * unordered list 4,5,1,2. Then given the current word we have
3328 * just tried, we can go through the list and find the
3329 * next-biggest word to try (so if we just failed on word 2,
3330 * the next in the list is 4).
3331 *
3332 * Since at runtime we don't record the matching position in
3333 * the string for each word, we have to work that out for
3334 * each word we're about to process. The wordinfo table holds
3335 * the character length of each word; given that we recorded
3336 * at the start: the position of the shortest word and its
3337 * length in chars, we just need to move the pointer the
3338 * difference between the two char lengths. Depending on
3339 * Unicode status and folding, that's cheap or expensive.
3340 *
3341 * This algorithm is optimised for the case where are only a
3342 * small number of accept states, i.e. 0,1, or maybe 2.
3343 * With lots of accepts states, and having to try all of them,
3344 * it becomes quadratic on number of accept states to find all
3345 * the next words.
3346 */
3347
3dab1dad 3348 {
07be1b83 3349 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3350 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3351
3352 /* what trie are we using right now */
be8e71aa 3353 reg_trie_data * const trie
f8fc2ecf 3354 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3355 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3356 U32 state = trie->startstate;
166ba7cd 3357
fab2782b 3358 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3359 if (trie->states[ state ].wordnum) {
3360 DEBUG_EXECUTE_r(
3361 PerlIO_printf(Perl_debug_log,
3362 "%*s %smatched empty string...%s\n",
5bc10b2c 3363 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3364 );
20dbff7c
YO
3365 if (!trie->jump)
3366 break;
3dab1dad
YO
3367 } else {
3368 DEBUG_EXECUTE_r(
3369 PerlIO_printf(Perl_debug_log,
786e8c11 3370 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3371 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3372 );
3373 sayNO_SILENT;
3374 }
3375 }
166ba7cd 3376
786e8c11
YO
3377 {
3378 U8 *uc = ( U8* )locinput;
3379
3380 STRLEN len = 0;
3381 STRLEN foldlen = 0;
3382 U8 *uscan = (U8*)NULL;
786e8c11 3383 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3384 U32 charcount = 0; /* how many input chars we have matched */
3385 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3386
786e8c11
YO
3387 ST.B = next;
3388 ST.jump = trie->jump;
786e8c11 3389 ST.me = scan;
2e64971a
DM
3390 ST.firstpos = NULL;
3391 ST.longfold = FALSE; /* char longer if folded => it's harder */
3392 ST.nextword = 0;
3393
3394 /* fully traverse the TRIE; note the position of the
3395 shortest accept state and the wordnum of the longest
3396 accept state */
07be1b83 3397
a3621e74 3398 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3399 U32 base = trie->states[ state ].trans.base;
f9f4320a 3400 UV uvc = 0;
acb909b4 3401 U16 charid = 0;
2e64971a
DM
3402 U16 wordnum;
3403 wordnum = trie->states[ state ].wordnum;
3404
3405 if (wordnum) { /* it's an accept state */
3406 if (!accepted) {
3407 accepted = 1;
3408 /* record first match position */
3409 if (ST.longfold) {
3410 ST.firstpos = (U8*)locinput;
3411 ST.firstchars = 0;
5b47454d 3412 }
2e64971a
DM
3413 else {
3414 ST.firstpos = uc;
3415 ST.firstchars = charcount;
3416 }
3417 }
3418 if (!ST.nextword || wordnum < ST.nextword)
3419 ST.nextword = wordnum;
3420 ST.topword = wordnum;
786e8c11 3421 }
a3621e74