This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add utility and .h for character's UTF-8
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
61dad979 84#include "utf8_strings.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
147 LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8); \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
149 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
151 LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \
152 LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \
153 LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \
154 LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \
155 LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
20d0b1e9 156
1dcf4a1b 157#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 158
ee9a90b8
KW
159/* The actual code for CCC_TRY, which uses several variables from the routine
160 * it's callable from. It is designed to be the bulk of a case statement.
161 * FUNC is the macro or function to call on non-utf8 targets that indicate if
162 * nextchr matches the class.
163 * UTF8_TEST is the whole test string to use for utf8 targets
164 * LOAD is what to use to test, and if not present to load in the swash for the
165 * class
166 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
167 * UTF8_TEST test.
168 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
169 * utf8 and a variant, load the swash if necessary and test using the utf8
170 * test. Advance to the next character if test is ok, otherwise fail; If not
171 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
172 * fails, or advance to the next character */
173
174#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
175 if (locinput >= PL_regeol) { \
176 sayNO; \
177 } \
178 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
179 LOAD_UTF8_CHARCLASS(CLASS, STR); \
180 if (POS_OR_NEG (UTF8_TEST)) { \
181 sayNO; \
182 } \
183 locinput += PL_utf8skip[nextchr]; \
184 nextchr = UCHARAT(locinput); \
185 break; \
186 } \
187 if (POS_OR_NEG (FUNC(nextchr))) { \
188 sayNO; \
189 } \
190 nextchr = UCHARAT(++locinput); \
980866de
KW
191 break;
192
ee9a90b8
KW
193/* Handle the non-locale cases for a character class and its complement. It
194 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
195 * This is because that code fails when the test succeeds, so we want to have
196 * the test fail so that the code succeeds. The swash is stored in a
197 * predictable PL_ place */
cfaf538b
KW
198#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
199 CLASS, STR) \
ee9a90b8
KW
200 case NAME: \
201 _CCC_TRY_CODE( !, FUNC, \
202 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
203 (U8*)locinput, TRUE)), \
204 CLASS, STR) \
205 case NNAME: \
1dcf4a1b 206 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
207 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
208 (U8*)locinput, TRUE)), \
209 CLASS, STR) \
210
211/* Generate the case statements for both locale and non-locale character
212 * classes in regmatch for classes that don't have special unicode semantics.
213 * Locales don't use an immediate swash, but an intermediary special locale
214 * function that is called on the pointer to the current place in the input
215 * string. That function will resolve to needing the same swash. One might
216 * think that because we don't know what the locale will match, we shouldn't
217 * check with the swash loading function that it loaded properly; ie, that we
218 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
219 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
220 * irrelevant here */
221#define CCC_TRY(NAME, NNAME, FUNC, \
222 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 223 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
224 CLASS, STR) \
225 case NAMEL: \
226 PL_reg_flags |= RF_tainted; \
227 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
228 case NNAMEL: \
229 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
230 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
231 CLASS, STR) \
cfaf538b
KW
232 case NAMEA: \
233 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
237 nextchr = UCHARAT(++locinput); \
238 break; \
239 case NNAMEA: \
240 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 if (utf8_target) { \
244 locinput += PL_utf8skip[nextchr]; \
245 nextchr = UCHARAT(locinput); \
246 } \
247 else { \
248 nextchr = UCHARAT(++locinput); \
249 } \
250 break; \
ee9a90b8
KW
251 /* Generate the non-locale cases */ \
252 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
253
254/* This is like CCC_TRY, but has an extra set of parameters for generating case
255 * statements to handle separate Unicode semantics nodes */
256#define CCC_TRY_U(NAME, NNAME, FUNC, \
257 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
258 NAMEU, NNAMEU, FUNCU, \
cfaf538b 259 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 260 CLASS, STR) \
cfaf538b
KW
261 CCC_TRY(NAME, NNAME, FUNC, \
262 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
263 NAMEA, NNAMEA, FUNCA, \
264 CLASS, STR) \
ee9a90b8 265 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 296#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 302#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
303#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
305
306#endif
e2d8ce26 307
a84d97b6
HS
308/*
309 Search for mandatory following text node; for lookahead, the text must
310 follow but for lookbehind (rn->flags != 0) we skip to the next step.
311*/
cca55fe3 312#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
313 while (JUMPABLE(rn)) { \
314 const OPCODE type = OP(rn); \
315 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 316 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 317 else if (type == PLUS) \
cca55fe3 318 rn = NEXTOPER(rn); \
3dab1dad 319 else if (type == IFMATCH) \
a84d97b6 320 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 321 else rn += NEXT_OFF(rn); \
3dab1dad 322 } \
5f80c4cf 323} STMT_END
74750237 324
c476f425 325
acfe0abc 326static void restore_pos(pTHX_ void *arg);
51371543 327
87c0511b 328#define REGCP_PAREN_ELEMS 3
f067efbf 329#define REGCP_OTHER_ELEMS 3
e0fa7e2b 330#define REGCP_FRAME_ELEMS 1
620d5b66
NC
331/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
332 * are needed for the regexp context stack bookkeeping. */
333
76e3520e 334STATIC CHECKPOINT
b93070ed 335S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 336{
97aff369 337 dVAR;
a3b680e6 338 const int retval = PL_savestack_ix;
a3b680e6 339 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
340 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
341 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 342 I32 p;
40a82448 343 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 344
b93070ed
DM
345 PERL_ARGS_ASSERT_REGCPPUSH;
346
e49a9654 347 if (paren_elems_to_push < 0)
5637ef5b
NC
348 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
349 paren_elems_to_push);
e49a9654 350
e0fa7e2b
NC
351 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
352 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
353 " out of range (%lu-%ld)",
354 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 355
620d5b66 356 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 357
495f47a5
DM
358 DEBUG_BUFFERS_r(
359 if ((int)PL_regsize > (int)parenfloor)
360 PerlIO_printf(Perl_debug_log,
361 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
362 PTR2UV(rex),
363 PTR2UV(rex->offs)
364 );
365 );
87c0511b 366 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
368 SSPUSHINT(rex->offs[p].end);
369 SSPUSHINT(rex->offs[p].start);
1ca2007e 370 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 371 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
372 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
373 (UV)p,
374 (IV)rex->offs[p].start,
375 (IV)rex->offs[p].start_tmp,
376 (IV)rex->offs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 380 SSPUSHINT(PL_regsize);
b93070ed
DM
381 SSPUSHINT(rex->lastparen);
382 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 383 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 384
a0d0e21e
LW
385 return retval;
386}
387
c277df42 388/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
389#define REGCP_SET(cp) \
390 DEBUG_STATE_r( \
ab3bbdeb 391 PerlIO_printf(Perl_debug_log, \
e4f74956 392 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
393 (IV)PL_savestack_ix)); \
394 cp = PL_savestack_ix
c3464db5 395
ab3bbdeb 396#define REGCP_UNWIND(cp) \
e4f74956 397 DEBUG_STATE_r( \
ab3bbdeb 398 if (cp != PL_savestack_ix) \
e4f74956
YO
399 PerlIO_printf(Perl_debug_log, \
400 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
401 (IV)(cp), (IV)PL_savestack_ix)); \
402 regcpblow(cp)
c277df42 403
a8d1f4b4
DM
404#define UNWIND_PAREN(lp, lcp) \
405 for (n = rex->lastparen; n > lp; n--) \
406 rex->offs[n].end = -1; \
407 rex->lastparen = n; \
408 rex->lastcloseparen = lcp;
409
410
f067efbf 411STATIC void
b93070ed 412S_regcppop(pTHX_ regexp *rex)
a0d0e21e 413{
97aff369 414 dVAR;
e0fa7e2b 415 UV i;
87c0511b 416 U32 paren;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
87c0511b 439 paren = PL_regsize;
620d5b66 440 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 441 I32 tmps;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
87c0511b 455 paren--;
a0d0e21e 456 }
daf18116 457#if 1
dafc8851
JH
458 /* It would seem that the similar code in regtry()
459 * already takes care of this, and in fact it is in
460 * a better location to since this code can #if 0-ed out
461 * but the code in regtry() is needed or otherwise tests
462 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
463 * (as of patchlevel 7877) will fail. Then again,
464 * this code seems to be necessary or otherwise
225593e1
DM
465 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
466 * --jhi updated by dapm */
b93070ed 467 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 468 if (i > PL_regsize)
b93070ed
DM
469 rex->offs[i].start = -1;
470 rex->offs[i].end = -1;
495f47a5
DM
471 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
472 " \\%"UVuf": %s ..-1 undeffing\n",
473 (UV)i,
474 (i > PL_regsize) ? "-1" : " "
475 ));
a0d0e21e 476 }
dafc8851 477#endif
a0d0e21e
LW
478}
479
74088413
DM
480/* restore the parens and associated vars at savestack position ix,
481 * but without popping the stack */
482
483STATIC void
484S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485{
486 I32 tmpix = PL_savestack_ix;
487 PL_savestack_ix = ix;
488 regcppop(rex);
489 PL_savestack_ix = tmpix;
490}
491
02db2b7b 492#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 493
a687059c 494/*
e50aee73 495 * pregexec and friends
a687059c
LW
496 */
497
76234dfb 498#ifndef PERL_IN_XSUB_RE
a687059c 499/*
c277df42 500 - pregexec - match a regexp against a string
a687059c 501 */
c277df42 502I32
49d7dfbc 503Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 504 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
505/* strend: pointer to null at end of string */
506/* strbeg: real beginning of string */
507/* minend: end of match must be >=minend after stringarg. */
508/* nosave: For optimizations. */
509{
7918f24d
NC
510 PERL_ARGS_ASSERT_PREGEXEC;
511
c277df42 512 return
9041c2e3 513 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
514 nosave ? 0 : REXEC_COPY_STR);
515}
76234dfb 516#endif
22e551b9 517
9041c2e3 518/*
cad2e5aa
JH
519 * Need to implement the following flags for reg_anch:
520 *
521 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
522 * USE_INTUIT_ML
523 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
524 * INTUIT_AUTORITATIVE_ML
525 * INTUIT_ONCE_NOML - Intuit can match in one location only.
526 * INTUIT_ONCE_ML
527 *
528 * Another flag for this function: SECOND_TIME (so that float substrs
529 * with giant delta may be not rechecked).
530 */
531
532/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
533
3f7c398e 534/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
535 Otherwise, only SvCUR(sv) is used to get strbeg. */
536
537/* XXXX We assume that strpos is strbeg unless sv. */
538
6eb5f6b9
JH
539/* XXXX Some places assume that there is a fixed substring.
540 An update may be needed if optimizer marks as "INTUITable"
541 RExen without fixed substrings. Similarly, it is assumed that
542 lengths of all the strings are no more than minlen, thus they
543 cannot come from lookahead.
40d049e4
YO
544 (Or minlen should take into account lookahead.)
545 NOTE: Some of this comment is not correct. minlen does now take account
546 of lookahead/behind. Further research is required. -- demerphq
547
548*/
6eb5f6b9 549
2c2d71f5
JH
550/* A failure to find a constant substring means that there is no need to make
551 an expensive call to REx engine, thus we celebrate a failure. Similarly,
552 finding a substring too deep into the string means that less calls to
30944b6d
IZ
553 regtry() should be needed.
554
555 REx compiler's optimizer found 4 possible hints:
556 a) Anchored substring;
557 b) Fixed substring;
558 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 559 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 560 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
561 string which does not contradict any of them.
562 */
2c2d71f5 563
6eb5f6b9
JH
564/* Most of decisions we do here should have been done at compile time.
565 The nodes of the REx which we used for the search should have been
566 deleted from the finite automaton. */
567
cad2e5aa 568char *
288b8c02 569Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 570 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 571{
97aff369 572 dVAR;
288b8c02 573 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 574 I32 start_shift = 0;
cad2e5aa 575 /* Should be nonnegative! */
eb578fdb
KW
576 I32 end_shift = 0;
577 char *s;
578 SV *check;
a1933d95 579 char *strbeg;
cad2e5aa 580 char *t;
f2ed9b32 581 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 582 I32 ml_anch;
eb578fdb 583 char *other_last = NULL; /* other substr checked before this */
bd61b366 584 char *check_at = NULL; /* check substr found at this pos */
d8080198 585 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 586 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 587 RXi_GET_DECL(prog,progi);
30944b6d 588#ifdef DEBUGGING
890ce7af 589 const char * const i_strpos = strpos;
30944b6d 590#endif
a3621e74
YO
591 GET_RE_DEBUG_FLAGS_DECL;
592
7918f24d 593 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
594 PERL_UNUSED_ARG(flags);
595 PERL_UNUSED_ARG(data);
7918f24d 596
f2ed9b32 597 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 598
3c8556c3 599 if (RX_UTF8(rx)) {
b8d68ded
JH
600 PL_reg_flags |= RF_utf8;
601 }
ab3bbdeb 602 DEBUG_EXECUTE_r(
f2ed9b32 603 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
604 sv ? "Guessing start of match in sv for"
605 : "Guessing start of match in string for");
2a782b5b 606 );
cad2e5aa 607
c344f387
JH
608 /* CHR_DIST() would be more correct here but it makes things slow. */
609 if (prog->minlen > strend - strpos) {
a3621e74 610 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 611 "String too short... [re_intuit_start]\n"));
cad2e5aa 612 goto fail;
2c2d71f5 613 }
1de06328 614
a1933d95 615 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 616 PL_regeol = strend;
f2ed9b32 617 if (utf8_target) {
33b8afdf
JH
618 if (!prog->check_utf8 && prog->check_substr)
619 to_utf8_substr(prog);
620 check = prog->check_utf8;
621 } else {
622 if (!prog->check_substr && prog->check_utf8)
623 to_byte_substr(prog);
624 check = prog->check_substr;
625 }
1de06328 626 if (check == &PL_sv_undef) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 628 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
629 goto fail;
630 }
bbe252da
YO
631 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
632 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
633 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 634 && !multiline ) ); /* Check after \n? */
cad2e5aa 635
7e25d62c 636 if (!ml_anch) {
bbe252da
YO
637 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
638 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 639 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
640 && sv && !SvROK(sv)
641 && (strpos != strbeg)) {
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
643 goto fail;
644 }
645 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 646 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 647 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
648 I32 slen;
649
1aa99e6b 650 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 651
653099ff
GS
652 if (SvTAIL(check)) {
653 slen = SvCUR(check); /* >= 1 */
cad2e5aa 654
9041c2e3 655 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 656 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 658 goto fail_finish;
cad2e5aa
JH
659 }
660 /* Now should match s[0..slen-2] */
661 slen--;
3f7c398e 662 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 663 || (slen > 1
3f7c398e 664 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 665 report_neq:
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
667 goto fail_finish;
668 }
cad2e5aa 669 }
3f7c398e 670 else if (*SvPVX_const(check) != *s
653099ff 671 || ((slen = SvCUR(check)) > 1
3f7c398e 672 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 673 goto report_neq;
c315bfe8 674 check_at = s;
2c2d71f5 675 goto success_at_start;
7e25d62c 676 }
cad2e5aa 677 }
2c2d71f5 678 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 679 s = strpos;
2c2d71f5 680 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
681 end_shift = prog->check_end_shift;
682
2c2d71f5 683 if (!ml_anch) {
a3b680e6 684 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 685 - (SvTAIL(check) != 0);
a3b680e6 686 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
687
688 if (end_shift < eshift)
689 end_shift = eshift;
690 }
cad2e5aa 691 }
2c2d71f5 692 else { /* Can match at random position */
cad2e5aa
JH
693 ml_anch = 0;
694 s = strpos;
1de06328
YO
695 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
696 end_shift = prog->check_end_shift;
697
698 /* end shift should be non negative here */
cad2e5aa
JH
699 }
700
bcdf7404 701#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 702 if (end_shift < 0)
1de06328 703 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 704 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
705#endif
706
2c2d71f5
JH
707 restart:
708 /* Find a possible match in the region s..strend by looking for
709 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
710
711 {
712 I32 srch_start_shift = start_shift;
713 I32 srch_end_shift = end_shift;
c33e64f0
FC
714 U8* start_point;
715 U8* end_point;
1de06328
YO
716 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
717 srch_end_shift -= ((strbeg - s) - srch_start_shift);
718 srch_start_shift = strbeg - s;
719 }
6bda09f9 720 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
721 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
722 (IV)prog->check_offset_min,
723 (IV)srch_start_shift,
724 (IV)srch_end_shift,
725 (IV)prog->check_end_shift);
726 });
727
bbe252da 728 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
729 start_point= (U8*)(s + srch_start_shift);
730 end_point= (U8*)(strend - srch_end_shift);
731 } else {
732 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
733 end_point= HOP3(strend, -srch_end_shift, strbeg);
734 }
6bda09f9 735 DEBUG_OPTIMISE_MORE_r({
56570a2c 736 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 737 (int)(end_point - start_point),
fc8cd66c 738 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
739 start_point);
740 });
741
742 s = fbm_instr( start_point, end_point,
7fba1cd6 743 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 744 }
cad2e5aa
JH
745 /* Update the count-of-usability, remove useless subpatterns,
746 unshift s. */
2c2d71f5 747
ab3bbdeb 748 DEBUG_EXECUTE_r({
f2ed9b32 749 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
750 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
751 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 752 (s ? "Found" : "Did not find"),
f2ed9b32 753 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
754 ? "anchored" : "floating"),
755 quoted,
756 RE_SV_TAIL(check),
757 (s ? " at offset " : "...\n") );
758 });
2c2d71f5
JH
759
760 if (!s)
761 goto fail_finish;
2c2d71f5 762 /* Finish the diagnostic message */
a3621e74 763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 764
1de06328
YO
765 /* XXX dmq: first branch is for positive lookbehind...
766 Our check string is offset from the beginning of the pattern.
767 So we need to do any stclass tests offset forward from that
768 point. I think. :-(
769 */
770
771
772
773 check_at=s;
774
775
2c2d71f5
JH
776 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
777 Start with the other substr.
778 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 779 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
780 *always* match. Probably should be marked during compile...
781 Probably it is right to do no SCREAM here...
782 */
783
f2ed9b32 784 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
785 : (prog->float_substr && prog->anchored_substr))
786 {
30944b6d 787 /* Take into account the "other" substring. */
2c2d71f5
JH
788 /* XXXX May be hopelessly wrong for UTF... */
789 if (!other_last)
6eb5f6b9 790 other_last = strpos;
f2ed9b32 791 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
792 do_other_anchored:
793 {
890ce7af
AL
794 char * const last = HOP3c(s, -start_shift, strbeg);
795 char *last1, *last2;
be8e71aa 796 char * const saved_s = s;
33b8afdf 797 SV* must;
2c2d71f5 798
2c2d71f5
JH
799 t = s - prog->check_offset_max;
800 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 801 && (!utf8_target
0ce71af7 802 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 803 && t > strpos)))
6f207bd3 804 NOOP;
2c2d71f5
JH
805 else
806 t = strpos;
1aa99e6b 807 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
808 if (t < other_last) /* These positions already checked */
809 t = other_last;
1aa99e6b 810 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
811 if (last < last1)
812 last1 = last;
1de06328
YO
813 /* XXXX It is not documented what units *_offsets are in.
814 We assume bytes, but this is clearly wrong.
815 Meaning this code needs to be carefully reviewed for errors.
816 dmq.
817 */
818
2c2d71f5 819 /* On end-of-str: see comment below. */
f2ed9b32 820 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
821 if (must == &PL_sv_undef) {
822 s = (char*)NULL;
1de06328 823 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
824 }
825 else
826 s = fbm_instr(
827 (unsigned char*)t,
828 HOP3(HOP3(last1, prog->anchored_offset, strend)
829 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
830 must,
7fba1cd6 831 multiline ? FBMrf_MULTILINE : 0
33b8afdf 832 );
ab3bbdeb 833 DEBUG_EXECUTE_r({
f2ed9b32 834 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
835 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
836 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 837 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
838 quoted, RE_SV_TAIL(must));
839 });
840
841
2c2d71f5
JH
842 if (!s) {
843 if (last1 >= last2) {
a3621e74 844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
845 ", giving up...\n"));
846 goto fail_finish;
847 }
a3621e74 848 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 849 ", trying floating at offset %ld...\n",
be8e71aa 850 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
851 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
852 s = HOP3c(last, 1, strend);
2c2d71f5
JH
853 goto restart;
854 }
855 else {
a3621e74 856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 857 (long)(s - i_strpos)));
1aa99e6b
IH
858 t = HOP3c(s, -prog->anchored_offset, strbeg);
859 other_last = HOP3c(s, 1, strend);
be8e71aa 860 s = saved_s;
2c2d71f5
JH
861 if (t == strpos)
862 goto try_at_start;
2c2d71f5
JH
863 goto try_at_offset;
864 }
30944b6d 865 }
2c2d71f5
JH
866 }
867 else { /* Take into account the floating substring. */
33b8afdf 868 char *last, *last1;
be8e71aa 869 char * const saved_s = s;
33b8afdf
JH
870 SV* must;
871
872 t = HOP3c(s, -start_shift, strbeg);
873 last1 = last =
874 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
875 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
876 last = HOP3c(t, prog->float_max_offset, strend);
877 s = HOP3c(t, prog->float_min_offset, strend);
878 if (s < other_last)
879 s = other_last;
2c2d71f5 880 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 881 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
882 /* fbm_instr() takes into account exact value of end-of-str
883 if the check is SvTAIL(ed). Since false positives are OK,
884 and end-of-str is not later than strend we are OK. */
885 if (must == &PL_sv_undef) {
886 s = (char*)NULL;
1de06328 887 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
888 }
889 else
2c2d71f5 890 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
891 (unsigned char*)last + SvCUR(must)
892 - (SvTAIL(must)!=0),
7fba1cd6 893 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 894 DEBUG_EXECUTE_r({
f2ed9b32 895 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
896 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
897 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 898 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
899 quoted, RE_SV_TAIL(must));
900 });
33b8afdf
JH
901 if (!s) {
902 if (last1 == last) {
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
904 ", giving up...\n"));
905 goto fail_finish;
2c2d71f5 906 }
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 908 ", trying anchored starting at offset %ld...\n",
be8e71aa 909 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
910 other_last = last;
911 s = HOP3c(t, 1, strend);
912 goto restart;
913 }
914 else {
a3621e74 915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
916 (long)(s - i_strpos)));
917 other_last = s; /* Fix this later. --Hugo */
be8e71aa 918 s = saved_s;
33b8afdf
JH
919 if (t == strpos)
920 goto try_at_start;
921 goto try_at_offset;
922 }
2c2d71f5 923 }
cad2e5aa 924 }
2c2d71f5 925
1de06328 926
9ef43ace 927 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 928
6bda09f9 929 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
930 PerlIO_printf(Perl_debug_log,
931 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
932 (IV)prog->check_offset_min,
933 (IV)prog->check_offset_max,
934 (IV)(s-strpos),
935 (IV)(t-strpos),
936 (IV)(t-s),
937 (IV)(strend-strpos)
938 )
939 );
940
2c2d71f5 941 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 942 && (!utf8_target
9ef43ace 943 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
944 && t > strpos)))
945 {
2c2d71f5
JH
946 /* Fixed substring is found far enough so that the match
947 cannot start at strpos. */
948 try_at_offset:
cad2e5aa 949 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
950 /* Eventually fbm_*() should handle this, but often
951 anchored_offset is not 0, so this check will not be wasted. */
952 /* XXXX In the code below we prefer to look for "^" even in
953 presence of anchored substrings. And we search even
954 beyond the found float position. These pessimizations
955 are historical artefacts only. */
956 find_anchor:
2c2d71f5 957 while (t < strend - prog->minlen) {
cad2e5aa 958 if (*t == '\n') {
4ee3650e 959 if (t < check_at - prog->check_offset_min) {
f2ed9b32 960 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
961 /* Since we moved from the found position,
962 we definitely contradict the found anchored
30944b6d
IZ
963 substr. Due to the above check we do not
964 contradict "check" substr.
965 Thus we can arrive here only if check substr
966 is float. Redo checking for "other"=="fixed".
967 */
9041c2e3 968 strpos = t + 1;
a3621e74 969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 970 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
971 goto do_other_anchored;
972 }
4ee3650e
GS
973 /* We don't contradict the found floating substring. */
974 /* XXXX Why not check for STCLASS? */
cad2e5aa 975 s = t + 1;
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 977 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
978 goto set_useful;
979 }
4ee3650e
GS
980 /* Position contradicts check-string */
981 /* XXXX probably better to look for check-string
982 than for "\n", so one should lower the limit for t? */
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 984 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 985 other_last = strpos = s = t + 1;
cad2e5aa
JH
986 goto restart;
987 }
988 t++;
989 }
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 991 PL_colors[0], PL_colors[1]));
2c2d71f5 992 goto fail_finish;
cad2e5aa 993 }
f5952150 994 else {
a3621e74 995 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 996 PL_colors[0], PL_colors[1]));
f5952150 997 }
cad2e5aa
JH
998 s = t;
999 set_useful:
f2ed9b32 1000 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1001 }
1002 else {
f5952150 1003 /* The found string does not prohibit matching at strpos,
2c2d71f5 1004 - no optimization of calling REx engine can be performed,
f5952150
GS
1005 unless it was an MBOL and we are not after MBOL,
1006 or a future STCLASS check will fail this. */
2c2d71f5
JH
1007 try_at_start:
1008 /* Even in this situation we may use MBOL flag if strpos is offset
1009 wrt the start of the string. */
05b4157f 1010 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1011 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1012 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1013 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1014 {
cad2e5aa
JH
1015 t = strpos;
1016 goto find_anchor;
1017 }
a3621e74 1018 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1019 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1020 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1021 );
2c2d71f5 1022 success_at_start:
bbe252da 1023 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1024 && (utf8_target ? (
33b8afdf
JH
1025 prog->check_utf8 /* Could be deleted already */
1026 && --BmUSEFUL(prog->check_utf8) < 0
1027 && (prog->check_utf8 == prog->float_utf8)
1028 ) : (
1029 prog->check_substr /* Could be deleted already */
1030 && --BmUSEFUL(prog->check_substr) < 0
1031 && (prog->check_substr == prog->float_substr)
1032 )))
66e933ab 1033 {
cad2e5aa 1034 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1035 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1036 /* XXX Does the destruction order has to change with utf8_target? */
1037 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1038 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1039 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1040 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1041 check = NULL; /* abort */
cad2e5aa 1042 s = strpos;
486ec47a 1043 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1044 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1045 if (prog->intflags & PREGf_IMPLICIT)
1046 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1047 /* XXXX This is a remnant of the old implementation. It
1048 looks wasteful, since now INTUIT can use many
6eb5f6b9 1049 other heuristics. */
bbe252da 1050 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1051 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1052 }
1053 else
1054 s = strpos;
1055 }
1056
6eb5f6b9
JH
1057 /* Last resort... */
1058 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1059 /* trie stclasses are too expensive to use here, we are better off to
1060 leave it to regmatch itself */
f8fc2ecf 1061 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1062 /* minlen == 0 is possible if regstclass is \b or \B,
1063 and the fixed substr is ''$.
1064 Since minlen is already taken into account, s+1 is before strend;
1065 accidentally, minlen >= 1 guaranties no false positives at s + 1
1066 even for \b or \B. But (minlen? 1 : 0) below assumes that
1067 regstclass does not come from lookahead... */
1068 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1069 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1070 const U8* const str = (U8*)STRING(progi->regstclass);
1071 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1072 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1073 : 1);
1de06328
YO
1074 char * endpos;
1075 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1076 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1077 else if (prog->float_substr || prog->float_utf8)
1078 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1079 else
1080 endpos= strend;
1081
d8080198
YO
1082 if (checked_upto < s)
1083 checked_upto = s;
1084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1085 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1086
6eb5f6b9 1087 t = s;
d8080198
YO
1088 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1089 if (s) {
1090 checked_upto = s;
1091 } else {
6eb5f6b9 1092#ifdef DEBUGGING
cbbf8932 1093 const char *what = NULL;
6eb5f6b9
JH
1094#endif
1095 if (endpos == strend) {
a3621e74 1096 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1097 "Could not match STCLASS...\n") );
1098 goto fail;
1099 }
a3621e74 1100 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1101 "This position contradicts STCLASS...\n") );
bbe252da 1102 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1103 goto fail;
d8080198
YO
1104 checked_upto = HOPBACKc(endpos, start_shift);
1105 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1106 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1107 /* Contradict one of substrings */
33b8afdf 1108 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1109 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1110 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1111 hop_and_restart:
1aa99e6b 1112 s = HOP3c(t, 1, strend);
66e933ab
GS
1113 if (s + start_shift + end_shift > strend) {
1114 /* XXXX Should be taken into account earlier? */
a3621e74 1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1116 "Could not match STCLASS...\n") );
1117 goto fail;
1118 }
5e39e1e5
HS
1119 if (!check)
1120 goto giveup;
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1122 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1123 what, (long)(s + start_shift - i_strpos)) );
1124 goto restart;
1125 }
66e933ab 1126 /* Have both, check_string is floating */
6eb5f6b9
JH
1127 if (t + start_shift >= check_at) /* Contradicts floating=check */
1128 goto retry_floating_check;
1129 /* Recheck anchored substring, but not floating... */
9041c2e3 1130 s = check_at;
5e39e1e5
HS
1131 if (!check)
1132 goto giveup;
a3621e74 1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1134 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1135 (long)(other_last - i_strpos)) );
1136 goto do_other_anchored;
1137 }
60e71179
GS
1138 /* Another way we could have checked stclass at the
1139 current position only: */
1140 if (ml_anch) {
1141 s = t = t + 1;
5e39e1e5
HS
1142 if (!check)
1143 goto giveup;
a3621e74 1144 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1145 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1146 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1147 goto try_at_offset;
66e933ab 1148 }
f2ed9b32 1149 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1150 goto fail;
486ec47a 1151 /* Check is floating substring. */
6eb5f6b9
JH
1152 retry_floating_check:
1153 t = check_at - start_shift;
a3621e74 1154 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1155 goto hop_and_restart;
1156 }
b7953727 1157 if (t != s) {
a3621e74 1158 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1159 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1160 (long)(t - i_strpos), (long)(s - i_strpos))
1161 );
1162 }
1163 else {
a3621e74 1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1165 "Does not contradict STCLASS...\n");
1166 );
1167 }
6eb5f6b9 1168 }
5e39e1e5 1169 giveup:
a3621e74 1170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1171 PL_colors[4], (check ? "Guessed" : "Giving up"),
1172 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1173 return s;
2c2d71f5
JH
1174
1175 fail_finish: /* Substring not found */
33b8afdf 1176 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1177 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1178 fail:
a3621e74 1179 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1180 PL_colors[4], PL_colors[5]));
bd61b366 1181 return NULL;
cad2e5aa 1182}
9661b544 1183
a0a388a1
YO
1184#define DECL_TRIE_TYPE(scan) \
1185 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1186 trie_type = ((scan->flags == EXACT) \
1187 ? (utf8_target ? trie_utf8 : trie_plain) \
1188 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1189
1190#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1191uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1192 STRLEN skiplen; \
1193 switch (trie_type) { \
1194 case trie_utf8_fold: \
1195 if ( foldlen>0 ) { \
1196 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1197 foldlen -= len; \
1198 uscan += len; \
1199 len=0; \
1200 } else { \
1201 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1202 len = UTF8SKIP(uc); \
1203 skiplen = UNISKIP( uvc ); \
1204 foldlen -= skiplen; \
1205 uscan = foldbuf + skiplen; \
1206 } \
1207 break; \
1208 case trie_latin_utf8_fold: \
1209 if ( foldlen>0 ) { \
1210 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1211 foldlen -= len; \
1212 uscan += len; \
1213 len=0; \
1214 } else { \
1215 len = 1; \
1216 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1217 skiplen = UNISKIP( uvc ); \
1218 foldlen -= skiplen; \
1219 uscan = foldbuf + skiplen; \
1220 } \
1221 break; \
1222 case trie_utf8: \
1223 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1224 break; \
1225 case trie_plain: \
1226 uvc = (UV)*uc; \
1227 len = 1; \
1228 } \
1229 if (uvc < 256) { \
1230 charid = trie->charmap[ uvc ]; \
1231 } \
1232 else { \
1233 charid = 0; \
1234 if (widecharmap) { \
1235 SV** const svpp = hv_fetch(widecharmap, \
1236 (char*)&uvc, sizeof(UV), 0); \
1237 if (svpp) \
1238 charid = (U16)SvIV(*svpp); \
1239 } \
1240 } \
4cadc6a9
YO
1241} STMT_END
1242
4cadc6a9
YO
1243#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1244STMT_START { \
1245 while (s <= e) { \
1246 if ( (CoNd) \
fac1af77 1247 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1248 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1249 goto got_it; \
1250 s++; \
1251 } \
1252} STMT_END
1253
1254#define REXEC_FBC_UTF8_SCAN(CoDe) \
1255STMT_START { \
1256 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1257 CoDe \
1258 s += uskip; \
1259 } \
1260} STMT_END
1261
1262#define REXEC_FBC_SCAN(CoDe) \
1263STMT_START { \
1264 while (s < strend) { \
1265 CoDe \
1266 s++; \
1267 } \
1268} STMT_END
1269
1270#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1271REXEC_FBC_UTF8_SCAN( \
1272 if (CoNd) { \
24b23f37 1273 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it; \
1275 else \
1276 tmp = doevery; \
1277 } \
1278 else \
1279 tmp = 1; \
1280)
1281
1282#define REXEC_FBC_CLASS_SCAN(CoNd) \
1283REXEC_FBC_SCAN( \
1284 if (CoNd) { \
24b23f37 1285 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1286 goto got_it; \
1287 else \
1288 tmp = doevery; \
1289 } \
1290 else \
1291 tmp = 1; \
1292)
1293
1294#define REXEC_FBC_TRYIT \
24b23f37 1295if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1296 goto got_it
1297
e1d1eefb 1298#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1299 if (utf8_target) { \
e1d1eefb
YO
1300 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1301 } \
1302 else { \
1303 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1304 }
e1d1eefb 1305
4cadc6a9 1306#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1307 if (utf8_target) { \
4cadc6a9
YO
1308 UtFpReLoAd; \
1309 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1310 } \
1311 else { \
1312 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1313 }
4cadc6a9
YO
1314
1315#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1316 PL_reg_flags |= RF_tainted; \
f2ed9b32 1317 if (utf8_target) { \
4cadc6a9
YO
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 } \
1320 else { \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1322 }
4cadc6a9 1323
786e8c11
YO
1324#define DUMP_EXEC_POS(li,s,doutf8) \
1325 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
cfaf538b
KW
1327
1328#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1330 tmp = TEST_NON_UTF8(tmp); \
1331 REXEC_FBC_UTF8_SCAN( \
1332 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333 tmp = !tmp; \
1334 IF_SUCCESS; \
1335 } \
1336 else { \
1337 IF_FAIL; \
1338 } \
1339 ); \
1340
1341#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342 if (s == PL_bostr) { \
1343 tmp = '\n'; \
1344 } \
1345 else { \
1346 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1347 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1348 } \
1349 tmp = TeSt1_UtF8; \
1350 LOAD_UTF8_CHARCLASS_ALNUM(); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! (TeSt2_UtF8)) { \
1353 tmp = !tmp; \
1354 IF_SUCCESS; \
1355 } \
1356 else { \
1357 IF_FAIL; \
1358 } \
1359 ); \
1360
63ac0dad
KW
1361/* The only difference between the BOUND and NBOUND cases is that
1362 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363 * NBOUND. This is accomplished by passing it in either the if or else clause,
1364 * with the other one being empty */
1365#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1366 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1367
1368#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1369 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1370
1371#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1372 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1373
1374#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1375 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1376
63ac0dad
KW
1377
1378/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1379 * be passed in completely with the variable name being tested, which isn't
1380 * such a clean interface, but this is easier to read than it was before. We
1381 * are looking for the boundary (or non-boundary between a word and non-word
1382 * character. The utf8 and non-utf8 cases have the same logic, but the details
1383 * must be different. Find the "wordness" of the character just prior to this
1384 * one, and compare it with the wordness of this one. If they differ, we have
1385 * a boundary. At the beginning of the string, pretend that the previous
1386 * character was a new-line */
cfaf538b 1387#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1388 if (utf8_target) { \
cfaf538b 1389 UTF8_CODE \
63ac0dad
KW
1390 } \
1391 else { /* Not utf8 */ \
1392 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1393 tmp = TEST_NON_UTF8(tmp); \
1394 REXEC_FBC_SCAN( \
1395 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1396 tmp = !tmp; \
1397 IF_SUCCESS; \
1398 } \
1399 else { \
1400 IF_FAIL; \
1401 } \
1402 ); \
1403 } \
1404 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1405 goto got_it;
1406
786e8c11
YO
1407/* We know what class REx starts with. Try to find this position... */
1408/* if reginfo is NULL, its a dryrun */
1409/* annoyingly all the vars in this routine have different names from their counterparts
1410 in regmatch. /grrr */
1411
3c3eec57 1412STATIC char *
07be1b83 1413S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1414 const char *strend, regmatch_info *reginfo)
a687059c 1415{
27da23d5 1416 dVAR;
bbe252da 1417 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1418 char *pat_string; /* The pattern's exactish string */
1419 char *pat_end; /* ptr to end char of pat_string */
1420 re_fold_t folder; /* Function for computing non-utf8 folds */
1421 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1422 STRLEN ln;
5dab1207 1423 STRLEN lnc;
eb578fdb 1424 STRLEN uskip;
fac1af77
KW
1425 U8 c1;
1426 U8 c2;
6eb5f6b9 1427 char *e;
eb578fdb
KW
1428 I32 tmp = 1; /* Scratch variable? */
1429 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1430 UV utf8_fold_flags = 0;
f8fc2ecf 1431 RXi_GET_DECL(prog,progi);
7918f24d
NC
1432
1433 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1434
6eb5f6b9
JH
1435 /* We know what class it must start with. */
1436 switch (OP(c)) {
f56b6394 1437 case ANYOFV:
6eb5f6b9 1438 case ANYOF:
f56b6394 1439 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1440 STRLEN inclasslen = strend - s;
1441 REXEC_FBC_UTF8_CLASS_SCAN(
1442 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1443 }
1444 else {
6ef69d56 1445 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1446 }
6eb5f6b9 1447 break;
f33976b4 1448 case CANY:
4cadc6a9 1449 REXEC_FBC_SCAN(
24b23f37 1450 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1451 goto got_it;
1452 else
1453 tmp = doevery;
4cadc6a9 1454 );
f33976b4 1455 break;
2f7f8cb1
KW
1456
1457 case EXACTFA:
1458 if (UTF_PATTERN || utf8_target) {
1459 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1460 goto do_exactf_utf8;
1461 }
1462 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1463 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1464 goto do_exactf_non_utf8; /* isn't dealt with by these */
1465
6eb5f6b9 1466 case EXACTF:
62bf7766 1467 if (utf8_target) {
77a6d856
KW
1468
1469 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1470 utf8_fold_flags = 0;
fac1af77
KW
1471 goto do_exactf_utf8;
1472 }
1473 fold_array = PL_fold;
1474 folder = foldEQ;
1475 goto do_exactf_non_utf8;
1476
1477 case EXACTFL:
1478 if (UTF_PATTERN || utf8_target) {
17580e7a 1479 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1480 goto do_exactf_utf8;
1481 }
1482 fold_array = PL_fold_locale;
1483 folder = foldEQ_locale;
16d951b7
KW
1484 goto do_exactf_non_utf8;
1485
3c760661
KW
1486 case EXACTFU_SS:
1487 if (UTF_PATTERN) {
1488 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1489 }
1490 goto do_exactf_utf8;
1491
fab2782b 1492 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1493 case EXACTFU:
1494 if (UTF_PATTERN || utf8_target) {
77a6d856 1495 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1496 goto do_exactf_utf8;
1497 }
1498
1499 /* Any 'ss' in the pattern should have been replaced by regcomp,
1500 * so we don't have to worry here about this single special case
1501 * in the Latin1 range */
1502 fold_array = PL_fold_latin1;
1503 folder = foldEQ_latin1;
fac1af77
KW
1504
1505 /* FALL THROUGH */
1506
62bf7766
KW
1507 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1508 are no glitches with fold-length differences
1509 between the target string and pattern */
fac1af77
KW
1510
1511 /* The idea in the non-utf8 EXACTF* cases is to first find the
1512 * first character of the EXACTF* node and then, if necessary,
1513 * case-insensitively compare the full text of the node. c1 is the
1514 * first character. c2 is its fold. This logic will not work for
1515 * Unicode semantics and the german sharp ss, which hence should
1516 * not be compiled into a node that gets here. */
1517 pat_string = STRING(c);
1518 ln = STR_LEN(c); /* length to match in octets/bytes */
1519
8a90a8fe
KW
1520 /* We know that we have to match at least 'ln' bytes (which is the
1521 * same as characters, since not utf8). If we have to match 3
1522 * characters, and there are only 2 availabe, we know without
1523 * trying that it will fail; so don't start a match past the
1524 * required minimum number from the far end */
fac1af77
KW
1525 e = HOP3c(strend, -((I32)ln), s);
1526
1527 if (!reginfo && e < s) {
1528 e = s; /* Due to minlen logic of intuit() */
1529 }
1530
1531 c1 = *pat_string;
1532 c2 = fold_array[c1];
1533 if (c1 == c2) { /* If char and fold are the same */
1534 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1535 }
1536 else {
1537 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1538 }
1539 break;
1540
1541 do_exactf_utf8:
e067297c
KW
1542 {
1543 unsigned expansion;
1544
fac1af77
KW
1545
1546 /* If one of the operands is in utf8, we can't use the simpler
1547 * folding above, due to the fact that many different characters
1548 * can have the same fold, or portion of a fold, or different-
1549 * length fold */
1550 pat_string = STRING(c);
1551 ln = STR_LEN(c); /* length to match in octets/bytes */
1552 pat_end = pat_string + ln;
1553 lnc = (UTF_PATTERN) /* length to match in characters */
1554 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1555 : ln;
1556
e067297c
KW
1557 /* We have 'lnc' characters to match in the pattern, but because of
1558 * multi-character folding, each character in the target can match
1559 * up to 3 characters (Unicode guarantees it will never exceed
1560 * this) if it is utf8-encoded; and up to 2 if not (based on the
1561 * fact that the Latin 1 folds are already determined, and the
1562 * only multi-char fold in that range is the sharp-s folding to
1563 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1564 * string character. Adjust lnc accordingly, rounding up, so that
1565 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1566 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1567 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1568
1569 /* As in the non-UTF8 case, if we have to match 3 characters, and
1570 * only 2 are left, it's guaranteed to fail, so don't start a
1571 * match that would require us to go beyond the end of the string
1572 */
1573 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1574
1575 if (!reginfo && e < s) {
1576 e = s; /* Due to minlen logic of intuit() */
1577 }
1578
b33105db
KW
1579 /* XXX Note that we could recalculate e to stop the loop earlier,
1580 * as the worst case expansion above will rarely be met, and as we
1581 * go along we would usually find that e moves further to the left.
1582 * This would happen only after we reached the point in the loop
1583 * where if there were no expansion we should fail. Unclear if
1584 * worth the expense */
e067297c 1585
fac1af77
KW
1586 while (s <= e) {
1587 char *my_strend= (char *)strend;
1588 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1589 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1590 && (!reginfo || regtry(reginfo, &s)) )
1591 {
1592 goto got_it;
1593 }
bbdd8bad 1594 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1595 }
1596 break;
e067297c 1597 }
bbce6d69 1598 case BOUNDL:
3280af22 1599 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1600 FBC_BOUND(isALNUM_LC,
1601 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1602 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1603 break;
bbce6d69 1604 case NBOUNDL:
3280af22 1605 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1606 FBC_NBOUND(isALNUM_LC,
1607 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1608 isALNUM_LC_utf8((U8*)s));
1609 break;
1610 case BOUND:
1611 FBC_BOUND(isWORDCHAR,
1612 isALNUM_uni(tmp),
1613 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614 break;
cfaf538b
KW
1615 case BOUNDA:
1616 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1617 isWORDCHAR_A(tmp),
1618 isWORDCHAR_A((U8*)s));
1619 break;
a0d0e21e 1620 case NBOUND:
63ac0dad
KW
1621 FBC_NBOUND(isWORDCHAR,
1622 isALNUM_uni(tmp),
1623 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1624 break;
cfaf538b
KW
1625 case NBOUNDA:
1626 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1627 isWORDCHAR_A(tmp),
1628 isWORDCHAR_A((U8*)s));
1629 break;
63ac0dad
KW
1630 case BOUNDU:
1631 FBC_BOUND(isWORDCHAR_L1,
1632 isALNUM_uni(tmp),
1633 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1634 break;
1635 case NBOUNDU:
1636 FBC_NBOUND(isWORDCHAR_L1,
1637 isALNUM_uni(tmp),
1638 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1639 break;
bbce6d69 1640 case ALNUML:
4cadc6a9
YO
1641 REXEC_FBC_CSCAN_TAINT(
1642 isALNUM_LC_utf8((U8*)s),
1643 isALNUM_LC(*s)
1644 );
6895a8aa 1645 break;
980866de
KW
1646 case ALNUMU:
1647 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1648 LOAD_UTF8_CHARCLASS_ALNUM(),
1649 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1650 isWORDCHAR_L1((U8) *s)
1651 );
6895a8aa 1652 break;
980866de
KW
1653 case ALNUM:
1654 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1655 LOAD_UTF8_CHARCLASS_ALNUM(),
1656 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1657 isWORDCHAR((U8) *s)
1658 );
6895a8aa 1659 break;
cfaf538b 1660 case ALNUMA:
8e9da4d4
KW
1661 /* Don't need to worry about utf8, as it can match only a single
1662 * byte invariant character */
cfaf538b 1663 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1664 break;
980866de
KW
1665 case NALNUMU:
1666 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1667 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1668 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1669 ! isWORDCHAR_L1((U8) *s)
1670 );
6895a8aa 1671 break;
a0d0e21e 1672 case NALNUM:
4cadc6a9 1673 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1674 LOAD_UTF8_CHARCLASS_ALNUM(),
1675 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1676 ! isALNUM(*s)
4cadc6a9 1677 );
6895a8aa 1678 break;
cfaf538b 1679 case NALNUMA:
8e9da4d4
KW
1680 REXEC_FBC_CSCAN(
1681 !isWORDCHAR_A(*s),
1682 !isWORDCHAR_A(*s)
1683 );
1684 break;
bbce6d69 1685 case NALNUML:
4cadc6a9
YO
1686 REXEC_FBC_CSCAN_TAINT(
1687 !isALNUM_LC_utf8((U8*)s),
1688 !isALNUM_LC(*s)
1689 );
6895a8aa 1690 break;
980866de
KW
1691 case SPACEU:
1692 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1693 LOAD_UTF8_CHARCLASS_SPACE(),
1694 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1695 isSPACE_L1((U8) *s)
1696 );
6895a8aa 1697 break;
a0d0e21e 1698 case SPACE:
4cadc6a9 1699 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1700 LOAD_UTF8_CHARCLASS_SPACE(),
1701 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1702 isSPACE((U8) *s)
4cadc6a9 1703 );
6895a8aa 1704 break;
cfaf538b 1705 case SPACEA:
8e9da4d4
KW
1706 /* Don't need to worry about utf8, as it can match only a single
1707 * byte invariant character */
cfaf538b 1708 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1709 break;
bbce6d69 1710 case SPACEL:
4cadc6a9 1711 REXEC_FBC_CSCAN_TAINT(
6bbba904 1712 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1713 isSPACE_LC(*s)
1714 );
6895a8aa 1715 break;
980866de
KW
1716 case NSPACEU:
1717 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1718 LOAD_UTF8_CHARCLASS_SPACE(),
1719 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1720 ! isSPACE_L1((U8) *s)
1721 );
6895a8aa 1722 break;
a0d0e21e 1723 case NSPACE:
4cadc6a9 1724 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1725 LOAD_UTF8_CHARCLASS_SPACE(),
1726 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1727 ! isSPACE((U8) *s)
4cadc6a9 1728 );
6895a8aa 1729 break;
cfaf538b 1730 case NSPACEA:
8e9da4d4
KW
1731 REXEC_FBC_CSCAN(
1732 !isSPACE_A(*s),
1733 !isSPACE_A(*s)
1734 );
1735 break;
bbce6d69 1736 case NSPACEL:
4cadc6a9 1737 REXEC_FBC_CSCAN_TAINT(
6bbba904 1738 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1739 !isSPACE_LC(*s)
1740 );
6895a8aa 1741 break;
a0d0e21e 1742 case DIGIT:
4cadc6a9 1743 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1744 LOAD_UTF8_CHARCLASS_DIGIT(),
1745 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1746 isDIGIT(*s)
1747 );
6895a8aa 1748 break;
cfaf538b 1749 case DIGITA:
8e9da4d4
KW
1750 /* Don't need to worry about utf8, as it can match only a single
1751 * byte invariant character */
cfaf538b 1752 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1753 break;
b8c5462f 1754 case DIGITL:
4cadc6a9
YO
1755 REXEC_FBC_CSCAN_TAINT(
1756 isDIGIT_LC_utf8((U8*)s),
1757 isDIGIT_LC(*s)
1758 );
6895a8aa 1759 break;
a0d0e21e 1760 case NDIGIT:
4cadc6a9 1761 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1762 LOAD_UTF8_CHARCLASS_DIGIT(),
1763 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1764 !isDIGIT(*s)
1765 );
6895a8aa 1766 break;
cfaf538b 1767 case NDIGITA:
8e9da4d4
KW
1768 REXEC_FBC_CSCAN(
1769 !isDIGIT_A(*s),
1770 !isDIGIT_A(*s)
1771 );
1772 break;
b8c5462f 1773 case NDIGITL:
4cadc6a9
YO
1774 REXEC_FBC_CSCAN_TAINT(
1775 !isDIGIT_LC_utf8((U8*)s),
1776 !isDIGIT_LC(*s)
1777 );
6895a8aa 1778 break;
e1d1eefb
YO
1779 case LNBREAK:
1780 REXEC_FBC_CSCAN(
1781 is_LNBREAK_utf8(s),
1782 is_LNBREAK_latin1(s)
1783 );
6895a8aa 1784 break;
e1d1eefb
YO
1785 case VERTWS:
1786 REXEC_FBC_CSCAN(
1787 is_VERTWS_utf8(s),
1788 is_VERTWS_latin1(s)
1789 );
6895a8aa 1790 break;
e1d1eefb
YO
1791 case NVERTWS:
1792 REXEC_FBC_CSCAN(
1793 !is_VERTWS_utf8(s),
1794 !is_VERTWS_latin1(s)
1795 );
6895a8aa 1796 break;
e1d1eefb
YO
1797 case HORIZWS:
1798 REXEC_FBC_CSCAN(
1799 is_HORIZWS_utf8(s),
1800 is_HORIZWS_latin1(s)
1801 );
6895a8aa 1802 break;
e1d1eefb
YO
1803 case NHORIZWS:
1804 REXEC_FBC_CSCAN(
1805 !is_HORIZWS_utf8(s),
1806 !is_HORIZWS_latin1(s)
1807 );
6895a8aa 1808 break;
0658cdde
KW
1809 case POSIXA:
1810 /* Don't need to worry about utf8, as it can match only a single
1811 * byte invariant character. The flag in this node type is the
1812 * class number to pass to _generic_isCC() to build a mask for
1813 * searching in PL_charclass[] */
1814 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1815 break;
1816 case NPOSIXA:
1817 REXEC_FBC_CSCAN(
1818 !_generic_isCC_A(*s, FLAGS(c)),
1819 !_generic_isCC_A(*s, FLAGS(c))
1820 );
1821 break;
1822
1de06328
YO
1823 case AHOCORASICKC:
1824 case AHOCORASICK:
07be1b83 1825 {
a0a388a1 1826 DECL_TRIE_TYPE(c);
07be1b83
YO
1827 /* what trie are we using right now */
1828 reg_ac_data *aho
f8fc2ecf 1829 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1830 reg_trie_data *trie
1831 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1832 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1833
1834 const char *last_start = strend - trie->minlen;
6148ee25 1835#ifdef DEBUGGING
07be1b83 1836 const char *real_start = s;
6148ee25 1837#endif
07be1b83 1838 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1839 SV *sv_points;
1840 U8 **points; /* map of where we were in the input string
786e8c11 1841 when reading a given char. For ASCII this
be8e71aa 1842 is unnecessary overhead as the relationship
38a44b82
NC
1843 is always 1:1, but for Unicode, especially
1844 case folded Unicode this is not true. */
f9e705e8 1845 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1846 U8 *bitmap=NULL;
1847
07be1b83
YO
1848
1849 GET_RE_DEBUG_FLAGS_DECL;
1850
be8e71aa
YO
1851 /* We can't just allocate points here. We need to wrap it in
1852 * an SV so it gets freed properly if there is a croak while
1853 * running the match */
1854 ENTER;
1855 SAVETMPS;
1856 sv_points=newSV(maxlen * sizeof(U8 *));
1857 SvCUR_set(sv_points,
1858 maxlen * sizeof(U8 *));
1859 SvPOK_on(sv_points);
1860 sv_2mortal(sv_points);
1861 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1862 if ( trie_type != trie_utf8_fold
1863 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1864 {
786e8c11
YO
1865 if (trie->bitmap)
1866 bitmap=(U8*)trie->bitmap;
1867 else
1868 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1869 }
786e8c11
YO
1870 /* this is the Aho-Corasick algorithm modified a touch
1871 to include special handling for long "unknown char"
1872 sequences. The basic idea being that we use AC as long
1873 as we are dealing with a possible matching char, when
1874 we encounter an unknown char (and we have not encountered
1875 an accepting state) we scan forward until we find a legal
1876 starting char.
1877 AC matching is basically that of trie matching, except
1878 that when we encounter a failing transition, we fall back
1879 to the current states "fail state", and try the current char
1880 again, a process we repeat until we reach the root state,
1881 state 1, or a legal transition. If we fail on the root state
1882 then we can either terminate if we have reached an accepting
1883 state previously, or restart the entire process from the beginning
1884 if we have not.
1885
1886 */
07be1b83
YO
1887 while (s <= last_start) {
1888 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1889 U8 *uc = (U8*)s;
1890 U16 charid = 0;
1891 U32 base = 1;
1892 U32 state = 1;
1893 UV uvc = 0;
1894 STRLEN len = 0;
1895 STRLEN foldlen = 0;
1896 U8 *uscan = (U8*)NULL;
1897 U8 *leftmost = NULL;
786e8c11
YO
1898#ifdef DEBUGGING
1899 U32 accepted_word= 0;
1900#endif
07be1b83
YO
1901 U32 pointpos = 0;
1902
1903 while ( state && uc <= (U8*)strend ) {
1904 int failed=0;
786e8c11
YO
1905 U32 word = aho->states[ state ].wordnum;
1906
1de06328
YO
1907 if( state==1 ) {
1908 if ( bitmap ) {
1909 DEBUG_TRIE_EXECUTE_r(
1910 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1911 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1912 (char *)uc, utf8_target );
1de06328
YO
1913 PerlIO_printf( Perl_debug_log,
1914 " Scanning for legal start char...\n");
1915 }
d085b490
YO
1916 );
1917 if (utf8_target) {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 uc += UTF8SKIP(uc);
1920 }
1921 } else {
1922 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1923 uc++;
1924 }
1925 }
1de06328 1926 s= (char *)uc;
786e8c11 1927 }
786e8c11
YO
1928 if (uc >(U8*)last_start) break;
1929 }
1930
1931 if ( word ) {
2e64971a 1932 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1933 if (!leftmost || lpos < leftmost) {
1934 DEBUG_r(accepted_word=word);
07be1b83 1935 leftmost= lpos;
786e8c11 1936 }
07be1b83 1937 if (base==0) break;
786e8c11 1938
07be1b83
YO
1939 }
1940 points[pointpos++ % maxlen]= uc;
55eed653
NC
1941 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1942 uscan, len, uvc, charid, foldlen,
1943 foldbuf, uniflags);
786e8c11
YO
1944 DEBUG_TRIE_EXECUTE_r({
1945 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1946 s, utf8_target );
07be1b83 1947 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1948 " Charid:%3u CP:%4"UVxf" ",
1949 charid, uvc);
1950 });
07be1b83
YO
1951
1952 do {
6148ee25 1953#ifdef DEBUGGING
786e8c11 1954 word = aho->states[ state ].wordnum;
6148ee25 1955#endif
07be1b83
YO
1956 base = aho->states[ state ].trans.base;
1957
786e8c11
YO
1958 DEBUG_TRIE_EXECUTE_r({
1959 if (failed)
1960 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1961 s, utf8_target );
07be1b83 1962 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1963 "%sState: %4"UVxf", word=%"UVxf,
1964 failed ? " Fail transition to " : "",
1965 (UV)state, (UV)word);
1966 });
07be1b83
YO
1967 if ( base ) {
1968 U32 tmp;
6dd2be57 1969 I32 offset;
07be1b83 1970 if (charid &&
6dd2be57
DM
1971 ( ((offset = base + charid
1972 - 1 - trie->uniquecharcount)) >= 0)
1973 && ((U32)offset < trie->lasttrans)
1974 && trie->trans[offset].check == state
1975 && (tmp=trie->trans[offset].next))
07be1b83 1976 {
786e8c11
YO
1977 DEBUG_TRIE_EXECUTE_r(
1978 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1979 state = tmp;
1980 break;
1981 }
1982 else {
786e8c11
YO
1983 DEBUG_TRIE_EXECUTE_r(
1984 PerlIO_printf( Perl_debug_log," - fail\n"));
1985 failed = 1;
1986 state = aho->fail[state];
07be1b83
YO
1987 }
1988 }
1989 else {
1990 /* we must be accepting here */
786e8c11
YO
1991 DEBUG_TRIE_EXECUTE_r(
1992 PerlIO_printf( Perl_debug_log," - accepting\n"));
1993 failed = 1;
07be1b83
YO
1994 break;
1995 }
1996 } while(state);
786e8c11 1997 uc += len;
07be1b83
YO
1998 if (failed) {
1999 if (leftmost)
2000 break;
786e8c11 2001 if (!state) state = 1;
07be1b83
YO
2002 }
2003 }
2004 if ( aho->states[ state ].wordnum ) {
2e64971a 2005 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2006 if (!leftmost || lpos < leftmost) {
2007 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2008 leftmost = lpos;
786e8c11 2009 }
07be1b83 2010 }
07be1b83
YO
2011 if (leftmost) {
2012 s = (char*)leftmost;
786e8c11
YO
2013 DEBUG_TRIE_EXECUTE_r({
2014 PerlIO_printf(
70685ca0
JH
2015 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2016 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2017 );
2018 });
24b23f37 2019 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2020 FREETMPS;
2021 LEAVE;
07be1b83 2022 goto got_it;
be8e71aa 2023 }
07be1b83 2024 s = HOPc(s,1);
786e8c11
YO
2025 DEBUG_TRIE_EXECUTE_r({
2026 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2027 });
07be1b83 2028 } else {
786e8c11
YO
2029 DEBUG_TRIE_EXECUTE_r(
2030 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2031 break;
2032 }
2033 }
be8e71aa
YO
2034 FREETMPS;
2035 LEAVE;
07be1b83
YO
2036 }
2037 break;
b3c9acc1 2038 default:
3c3eec57
GS
2039 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2040 break;
d6a28714 2041 }
6eb5f6b9
JH
2042 return 0;
2043 got_it:
2044 return s;
2045}
2046
fae667d5 2047
6eb5f6b9
JH
2048/*
2049 - regexec_flags - match a regexp against a string
2050 */
2051I32
288b8c02 2052Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2053 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2054/* strend: pointer to null at end of string */
2055/* strbeg: real beginning of string */
2056/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2057/* data: May be used for some additional optimizations.
2058 Currently its only used, with a U32 cast, for transmitting
2059 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2060/* nosave: For optimizations. */
2061{
97aff369 2062 dVAR;
288b8c02 2063 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2064 /*register*/ char *s;
eb578fdb 2065 regnode *c;
24b23f37 2066 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2067 I32 minlen; /* must match at least this many chars */
2068 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2069 I32 end_shift = 0; /* Same for the end. */ /* CC */
2070 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2071 char *scream_olds = NULL;
f2ed9b32 2072 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2073 I32 multiline;
f8fc2ecf 2074 RXi_GET_DECL(prog,progi);
3b0527fe 2075 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2076 regexp_paren_pair *swap = NULL;
a3621e74
YO
2077 GET_RE_DEBUG_FLAGS_DECL;
2078
7918f24d 2079 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2080 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2081
2082 /* Be paranoid... */
2083 if (prog == NULL || startpos == NULL) {
2084 Perl_croak(aTHX_ "NULL regexp parameter");
2085 return 0;
2086 }
2087
bbe252da 2088 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2089 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2090
f2ed9b32 2091 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2092 DEBUG_EXECUTE_r(
f2ed9b32 2093 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2094 "Matching");
2095 );
bac06658 2096
6eb5f6b9 2097 minlen = prog->minlen;
1de06328
YO
2098
2099 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2101 "String too short [regexec_flags]...\n"));
2102 goto phooey;
1aa99e6b 2103 }
6eb5f6b9 2104
1de06328 2105
6eb5f6b9 2106 /* Check validity of program. */
f8fc2ecf 2107 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2108 Perl_croak(aTHX_ "corrupted regexp program");
2109 }
2110
2111 PL_reg_flags = 0;
ed301438 2112 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2113 PL_reg_maxiter = 0;
2114
3c8556c3 2115 if (RX_UTF8(rx))
6eb5f6b9
JH
2116 PL_reg_flags |= RF_utf8;
2117
2118 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2119 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2120 PL_bostr = strbeg;
3b0527fe 2121 reginfo.sv = sv;
6eb5f6b9
JH
2122
2123 /* Mark end of line for $ (and such) */
2124 PL_regeol = strend;
2125
2126 /* see how far we have to get to not match where we matched before */
3b0527fe 2127 reginfo.till = startpos+minend;
6eb5f6b9 2128
6eb5f6b9
JH
2129 /* If there is a "must appear" string, look for it. */
2130 s = startpos;
2131
bbe252da 2132 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2133 MAGIC *mg;
2c296965 2134 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2135 reginfo.ganch = startpos + prog->gofs;
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2138 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2139 && SvMAGIC(sv)
14befaf4
DM
2140 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141 && mg->mg_len >= 0) {
3b0527fe 2142 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2145
bbe252da 2146 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2147 if (s > reginfo.ganch)
6eb5f6b9 2148 goto phooey;
58e23c8d 2149 s = reginfo.ganch - prog->gofs;
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2152 if (s < strbeg)
2153 goto phooey;
6eb5f6b9
JH
2154 }
2155 }
58e23c8d 2156 else if (data) {
70685ca0 2157 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161 } else { /* pos() not defined */
3b0527fe 2162 reginfo.ganch = strbeg;
2c296965
YO
2163 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164 "GPOS: reginfo.ganch = strbeg\n"));
2165 }
6eb5f6b9 2166 }
288b8c02 2167 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2168 /* We have to be careful. If the previous successful match
2169 was from this regex we don't want a subsequent partially
2170 successful match to clobber the old results.
2171 So when we detect this possibility we add a swap buffer
2172 to the re, and switch the buffer each match. If we fail
2173 we switch it back, otherwise we leave it swapped.
2174 */
2175 swap = prog->offs;
2176 /* do we need a save destructor here for eval dies? */
2177 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2178 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180 PTR2UV(prog),
2181 PTR2UV(swap),
2182 PTR2UV(prog->offs)
2183 ));
c74340f9 2184 }
a0714e2c 2185 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2186 re_scream_pos_data d;
2187
2188 d.scream_olds = &scream_olds;
2189 d.scream_pos = &scream_pos;
288b8c02 2190 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2191 if (!s) {
a3621e74 2192 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2193 goto phooey; /* not present */
3fa9c3d7 2194 }
6eb5f6b9
JH
2195 }
2196
1de06328 2197
6eb5f6b9
JH
2198
2199 /* Simplest case: anchored match need be tried only once. */
2200 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2201 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2202 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2203 goto got_it;
bbe252da
YO
2204 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2206 {
2207 char *end;
2208
2209 if (minlen)
2210 dontbother = minlen - 1;
1aa99e6b 2211 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2212 /* for multiline we only have to try after newlines */
33b8afdf 2213 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2214 /* because of the goto we can not easily reuse the macros for bifurcating the
2215 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216 if (utf8_target) {
2217 if (s == startpos)
2218 goto after_try_utf8;
2219 while (1) {
2220 if (regtry(&reginfo, &s)) {
2221 goto got_it;
2222 }
2223 after_try_utf8:
2224 if (s > end) {
2225 goto phooey;
2226 }
2227 if (prog->extflags & RXf_USE_INTUIT) {
2228 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229 if (!s) {
2230 goto phooey;
2231 }
2232 }
2233 else {
2234 s += UTF8SKIP(s);
2235 }
2236 }
2237 } /* end search for check string in unicode */
2238 else {
2239 if (s == startpos) {
2240 goto after_try_latin;
2241 }
2242 while (1) {
2243 if (regtry(&reginfo, &s)) {
2244 goto got_it;
2245 }
2246 after_try_latin:
2247 if (s > end) {
2248 goto phooey;
2249 }
2250 if (prog->extflags & RXf_USE_INTUIT) {
2251 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252 if (!s) {
2253 goto phooey;
2254 }
2255 }
2256 else {
2257 s++;
2258 }
2259 }
2260 } /* end search for check string in latin*/
2261 } /* end search for check string */
2262 else { /* search for newline */
2263 if (s > startpos) {
2264 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2265 s--;
92f3d482 2266 }
21eede78
YO
2267 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2269 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2270 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2271 goto got_it;
2272 }
92f3d482
YO
2273 }
2274 } /* end search for newline */
2275 } /* end anchored/multiline check string search */
6eb5f6b9 2276 goto phooey;
bbe252da 2277 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2278 {
486ec47a 2279 /* the warning about reginfo.ganch being used without initialization
bbe252da 2280 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2281 and we only enter this block when the same bit is set. */
58e23c8d 2282 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2283
2284 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2285 goto got_it;
2286 goto phooey;
2287 }
2288
2289 /* Messy cases: unanchored match. */
bbe252da 2290 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2291 /* we have /x+whatever/ */
f2ed9b32 2292 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2293 char ch;
bf93d4cc
GS
2294#ifdef DEBUGGING
2295 int did_match = 0;
2296#endif
f2ed9b32
KW
2297 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2300
f2ed9b32 2301 if (utf8_target) {
4cadc6a9 2302 REXEC_FBC_SCAN(
6eb5f6b9 2303 if (*s == ch) {
a3621e74 2304 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2305 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2306 s += UTF8SKIP(s);
2307 while (s < strend && *s == ch)
2308 s += UTF8SKIP(s);
2309 }
4cadc6a9 2310 );
6eb5f6b9
JH
2311 }
2312 else {
4cadc6a9 2313 REXEC_FBC_SCAN(
6eb5f6b9 2314 if (*s == ch) {
a3621e74 2315 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2316 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2317 s++;
2318 while (s < strend && *s == ch)
2319 s++;
2320 }
4cadc6a9 2321 );
6eb5f6b9 2322 }
a3621e74 2323 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2324 PerlIO_printf(Perl_debug_log,
b7953727
JH
2325 "Did not find anchored character...\n")
2326 );
6eb5f6b9 2327 }
a0714e2c
SS
2328 else if (prog->anchored_substr != NULL
2329 || prog->anchored_utf8 != NULL
2330 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2331 && prog->float_max_offset < strend - s)) {
2332 SV *must;
2333 I32 back_max;
2334 I32 back_min;
2335 char *last;
6eb5f6b9 2336 char *last1; /* Last position checked before */
bf93d4cc
GS
2337#ifdef DEBUGGING
2338 int did_match = 0;
2339#endif
33b8afdf 2340 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2341 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2344 back_max = back_min = prog->anchored_offset;
2345 } else {
f2ed9b32
KW
2346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2349 back_max = prog->float_max_offset;
2350 back_min = prog->float_min_offset;
2351 }
1de06328
YO
2352
2353
33b8afdf
JH
2354 if (must == &PL_sv_undef)
2355 /* could not downgrade utf8 check substring, so must fail */
2356 goto phooey;
2357
1de06328
YO
2358 if (back_min<0) {
2359 last = strend;
2360 } else {
2361 last = HOP3c(strend, /* Cannot start after this */
2362 -(I32)(CHR_SVLEN(must)
2363 - (SvTAIL(must) != 0) + back_min), strbeg);
2364 }
6eb5f6b9
JH
2365 if (s > PL_bostr)
2366 last1 = HOPc(s, -1);
2367 else
2368 last1 = s - 1; /* bogus */
2369
a0288114 2370 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2371 check_substr==must. */
2372 scream_pos = -1;
2373 dontbother = end_shift;
2374 strend = HOPc(strend, -dontbother);
2375 while ( (s <= last) &&
c33e64f0 2376 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2377 (unsigned char*)strend, must,
c33e64f0 2378 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2379 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2380 if (HOPc(s, -back_max) > last1) {
2381 last1 = HOPc(s, -back_min);
2382 s = HOPc(s, -back_max);
2383 }
2384 else {
52657f30 2385 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2386
2387 last1 = HOPc(s, -back_min);
52657f30 2388 s = t;
6eb5f6b9 2389 }
f2ed9b32 2390 if (utf8_target) {
6eb5f6b9 2391 while (s <= last1) {
24b23f37 2392 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2393 goto got_it;
2394 s += UTF8SKIP(s);
2395 }
2396 }
2397 else {
2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s++;
2402 }
2403 }
2404 }
ab3bbdeb 2405 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2406 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2407 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2409 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2410 ? "anchored" : "floating"),
ab3bbdeb
YO
2411 quoted, RE_SV_TAIL(must));
2412 });
6eb5f6b9
JH
2413 goto phooey;
2414 }
f8fc2ecf 2415 else if ( (c = progi->regstclass) ) {
f14c76ed 2416 if (minlen) {
f8fc2ecf 2417 const OPCODE op = OP(progi->regstclass);
66e933ab 2418 /* don't bother with what can't match */
786e8c11 2419 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2420 strend = HOPc(strend, -(minlen - 1));
2421 }
a3621e74 2422 DEBUG_EXECUTE_r({
be8e71aa 2423 SV * const prop = sv_newmortal();
32fc9b6a 2424 regprop(prog, prop, c);
0df25f3d 2425 {
f2ed9b32 2426 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2427 s,strend-s,60);
0df25f3d 2428 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2429 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2430 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2431 quoted, (int)(strend - s));
0df25f3d 2432 }
ffc61ed2 2433 });
3b0527fe 2434 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2435 goto got_it;
07be1b83 2436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2437 }
2438 else {
2439 dontbother = 0;
a0714e2c 2440 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2441 /* Trim the end. */
6af40bd7 2442 char *last= NULL;
33b8afdf 2443 SV* float_real;
c33e64f0
FC
2444 STRLEN len;
2445 const char *little;
33b8afdf 2446
f2ed9b32
KW
2447 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2450
c33e64f0
FC
2451 little = SvPV_const(float_real, len);
2452 if (SvTAIL(float_real)) {
1a13b075
YO
2453 /* This means that float_real contains an artificial \n on the end
2454 * due to the presence of something like this: /foo$/
2455 * where we can match both "foo" and "foo\n" at the end of the string.
2456 * So we have to compare the end of the string first against the float_real
2457 * without the \n and then against the full float_real with the string.
2458 * We have to watch out for cases where the string might be smaller
2459 * than the float_real or the float_real without the \n.
2460 */
2461 char *checkpos= strend - len;
2462 DEBUG_OPTIMISE_r(
2463 PerlIO_printf(Perl_debug_log,
2464 "%sChecking for float_real.%s\n",
2465 PL_colors[4], PL_colors[5]));
2466 if (checkpos + 1 < strbeg) {
2467 /* can't match, even if we remove the trailing \n string is too short to match */
2468 DEBUG_EXECUTE_r(
2469 PerlIO_printf(Perl_debug_log,
2470 "%sString shorter than required trailing substring, cannot match.%s\n",
2471 PL_colors[4], PL_colors[5]));
2472 goto phooey;
2473 } else if (memEQ(checkpos + 1, little, len - 1)) {
2474 /* can match, the end of the string matches without the "\n" */
2475 last = checkpos + 1;
2476 } else if (checkpos < strbeg) {
2477 /* cant match, string is too short when the "\n" is included */
2478 DEBUG_EXECUTE_r(
2479 PerlIO_printf(Perl_debug_log,
2480 "%sString does not contain required trailing substring, cannot match.%s\n",
2481 PL_colors[4], PL_colors[5]));
2482 goto phooey;
2483 } else if (!multiline) {
2484 /* non multiline match, so compare with the "\n" at the end of the string */
2485 if (memEQ(checkpos, little, len)) {
2486 last= checkpos;
2487 } else {
2488 DEBUG_EXECUTE_r(
2489 PerlIO_printf(Perl_debug_log,
2490 "%sString does not contain required trailing substring, cannot match.%s\n",
2491 PL_colors[4], PL_colors[5]));
2492 goto phooey;
2493 }
2494 } else {
2495 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2496 goto find_last;
1a13b075 2497 }
c33e64f0 2498 } else {
d6a28714 2499 find_last:
9041c2e3 2500 if (len)
d6a28714 2501 last = rninstr(s, strend, little, little + len);
b8c5462f 2502 else
a0288114 2503 last = strend; /* matching "$" */
b8c5462f 2504 }
6af40bd7
YO
2505 if (!last) {
2506 /* at one point this block contained a comment which was probably
2507 * incorrect, which said that this was a "should not happen" case.
2508 * Even if it was true when it was written I am pretty sure it is
2509 * not anymore, so I have removed the comment and replaced it with
2510 * this one. Yves */
6bda09f9
YO
2511 DEBUG_EXECUTE_r(
2512 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2513 "String does not contain required substring, cannot match.\n"
2514 ));
2515 goto phooey;
bf93d4cc 2516 }
d6a28714
JH
2517 dontbother = strend - last + prog->float_min_offset;
2518 }
2519 if (minlen && (dontbother < minlen))
2520 dontbother = minlen - 1;
2521 strend -= dontbother; /* this one's always in bytes! */
2522 /* We don't know much -- general case. */
f2ed9b32 2523 if (utf8_target) {
d6a28714 2524 for (;;) {
24b23f37 2525 if (regtry(&reginfo, &s))
d6a28714
JH
2526 goto got_it;
2527 if (s >= strend)
2528 break;
b8c5462f 2529 s += UTF8SKIP(s);
d6a28714
JH
2530 };
2531 }
2532 else {
2533 do {
24b23f37 2534 if (regtry(&reginfo, &s))
d6a28714
JH
2535 goto got_it;
2536 } while (s++ < strend);
2537 }
2538 }
2539
2540 /* Failure. */
2541 goto phooey;
2542
2543got_it:
495f47a5
DM
2544 DEBUG_BUFFERS_r(
2545 if (swap)
2546 PerlIO_printf(Perl_debug_log,
2547 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548 PTR2UV(prog),
2549 PTR2UV(swap)
2550 );
2551 );
e9105d30 2552 Safefree(swap);
288b8c02 2553 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2554
ed301438 2555 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2556 restore_pos(aTHX_ prog);
5daac39c
NC
2557 if (RXp_PAREN_NAMES(prog))
2558 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2559
2560 /* make sure $`, $&, $', and $digit will work later */
2561 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2562 RX_MATCH_COPY_FREE(rx);
d6a28714 2563 if (flags & REXEC_COPY_STR) {
be8e71aa 2564 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2565#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2566 if ((SvIsCOW(sv)
2567 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2568 if (DEBUG_C_TEST) {
2569 PerlIO_printf(Perl_debug_log,
2570 "Copy on write: regexp capture, type %d\n",
2571 (int) SvTYPE(sv));
2572 }
2573 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2574 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2575 assert (SvPOKp(prog->saved_copy));
2576 } else
2577#endif
2578 {
288b8c02 2579 RX_MATCH_COPIED_on(rx);
ed252734
NC
2580 s = savepvn(strbeg, i);
2581 prog->subbeg = s;
2582 }
d6a28714 2583 prog->sublen = i;
d6a28714
JH
2584 }
2585 else {
2586 prog->subbeg = strbeg;
2587 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2588 }
2589 }
9041c2e3 2590
d6a28714
JH
2591 return 1;
2592
2593phooey:
a3621e74 2594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2595 PL_colors[4], PL_colors[5]));
ed301438 2596 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2597 restore_pos(aTHX_ prog);
e9105d30 2598 if (swap) {
c74340f9 2599 /* we failed :-( roll it back */
495f47a5
DM
2600 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2601 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2602 PTR2UV(prog),
2603 PTR2UV(prog->offs),
2604 PTR2UV(swap)
2605 ));
e9105d30
GG
2606 Safefree(prog->offs);
2607 prog->offs = swap;
2608 }
2609
d6a28714
JH
2610 return 0;
2611}
2612
6bda09f9 2613
ec43f78b
DM
2614/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2615 * Do inc before dec, in case old and new rex are the same */
2616#define SET_reg_curpm(Re2) \
2617 if (PL_reg_state.re_state_eval_setup_done) { \
2618 (void)ReREFCNT_inc(Re2); \
2619 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2620 PM_SETRE((PL_reg_curpm), (Re2)); \
2621 }
2622
2623
d6a28714
JH
2624/*
2625 - regtry - try match at specific point
2626 */
2627STATIC I32 /* 0 failure, 1 success */
24b23f37 2628S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2629{
97aff369 2630 dVAR;
d6a28714 2631 CHECKPOINT lastcp;
288b8c02
NC
2632 REGEXP *const rx = reginfo->prog;
2633 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2634 RXi_GET_DECL(prog,progi);
a3621e74 2635 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2636
2637 PERL_ARGS_ASSERT_REGTRY;
2638
24b23f37 2639 reginfo->cutpoint=NULL;
d6a28714 2640
ed301438
DM
2641 if ((prog->extflags & RXf_EVAL_SEEN)
2642 && !PL_reg_state.re_state_eval_setup_done)
2643 {
d6a28714
JH
2644 MAGIC *mg;
2645
ed301438 2646 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2647 if (reginfo->sv) {
d6a28714 2648 /* Make $_ available to executed code. */
3b0527fe 2649 if (reginfo->sv != DEFSV) {
59f00321 2650 SAVE_DEFSV;
414bf5ae 2651 DEFSV_set(reginfo->sv);
b8c5462f 2652 }
d6a28714 2653
3b0527fe
DM
2654 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2655 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2656 /* prepare for quick setting of pos */
d300d9fa 2657#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2658 if (SvIsCOW(reginfo->sv))
2659 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2660#endif
3dab1dad 2661 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2662 &PL_vtbl_mglob, NULL, 0);
d6a28714 2663 mg->mg_len = -1;
b8c5462f 2664 }
d6a28714
JH
2665 PL_reg_magic = mg;
2666 PL_reg_oldpos = mg->mg_len;
4f639d21 2667 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2668 }
09687e5a 2669 if (!PL_reg_curpm) {
a02a5408 2670 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2671#ifdef USE_ITHREADS
2672 {
14a49a24 2673 SV* const repointer = &PL_sv_undef;
92313705
NC
2674 /* this regexp is also owned by the new PL_reg_curpm, which
2675 will try to free it. */
d2ece331 2676 av_push(PL_regex_padav, repointer);
09687e5a
AB
2677 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2678 PL_regex_pad = AvARRAY(PL_regex_padav);
2679 }
2680#endif
2681 }
ec43f78b 2682 SET_reg_curpm(rx);
d6a28714
JH
2683 PL_reg_oldcurpm = PL_curpm;
2684 PL_curpm = PL_reg_curpm;
07bc277f 2685 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2686 /* Here is a serious problem: we cannot rewrite subbeg,
2687 since it may be needed if this match fails. Thus
2688 $` inside (?{}) could fail... */
2689 PL_reg_oldsaved = prog->subbeg;
2690 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2691#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2692 PL_nrs = prog->saved_copy;
2693#endif
07bc277f 2694 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2695 }
2696 else
bd61b366 2697 PL_reg_oldsaved = NULL;
d6a28714
JH
2698 prog->subbeg = PL_bostr;
2699 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2700 }
97ca13b7
DM
2701#ifdef DEBUGGING
2702 PL_reg_starttry = *startpos;
2703#endif
f0ab9afb 2704 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2705 PL_reginput = *startpos;
d6a28714 2706 prog->lastparen = 0;
03994de8 2707 prog->lastcloseparen = 0;
d6a28714 2708 PL_regsize = 0;
d6a28714
JH
2709
2710 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2711 to do this again and again, prog->lastparen should take care of
3dd2943c 2712 this! --ilya*/
dafc8851
JH
2713
2714 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2715 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2716 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2717 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2718 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2719 * Meanwhile, this code *is* needed for the
daf18116
JH
2720 * above-mentioned test suite tests to succeed. The common theme
2721 * on those tests seems to be returning null fields from matches.
225593e1 2722 * --jhi updated by dapm */
dafc8851 2723#if 1
d6a28714 2724 if (prog->nparens) {
b93070ed 2725 regexp_paren_pair *pp = prog->offs;
eb578fdb 2726 I32 i;
b93070ed 2727 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2728 ++pp;
2729 pp->start = -1;
2730 pp->end = -1;
d6a28714
JH
2731 }
2732 }
dafc8851 2733#endif
02db2b7b 2734 REGCP_SET(lastcp);
f8fc2ecf 2735 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2736 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2737 return 1;
2738 }
24b23f37
YO
2739 if (reginfo->cutpoint)
2740 *startpos= reginfo->cutpoint;
02db2b7b 2741 REGCP_UNWIND(lastcp);
d6a28714
JH
2742 return 0;
2743}
2744
02db2b7b 2745
8ba1375e
MJD
2746#define sayYES goto yes
2747#define sayNO goto no
262b90c4 2748#define sayNO_SILENT goto no_silent
8ba1375e 2749
f9f4320a
YO
2750/* we dont use STMT_START/END here because it leads to
2751 "unreachable code" warnings, which are bogus, but distracting. */
2752#define CACHEsayNO \
c476f425
DM
2753 if (ST.cache_mask) \
2754 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2755 sayNO
3298f257 2756
a3621e74 2757/* this is used to determine how far from the left messages like
265c4333
YO
2758 'failed...' are printed. It should be set such that messages
2759 are inline with the regop output that created them.
a3621e74 2760*/
265c4333 2761#define REPORT_CODE_OFF 32
a3621e74
YO
2762
2763
40a82448
DM
2764#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2765#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2766
86545054
DM
2767#define SLAB_FIRST(s) (&(s)->states[0])
2768#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2769
5d9a96ca
DM
2770/* grab a new slab and return the first slot in it */
2771
2772STATIC regmatch_state *
2773S_push_slab(pTHX)
2774{
a35a87e7 2775#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2776 dMY_CXT;
2777#endif
5d9a96ca
DM
2778 regmatch_slab *s = PL_regmatch_slab->next;
2779 if (!s) {
2780 Newx(s, 1, regmatch_slab);
2781 s->prev = PL_regmatch_slab;
2782 s->next = NULL;
2783 PL_regmatch_slab->next = s;
2784 }
2785 PL_regmatch_slab = s;
86545054 2786 return SLAB_FIRST(s);
5d9a96ca 2787}
5b47454d 2788
95b24440 2789
40a82448
DM
2790/* push a new state then goto it */
2791
2792#define PUSH_STATE_GOTO(state, node) \
2793 scan = node; \
2794 st->resume_state = state; \
2795 goto push_state;
2796
2797/* push a new state with success backtracking, then goto it */
2798
2799#define PUSH_YES_STATE_GOTO(state, node) \
2800 scan = node; \
2801 st->resume_state = state; \
2802 goto push_yes_state;
2803
aa283a38 2804
aa283a38 2805
d6a28714 2806/*
95b24440 2807
bf1f174e
DM
2808regmatch() - main matching routine
2809
2810This is basically one big switch statement in a loop. We execute an op,
2811set 'next' to point the next op, and continue. If we come to a point which
2812we may need to backtrack to on failure such as (A|B|C), we push a
2813backtrack state onto the backtrack stack. On failure, we pop the top
2814state, and re-enter the loop at the state indicated. If there are no more
2815states to pop, we return failure.
2816
2817Sometimes we also need to backtrack on success; for example /A+/, where
2818after successfully matching one A, we need to go back and try to
2819match another one; similarly for lookahead assertions: if the assertion
2820completes successfully, we backtrack to the state just before the assertion
2821and then carry on. In these cases, the pushed state is marked as
2822'backtrack on success too'. This marking is in fact done by a chain of
2823pointers, each pointing to the previous 'yes' state. On success, we pop to
2824the nearest yes state, discarding any intermediate failure-only states.
2825Sometimes a yes state is pushed just to force some cleanup code to be
2826called at the end of a successful match or submatch; e.g. (??{$re}) uses
2827it to free the inner regex.
2828
2829Note that failure backtracking rewinds the cursor position, while
2830success backtracking leaves it alone.
2831
2832A pattern is complete when the END op is executed, while a subpattern
2833such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2834ops trigger the "pop to last yes state if any, otherwise return true"
2835behaviour.
2836
2837A common convention in this function is to use A and B to refer to the two
2838subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2839the subpattern to be matched possibly multiple times, while B is the entire
2840rest of the pattern. Variable and state names reflect this convention.
2841
2842The states in the main switch are the union of ops and failure/success of
2843substates associated with with that op. For example, IFMATCH is the op
2844that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2845'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2846successfully matched A and IFMATCH_A_fail is a state saying that we have
2847just failed to match A. Resume states always come in pairs. The backtrack
2848state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2849at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2850on success or failure.
2851
2852The struct that holds a backtracking state is actually a big union, with
2853one variant for each major type of op. The variable st points to the
2854top-most backtrack struct. To make the code clearer, within each
2855block of code we #define ST to alias the relevant union.
2856
2857Here's a concrete example of a (vastly oversimplified) IFMATCH
2858implementation:
2859
2860 switch (state) {
2861 ....
2862
2863#define ST st->u.ifmatch
2864
2865 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2866 ST.foo = ...; // some state we wish to save
95b24440 2867 ...
bf1f174e
DM
2868 // push a yes backtrack state with a resume value of
2869 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2870 // first node of A:
2871 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2872 // NOTREACHED
2873
2874 case IFMATCH_A: // we have successfully executed A; now continue with B
2875 next = B;
2876 bar = ST.foo; // do something with the preserved value
2877 break;
2878
2879 case IFMATCH_A_fail: // A failed, so the assertion failed
2880 ...; // do some housekeeping, then ...
2881 sayNO; // propagate the failure
2882
2883#undef ST
95b24440 2884
bf1f174e
DM
2885 ...
2886 }
95b24440 2887
bf1f174e
DM
2888For any old-timers reading this who are familiar with the old recursive
2889approach, the code above is equivalent to:
95b24440 2890
bf1f174e
DM
2891 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2892 {
2893 int foo = ...
95b24440 2894 ...
bf1f174e
DM
2895 if (regmatch(A)) {
2896 next = B;
2897 bar = foo;
2898 break;
95b24440 2899 }
bf1f174e
DM
2900 ...; // do some housekeeping, then ...
2901 sayNO; // propagate the failure
95b24440 2902 }
bf1f174e
DM
2903
2904The topmost backtrack state, pointed to by st, is usually free. If you
2905want to claim it, populate any ST.foo fields in it with values you wish to
2906save, then do one of
2907
2908 PUSH_STATE_GOTO(resume_state, node);
2909 PUSH_YES_STATE_GOTO(resume_state, node);
2910
2911which sets that backtrack state's resume value to 'resume_state', pushes a
2912new free entry to the top of the backtrack stack, then goes to 'node'.
2913On backtracking, the free slot is popped, and the saved state becomes the
2914new free state. An ST.foo field in this new top state can be temporarily
2915accessed to retrieve values, but once the main loop is re-entered, it
2916becomes available for reuse.
2917
2918Note that the depth of the backtrack stack constantly increases during the
2919left-to-right execution of the pattern, rather than going up and down with
2920the pattern nesting. For example the stack is at its maximum at Z at the
2921end of the pattern, rather than at X in the following:
2922
2923 /(((X)+)+)+....(Y)+....Z/
2924
2925The only exceptions to this are lookahead/behind assertions and the cut,
2926(?>A), which pop all the backtrack states associated with A before
2927continuing.
2928
486ec47a 2929Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2930PL_regmatch_state and st always point to the currently active state,
2931and PL_regmatch_slab points to the slab currently containing
2932PL_regmatch_state. The first time regmatch() is called, the first slab is
2933allocated, and is never freed until interpreter destruction. When the slab
2934is full, a new one is allocated and chained to the end. At exit from
2935regmatch(), slabs allocated since entry are freed.
2936
2937*/
95b24440 2938
40a82448 2939
5bc10b2c 2940#define DEBUG_STATE_pp(pp) \
265c4333 2941 DEBUG_STATE_r({ \
f2ed9b32 2942 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2943 PerlIO_printf(Perl_debug_log, \
5d458dd8 2944 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2945 depth*2, "", \
13d6edb4 2946 PL_reg_name[st->resume_state], \
5d458dd8
YO
2947 ((st==yes_state||st==mark_state) ? "[" : ""), \
2948 ((st==yes_state) ? "Y" : ""), \
2949 ((st==mark_state) ? "M" : ""), \
2950 ((st==yes_state||st==mark_state) ? "]" : "") \
2951 ); \
265c4333 2952 });
5bc10b2c 2953
40a82448 2954
3dab1dad 2955#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2956
3df15adc 2957#ifdef DEBUGGING
5bc10b2c 2958
ab3bbdeb 2959STATIC void
f2ed9b32 2960S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2961 const char *start, const char *end, const char *blurb)
2962{
efd26800 2963 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2964
2965 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2966
ab3bbdeb
YO
2967 if (!PL_colorset)
2968 reginitcolors();
2969 {
2970 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2971 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2972
f2ed9b32 2973 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2974 start, end - start, 60);
2975
2976 PerlIO_printf(Perl_debug_log,
2977 "%s%s REx%s %s against %s\n",
2978 PL_colors[4], blurb, PL_colors[5], s0, s1);
2979
f2ed9b32 2980 if (utf8_target||utf8_pat)
1de06328
YO
2981 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2982 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2983 utf8_pat && utf8_target ? " and " : "",
2984 utf8_target ? "string" : ""
ab3bbdeb
YO
2985 );
2986 }
2987}
3df15adc
YO
2988
2989STATIC void
786e8c11
YO
2990S_dump_exec_pos(pTHX_ const char *locinput,
2991 const regnode *scan,
2992 const char *loc_regeol,
2993 const char *loc_bostr,
2994 const char *loc_reg_starttry,
f2ed9b32 2995 const bool utf8_target)
07be1b83 2996{
786e8c11 2997 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2998 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2999 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3000 /* The part of the string before starttry has one color
3001 (pref0_len chars), between starttry and current
3002 position another one (pref_len - pref0_len chars),
3003 after the current position the third one.
3004 We assume that pref0_len <= pref_len, otherwise we
3005 decrease pref0_len. */
786e8c11
YO
3006 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3007 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3008 int pref0_len;
3009
7918f24d
NC
3010 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3011
f2ed9b32 3012 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3013 pref_len++;
786e8c11
YO
3014 pref0_len = pref_len - (locinput - loc_reg_starttry);
3015 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3016 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3017 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3018 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3019 l--;
3020 if (pref0_len < 0)
3021 pref0_len = 0;
3022 if (pref0_len > pref_len)
3023 pref0_len = pref_len;
3024 {
f2ed9b32 3025 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3026
ab3bbdeb 3027 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3028 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3029
ab3bbdeb 3030 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3031 (locinput - pref_len + pref0_len),
1de06328 3032 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3033
ab3bbdeb 3034 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3035 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3036
1de06328 3037 const STRLEN tlen=len0+len1+len2;
3df15adc 3038 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3039 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3040 (IV)(locinput - loc_bostr),
07be1b83 3041 len0, s0,
07be1b83 3042 len1, s1,
07be1b83 3043 (docolor ? "" : "> <"),
07be1b83 3044 len2, s2,
f9f4320a 3045 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3046 "");
3047 }
3048}
3df15adc 3049
07be1b83
YO
3050#endif
3051
0a4db386
YO
3052/* reg_check_named_buff_matched()
3053 * Checks to see if a named buffer has matched. The data array of
3054 * buffer numbers corresponding to the buffer is expected to reside
3055 * in the regexp->data->data array in the slot stored in the ARG() of
3056 * node involved. Note that this routine doesn't actually care about the
3057 * name, that information is not preserved from compilation to execution.
3058 * Returns the index of the leftmost defined buffer with the given name
3059 * or 0 if non of the buffers matched.
3060 */
3061STATIC I32
7918f24d
NC
3062S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3063{
0a4db386 3064 I32 n;
f8fc2ecf 3065 RXi_GET_DECL(rex,rexi);
ad64d0ec 3066 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3067 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3068
3069 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3070
0a4db386 3071 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3072 if ((I32)rex->lastparen >= nums[n] &&
3073 rex->offs[nums[n]].end != -1)
0a4db386
YO
3074 {
3075 return nums[n];
3076 }
3077 }
3078 return 0;
3079}
3080
2f554ef7
DM
3081
3082/* free all slabs above current one - called during LEAVE_SCOPE */
3083
3084STATIC void
3085S_clear_backtrack_stack(pTHX_ void *p)
3086{
3087 regmatch_slab *s = PL_regmatch_slab->next;
3088 PERL_UNUSED_ARG(p);
3089
3090 if (!s)
3091 return;
3092 PL_regmatch_slab->next = NULL;
3093 while (s) {
3094 regmatch_slab * const osl = s;
3095 s = s->next;
3096 Safefree(osl);
3097 }
3098}
3099
3100
d6a28714 3101STATIC I32 /* 0 failure, 1 success */
24b23f37 3102S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3103{
a35a87e7 3104#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3105 dMY_CXT;
3106#endif
27da23d5 3107 dVAR;
eb578fdb 3108 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3109 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3110 REGEXP *rex_sv = reginfo->prog;
3111 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3112 RXi_GET_DECL(rex,rexi);
2f554ef7 3113 I32 oldsave;
5d9a96ca 3114 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3115 regmatch_state *st;
5d9a96ca 3116 /* cache heavy used fields of st in registers */
eb578fdb
KW
3117 regnode *scan;
3118 regnode *next;
3119 U32 n = 0; /* general value; init to avoid compiler warning */
3120 I32 ln = 0; /* len or last; init to avoid compiler warning */
3121 char *locinput = PL_reginput;
3122 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3123
b69b0499 3124 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3125 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3126 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3127 const U32 max_nochange_depth =
3128 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3129 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3130 regmatch_state *yes_state = NULL; /* state to pop to on success of
3131 subpattern */
e2e6a0f1
YO
3132 /* mark_state piggy backs on the yes_state logic so that when we unwind
3133 the stack on success we can update the mark_state as we go */
3134 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3135 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3136 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3137 U32 state_num;
5d458dd8
YO
3138 bool no_final = 0; /* prevent failure from backtracking? */
3139 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3140 char *startpoint = PL_reginput;
5d458dd8
YO
3141 SV *popmark = NULL; /* are we looking for a mark? */
3142 SV *sv_commit = NULL; /* last mark name seen in failure */
3143 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3144 during a successful match */
5d458dd8
YO
3145 U32 lastopen = 0; /* last open we saw */
3146 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3147 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3148 /* these three flags are set by various ops to signal information to
3149 * the very next op. They have a useful lifetime of exactly one loop
3150 * iteration, and are not preserved or restored by state pushes/pops
3151 */
3152 bool sw = 0; /* the condition value in (?(cond)a|b) */
3153 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3154 int logical = 0; /* the following EVAL is:
3155 0: (?{...})
3156 1: (?(?{...})X|Y)
3157 2: (??{...})
3158 or the following IFMATCH/UNLESSM is:
3159 false: plain (?=foo)
3160 true: used as a condition: (?(?=foo))
3161 */
81ed78b2
DM
3162 PAD* last_pad = NULL;
3163 dMULTICALL;
3164 I32 gimme = G_SCALAR;
3165 CV *caller_cv = NULL; /* who called us */
3166 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3167 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3168
95b24440 3169#ifdef DEBUGGING
e68ec53f 3170 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3171#endif
3172
81ed78b2
DM
3173 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3174 multicall_oldcatch = 0;
3175 multicall_cv = NULL;
3176 cx = NULL;
4f8dbb2d
JL
3177 PERL_UNUSED_VAR(multicall_cop);
3178 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3179
3180
7918f24d
NC
3181 PERL_ARGS_ASSERT_REGMATCH;
3182
3b57cd43 3183 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3184 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3185 }));
5d9a96ca
DM
3186 /* on first ever call to regmatch, allocate first slab */
3187 if (!PL_regmatch_slab) {
3188 Newx(PL_regmatch_slab, 1, regmatch_slab);
3189 PL_regmatch_slab->prev = NULL;
3190 PL_regmatch_slab->next = NULL;
86545054 3191 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3192 }
3193
2f554ef7
DM
3194 oldsave = PL_savestack_ix;
3195 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3196 SAVEVPTR(PL_regmatch_slab);
3197 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3198
3199 /* grab next free state slot */
3200 st = ++PL_regmatch_state;
86545054 3201 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3202 st = PL_regmatch_state = S_push_slab(aTHX);
3203
d6a28714
JH
3204 /* Note that nextchr is a byte even in UTF */
3205 nextchr = UCHARAT(locinput);
3206 scan = prog;
3207 while (scan != NULL) {
8ba1375e 3208
a3621e74 3209 DEBUG_EXECUTE_r( {
6136c704 3210 SV * const prop = sv_newmortal();
1de06328 3211 regnode *rnext=regnext(scan);
f2ed9b32 3212 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3213 regprop(rex, prop, scan);
07be1b83
YO
3214
3215 PerlIO_printf(Perl_debug_log,
3216 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3217 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3218 SvPVX_const(prop),
1de06328 3219 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3220 0 : (IV)(rnext - rexi->program));
2a782b5b 3221 });
d6a28714
JH
3222
3223 next = scan + NEXT_OFF(scan);
3224 if (next == scan)
3225 next = NULL;
40a82448 3226 state_num = OP(scan);
d6a28714 3227
40a82448 3228 reenter_switch:
34a81e2b 3229
40a82448 3230 switch (state_num) {
d6a28714 3231 case BOL:
7fba1cd6 3232 if (locinput == PL_bostr)
d6a28714 3233 {
3b0527fe 3234 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3235 break;
3236 }
d6a28714
JH
3237 sayNO;
3238 case MBOL:
12d33761
HS
3239 if (locinput == PL_bostr ||
3240 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3241 {
b8c5462f
JH
3242 break;
3243 }
d6a28714
JH
3244 sayNO;
3245 case SBOL:
c2a73568 3246 if (locinput == PL_bostr)
b8c5462f 3247 break;
d6a28714
JH
3248 sayNO;
3249 case GPOS:
3b0527fe 3250 if (locinput == reginfo->ganch)
d6a28714
JH
3251 break;
3252 sayNO;
ee9b8eae
YO
3253
3254 case KEEPS:
3255 /* update the startpoint */
b93070ed 3256 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3257 PL_reginput = locinput;
b93070ed 3258 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3259 PUSH_STATE_GOTO(KEEPS_next, next);
3260 /*NOT-REACHED*/
3261 case KEEPS_next_fail:
3262 /* rollback the start point change */
b93070ed 3263 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3264 sayNO_SILENT;
3265 /*NOT-REACHED*/
d6a28714 3266 case EOL:
d6a28714
JH
3267 goto seol;
3268 case MEOL:
d6a28714 3269 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3270 sayNO;
b8c5462f 3271 break;
d6a28714
JH
3272 case SEOL:
3273 seol:
3274 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3275 sayNO;
d6a28714 3276 if (PL_regeol - locinput > 1)
b8c5462f 3277 sayNO;
b8c5462f 3278 break;
d6a28714
JH
3279 case EOS:
3280 if (PL_regeol != locinput)
b8c5462f 3281 sayNO;
d6a28714 3282 break;
ffc61ed2 3283 case SANY:
d6a28714 3284 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3285 sayNO;
f2ed9b32 3286 if (utf8_target) {
f33976b4
DB
3287 locinput += PL_utf8skip[nextchr];
3288 if (locinput > PL_regeol)
3289 sayNO;
3290 nextchr = UCHARAT(locinput);
3291 }
3292 else
3293 nextchr = UCHARAT(++locinput);
3294 break;
3295 case CANY:
3296 if (!nextchr && locinput >= PL_regeol)
3297 sayNO;
b8c5462f 3298 nextchr = UCHARAT(++locinput);
a0d0e21e 3299 break;
ffc61ed2 3300 case REG_ANY:
1aa99e6b
IH
3301 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3302 sayNO;
f2ed9b32 3303 if (utf8_target) {
b8c5462f 3304 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3305 if (locinput > PL_regeol)
3306 sayNO;
a0ed51b3 3307 nextchr = UCHARAT(locinput);
a0ed51b3 3308 }
1aa99e6b
IH
3309 else
3310 nextchr = UCHARAT(++locinput);
a0ed51b3 3311 break;
166ba7cd
DM
3312
3313#undef ST
3314#define ST st->u.trie
786e8c11
YO
3315 case TRIEC:
3316 /* In this case the charclass data is available inline so
3317 we can fail fast without a lot of extra overhead.
3318 */
fab2782b
YO
3319 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3320 DEBUG_EXECUTE_r(
3321 PerlIO_printf(Perl_debug_log,
3322 "%*s %sfailed to match trie start class...%s\n",
3323 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3324 );
3325 sayNO_SILENT;
118e2215 3326 assert(0); /* NOTREACHED */
786e8c11
YO
3327 }
3328 /* FALL THROUGH */
5b47454d 3329 case TRIE:
2e64971a
DM
3330 /* the basic plan of execution of the trie is:
3331 * At the beginning, run though all the states, and
3332 * find the longest-matching word. Also remember the position
3333 * of the shortest matching word. For example, this pattern:
3334 * 1 2 3 4 5
3335 * ab|a|x|abcd|abc
3336 * when matched against the string "abcde", will generate
3337 * accept states for all words except 3, with the longest
3338 * matching word being 4, and the shortest being 1 (with
3339 * the position being after char 1 of the string).
3340 *
3341 * Then for each matching word, in word order (i.e. 1,2,4,5),
3342 * we run the remainder of the pattern; on each try setting
3343 * the current position to the character following the word,
3344 * returning to try the next word on failure.
3345 *
3346 * We avoid having to build a list of words at runtime by
3347 * using a compile-time structure, wordinfo[].prev, which
3348 * gives, for each word, the previous accepting word (if any).
3349 * In the case above it would contain the mappings 1->2, 2->0,
3350 * 3->0, 4->5, 5->1. We can use this table to generate, from
3351 * the longest word (4 above), a list of all words, by
3352 * following the list of prev pointers; this gives us the
3353 * unordered list 4,5,1,2. Then given the current word we have
3354 * just tried, we can go through the list and find the
3355 * next-biggest word to try (so if we just failed on word 2,
3356 * the next in the list is 4).
3357 *
3358 * Since at runtime we don't record the matching position in
3359 * the string for each word, we have to work that out for
3360 * each word we're about to process. The wordinfo table holds
3361 * the character length of each word; given that we recorded
3362 * at the start: the position of the shortest word and its
3363 * length in chars, we just need to move the pointer the
3364 * difference between the two char lengths. Depending on
3365 * Unicode status and folding, that's cheap or expensive.
3366 *
3367 * This algorithm is optimised for the case where are only a
3368 * small number of accept states, i.e. 0,1, or maybe 2.
3369 * With lots of accepts states, and having to try all of them,
3370 * it becomes quadratic on number of accept states to find all
3371 * the next words.
3372 */
3373
3dab1dad 3374 {
07be1b83 3375 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3376 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3377
3378 /* what trie are we using right now */
be8e71aa 3379 reg_trie_data * const trie
f8fc2ecf 3380 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3381 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3382 U32 state = trie->startstate;
166ba7cd 3383
fab2782b 3384 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3385 if (trie->states[ state ].wordnum) {
3386 DEBUG_EXECUTE_r(
3387 PerlIO_printf(Perl_debug_log,
3388 "%*s %smatched empty string...%s\n",
5bc10b2c 3389 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3390 );
20dbff7c
YO
3391 if (!trie->jump)
3392 break;
3dab1dad
YO
3393 } else {
3394 DEBUG_EXECUTE_r(
3395 PerlIO_printf(Perl_debug_log,
786e8c11 3396 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3397 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3398 );
3399 sayNO_SILENT;
3400 }
3401 }
166ba7cd 3402
786e8c11
YO
3403 {
3404 U8 *uc = ( U8* )locinput;
3405
3406 STRLEN len = 0;
3407 STRLEN foldlen = 0;
3408 U8 *uscan = (U8*)NULL;
786e8c11 3409 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3410 U32 charcount = 0; /* how many input chars we have matched */
3411 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3412
786e8c11 3413 ST.jump = trie->jump;
786e8c11 3414 ST.me = scan;
2e64971a
DM
3415 ST.firstpos = NULL;
3416 ST.longfold = FALSE; /* char longer if folded => it's harder */
3417 ST.nextword = 0;
3418
3419 /* fully traverse the TRIE; note the position of the
3420 shortest accept state and the wordnum of the longest
3421 accept state */
07be1b83 3422
a3621e74 3423 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3424 U32 base = trie->states[ state ].trans.base;
f9f4320a 3425 UV uvc = 0;
acb909b4 3426 U16 charid = 0;