This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Subject: [PATCH] regexec.c: add and refactor macros
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
f2ed9b32 103#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
20d0b1e9 120/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
37e2e78e
KW
123
124/* Doesn't do an assert to verify that is correct */
125#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
127
1a4fad37
AL
128#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
129#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
130#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 131
37e2e78e 132#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
133 LOAD_UTF8_CHARCLASS(X_begin, " "); \
134 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
135 /* These are utf8 constants, and not utf-ebcdic constants, so the \
136 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 137 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
138 \
139 /* No asserts are done for these, in case called on an early \
140 * Unicode version in which they map to nothing */ \
d275fa5e
CB
141 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 148
d1eb3177
YO
149/*
150 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
151 so that it is possible to override the option here without having to
152 rebuild the entire core. as we are required to do if we change regcomp.h
153 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
154*/
155#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
156#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
157#endif
158
159#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
160#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
161#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
162#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
163#define RE_utf8_perl_word PL_utf8_alnum
164#define RE_utf8_perl_space PL_utf8_space
165#define RE_utf8_posix_digit PL_utf8_digit
166#define perl_word alnum
167#define perl_space space
168#define posix_digit digit
169#else
170#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
171#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
172#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
173#define RE_utf8_perl_word PL_utf8_perl_word
174#define RE_utf8_perl_space PL_utf8_perl_space
175#define RE_utf8_posix_digit PL_utf8_posix_digit
176#endif
177
178
b10ac0d8
KW
179#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
180 case NAMEL: \
20d0b1e9
YO
181 PL_reg_flags |= RF_tainted; \
182 /* FALL THROUGH */ \
b10ac0d8 183 case NAME: \
20d0b1e9
YO
184 if (!nextchr) \
185 sayNO; \
f2ed9b32 186 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
20d0b1e9
YO
187 if (!CAT2(PL_utf8_,CLASS)) { \
188 bool ok; \
189 ENTER; \
190 save_re_context(); \
191 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
192 assert(ok); \
193 LEAVE; \
194 } \
195 if (!(OP(scan) == NAME \
f2ed9b32 196 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
20d0b1e9
YO
197 : LCFUNC_utf8((U8*)locinput))) \
198 { \
199 sayNO; \
200 } \
201 locinput += PL_utf8skip[nextchr]; \
202 nextchr = UCHARAT(locinput); \
203 break; \
204 } \
b10ac0d8
KW
205 /* Finished up by calling macro */
206
207#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
208 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
20d0b1e9
YO
209 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
210 sayNO; \
211 nextchr = UCHARAT(++locinput); \
212 break
213
b10ac0d8
KW
214/* Almost identical to the above, but has a case for a node that matches chars
215 * between 128 and 255 using Unicode (latin1) semantics. */
216#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
217 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
218 if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
219 sayNO; \
220 nextchr = UCHARAT(++locinput); \
221 break
222
223#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
20d0b1e9
YO
224 case NAMEL: \
225 PL_reg_flags |= RF_tainted; \
226 /* FALL THROUGH */ \
227 case NAME : \
228 if (!nextchr && locinput >= PL_regeol) \
229 sayNO; \
f2ed9b32 230 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
20d0b1e9
YO
231 if (!CAT2(PL_utf8_,CLASS)) { \
232 bool ok; \
233 ENTER; \
234 save_re_context(); \
235 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
236 assert(ok); \
237 LEAVE; \
238 } \
239 if ((OP(scan) == NAME \
f2ed9b32 240 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
20d0b1e9
YO
241 : LCFUNC_utf8((U8*)locinput))) \
242 { \
243 sayNO; \
244 } \
245 locinput += PL_utf8skip[nextchr]; \
246 nextchr = UCHARAT(locinput); \
247 break; \
b10ac0d8
KW
248 }
249
250#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
251 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
20d0b1e9
YO
252 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
253 sayNO; \
254 nextchr = UCHARAT(++locinput); \
255 break
256
257
b10ac0d8
KW
258#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
259 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
260 if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
261 sayNO; \
262 nextchr = UCHARAT(++locinput); \
263 break
d1eb3177
YO
264
265
266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
296#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
302#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
304
305#endif
e2d8ce26 306
a84d97b6
HS
307/*
308 Search for mandatory following text node; for lookahead, the text must
309 follow but for lookbehind (rn->flags != 0) we skip to the next step.
310*/
cca55fe3 311#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
312 while (JUMPABLE(rn)) { \
313 const OPCODE type = OP(rn); \
314 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 315 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 316 else if (type == PLUS) \
cca55fe3 317 rn = NEXTOPER(rn); \
3dab1dad 318 else if (type == IFMATCH) \
a84d97b6 319 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 320 else rn += NEXT_OFF(rn); \
3dab1dad 321 } \
5f80c4cf 322} STMT_END
74750237 323
c476f425 324
acfe0abc 325static void restore_pos(pTHX_ void *arg);
51371543 326
620d5b66
NC
327#define REGCP_PAREN_ELEMS 4
328#define REGCP_OTHER_ELEMS 5
e0fa7e2b 329#define REGCP_FRAME_ELEMS 1
620d5b66
NC
330/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
331 * are needed for the regexp context stack bookkeeping. */
332
76e3520e 333STATIC CHECKPOINT
cea2e8a9 334S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 335{
97aff369 336 dVAR;
a3b680e6 337 const int retval = PL_savestack_ix;
a3b680e6 338 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
339 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
340 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 341 int p;
40a82448 342 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 343
e49a9654
IH
344 if (paren_elems_to_push < 0)
345 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
346
e0fa7e2b
NC
347 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
348 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
349 " out of range (%lu-%ld)",
350 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 351
620d5b66 352 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 353
3280af22 354 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 355/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
356 SSPUSHINT(PL_regoffs[p].end);
357 SSPUSHINT(PL_regoffs[p].start);
3280af22 358 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 359 SSPUSHINT(p);
e7707071 360 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 361 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 362 (UV)p, (IV)PL_regoffs[p].start,
40a82448 363 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 364 (IV)PL_regoffs[p].end
40a82448 365 ));
a0d0e21e 366 }
b1ce53c5 367/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 368 SSPUSHPTR(PL_regoffs);
3280af22
NIS
369 SSPUSHINT(PL_regsize);
370 SSPUSHINT(*PL_reglastparen);
a01268b5 371 SSPUSHINT(*PL_reglastcloseparen);
3280af22 372 SSPUSHPTR(PL_reginput);
e0fa7e2b 373 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 374
a0d0e21e
LW
375 return retval;
376}
377
c277df42 378/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
379#define REGCP_SET(cp) \
380 DEBUG_STATE_r( \
ab3bbdeb 381 PerlIO_printf(Perl_debug_log, \
e4f74956 382 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
383 (IV)PL_savestack_ix)); \
384 cp = PL_savestack_ix
c3464db5 385
ab3bbdeb 386#define REGCP_UNWIND(cp) \
e4f74956 387 DEBUG_STATE_r( \
ab3bbdeb 388 if (cp != PL_savestack_ix) \
e4f74956
YO
389 PerlIO_printf(Perl_debug_log, \
390 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
391 (IV)(cp), (IV)PL_savestack_ix)); \
392 regcpblow(cp)
c277df42 393
76e3520e 394STATIC char *
097eb12c 395S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 396{
97aff369 397 dVAR;
e0fa7e2b 398 UV i;
a0d0e21e 399 char *input;
a3621e74
YO
400 GET_RE_DEBUG_FLAGS_DECL;
401
7918f24d
NC
402 PERL_ARGS_ASSERT_REGCPPOP;
403
b1ce53c5 404 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 405 i = SSPOPUV;
e0fa7e2b
NC
406 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
407 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 408 input = (char *) SSPOPPTR;
a01268b5 409 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
410 *PL_reglastparen = SSPOPINT;
411 PL_regsize = SSPOPINT;
f0ab9afb 412 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 413
620d5b66 414 i -= REGCP_OTHER_ELEMS;
b1ce53c5 415 /* Now restore the parentheses context. */
620d5b66 416 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 417 I32 tmps;
097eb12c 418 U32 paren = (U32)SSPOPINT;
3280af22 419 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 420 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 421 tmps = SSPOPINT;
3280af22 422 if (paren <= *PL_reglastparen)
f0ab9afb 423 PL_regoffs[paren].end = tmps;
e7707071 424 DEBUG_BUFFERS_r(
c3464db5 425 PerlIO_printf(Perl_debug_log,
b900a521 426 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 427 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 428 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 429 (IV)PL_regoffs[paren].end,
3280af22 430 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 431 );
a0d0e21e 432 }
e7707071 433 DEBUG_BUFFERS_r(
bb7a0f54 434 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 435 PerlIO_printf(Perl_debug_log,
faccc32b 436 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 437 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
438 }
439 );
daf18116 440#if 1
dafc8851
JH
441 /* It would seem that the similar code in regtry()
442 * already takes care of this, and in fact it is in
443 * a better location to since this code can #if 0-ed out
444 * but the code in regtry() is needed or otherwise tests
445 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
446 * (as of patchlevel 7877) will fail. Then again,
447 * this code seems to be necessary or otherwise
225593e1
DM
448 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
449 * --jhi updated by dapm */
3b6647e0 450 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 451 if (i > PL_regsize)
f0ab9afb
NC
452 PL_regoffs[i].start = -1;
453 PL_regoffs[i].end = -1;
a0d0e21e 454 }
dafc8851 455#endif
a0d0e21e
LW
456 return input;
457}
458
02db2b7b 459#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 460
a687059c 461/*
e50aee73 462 * pregexec and friends
a687059c
LW
463 */
464
76234dfb 465#ifndef PERL_IN_XSUB_RE
a687059c 466/*
c277df42 467 - pregexec - match a regexp against a string
a687059c 468 */
c277df42 469I32
49d7dfbc 470Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 471 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
472/* strend: pointer to null at end of string */
473/* strbeg: real beginning of string */
474/* minend: end of match must be >=minend after stringarg. */
475/* nosave: For optimizations. */
476{
7918f24d
NC
477 PERL_ARGS_ASSERT_PREGEXEC;
478
c277df42 479 return
9041c2e3 480 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
481 nosave ? 0 : REXEC_COPY_STR);
482}
76234dfb 483#endif
22e551b9 484
9041c2e3 485/*
cad2e5aa
JH
486 * Need to implement the following flags for reg_anch:
487 *
488 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
489 * USE_INTUIT_ML
490 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
491 * INTUIT_AUTORITATIVE_ML
492 * INTUIT_ONCE_NOML - Intuit can match in one location only.
493 * INTUIT_ONCE_ML
494 *
495 * Another flag for this function: SECOND_TIME (so that float substrs
496 * with giant delta may be not rechecked).
497 */
498
499/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
500
3f7c398e 501/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
502 Otherwise, only SvCUR(sv) is used to get strbeg. */
503
504/* XXXX We assume that strpos is strbeg unless sv. */
505
6eb5f6b9
JH
506/* XXXX Some places assume that there is a fixed substring.
507 An update may be needed if optimizer marks as "INTUITable"
508 RExen without fixed substrings. Similarly, it is assumed that
509 lengths of all the strings are no more than minlen, thus they
510 cannot come from lookahead.
40d049e4
YO
511 (Or minlen should take into account lookahead.)
512 NOTE: Some of this comment is not correct. minlen does now take account
513 of lookahead/behind. Further research is required. -- demerphq
514
515*/
6eb5f6b9 516
2c2d71f5
JH
517/* A failure to find a constant substring means that there is no need to make
518 an expensive call to REx engine, thus we celebrate a failure. Similarly,
519 finding a substring too deep into the string means that less calls to
30944b6d
IZ
520 regtry() should be needed.
521
522 REx compiler's optimizer found 4 possible hints:
523 a) Anchored substring;
524 b) Fixed substring;
525 c) Whether we are anchored (beginning-of-line or \G);
526 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 527 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
528 string which does not contradict any of them.
529 */
2c2d71f5 530
6eb5f6b9
JH
531/* Most of decisions we do here should have been done at compile time.
532 The nodes of the REx which we used for the search should have been
533 deleted from the finite automaton. */
534
cad2e5aa 535char *
288b8c02 536Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 537 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 538{
97aff369 539 dVAR;
288b8c02 540 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 541 register I32 start_shift = 0;
cad2e5aa 542 /* Should be nonnegative! */
b7953727 543 register I32 end_shift = 0;
2c2d71f5
JH
544 register char *s;
545 register SV *check;
a1933d95 546 char *strbeg;
cad2e5aa 547 char *t;
f2ed9b32 548 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 549 I32 ml_anch;
bd61b366
SS
550 register char *other_last = NULL; /* other substr checked before this */
551 char *check_at = NULL; /* check substr found at this pos */
bbe252da 552 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 553 RXi_GET_DECL(prog,progi);
30944b6d 554#ifdef DEBUGGING
890ce7af 555 const char * const i_strpos = strpos;
30944b6d 556#endif
a3621e74
YO
557 GET_RE_DEBUG_FLAGS_DECL;
558
7918f24d
NC
559 PERL_ARGS_ASSERT_RE_INTUIT_START;
560
f2ed9b32 561 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 562
3c8556c3 563 if (RX_UTF8(rx)) {
b8d68ded
JH
564 PL_reg_flags |= RF_utf8;
565 }
ab3bbdeb 566 DEBUG_EXECUTE_r(
f2ed9b32 567 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
568 sv ? "Guessing start of match in sv for"
569 : "Guessing start of match in string for");
2a782b5b 570 );
cad2e5aa 571
c344f387
JH
572 /* CHR_DIST() would be more correct here but it makes things slow. */
573 if (prog->minlen > strend - strpos) {
a3621e74 574 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 575 "String too short... [re_intuit_start]\n"));
cad2e5aa 576 goto fail;
2c2d71f5 577 }
1de06328 578
a1933d95 579 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 580 PL_regeol = strend;
f2ed9b32 581 if (utf8_target) {
33b8afdf
JH
582 if (!prog->check_utf8 && prog->check_substr)
583 to_utf8_substr(prog);
584 check = prog->check_utf8;
585 } else {
586 if (!prog->check_substr && prog->check_utf8)
587 to_byte_substr(prog);
588 check = prog->check_substr;
589 }
1de06328 590 if (check == &PL_sv_undef) {
a3621e74 591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 592 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
593 goto fail;
594 }
bbe252da
YO
595 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
596 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
597 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 598 && !multiline ) ); /* Check after \n? */
cad2e5aa 599
7e25d62c 600 if (!ml_anch) {
bbe252da
YO
601 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
602 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 603 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
604 && sv && !SvROK(sv)
605 && (strpos != strbeg)) {
a3621e74 606 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
607 goto fail;
608 }
609 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 610 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 611 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
612 I32 slen;
613
1aa99e6b 614 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 615
653099ff
GS
616 if (SvTAIL(check)) {
617 slen = SvCUR(check); /* >= 1 */
cad2e5aa 618
9041c2e3 619 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 620 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 622 goto fail_finish;
cad2e5aa
JH
623 }
624 /* Now should match s[0..slen-2] */
625 slen--;
3f7c398e 626 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 627 || (slen > 1
3f7c398e 628 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 629 report_neq:
a3621e74 630 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
631 goto fail_finish;
632 }
cad2e5aa 633 }
3f7c398e 634 else if (*SvPVX_const(check) != *s
653099ff 635 || ((slen = SvCUR(check)) > 1
3f7c398e 636 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 637 goto report_neq;
c315bfe8 638 check_at = s;
2c2d71f5 639 goto success_at_start;
7e25d62c 640 }
cad2e5aa 641 }
2c2d71f5 642 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 643 s = strpos;
2c2d71f5 644 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
645 end_shift = prog->check_end_shift;
646
2c2d71f5 647 if (!ml_anch) {
a3b680e6 648 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 649 - (SvTAIL(check) != 0);
a3b680e6 650 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
651
652 if (end_shift < eshift)
653 end_shift = eshift;
654 }
cad2e5aa 655 }
2c2d71f5 656 else { /* Can match at random position */
cad2e5aa
JH
657 ml_anch = 0;
658 s = strpos;
1de06328
YO
659 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
660 end_shift = prog->check_end_shift;
661
662 /* end shift should be non negative here */
cad2e5aa
JH
663 }
664
bcdf7404 665#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 666 if (end_shift < 0)
1de06328 667 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 668 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
669#endif
670
2c2d71f5
JH
671 restart:
672 /* Find a possible match in the region s..strend by looking for
673 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
674
675 {
676 I32 srch_start_shift = start_shift;
677 I32 srch_end_shift = end_shift;
678 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
679 srch_end_shift -= ((strbeg - s) - srch_start_shift);
680 srch_start_shift = strbeg - s;
681 }
6bda09f9 682 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
683 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
684 (IV)prog->check_offset_min,
685 (IV)srch_start_shift,
686 (IV)srch_end_shift,
687 (IV)prog->check_end_shift);
688 });
689
cad2e5aa 690 if (flags & REXEC_SCREAM) {
cad2e5aa 691 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 692 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 693
2c2d71f5
JH
694 if (PL_screamfirst[BmRARE(check)] >= 0
695 || ( BmRARE(check) == '\n'
85c508c3 696 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 697 && SvTAIL(check) ))
9041c2e3 698 s = screaminstr(sv, check,
1de06328 699 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 700 else
2c2d71f5 701 goto fail_finish;
4addbd3b 702 /* we may be pointing at the wrong string */
07bc277f 703 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 704 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
705 if (data)
706 *data->scream_olds = s;
707 }
1de06328
YO
708 else {
709 U8* start_point;
710 U8* end_point;
bbe252da 711 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
712 start_point= (U8*)(s + srch_start_shift);
713 end_point= (U8*)(strend - srch_end_shift);
714 } else {
715 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
716 end_point= HOP3(strend, -srch_end_shift, strbeg);
717 }
6bda09f9 718 DEBUG_OPTIMISE_MORE_r({
56570a2c 719 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 720 (int)(end_point - start_point),
fc8cd66c 721 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
722 start_point);
723 });
724
725 s = fbm_instr( start_point, end_point,
7fba1cd6 726 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
727 }
728 }
cad2e5aa
JH
729 /* Update the count-of-usability, remove useless subpatterns,
730 unshift s. */
2c2d71f5 731
ab3bbdeb 732 DEBUG_EXECUTE_r({
f2ed9b32 733 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
734 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
735 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 736 (s ? "Found" : "Did not find"),
f2ed9b32 737 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
738 ? "anchored" : "floating"),
739 quoted,
740 RE_SV_TAIL(check),
741 (s ? " at offset " : "...\n") );
742 });
2c2d71f5
JH
743
744 if (!s)
745 goto fail_finish;
2c2d71f5 746 /* Finish the diagnostic message */
a3621e74 747 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 748
1de06328
YO
749 /* XXX dmq: first branch is for positive lookbehind...
750 Our check string is offset from the beginning of the pattern.
751 So we need to do any stclass tests offset forward from that
752 point. I think. :-(
753 */
754
755
756
757 check_at=s;
758
759
2c2d71f5
JH
760 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
761 Start with the other substr.
762 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 763 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
764 *always* match. Probably should be marked during compile...
765 Probably it is right to do no SCREAM here...
766 */
767
f2ed9b32 768 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
769 : (prog->float_substr && prog->anchored_substr))
770 {
30944b6d 771 /* Take into account the "other" substring. */
2c2d71f5
JH
772 /* XXXX May be hopelessly wrong for UTF... */
773 if (!other_last)
6eb5f6b9 774 other_last = strpos;
f2ed9b32 775 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
776 do_other_anchored:
777 {
890ce7af
AL
778 char * const last = HOP3c(s, -start_shift, strbeg);
779 char *last1, *last2;
be8e71aa 780 char * const saved_s = s;
33b8afdf 781 SV* must;
2c2d71f5 782
2c2d71f5
JH
783 t = s - prog->check_offset_max;
784 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 785 && (!utf8_target
0ce71af7 786 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 787 && t > strpos)))
6f207bd3 788 NOOP;
2c2d71f5
JH
789 else
790 t = strpos;
1aa99e6b 791 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
792 if (t < other_last) /* These positions already checked */
793 t = other_last;
1aa99e6b 794 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
795 if (last < last1)
796 last1 = last;
1de06328
YO
797 /* XXXX It is not documented what units *_offsets are in.
798 We assume bytes, but this is clearly wrong.
799 Meaning this code needs to be carefully reviewed for errors.
800 dmq.
801 */
802
2c2d71f5 803 /* On end-of-str: see comment below. */
f2ed9b32 804 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
805 if (must == &PL_sv_undef) {
806 s = (char*)NULL;
1de06328 807 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
808 }
809 else
810 s = fbm_instr(
811 (unsigned char*)t,
812 HOP3(HOP3(last1, prog->anchored_offset, strend)
813 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
814 must,
7fba1cd6 815 multiline ? FBMrf_MULTILINE : 0
33b8afdf 816 );
ab3bbdeb 817 DEBUG_EXECUTE_r({
f2ed9b32 818 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
819 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
820 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 821 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
822 quoted, RE_SV_TAIL(must));
823 });
824
825
2c2d71f5
JH
826 if (!s) {
827 if (last1 >= last2) {
a3621e74 828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
829 ", giving up...\n"));
830 goto fail_finish;
831 }
a3621e74 832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 833 ", trying floating at offset %ld...\n",
be8e71aa 834 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
835 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
836 s = HOP3c(last, 1, strend);
2c2d71f5
JH
837 goto restart;
838 }
839 else {
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 841 (long)(s - i_strpos)));
1aa99e6b
IH
842 t = HOP3c(s, -prog->anchored_offset, strbeg);
843 other_last = HOP3c(s, 1, strend);
be8e71aa 844 s = saved_s;
2c2d71f5
JH
845 if (t == strpos)
846 goto try_at_start;
2c2d71f5
JH
847 goto try_at_offset;
848 }
30944b6d 849 }
2c2d71f5
JH
850 }
851 else { /* Take into account the floating substring. */
33b8afdf 852 char *last, *last1;
be8e71aa 853 char * const saved_s = s;
33b8afdf
JH
854 SV* must;
855
856 t = HOP3c(s, -start_shift, strbeg);
857 last1 = last =
858 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
859 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
860 last = HOP3c(t, prog->float_max_offset, strend);
861 s = HOP3c(t, prog->float_min_offset, strend);
862 if (s < other_last)
863 s = other_last;
2c2d71f5 864 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 865 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
866 /* fbm_instr() takes into account exact value of end-of-str
867 if the check is SvTAIL(ed). Since false positives are OK,
868 and end-of-str is not later than strend we are OK. */
869 if (must == &PL_sv_undef) {
870 s = (char*)NULL;
1de06328 871 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
872 }
873 else
2c2d71f5 874 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
875 (unsigned char*)last + SvCUR(must)
876 - (SvTAIL(must)!=0),
7fba1cd6 877 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 878 DEBUG_EXECUTE_r({
f2ed9b32 879 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
880 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
881 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 882 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
883 quoted, RE_SV_TAIL(must));
884 });
33b8afdf
JH
885 if (!s) {
886 if (last1 == last) {
a3621e74 887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
888 ", giving up...\n"));
889 goto fail_finish;
2c2d71f5 890 }
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 892 ", trying anchored starting at offset %ld...\n",
be8e71aa 893 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
894 other_last = last;
895 s = HOP3c(t, 1, strend);
896 goto restart;
897 }
898 else {
a3621e74 899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
900 (long)(s - i_strpos)));
901 other_last = s; /* Fix this later. --Hugo */
be8e71aa 902 s = saved_s;
33b8afdf
JH
903 if (t == strpos)
904 goto try_at_start;
905 goto try_at_offset;
906 }
2c2d71f5 907 }
cad2e5aa 908 }
2c2d71f5 909
1de06328 910
9ef43ace 911 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 912
6bda09f9 913 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
914 PerlIO_printf(Perl_debug_log,
915 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
916 (IV)prog->check_offset_min,
917 (IV)prog->check_offset_max,
918 (IV)(s-strpos),
919 (IV)(t-strpos),
920 (IV)(t-s),
921 (IV)(strend-strpos)
922 )
923 );
924
2c2d71f5 925 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 926 && (!utf8_target
9ef43ace 927 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
928 && t > strpos)))
929 {
2c2d71f5
JH
930 /* Fixed substring is found far enough so that the match
931 cannot start at strpos. */
932 try_at_offset:
cad2e5aa 933 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
934 /* Eventually fbm_*() should handle this, but often
935 anchored_offset is not 0, so this check will not be wasted. */
936 /* XXXX In the code below we prefer to look for "^" even in
937 presence of anchored substrings. And we search even
938 beyond the found float position. These pessimizations
939 are historical artefacts only. */
940 find_anchor:
2c2d71f5 941 while (t < strend - prog->minlen) {
cad2e5aa 942 if (*t == '\n') {
4ee3650e 943 if (t < check_at - prog->check_offset_min) {
f2ed9b32 944 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
945 /* Since we moved from the found position,
946 we definitely contradict the found anchored
30944b6d
IZ
947 substr. Due to the above check we do not
948 contradict "check" substr.
949 Thus we can arrive here only if check substr
950 is float. Redo checking for "other"=="fixed".
951 */
9041c2e3 952 strpos = t + 1;
a3621e74 953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 954 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
955 goto do_other_anchored;
956 }
4ee3650e
GS
957 /* We don't contradict the found floating substring. */
958 /* XXXX Why not check for STCLASS? */
cad2e5aa 959 s = t + 1;
a3621e74 960 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 961 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
962 goto set_useful;
963 }
4ee3650e
GS
964 /* Position contradicts check-string */
965 /* XXXX probably better to look for check-string
966 than for "\n", so one should lower the limit for t? */
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 968 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 969 other_last = strpos = s = t + 1;
cad2e5aa
JH
970 goto restart;
971 }
972 t++;
973 }
a3621e74 974 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 975 PL_colors[0], PL_colors[1]));
2c2d71f5 976 goto fail_finish;
cad2e5aa 977 }
f5952150 978 else {
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 980 PL_colors[0], PL_colors[1]));
f5952150 981 }
cad2e5aa
JH
982 s = t;
983 set_useful:
f2ed9b32 984 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
985 }
986 else {
f5952150 987 /* The found string does not prohibit matching at strpos,
2c2d71f5 988 - no optimization of calling REx engine can be performed,
f5952150
GS
989 unless it was an MBOL and we are not after MBOL,
990 or a future STCLASS check will fail this. */
2c2d71f5
JH
991 try_at_start:
992 /* Even in this situation we may use MBOL flag if strpos is offset
993 wrt the start of the string. */
05b4157f 994 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 995 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 996 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 997 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 998 {
cad2e5aa
JH
999 t = strpos;
1000 goto find_anchor;
1001 }
a3621e74 1002 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1003 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1004 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1005 );
2c2d71f5 1006 success_at_start:
bbe252da 1007 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1008 && (utf8_target ? (
33b8afdf
JH
1009 prog->check_utf8 /* Could be deleted already */
1010 && --BmUSEFUL(prog->check_utf8) < 0
1011 && (prog->check_utf8 == prog->float_utf8)
1012 ) : (
1013 prog->check_substr /* Could be deleted already */
1014 && --BmUSEFUL(prog->check_substr) < 0
1015 && (prog->check_substr == prog->float_substr)
1016 )))
66e933ab 1017 {
cad2e5aa 1018 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1019 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1020 /* XXX Does the destruction order has to change with utf8_target? */
1021 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1022 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1023 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1024 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1025 check = NULL; /* abort */
cad2e5aa 1026 s = strpos;
c9415951
YO
1027 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1028 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1029 if (prog->intflags & PREGf_IMPLICIT)
1030 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1031 /* XXXX This is a remnant of the old implementation. It
1032 looks wasteful, since now INTUIT can use many
6eb5f6b9 1033 other heuristics. */
bbe252da 1034 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1035 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1036 }
1037 else
1038 s = strpos;
1039 }
1040
6eb5f6b9
JH
1041 /* Last resort... */
1042 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1043 /* trie stclasses are too expensive to use here, we are better off to
1044 leave it to regmatch itself */
f8fc2ecf 1045 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1046 /* minlen == 0 is possible if regstclass is \b or \B,
1047 and the fixed substr is ''$.
1048 Since minlen is already taken into account, s+1 is before strend;
1049 accidentally, minlen >= 1 guaranties no false positives at s + 1
1050 even for \b or \B. But (minlen? 1 : 0) below assumes that
1051 regstclass does not come from lookahead... */
1052 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1053 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
1054 const U8* const str = (U8*)STRING(progi->regstclass);
1055 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1056 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1057 : 1);
1de06328
YO
1058 char * endpos;
1059 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1060 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1061 else if (prog->float_substr || prog->float_utf8)
1062 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1063 else
1064 endpos= strend;
1065
70685ca0
JH
1066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1067 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1068
6eb5f6b9 1069 t = s;
f8fc2ecf 1070 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1071 if (!s) {
1072#ifdef DEBUGGING
cbbf8932 1073 const char *what = NULL;
6eb5f6b9
JH
1074#endif
1075 if (endpos == strend) {
a3621e74 1076 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1077 "Could not match STCLASS...\n") );
1078 goto fail;
1079 }
a3621e74 1080 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1081 "This position contradicts STCLASS...\n") );
bbe252da 1082 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1083 goto fail;
6eb5f6b9 1084 /* Contradict one of substrings */
33b8afdf 1085 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1086 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1087 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1088 hop_and_restart:
1aa99e6b 1089 s = HOP3c(t, 1, strend);
66e933ab
GS
1090 if (s + start_shift + end_shift > strend) {
1091 /* XXXX Should be taken into account earlier? */
a3621e74 1092 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1093 "Could not match STCLASS...\n") );
1094 goto fail;
1095 }
5e39e1e5
HS
1096 if (!check)
1097 goto giveup;
a3621e74 1098 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1099 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1100 what, (long)(s + start_shift - i_strpos)) );
1101 goto restart;
1102 }
66e933ab 1103 /* Have both, check_string is floating */
6eb5f6b9
JH
1104 if (t + start_shift >= check_at) /* Contradicts floating=check */
1105 goto retry_floating_check;
1106 /* Recheck anchored substring, but not floating... */
9041c2e3 1107 s = check_at;
5e39e1e5
HS
1108 if (!check)
1109 goto giveup;
a3621e74 1110 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1111 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1112 (long)(other_last - i_strpos)) );
1113 goto do_other_anchored;
1114 }
60e71179
GS
1115 /* Another way we could have checked stclass at the
1116 current position only: */
1117 if (ml_anch) {
1118 s = t = t + 1;
5e39e1e5
HS
1119 if (!check)
1120 goto giveup;
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1122 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1123 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1124 goto try_at_offset;
66e933ab 1125 }
f2ed9b32 1126 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1127 goto fail;
6eb5f6b9
JH
1128 /* Check is floating subtring. */
1129 retry_floating_check:
1130 t = check_at - start_shift;
a3621e74 1131 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1132 goto hop_and_restart;
1133 }
b7953727 1134 if (t != s) {
a3621e74 1135 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1136 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1137 (long)(t - i_strpos), (long)(s - i_strpos))
1138 );
1139 }
1140 else {
a3621e74 1141 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1142 "Does not contradict STCLASS...\n");
1143 );
1144 }
6eb5f6b9 1145 }
5e39e1e5 1146 giveup:
a3621e74 1147 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1148 PL_colors[4], (check ? "Guessed" : "Giving up"),
1149 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1150 return s;
2c2d71f5
JH
1151
1152 fail_finish: /* Substring not found */
33b8afdf 1153 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1154 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1155 fail:
a3621e74 1156 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1157 PL_colors[4], PL_colors[5]));
bd61b366 1158 return NULL;
cad2e5aa 1159}
9661b544 1160
a0a388a1
YO
1161#define DECL_TRIE_TYPE(scan) \
1162 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1163 trie_type = (scan->flags != EXACT) \
f2ed9b32
KW
1164 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1165 : (utf8_target ? trie_utf8 : trie_plain)
3b0527fe 1166
55eed653
NC
1167#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1168uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1169 switch (trie_type) { \
1170 case trie_utf8_fold: \
1171 if ( foldlen>0 ) { \
0abd0d78 1172 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1173 foldlen -= len; \
1174 uscan += len; \
1175 len=0; \
1176 } else { \
0abd0d78 1177 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1178 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1179 foldlen -= UNISKIP( uvc ); \
1180 uscan = foldbuf + UNISKIP( uvc ); \
1181 } \
1182 break; \
a0a388a1
YO
1183 case trie_latin_utf8_fold: \
1184 if ( foldlen>0 ) { \
1185 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1186 foldlen -= len; \
1187 uscan += len; \
1188 len=0; \
1189 } else { \
1190 len = 1; \
1191 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 uscan = foldbuf + UNISKIP( uvc ); \
1194 } \
1195 break; \
4cadc6a9
YO
1196 case trie_utf8: \
1197 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1198 break; \
1199 case trie_plain: \
1200 uvc = (UV)*uc; \
1201 len = 1; \
1202 } \
4cadc6a9
YO
1203 if (uvc < 256) { \
1204 charid = trie->charmap[ uvc ]; \
1205 } \
1206 else { \
1207 charid = 0; \
55eed653
NC
1208 if (widecharmap) { \
1209 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1210 (char*)&uvc, sizeof(UV), 0); \
1211 if (svpp) \
1212 charid = (U16)SvIV(*svpp); \
1213 } \
1214 } \
1215} STMT_END
1216
a0a388a1
YO
1217#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1218{ \
1219 char *my_strend= (char *)strend; \
4cadc6a9
YO
1220 if ( (CoNd) \
1221 && (ln == len || \
f2ed9b32
KW
1222 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1223 m, NULL, ln, cBOOL(UTF_PATTERN))) \
a0a388a1 1224 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1225 goto got_it; \
1226 else { \
1227 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1228 uvchr_to_utf8(tmpbuf, c); \
1229 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1230 if ( f != c \
1231 && (f == c1 || f == c2) \
a0a388a1 1232 && (ln == len || \
f2ed9b32
KW
1233 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1234 m, NULL, ln, cBOOL(UTF_PATTERN)))\
a0a388a1 1235 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1236 goto got_it; \
1237 } \
a0a388a1
YO
1238} \
1239s += len
4cadc6a9
YO
1240
1241#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1242STMT_START { \
1243 while (s <= e) { \
1244 if ( (CoNd) \
4c1b470c
KW
1245 && (ln == 1 || (OP(c) == EXACTF \
1246 ? foldEQ(s, m, ln) \
1247 : foldEQ_locale(s, m, ln))) \
24b23f37 1248 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1249 goto got_it; \
1250 s++; \
1251 } \
1252} STMT_END
1253
1254#define REXEC_FBC_UTF8_SCAN(CoDe) \
1255STMT_START { \
1256 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1257 CoDe \
1258 s += uskip; \
1259 } \
1260} STMT_END
1261
1262#define REXEC_FBC_SCAN(CoDe) \
1263STMT_START { \
1264 while (s < strend) { \
1265 CoDe \
1266 s++; \
1267 } \
1268} STMT_END
1269
1270#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1271REXEC_FBC_UTF8_SCAN( \
1272 if (CoNd) { \
24b23f37 1273 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it; \
1275 else \
1276 tmp = doevery; \
1277 } \
1278 else \
1279 tmp = 1; \
1280)
1281
1282#define REXEC_FBC_CLASS_SCAN(CoNd) \
1283REXEC_FBC_SCAN( \
1284 if (CoNd) { \
24b23f37 1285 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1286 goto got_it; \
1287 else \
1288 tmp = doevery; \
1289 } \
1290 else \
1291 tmp = 1; \
1292)
1293
1294#define REXEC_FBC_TRYIT \
24b23f37 1295if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1296 goto got_it
1297
e1d1eefb 1298#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1299 if (utf8_target) { \
e1d1eefb
YO
1300 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1301 } \
1302 else { \
1303 REXEC_FBC_CLASS_SCAN(CoNd); \
1304 } \
1305 break
1306
4cadc6a9 1307#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1308 if (utf8_target) { \
4cadc6a9
YO
1309 UtFpReLoAd; \
1310 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1311 } \
1312 else { \
1313 REXEC_FBC_CLASS_SCAN(CoNd); \
1314 } \
1315 break
1316
1317#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1318 PL_reg_flags |= RF_tainted; \
f2ed9b32 1319 if (utf8_target) { \
4cadc6a9
YO
1320 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1321 } \
1322 else { \
1323 REXEC_FBC_CLASS_SCAN(CoNd); \
1324 } \
1325 break
1326
786e8c11
YO
1327#define DUMP_EXEC_POS(li,s,doutf8) \
1328 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1329
1330/* We know what class REx starts with. Try to find this position... */
1331/* if reginfo is NULL, its a dryrun */
1332/* annoyingly all the vars in this routine have different names from their counterparts
1333 in regmatch. /grrr */
1334
3c3eec57 1335STATIC char *
07be1b83 1336S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1337 const char *strend, regmatch_info *reginfo)
a687059c 1338{
27da23d5 1339 dVAR;
bbe252da 1340 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1341 char *m;
d8093b23 1342 STRLEN ln;
5dab1207 1343 STRLEN lnc;
078c425b 1344 register STRLEN uskip;
d8093b23
G
1345 unsigned int c1;
1346 unsigned int c2;
6eb5f6b9
JH
1347 char *e;
1348 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1349 register const bool utf8_target = PL_reg_match_utf8;
f8fc2ecf 1350 RXi_GET_DECL(prog,progi);
7918f24d
NC
1351
1352 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1353
6eb5f6b9
JH
1354 /* We know what class it must start with. */
1355 switch (OP(c)) {
6eb5f6b9 1356 case ANYOF:
f2ed9b32 1357 if (utf8_target) {
4cadc6a9 1358 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1359 !UTF8_IS_INVARIANT((U8)s[0]) ?
f2ed9b32 1360 reginclass(prog, c, (U8*)s, 0, utf8_target) :
4cadc6a9 1361 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1362 }
1363 else {
1364 while (s < strend) {
1365 STRLEN skip = 1;
1366
32fc9b6a 1367 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1368 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1369 /* The assignment of 2 is intentional:
1370 * for the folded sharp s, the skip is 2. */
1371 (skip = SHARP_S_SKIP))) {
24b23f37 1372 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
1377 else
1378 tmp = 1;
1379 s += skip;
1380 }
a0d0e21e 1381 }
6eb5f6b9 1382 break;
f33976b4 1383 case CANY:
4cadc6a9 1384 REXEC_FBC_SCAN(
24b23f37 1385 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1386 goto got_it;
1387 else
1388 tmp = doevery;
4cadc6a9 1389 );
f33976b4 1390 break;
6eb5f6b9 1391 case EXACTF:
5dab1207
NIS
1392 m = STRING(c);
1393 ln = STR_LEN(c); /* length to match in octets/bytes */
1394 lnc = (I32) ln; /* length to match in characters */
f2ed9b32 1395 if (UTF_PATTERN) {
a2a2844f 1396 STRLEN ulen1, ulen2;
5dab1207 1397 U8 *sm = (U8 *) m;
89ebb4a3
JH
1398 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1399 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1400 /* used by commented-out code below */
1401 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1402
1403 /* XXX: Since the node will be case folded at compile
1404 time this logic is a little odd, although im not
1405 sure that its actually wrong. --dmq */
1406
1407 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1408 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1409
1410 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1411 codepoint of the first character in the converted
1412 form, yet originally we did the extra step.
1413 No tests fail by commenting this code out however
1414 so Ive left it out. -- dmq.
1415
89ebb4a3 1416 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1417 0, uniflags);
89ebb4a3 1418 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1419 0, uniflags);
a0a388a1
YO
1420 */
1421
5dab1207
NIS
1422 lnc = 0;
1423 while (sm < ((U8 *) m + ln)) {
1424 lnc++;
1425 sm += UTF8SKIP(sm);
1426 }
1aa99e6b
IH
1427 }
1428 else {
1429 c1 = *(U8*)m;
1430 c2 = PL_fold[c1];
1431 }
6eb5f6b9
JH
1432 goto do_exactf;
1433 case EXACTFL:
5dab1207
NIS
1434 m = STRING(c);
1435 ln = STR_LEN(c);
1436 lnc = (I32) ln;
d8093b23 1437 c1 = *(U8*)m;
6eb5f6b9
JH
1438 c2 = PL_fold_locale[c1];
1439 do_exactf:
db12adc6 1440 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1441
3b0527fe 1442 if (!reginfo && e < s)
6eb5f6b9 1443 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1444
60a8b682
JH
1445 /* The idea in the EXACTF* cases is to first find the
1446 * first character of the EXACTF* node and then, if
1447 * necessary, case-insensitively compare the full
1448 * text of the node. The c1 and c2 are the first
1449 * characters (though in Unicode it gets a bit
1450 * more complicated because there are more cases
7f16dd3d
JH
1451 * than just upper and lower: one needs to use
1452 * the so-called folding case for case-insensitive
1453 * matching (called "loose matching" in Unicode).
4c1b470c 1454 * foldEQ_utf8() will do just that. */
60a8b682 1455
f2ed9b32 1456 if (utf8_target || UTF_PATTERN) {
575cac57 1457 UV c, f;
89ebb4a3 1458 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1459 STRLEN len = 1;
1460 STRLEN foldlen;
4ad0818d 1461 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1462 if (c1 == c2) {
5dab1207
NIS
1463 /* Upper and lower of 1st char are equal -
1464 * probably not a "letter". */
1aa99e6b 1465 while (s <= e) {
f2ed9b32 1466 if (utf8_target) {
a0a388a1 1467 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1468 uniflags);
a0a388a1
YO
1469 } else {
1470 c = *((U8*)s);
1471 }
4cadc6a9 1472 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1473 }
09091399
JH
1474 }
1475 else {
1aa99e6b 1476 while (s <= e) {
f2ed9b32 1477 if (utf8_target) {
a0a388a1 1478 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1479 uniflags);
a0a388a1
YO
1480 } else {
1481 c = *((U8*)s);
1482 }
80aecb99 1483
60a8b682 1484 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1485 * Note that not all the possible combinations
1486 * are handled here: some of them are handled
1487 * by the standard folding rules, and some of
1488 * them (the character class or ANYOF cases)
1489 * are handled during compiletime in
1490 * regexec.c:S_regclass(). */
880bd946
JH
1491 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1492 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1493 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1494
4cadc6a9 1495 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1496 }
09091399 1497 }
1aa99e6b
IH
1498 }
1499 else {
a0a388a1 1500 /* Neither pattern nor string are UTF8 */
1aa99e6b 1501 if (c1 == c2)
4cadc6a9 1502 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1503 else
4cadc6a9 1504 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1505 }
1506 break;
bbce6d69 1507 case BOUNDL:
3280af22 1508 PL_reg_flags |= RF_tainted;
bbce6d69 1509 /* FALL THROUGH */
a0d0e21e 1510 case BOUND:
f2ed9b32 1511 if (utf8_target) {
12d33761 1512 if (s == PL_bostr)
ffc61ed2
JH
1513 tmp = '\n';
1514 else {
6136c704 1515 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1516 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1517 }
1518 tmp = ((OP(c) == BOUND ?
9041c2e3 1519 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1520 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1521 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1522 if (tmp == !(OP(c) == BOUND ?
f2ed9b32 1523 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1524 isALNUM_LC_utf8((U8*)s)))
1525 {
1526 tmp = !tmp;
4cadc6a9 1527 REXEC_FBC_TRYIT;
a687059c 1528 }
4cadc6a9 1529 );
a0d0e21e 1530 }
667bb95a 1531 else {
12d33761 1532 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1533 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1534 REXEC_FBC_SCAN(
ffc61ed2
JH
1535 if (tmp ==
1536 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1537 tmp = !tmp;
4cadc6a9 1538 REXEC_FBC_TRYIT;
a0ed51b3 1539 }
4cadc6a9 1540 );
a0ed51b3 1541 }
24b23f37 1542 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1543 goto got_it;
1544 break;
bbce6d69 1545 case NBOUNDL:
3280af22 1546 PL_reg_flags |= RF_tainted;
bbce6d69 1547 /* FALL THROUGH */
a0d0e21e 1548 case NBOUND:
f2ed9b32 1549 if (utf8_target) {
12d33761 1550 if (s == PL_bostr)
ffc61ed2
JH
1551 tmp = '\n';
1552 else {
6136c704 1553 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1554 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1555 }
1556 tmp = ((OP(c) == NBOUND ?
9041c2e3 1557 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1558 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1559 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1560 if (tmp == !(OP(c) == NBOUND ?
f2ed9b32 1561 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1562 isALNUM_LC_utf8((U8*)s)))
1563 tmp = !tmp;
4cadc6a9
YO
1564 else REXEC_FBC_TRYIT;
1565 );
a0d0e21e 1566 }
667bb95a 1567 else {
12d33761 1568 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1569 tmp = ((OP(c) == NBOUND ?
1570 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1571 REXEC_FBC_SCAN(
ffc61ed2
JH
1572 if (tmp ==
1573 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1574 tmp = !tmp;
4cadc6a9
YO
1575 else REXEC_FBC_TRYIT;
1576 );
a0ed51b3 1577 }
24b23f37 1578 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1579 goto got_it;
1580 break;
a0d0e21e 1581 case ALNUM:
4cadc6a9 1582 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1583 LOAD_UTF8_CHARCLASS_PERL_WORD(),
f2ed9b32 1584 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
4cadc6a9
YO
1585 isALNUM(*s)
1586 );
bbce6d69 1587 case ALNUML:
4cadc6a9
YO
1588 REXEC_FBC_CSCAN_TAINT(
1589 isALNUM_LC_utf8((U8*)s),
1590 isALNUM_LC(*s)
1591 );
a0d0e21e 1592 case NALNUM:
4cadc6a9 1593 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1594 LOAD_UTF8_CHARCLASS_PERL_WORD(),
f2ed9b32 1595 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
4cadc6a9
YO
1596 !isALNUM(*s)
1597 );
bbce6d69 1598 case NALNUML:
4cadc6a9
YO
1599 REXEC_FBC_CSCAN_TAINT(
1600 !isALNUM_LC_utf8((U8*)s),
1601 !isALNUM_LC(*s)
1602 );
a0d0e21e 1603 case SPACE:
4cadc6a9 1604 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1605 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1606 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
4cadc6a9
YO
1607 isSPACE(*s)
1608 );
bbce6d69 1609 case SPACEL:
4cadc6a9
YO
1610 REXEC_FBC_CSCAN_TAINT(
1611 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1612 isSPACE_LC(*s)
1613 );
a0d0e21e 1614 case NSPACE:
4cadc6a9 1615 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1616 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1617 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
4cadc6a9
YO
1618 !isSPACE(*s)
1619 );
bbce6d69 1620 case NSPACEL:
4cadc6a9
YO
1621 REXEC_FBC_CSCAN_TAINT(
1622 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1623 !isSPACE_LC(*s)
1624 );
a0d0e21e 1625 case DIGIT:
4cadc6a9 1626 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1627 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1628 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1629 isDIGIT(*s)
1630 );
b8c5462f 1631 case DIGITL:
4cadc6a9
YO
1632 REXEC_FBC_CSCAN_TAINT(
1633 isDIGIT_LC_utf8((U8*)s),
1634 isDIGIT_LC(*s)
1635 );
a0d0e21e 1636 case NDIGIT:
4cadc6a9 1637 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1638 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1639 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1640 !isDIGIT(*s)
1641 );
b8c5462f 1642 case NDIGITL:
4cadc6a9
YO
1643 REXEC_FBC_CSCAN_TAINT(
1644 !isDIGIT_LC_utf8((U8*)s),
1645 !isDIGIT_LC(*s)
1646 );
e1d1eefb
YO
1647 case LNBREAK:
1648 REXEC_FBC_CSCAN(
1649 is_LNBREAK_utf8(s),
1650 is_LNBREAK_latin1(s)
1651 );
1652 case VERTWS:
1653 REXEC_FBC_CSCAN(
1654 is_VERTWS_utf8(s),
1655 is_VERTWS_latin1(s)
1656 );
1657 case NVERTWS:
1658 REXEC_FBC_CSCAN(
1659 !is_VERTWS_utf8(s),
1660 !is_VERTWS_latin1(s)
1661 );
1662 case HORIZWS:
1663 REXEC_FBC_CSCAN(
1664 is_HORIZWS_utf8(s),
1665 is_HORIZWS_latin1(s)
1666 );
1667 case NHORIZWS:
1668 REXEC_FBC_CSCAN(
1669 !is_HORIZWS_utf8(s),
1670 !is_HORIZWS_latin1(s)
1671 );
1de06328
YO
1672 case AHOCORASICKC:
1673 case AHOCORASICK:
07be1b83 1674 {
a0a388a1 1675 DECL_TRIE_TYPE(c);
07be1b83
YO
1676 /* what trie are we using right now */
1677 reg_ac_data *aho
f8fc2ecf 1678 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1679 reg_trie_data *trie
1680 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1681 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1682
1683 const char *last_start = strend - trie->minlen;
6148ee25 1684#ifdef DEBUGGING
07be1b83 1685 const char *real_start = s;
6148ee25 1686#endif
07be1b83 1687 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1688 SV *sv_points;
1689 U8 **points; /* map of where we were in the input string
786e8c11 1690 when reading a given char. For ASCII this
be8e71aa 1691 is unnecessary overhead as the relationship
38a44b82
NC
1692 is always 1:1, but for Unicode, especially
1693 case folded Unicode this is not true. */
f9e705e8 1694 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1695 U8 *bitmap=NULL;
1696
07be1b83
YO
1697
1698 GET_RE_DEBUG_FLAGS_DECL;
1699
be8e71aa
YO
1700 /* We can't just allocate points here. We need to wrap it in
1701 * an SV so it gets freed properly if there is a croak while
1702 * running the match */
1703 ENTER;
1704 SAVETMPS;
1705 sv_points=newSV(maxlen * sizeof(U8 *));
1706 SvCUR_set(sv_points,
1707 maxlen * sizeof(U8 *));
1708 SvPOK_on(sv_points);
1709 sv_2mortal(sv_points);
1710 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1711 if ( trie_type != trie_utf8_fold
1712 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1713 {
786e8c11
YO
1714 if (trie->bitmap)
1715 bitmap=(U8*)trie->bitmap;
1716 else
1717 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1718 }
786e8c11
YO
1719 /* this is the Aho-Corasick algorithm modified a touch
1720 to include special handling for long "unknown char"
1721 sequences. The basic idea being that we use AC as long
1722 as we are dealing with a possible matching char, when
1723 we encounter an unknown char (and we have not encountered
1724 an accepting state) we scan forward until we find a legal
1725 starting char.
1726 AC matching is basically that of trie matching, except
1727 that when we encounter a failing transition, we fall back
1728 to the current states "fail state", and try the current char
1729 again, a process we repeat until we reach the root state,
1730 state 1, or a legal transition. If we fail on the root state
1731 then we can either terminate if we have reached an accepting
1732 state previously, or restart the entire process from the beginning
1733 if we have not.
1734
1735 */
07be1b83
YO
1736 while (s <= last_start) {
1737 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1738 U8 *uc = (U8*)s;
1739 U16 charid = 0;
1740 U32 base = 1;
1741 U32 state = 1;
1742 UV uvc = 0;
1743 STRLEN len = 0;
1744 STRLEN foldlen = 0;
1745 U8 *uscan = (U8*)NULL;
1746 U8 *leftmost = NULL;
786e8c11
YO
1747#ifdef DEBUGGING
1748 U32 accepted_word= 0;
1749#endif
07be1b83
YO
1750 U32 pointpos = 0;
1751
1752 while ( state && uc <= (U8*)strend ) {
1753 int failed=0;
786e8c11
YO
1754 U32 word = aho->states[ state ].wordnum;
1755
1de06328
YO
1756 if( state==1 ) {
1757 if ( bitmap ) {
1758 DEBUG_TRIE_EXECUTE_r(
1759 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1760 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1761 (char *)uc, utf8_target );
1de06328
YO
1762 PerlIO_printf( Perl_debug_log,
1763 " Scanning for legal start char...\n");
1764 }
1765 );
1766 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1767 uc++;
786e8c11 1768 }
1de06328 1769 s= (char *)uc;
786e8c11 1770 }
786e8c11
YO
1771 if (uc >(U8*)last_start) break;
1772 }
1773
1774 if ( word ) {
2e64971a 1775 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1776 if (!leftmost || lpos < leftmost) {
1777 DEBUG_r(accepted_word=word);
07be1b83 1778 leftmost= lpos;
786e8c11 1779 }
07be1b83 1780 if (base==0) break;
786e8c11 1781
07be1b83
YO
1782 }
1783 points[pointpos++ % maxlen]= uc;
55eed653
NC
1784 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1785 uscan, len, uvc, charid, foldlen,
1786 foldbuf, uniflags);
786e8c11
YO
1787 DEBUG_TRIE_EXECUTE_r({
1788 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1789 s, utf8_target );
07be1b83 1790 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1791 " Charid:%3u CP:%4"UVxf" ",
1792 charid, uvc);
1793 });
07be1b83
YO
1794
1795 do {
6148ee25 1796#ifdef DEBUGGING
786e8c11 1797 word = aho->states[ state ].wordnum;
6148ee25 1798#endif
07be1b83
YO
1799 base = aho->states[ state ].trans.base;
1800
786e8c11
YO
1801 DEBUG_TRIE_EXECUTE_r({
1802 if (failed)
1803 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1804 s, utf8_target );
07be1b83 1805 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1806 "%sState: %4"UVxf", word=%"UVxf,
1807 failed ? " Fail transition to " : "",
1808 (UV)state, (UV)word);
1809 });
07be1b83
YO
1810 if ( base ) {
1811 U32 tmp;
6dd2be57 1812 I32 offset;
07be1b83 1813 if (charid &&
6dd2be57
DM
1814 ( ((offset = base + charid
1815 - 1 - trie->uniquecharcount)) >= 0)
1816 && ((U32)offset < trie->lasttrans)
1817 && trie->trans[offset].check == state
1818 && (tmp=trie->trans[offset].next))
07be1b83 1819 {
786e8c11
YO
1820 DEBUG_TRIE_EXECUTE_r(
1821 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1822 state = tmp;
1823 break;
1824 }
1825 else {
786e8c11
YO
1826 DEBUG_TRIE_EXECUTE_r(
1827 PerlIO_printf( Perl_debug_log," - fail\n"));
1828 failed = 1;
1829 state = aho->fail[state];
07be1b83
YO
1830 }
1831 }
1832 else {
1833 /* we must be accepting here */
786e8c11
YO
1834 DEBUG_TRIE_EXECUTE_r(
1835 PerlIO_printf( Perl_debug_log," - accepting\n"));
1836 failed = 1;
07be1b83
YO
1837 break;
1838 }
1839 } while(state);
786e8c11 1840 uc += len;
07be1b83
YO
1841 if (failed) {
1842 if (leftmost)
1843 break;
786e8c11 1844 if (!state) state = 1;
07be1b83
YO
1845 }
1846 }
1847 if ( aho->states[ state ].wordnum ) {
2e64971a 1848 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1849 if (!leftmost || lpos < leftmost) {
1850 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1851 leftmost = lpos;
786e8c11 1852 }
07be1b83 1853 }
07be1b83
YO
1854 if (leftmost) {
1855 s = (char*)leftmost;
786e8c11
YO
1856 DEBUG_TRIE_EXECUTE_r({
1857 PerlIO_printf(
70685ca0
JH
1858 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1859 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1860 );
1861 });
24b23f37 1862 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1863 FREETMPS;
1864 LEAVE;
07be1b83 1865 goto got_it;
be8e71aa 1866 }
07be1b83 1867 s = HOPc(s,1);
786e8c11
YO
1868 DEBUG_TRIE_EXECUTE_r({
1869 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1870 });
07be1b83 1871 } else {
786e8c11
YO
1872 DEBUG_TRIE_EXECUTE_r(
1873 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1874 break;
1875 }
1876 }
be8e71aa
YO
1877 FREETMPS;
1878 LEAVE;
07be1b83
YO
1879 }
1880 break;
b3c9acc1 1881 default:
3c3eec57
GS
1882 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1883 break;
d6a28714 1884 }
6eb5f6b9
JH
1885 return 0;
1886 got_it:
1887 return s;
1888}
1889
fae667d5 1890
6eb5f6b9
JH
1891/*
1892 - regexec_flags - match a regexp against a string
1893 */
1894I32
288b8c02 1895Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1896 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1897/* strend: pointer to null at end of string */
1898/* strbeg: real beginning of string */
1899/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1900/* data: May be used for some additional optimizations.
1901 Currently its only used, with a U32 cast, for transmitting
1902 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1903/* nosave: For optimizations. */
1904{
97aff369 1905 dVAR;
288b8c02 1906 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1907 /*register*/ char *s;
6eb5f6b9 1908 register regnode *c;
24b23f37 1909 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1910 I32 minlen; /* must match at least this many chars */
1911 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1912 I32 end_shift = 0; /* Same for the end. */ /* CC */
1913 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1914 char *scream_olds = NULL;
f2ed9b32 1915 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 1916 I32 multiline;
f8fc2ecf 1917 RXi_GET_DECL(prog,progi);
3b0527fe 1918 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1919 regexp_paren_pair *swap = NULL;
a3621e74
YO
1920 GET_RE_DEBUG_FLAGS_DECL;
1921
7918f24d 1922 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1923 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1924
1925 /* Be paranoid... */
1926 if (prog == NULL || startpos == NULL) {
1927 Perl_croak(aTHX_ "NULL regexp parameter");
1928 return 0;
1929 }
1930
bbe252da 1931 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1932 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1933
f2ed9b32 1934 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 1935 DEBUG_EXECUTE_r(
f2ed9b32 1936 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
1937 "Matching");
1938 );
bac06658 1939
6eb5f6b9 1940 minlen = prog->minlen;
1de06328
YO
1941
1942 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1944 "String too short [regexec_flags]...\n"));
1945 goto phooey;
1aa99e6b 1946 }
6eb5f6b9 1947
1de06328 1948
6eb5f6b9 1949 /* Check validity of program. */
f8fc2ecf 1950 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1951 Perl_croak(aTHX_ "corrupted regexp program");
1952 }
1953
1954 PL_reg_flags = 0;
1955 PL_reg_eval_set = 0;
1956 PL_reg_maxiter = 0;
1957
3c8556c3 1958 if (RX_UTF8(rx))
6eb5f6b9
JH
1959 PL_reg_flags |= RF_utf8;
1960
1961 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1962 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1963 PL_bostr = strbeg;
3b0527fe 1964 reginfo.sv = sv;
6eb5f6b9
JH
1965
1966 /* Mark end of line for $ (and such) */
1967 PL_regeol = strend;
1968
1969 /* see how far we have to get to not match where we matched before */
3b0527fe 1970 reginfo.till = startpos+minend;
6eb5f6b9 1971
6eb5f6b9
JH
1972 /* If there is a "must appear" string, look for it. */
1973 s = startpos;
1974
bbe252da 1975 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 1976 MAGIC *mg;
2c296965 1977 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 1978 reginfo.ganch = startpos + prog->gofs;
2c296965 1979 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1980 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 1981 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 1982 && SvMAGIC(sv)
14befaf4
DM
1983 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1984 && mg->mg_len >= 0) {
3b0527fe 1985 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 1986 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1987 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 1988
bbe252da 1989 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1990 if (s > reginfo.ganch)
6eb5f6b9 1991 goto phooey;
58e23c8d 1992 s = reginfo.ganch - prog->gofs;
2c296965 1993 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1994 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
1995 if (s < strbeg)
1996 goto phooey;
6eb5f6b9
JH
1997 }
1998 }
58e23c8d 1999 else if (data) {
70685ca0 2000 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2001 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2002 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2003
2004 } else { /* pos() not defined */
3b0527fe 2005 reginfo.ganch = strbeg;
2c296965
YO
2006 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2007 "GPOS: reginfo.ganch = strbeg\n"));
2008 }
6eb5f6b9 2009 }
288b8c02 2010 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2011 /* We have to be careful. If the previous successful match
2012 was from this regex we don't want a subsequent partially
2013 successful match to clobber the old results.
2014 So when we detect this possibility we add a swap buffer
2015 to the re, and switch the buffer each match. If we fail
2016 we switch it back, otherwise we leave it swapped.
2017 */
2018 swap = prog->offs;
2019 /* do we need a save destructor here for eval dies? */
2020 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2021 }
a0714e2c 2022 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2023 re_scream_pos_data d;
2024
2025 d.scream_olds = &scream_olds;
2026 d.scream_pos = &scream_pos;
288b8c02 2027 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2028 if (!s) {
a3621e74 2029 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2030 goto phooey; /* not present */
3fa9c3d7 2031 }
6eb5f6b9
JH
2032 }
2033
1de06328 2034
6eb5f6b9
JH
2035
2036 /* Simplest case: anchored match need be tried only once. */
2037 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2038 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2039 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2040 goto got_it;
bbe252da
YO
2041 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2042 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2043 {
2044 char *end;
2045
2046 if (minlen)
2047 dontbother = minlen - 1;
1aa99e6b 2048 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2049 /* for multiline we only have to try after newlines */
33b8afdf 2050 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2051 /* because of the goto we can not easily reuse the macros for bifurcating the
2052 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2053 if (utf8_target) {
2054 if (s == startpos)
2055 goto after_try_utf8;
2056 while (1) {
2057 if (regtry(&reginfo, &s)) {
2058 goto got_it;
2059 }
2060 after_try_utf8:
2061 if (s > end) {
2062 goto phooey;
2063 }
2064 if (prog->extflags & RXf_USE_INTUIT) {
2065 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2066 if (!s) {
2067 goto phooey;
2068 }
2069 }
2070 else {
2071 s += UTF8SKIP(s);
2072 }
2073 }
2074 } /* end search for check string in unicode */
2075 else {
2076 if (s == startpos) {
2077 goto after_try_latin;
2078 }
2079 while (1) {
2080 if (regtry(&reginfo, &s)) {
2081 goto got_it;
2082 }
2083 after_try_latin:
2084 if (s > end) {
2085 goto phooey;
2086 }
2087 if (prog->extflags & RXf_USE_INTUIT) {
2088 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2089 if (!s) {
2090 goto phooey;
2091 }
2092 }
2093 else {
2094 s++;
2095 }
2096 }
2097 } /* end search for check string in latin*/
2098 } /* end search for check string */
2099 else { /* search for newline */
2100 if (s > startpos) {
2101 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2102 s--;
92f3d482
YO
2103 }
2104 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
6eb5f6b9
JH
2105 while (s < end) {
2106 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2107 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2108 goto got_it;
2109 }
92f3d482
YO
2110 }
2111 } /* end search for newline */
2112 } /* end anchored/multiline check string search */
6eb5f6b9 2113 goto phooey;
bbe252da 2114 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
2115 {
2116 /* the warning about reginfo.ganch being used without intialization
bbe252da 2117 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2118 and we only enter this block when the same bit is set. */
58e23c8d 2119 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2120
2121 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2122 goto got_it;
2123 goto phooey;
2124 }
2125
2126 /* Messy cases: unanchored match. */
bbe252da 2127 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2128 /* we have /x+whatever/ */
f2ed9b32 2129 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2130 char ch;
bf93d4cc
GS
2131#ifdef DEBUGGING
2132 int did_match = 0;
2133#endif
f2ed9b32
KW
2134 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2135 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2136 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2137
f2ed9b32 2138 if (utf8_target) {
4cadc6a9 2139 REXEC_FBC_SCAN(
6eb5f6b9 2140 if (*s == ch) {
a3621e74 2141 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2142 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2143 s += UTF8SKIP(s);
2144 while (s < strend && *s == ch)
2145 s += UTF8SKIP(s);
2146 }
4cadc6a9 2147 );
6eb5f6b9
JH
2148 }
2149 else {
4cadc6a9 2150 REXEC_FBC_SCAN(
6eb5f6b9 2151 if (*s == ch) {
a3621e74 2152 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2153 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2154 s++;
2155 while (s < strend && *s == ch)
2156 s++;
2157 }
4cadc6a9 2158 );
6eb5f6b9 2159 }
a3621e74 2160 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2161 PerlIO_printf(Perl_debug_log,
b7953727
JH
2162 "Did not find anchored character...\n")
2163 );
6eb5f6b9 2164 }
a0714e2c
SS
2165 else if (prog->anchored_substr != NULL
2166 || prog->anchored_utf8 != NULL
2167 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2168 && prog->float_max_offset < strend - s)) {
2169 SV *must;
2170 I32 back_max;
2171 I32 back_min;
2172 char *last;
6eb5f6b9 2173 char *last1; /* Last position checked before */
bf93d4cc
GS
2174#ifdef DEBUGGING
2175 int did_match = 0;
2176#endif
33b8afdf 2177 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2178 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2179 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2180 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2181 back_max = back_min = prog->anchored_offset;
2182 } else {
f2ed9b32
KW
2183 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2184 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2185 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2186 back_max = prog->float_max_offset;
2187 back_min = prog->float_min_offset;
2188 }
1de06328
YO
2189
2190
33b8afdf
JH
2191 if (must == &PL_sv_undef)
2192 /* could not downgrade utf8 check substring, so must fail */
2193 goto phooey;
2194
1de06328
YO
2195 if (back_min<0) {
2196 last = strend;
2197 } else {
2198 last = HOP3c(strend, /* Cannot start after this */
2199 -(I32)(CHR_SVLEN(must)
2200 - (SvTAIL(must) != 0) + back_min), strbeg);
2201 }
6eb5f6b9
JH
2202 if (s > PL_bostr)
2203 last1 = HOPc(s, -1);
2204 else
2205 last1 = s - 1; /* bogus */
2206
a0288114 2207 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2208 check_substr==must. */
2209 scream_pos = -1;
2210 dontbother = end_shift;
2211 strend = HOPc(strend, -dontbother);
2212 while ( (s <= last) &&
9041c2e3 2213 ((flags & REXEC_SCREAM)
1de06328 2214 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2215 end_shift, &scream_pos, 0))
1de06328 2216 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2217 (unsigned char*)strend, must,
7fba1cd6 2218 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2219 /* we may be pointing at the wrong string */
07bc277f 2220 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2221 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2222 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2223 if (HOPc(s, -back_max) > last1) {
2224 last1 = HOPc(s, -back_min);
2225 s = HOPc(s, -back_max);
2226 }
2227 else {
52657f30 2228 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2229
2230 last1 = HOPc(s, -back_min);
52657f30 2231 s = t;
6eb5f6b9 2232 }
f2ed9b32 2233 if (utf8_target) {
6eb5f6b9 2234 while (s <= last1) {
24b23f37 2235 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2236 goto got_it;
2237 s += UTF8SKIP(s);
2238 }
2239 }
2240 else {
2241 while (s <= last1) {
24b23f37 2242 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2243 goto got_it;
2244 s++;
2245 }
2246 }
2247 }
ab3bbdeb 2248 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2249 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2250 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2251 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2252 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2253 ? "anchored" : "floating"),
ab3bbdeb
YO
2254 quoted, RE_SV_TAIL(must));
2255 });
6eb5f6b9
JH
2256 goto phooey;
2257 }
f8fc2ecf 2258 else if ( (c = progi->regstclass) ) {
f14c76ed 2259 if (minlen) {
f8fc2ecf 2260 const OPCODE op = OP(progi->regstclass);
66e933ab 2261 /* don't bother with what can't match */
786e8c11 2262 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2263 strend = HOPc(strend, -(minlen - 1));
2264 }
a3621e74 2265 DEBUG_EXECUTE_r({
be8e71aa 2266 SV * const prop = sv_newmortal();
32fc9b6a 2267 regprop(prog, prop, c);
0df25f3d 2268 {
f2ed9b32 2269 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2270 s,strend-s,60);
0df25f3d 2271 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2272 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2273 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2274 quoted, (int)(strend - s));
0df25f3d 2275 }
ffc61ed2 2276 });
3b0527fe 2277 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2278 goto got_it;
07be1b83 2279 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2280 }
2281 else {
2282 dontbother = 0;
a0714e2c 2283 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2284 /* Trim the end. */
d6a28714 2285 char *last;
33b8afdf
JH
2286 SV* float_real;
2287
f2ed9b32
KW
2288 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2289 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2290 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2291
2292 if (flags & REXEC_SCREAM) {
33b8afdf 2293 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2294 end_shift, &scream_pos, 1); /* last one */
2295 if (!last)
ffc61ed2 2296 last = scream_olds; /* Only one occurrence. */
4addbd3b 2297 /* we may be pointing at the wrong string */
07bc277f 2298 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2299 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2300 }
d6a28714
JH
2301 else {
2302 STRLEN len;
cfd0369c 2303 const char * const little = SvPV_const(float_real, len);
d6a28714 2304
33b8afdf 2305 if (SvTAIL(float_real)) {
d6a28714
JH
2306 if (memEQ(strend - len + 1, little, len - 1))
2307 last = strend - len + 1;
7fba1cd6 2308 else if (!multiline)
9041c2e3 2309 last = memEQ(strend - len, little, len)
bd61b366 2310 ? strend - len : NULL;
b8c5462f 2311 else
d6a28714
JH
2312 goto find_last;
2313 } else {
2314 find_last:
9041c2e3 2315 if (len)
d6a28714 2316 last = rninstr(s, strend, little, little + len);
b8c5462f 2317 else
a0288114 2318 last = strend; /* matching "$" */
b8c5462f 2319 }
b8c5462f 2320 }
bf93d4cc 2321 if (last == NULL) {
6bda09f9
YO
2322 DEBUG_EXECUTE_r(
2323 PerlIO_printf(Perl_debug_log,
2324 "%sCan't trim the tail, match fails (should not happen)%s\n",
2325 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2326 goto phooey; /* Should not happen! */
2327 }
d6a28714
JH
2328 dontbother = strend - last + prog->float_min_offset;
2329 }
2330 if (minlen && (dontbother < minlen))
2331 dontbother = minlen - 1;
2332 strend -= dontbother; /* this one's always in bytes! */
2333 /* We don't know much -- general case. */
f2ed9b32 2334 if (utf8_target) {
d6a28714 2335 for (;;) {
24b23f37 2336 if (regtry(&reginfo, &s))
d6a28714
JH
2337 goto got_it;
2338 if (s >= strend)
2339 break;
b8c5462f 2340 s += UTF8SKIP(s);
d6a28714
JH
2341 };
2342 }
2343 else {
2344 do {
24b23f37 2345 if (regtry(&reginfo, &s))
d6a28714
JH
2346 goto got_it;
2347 } while (s++ < strend);
2348 }
2349 }
2350
2351 /* Failure. */
2352 goto phooey;
2353
2354got_it:
e9105d30 2355 Safefree(swap);
288b8c02 2356 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2357
19b95bf0 2358 if (PL_reg_eval_set)
4f639d21 2359 restore_pos(aTHX_ prog);
5daac39c
NC
2360 if (RXp_PAREN_NAMES(prog))
2361 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2362
2363 /* make sure $`, $&, $', and $digit will work later */
2364 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2365 RX_MATCH_COPY_FREE(rx);
d6a28714 2366 if (flags & REXEC_COPY_STR) {
be8e71aa 2367 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2368#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2369 if ((SvIsCOW(sv)
2370 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2371 if (DEBUG_C_TEST) {
2372 PerlIO_printf(Perl_debug_log,
2373 "Copy on write: regexp capture, type %d\n",
2374 (int) SvTYPE(sv));
2375 }
2376 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2377 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2378 assert (SvPOKp(prog->saved_copy));
2379 } else
2380#endif
2381 {
288b8c02 2382 RX_MATCH_COPIED_on(rx);
ed252734
NC
2383 s = savepvn(strbeg, i);
2384 prog->subbeg = s;
2385 }
d6a28714 2386 prog->sublen = i;
d6a28714
JH
2387 }
2388 else {
2389 prog->subbeg = strbeg;
2390 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2391 }
2392 }
9041c2e3 2393
d6a28714
JH
2394 return 1;
2395
2396phooey:
a3621e74 2397 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2398 PL_colors[4], PL_colors[5]));
d6a28714 2399 if (PL_reg_eval_set)
4f639d21 2400 restore_pos(aTHX_ prog);
e9105d30 2401 if (swap) {
c74340f9 2402 /* we failed :-( roll it back */
e9105d30
GG
2403 Safefree(prog->offs);
2404 prog->offs = swap;
2405 }
2406
d6a28714
JH
2407 return 0;
2408}
2409
6bda09f9 2410
d6a28714
JH
2411/*
2412 - regtry - try match at specific point
2413 */
2414STATIC I32 /* 0 failure, 1 success */
24b23f37 2415S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2416{
97aff369 2417 dVAR;
d6a28714 2418 CHECKPOINT lastcp;
288b8c02
NC
2419 REGEXP *const rx = reginfo->prog;
2420 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2421 RXi_GET_DECL(prog,progi);
a3621e74 2422 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2423
2424 PERL_ARGS_ASSERT_REGTRY;
2425
24b23f37 2426 reginfo->cutpoint=NULL;
d6a28714 2427
bbe252da 2428 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2429 MAGIC *mg;
2430
2431 PL_reg_eval_set = RS_init;
a3621e74 2432 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2433 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2434 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2435 ));
ea8d6ae1 2436 SAVESTACK_CXPOS();
d6a28714
JH
2437 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2438 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2439 SAVETMPS;
2440 /* Apparently this is not needed, judging by wantarray. */
e8347627 2441 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2442 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2443
3b0527fe 2444 if (reginfo->sv) {
d6a28714 2445 /* Make $_ available to executed code. */
3b0527fe 2446 if (reginfo->sv != DEFSV) {
59f00321 2447 SAVE_DEFSV;
414bf5ae 2448 DEFSV_set(reginfo->sv);
b8c5462f 2449 }
d6a28714 2450
3b0527fe
DM
2451 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2452 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2453 /* prepare for quick setting of pos */
d300d9fa 2454#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2455 if (SvIsCOW(reginfo->sv))
2456 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2457#endif
3dab1dad 2458 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2459 &PL_vtbl_mglob, NULL, 0);
d6a28714 2460 mg->mg_len = -1;
b8c5462f 2461 }
d6a28714
JH
2462 PL_reg_magic = mg;
2463 PL_reg_oldpos = mg->mg_len;
4f639d21 2464 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2465 }
09687e5a 2466 if (!PL_reg_curpm) {
a02a5408 2467 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2468#ifdef USE_ITHREADS
2469 {
14a49a24 2470 SV* const repointer = &PL_sv_undef;
92313705
NC
2471 /* this regexp is also owned by the new PL_reg_curpm, which
2472 will try to free it. */
d2ece331 2473 av_push(PL_regex_padav, repointer);
09687e5a
AB
2474 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2475 PL_regex_pad = AvARRAY(PL_regex_padav);
2476 }
2477#endif
2478 }
86c29d75
NC
2479#ifdef USE_ITHREADS
2480 /* It seems that non-ithreads works both with and without this code.
2481 So for efficiency reasons it seems best not to have the code
2482 compiled when it is not needed. */
92313705
NC
2483 /* This is safe against NULLs: */
2484 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2485 /* PM_reg_curpm owns a reference to this regexp. */
2486 ReREFCNT_inc(rx);
86c29d75 2487#endif
288b8c02 2488 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2489 PL_reg_oldcurpm = PL_curpm;
2490 PL_curpm = PL_reg_curpm;
07bc277f 2491 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2492 /* Here is a serious problem: we cannot rewrite subbeg,
2493 since it may be needed if this match fails. Thus
2494 $` inside (?{}) could fail... */
2495 PL_reg_oldsaved = prog->subbeg;
2496 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2497#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2498 PL_nrs = prog->saved_copy;
2499#endif
07bc277f 2500 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2501 }
2502 else
bd61b366 2503 PL_reg_oldsaved = NULL;
d6a28714
JH
2504 prog->subbeg = PL_bostr;
2505 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2506 }
24b23f37 2507 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2508 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2509 PL_reginput = *startpos;
d6a28714 2510 PL_reglastparen = &prog->lastparen;
a01268b5 2511 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2512 prog->lastparen = 0;
03994de8 2513 prog->lastcloseparen = 0;
d6a28714 2514 PL_regsize = 0;
f0ab9afb 2515 PL_regoffs = prog->offs;
d6a28714
JH
2516 if (PL_reg_start_tmpl <= prog->nparens) {
2517 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2518 if(PL_reg_start_tmp)
2519 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2520 else
a02a5408 2521 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2522 }
2523
2524 /* XXXX What this code is doing here?!!! There should be no need
2525 to do this again and again, PL_reglastparen should take care of
3dd2943c 2526 this! --ilya*/
dafc8851
JH
2527
2528 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2529 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2530 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2531 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2532 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2533 * Meanwhile, this code *is* needed for the
daf18116
JH
2534 * above-mentioned test suite tests to succeed. The common theme
2535 * on those tests seems to be returning null fields from matches.
225593e1 2536 * --jhi updated by dapm */
dafc8851 2537#if 1
d6a28714 2538 if (prog->nparens) {
f0ab9afb 2539 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2540 register I32 i;
eb160463 2541 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2542 ++pp;
2543 pp->start = -1;
2544 pp->end = -1;
d6a28714
JH
2545 }
2546 }
dafc8851 2547#endif
02db2b7b 2548 REGCP_SET(lastcp);
f8fc2ecf 2549 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2550 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2551 return 1;
2552 }
24b23f37
YO
2553 if (reginfo->cutpoint)
2554 *startpos= reginfo->cutpoint;
02db2b7b 2555 REGCP_UNWIND(lastcp);
d6a28714
JH
2556 return 0;
2557}
2558
02db2b7b 2559
8ba1375e
MJD
2560#define sayYES goto yes
2561#define sayNO goto no
262b90c4 2562#define sayNO_SILENT goto no_silent
8ba1375e 2563
f9f4320a
YO
2564/* we dont use STMT_START/END here because it leads to
2565 "unreachable code" warnings, which are bogus, but distracting. */
2566#define CACHEsayNO \
c476f425
DM
2567 if (ST.cache_mask) \
2568 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2569 sayNO
3298f257 2570
a3621e74 2571/* this is used to determine how far from the left messages like
265c4333
YO
2572 'failed...' are printed. It should be set such that messages
2573 are inline with the regop output that created them.
a3621e74 2574*/
265c4333 2575#define REPORT_CODE_OFF 32
a3621e74
YO
2576
2577
40a82448
DM
2578#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2579#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2580
86545054
DM
2581#define SLAB_FIRST(s) (&(s)->states[0])
2582#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2583
5d9a96ca
DM
2584/* grab a new slab and return the first slot in it */
2585
2586STATIC regmatch_state *
2587S_push_slab(pTHX)
2588{
a35a87e7 2589#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2590 dMY_CXT;
2591#endif
5d9a96ca
DM
2592 regmatch_slab *s = PL_regmatch_slab->next;
2593 if (!s) {
2594 Newx(s, 1, regmatch_slab);
2595 s->prev = PL_regmatch_slab;
2596 s->next = NULL;
2597 PL_regmatch_slab->next = s;
2598 }
2599 PL_regmatch_slab = s;
86545054 2600 return SLAB_FIRST(s);
5d9a96ca 2601}
5b47454d 2602
95b24440 2603
40a82448
DM
2604/* push a new state then goto it */
2605
2606#define PUSH_STATE_GOTO(state, node) \
2607 scan = node; \
2608 st->resume_state = state; \
2609 goto push_state;
2610
2611/* push a new state with success backtracking, then goto it */
2612
2613#define PUSH_YES_STATE_GOTO(state, node) \
2614 scan = node; \
2615 st->resume_state = state; \
2616 goto push_yes_state;
2617
aa283a38 2618
aa283a38 2619
d6a28714 2620/*
95b24440 2621
bf1f174e
DM
2622regmatch() - main matching routine
2623
2624This is basically one big switch statement in a loop. We execute an op,
2625set 'next' to point the next op, and continue. If we come to a point which
2626we may need to backtrack to on failure such as (A|B|C), we push a
2627backtrack state onto the backtrack stack. On failure, we pop the top
2628state, and re-enter the loop at the state indicated. If there are no more
2629states to pop, we return failure.
2630
2631Sometimes we also need to backtrack on success; for example /A+/, where
2632after successfully matching one A, we need to go back and try to
2633match another one; similarly for lookahead assertions: if the assertion
2634completes successfully, we backtrack to the state just before the assertion
2635and then carry on. In these cases, the pushed state is marked as
2636'backtrack on success too'. This marking is in fact done by a chain of
2637pointers, each pointing to the previous 'yes' state. On success, we pop to
2638the nearest yes state, discarding any intermediate failure-only states.
2639Sometimes a yes state is pushed just to force some cleanup code to be
2640called at the end of a successful match or submatch; e.g. (??{$re}) uses
2641it to free the inner regex.
2642
2643Note that failure backtracking rewinds the cursor position, while
2644success backtracking leaves it alone.
2645
2646A pattern is complete when the END op is executed, while a subpattern
2647such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2648ops trigger the "pop to last yes state if any, otherwise return true"
2649behaviour.
2650
2651A common convention in this function is to use A and B to refer to the two
2652subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2653the subpattern to be matched possibly multiple times, while B is the entire
2654rest of the pattern. Variable and state names reflect this convention.
2655
2656The states in the main switch are the union of ops and failure/success of
2657substates associated with with that op. For example, IFMATCH is the op
2658that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2659'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2660successfully matched A and IFMATCH_A_fail is a state saying that we have
2661just failed to match A. Resume states always come in pairs. The backtrack
2662state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2663at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2664on success or failure.
2665
2666The struct that holds a backtracking state is actually a big union, with
2667one variant for each major type of op. The variable st points to the
2668top-most backtrack struct. To make the code clearer, within each
2669block of code we #define ST to alias the relevant union.
2670
2671Here's a concrete example of a (vastly oversimplified) IFMATCH
2672implementation:
2673
2674 switch (state) {
2675 ....
2676
2677#define ST st->u.ifmatch
2678
2679 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2680 ST.foo = ...; // some state we wish to save
95b24440 2681 ...
bf1f174e
DM
2682 // push a yes backtrack state with a resume value of
2683 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2684 // first node of A:
2685 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2686 // NOTREACHED
2687
2688 case IFMATCH_A: // we have successfully executed A; now continue with B
2689 next = B;
2690 bar = ST.foo; // do something with the preserved value
2691 break;
2692
2693 case IFMATCH_A_fail: // A failed, so the assertion failed
2694 ...; // do some housekeeping, then ...
2695 sayNO; // propagate the failure
2696
2697#undef ST
95b24440 2698
bf1f174e
DM
2699 ...
2700 }
95b24440 2701
bf1f174e
DM
2702For any old-timers reading this who are familiar with the old recursive
2703approach, the code above is equivalent to:
95b24440 2704
bf1f174e
DM
2705 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2706 {
2707 int foo = ...
95b24440 2708 ...
bf1f174e
DM
2709 if (regmatch(A)) {
2710 next = B;
2711 bar = foo;
2712 break;
95b24440 2713 }
bf1f174e
DM
2714 ...; // do some housekeeping, then ...
2715 sayNO; // propagate the failure
95b24440 2716 }
bf1f174e
DM
2717
2718The topmost backtrack state, pointed to by st, is usually free. If you
2719want to claim it, populate any ST.foo fields in it with values you wish to
2720save, then do one of
2721
2722 PUSH_STATE_GOTO(resume_state, node);
2723 PUSH_YES_STATE_GOTO(resume_state, node);
2724
2725which sets that backtrack state's resume value to 'resume_state', pushes a
2726new free entry to the top of the backtrack stack, then goes to 'node'.
2727On backtracking, the free slot is popped, and the saved state becomes the
2728new free state. An ST.foo field in this new top state can be temporarily
2729accessed to retrieve values, but once the main loop is re-entered, it
2730becomes available for reuse.
2731
2732Note that the depth of the backtrack stack constantly increases during the
2733left-to-right execution of the pattern, rather than going up and down with
2734the pattern nesting. For example the stack is at its maximum at Z at the
2735end of the pattern, rather than at X in the following:
2736
2737 /(((X)+)+)+....(Y)+....Z/
2738
2739The only exceptions to this are lookahead/behind assertions and the cut,
2740(?>A), which pop all the backtrack states associated with A before
2741continuing.
2742
2743Bascktrack state structs are allocated in slabs of about 4K in size.
2744PL_regmatch_state and st always point to the currently active state,
2745and PL_regmatch_slab points to the slab currently containing
2746PL_regmatch_state. The first time regmatch() is called, the first slab is
2747allocated, and is never freed until interpreter destruction. When the slab
2748is full, a new one is allocated and chained to the end. At exit from
2749regmatch(), slabs allocated since entry are freed.
2750
2751*/
95b24440 2752
40a82448 2753
5bc10b2c 2754#define DEBUG_STATE_pp(pp) \
265c4333 2755 DEBUG_STATE_r({ \
f2ed9b32 2756 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2757 PerlIO_printf(Perl_debug_log, \
5d458dd8 2758 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2759 depth*2, "", \
13d6edb4 2760 PL_reg_name[st->resume_state], \
5d458dd8
YO
2761 ((st==yes_state||st==mark_state) ? "[" : ""), \
2762 ((st==yes_state) ? "Y" : ""), \
2763 ((st==mark_state) ? "M" : ""), \
2764 ((st==yes_state||st==mark_state) ? "]" : "") \
2765 ); \
265c4333 2766 });
5bc10b2c 2767
40a82448 2768
3dab1dad 2769#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2770
3df15adc 2771#ifdef DEBUGGING
5bc10b2c 2772
ab3bbdeb 2773STATIC void
f2ed9b32 2774S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2775 const char *start, const char *end, const char *blurb)
2776{
efd26800 2777 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2778
2779 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2780
ab3bbdeb
YO
2781 if (!PL_colorset)
2782 reginitcolors();
2783 {
2784 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2785 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2786
f2ed9b32 2787 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2788 start, end - start, 60);
2789
2790 PerlIO_printf(Perl_debug_log,
2791 "%s%s REx%s %s against %s\n",
2792 PL_colors[4], blurb, PL_colors[5], s0, s1);
2793
f2ed9b32 2794 if (utf8_target||utf8_pat)
1de06328
YO
2795 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2796 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2797 utf8_pat && utf8_target ? " and " : "",
2798 utf8_target ? "string" : ""
ab3bbdeb
YO
2799 );
2800 }
2801}
3df15adc
YO
2802
2803STATIC void
786e8c11
YO
2804S_dump_exec_pos(pTHX_ const char *locinput,
2805 const regnode *scan,
2806 const char *loc_regeol,
2807 const char *loc_bostr,
2808 const char *loc_reg_starttry,
f2ed9b32 2809 const bool utf8_target)
07be1b83 2810{
786e8c11 2811 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2812 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2813 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2814 /* The part of the string before starttry has one color
2815 (pref0_len chars), between starttry and current
2816 position another one (pref_len - pref0_len chars),
2817 after the current position the third one.
2818 We assume that pref0_len <= pref_len, otherwise we
2819 decrease pref0_len. */
786e8c11
YO
2820 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2821 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2822 int pref0_len;
2823
7918f24d
NC
2824 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2825
f2ed9b32 2826 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2827 pref_len++;
786e8c11
YO
2828 pref0_len = pref_len - (locinput - loc_reg_starttry);
2829 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2830 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2831 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2832 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2833 l--;
2834 if (pref0_len < 0)
2835 pref0_len = 0;
2836 if (pref0_len > pref_len)
2837 pref0_len = pref_len;
2838 {
f2ed9b32 2839 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2840
ab3bbdeb 2841 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2842 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2843
ab3bbdeb 2844 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2845 (locinput - pref_len + pref0_len),
1de06328 2846 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2847
ab3bbdeb 2848 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2849 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2850
1de06328 2851 const STRLEN tlen=len0+len1+len2;
3df15adc 2852 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2853 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2854 (IV)(locinput - loc_bostr),
07be1b83 2855 len0, s0,
07be1b83 2856 len1, s1,
07be1b83 2857 (docolor ? "" : "> <"),
07be1b83 2858 len2, s2,
f9f4320a 2859 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2860 "");
2861 }
2862}
3df15adc 2863
07be1b83
YO
2864#endif
2865
0a4db386
YO
2866/* reg_check_named_buff_matched()
2867 * Checks to see if a named buffer has matched. The data array of
2868 * buffer numbers corresponding to the buffer is expected to reside
2869 * in the regexp->data->data array in the slot stored in the ARG() of
2870 * node involved. Note that this routine doesn't actually care about the
2871 * name, that information is not preserved from compilation to execution.
2872 * Returns the index of the leftmost defined buffer with the given name
2873 * or 0 if non of the buffers matched.
2874 */
2875STATIC I32
7918f24d
NC
2876S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2877{
0a4db386 2878 I32 n;
f8fc2ecf 2879 RXi_GET_DECL(rex,rexi);
ad64d0ec 2880 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2881 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2882
2883 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2884
0a4db386
YO
2885 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2886 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2887 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2888 {
2889 return nums[n];
2890 }
2891 }
2892 return 0;
2893}
2894
2f554ef7
DM
2895
2896/* free all slabs above current one - called during LEAVE_SCOPE */
2897
2898STATIC void
2899S_clear_backtrack_stack(pTHX_ void *p)
2900{
2901 regmatch_slab *s = PL_regmatch_slab->next;
2902 PERL_UNUSED_ARG(p);
2903
2904 if (!s)
2905 return;
2906 PL_regmatch_slab->next = NULL;
2907 while (s) {
2908 regmatch_slab * const osl = s;
2909 s = s->next;
2910 Safefree(osl);
2911 }
2912}
2913
2914
28d8d7f4
YO
2915#define SETREX(Re1,Re2) \
2916 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2917 Re1 = (Re2)
2918
d6a28714 2919STATIC I32 /* 0 failure, 1 success */
24b23f37 2920S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2921{
a35a87e7 2922#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2923 dMY_CXT;
2924#endif
27da23d5 2925 dVAR;
f2ed9b32 2926 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 2927 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2928 REGEXP *rex_sv = reginfo->prog;
2929 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2930 RXi_GET_DECL(rex,rexi);
2f554ef7 2931 I32 oldsave;
5d9a96ca
DM
2932 /* the current state. This is a cached copy of PL_regmatch_state */
2933 register regmatch_state *st;
5d9a96ca
DM
2934 /* cache heavy used fields of st in registers */
2935 register regnode *scan;
2936 register regnode *next;
438e9bae 2937 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2938 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2939 register char *locinput = PL_reginput;
5d9a96ca 2940 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2941
b69b0499 2942 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2943 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2944 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2945 const U32 max_nochange_depth =
2946 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2947 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2948 regmatch_state *yes_state = NULL; /* state to pop to on success of
2949 subpattern */
e2e6a0f1
YO
2950 /* mark_state piggy backs on the yes_state logic so that when we unwind
2951 the stack on success we can update the mark_state as we go */
2952 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2953 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2954 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2955 U32 state_num;
5d458dd8
YO
2956 bool no_final = 0; /* prevent failure from backtracking? */
2957 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2958 char *startpoint = PL_reginput;
5d458dd8
YO
2959 SV *popmark = NULL; /* are we looking for a mark? */
2960 SV *sv_commit = NULL; /* last mark name seen in failure */
2961 SV *sv_yes_mark = NULL; /* last mark name we have seen
2962 during a successfull match */
2963 U32 lastopen = 0; /* last open we saw */
2964 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2965 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2966 /* these three flags are set by various ops to signal information to
2967 * the very next op. They have a useful lifetime of exactly one loop
2968 * iteration, and are not preserved or restored by state pushes/pops
2969 */
2970 bool sw = 0; /* the condition value in (?(cond)a|b) */
2971 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2972 int logical = 0; /* the following EVAL is:
2973 0: (?{...})
2974 1: (?(?{...})X|Y)
2975 2: (??{...})
2976 or the following IFMATCH/UNLESSM is:
2977 false: plain (?=foo)
2978 true: used as a condition: (?(?=foo))
2979 */
95b24440 2980#ifdef DEBUGGING
e68ec53f 2981 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2982#endif
2983
7918f24d
NC
2984 PERL_ARGS_ASSERT_REGMATCH;
2985
3b57cd43 2986 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2987 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2988 }));
5d9a96ca
DM
2989 /* on first ever call to regmatch, allocate first slab */
2990 if (!PL_regmatch_slab) {
2991 Newx(PL_regmatch_slab, 1, regmatch_slab);
2992 PL_regmatch_slab->prev = NULL;
2993 PL_regmatch_slab->next = NULL;
86545054 2994 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2995 }
2996
2f554ef7
DM
2997 oldsave = PL_savestack_ix;
2998 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2999 SAVEVPTR(PL_regmatch_slab);
3000 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3001
3002 /* grab next free state slot */
3003 st = ++PL_regmatch_state;
86545054 3004 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3005 st = PL_regmatch_state = S_push_slab(aTHX);
3006
d6a28714
JH
3007 /* Note that nextchr is a byte even in UTF */
3008 nextchr = UCHARAT(locinput);
3009 scan = prog;
3010 while (scan != NULL) {
8ba1375e 3011
a3621e74 3012 DEBUG_EXECUTE_r( {
6136c704 3013 SV * const prop = sv_newmortal();
1de06328 3014 regnode *rnext=regnext(scan);
f2ed9b32 3015 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3016 regprop(rex, prop, scan);
07be1b83
YO
3017
3018 PerlIO_printf(Perl_debug_log,
3019 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3020 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3021 SvPVX_const(prop),
1de06328 3022 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3023 0 : (IV)(rnext - rexi->program));
2a782b5b 3024 });
d6a28714
JH
3025
3026 next = scan + NEXT_OFF(scan);
3027 if (next == scan)
3028 next = NULL;
40a82448 3029 state_num = OP(scan);
d6a28714 3030
40a82448 3031 reenter_switch:
34a81e2b
B
3032
3033 assert(PL_reglastparen == &rex->lastparen);
3034 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3035 assert(PL_regoffs == rex->offs);
3036
40a82448 3037 switch (state_num) {
d6a28714 3038 case BOL:
7fba1cd6 3039 if (locinput == PL_bostr)
d6a28714 3040 {
3b0527fe 3041 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3042 break;
3043 }
d6a28714
JH
3044 sayNO;
3045 case MBOL:
12d33761
HS
3046 if (locinput == PL_bostr ||
3047 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3048 {
b8c5462f
JH
3049 break;
3050 }
d6a28714
JH
3051 sayNO;
3052 case SBOL:
c2a73568 3053 if (locinput == PL_bostr)
b8c5462f 3054 break;
d6a28714
JH
3055 sayNO;
3056 case GPOS:
3b0527fe 3057 if (locinput == reginfo->ganch)
d6a28714
JH
3058 break;
3059 sayNO;
ee9b8eae
YO
3060
3061 case KEEPS:
3062 /* update the startpoint */
f0ab9afb 3063 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3064 PL_reginput = locinput;
f0ab9afb 3065 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3066 PUSH_STATE_GOTO(KEEPS_next, next);
3067 /*NOT-REACHED*/
3068 case KEEPS_next_fail:
3069 /* rollback the start point change */
f0ab9afb 3070 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3071 sayNO_SILENT;
3072 /*NOT-REACHED*/
d6a28714 3073 case EOL:
d6a28714
JH
3074 goto seol;
3075 case MEOL:
d6a28714 3076 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3077 sayNO;
b8c5462f 3078 break;
d6a28714
JH
3079 case SEOL:
3080 seol:
3081 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3082 sayNO;
d6a28714 3083 if (PL_regeol - locinput > 1)
b8c5462f 3084 sayNO;
b8c5462f 3085 break;
d6a28714
JH
3086 case EOS:
3087 if (PL_regeol != locinput)
b8c5462f 3088 sayNO;
d6a28714 3089 break;
ffc61ed2 3090 case SANY:
d6a28714 3091 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3092 sayNO;
f2ed9b32 3093 if (utf8_target) {
f33976b4
DB
3094 locinput += PL_utf8skip[nextchr];
3095 if (locinput > PL_regeol)
3096 sayNO;
3097 nextchr = UCHARAT(locinput);
3098 }
3099 else
3100 nextchr = UCHARAT(++locinput);
3101 break;
3102 case CANY:
3103 if (!nextchr && locinput >= PL_regeol)
3104 sayNO;
b8c5462f 3105 nextchr = UCHARAT(++locinput);
a0d0e21e 3106 break;
ffc61ed2 3107 case REG_ANY:
1aa99e6b
IH
3108 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3109 sayNO;
f2ed9b32 3110 if (utf8_target) {
b8c5462f 3111 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3112 if (locinput > PL_regeol)
3113 sayNO;
a0ed51b3 3114 nextchr = UCHARAT(locinput);
a0ed51b3 3115 }
1aa99e6b
IH
3116 else
3117 nextchr = UCHARAT(++locinput);
a0ed51b3 3118 break;
166ba7cd
DM
3119
3120#undef ST
3121#define ST st->u.trie
786e8c11
YO
3122 case TRIEC:
3123 /* In this case the charclass data is available inline so
3124 we can fail fast without a lot of extra overhead.
3125 */
f2ed9b32 3126 if (scan->flags == EXACT || !utf8_target) {
786e8c11
YO
3127 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3128 DEBUG_EXECUTE_r(
3129 PerlIO_printf(Perl_debug_log,
3130 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3131 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3132 );
3133 sayNO_SILENT;
3134 /* NOTREACHED */
3135 }
3136 }
3137 /* FALL THROUGH */
5b47454d 3138 case TRIE:
2e64971a
DM
3139 /* the basic plan of execution of the trie is:
3140 * At the beginning, run though all the states, and
3141 * find the longest-matching word. Also remember the position
3142 * of the shortest matching word. For example, this pattern:
3143 * 1 2 3 4 5
3144 * ab|a|x|abcd|abc
3145 * when matched against the string "abcde", will generate
3146 * accept states for all words except 3, with the longest
3147 * matching word being 4, and the shortest being 1 (with
3148 * the position being after char 1 of the string).
3149 *
3150 * Then for each matching word, in word order (i.e. 1,2,4,5),
3151 * we run the remainder of the pattern; on each try setting
3152 * the current position to the character following the word,
3153 * returning to try the next word on failure.
3154 *
3155 * We avoid having to build a list of words at runtime by
3156 * using a compile-time structure, wordinfo[].prev, which
3157 * gives, for each word, the previous accepting word (if any).
3158 * In the case above it would contain the mappings 1->2, 2->0,
3159 * 3->0, 4->5, 5->1. We can use this table to generate, from
3160 * the longest word (4 above), a list of all words, by
3161 * following the list of prev pointers; this gives us the
3162 * unordered list 4,5,1,2. Then given the current word we have
3163 * just tried, we can go through the list and find the
3164 * next-biggest word to try (so if we just failed on word 2,
3165 * the next in the list is 4).
3166 *
3167 * Since at runtime we don't record the matching position in
3168 * the string for each word, we have to work that out for
3169 * each word we're about to process. The wordinfo table holds
3170 * the character length of each word; given that we recorded
3171 * at the start: the position of the shortest word and its
3172 * length in chars, we just need to move the pointer the
3173 * difference between the two char lengths. Depending on
3174 * Unicode status and folding, that's cheap or expensive.
3175 *
3176 * This algorithm is optimised for the case where are only a
3177 * small number of accept states, i.e. 0,1, or maybe 2.
3178 * With lots of accepts states, and having to try all of them,
3179 * it becomes quadratic on number of accept states to find all
3180 * the next words.
3181 */
3182
3dab1dad 3183 {
07be1b83 3184 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3185 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3186
3187 /* what trie are we using right now */
be8e71aa 3188 reg_trie_data * const trie
f8fc2ecf 3189 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3190 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3191 U32 state = trie->startstate;
166ba7cd 3192
3dab1dad
YO
3193 if (trie->bitmap && trie_type != trie_utf8_fold &&
3194 !TRIE_BITMAP_TEST(trie,*locinput)
3195 ) {
3196 if (trie->states[ state ].wordnum) {
3197 DEBUG_EXECUTE_r(
3198 PerlIO_printf(Perl_debug_log,
3199 "%*s %smatched empty string...%s\n",
5bc10b2c 3200 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3201 );
3202 break;
3203 } else {
3204 DEBUG_EXECUTE_r(
3205 PerlIO_printf(Perl_debug_log,
786e8c11 3206 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3207 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3208 );
3209 sayNO_SILENT;
3210 }
3211 }
166ba7cd 3212
786e8c11
YO
3213 {
3214 U8 *uc = ( U8* )locinput;
3215
3216 STRLEN len = 0;
3217 STRLEN foldlen = 0;
3218 U8 *uscan = (U8*)NULL;
786e8c11 3219 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3220 U32 charcount = 0; /* how many input chars we have matched */
3221 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3222
786e8c11
YO
3223 ST.B = next;
3224 ST.jump = trie->jump;
786e8c11 3225 ST.me = scan;
2e64971a
DM
3226 ST.firstpos = NULL;
3227 ST.longfold = FALSE; /* char longer if folded => it's harder */
3228 ST.nextword = 0;
3229
3230 /* fully traverse the TRIE; note the position of the
3231 shortest accept state and the wordnum of the longest
3232 accept state */
07be1b83 3233
a3621e74 3234 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3235 U32 base = trie->states[ state ].trans.base;
f9f4320a 3236 UV uvc = 0;
acb909b4 3237 U16 charid = 0;
2e64971a
DM
3238 U16 wordnum;
3239 wordnum = trie->states[ state ].wordnum;
3240
3241 if (wordnum) { /* it's an accept state */
3242 if (!accepted) {
3243 accepted = 1;
3244 /* record first match position */
3245 if (ST.longfold) {
3246 ST.firstpos = (U8*)locinput;
3247 ST.firstchars = 0;
5b47454d 3248 }
2e64971a
DM
3249 else {
3250 ST.firstpos = uc;
3251 ST.firstchars = charcount;
3252 }
3253 }
3254 if (!ST.nextword || wordnum < ST.nextword)
3255 ST.nextword = wordnum;
3256 ST.topword = wordnum;
786e8c11 3257 }
a3621e74 3258
07be1b83 3259 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3260 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3261 PerlIO_printf( Perl_debug_log,
2e64971a 3262 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3263 2+depth * 2, "", PL_colors[4],
2e64971a 3264 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3265 });
a3621e74 3266
2e64971a 3267 /* read a char and goto next state */
a3621e74 3268 if ( base ) {
6dd2be57 3269 I32 offset;
55eed653
NC
3270 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3271 uscan, len, uvc, charid, foldlen,
3272 foldbuf, uniflags);
2e64971a
DM
3273 charcount++;
3274 if (foldlen>0)
3275 ST.longfold = TRUE;
5b47454d 3276 if (charid &&
6dd2be57
DM
3277 ( ((offset =
3278 base + charid - 1 - trie->uniquecharcount)) >= 0)
3279
3280 && ((U32)offset < trie->lasttrans)
3281 && trie->trans[offset].check == state)
5b47454d 3282 {
6dd2be57 3283 state = trie->trans[offset].next;
5b47454d
DM
3284 }
3285 else {
3286 state = 0;
3287 }
3288 uc += len;
3289
3290 }
3291 else {
a3621e74
YO
3292 state = 0;
3293 }
3294 DEBUG_TRIE_EXECUTE_r(
e4584336 3295 PerlIO_printf( Perl_debug_log,
786e8c11 3296 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3297 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3298 );
3299 }
2e64971a 3300 if (!accepted)
a3621e74 3301 sayNO;
a3621e74 3302
2e64971a
DM
3303 /* calculate total number of accept states */
3304 {
3305 U16 w = ST.topword;
3306 accepted = 0;
3307 while (w) {
3308 w = trie->wordinfo[w].prev;
3309 accepted++;
3310 }
3311 ST.accepted = accepted;
3312 }
3313
166ba7cd
DM
3314 DEBUG_EXECUTE_r(
3315 PerlIO_printf( Perl_debug_log,
3316 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3317 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3318 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3319 );
2e64971a 3320 goto trie_first_try; /* jump into the fail handler */
786e8c11 3321 }}
fae667d5 3322 /* NOTREACHED */
2e64971a
DM
3323
3324 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3325 if ( ST.jump) {
3326 REGCP_UNWIND(ST.cp);
3327 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3328 PL_regoffs[n].end = -1;
fae667d5
YO
3329 *PL_reglastparen = n;
3330 }
2e64971a
DM
3331 if (!--ST.accepted) {
3332 DEBUG_EXECUTE_r({
3333 PerlIO_printf( Perl_debug_log,
3334 "%*s %sTRIE failed...%s\n",
3335 REPORT_CODE_OFF+depth*2, "",
3336 PL_colors[4],
3337 PL_colors[5] );
3338 });
3339 sayNO_SILENT;
3340 }
3341 {
3342 /* Find next-highest word to process. Note that this code
3343 * is O(N^2) per trie run (O(N) per branch), so keep tight */
d9a396a3
DM
3344 register U16 min = 0;
3345 register U16 word;
2e64971a
DM
3346 register U16 const nextword = ST.nextword;
3347 register reg_trie_wordinfo * const wordinfo
3348 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3349 for (word=ST.topword; word; word=wordinfo[word].prev) {
3350 if (word > nextword && (!min || word < min))
3351 min = word;
3352 }
3353 ST.nextword = min;
3354 }
3355
fae667d5 3356 trie_first_try:
5d458dd8
YO
3357 if (do_cutgroup) {
3358 do_cutgroup = 0;
3359 no_final = 0;
3360 }
fae667d5
YO
3361
3362 if ( ST.jump) {
3363 ST.lastparen = *PL_reglastparen;
3364 REGCP_SET(ST.cp);
2e64971a 3365 }
a3621e74 3366
2e64971a 3367 /* find start char of end of current word */
166ba7cd 3368 {
2e64971a
DM
3369 U32 chars; /* how many chars to skip */
3370 U8 *uc = ST.firstpos;
3371 reg_trie_data * const trie
3372 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3373