This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: utf8 doesn't match /i nonutf8 self
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
f2ed9b32 103#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
20d0b1e9 120/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
37e2e78e
KW
123
124/* Doesn't do an assert to verify that is correct */
125#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
127
1a4fad37
AL
128#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
129#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
130#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 131
37e2e78e 132#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
133 LOAD_UTF8_CHARCLASS(X_begin, " "); \
134 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
135 /* These are utf8 constants, and not utf-ebcdic constants, so the \
136 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 137 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
138 \
139 /* No asserts are done for these, in case called on an early \
140 * Unicode version in which they map to nothing */ \
d275fa5e
CB
141 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 148
d1eb3177
YO
149/*
150 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
151 so that it is possible to override the option here without having to
152 rebuild the entire core. as we are required to do if we change regcomp.h
153 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
154*/
155#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
156#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
157#endif
158
159#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
160#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
161#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
162#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
163#define RE_utf8_perl_word PL_utf8_alnum
164#define RE_utf8_perl_space PL_utf8_space
165#define RE_utf8_posix_digit PL_utf8_digit
166#define perl_word alnum
167#define perl_space space
168#define posix_digit digit
169#else
170#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
171#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
172#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
173#define RE_utf8_perl_word PL_utf8_perl_word
174#define RE_utf8_perl_space PL_utf8_perl_space
175#define RE_utf8_posix_digit PL_utf8_posix_digit
176#endif
177
178
c02a0702
KW
179#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
180 case NAMEL: \
181 PL_reg_flags |= RF_tainted; \
182 /* FALL THROUGH */ \
183 case NAME: \
184 if (!nextchr) \
185 sayNO; \
186 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
187 if (!CAT2(PL_utf8_,CLASS)) { \
188 bool ok; \
189 ENTER; \
190 save_re_context(); \
191 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
192 assert(ok); \
193 LEAVE; \
194 } \
195 if (!(OP(scan) == NAME \
f2ed9b32 196 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
c02a0702
KW
197 : LCFUNC_utf8((U8*)locinput))) \
198 { \
199 sayNO; \
200 } \
201 locinput += PL_utf8skip[nextchr]; \
202 nextchr = UCHARAT(locinput); \
203 break; \
204 } \
a12cf05f 205 /* Drops through to the macro that calls this one */
c02a0702
KW
206
207#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
208 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
209 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
210 sayNO; \
211 nextchr = UCHARAT(++locinput); \
20d0b1e9
YO
212 break
213
b10ac0d8
KW
214/* Almost identical to the above, but has a case for a node that matches chars
215 * between 128 and 255 using Unicode (latin1) semantics. */
216#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
c02a0702 217 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
b10ac0d8 218 if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
c02a0702
KW
219 sayNO; \
220 nextchr = UCHARAT(++locinput); \
b10ac0d8
KW
221 break
222
c02a0702
KW
223#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
224 case NAMEL: \
225 PL_reg_flags |= RF_tainted; \
226 /* FALL THROUGH */ \
227 case NAME : \
228 if (!nextchr && locinput >= PL_regeol) \
229 sayNO; \
230 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
231 if (!CAT2(PL_utf8_,CLASS)) { \
232 bool ok; \
233 ENTER; \
234 save_re_context(); \
235 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
236 assert(ok); \
237 LEAVE; \
238 } \
239 if ((OP(scan) == NAME \
f2ed9b32 240 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
c02a0702
KW
241 : LCFUNC_utf8((U8*)locinput))) \
242 { \
243 sayNO; \
244 } \
245 locinput += PL_utf8skip[nextchr]; \
246 nextchr = UCHARAT(locinput); \
247 break; \
b10ac0d8
KW
248 }
249
c02a0702
KW
250#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
251 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
252 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
253 sayNO; \
254 nextchr = UCHARAT(++locinput); \
20d0b1e9
YO
255 break
256
257
c02a0702
KW
258#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
259 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
b10ac0d8 260 if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
c02a0702
KW
261 sayNO; \
262 nextchr = UCHARAT(++locinput); \
b10ac0d8 263 break
d1eb3177
YO
264
265
266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
296#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
302#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
304
305#endif
e2d8ce26 306
a84d97b6
HS
307/*
308 Search for mandatory following text node; for lookahead, the text must
309 follow but for lookbehind (rn->flags != 0) we skip to the next step.
310*/
cca55fe3 311#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
312 while (JUMPABLE(rn)) { \
313 const OPCODE type = OP(rn); \
314 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 315 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 316 else if (type == PLUS) \
cca55fe3 317 rn = NEXTOPER(rn); \
3dab1dad 318 else if (type == IFMATCH) \
a84d97b6 319 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 320 else rn += NEXT_OFF(rn); \
3dab1dad 321 } \
5f80c4cf 322} STMT_END
74750237 323
c476f425 324
acfe0abc 325static void restore_pos(pTHX_ void *arg);
51371543 326
620d5b66
NC
327#define REGCP_PAREN_ELEMS 4
328#define REGCP_OTHER_ELEMS 5
e0fa7e2b 329#define REGCP_FRAME_ELEMS 1
620d5b66
NC
330/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
331 * are needed for the regexp context stack bookkeeping. */
332
76e3520e 333STATIC CHECKPOINT
cea2e8a9 334S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 335{
97aff369 336 dVAR;
a3b680e6 337 const int retval = PL_savestack_ix;
a3b680e6 338 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
339 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
340 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 341 int p;
40a82448 342 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 343
e49a9654
IH
344 if (paren_elems_to_push < 0)
345 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
346
e0fa7e2b
NC
347 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
348 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
349 " out of range (%lu-%ld)",
350 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 351
620d5b66 352 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 353
3280af22 354 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 355/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
356 SSPUSHINT(PL_regoffs[p].end);
357 SSPUSHINT(PL_regoffs[p].start);
3280af22 358 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 359 SSPUSHINT(p);
e7707071 360 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 361 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 362 (UV)p, (IV)PL_regoffs[p].start,
40a82448 363 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 364 (IV)PL_regoffs[p].end
40a82448 365 ));
a0d0e21e 366 }
b1ce53c5 367/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 368 SSPUSHPTR(PL_regoffs);
3280af22
NIS
369 SSPUSHINT(PL_regsize);
370 SSPUSHINT(*PL_reglastparen);
a01268b5 371 SSPUSHINT(*PL_reglastcloseparen);
3280af22 372 SSPUSHPTR(PL_reginput);
e0fa7e2b 373 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 374
a0d0e21e
LW
375 return retval;
376}
377
c277df42 378/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
379#define REGCP_SET(cp) \
380 DEBUG_STATE_r( \
ab3bbdeb 381 PerlIO_printf(Perl_debug_log, \
e4f74956 382 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
383 (IV)PL_savestack_ix)); \
384 cp = PL_savestack_ix
c3464db5 385
ab3bbdeb 386#define REGCP_UNWIND(cp) \
e4f74956 387 DEBUG_STATE_r( \
ab3bbdeb 388 if (cp != PL_savestack_ix) \
e4f74956
YO
389 PerlIO_printf(Perl_debug_log, \
390 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
391 (IV)(cp), (IV)PL_savestack_ix)); \
392 regcpblow(cp)
c277df42 393
76e3520e 394STATIC char *
097eb12c 395S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 396{
97aff369 397 dVAR;
e0fa7e2b 398 UV i;
a0d0e21e 399 char *input;
a3621e74
YO
400 GET_RE_DEBUG_FLAGS_DECL;
401
7918f24d
NC
402 PERL_ARGS_ASSERT_REGCPPOP;
403
b1ce53c5 404 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 405 i = SSPOPUV;
e0fa7e2b
NC
406 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
407 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 408 input = (char *) SSPOPPTR;
a01268b5 409 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
410 *PL_reglastparen = SSPOPINT;
411 PL_regsize = SSPOPINT;
f0ab9afb 412 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 413
620d5b66 414 i -= REGCP_OTHER_ELEMS;
b1ce53c5 415 /* Now restore the parentheses context. */
620d5b66 416 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 417 I32 tmps;
097eb12c 418 U32 paren = (U32)SSPOPINT;
3280af22 419 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 420 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 421 tmps = SSPOPINT;
3280af22 422 if (paren <= *PL_reglastparen)
f0ab9afb 423 PL_regoffs[paren].end = tmps;
e7707071 424 DEBUG_BUFFERS_r(
c3464db5 425 PerlIO_printf(Perl_debug_log,
b900a521 426 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 427 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 428 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 429 (IV)PL_regoffs[paren].end,
3280af22 430 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 431 );
a0d0e21e 432 }
e7707071 433 DEBUG_BUFFERS_r(
bb7a0f54 434 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 435 PerlIO_printf(Perl_debug_log,
faccc32b 436 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 437 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
438 }
439 );
daf18116 440#if 1
dafc8851
JH
441 /* It would seem that the similar code in regtry()
442 * already takes care of this, and in fact it is in
443 * a better location to since this code can #if 0-ed out
444 * but the code in regtry() is needed or otherwise tests
445 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
446 * (as of patchlevel 7877) will fail. Then again,
447 * this code seems to be necessary or otherwise
225593e1
DM
448 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
449 * --jhi updated by dapm */
3b6647e0 450 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 451 if (i > PL_regsize)
f0ab9afb
NC
452 PL_regoffs[i].start = -1;
453 PL_regoffs[i].end = -1;
a0d0e21e 454 }
dafc8851 455#endif
a0d0e21e
LW
456 return input;
457}
458
02db2b7b 459#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 460
a687059c 461/*
e50aee73 462 * pregexec and friends
a687059c
LW
463 */
464
76234dfb 465#ifndef PERL_IN_XSUB_RE
a687059c 466/*
c277df42 467 - pregexec - match a regexp against a string
a687059c 468 */
c277df42 469I32
49d7dfbc 470Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 471 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
472/* strend: pointer to null at end of string */
473/* strbeg: real beginning of string */
474/* minend: end of match must be >=minend after stringarg. */
475/* nosave: For optimizations. */
476{
7918f24d
NC
477 PERL_ARGS_ASSERT_PREGEXEC;
478
c277df42 479 return
9041c2e3 480 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
481 nosave ? 0 : REXEC_COPY_STR);
482}
76234dfb 483#endif
22e551b9 484
9041c2e3 485/*
cad2e5aa
JH
486 * Need to implement the following flags for reg_anch:
487 *
488 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
489 * USE_INTUIT_ML
490 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
491 * INTUIT_AUTORITATIVE_ML
492 * INTUIT_ONCE_NOML - Intuit can match in one location only.
493 * INTUIT_ONCE_ML
494 *
495 * Another flag for this function: SECOND_TIME (so that float substrs
496 * with giant delta may be not rechecked).
497 */
498
499/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
500
3f7c398e 501/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
502 Otherwise, only SvCUR(sv) is used to get strbeg. */
503
504/* XXXX We assume that strpos is strbeg unless sv. */
505
6eb5f6b9
JH
506/* XXXX Some places assume that there is a fixed substring.
507 An update may be needed if optimizer marks as "INTUITable"
508 RExen without fixed substrings. Similarly, it is assumed that
509 lengths of all the strings are no more than minlen, thus they
510 cannot come from lookahead.
40d049e4
YO
511 (Or minlen should take into account lookahead.)
512 NOTE: Some of this comment is not correct. minlen does now take account
513 of lookahead/behind. Further research is required. -- demerphq
514
515*/
6eb5f6b9 516
2c2d71f5
JH
517/* A failure to find a constant substring means that there is no need to make
518 an expensive call to REx engine, thus we celebrate a failure. Similarly,
519 finding a substring too deep into the string means that less calls to
30944b6d
IZ
520 regtry() should be needed.
521
522 REx compiler's optimizer found 4 possible hints:
523 a) Anchored substring;
524 b) Fixed substring;
525 c) Whether we are anchored (beginning-of-line or \G);
526 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 527 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
528 string which does not contradict any of them.
529 */
2c2d71f5 530
6eb5f6b9
JH
531/* Most of decisions we do here should have been done at compile time.
532 The nodes of the REx which we used for the search should have been
533 deleted from the finite automaton. */
534
cad2e5aa 535char *
288b8c02 536Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 537 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 538{
97aff369 539 dVAR;
288b8c02 540 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 541 register I32 start_shift = 0;
cad2e5aa 542 /* Should be nonnegative! */
b7953727 543 register I32 end_shift = 0;
2c2d71f5
JH
544 register char *s;
545 register SV *check;
a1933d95 546 char *strbeg;
cad2e5aa 547 char *t;
f2ed9b32 548 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 549 I32 ml_anch;
bd61b366
SS
550 register char *other_last = NULL; /* other substr checked before this */
551 char *check_at = NULL; /* check substr found at this pos */
bbe252da 552 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 553 RXi_GET_DECL(prog,progi);
30944b6d 554#ifdef DEBUGGING
890ce7af 555 const char * const i_strpos = strpos;
30944b6d 556#endif
a3621e74
YO
557 GET_RE_DEBUG_FLAGS_DECL;
558
7918f24d
NC
559 PERL_ARGS_ASSERT_RE_INTUIT_START;
560
f2ed9b32 561 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 562
3c8556c3 563 if (RX_UTF8(rx)) {
b8d68ded
JH
564 PL_reg_flags |= RF_utf8;
565 }
ab3bbdeb 566 DEBUG_EXECUTE_r(
f2ed9b32 567 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
568 sv ? "Guessing start of match in sv for"
569 : "Guessing start of match in string for");
2a782b5b 570 );
cad2e5aa 571
c344f387
JH
572 /* CHR_DIST() would be more correct here but it makes things slow. */
573 if (prog->minlen > strend - strpos) {
a3621e74 574 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 575 "String too short... [re_intuit_start]\n"));
cad2e5aa 576 goto fail;
2c2d71f5 577 }
1de06328 578
a1933d95 579 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 580 PL_regeol = strend;
f2ed9b32 581 if (utf8_target) {
33b8afdf
JH
582 if (!prog->check_utf8 && prog->check_substr)
583 to_utf8_substr(prog);
584 check = prog->check_utf8;
585 } else {
586 if (!prog->check_substr && prog->check_utf8)
587 to_byte_substr(prog);
588 check = prog->check_substr;
589 }
1de06328 590 if (check == &PL_sv_undef) {
a3621e74 591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 592 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
593 goto fail;
594 }
bbe252da
YO
595 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
596 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
597 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 598 && !multiline ) ); /* Check after \n? */
cad2e5aa 599
7e25d62c 600 if (!ml_anch) {
bbe252da
YO
601 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
602 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 603 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
604 && sv && !SvROK(sv)
605 && (strpos != strbeg)) {
a3621e74 606 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
607 goto fail;
608 }
609 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 610 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 611 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
612 I32 slen;
613
1aa99e6b 614 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 615
653099ff
GS
616 if (SvTAIL(check)) {
617 slen = SvCUR(check); /* >= 1 */
cad2e5aa 618
9041c2e3 619 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 620 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 622 goto fail_finish;
cad2e5aa
JH
623 }
624 /* Now should match s[0..slen-2] */
625 slen--;
3f7c398e 626 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 627 || (slen > 1
3f7c398e 628 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 629 report_neq:
a3621e74 630 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
631 goto fail_finish;
632 }
cad2e5aa 633 }
3f7c398e 634 else if (*SvPVX_const(check) != *s
653099ff 635 || ((slen = SvCUR(check)) > 1
3f7c398e 636 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 637 goto report_neq;
c315bfe8 638 check_at = s;
2c2d71f5 639 goto success_at_start;
7e25d62c 640 }
cad2e5aa 641 }
2c2d71f5 642 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 643 s = strpos;
2c2d71f5 644 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
645 end_shift = prog->check_end_shift;
646
2c2d71f5 647 if (!ml_anch) {
a3b680e6 648 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 649 - (SvTAIL(check) != 0);
a3b680e6 650 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
651
652 if (end_shift < eshift)
653 end_shift = eshift;
654 }
cad2e5aa 655 }
2c2d71f5 656 else { /* Can match at random position */
cad2e5aa
JH
657 ml_anch = 0;
658 s = strpos;
1de06328
YO
659 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
660 end_shift = prog->check_end_shift;
661
662 /* end shift should be non negative here */
cad2e5aa
JH
663 }
664
bcdf7404 665#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 666 if (end_shift < 0)
1de06328 667 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 668 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
669#endif
670
2c2d71f5
JH
671 restart:
672 /* Find a possible match in the region s..strend by looking for
673 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
674
675 {
676 I32 srch_start_shift = start_shift;
677 I32 srch_end_shift = end_shift;
678 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
679 srch_end_shift -= ((strbeg - s) - srch_start_shift);
680 srch_start_shift = strbeg - s;
681 }
6bda09f9 682 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
683 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
684 (IV)prog->check_offset_min,
685 (IV)srch_start_shift,
686 (IV)srch_end_shift,
687 (IV)prog->check_end_shift);
688 });
689
cad2e5aa 690 if (flags & REXEC_SCREAM) {
cad2e5aa 691 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 692 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 693
2c2d71f5
JH
694 if (PL_screamfirst[BmRARE(check)] >= 0
695 || ( BmRARE(check) == '\n'
85c508c3 696 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 697 && SvTAIL(check) ))
9041c2e3 698 s = screaminstr(sv, check,
1de06328 699 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 700 else
2c2d71f5 701 goto fail_finish;
4addbd3b 702 /* we may be pointing at the wrong string */
07bc277f 703 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 704 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
705 if (data)
706 *data->scream_olds = s;
707 }
1de06328
YO
708 else {
709 U8* start_point;
710 U8* end_point;
bbe252da 711 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
712 start_point= (U8*)(s + srch_start_shift);
713 end_point= (U8*)(strend - srch_end_shift);
714 } else {
715 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
716 end_point= HOP3(strend, -srch_end_shift, strbeg);
717 }
6bda09f9 718 DEBUG_OPTIMISE_MORE_r({
56570a2c 719 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 720 (int)(end_point - start_point),
fc8cd66c 721 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
722 start_point);
723 });
724
725 s = fbm_instr( start_point, end_point,
7fba1cd6 726 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
727 }
728 }
cad2e5aa
JH
729 /* Update the count-of-usability, remove useless subpatterns,
730 unshift s. */
2c2d71f5 731
ab3bbdeb 732 DEBUG_EXECUTE_r({
f2ed9b32 733 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
734 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
735 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 736 (s ? "Found" : "Did not find"),
f2ed9b32 737 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
738 ? "anchored" : "floating"),
739 quoted,
740 RE_SV_TAIL(check),
741 (s ? " at offset " : "...\n") );
742 });
2c2d71f5
JH
743
744 if (!s)
745 goto fail_finish;
2c2d71f5 746 /* Finish the diagnostic message */
a3621e74 747 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 748
1de06328
YO
749 /* XXX dmq: first branch is for positive lookbehind...
750 Our check string is offset from the beginning of the pattern.
751 So we need to do any stclass tests offset forward from that
752 point. I think. :-(
753 */
754
755
756
757 check_at=s;
758
759
2c2d71f5
JH
760 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
761 Start with the other substr.
762 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 763 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
764 *always* match. Probably should be marked during compile...
765 Probably it is right to do no SCREAM here...
766 */
767
f2ed9b32 768 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
769 : (prog->float_substr && prog->anchored_substr))
770 {
30944b6d 771 /* Take into account the "other" substring. */
2c2d71f5
JH
772 /* XXXX May be hopelessly wrong for UTF... */
773 if (!other_last)
6eb5f6b9 774 other_last = strpos;
f2ed9b32 775 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
776 do_other_anchored:
777 {
890ce7af
AL
778 char * const last = HOP3c(s, -start_shift, strbeg);
779 char *last1, *last2;
be8e71aa 780 char * const saved_s = s;
33b8afdf 781 SV* must;
2c2d71f5 782
2c2d71f5
JH
783 t = s - prog->check_offset_max;
784 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 785 && (!utf8_target
0ce71af7 786 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 787 && t > strpos)))
6f207bd3 788 NOOP;
2c2d71f5
JH
789 else
790 t = strpos;
1aa99e6b 791 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
792 if (t < other_last) /* These positions already checked */
793 t = other_last;
1aa99e6b 794 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
795 if (last < last1)
796 last1 = last;
1de06328
YO
797 /* XXXX It is not documented what units *_offsets are in.
798 We assume bytes, but this is clearly wrong.
799 Meaning this code needs to be carefully reviewed for errors.
800 dmq.
801 */
802
2c2d71f5 803 /* On end-of-str: see comment below. */
f2ed9b32 804 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
805 if (must == &PL_sv_undef) {
806 s = (char*)NULL;
1de06328 807 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
808 }
809 else
810 s = fbm_instr(
811 (unsigned char*)t,
812 HOP3(HOP3(last1, prog->anchored_offset, strend)
813 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
814 must,
7fba1cd6 815 multiline ? FBMrf_MULTILINE : 0
33b8afdf 816 );
ab3bbdeb 817 DEBUG_EXECUTE_r({
f2ed9b32 818 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
819 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
820 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 821 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
822 quoted, RE_SV_TAIL(must));
823 });
824
825
2c2d71f5
JH
826 if (!s) {
827 if (last1 >= last2) {
a3621e74 828 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
829 ", giving up...\n"));
830 goto fail_finish;
831 }
a3621e74 832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 833 ", trying floating at offset %ld...\n",
be8e71aa 834 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
835 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
836 s = HOP3c(last, 1, strend);
2c2d71f5
JH
837 goto restart;
838 }
839 else {
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 841 (long)(s - i_strpos)));
1aa99e6b
IH
842 t = HOP3c(s, -prog->anchored_offset, strbeg);
843 other_last = HOP3c(s, 1, strend);
be8e71aa 844 s = saved_s;
2c2d71f5
JH
845 if (t == strpos)
846 goto try_at_start;
2c2d71f5
JH
847 goto try_at_offset;
848 }
30944b6d 849 }
2c2d71f5
JH
850 }
851 else { /* Take into account the floating substring. */
33b8afdf 852 char *last, *last1;
be8e71aa 853 char * const saved_s = s;
33b8afdf
JH
854 SV* must;
855
856 t = HOP3c(s, -start_shift, strbeg);
857 last1 = last =
858 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
859 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
860 last = HOP3c(t, prog->float_max_offset, strend);
861 s = HOP3c(t, prog->float_min_offset, strend);
862 if (s < other_last)
863 s = other_last;
2c2d71f5 864 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 865 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
866 /* fbm_instr() takes into account exact value of end-of-str
867 if the check is SvTAIL(ed). Since false positives are OK,
868 and end-of-str is not later than strend we are OK. */
869 if (must == &PL_sv_undef) {
870 s = (char*)NULL;
1de06328 871 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
872 }
873 else
2c2d71f5 874 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
875 (unsigned char*)last + SvCUR(must)
876 - (SvTAIL(must)!=0),
7fba1cd6 877 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 878 DEBUG_EXECUTE_r({
f2ed9b32 879 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
880 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
881 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 882 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
883 quoted, RE_SV_TAIL(must));
884 });
33b8afdf
JH
885 if (!s) {
886 if (last1 == last) {
a3621e74 887 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
888 ", giving up...\n"));
889 goto fail_finish;
2c2d71f5 890 }
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 892 ", trying anchored starting at offset %ld...\n",
be8e71aa 893 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
894 other_last = last;
895 s = HOP3c(t, 1, strend);
896 goto restart;
897 }
898 else {
a3621e74 899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
900 (long)(s - i_strpos)));
901 other_last = s; /* Fix this later. --Hugo */
be8e71aa 902 s = saved_s;
33b8afdf
JH
903 if (t == strpos)
904 goto try_at_start;
905 goto try_at_offset;
906 }
2c2d71f5 907 }
cad2e5aa 908 }
2c2d71f5 909
1de06328 910
9ef43ace 911 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 912
6bda09f9 913 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
914 PerlIO_printf(Perl_debug_log,
915 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
916 (IV)prog->check_offset_min,
917 (IV)prog->check_offset_max,
918 (IV)(s-strpos),
919 (IV)(t-strpos),
920 (IV)(t-s),
921 (IV)(strend-strpos)
922 )
923 );
924
2c2d71f5 925 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 926 && (!utf8_target
9ef43ace 927 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
928 && t > strpos)))
929 {
2c2d71f5
JH
930 /* Fixed substring is found far enough so that the match
931 cannot start at strpos. */
932 try_at_offset:
cad2e5aa 933 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
934 /* Eventually fbm_*() should handle this, but often
935 anchored_offset is not 0, so this check will not be wasted. */
936 /* XXXX In the code below we prefer to look for "^" even in
937 presence of anchored substrings. And we search even
938 beyond the found float position. These pessimizations
939 are historical artefacts only. */
940 find_anchor:
2c2d71f5 941 while (t < strend - prog->minlen) {
cad2e5aa 942 if (*t == '\n') {
4ee3650e 943 if (t < check_at - prog->check_offset_min) {
f2ed9b32 944 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
945 /* Since we moved from the found position,
946 we definitely contradict the found anchored
30944b6d
IZ
947 substr. Due to the above check we do not
948 contradict "check" substr.
949 Thus we can arrive here only if check substr
950 is float. Redo checking for "other"=="fixed".
951 */
9041c2e3 952 strpos = t + 1;
a3621e74 953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 954 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
955 goto do_other_anchored;
956 }
4ee3650e
GS
957 /* We don't contradict the found floating substring. */
958 /* XXXX Why not check for STCLASS? */
cad2e5aa 959 s = t + 1;
a3621e74 960 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 961 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
962 goto set_useful;
963 }
4ee3650e
GS
964 /* Position contradicts check-string */
965 /* XXXX probably better to look for check-string
966 than for "\n", so one should lower the limit for t? */
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 968 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 969 other_last = strpos = s = t + 1;
cad2e5aa
JH
970 goto restart;
971 }
972 t++;
973 }
a3621e74 974 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 975 PL_colors[0], PL_colors[1]));
2c2d71f5 976 goto fail_finish;
cad2e5aa 977 }
f5952150 978 else {
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 980 PL_colors[0], PL_colors[1]));
f5952150 981 }
cad2e5aa
JH
982 s = t;
983 set_useful:
f2ed9b32 984 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
985 }
986 else {
f5952150 987 /* The found string does not prohibit matching at strpos,
2c2d71f5 988 - no optimization of calling REx engine can be performed,
f5952150
GS
989 unless it was an MBOL and we are not after MBOL,
990 or a future STCLASS check will fail this. */
2c2d71f5
JH
991 try_at_start:
992 /* Even in this situation we may use MBOL flag if strpos is offset
993 wrt the start of the string. */
05b4157f 994 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 995 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 996 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 997 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 998 {
cad2e5aa
JH
999 t = strpos;
1000 goto find_anchor;
1001 }
a3621e74 1002 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1003 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1004 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1005 );
2c2d71f5 1006 success_at_start:
bbe252da 1007 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1008 && (utf8_target ? (
33b8afdf
JH
1009 prog->check_utf8 /* Could be deleted already */
1010 && --BmUSEFUL(prog->check_utf8) < 0
1011 && (prog->check_utf8 == prog->float_utf8)
1012 ) : (
1013 prog->check_substr /* Could be deleted already */
1014 && --BmUSEFUL(prog->check_substr) < 0
1015 && (prog->check_substr == prog->float_substr)
1016 )))
66e933ab 1017 {
cad2e5aa 1018 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1019 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1020 /* XXX Does the destruction order has to change with utf8_target? */
1021 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1022 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1023 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1024 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1025 check = NULL; /* abort */
cad2e5aa 1026 s = strpos;
c9415951
YO
1027 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1028 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1029 if (prog->intflags & PREGf_IMPLICIT)
1030 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1031 /* XXXX This is a remnant of the old implementation. It
1032 looks wasteful, since now INTUIT can use many
6eb5f6b9 1033 other heuristics. */
bbe252da 1034 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1035 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1036 }
1037 else
1038 s = strpos;
1039 }
1040
6eb5f6b9
JH
1041 /* Last resort... */
1042 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1043 /* trie stclasses are too expensive to use here, we are better off to
1044 leave it to regmatch itself */
f8fc2ecf 1045 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1046 /* minlen == 0 is possible if regstclass is \b or \B,
1047 and the fixed substr is ''$.
1048 Since minlen is already taken into account, s+1 is before strend;
1049 accidentally, minlen >= 1 guaranties no false positives at s + 1
1050 even for \b or \B. But (minlen? 1 : 0) below assumes that
1051 regstclass does not come from lookahead... */
1052 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1053 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
1054 const U8* const str = (U8*)STRING(progi->regstclass);
1055 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1056 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1057 : 1);
1de06328
YO
1058 char * endpos;
1059 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1060 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1061 else if (prog->float_substr || prog->float_utf8)
1062 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1063 else
1064 endpos= strend;
1065
70685ca0
JH
1066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1067 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1068
6eb5f6b9 1069 t = s;
f8fc2ecf 1070 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1071 if (!s) {
1072#ifdef DEBUGGING
cbbf8932 1073 const char *what = NULL;
6eb5f6b9
JH
1074#endif
1075 if (endpos == strend) {
a3621e74 1076 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1077 "Could not match STCLASS...\n") );
1078 goto fail;
1079 }
a3621e74 1080 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1081 "This position contradicts STCLASS...\n") );
bbe252da 1082 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1083 goto fail;
6eb5f6b9 1084 /* Contradict one of substrings */
33b8afdf 1085 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1086 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1087 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1088 hop_and_restart:
1aa99e6b 1089 s = HOP3c(t, 1, strend);
66e933ab
GS
1090 if (s + start_shift + end_shift > strend) {
1091 /* XXXX Should be taken into account earlier? */
a3621e74 1092 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1093 "Could not match STCLASS...\n") );
1094 goto fail;
1095 }
5e39e1e5
HS
1096 if (!check)
1097 goto giveup;
a3621e74 1098 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1099 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1100 what, (long)(s + start_shift - i_strpos)) );
1101 goto restart;
1102 }
66e933ab 1103 /* Have both, check_string is floating */
6eb5f6b9
JH
1104 if (t + start_shift >= check_at) /* Contradicts floating=check */
1105 goto retry_floating_check;
1106 /* Recheck anchored substring, but not floating... */
9041c2e3 1107 s = check_at;
5e39e1e5
HS
1108 if (!check)
1109 goto giveup;
a3621e74 1110 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1111 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1112 (long)(other_last - i_strpos)) );
1113 goto do_other_anchored;
1114 }
60e71179
GS
1115 /* Another way we could have checked stclass at the
1116 current position only: */
1117 if (ml_anch) {
1118 s = t = t + 1;
5e39e1e5
HS
1119 if (!check)
1120 goto giveup;
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1122 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1123 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1124 goto try_at_offset;
66e933ab 1125 }
f2ed9b32 1126 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1127 goto fail;
6eb5f6b9
JH
1128 /* Check is floating subtring. */
1129 retry_floating_check:
1130 t = check_at - start_shift;
a3621e74 1131 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1132 goto hop_and_restart;
1133 }
b7953727 1134 if (t != s) {
a3621e74 1135 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1136 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1137 (long)(t - i_strpos), (long)(s - i_strpos))
1138 );
1139 }
1140 else {
a3621e74 1141 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1142 "Does not contradict STCLASS...\n");
1143 );
1144 }
6eb5f6b9 1145 }
5e39e1e5 1146 giveup:
a3621e74 1147 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1148 PL_colors[4], (check ? "Guessed" : "Giving up"),
1149 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1150 return s;
2c2d71f5
JH
1151
1152 fail_finish: /* Substring not found */
33b8afdf 1153 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1154 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1155 fail:
a3621e74 1156 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1157 PL_colors[4], PL_colors[5]));
bd61b366 1158 return NULL;
cad2e5aa 1159}
9661b544 1160
a0a388a1
YO
1161#define DECL_TRIE_TYPE(scan) \
1162 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1163 trie_type = (scan->flags != EXACT) \
f2ed9b32
KW
1164 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1165 : (utf8_target ? trie_utf8 : trie_plain)
3b0527fe 1166
55eed653
NC
1167#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1168uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1169 switch (trie_type) { \
1170 case trie_utf8_fold: \
1171 if ( foldlen>0 ) { \
0abd0d78 1172 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1173 foldlen -= len; \
1174 uscan += len; \
1175 len=0; \
1176 } else { \
0abd0d78 1177 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1178 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1179 foldlen -= UNISKIP( uvc ); \
1180 uscan = foldbuf + UNISKIP( uvc ); \
1181 } \
1182 break; \
a0a388a1
YO
1183 case trie_latin_utf8_fold: \
1184 if ( foldlen>0 ) { \
1185 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1186 foldlen -= len; \
1187 uscan += len; \
1188 len=0; \
1189 } else { \
1190 len = 1; \
1191 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1192 foldlen -= UNISKIP( uvc ); \
1193 uscan = foldbuf + UNISKIP( uvc ); \
1194 } \
1195 break; \
4cadc6a9
YO
1196 case trie_utf8: \
1197 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1198 break; \
1199 case trie_plain: \
1200 uvc = (UV)*uc; \
1201 len = 1; \
1202 } \
4cadc6a9
YO
1203 if (uvc < 256) { \
1204 charid = trie->charmap[ uvc ]; \
1205 } \
1206 else { \
1207 charid = 0; \
55eed653
NC
1208 if (widecharmap) { \
1209 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1210 (char*)&uvc, sizeof(UV), 0); \
1211 if (svpp) \
1212 charid = (U16)SvIV(*svpp); \
1213 } \
1214 } \
1215} STMT_END
1216
a0a388a1
YO
1217#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1218{ \
1219 char *my_strend= (char *)strend; \
4cadc6a9
YO
1220 if ( (CoNd) \
1221 && (ln == len || \
f2ed9b32
KW
1222 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1223 m, NULL, ln, cBOOL(UTF_PATTERN))) \
a0a388a1 1224 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1225 goto got_it; \
1226 else { \
1227 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1228 uvchr_to_utf8(tmpbuf, c); \
1229 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1230 if ( f != c \
1231 && (f == c1 || f == c2) \
a0a388a1 1232 && (ln == len || \
f2ed9b32
KW
1233 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1234 m, NULL, ln, cBOOL(UTF_PATTERN)))\
a0a388a1 1235 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1236 goto got_it; \
1237 } \
a0a388a1
YO
1238} \
1239s += len
4cadc6a9
YO
1240
1241#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1242STMT_START { \
1243 while (s <= e) { \
1244 if ( (CoNd) \
4c1b470c
KW
1245 && (ln == 1 || (OP(c) == EXACTF \
1246 ? foldEQ(s, m, ln) \
1247 : foldEQ_locale(s, m, ln))) \
24b23f37 1248 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1249 goto got_it; \
1250 s++; \
1251 } \
1252} STMT_END
1253
1254#define REXEC_FBC_UTF8_SCAN(CoDe) \
1255STMT_START { \
1256 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1257 CoDe \
1258 s += uskip; \
1259 } \
1260} STMT_END
1261
1262#define REXEC_FBC_SCAN(CoDe) \
1263STMT_START { \
1264 while (s < strend) { \
1265 CoDe \
1266 s++; \
1267 } \
1268} STMT_END
1269
1270#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1271REXEC_FBC_UTF8_SCAN( \
1272 if (CoNd) { \
24b23f37 1273 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it; \
1275 else \
1276 tmp = doevery; \
1277 } \
1278 else \
1279 tmp = 1; \
1280)
1281
1282#define REXEC_FBC_CLASS_SCAN(CoNd) \
1283REXEC_FBC_SCAN( \
1284 if (CoNd) { \
24b23f37 1285 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1286 goto got_it; \
1287 else \
1288 tmp = doevery; \
1289 } \
1290 else \
1291 tmp = 1; \
1292)
1293
1294#define REXEC_FBC_TRYIT \
24b23f37 1295if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1296 goto got_it
1297
e1d1eefb 1298#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1299 if (utf8_target) { \
e1d1eefb
YO
1300 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1301 } \
1302 else { \
1303 REXEC_FBC_CLASS_SCAN(CoNd); \
1304 } \
1305 break
1306
4cadc6a9 1307#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1308 if (utf8_target) { \
4cadc6a9
YO
1309 UtFpReLoAd; \
1310 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1311 } \
1312 else { \
1313 REXEC_FBC_CLASS_SCAN(CoNd); \
1314 } \
1315 break
1316
1317#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1318 PL_reg_flags |= RF_tainted; \
f2ed9b32 1319 if (utf8_target) { \
4cadc6a9
YO
1320 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1321 } \
1322 else { \
1323 REXEC_FBC_CLASS_SCAN(CoNd); \
1324 } \
1325 break
1326
786e8c11
YO
1327#define DUMP_EXEC_POS(li,s,doutf8) \
1328 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1329
1330/* We know what class REx starts with. Try to find this position... */
1331/* if reginfo is NULL, its a dryrun */
1332/* annoyingly all the vars in this routine have different names from their counterparts
1333 in regmatch. /grrr */
1334
3c3eec57 1335STATIC char *
07be1b83 1336S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1337 const char *strend, regmatch_info *reginfo)
a687059c 1338{
27da23d5 1339 dVAR;
bbe252da 1340 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1341 char *m;
d8093b23 1342 STRLEN ln;
5dab1207 1343 STRLEN lnc;
078c425b 1344 register STRLEN uskip;
d8093b23
G
1345 unsigned int c1;
1346 unsigned int c2;
6eb5f6b9
JH
1347 char *e;
1348 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1349 register const bool utf8_target = PL_reg_match_utf8;
f8fc2ecf 1350 RXi_GET_DECL(prog,progi);
7918f24d
NC
1351
1352 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1353
6eb5f6b9
JH
1354 /* We know what class it must start with. */
1355 switch (OP(c)) {
6eb5f6b9 1356 case ANYOF:
f2ed9b32 1357 if (utf8_target) {
4cadc6a9 1358 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1359 !UTF8_IS_INVARIANT((U8)s[0]) ?
f2ed9b32 1360 reginclass(prog, c, (U8*)s, 0, utf8_target) :
4cadc6a9 1361 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1362 }
1363 else {
1364 while (s < strend) {
1365 STRLEN skip = 1;
1366
32fc9b6a 1367 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1368 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1369 /* The assignment of 2 is intentional:
1370 * for the folded sharp s, the skip is 2. */
1371 (skip = SHARP_S_SKIP))) {
24b23f37 1372 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1373 goto got_it;
1374 else
1375 tmp = doevery;
1376 }
1377 else
1378 tmp = 1;
1379 s += skip;
1380 }
a0d0e21e 1381 }
6eb5f6b9 1382 break;
f33976b4 1383 case CANY:
4cadc6a9 1384 REXEC_FBC_SCAN(
24b23f37 1385 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1386 goto got_it;
1387 else
1388 tmp = doevery;
4cadc6a9 1389 );
f33976b4 1390 break;
6eb5f6b9 1391 case EXACTF:
5dab1207
NIS
1392 m = STRING(c);
1393 ln = STR_LEN(c); /* length to match in octets/bytes */
1394 lnc = (I32) ln; /* length to match in characters */
f2ed9b32 1395 if (UTF_PATTERN) {
a2a2844f 1396 STRLEN ulen1, ulen2;
5dab1207 1397 U8 *sm = (U8 *) m;
89ebb4a3
JH
1398 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1399 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1400 /* used by commented-out code below */
1401 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1402
1403 /* XXX: Since the node will be case folded at compile
1404 time this logic is a little odd, although im not
1405 sure that its actually wrong. --dmq */
1406
1407 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1408 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1409
1410 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1411 codepoint of the first character in the converted
1412 form, yet originally we did the extra step.
1413 No tests fail by commenting this code out however
1414 so Ive left it out. -- dmq.
1415
89ebb4a3 1416 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1417 0, uniflags);
89ebb4a3 1418 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1419 0, uniflags);
a0a388a1
YO
1420 */
1421
5dab1207
NIS
1422 lnc = 0;
1423 while (sm < ((U8 *) m + ln)) {
1424 lnc++;
1425 sm += UTF8SKIP(sm);
1426 }
1aa99e6b
IH
1427 }
1428 else {
1429 c1 = *(U8*)m;
1430 c2 = PL_fold[c1];
1431 }
6eb5f6b9
JH
1432 goto do_exactf;
1433 case EXACTFL:
5dab1207
NIS
1434 m = STRING(c);
1435 ln = STR_LEN(c);
1436 lnc = (I32) ln;
d8093b23 1437 c1 = *(U8*)m;
6eb5f6b9
JH
1438 c2 = PL_fold_locale[c1];
1439 do_exactf:
db12adc6 1440 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1441
3b0527fe 1442 if (!reginfo && e < s)
6eb5f6b9 1443 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1444
60a8b682
JH
1445 /* The idea in the EXACTF* cases is to first find the
1446 * first character of the EXACTF* node and then, if
1447 * necessary, case-insensitively compare the full
1448 * text of the node. The c1 and c2 are the first
1449 * characters (though in Unicode it gets a bit
1450 * more complicated because there are more cases
7f16dd3d
JH
1451 * than just upper and lower: one needs to use
1452 * the so-called folding case for case-insensitive
1453 * matching (called "loose matching" in Unicode).
4c1b470c 1454 * foldEQ_utf8() will do just that. */
60a8b682 1455
f2ed9b32 1456 if (utf8_target || UTF_PATTERN) {
575cac57 1457 UV c, f;
89ebb4a3 1458 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1459 STRLEN len = 1;
1460 STRLEN foldlen;
4ad0818d 1461 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1462 if (c1 == c2) {
5dab1207
NIS
1463 /* Upper and lower of 1st char are equal -
1464 * probably not a "letter". */
1aa99e6b 1465 while (s <= e) {
f2ed9b32 1466 if (utf8_target) {
a0a388a1 1467 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1468 uniflags);
a0a388a1
YO
1469 } else {
1470 c = *((U8*)s);
1471 }
4cadc6a9 1472 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1473 }
09091399
JH
1474 }
1475 else {
1aa99e6b 1476 while (s <= e) {
f2ed9b32 1477 if (utf8_target) {
a0a388a1 1478 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1479 uniflags);
a0a388a1
YO
1480 } else {
1481 c = *((U8*)s);
1482 }
80aecb99 1483
60a8b682 1484 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1485 * Note that not all the possible combinations
1486 * are handled here: some of them are handled
1487 * by the standard folding rules, and some of
1488 * them (the character class or ANYOF cases)
1489 * are handled during compiletime in
1490 * regexec.c:S_regclass(). */
880bd946
JH
1491 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1492 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1493 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1494
4cadc6a9 1495 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1496 }
09091399 1497 }
1aa99e6b
IH
1498 }
1499 else {
a0a388a1 1500 /* Neither pattern nor string are UTF8 */
1aa99e6b 1501 if (c1 == c2)
4cadc6a9 1502 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1503 else
4cadc6a9 1504 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1505 }
1506 break;
bbce6d69 1507 case BOUNDL:
3280af22 1508 PL_reg_flags |= RF_tainted;
bbce6d69 1509 /* FALL THROUGH */
a0d0e21e 1510 case BOUND:
f2ed9b32 1511 if (utf8_target) {
12d33761 1512 if (s == PL_bostr)
ffc61ed2
JH
1513 tmp = '\n';
1514 else {
6136c704 1515 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1516 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1517 }
1518 tmp = ((OP(c) == BOUND ?
9041c2e3 1519 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1520 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1521 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1522 if (tmp == !(OP(c) == BOUND ?
f2ed9b32 1523 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1524 isALNUM_LC_utf8((U8*)s)))
1525 {
1526 tmp = !tmp;
4cadc6a9 1527 REXEC_FBC_TRYIT;
a687059c 1528 }
4cadc6a9 1529 );
a0d0e21e 1530 }
a12cf05f 1531 else { /* Not utf8 */
12d33761 1532 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
a12cf05f
KW
1533 tmp = cBOOL((OP(c) == BOUNDL)
1534 ? isALNUM_LC(tmp)
1535 : (isWORDCHAR_L1(tmp)
1536 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
4cadc6a9 1537 REXEC_FBC_SCAN(
ffc61ed2 1538 if (tmp ==
a12cf05f
KW
1539 !((OP(c) == BOUNDL)
1540 ? isALNUM_LC(*s)
1541 : (isWORDCHAR_L1((U8) *s)
1542 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1543 {
ffc61ed2 1544 tmp = !tmp;
4cadc6a9 1545 REXEC_FBC_TRYIT;
a0ed51b3 1546 }
4cadc6a9 1547 );
a0ed51b3 1548 }
24b23f37 1549 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1550 goto got_it;
1551 break;
bbce6d69 1552 case NBOUNDL:
3280af22 1553 PL_reg_flags |= RF_tainted;
bbce6d69 1554 /* FALL THROUGH */
a0d0e21e 1555 case NBOUND:
f2ed9b32 1556 if (utf8_target) {
12d33761 1557 if (s == PL_bostr)
ffc61ed2
JH
1558 tmp = '\n';
1559 else {
6136c704 1560 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1561 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1562 }
1563 tmp = ((OP(c) == NBOUND ?
9041c2e3 1564 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1565 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1566 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1567 if (tmp == !(OP(c) == NBOUND ?
f2ed9b32 1568 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
ffc61ed2
JH
1569 isALNUM_LC_utf8((U8*)s)))
1570 tmp = !tmp;
4cadc6a9
YO
1571 else REXEC_FBC_TRYIT;
1572 );
a0d0e21e 1573 }
667bb95a 1574 else {
12d33761 1575 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
a12cf05f
KW
1576 tmp = cBOOL((OP(c) == NBOUNDL)
1577 ? isALNUM_LC(tmp)
1578 : (isWORDCHAR_L1(tmp)
1579 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
4cadc6a9 1580 REXEC_FBC_SCAN(
a12cf05f
KW
1581 if (tmp == ! cBOOL(
1582 (OP(c) == NBOUNDL)
1583 ? isALNUM_LC(*s)
1584 : (isWORDCHAR_L1((U8) *s)
1585 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1586 {
ffc61ed2 1587 tmp = !tmp;
a12cf05f 1588 }
4cadc6a9
YO
1589 else REXEC_FBC_TRYIT;
1590 );
a0ed51b3 1591 }
24b23f37 1592 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1593 goto got_it;
1594 break;
a0d0e21e 1595 case ALNUM:
4cadc6a9 1596 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1597 LOAD_UTF8_CHARCLASS_PERL_WORD(),
f2ed9b32 1598 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
a12cf05f 1599 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
4cadc6a9 1600 );
bbce6d69 1601 case ALNUML:
4cadc6a9
YO
1602 REXEC_FBC_CSCAN_TAINT(
1603 isALNUM_LC_utf8((U8*)s),
1604 isALNUM_LC(*s)
1605 );
a0d0e21e 1606 case NALNUM:
4cadc6a9 1607 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1608 LOAD_UTF8_CHARCLASS_PERL_WORD(),
f2ed9b32 1609 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
a12cf05f 1610 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
4cadc6a9 1611 );
bbce6d69 1612 case NALNUML:
4cadc6a9
YO
1613 REXEC_FBC_CSCAN_TAINT(
1614 !isALNUM_LC_utf8((U8*)s),
1615 !isALNUM_LC(*s)
1616 );
a0d0e21e 1617 case SPACE:
4cadc6a9 1618 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1619 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1620 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
a12cf05f 1621 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
4cadc6a9 1622 );
bbce6d69 1623 case SPACEL:
4cadc6a9
YO
1624 REXEC_FBC_CSCAN_TAINT(
1625 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1626 isSPACE_LC(*s)
1627 );
a0d0e21e 1628 case NSPACE:
4cadc6a9 1629 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1630 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
f2ed9b32 1631 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
a12cf05f 1632 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
4cadc6a9 1633 );
bbce6d69 1634 case NSPACEL:
4cadc6a9
YO
1635 REXEC_FBC_CSCAN_TAINT(
1636 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1637 !isSPACE_LC(*s)
1638 );
a0d0e21e 1639 case DIGIT:
4cadc6a9 1640 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1641 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1642 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1643 isDIGIT(*s)
1644 );
b8c5462f 1645 case DIGITL:
4cadc6a9
YO
1646 REXEC_FBC_CSCAN_TAINT(
1647 isDIGIT_LC_utf8((U8*)s),
1648 isDIGIT_LC(*s)
1649 );
a0d0e21e 1650 case NDIGIT:
4cadc6a9 1651 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177 1652 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
f2ed9b32 1653 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1654 !isDIGIT(*s)
1655 );
b8c5462f 1656 case NDIGITL:
4cadc6a9
YO
1657 REXEC_FBC_CSCAN_TAINT(
1658 !isDIGIT_LC_utf8((U8*)s),
1659 !isDIGIT_LC(*s)
1660 );
e1d1eefb
YO
1661 case LNBREAK:
1662 REXEC_FBC_CSCAN(
1663 is_LNBREAK_utf8(s),
1664 is_LNBREAK_latin1(s)
1665 );
1666 case VERTWS:
1667 REXEC_FBC_CSCAN(
1668 is_VERTWS_utf8(s),
1669 is_VERTWS_latin1(s)
1670 );
1671 case NVERTWS:
1672 REXEC_FBC_CSCAN(
1673 !is_VERTWS_utf8(s),
1674 !is_VERTWS_latin1(s)
1675 );
1676 case HORIZWS:
1677 REXEC_FBC_CSCAN(
1678 is_HORIZWS_utf8(s),
1679 is_HORIZWS_latin1(s)
1680 );
1681 case NHORIZWS:
1682 REXEC_FBC_CSCAN(
1683 !is_HORIZWS_utf8(s),
1684 !is_HORIZWS_latin1(s)
1685 );
1de06328
YO
1686 case AHOCORASICKC:
1687 case AHOCORASICK:
07be1b83 1688 {
a0a388a1 1689 DECL_TRIE_TYPE(c);
07be1b83
YO
1690 /* what trie are we using right now */
1691 reg_ac_data *aho
f8fc2ecf 1692 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1693 reg_trie_data *trie
1694 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1695 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1696
1697 const char *last_start = strend - trie->minlen;
6148ee25 1698#ifdef DEBUGGING
07be1b83 1699 const char *real_start = s;
6148ee25 1700#endif
07be1b83 1701 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1702 SV *sv_points;
1703 U8 **points; /* map of where we were in the input string
786e8c11 1704 when reading a given char. For ASCII this
be8e71aa 1705 is unnecessary overhead as the relationship
38a44b82
NC
1706 is always 1:1, but for Unicode, especially
1707 case folded Unicode this is not true. */
f9e705e8 1708 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1709 U8 *bitmap=NULL;
1710
07be1b83
YO
1711
1712 GET_RE_DEBUG_FLAGS_DECL;
1713
be8e71aa
YO
1714 /* We can't just allocate points here. We need to wrap it in
1715 * an SV so it gets freed properly if there is a croak while
1716 * running the match */
1717 ENTER;
1718 SAVETMPS;
1719 sv_points=newSV(maxlen * sizeof(U8 *));
1720 SvCUR_set(sv_points,
1721 maxlen * sizeof(U8 *));
1722 SvPOK_on(sv_points);
1723 sv_2mortal(sv_points);
1724 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1725 if ( trie_type != trie_utf8_fold
1726 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1727 {
786e8c11
YO
1728 if (trie->bitmap)
1729 bitmap=(U8*)trie->bitmap;
1730 else
1731 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1732 }
786e8c11
YO
1733 /* this is the Aho-Corasick algorithm modified a touch
1734 to include special handling for long "unknown char"
1735 sequences. The basic idea being that we use AC as long
1736 as we are dealing with a possible matching char, when
1737 we encounter an unknown char (and we have not encountered
1738 an accepting state) we scan forward until we find a legal
1739 starting char.
1740 AC matching is basically that of trie matching, except
1741 that when we encounter a failing transition, we fall back
1742 to the current states "fail state", and try the current char
1743 again, a process we repeat until we reach the root state,
1744 state 1, or a legal transition. If we fail on the root state
1745 then we can either terminate if we have reached an accepting
1746 state previously, or restart the entire process from the beginning
1747 if we have not.
1748
1749 */
07be1b83
YO
1750 while (s <= last_start) {
1751 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1752 U8 *uc = (U8*)s;
1753 U16 charid = 0;
1754 U32 base = 1;
1755 U32 state = 1;
1756 UV uvc = 0;
1757 STRLEN len = 0;
1758 STRLEN foldlen = 0;
1759 U8 *uscan = (U8*)NULL;
1760 U8 *leftmost = NULL;
786e8c11
YO
1761#ifdef DEBUGGING
1762 U32 accepted_word= 0;
1763#endif
07be1b83
YO
1764 U32 pointpos = 0;
1765
1766 while ( state && uc <= (U8*)strend ) {
1767 int failed=0;
786e8c11
YO
1768 U32 word = aho->states[ state ].wordnum;
1769
1de06328
YO
1770 if( state==1 ) {
1771 if ( bitmap ) {
1772 DEBUG_TRIE_EXECUTE_r(
1773 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1774 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1775 (char *)uc, utf8_target );
1de06328
YO
1776 PerlIO_printf( Perl_debug_log,
1777 " Scanning for legal start char...\n");
1778 }
1779 );
1780 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1781 uc++;
786e8c11 1782 }
1de06328 1783 s= (char *)uc;
786e8c11 1784 }
786e8c11
YO
1785 if (uc >(U8*)last_start) break;
1786 }
1787
1788 if ( word ) {
2e64971a 1789 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1790 if (!leftmost || lpos < leftmost) {
1791 DEBUG_r(accepted_word=word);
07be1b83 1792 leftmost= lpos;
786e8c11 1793 }
07be1b83 1794 if (base==0) break;
786e8c11 1795
07be1b83
YO
1796 }
1797 points[pointpos++ % maxlen]= uc;
55eed653
NC
1798 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1799 uscan, len, uvc, charid, foldlen,
1800 foldbuf, uniflags);
786e8c11
YO
1801 DEBUG_TRIE_EXECUTE_r({
1802 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1803 s, utf8_target );
07be1b83 1804 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1805 " Charid:%3u CP:%4"UVxf" ",
1806 charid, uvc);
1807 });
07be1b83
YO
1808
1809 do {
6148ee25 1810#ifdef DEBUGGING
786e8c11 1811 word = aho->states[ state ].wordnum;
6148ee25 1812#endif
07be1b83
YO
1813 base = aho->states[ state ].trans.base;
1814
786e8c11
YO
1815 DEBUG_TRIE_EXECUTE_r({
1816 if (failed)
1817 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1818 s, utf8_target );
07be1b83 1819 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1820 "%sState: %4"UVxf", word=%"UVxf,
1821 failed ? " Fail transition to " : "",
1822 (UV)state, (UV)word);
1823 });
07be1b83
YO
1824 if ( base ) {
1825 U32 tmp;
6dd2be57 1826 I32 offset;
07be1b83 1827 if (charid &&
6dd2be57
DM
1828 ( ((offset = base + charid
1829 - 1 - trie->uniquecharcount)) >= 0)
1830 && ((U32)offset < trie->lasttrans)
1831 && trie->trans[offset].check == state
1832 && (tmp=trie->trans[offset].next))
07be1b83 1833 {
786e8c11
YO
1834 DEBUG_TRIE_EXECUTE_r(
1835 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1836 state = tmp;
1837 break;
1838 }
1839 else {
786e8c11
YO
1840 DEBUG_TRIE_EXECUTE_r(
1841 PerlIO_printf( Perl_debug_log," - fail\n"));
1842 failed = 1;
1843 state = aho->fail[state];
07be1b83
YO
1844 }
1845 }
1846 else {
1847 /* we must be accepting here */
786e8c11
YO
1848 DEBUG_TRIE_EXECUTE_r(
1849 PerlIO_printf( Perl_debug_log," - accepting\n"));
1850 failed = 1;
07be1b83
YO
1851 break;
1852 }
1853 } while(state);
786e8c11 1854 uc += len;
07be1b83
YO
1855 if (failed) {
1856 if (leftmost)
1857 break;
786e8c11 1858 if (!state) state = 1;
07be1b83
YO
1859 }
1860 }
1861 if ( aho->states[ state ].wordnum ) {
2e64971a 1862 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1863 if (!leftmost || lpos < leftmost) {
1864 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1865 leftmost = lpos;
786e8c11 1866 }
07be1b83 1867 }
07be1b83
YO
1868 if (leftmost) {
1869 s = (char*)leftmost;
786e8c11
YO
1870 DEBUG_TRIE_EXECUTE_r({
1871 PerlIO_printf(
70685ca0
JH
1872 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1873 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1874 );
1875 });
24b23f37 1876 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1877 FREETMPS;
1878 LEAVE;
07be1b83 1879 goto got_it;
be8e71aa 1880 }
07be1b83 1881 s = HOPc(s,1);
786e8c11
YO
1882 DEBUG_TRIE_EXECUTE_r({
1883 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1884 });
07be1b83 1885 } else {
786e8c11
YO
1886 DEBUG_TRIE_EXECUTE_r(
1887 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1888 break;
1889 }
1890 }
be8e71aa
YO
1891 FREETMPS;
1892 LEAVE;
07be1b83
YO
1893 }
1894 break;
b3c9acc1 1895 default:
3c3eec57
GS
1896 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1897 break;
d6a28714 1898 }
6eb5f6b9
JH
1899 return 0;
1900 got_it:
1901 return s;
1902}
1903
fae667d5 1904
6eb5f6b9
JH
1905/*
1906 - regexec_flags - match a regexp against a string
1907 */
1908I32
288b8c02 1909Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1910 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1911/* strend: pointer to null at end of string */
1912/* strbeg: real beginning of string */
1913/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1914/* data: May be used for some additional optimizations.
1915 Currently its only used, with a U32 cast, for transmitting
1916 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1917/* nosave: For optimizations. */
1918{
97aff369 1919 dVAR;
288b8c02 1920 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1921 /*register*/ char *s;
6eb5f6b9 1922 register regnode *c;
24b23f37 1923 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1924 I32 minlen; /* must match at least this many chars */
1925 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1926 I32 end_shift = 0; /* Same for the end. */ /* CC */
1927 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1928 char *scream_olds = NULL;
f2ed9b32 1929 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 1930 I32 multiline;
f8fc2ecf 1931 RXi_GET_DECL(prog,progi);
3b0527fe 1932 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1933 regexp_paren_pair *swap = NULL;
a3621e74
YO
1934 GET_RE_DEBUG_FLAGS_DECL;
1935
7918f24d 1936 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1937 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1938
1939 /* Be paranoid... */
1940 if (prog == NULL || startpos == NULL) {
1941 Perl_croak(aTHX_ "NULL regexp parameter");
1942 return 0;
1943 }
1944
bbe252da 1945 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1946 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1947
f2ed9b32 1948 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 1949 DEBUG_EXECUTE_r(
f2ed9b32 1950 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
1951 "Matching");
1952 );
bac06658 1953
6eb5f6b9 1954 minlen = prog->minlen;
1de06328
YO
1955
1956 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1958 "String too short [regexec_flags]...\n"));
1959 goto phooey;
1aa99e6b 1960 }
6eb5f6b9 1961
1de06328 1962
6eb5f6b9 1963 /* Check validity of program. */
f8fc2ecf 1964 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1965 Perl_croak(aTHX_ "corrupted regexp program");
1966 }
1967
1968 PL_reg_flags = 0;
1969 PL_reg_eval_set = 0;
1970 PL_reg_maxiter = 0;
1971
3c8556c3 1972 if (RX_UTF8(rx))
6eb5f6b9
JH
1973 PL_reg_flags |= RF_utf8;
1974
1975 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1976 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1977 PL_bostr = strbeg;
3b0527fe 1978 reginfo.sv = sv;
6eb5f6b9
JH
1979
1980 /* Mark end of line for $ (and such) */
1981 PL_regeol = strend;
1982
1983 /* see how far we have to get to not match where we matched before */
3b0527fe 1984 reginfo.till = startpos+minend;
6eb5f6b9 1985
6eb5f6b9
JH
1986 /* If there is a "must appear" string, look for it. */
1987 s = startpos;
1988
bbe252da 1989 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 1990 MAGIC *mg;
2c296965 1991 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 1992 reginfo.ganch = startpos + prog->gofs;
2c296965 1993 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1994 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 1995 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 1996 && SvMAGIC(sv)
14befaf4
DM
1997 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1998 && mg->mg_len >= 0) {
3b0527fe 1999 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2000 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2001 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2002
bbe252da 2003 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2004 if (s > reginfo.ganch)
6eb5f6b9 2005 goto phooey;
58e23c8d 2006 s = reginfo.ganch - prog->gofs;
2c296965 2007 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2008 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2009 if (s < strbeg)
2010 goto phooey;
6eb5f6b9
JH
2011 }
2012 }
58e23c8d 2013 else if (data) {
70685ca0 2014 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2015 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2016 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2017
2018 } else { /* pos() not defined */
3b0527fe 2019 reginfo.ganch = strbeg;
2c296965
YO
2020 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2021 "GPOS: reginfo.ganch = strbeg\n"));
2022 }
6eb5f6b9 2023 }
288b8c02 2024 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2025 /* We have to be careful. If the previous successful match
2026 was from this regex we don't want a subsequent partially
2027 successful match to clobber the old results.
2028 So when we detect this possibility we add a swap buffer
2029 to the re, and switch the buffer each match. If we fail
2030 we switch it back, otherwise we leave it swapped.
2031 */
2032 swap = prog->offs;
2033 /* do we need a save destructor here for eval dies? */
2034 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2035 }
a0714e2c 2036 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2037 re_scream_pos_data d;
2038
2039 d.scream_olds = &scream_olds;
2040 d.scream_pos = &scream_pos;
288b8c02 2041 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2042 if (!s) {
a3621e74 2043 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2044 goto phooey; /* not present */
3fa9c3d7 2045 }
6eb5f6b9
JH
2046 }
2047
1de06328 2048
6eb5f6b9
JH
2049
2050 /* Simplest case: anchored match need be tried only once. */
2051 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2052 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2053 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2054 goto got_it;
bbe252da
YO
2055 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2056 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2057 {
2058 char *end;
2059
2060 if (minlen)
2061 dontbother = minlen - 1;
1aa99e6b 2062 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2063 /* for multiline we only have to try after newlines */
33b8afdf 2064 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2065 /* because of the goto we can not easily reuse the macros for bifurcating the
2066 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2067 if (utf8_target) {
2068 if (s == startpos)
2069 goto after_try_utf8;
2070 while (1) {
2071 if (regtry(&reginfo, &s)) {
2072 goto got_it;
2073 }
2074 after_try_utf8:
2075 if (s > end) {
2076 goto phooey;
2077 }
2078 if (prog->extflags & RXf_USE_INTUIT) {
2079 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2080 if (!s) {
2081 goto phooey;
2082 }
2083 }
2084 else {
2085 s += UTF8SKIP(s);
2086 }
2087 }
2088 } /* end search for check string in unicode */
2089 else {
2090 if (s == startpos) {
2091 goto after_try_latin;
2092 }
2093 while (1) {
2094 if (regtry(&reginfo, &s)) {
2095 goto got_it;
2096 }
2097 after_try_latin:
2098 if (s > end) {
2099 goto phooey;
2100 }
2101 if (prog->extflags & RXf_USE_INTUIT) {
2102 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2103 if (!s) {
2104 goto phooey;
2105 }
2106 }
2107 else {
2108 s++;
2109 }
2110 }
2111 } /* end search for check string in latin*/
2112 } /* end search for check string */
2113 else { /* search for newline */
2114 if (s > startpos) {
2115 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2116 s--;
92f3d482
YO
2117 }
2118 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
6eb5f6b9
JH
2119 while (s < end) {
2120 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2121 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2122 goto got_it;
2123 }
92f3d482
YO
2124 }
2125 } /* end search for newline */
2126 } /* end anchored/multiline check string search */
6eb5f6b9 2127 goto phooey;
bbe252da 2128 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
2129 {
2130 /* the warning about reginfo.ganch being used without intialization
bbe252da 2131 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2132 and we only enter this block when the same bit is set. */
58e23c8d 2133 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2134
2135 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2136 goto got_it;
2137 goto phooey;
2138 }
2139
2140 /* Messy cases: unanchored match. */
bbe252da 2141 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2142 /* we have /x+whatever/ */
f2ed9b32 2143 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2144 char ch;
bf93d4cc
GS
2145#ifdef DEBUGGING
2146 int did_match = 0;
2147#endif
f2ed9b32
KW
2148 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2149 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2150 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2151
f2ed9b32 2152 if (utf8_target) {
4cadc6a9 2153 REXEC_FBC_SCAN(
6eb5f6b9 2154 if (*s == ch) {
a3621e74 2155 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2156 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2157 s += UTF8SKIP(s);
2158 while (s < strend && *s == ch)
2159 s += UTF8SKIP(s);
2160 }
4cadc6a9 2161 );
6eb5f6b9
JH
2162 }
2163 else {
4cadc6a9 2164 REXEC_FBC_SCAN(
6eb5f6b9 2165 if (*s == ch) {
a3621e74 2166 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2167 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2168 s++;
2169 while (s < strend && *s == ch)
2170 s++;
2171 }
4cadc6a9 2172 );
6eb5f6b9 2173 }
a3621e74 2174 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2175 PerlIO_printf(Perl_debug_log,
b7953727
JH
2176 "Did not find anchored character...\n")
2177 );
6eb5f6b9 2178 }
a0714e2c
SS
2179 else if (prog->anchored_substr != NULL
2180 || prog->anchored_utf8 != NULL
2181 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2182 && prog->float_max_offset < strend - s)) {
2183 SV *must;
2184 I32 back_max;
2185 I32 back_min;
2186 char *last;
6eb5f6b9 2187 char *last1; /* Last position checked before */
bf93d4cc
GS
2188#ifdef DEBUGGING
2189 int did_match = 0;
2190#endif
33b8afdf 2191 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2192 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2193 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2194 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2195 back_max = back_min = prog->anchored_offset;
2196 } else {
f2ed9b32
KW
2197 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2198 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2199 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2200 back_max = prog->float_max_offset;
2201 back_min = prog->float_min_offset;
2202 }
1de06328
YO
2203
2204
33b8afdf
JH
2205 if (must == &PL_sv_undef)
2206 /* could not downgrade utf8 check substring, so must fail */
2207 goto phooey;
2208
1de06328
YO
2209 if (back_min<0) {
2210 last = strend;
2211 } else {
2212 last = HOP3c(strend, /* Cannot start after this */
2213 -(I32)(CHR_SVLEN(must)
2214 - (SvTAIL(must) != 0) + back_min), strbeg);
2215 }
6eb5f6b9
JH
2216 if (s > PL_bostr)
2217 last1 = HOPc(s, -1);
2218 else
2219 last1 = s - 1; /* bogus */
2220
a0288114 2221 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2222 check_substr==must. */
2223 scream_pos = -1;
2224 dontbother = end_shift;
2225 strend = HOPc(strend, -dontbother);
2226 while ( (s <= last) &&
9041c2e3 2227 ((flags & REXEC_SCREAM)
1de06328 2228 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2229 end_shift, &scream_pos, 0))
1de06328 2230 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2231 (unsigned char*)strend, must,
7fba1cd6 2232 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2233 /* we may be pointing at the wrong string */
07bc277f 2234 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2235 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2236 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2237 if (HOPc(s, -back_max) > last1) {
2238 last1 = HOPc(s, -back_min);
2239 s = HOPc(s, -back_max);
2240 }
2241 else {
52657f30 2242 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2243
2244 last1 = HOPc(s, -back_min);
52657f30 2245 s = t;
6eb5f6b9 2246 }
f2ed9b32 2247 if (utf8_target) {
6eb5f6b9 2248 while (s <= last1) {
24b23f37 2249 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2250 goto got_it;
2251 s += UTF8SKIP(s);
2252 }
2253 }
2254 else {
2255 while (s <= last1) {
24b23f37 2256 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2257 goto got_it;
2258 s++;
2259 }
2260 }
2261 }
ab3bbdeb 2262 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2263 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2264 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2265 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2266 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2267 ? "anchored" : "floating"),
ab3bbdeb
YO
2268 quoted, RE_SV_TAIL(must));
2269 });
6eb5f6b9
JH
2270 goto phooey;
2271 }
f8fc2ecf 2272 else if ( (c = progi->regstclass) ) {
f14c76ed 2273 if (minlen) {
f8fc2ecf 2274 const OPCODE op = OP(progi->regstclass);
66e933ab 2275 /* don't bother with what can't match */
786e8c11 2276 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2277 strend = HOPc(strend, -(minlen - 1));
2278 }
a3621e74 2279 DEBUG_EXECUTE_r({
be8e71aa 2280 SV * const prop = sv_newmortal();
32fc9b6a 2281 regprop(prog, prop, c);
0df25f3d 2282 {
f2ed9b32 2283 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2284 s,strend-s,60);
0df25f3d 2285 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2286 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2287 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2288 quoted, (int)(strend - s));
0df25f3d 2289 }
ffc61ed2 2290 });
3b0527fe 2291 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2292 goto got_it;
07be1b83 2293 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2294 }
2295 else {
2296 dontbother = 0;
a0714e2c 2297 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2298 /* Trim the end. */
d6a28714 2299 char *last;
33b8afdf
JH
2300 SV* float_real;
2301
f2ed9b32
KW
2302 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2303 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2304 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2305
2306 if (flags & REXEC_SCREAM) {
33b8afdf 2307 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2308 end_shift, &scream_pos, 1); /* last one */
2309 if (!last)
ffc61ed2 2310 last = scream_olds; /* Only one occurrence. */
4addbd3b 2311 /* we may be pointing at the wrong string */
07bc277f 2312 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2313 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2314 }
d6a28714
JH
2315 else {
2316 STRLEN len;
cfd0369c 2317 const char * const little = SvPV_const(float_real, len);
d6a28714 2318
33b8afdf 2319 if (SvTAIL(float_real)) {
d6a28714
JH
2320 if (memEQ(strend - len + 1, little, len - 1))
2321 last = strend - len + 1;
7fba1cd6 2322 else if (!multiline)
9041c2e3 2323 last = memEQ(strend - len, little, len)
bd61b366 2324 ? strend - len : NULL;
b8c5462f 2325 else
d6a28714
JH
2326 goto find_last;
2327 } else {
2328 find_last:
9041c2e3 2329 if (len)
d6a28714 2330 last = rninstr(s, strend, little, little + len);
b8c5462f 2331 else
a0288114 2332 last = strend; /* matching "$" */
b8c5462f 2333 }
b8c5462f 2334 }
bf93d4cc 2335 if (last == NULL) {
6bda09f9
YO
2336 DEBUG_EXECUTE_r(
2337 PerlIO_printf(Perl_debug_log,
2338 "%sCan't trim the tail, match fails (should not happen)%s\n",
2339 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2340 goto phooey; /* Should not happen! */
2341 }
d6a28714
JH
2342 dontbother = strend - last + prog->float_min_offset;
2343 }
2344 if (minlen && (dontbother < minlen))
2345 dontbother = minlen - 1;
2346 strend -= dontbother; /* this one's always in bytes! */
2347 /* We don't know much -- general case. */
f2ed9b32 2348 if (utf8_target) {
d6a28714 2349 for (;;) {
24b23f37 2350 if (regtry(&reginfo, &s))
d6a28714
JH
2351 goto got_it;
2352 if (s >= strend)
2353 break;
b8c5462f 2354 s += UTF8SKIP(s);
d6a28714
JH
2355 };
2356 }
2357 else {
2358 do {
24b23f37 2359 if (regtry(&reginfo, &s))
d6a28714
JH
2360 goto got_it;
2361 } while (s++ < strend);
2362 }
2363 }
2364
2365 /* Failure. */
2366 goto phooey;
2367
2368got_it:
e9105d30 2369 Safefree(swap);
288b8c02 2370 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2371
19b95bf0 2372 if (PL_reg_eval_set)
4f639d21 2373 restore_pos(aTHX_ prog);
5daac39c
NC
2374 if (RXp_PAREN_NAMES(prog))
2375 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2376
2377 /* make sure $`, $&, $', and $digit will work later */
2378 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2379 RX_MATCH_COPY_FREE(rx);
d6a28714 2380 if (flags & REXEC_COPY_STR) {
be8e71aa 2381 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2382#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2383 if ((SvIsCOW(sv)
2384 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2385 if (DEBUG_C_TEST) {
2386 PerlIO_printf(Perl_debug_log,
2387 "Copy on write: regexp capture, type %d\n",
2388 (int) SvTYPE(sv));
2389 }
2390 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2391 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2392 assert (SvPOKp(prog->saved_copy));
2393 } else
2394#endif
2395 {
288b8c02 2396 RX_MATCH_COPIED_on(rx);
ed252734
NC
2397 s = savepvn(strbeg, i);
2398 prog->subbeg = s;
2399 }
d6a28714 2400 prog->sublen = i;
d6a28714
JH
2401 }
2402 else {
2403 prog->subbeg = strbeg;
2404 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2405 }
2406 }
9041c2e3 2407
d6a28714
JH
2408 return 1;
2409
2410phooey:
a3621e74 2411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2412 PL_colors[4], PL_colors[5]));
d6a28714 2413 if (PL_reg_eval_set)
4f639d21 2414 restore_pos(aTHX_ prog);
e9105d30 2415 if (swap) {
c74340f9 2416 /* we failed :-( roll it back */
e9105d30
GG
2417 Safefree(prog->offs);
2418 prog->offs = swap;
2419 }
2420
d6a28714
JH
2421 return 0;
2422}
2423
6bda09f9 2424
d6a28714
JH
2425/*
2426 - regtry - try match at specific point
2427 */
2428STATIC I32 /* 0 failure, 1 success */
24b23f37 2429S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2430{
97aff369 2431 dVAR;
d6a28714 2432 CHECKPOINT lastcp;
288b8c02
NC
2433 REGEXP *const rx = reginfo->prog;
2434 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2435 RXi_GET_DECL(prog,progi);
a3621e74 2436 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2437
2438 PERL_ARGS_ASSERT_REGTRY;
2439
24b23f37 2440 reginfo->cutpoint=NULL;
d6a28714 2441
bbe252da 2442 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2443 MAGIC *mg;
2444
2445 PL_reg_eval_set = RS_init;
a3621e74 2446 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2447 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2448 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2449 ));
ea8d6ae1 2450 SAVESTACK_CXPOS();
d6a28714
JH
2451 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2452 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2453 SAVETMPS;
2454 /* Apparently this is not needed, judging by wantarray. */
e8347627 2455 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2456 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2457
3b0527fe 2458 if (reginfo->sv) {
d6a28714 2459 /* Make $_ available to executed code. */
3b0527fe 2460 if (reginfo->sv != DEFSV) {
59f00321 2461 SAVE_DEFSV;
414bf5ae 2462 DEFSV_set(reginfo->sv);
b8c5462f 2463 }
d6a28714 2464
3b0527fe
DM
2465 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2466 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2467 /* prepare for quick setting of pos */
d300d9fa 2468#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2469 if (SvIsCOW(reginfo->sv))
2470 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2471#endif
3dab1dad 2472 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2473 &PL_vtbl_mglob, NULL, 0);
d6a28714 2474 mg->mg_len = -1;
b8c5462f 2475 }
d6a28714
JH
2476 PL_reg_magic = mg;
2477 PL_reg_oldpos = mg->mg_len;
4f639d21 2478 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2479 }
09687e5a 2480 if (!PL_reg_curpm) {
a02a5408 2481 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2482#ifdef USE_ITHREADS
2483 {
14a49a24 2484 SV* const repointer = &PL_sv_undef;
92313705
NC
2485 /* this regexp is also owned by the new PL_reg_curpm, which
2486 will try to free it. */
d2ece331 2487 av_push(PL_regex_padav, repointer);
09687e5a
AB
2488 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2489 PL_regex_pad = AvARRAY(PL_regex_padav);
2490 }
2491#endif
2492 }
86c29d75
NC
2493#ifdef USE_ITHREADS
2494 /* It seems that non-ithreads works both with and without this code.
2495 So for efficiency reasons it seems best not to have the code
2496 compiled when it is not needed. */
92313705
NC
2497 /* This is safe against NULLs: */
2498 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2499 /* PM_reg_curpm owns a reference to this regexp. */
2500 ReREFCNT_inc(rx);
86c29d75 2501#endif
288b8c02 2502 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2503 PL_reg_oldcurpm = PL_curpm;
2504 PL_curpm = PL_reg_curpm;
07bc277f 2505 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2506 /* Here is a serious problem: we cannot rewrite subbeg,
2507 since it may be needed if this match fails. Thus
2508 $` inside (?{}) could fail... */
2509 PL_reg_oldsaved = prog->subbeg;
2510 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2511#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2512 PL_nrs = prog->saved_copy;
2513#endif
07bc277f 2514 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2515 }
2516 else
bd61b366 2517 PL_reg_oldsaved = NULL;
d6a28714
JH
2518 prog->subbeg = PL_bostr;
2519 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2520 }
24b23f37 2521 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2522 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2523 PL_reginput = *startpos;
d6a28714 2524 PL_reglastparen = &prog->lastparen;
a01268b5 2525 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2526 prog->lastparen = 0;
03994de8 2527 prog->lastcloseparen = 0;
d6a28714 2528 PL_regsize = 0;
f0ab9afb 2529 PL_regoffs = prog->offs;
d6a28714
JH
2530 if (PL_reg_start_tmpl <= prog->nparens) {
2531 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2532 if(PL_reg_start_tmp)
2533 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2534 else
a02a5408 2535 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2536 }
2537
2538 /* XXXX What this code is doing here?!!! There should be no need
2539 to do this again and again, PL_reglastparen should take care of
3dd2943c 2540 this! --ilya*/
dafc8851
JH
2541
2542 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2543 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2544 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2545 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2546 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2547 * Meanwhile, this code *is* needed for the
daf18116
JH
2548 * above-mentioned test suite tests to succeed. The common theme
2549 * on those tests seems to be returning null fields from matches.
225593e1 2550 * --jhi updated by dapm */
dafc8851 2551#if 1
d6a28714 2552 if (prog->nparens) {
f0ab9afb 2553 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2554 register I32 i;
eb160463 2555 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2556 ++pp;
2557 pp->start = -1;
2558 pp->end = -1;
d6a28714
JH
2559 }
2560 }
dafc8851 2561#endif
02db2b7b 2562 REGCP_SET(lastcp);
f8fc2ecf 2563 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2564 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2565 return 1;
2566 }
24b23f37
YO
2567 if (reginfo->cutpoint)
2568 *startpos= reginfo->cutpoint;
02db2b7b 2569 REGCP_UNWIND(lastcp);
d6a28714
JH
2570 return 0;
2571}
2572
02db2b7b 2573
8ba1375e
MJD
2574#define sayYES goto yes
2575#define sayNO goto no
262b90c4 2576#define sayNO_SILENT goto no_silent
8ba1375e 2577
f9f4320a
YO
2578/* we dont use STMT_START/END here because it leads to
2579 "unreachable code" warnings, which are bogus, but distracting. */
2580#define CACHEsayNO \
c476f425
DM
2581 if (ST.cache_mask) \
2582 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2583 sayNO
3298f257 2584
a3621e74 2585/* this is used to determine how far from the left messages like
265c4333
YO
2586 'failed...' are printed. It should be set such that messages
2587 are inline with the regop output that created them.
a3621e74 2588*/
265c4333 2589#define REPORT_CODE_OFF 32
a3621e74
YO
2590
2591
40a82448
DM
2592#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2593#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2594
86545054
DM
2595#define SLAB_FIRST(s) (&(s)->states[0])
2596#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2597
5d9a96ca
DM
2598/* grab a new slab and return the first slot in it */
2599
2600STATIC regmatch_state *
2601S_push_slab(pTHX)
2602{
a35a87e7 2603#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2604 dMY_CXT;
2605#endif
5d9a96ca
DM
2606 regmatch_slab *s = PL_regmatch_slab->next;
2607 if (!s) {
2608 Newx(s, 1, regmatch_slab);
2609 s->prev = PL_regmatch_slab;
2610 s->next = NULL;
2611 PL_regmatch_slab->next = s;
2612 }
2613 PL_regmatch_slab = s;
86545054 2614 return SLAB_FIRST(s);
5d9a96ca 2615}
5b47454d 2616
95b24440 2617
40a82448
DM
2618/* push a new state then goto it */
2619
2620#define PUSH_STATE_GOTO(state, node) \
2621 scan = node; \
2622 st->resume_state = state; \
2623 goto push_state;
2624
2625/* push a new state with success backtracking, then goto it */
2626
2627#define PUSH_YES_STATE_GOTO(state, node) \
2628 scan = node; \
2629 st->resume_state = state; \
2630 goto push_yes_state;
2631
aa283a38 2632
aa283a38 2633
d6a28714 2634/*
95b24440 2635
bf1f174e
DM
2636regmatch() - main matching routine
2637
2638This is basically one big switch statement in a loop. We execute an op,
2639set 'next' to point the next op, and continue. If we come to a point which
2640we may need to backtrack to on failure such as (A|B|C), we push a
2641backtrack state onto the backtrack stack. On failure, we pop the top
2642state, and re-enter the loop at the state indicated. If there are no more
2643states to pop, we return failure.
2644
2645Sometimes we also need to backtrack on success; for example /A+/, where
2646after successfully matching one A, we need to go back and try to
2647match another one; similarly for lookahead assertions: if the assertion
2648completes successfully, we backtrack to the state just before the assertion
2649and then carry on. In these cases, the pushed state is marked as
2650'backtrack on success too'. This marking is in fact done by a chain of
2651pointers, each pointing to the previous 'yes' state. On success, we pop to
2652the nearest yes state, discarding any intermediate failure-only states.
2653Sometimes a yes state is pushed just to force some cleanup code to be
2654called at the end of a successful match or submatch; e.g. (??{$re}) uses
2655it to free the inner regex.
2656
2657Note that failure backtracking rewinds the cursor position, while
2658success backtracking leaves it alone.
2659
2660A pattern is complete when the END op is executed, while a subpattern
2661such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2662ops trigger the "pop to last yes state if any, otherwise return true"
2663behaviour.
2664
2665A common convention in this function is to use A and B to refer to the two
2666subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2667the subpattern to be matched possibly multiple times, while B is the entire
2668rest of the pattern. Variable and state names reflect this convention.
2669
2670The states in the main switch are the union of ops and failure/success of
2671substates associated with with that op. For example, IFMATCH is the op
2672that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2673'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2674successfully matched A and IFMATCH_A_fail is a state saying that we have
2675just failed to match A. Resume states always come in pairs. The backtrack
2676state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2677at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2678on success or failure.
2679
2680The struct that holds a backtracking state is actually a big union, with
2681one variant for each major type of op. The variable st points to the
2682top-most backtrack struct. To make the code clearer, within each
2683block of code we #define ST to alias the relevant union.
2684
2685Here's a concrete example of a (vastly oversimplified) IFMATCH
2686implementation:
2687
2688 switch (state) {
2689 ....
2690
2691#define ST st->u.ifmatch
2692
2693 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2694 ST.foo = ...; // some state we wish to save
95b24440 2695 ...
bf1f174e
DM
2696 // push a yes backtrack state with a resume value of
2697 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2698 // first node of A:
2699 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2700 // NOTREACHED
2701
2702 case IFMATCH_A: // we have successfully executed A; now continue with B
2703 next = B;
2704 bar = ST.foo; // do something with the preserved value
2705 break;
2706
2707 case IFMATCH_A_fail: // A failed, so the assertion failed
2708 ...; // do some housekeeping, then ...
2709 sayNO; // propagate the failure
2710
2711#undef ST
95b24440 2712
bf1f174e
DM
2713 ...
2714 }
95b24440 2715
bf1f174e
DM
2716For any old-timers reading this who are familiar with the old recursive
2717approach, the code above is equivalent to:
95b24440 2718
bf1f174e
DM
2719 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2720 {
2721 int foo = ...
95b24440 2722 ...
bf1f174e
DM
2723 if (regmatch(A)) {
2724 next = B;
2725 bar = foo;
2726 break;
95b24440 2727 }
bf1f174e
DM
2728 ...; // do some housekeeping, then ...
2729 sayNO; // propagate the failure
95b24440 2730 }
bf1f174e
DM
2731
2732The topmost backtrack state, pointed to by st, is usually free. If you
2733want to claim it, populate any ST.foo fields in it with values you wish to
2734save, then do one of
2735
2736 PUSH_STATE_GOTO(resume_state, node);
2737 PUSH_YES_STATE_GOTO(resume_state, node);
2738
2739which sets that backtrack state's resume value to 'resume_state', pushes a
2740new free entry to the top of the backtrack stack, then goes to 'node'.
2741On backtracking, the free slot is popped, and the saved state becomes the
2742new free state. An ST.foo field in this new top state can be temporarily
2743accessed to retrieve values, but once the main loop is re-entered, it
2744becomes available for reuse.
2745
2746Note that the depth of the backtrack stack constantly increases during the
2747left-to-right execution of the pattern, rather than going up and down with
2748the pattern nesting. For example the stack is at its maximum at Z at the
2749end of the pattern, rather than at X in the following:
2750
2751 /(((X)+)+)+....(Y)+....Z/
2752
2753The only exceptions to this are lookahead/behind assertions and the cut,
2754(?>A), which pop all the backtrack states associated with A before
2755continuing.
2756
2757Bascktrack state structs are allocated in slabs of about 4K in size.
2758PL_regmatch_state and st always point to the currently active state,
2759and PL_regmatch_slab points to the slab currently containing
2760PL_regmatch_state. The first time regmatch() is called, the first slab is
2761allocated, and is never freed until interpreter destruction. When the slab
2762is full, a new one is allocated and chained to the end. At exit from
2763regmatch(), slabs allocated since entry are freed.
2764
2765*/
95b24440 2766
40a82448 2767
5bc10b2c 2768#define DEBUG_STATE_pp(pp) \
265c4333 2769 DEBUG_STATE_r({ \
f2ed9b32 2770 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2771 PerlIO_printf(Perl_debug_log, \
5d458dd8 2772 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2773 depth*2, "", \
13d6edb4 2774 PL_reg_name[st->resume_state], \
5d458dd8
YO
2775 ((st==yes_state||st==mark_state) ? "[" : ""), \
2776 ((st==yes_state) ? "Y" : ""), \
2777 ((st==mark_state) ? "M" : ""), \
2778 ((st==yes_state||st==mark_state) ? "]" : "") \
2779 ); \
265c4333 2780 });
5bc10b2c 2781
40a82448 2782
3dab1dad 2783#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2784
3df15adc 2785#ifdef DEBUGGING
5bc10b2c 2786
ab3bbdeb 2787STATIC void
f2ed9b32 2788S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2789 const char *start, const char *end, const char *blurb)
2790{
efd26800 2791 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2792
2793 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2794
ab3bbdeb
YO
2795 if (!PL_colorset)
2796 reginitcolors();
2797 {
2798 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2799 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2800
f2ed9b32 2801 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2802 start, end - start, 60);
2803
2804 PerlIO_printf(Perl_debug_log,
2805 "%s%s REx%s %s against %s\n",
2806 PL_colors[4], blurb, PL_colors[5], s0, s1);
2807
f2ed9b32 2808 if (utf8_target||utf8_pat)
1de06328
YO
2809 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2810 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2811 utf8_pat && utf8_target ? " and " : "",
2812 utf8_target ? "string" : ""
ab3bbdeb
YO
2813 );
2814 }
2815}
3df15adc
YO
2816
2817STATIC void
786e8c11
YO
2818S_dump_exec_pos(pTHX_ const char *locinput,
2819 const regnode *scan,
2820 const char *loc_regeol,
2821 const char *loc_bostr,
2822 const char *loc_reg_starttry,
f2ed9b32 2823 const bool utf8_target)
07be1b83 2824{
786e8c11 2825 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2826 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2827 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2828 /* The part of the string before starttry has one color
2829 (pref0_len chars), between starttry and current
2830 position another one (pref_len - pref0_len chars),
2831 after the current position the third one.
2832 We assume that pref0_len <= pref_len, otherwise we
2833 decrease pref0_len. */
786e8c11
YO
2834 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2835 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2836 int pref0_len;
2837
7918f24d
NC
2838 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2839
f2ed9b32 2840 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2841 pref_len++;
786e8c11
YO
2842 pref0_len = pref_len - (locinput - loc_reg_starttry);
2843 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2844 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2845 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2846 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2847 l--;
2848 if (pref0_len < 0)
2849 pref0_len = 0;
2850 if (pref0_len > pref_len)
2851 pref0_len = pref_len;
2852 {
f2ed9b32 2853 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2854
ab3bbdeb 2855 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2856 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2857
ab3bbdeb 2858 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2859 (locinput - pref_len + pref0_len),
1de06328 2860 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2861
ab3bbdeb 2862 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2863 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2864
1de06328 2865 const STRLEN tlen=len0+len1+len2;
3df15adc 2866 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2867 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2868 (IV)(locinput - loc_bostr),
07be1b83 2869 len0, s0,
07be1b83 2870 len1, s1,
07be1b83 2871 (docolor ? "" : "> <"),
07be1b83 2872 len2, s2,
f9f4320a 2873 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2874 "");
2875 }
2876}
3df15adc 2877
07be1b83
YO
2878#endif
2879
0a4db386
YO
2880/* reg_check_named_buff_matched()
2881 * Checks to see if a named buffer has matched. The data array of
2882 * buffer numbers corresponding to the buffer is expected to reside
2883 * in the regexp->data->data array in the slot stored in the ARG() of
2884 * node involved. Note that this routine doesn't actually care about the
2885 * name, that information is not preserved from compilation to execution.
2886 * Returns the index of the leftmost defined buffer with the given name
2887 * or 0 if non of the buffers matched.
2888 */
2889STATIC I32
7918f24d
NC
2890S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2891{
0a4db386 2892 I32 n;
f8fc2ecf 2893 RXi_GET_DECL(rex,rexi);
ad64d0ec 2894 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2895 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2896
2897 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2898
0a4db386
YO
2899 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2900 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2901 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2902 {
2903 return nums[n];
2904 }
2905 }
2906 return 0;
2907}
2908
2f554ef7
DM
2909
2910/* free all slabs above current one - called during LEAVE_SCOPE */
2911
2912STATIC void
2913S_clear_backtrack_stack(pTHX_ void *p)
2914{
2915 regmatch_slab *s = PL_regmatch_slab->next;
2916 PERL_UNUSED_ARG(p);
2917
2918 if (!s)
2919 return;
2920 PL_regmatch_slab->next = NULL;
2921 while (s) {
2922 regmatch_slab * const osl = s;
2923 s = s->next;
2924 Safefree(osl);
2925 }
2926}
2927
2928
28d8d7f4
YO
2929#define SETREX(Re1,Re2) \
2930 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2931 Re1 = (Re2)
2932
d6a28714 2933STATIC I32 /* 0 failure, 1 success */
24b23f37 2934S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2935{
a35a87e7 2936#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2937 dMY_CXT;
2938#endif
27da23d5 2939 dVAR;
f2ed9b32 2940 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 2941 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2942 REGEXP *rex_sv = reginfo->prog;
2943 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2944 RXi_GET_DECL(rex,rexi);
2f554ef7 2945 I32 oldsave;
5d9a96ca
DM
2946 /* the current state. This is a cached copy of PL_regmatch_state */
2947 register regmatch_state *st;
5d9a96ca
DM
2948 /* cache heavy used fields of st in registers */
2949 register regnode *scan;
2950 register regnode *next;
438e9bae 2951 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2952 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2953 register char *locinput = PL_reginput;
5d9a96ca 2954 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2955
b69b0499 2956 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2957 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2958 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2959 const U32 max_nochange_depth =
2960 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2961 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2962 regmatch_state *yes_state = NULL; /* state to pop to on success of
2963 subpattern */
e2e6a0f1
YO
2964 /* mark_state piggy backs on the yes_state logic so that when we unwind
2965 the stack on success we can update the mark_state as we go */
2966 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2967 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2968 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2969 U32 state_num;
5d458dd8
YO
2970 bool no_final = 0; /* prevent failure from backtracking? */
2971 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2972 char *startpoint = PL_reginput;
5d458dd8
YO
2973 SV *popmark = NULL; /* are we looking for a mark? */
2974 SV *sv_commit = NULL; /* last mark name seen in failure */
2975 SV *sv_yes_mark = NULL; /* last mark name we have seen
2976 during a successfull match */
2977 U32 lastopen = 0; /* last open we saw */
2978 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2979 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2980 /* these three flags are set by various ops to signal information to
2981 * the very next op. They have a useful lifetime of exactly one loop
2982 * iteration, and are not preserved or restored by state pushes/pops
2983 */
2984 bool sw = 0; /* the condition value in (?(cond)a|b) */
2985 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2986 int logical = 0; /* the following EVAL is:
2987 0: (?{...})
2988 1: (?(?{...})X|Y)
2989 2: (??{...})
2990 or the following IFMATCH/UNLESSM is:
2991 false: plain (?=foo)
2992 true: used as a condition: (?(?=foo))
2993 */
95b24440 2994#ifdef DEBUGGING
e68ec53f 2995 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2996#endif
2997
7918f24d
NC
2998 PERL_ARGS_ASSERT_REGMATCH;
2999
3b57cd43 3000 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3001 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3002 }));
5d9a96ca
DM
3003 /* on first ever call to regmatch, allocate first slab */
3004 if (!PL_regmatch_slab) {
3005 Newx(PL_regmatch_slab, 1, regmatch_slab);
3006 PL_regmatch_slab->prev = NULL;
3007 PL_regmatch_slab->next = NULL;
86545054 3008 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3009 }
3010
2f554ef7
DM
3011 oldsave = PL_savestack_ix;
3012 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3013 SAVEVPTR(PL_regmatch_slab);
3014 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3015
3016 /* grab next free state slot */
3017 st = ++PL_regmatch_state;
86545054 3018 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3019 st = PL_regmatch_state = S_push_slab(aTHX);
3020
d6a28714
JH
3021 /* Note that nextchr is a byte even in UTF */
3022 nextchr = UCHARAT(locinput);
3023 scan = prog;
3024 while (scan != NULL) {
8ba1375e 3025
a3621e74 3026 DEBUG_EXECUTE_r( {
6136c704 3027 SV * const prop = sv_newmortal();
1de06328 3028 regnode *rnext=regnext(scan);
f2ed9b32 3029 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3030 regprop(rex, prop, scan);
07be1b83
YO
3031
3032 PerlIO_printf(Perl_debug_log,
3033 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3034 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3035 SvPVX_const(prop),
1de06328 3036 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3037 0 : (IV)(rnext - rexi->program));
2a782b5b 3038 });
d6a28714
JH
3039
3040 next = scan + NEXT_OFF(scan);
3041 if (next == scan)
3042 next = NULL;
40a82448 3043 state_num = OP(scan);
d6a28714 3044
40a82448 3045 reenter_switch:
34a81e2b
B
3046
3047 assert(PL_reglastparen == &rex->lastparen);
3048 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3049 assert(PL_regoffs == rex->offs);
3050
40a82448 3051 switch (state_num) {
d6a28714 3052 case BOL:
7fba1cd6 3053 if (locinput == PL_bostr)
d6a28714 3054 {
3b0527fe 3055 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3056 break;
3057 }
d6a28714
JH
3058 sayNO;
3059 case MBOL:
12d33761
HS
3060 if (locinput == PL_bostr ||
3061 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3062 {
b8c5462f
JH
3063 break;
3064 }
d6a28714
JH
3065 sayNO;
3066 case SBOL:
c2a73568 3067 if (locinput == PL_bostr)
b8c5462f 3068 break;
d6a28714
JH
3069 sayNO;
3070 case GPOS:
3b0527fe 3071 if (locinput == reginfo->ganch)
d6a28714
JH
3072 break;
3073 sayNO;
ee9b8eae
YO
3074
3075 case KEEPS:
3076 /* update the startpoint */
f0ab9afb 3077 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3078 PL_reginput = locinput;
f0ab9afb 3079 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3080 PUSH_STATE_GOTO(KEEPS_next, next);
3081 /*NOT-REACHED*/
3082 case KEEPS_next_fail:
3083 /* rollback the start point change */
f0ab9afb 3084 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3085 sayNO_SILENT;
3086 /*NOT-REACHED*/
d6a28714 3087 case EOL:
d6a28714
JH
3088 goto seol;
3089 case MEOL:
d6a28714 3090 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3091 sayNO;
b8c5462f 3092 break;
d6a28714
JH
3093 case SEOL:
3094 seol:
3095 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3096 sayNO;
d6a28714 3097 if (PL_regeol - locinput > 1)
b8c5462f 3098 sayNO;
b8c5462f 3099 break;
d6a28714
JH
3100 case EOS:
3101 if (PL_regeol != locinput)
b8c5462f 3102 sayNO;
d6a28714 3103 break;
ffc61ed2 3104 case SANY:
d6a28714 3105 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3106 sayNO;
f2ed9b32 3107 if (utf8_target) {
f33976b4
DB
3108 locinput += PL_utf8skip[nextchr];
3109 if (locinput > PL_regeol)
3110 sayNO;
3111 nextchr = UCHARAT(locinput);
3112 }
3113 else
3114 nextchr = UCHARAT(++locinput);
3115 break;
3116 case CANY:
3117 if (!nextchr && locinput >= PL_regeol)
3118 sayNO;
b8c5462f 3119 nextchr = UCHARAT(++locinput);
a0d0e21e 3120 break;
ffc61ed2 3121 case REG_ANY:
1aa99e6b
IH
3122 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3123 sayNO;
f2ed9b32 3124 if (utf8_target) {
b8c5462f 3125 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3126 if (locinput > PL_regeol)
3127 sayNO;
a0ed51b3 3128 nextchr = UCHARAT(locinput);
a0ed51b3 3129 }
1aa99e6b
IH
3130 else
3131 nextchr = UCHARAT(++locinput);
a0ed51b3 3132 break;
166ba7cd
DM
3133
3134#undef ST
3135#define ST st->u.trie
786e8c11
YO
3136 case TRIEC:
3137 /* In this case the charclass data is available inline so
3138 we can fail fast without a lot of extra overhead.
3139 */
f2ed9b32 3140 if (scan->flags == EXACT || !utf8_target) {
786e8c11
YO
3141 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3142 DEBUG_EXECUTE_r(
3143 PerlIO_printf(Perl_debug_log,
3144 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3145 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3146 );
3147 sayNO_SILENT;
3148 /* NOTREACHED */
3149 }
3150 }
3151 /* FALL THROUGH */
5b47454d 3152 case TRIE:
2e64971a
DM
3153 /* the basic plan of execution of the trie is:
3154 * At the beginning, run though all the states, and
3155 * find the longest-matching word. Also remember the position
3156 * of the shortest matching word. For example, this pattern:
3157 * 1 2 3 4 5
3158 * ab|a|x|abcd|abc
3159 * when matched against the string "abcde", will generate
3160 * accept states for all words except 3, with the longest
3161 * matching word being 4, and the shortest being 1 (with
3162 * the position being after char 1 of the string).
3163 *
3164 * Then for each matching word, in word order (i.e. 1,2,4,5),
3165 * we run the remainder of the pattern; on each try setting
3166 * the current position to the character following the word,
3167 * returning to try the next word on failure.
3168 *
3169 * We avoid having to build a list of words at runtime by
3170 * using a compile-time structure, wordinfo[].prev, which
3171 * gives, for each word, the previous accepting word (if any).
3172 * In the case above it would contain the mappings 1->2, 2->0,
3173 * 3->0, 4->5, 5->1. We can use this table to generate, from
3174 * the longest word (4 above), a list of all words, by
3175 * following the list of prev pointers; this gives us the
3176 * unordered list 4,5,1,2. Then given the current word we have
3177 * just tried, we can go through the list and find the
3178 * next-biggest word to try (so if we just failed on word 2,
3179 * the next in the list is 4).
3180 *
3181 * Since at runtime we don't record the matching position in
3182 * the string for each word, we have to work that out for
3183 * each word we're about to process. The wordinfo table holds
3184 * the character length of each word; given that we recorded
3185 * at the start: the position of the shortest word and its
3186 * length in chars, we just need to move the pointer the
3187 * difference between the two char lengths. Depending on
3188 * Unicode status and folding, that's cheap or expensive.
3189 *
3190 * This algorithm is optimised for the case where are only a
3191 * small number of accept states, i.e. 0,1, or maybe 2.
3192 * With lots of accepts states, and having to try all of them,
3193 * it becomes quadratic on number of accept states to find all
3194 * the next words.
3195 */
3196
3dab1dad 3197 {
07be1b83 3198 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3199 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3200
3201 /* what trie are we using right now */
be8e71aa 3202 reg_trie_data * const trie
f8fc2ecf 3203 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3204 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3205 U32 state = trie->startstate;
166ba7cd 3206
3dab1dad
YO
3207 if (trie->bitmap && trie_type != trie_utf8_fold &&
3208 !TRIE_BITMAP_TEST(trie,*locinput)
3209 ) {
3210 if (trie->states[ state ].wordnum) {
3211 DEBUG_EXECUTE_r(
3212 PerlIO_printf(Perl_debug_log,
3213 "%*s %smatched empty string...%s\n",
5bc10b2c 3214 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3215 );
3216 break;
3217 } else {
3218 DEBUG_EXECUTE_r(
3219 PerlIO_printf(Perl_debug_log,
786e8c11 3220 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3221 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3222 );
3223 sayNO_SILENT;
3224 }
3225 }
166ba7cd 3226
786e8c11
YO
3227 {
3228 U8 *uc = ( U8* )locinput;
3229
3230 STRLEN len = 0;
3231 STRLEN foldlen = 0;
3232 U8 *uscan = (U8*)NULL;
786e8c11 3233 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3234 U32 charcount = 0; /* how many input chars we have matched */
3235 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3236
786e8c11
YO
3237 ST.B = next;
3238 ST.jump = trie->jump;
786e8c11 3239 ST.me = scan;
2e64971a
DM
3240 ST.firstpos = NULL;
3241 ST.longfold = FALSE; /* char longer if folded => it's harder */
3242 ST.nextword = 0;
3243
3244 /* fully traverse the TRIE; note the position of the
3245 shortest accept state and the wordnum of the longest
3246 accept state */
07be1b83 3247
a3621e74 3248 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3249 U32 base = trie->states[ state ].trans.base;
f9f4320a 3250 UV uvc = 0;
acb909b4 3251 U16 charid = 0;
2e64971a
DM
3252 U16 wordnum;
3253 wordnum = trie->states[ state ].wordnum;
3254
3255 if (wordnum) { /* it's an accept state */
3256 if (!accepted) {
3257 accepted = 1;
3258 /* record first match position */
3259 if (ST.longfold) {
3260 ST.firstpos = (U8*)locinput;
3261 ST.firstchars = 0;
5b47454d 3262 }
2e64971a
DM
3263 else {
3264 ST.firstpos = uc;
3265 ST.firstchars = charcount;
3266 }
3267 }
3268 if (!ST.nextword || wordnum < ST.nextword)
3269 ST.nextword = wordnum;
3270 ST.topword = wordnum;
786e8c11 3271 }
a3621e74 3272
07be1b83 3273 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3274 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3275 PerlIO_printf( Perl_debug_log,
2e64971a 3276 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3277 2+depth * 2, "", PL_colors[4],
2e64971a 3278 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3279 });
a3621e74 3280
2e64971a 3281 /* read a char and goto next state */
a3621e74 3282 if ( base ) {
6dd2be57 3283 I32 offset;
55eed653
NC
3284 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3285 uscan, len, uvc, charid, foldlen,
3286 foldbuf, uniflags);
2e64971a
DM
3287 charcount++;
3288 if (foldlen>0)
3289 ST.longfold = TRUE;
5b47454d 3290 if (charid &&
6dd2be57
DM
3291 ( ((offset =
3292 base + charid - 1 - trie->uniquecharcount)) >= 0)
3293
3294 && ((U32)offset < trie->lasttrans)
3295 && trie->trans[offset].check == state)
5b47454d 3296 {
6dd2be57 3297 state = trie->trans[offset].next;
5b47454d
DM
3298 }
3299 else {
3300 state = 0;
3301 }
3302 uc += len;
3303
3304 }
3305 else {
a3621e74
YO
3306 state = 0;
3307 }
3308 DEBUG_TRIE_EXECUTE_r(
e4584336 3309 PerlIO_printf( Perl_debug_log,
786e8c11 3310 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3311 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3312 );
3313 }
2e64971a 3314 if (!accepted)
a3621e74 3315 sayNO;
a3621e74 3316
2e64971a
DM
3317 /* calculate total number of accept states */
3318 {
3319 U16 w = ST.topword;
3320 accepted = 0;
3321 while (w) {
3322 w = trie->wordinfo[w].prev;
3323 accepted++;
3324 }
3325 ST.accepted = accepted;
3326 }
3327
166ba7cd
DM
3328 DEBUG_EXECUTE_r(
3329 PerlIO_printf( Perl_debug_log,
3330 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3331 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3332 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3333 );
2e64971a 3334 goto trie_first_try; /* jump into the fail handler */
786e8c11 3335 }}
fae667d5 3336 /* NOTREACHED */
2e64971a
DM
3337
3338 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3339 if ( ST.jump) {
3340 REGCP_UNWIND(ST.cp);
3341 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3342 PL_regoffs[n].end = -1;
fae667d5
YO
3343 *PL_reglastparen = n;
3344 }
2e64971a
DM
3345 if (!--ST.accepted) {
3346 DEBUG_EXECUTE_r({
3347 PerlIO_printf( Perl_debug_log,
3348 "%*s %sTRIE failed...%s\n",
3349 REPORT_CODE_OFF+depth*2, "",
3350 PL_colors[4],
3351 PL_colors[5] );
3352 });
3353 sayNO_SILENT;
3354 }
3355 {
3356 /* Find next-highest word to process. Note that this code
3357 * is O(N^2) per trie run (O(N) per branch), so keep tight */
d9a396a3
DM
3358 register U16 min = 0;
3359 register U16 word;
2e64971a
DM
3360 register U16 const nextword = ST.nextword;
3361 register reg_trie_wordinfo * const wordinfo
3362 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3363 for (word=ST.topword; word; word=wordinfo[word].prev) {
3364 if (word > nextword && (!min || word < min))
3365 min = word;
3366 }
3367 ST.nextword = min;
3368 }
3369
fae667d5 3370 trie_first_try:
5d458dd8
YO