This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
use cBOOL for bool casts
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
eb160463 88#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
33b8afdf 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
20d0b1e9 120/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
37e2e78e
KW
123
124/* Doesn't do an assert to verify that is correct */
125#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
127
1a4fad37
AL
128#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
129#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
130#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 131
37e2e78e 132#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
133 LOAD_UTF8_CHARCLASS(X_begin, " "); \
134 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
135 /* These are utf8 constants, and not utf-ebcdic constants, so the \
136 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 137 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
138 \
139 /* No asserts are done for these, in case called on an early \
140 * Unicode version in which they map to nothing */ \
d275fa5e
CB
141 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 148
d1eb3177
YO
149/*
150 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
151 so that it is possible to override the option here without having to
152 rebuild the entire core. as we are required to do if we change regcomp.h
153 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
154*/
155#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
156#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
157#endif
158
159#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
160#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
161#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
162#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
163#define RE_utf8_perl_word PL_utf8_alnum
164#define RE_utf8_perl_space PL_utf8_space
165#define RE_utf8_posix_digit PL_utf8_digit
166#define perl_word alnum
167#define perl_space space
168#define posix_digit digit
169#else
170#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
171#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
172#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
173#define RE_utf8_perl_word PL_utf8_perl_word
174#define RE_utf8_perl_space PL_utf8_perl_space
175#define RE_utf8_posix_digit PL_utf8_posix_digit
176#endif
177
178
20d0b1e9
YO
179#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
180 case NAMEL: \
181 PL_reg_flags |= RF_tainted; \
182 /* FALL THROUGH */ \
183 case NAME: \
184 if (!nextchr) \
185 sayNO; \
186 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
187 if (!CAT2(PL_utf8_,CLASS)) { \
188 bool ok; \
189 ENTER; \
190 save_re_context(); \
191 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
192 assert(ok); \
193 LEAVE; \
194 } \
195 if (!(OP(scan) == NAME \
f2338a2e 196 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \
20d0b1e9
YO
197 : LCFUNC_utf8((U8*)locinput))) \
198 { \
199 sayNO; \
200 } \
201 locinput += PL_utf8skip[nextchr]; \
202 nextchr = UCHARAT(locinput); \
203 break; \
204 } \
205 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
206 sayNO; \
207 nextchr = UCHARAT(++locinput); \
208 break
209
210#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
211 case NAMEL: \
212 PL_reg_flags |= RF_tainted; \
213 /* FALL THROUGH */ \
214 case NAME : \
215 if (!nextchr && locinput >= PL_regeol) \
216 sayNO; \
217 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
218 if (!CAT2(PL_utf8_,CLASS)) { \
219 bool ok; \
220 ENTER; \
221 save_re_context(); \
222 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
223 assert(ok); \
224 LEAVE; \
225 } \
226 if ((OP(scan) == NAME \
f2338a2e 227 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \
20d0b1e9
YO
228 : LCFUNC_utf8((U8*)locinput))) \
229 { \
230 sayNO; \
231 } \
232 locinput += PL_utf8skip[nextchr]; \
233 nextchr = UCHARAT(locinput); \
234 break; \
235 } \
236 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
237 sayNO; \
238 nextchr = UCHARAT(++locinput); \
239 break
240
241
d1eb3177
YO
242
243
244
3dab1dad
YO
245/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
246
5f80c4cf 247/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
248/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
249#define JUMPABLE(rn) ( \
250 OP(rn) == OPEN || \
251 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
252 OP(rn) == EVAL || \
cca55fe3
JP
253 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
254 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 255 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 256 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 257)
ee9b8eae 258#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 259
ee9b8eae
YO
260#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
261
262#if 0
263/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
264 we don't need this definition. */
265#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
266#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
267#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
268
269#else
270/* ... so we use this as its faster. */
271#define IS_TEXT(rn) ( OP(rn)==EXACT )
272#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
273#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
274
275#endif
e2d8ce26 276
a84d97b6
HS
277/*
278 Search for mandatory following text node; for lookahead, the text must
279 follow but for lookbehind (rn->flags != 0) we skip to the next step.
280*/
cca55fe3 281#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
282 while (JUMPABLE(rn)) { \
283 const OPCODE type = OP(rn); \
284 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 285 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 286 else if (type == PLUS) \
cca55fe3 287 rn = NEXTOPER(rn); \
3dab1dad 288 else if (type == IFMATCH) \
a84d97b6 289 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 290 else rn += NEXT_OFF(rn); \
3dab1dad 291 } \
5f80c4cf 292} STMT_END
74750237 293
c476f425 294
acfe0abc 295static void restore_pos(pTHX_ void *arg);
51371543 296
76e3520e 297STATIC CHECKPOINT
cea2e8a9 298S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 299{
97aff369 300 dVAR;
a3b680e6 301 const int retval = PL_savestack_ix;
b1ce53c5 302#define REGCP_PAREN_ELEMS 4
a3b680e6 303 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 304 int p;
40a82448 305 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 306
e49a9654
IH
307 if (paren_elems_to_push < 0)
308 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
309
f0ab9afb 310#define REGCP_OTHER_ELEMS 7
4b3c1a47 311 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 312
3280af22 313 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 314/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
315 SSPUSHINT(PL_regoffs[p].end);
316 SSPUSHINT(PL_regoffs[p].start);
3280af22 317 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 318 SSPUSHINT(p);
e7707071 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 320 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 321 (UV)p, (IV)PL_regoffs[p].start,
40a82448 322 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 323 (IV)PL_regoffs[p].end
40a82448 324 ));
a0d0e21e 325 }
b1ce53c5 326/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 327 SSPUSHPTR(PL_regoffs);
3280af22
NIS
328 SSPUSHINT(PL_regsize);
329 SSPUSHINT(*PL_reglastparen);
a01268b5 330 SSPUSHINT(*PL_reglastcloseparen);
3280af22 331 SSPUSHPTR(PL_reginput);
41123dfd
JH
332#define REGCP_FRAME_ELEMS 2
333/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
334 * are needed for the regexp context stack bookkeeping. */
335 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 336 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 337
a0d0e21e
LW
338 return retval;
339}
340
c277df42 341/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
342#define REGCP_SET(cp) \
343 DEBUG_STATE_r( \
ab3bbdeb 344 PerlIO_printf(Perl_debug_log, \
e4f74956 345 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
346 (IV)PL_savestack_ix)); \
347 cp = PL_savestack_ix
c3464db5 348
ab3bbdeb 349#define REGCP_UNWIND(cp) \
e4f74956 350 DEBUG_STATE_r( \
ab3bbdeb 351 if (cp != PL_savestack_ix) \
e4f74956
YO
352 PerlIO_printf(Perl_debug_log, \
353 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
354 (IV)(cp), (IV)PL_savestack_ix)); \
355 regcpblow(cp)
c277df42 356
76e3520e 357STATIC char *
097eb12c 358S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 359{
97aff369 360 dVAR;
3b6647e0 361 U32 i;
a0d0e21e 362 char *input;
a3621e74
YO
363 GET_RE_DEBUG_FLAGS_DECL;
364
7918f24d
NC
365 PERL_ARGS_ASSERT_REGCPPOP;
366
b1ce53c5 367 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 368 i = SSPOPINT;
b1ce53c5
JH
369 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
370 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 371 input = (char *) SSPOPPTR;
a01268b5 372 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
373 *PL_reglastparen = SSPOPINT;
374 PL_regsize = SSPOPINT;
f0ab9afb 375 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 376
6bda09f9 377
b1ce53c5 378 /* Now restore the parentheses context. */
41123dfd
JH
379 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
380 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 381 I32 tmps;
097eb12c 382 U32 paren = (U32)SSPOPINT;
3280af22 383 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 384 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 385 tmps = SSPOPINT;
3280af22 386 if (paren <= *PL_reglastparen)
f0ab9afb 387 PL_regoffs[paren].end = tmps;
e7707071 388 DEBUG_BUFFERS_r(
c3464db5 389 PerlIO_printf(Perl_debug_log,
b900a521 390 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 391 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 392 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 393 (IV)PL_regoffs[paren].end,
3280af22 394 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 395 );
a0d0e21e 396 }
e7707071 397 DEBUG_BUFFERS_r(
bb7a0f54 398 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 399 PerlIO_printf(Perl_debug_log,
faccc32b 400 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 401 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
402 }
403 );
daf18116 404#if 1
dafc8851
JH
405 /* It would seem that the similar code in regtry()
406 * already takes care of this, and in fact it is in
407 * a better location to since this code can #if 0-ed out
408 * but the code in regtry() is needed or otherwise tests
409 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
410 * (as of patchlevel 7877) will fail. Then again,
411 * this code seems to be necessary or otherwise
225593e1
DM
412 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
413 * --jhi updated by dapm */
3b6647e0 414 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 415 if (i > PL_regsize)
f0ab9afb
NC
416 PL_regoffs[i].start = -1;
417 PL_regoffs[i].end = -1;
a0d0e21e 418 }
dafc8851 419#endif
a0d0e21e
LW
420 return input;
421}
422
02db2b7b 423#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 424
a687059c 425/*
e50aee73 426 * pregexec and friends
a687059c
LW
427 */
428
76234dfb 429#ifndef PERL_IN_XSUB_RE
a687059c 430/*
c277df42 431 - pregexec - match a regexp against a string
a687059c 432 */
c277df42 433I32
49d7dfbc 434Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 435 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
436/* strend: pointer to null at end of string */
437/* strbeg: real beginning of string */
438/* minend: end of match must be >=minend after stringarg. */
439/* nosave: For optimizations. */
440{
7918f24d
NC
441 PERL_ARGS_ASSERT_PREGEXEC;
442
c277df42 443 return
9041c2e3 444 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
445 nosave ? 0 : REXEC_COPY_STR);
446}
76234dfb 447#endif
22e551b9 448
9041c2e3 449/*
cad2e5aa
JH
450 * Need to implement the following flags for reg_anch:
451 *
452 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
453 * USE_INTUIT_ML
454 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
455 * INTUIT_AUTORITATIVE_ML
456 * INTUIT_ONCE_NOML - Intuit can match in one location only.
457 * INTUIT_ONCE_ML
458 *
459 * Another flag for this function: SECOND_TIME (so that float substrs
460 * with giant delta may be not rechecked).
461 */
462
463/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
464
3f7c398e 465/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
466 Otherwise, only SvCUR(sv) is used to get strbeg. */
467
468/* XXXX We assume that strpos is strbeg unless sv. */
469
6eb5f6b9
JH
470/* XXXX Some places assume that there is a fixed substring.
471 An update may be needed if optimizer marks as "INTUITable"
472 RExen without fixed substrings. Similarly, it is assumed that
473 lengths of all the strings are no more than minlen, thus they
474 cannot come from lookahead.
40d049e4
YO
475 (Or minlen should take into account lookahead.)
476 NOTE: Some of this comment is not correct. minlen does now take account
477 of lookahead/behind. Further research is required. -- demerphq
478
479*/
6eb5f6b9 480
2c2d71f5
JH
481/* A failure to find a constant substring means that there is no need to make
482 an expensive call to REx engine, thus we celebrate a failure. Similarly,
483 finding a substring too deep into the string means that less calls to
30944b6d
IZ
484 regtry() should be needed.
485
486 REx compiler's optimizer found 4 possible hints:
487 a) Anchored substring;
488 b) Fixed substring;
489 c) Whether we are anchored (beginning-of-line or \G);
490 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 491 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
492 string which does not contradict any of them.
493 */
2c2d71f5 494
6eb5f6b9
JH
495/* Most of decisions we do here should have been done at compile time.
496 The nodes of the REx which we used for the search should have been
497 deleted from the finite automaton. */
498
cad2e5aa 499char *
288b8c02 500Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 501 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 502{
97aff369 503 dVAR;
288b8c02 504 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 505 register I32 start_shift = 0;
cad2e5aa 506 /* Should be nonnegative! */
b7953727 507 register I32 end_shift = 0;
2c2d71f5
JH
508 register char *s;
509 register SV *check;
a1933d95 510 char *strbeg;
cad2e5aa 511 char *t;
1de06328 512 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 513 I32 ml_anch;
bd61b366
SS
514 register char *other_last = NULL; /* other substr checked before this */
515 char *check_at = NULL; /* check substr found at this pos */
bbe252da 516 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 517 RXi_GET_DECL(prog,progi);
30944b6d 518#ifdef DEBUGGING
890ce7af 519 const char * const i_strpos = strpos;
30944b6d 520#endif
a3621e74
YO
521 GET_RE_DEBUG_FLAGS_DECL;
522
7918f24d
NC
523 PERL_ARGS_ASSERT_RE_INTUIT_START;
524
288b8c02 525 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 526
3c8556c3 527 if (RX_UTF8(rx)) {
b8d68ded
JH
528 PL_reg_flags |= RF_utf8;
529 }
ab3bbdeb 530 DEBUG_EXECUTE_r(
efd26800 531 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
532 sv ? "Guessing start of match in sv for"
533 : "Guessing start of match in string for");
2a782b5b 534 );
cad2e5aa 535
c344f387
JH
536 /* CHR_DIST() would be more correct here but it makes things slow. */
537 if (prog->minlen > strend - strpos) {
a3621e74 538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 539 "String too short... [re_intuit_start]\n"));
cad2e5aa 540 goto fail;
2c2d71f5 541 }
1de06328 542
a1933d95 543 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 544 PL_regeol = strend;
33b8afdf
JH
545 if (do_utf8) {
546 if (!prog->check_utf8 && prog->check_substr)
547 to_utf8_substr(prog);
548 check = prog->check_utf8;
549 } else {
550 if (!prog->check_substr && prog->check_utf8)
551 to_byte_substr(prog);
552 check = prog->check_substr;
553 }
1de06328 554 if (check == &PL_sv_undef) {
a3621e74 555 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 556 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
557 goto fail;
558 }
bbe252da
YO
559 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
560 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
561 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 562 && !multiline ) ); /* Check after \n? */
cad2e5aa 563
7e25d62c 564 if (!ml_anch) {
bbe252da
YO
565 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
566 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 567 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
568 && sv && !SvROK(sv)
569 && (strpos != strbeg)) {
a3621e74 570 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
571 goto fail;
572 }
573 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 574 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 575 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
576 I32 slen;
577
1aa99e6b 578 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 579
653099ff
GS
580 if (SvTAIL(check)) {
581 slen = SvCUR(check); /* >= 1 */
cad2e5aa 582
9041c2e3 583 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 584 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 586 goto fail_finish;
cad2e5aa
JH
587 }
588 /* Now should match s[0..slen-2] */
589 slen--;
3f7c398e 590 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 591 || (slen > 1
3f7c398e 592 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 593 report_neq:
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
595 goto fail_finish;
596 }
cad2e5aa 597 }
3f7c398e 598 else if (*SvPVX_const(check) != *s
653099ff 599 || ((slen = SvCUR(check)) > 1
3f7c398e 600 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 601 goto report_neq;
c315bfe8 602 check_at = s;
2c2d71f5 603 goto success_at_start;
7e25d62c 604 }
cad2e5aa 605 }
2c2d71f5 606 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 607 s = strpos;
2c2d71f5 608 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
609 end_shift = prog->check_end_shift;
610
2c2d71f5 611 if (!ml_anch) {
a3b680e6 612 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 613 - (SvTAIL(check) != 0);
a3b680e6 614 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
615
616 if (end_shift < eshift)
617 end_shift = eshift;
618 }
cad2e5aa 619 }
2c2d71f5 620 else { /* Can match at random position */
cad2e5aa
JH
621 ml_anch = 0;
622 s = strpos;
1de06328
YO
623 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
624 end_shift = prog->check_end_shift;
625
626 /* end shift should be non negative here */
cad2e5aa
JH
627 }
628
bcdf7404 629#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 630 if (end_shift < 0)
1de06328 631 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 632 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
633#endif
634
2c2d71f5
JH
635 restart:
636 /* Find a possible match in the region s..strend by looking for
637 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
638
639 {
640 I32 srch_start_shift = start_shift;
641 I32 srch_end_shift = end_shift;
642 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
643 srch_end_shift -= ((strbeg - s) - srch_start_shift);
644 srch_start_shift = strbeg - s;
645 }
6bda09f9 646 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
647 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
648 (IV)prog->check_offset_min,
649 (IV)srch_start_shift,
650 (IV)srch_end_shift,
651 (IV)prog->check_end_shift);
652 });
653
cad2e5aa 654 if (flags & REXEC_SCREAM) {
cad2e5aa 655 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 656 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 657
2c2d71f5
JH
658 if (PL_screamfirst[BmRARE(check)] >= 0
659 || ( BmRARE(check) == '\n'
85c508c3 660 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 661 && SvTAIL(check) ))
9041c2e3 662 s = screaminstr(sv, check,
1de06328 663 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 664 else
2c2d71f5 665 goto fail_finish;
4addbd3b 666 /* we may be pointing at the wrong string */
07bc277f 667 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 668 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
669 if (data)
670 *data->scream_olds = s;
671 }
1de06328
YO
672 else {
673 U8* start_point;
674 U8* end_point;
bbe252da 675 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
676 start_point= (U8*)(s + srch_start_shift);
677 end_point= (U8*)(strend - srch_end_shift);
678 } else {
679 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
680 end_point= HOP3(strend, -srch_end_shift, strbeg);
681 }
6bda09f9 682 DEBUG_OPTIMISE_MORE_r({
56570a2c 683 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 684 (int)(end_point - start_point),
fc8cd66c 685 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
686 start_point);
687 });
688
689 s = fbm_instr( start_point, end_point,
7fba1cd6 690 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
691 }
692 }
cad2e5aa
JH
693 /* Update the count-of-usability, remove useless subpatterns,
694 unshift s. */
2c2d71f5 695
ab3bbdeb
YO
696 DEBUG_EXECUTE_r({
697 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
698 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
699 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 700 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
701 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
702 ? "anchored" : "floating"),
703 quoted,
704 RE_SV_TAIL(check),
705 (s ? " at offset " : "...\n") );
706 });
2c2d71f5
JH
707
708 if (!s)
709 goto fail_finish;
2c2d71f5 710 /* Finish the diagnostic message */
a3621e74 711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 712
1de06328
YO
713 /* XXX dmq: first branch is for positive lookbehind...
714 Our check string is offset from the beginning of the pattern.
715 So we need to do any stclass tests offset forward from that
716 point. I think. :-(
717 */
718
719
720
721 check_at=s;
722
723
2c2d71f5
JH
724 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
725 Start with the other substr.
726 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 727 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
728 *always* match. Probably should be marked during compile...
729 Probably it is right to do no SCREAM here...
730 */
731
1de06328
YO
732 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
733 : (prog->float_substr && prog->anchored_substr))
734 {
30944b6d 735 /* Take into account the "other" substring. */
2c2d71f5
JH
736 /* XXXX May be hopelessly wrong for UTF... */
737 if (!other_last)
6eb5f6b9 738 other_last = strpos;
33b8afdf 739 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
740 do_other_anchored:
741 {
890ce7af
AL
742 char * const last = HOP3c(s, -start_shift, strbeg);
743 char *last1, *last2;
be8e71aa 744 char * const saved_s = s;
33b8afdf 745 SV* must;
2c2d71f5 746
2c2d71f5
JH
747 t = s - prog->check_offset_max;
748 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 749 && (!do_utf8
0ce71af7 750 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 751 && t > strpos)))
6f207bd3 752 NOOP;
2c2d71f5
JH
753 else
754 t = strpos;
1aa99e6b 755 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
756 if (t < other_last) /* These positions already checked */
757 t = other_last;
1aa99e6b 758 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
759 if (last < last1)
760 last1 = last;
1de06328
YO
761 /* XXXX It is not documented what units *_offsets are in.
762 We assume bytes, but this is clearly wrong.
763 Meaning this code needs to be carefully reviewed for errors.
764 dmq.
765 */
766
2c2d71f5 767 /* On end-of-str: see comment below. */
33b8afdf
JH
768 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
769 if (must == &PL_sv_undef) {
770 s = (char*)NULL;
1de06328 771 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
772 }
773 else
774 s = fbm_instr(
775 (unsigned char*)t,
776 HOP3(HOP3(last1, prog->anchored_offset, strend)
777 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
778 must,
7fba1cd6 779 multiline ? FBMrf_MULTILINE : 0
33b8afdf 780 );
ab3bbdeb
YO
781 DEBUG_EXECUTE_r({
782 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
783 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
784 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 785 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
786 quoted, RE_SV_TAIL(must));
787 });
788
789
2c2d71f5
JH
790 if (!s) {
791 if (last1 >= last2) {
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
793 ", giving up...\n"));
794 goto fail_finish;
795 }
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 797 ", trying floating at offset %ld...\n",
be8e71aa 798 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
799 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
800 s = HOP3c(last, 1, strend);
2c2d71f5
JH
801 goto restart;
802 }
803 else {
a3621e74 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 805 (long)(s - i_strpos)));
1aa99e6b
IH
806 t = HOP3c(s, -prog->anchored_offset, strbeg);
807 other_last = HOP3c(s, 1, strend);
be8e71aa 808 s = saved_s;
2c2d71f5
JH
809 if (t == strpos)
810 goto try_at_start;
2c2d71f5
JH
811 goto try_at_offset;
812 }
30944b6d 813 }
2c2d71f5
JH
814 }
815 else { /* Take into account the floating substring. */
33b8afdf 816 char *last, *last1;
be8e71aa 817 char * const saved_s = s;
33b8afdf
JH
818 SV* must;
819
820 t = HOP3c(s, -start_shift, strbeg);
821 last1 = last =
822 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
823 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
824 last = HOP3c(t, prog->float_max_offset, strend);
825 s = HOP3c(t, prog->float_min_offset, strend);
826 if (s < other_last)
827 s = other_last;
2c2d71f5 828 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
829 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
830 /* fbm_instr() takes into account exact value of end-of-str
831 if the check is SvTAIL(ed). Since false positives are OK,
832 and end-of-str is not later than strend we are OK. */
833 if (must == &PL_sv_undef) {
834 s = (char*)NULL;
1de06328 835 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
836 }
837 else
2c2d71f5 838 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
839 (unsigned char*)last + SvCUR(must)
840 - (SvTAIL(must)!=0),
7fba1cd6 841 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
842 DEBUG_EXECUTE_r({
843 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
844 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
845 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 846 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
847 quoted, RE_SV_TAIL(must));
848 });
33b8afdf
JH
849 if (!s) {
850 if (last1 == last) {
a3621e74 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
852 ", giving up...\n"));
853 goto fail_finish;
2c2d71f5 854 }
a3621e74 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 856 ", trying anchored starting at offset %ld...\n",
be8e71aa 857 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
858 other_last = last;
859 s = HOP3c(t, 1, strend);
860 goto restart;
861 }
862 else {
a3621e74 863 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
864 (long)(s - i_strpos)));
865 other_last = s; /* Fix this later. --Hugo */
be8e71aa 866 s = saved_s;
33b8afdf
JH
867 if (t == strpos)
868 goto try_at_start;
869 goto try_at_offset;
870 }
2c2d71f5 871 }
cad2e5aa 872 }
2c2d71f5 873
1de06328 874
9ef43ace 875 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 876
6bda09f9 877 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
878 PerlIO_printf(Perl_debug_log,
879 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
880 (IV)prog->check_offset_min,
881 (IV)prog->check_offset_max,
882 (IV)(s-strpos),
883 (IV)(t-strpos),
884 (IV)(t-s),
885 (IV)(strend-strpos)
886 )
887 );
888
2c2d71f5 889 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 890 && (!do_utf8
9ef43ace 891 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
892 && t > strpos)))
893 {
2c2d71f5
JH
894 /* Fixed substring is found far enough so that the match
895 cannot start at strpos. */
896 try_at_offset:
cad2e5aa 897 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
898 /* Eventually fbm_*() should handle this, but often
899 anchored_offset is not 0, so this check will not be wasted. */
900 /* XXXX In the code below we prefer to look for "^" even in
901 presence of anchored substrings. And we search even
902 beyond the found float position. These pessimizations
903 are historical artefacts only. */
904 find_anchor:
2c2d71f5 905 while (t < strend - prog->minlen) {
cad2e5aa 906 if (*t == '\n') {
4ee3650e 907 if (t < check_at - prog->check_offset_min) {
33b8afdf 908 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
909 /* Since we moved from the found position,
910 we definitely contradict the found anchored
30944b6d
IZ
911 substr. Due to the above check we do not
912 contradict "check" substr.
913 Thus we can arrive here only if check substr
914 is float. Redo checking for "other"=="fixed".
915 */
9041c2e3 916 strpos = t + 1;
a3621e74 917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 918 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
919 goto do_other_anchored;
920 }
4ee3650e
GS
921 /* We don't contradict the found floating substring. */
922 /* XXXX Why not check for STCLASS? */
cad2e5aa 923 s = t + 1;
a3621e74 924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
926 goto set_useful;
927 }
4ee3650e
GS
928 /* Position contradicts check-string */
929 /* XXXX probably better to look for check-string
930 than for "\n", so one should lower the limit for t? */
a3621e74 931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 932 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 933 other_last = strpos = s = t + 1;
cad2e5aa
JH
934 goto restart;
935 }
936 t++;
937 }
a3621e74 938 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 939 PL_colors[0], PL_colors[1]));
2c2d71f5 940 goto fail_finish;
cad2e5aa 941 }
f5952150 942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 944 PL_colors[0], PL_colors[1]));
f5952150 945 }
cad2e5aa
JH
946 s = t;
947 set_useful:
33b8afdf 948 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
949 }
950 else {
f5952150 951 /* The found string does not prohibit matching at strpos,
2c2d71f5 952 - no optimization of calling REx engine can be performed,
f5952150
GS
953 unless it was an MBOL and we are not after MBOL,
954 or a future STCLASS check will fail this. */
2c2d71f5
JH
955 try_at_start:
956 /* Even in this situation we may use MBOL flag if strpos is offset
957 wrt the start of the string. */
05b4157f 958 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 959 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 960 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 961 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 962 {
cad2e5aa
JH
963 t = strpos;
964 goto find_anchor;
965 }
a3621e74 966 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 967 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 968 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 969 );
2c2d71f5 970 success_at_start:
bbe252da 971 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
972 && (do_utf8 ? (
973 prog->check_utf8 /* Could be deleted already */
974 && --BmUSEFUL(prog->check_utf8) < 0
975 && (prog->check_utf8 == prog->float_utf8)
976 ) : (
977 prog->check_substr /* Could be deleted already */
978 && --BmUSEFUL(prog->check_substr) < 0
979 && (prog->check_substr == prog->float_substr)
980 )))
66e933ab 981 {
cad2e5aa 982 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
ef8d46e8 984 /* XXX Does the destruction order has to change with do_utf8? */
33b8afdf 985 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
ef8d46e8 986 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
987 prog->check_substr = prog->check_utf8 = NULL; /* disable */
988 prog->float_substr = prog->float_utf8 = NULL; /* clear */
989 check = NULL; /* abort */
cad2e5aa 990 s = strpos;
3cf5c195
IZ
991 /* XXXX This is a remnant of the old implementation. It
992 looks wasteful, since now INTUIT can use many
6eb5f6b9 993 other heuristics. */
bbe252da 994 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
995 }
996 else
997 s = strpos;
998 }
999
6eb5f6b9
JH
1000 /* Last resort... */
1001 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1002 /* trie stclasses are too expensive to use here, we are better off to
1003 leave it to regmatch itself */
f8fc2ecf 1004 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1005 /* minlen == 0 is possible if regstclass is \b or \B,
1006 and the fixed substr is ''$.
1007 Since minlen is already taken into account, s+1 is before strend;
1008 accidentally, minlen >= 1 guaranties no false positives at s + 1
1009 even for \b or \B. But (minlen? 1 : 0) below assumes that
1010 regstclass does not come from lookahead... */
1011 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1012 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
1013 const U8* const str = (U8*)STRING(progi->regstclass);
1014 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1015 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1016 : 1);
1de06328
YO
1017 char * endpos;
1018 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1019 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1020 else if (prog->float_substr || prog->float_utf8)
1021 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1022 else
1023 endpos= strend;
1024
70685ca0
JH
1025 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1026 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1027
6eb5f6b9 1028 t = s;
f8fc2ecf 1029 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1030 if (!s) {
1031#ifdef DEBUGGING
cbbf8932 1032 const char *what = NULL;
6eb5f6b9
JH
1033#endif
1034 if (endpos == strend) {
a3621e74 1035 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1036 "Could not match STCLASS...\n") );
1037 goto fail;
1038 }
a3621e74 1039 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1040 "This position contradicts STCLASS...\n") );
bbe252da 1041 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1042 goto fail;
6eb5f6b9 1043 /* Contradict one of substrings */
33b8afdf
JH
1044 if (prog->anchored_substr || prog->anchored_utf8) {
1045 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1046 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1047 hop_and_restart:
1aa99e6b 1048 s = HOP3c(t, 1, strend);
66e933ab
GS
1049 if (s + start_shift + end_shift > strend) {
1050 /* XXXX Should be taken into account earlier? */
a3621e74 1051 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1052 "Could not match STCLASS...\n") );
1053 goto fail;
1054 }
5e39e1e5
HS
1055 if (!check)
1056 goto giveup;
a3621e74 1057 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1058 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1059 what, (long)(s + start_shift - i_strpos)) );
1060 goto restart;
1061 }
66e933ab 1062 /* Have both, check_string is floating */
6eb5f6b9
JH
1063 if (t + start_shift >= check_at) /* Contradicts floating=check */
1064 goto retry_floating_check;
1065 /* Recheck anchored substring, but not floating... */
9041c2e3 1066 s = check_at;
5e39e1e5
HS
1067 if (!check)
1068 goto giveup;
a3621e74 1069 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1070 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1071 (long)(other_last - i_strpos)) );
1072 goto do_other_anchored;
1073 }
60e71179
GS
1074 /* Another way we could have checked stclass at the
1075 current position only: */
1076 if (ml_anch) {
1077 s = t = t + 1;
5e39e1e5
HS
1078 if (!check)
1079 goto giveup;
a3621e74 1080 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1081 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1082 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1083 goto try_at_offset;
66e933ab 1084 }
33b8afdf 1085 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1086 goto fail;
6eb5f6b9
JH
1087 /* Check is floating subtring. */
1088 retry_floating_check:
1089 t = check_at - start_shift;
a3621e74 1090 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1091 goto hop_and_restart;
1092 }
b7953727 1093 if (t != s) {
a3621e74 1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1095 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1096 (long)(t - i_strpos), (long)(s - i_strpos))
1097 );
1098 }
1099 else {
a3621e74 1100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1101 "Does not contradict STCLASS...\n");
1102 );
1103 }
6eb5f6b9 1104 }
5e39e1e5 1105 giveup:
a3621e74 1106 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1107 PL_colors[4], (check ? "Guessed" : "Giving up"),
1108 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1109 return s;
2c2d71f5
JH
1110
1111 fail_finish: /* Substring not found */
33b8afdf
JH
1112 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1113 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1114 fail:
a3621e74 1115 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1116 PL_colors[4], PL_colors[5]));
bd61b366 1117 return NULL;
cad2e5aa 1118}
9661b544 1119
a0a388a1
YO
1120#define DECL_TRIE_TYPE(scan) \
1121 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1122 trie_type = (scan->flags != EXACT) \
1123 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1124 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1125
55eed653
NC
1126#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1127uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1128 switch (trie_type) { \
1129 case trie_utf8_fold: \
1130 if ( foldlen>0 ) { \
0abd0d78 1131 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1132 foldlen -= len; \
1133 uscan += len; \
1134 len=0; \
1135 } else { \
0abd0d78 1136 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1137 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1138 foldlen -= UNISKIP( uvc ); \
1139 uscan = foldbuf + UNISKIP( uvc ); \
1140 } \
1141 break; \
a0a388a1
YO
1142 case trie_latin_utf8_fold: \
1143 if ( foldlen>0 ) { \
1144 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1145 foldlen -= len; \
1146 uscan += len; \
1147 len=0; \
1148 } else { \
1149 len = 1; \
1150 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1151 foldlen -= UNISKIP( uvc ); \
1152 uscan = foldbuf + UNISKIP( uvc ); \
1153 } \
1154 break; \
4cadc6a9
YO
1155 case trie_utf8: \
1156 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1157 break; \
1158 case trie_plain: \
1159 uvc = (UV)*uc; \
1160 len = 1; \
1161 } \
4cadc6a9
YO
1162 if (uvc < 256) { \
1163 charid = trie->charmap[ uvc ]; \
1164 } \
1165 else { \
1166 charid = 0; \
55eed653
NC
1167 if (widecharmap) { \
1168 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1169 (char*)&uvc, sizeof(UV), 0); \
1170 if (svpp) \
1171 charid = (U16)SvIV(*svpp); \
1172 } \
1173 } \
1174} STMT_END
1175
a0a388a1
YO
1176#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1177{ \
1178 char *my_strend= (char *)strend; \
4cadc6a9
YO
1179 if ( (CoNd) \
1180 && (ln == len || \
a0a388a1 1181 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
f2338a2e 1182 m, NULL, ln, cBOOL(UTF))) \
a0a388a1 1183 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1184 goto got_it; \
1185 else { \
1186 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1187 uvchr_to_utf8(tmpbuf, c); \
1188 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1189 if ( f != c \
1190 && (f == c1 || f == c2) \
a0a388a1
YO
1191 && (ln == len || \
1192 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
f2338a2e 1193 m, NULL, ln, cBOOL(UTF)))\
a0a388a1 1194 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1195 goto got_it; \
1196 } \
a0a388a1
YO
1197} \
1198s += len
4cadc6a9
YO
1199
1200#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1201STMT_START { \
1202 while (s <= e) { \
1203 if ( (CoNd) \
1204 && (ln == 1 || !(OP(c) == EXACTF \
1205 ? ibcmp(s, m, ln) \
1206 : ibcmp_locale(s, m, ln))) \
24b23f37 1207 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1208 goto got_it; \
1209 s++; \
1210 } \
1211} STMT_END
1212
1213#define REXEC_FBC_UTF8_SCAN(CoDe) \
1214STMT_START { \
1215 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1216 CoDe \
1217 s += uskip; \
1218 } \
1219} STMT_END
1220
1221#define REXEC_FBC_SCAN(CoDe) \
1222STMT_START { \
1223 while (s < strend) { \
1224 CoDe \
1225 s++; \
1226 } \
1227} STMT_END
1228
1229#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1230REXEC_FBC_UTF8_SCAN( \
1231 if (CoNd) { \
24b23f37 1232 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1233 goto got_it; \
1234 else \
1235 tmp = doevery; \
1236 } \
1237 else \
1238 tmp = 1; \
1239)
1240
1241#define REXEC_FBC_CLASS_SCAN(CoNd) \
1242REXEC_FBC_SCAN( \
1243 if (CoNd) { \
24b23f37 1244 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1245 goto got_it; \
1246 else \
1247 tmp = doevery; \
1248 } \
1249 else \
1250 tmp = 1; \
1251)
1252
1253#define REXEC_FBC_TRYIT \
24b23f37 1254if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1255 goto got_it
1256
e1d1eefb
YO
1257#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1258 if (do_utf8) { \
1259 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1260 } \
1261 else { \
1262 REXEC_FBC_CLASS_SCAN(CoNd); \
1263 } \
1264 break
1265
4cadc6a9
YO
1266#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1267 if (do_utf8) { \
1268 UtFpReLoAd; \
1269 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1270 } \
1271 else { \
1272 REXEC_FBC_CLASS_SCAN(CoNd); \
1273 } \
1274 break
1275
1276#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1277 PL_reg_flags |= RF_tainted; \
1278 if (do_utf8) { \
1279 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1280 } \
1281 else { \
1282 REXEC_FBC_CLASS_SCAN(CoNd); \
1283 } \
1284 break
1285
786e8c11
YO
1286#define DUMP_EXEC_POS(li,s,doutf8) \
1287 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1288
1289/* We know what class REx starts with. Try to find this position... */
1290/* if reginfo is NULL, its a dryrun */
1291/* annoyingly all the vars in this routine have different names from their counterparts
1292 in regmatch. /grrr */
1293
3c3eec57 1294STATIC char *
07be1b83 1295S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1296 const char *strend, regmatch_info *reginfo)
a687059c 1297{
27da23d5 1298 dVAR;
bbe252da 1299 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1300 char *m;
d8093b23 1301 STRLEN ln;
5dab1207 1302 STRLEN lnc;
078c425b 1303 register STRLEN uskip;
d8093b23 1304 unsigned int c1;
1305 unsigned int c2;
6eb5f6b9
JH
1306 char *e;
1307 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1308 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1309 RXi_GET_DECL(prog,progi);
7918f24d
NC
1310
1311 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1312
6eb5f6b9
JH
1313 /* We know what class it must start with. */
1314 switch (OP(c)) {
6eb5f6b9 1315 case ANYOF:
388cc4de 1316 if (do_utf8) {
4cadc6a9 1317 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1318 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1319 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1320 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1321 }
1322 else {
1323 while (s < strend) {
1324 STRLEN skip = 1;
1325
32fc9b6a 1326 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1327 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1328 /* The assignment of 2 is intentional:
1329 * for the folded sharp s, the skip is 2. */
1330 (skip = SHARP_S_SKIP))) {
24b23f37 1331 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1332 goto got_it;
1333 else
1334 tmp = doevery;
1335 }
1336 else
1337 tmp = 1;
1338 s += skip;
1339 }
a0d0e21e 1340 }
6eb5f6b9 1341 break;
f33976b4 1342 case CANY:
4cadc6a9 1343 REXEC_FBC_SCAN(
24b23f37 1344 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1345 goto got_it;
1346 else
1347 tmp = doevery;
4cadc6a9 1348 );
f33976b4 1349 break;
6eb5f6b9 1350 case EXACTF:
5dab1207
NIS
1351 m = STRING(c);
1352 ln = STR_LEN(c); /* length to match in octets/bytes */
1353 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1354 if (UTF) {
a2a2844f 1355 STRLEN ulen1, ulen2;
5dab1207 1356 U8 *sm = (U8 *) m;
89ebb4a3
JH
1357 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1358 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1359 /* used by commented-out code below */
1360 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1361
1362 /* XXX: Since the node will be case folded at compile
1363 time this logic is a little odd, although im not
1364 sure that its actually wrong. --dmq */
1365
1366 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1367 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1368
1369 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1370 codepoint of the first character in the converted
1371 form, yet originally we did the extra step.
1372 No tests fail by commenting this code out however
1373 so Ive left it out. -- dmq.
1374
89ebb4a3 1375 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1376 0, uniflags);
89ebb4a3 1377 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1378 0, uniflags);
a0a388a1
YO
1379 */
1380
5dab1207
NIS
1381 lnc = 0;
1382 while (sm < ((U8 *) m + ln)) {
1383 lnc++;
1384 sm += UTF8SKIP(sm);
1385 }
1aa99e6b
IH
1386 }
1387 else {
1388 c1 = *(U8*)m;
1389 c2 = PL_fold[c1];
1390 }
6eb5f6b9
JH
1391 goto do_exactf;
1392 case EXACTFL:
5dab1207
NIS
1393 m = STRING(c);
1394 ln = STR_LEN(c);
1395 lnc = (I32) ln;
d8093b23 1396 c1 = *(U8*)m;
6eb5f6b9
JH
1397 c2 = PL_fold_locale[c1];
1398 do_exactf:
db12adc6 1399 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1400
3b0527fe 1401 if (!reginfo && e < s)
6eb5f6b9 1402 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1403
60a8b682
JH
1404 /* The idea in the EXACTF* cases is to first find the
1405 * first character of the EXACTF* node and then, if
1406 * necessary, case-insensitively compare the full
1407 * text of the node. The c1 and c2 are the first
1408 * characters (though in Unicode it gets a bit
1409 * more complicated because there are more cases
7f16dd3d
JH
1410 * than just upper and lower: one needs to use
1411 * the so-called folding case for case-insensitive
1412 * matching (called "loose matching" in Unicode).
1413 * ibcmp_utf8() will do just that. */
60a8b682 1414
a0a388a1 1415 if (do_utf8 || UTF) {
575cac57 1416 UV c, f;
89ebb4a3 1417 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1418 STRLEN len = 1;
1419 STRLEN foldlen;
4ad0818d 1420 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1421 if (c1 == c2) {
5dab1207
NIS
1422 /* Upper and lower of 1st char are equal -
1423 * probably not a "letter". */
1aa99e6b 1424 while (s <= e) {
a0a388a1
YO
1425 if (do_utf8) {
1426 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1427 uniflags);
a0a388a1
YO
1428 } else {
1429 c = *((U8*)s);
1430 }
4cadc6a9 1431 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1432 }
09091399
JH
1433 }
1434 else {
1aa99e6b 1435 while (s <= e) {
a0a388a1
YO
1436 if (do_utf8) {
1437 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1438 uniflags);
a0a388a1
YO
1439 } else {
1440 c = *((U8*)s);
1441 }
80aecb99 1442
60a8b682 1443 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1444 * Note that not all the possible combinations
1445 * are handled here: some of them are handled
1446 * by the standard folding rules, and some of
1447 * them (the character class or ANYOF cases)
1448 * are handled during compiletime in
1449 * regexec.c:S_regclass(). */
880bd946
JH
1450 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1451 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1452 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1453
4cadc6a9 1454 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1455 }
09091399 1456 }
1aa99e6b
IH
1457 }
1458 else {
a0a388a1 1459 /* Neither pattern nor string are UTF8 */
1aa99e6b 1460 if (c1 == c2)
4cadc6a9 1461 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1462 else
4cadc6a9 1463 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1464 }
1465 break;
bbce6d69 1466 case BOUNDL:
3280af22 1467 PL_reg_flags |= RF_tainted;
bbce6d69 1468 /* FALL THROUGH */
a0d0e21e 1469 case BOUND:
ffc61ed2 1470 if (do_utf8) {
12d33761 1471 if (s == PL_bostr)
ffc61ed2
JH
1472 tmp = '\n';
1473 else {
6136c704 1474 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1475 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1476 }
1477 tmp = ((OP(c) == BOUND ?
9041c2e3 1478 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1479 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1480 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1481 if (tmp == !(OP(c) == BOUND ?
f2338a2e 1482 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
ffc61ed2
JH
1483 isALNUM_LC_utf8((U8*)s)))
1484 {
1485 tmp = !tmp;
4cadc6a9 1486 REXEC_FBC_TRYIT;
a687059c 1487 }
4cadc6a9 1488 );
a0d0e21e 1489 }
667bb95a 1490 else {
12d33761 1491 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1492 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1493 REXEC_FBC_SCAN(
ffc61ed2
JH
1494 if (tmp ==
1495 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1496 tmp = !tmp;
4cadc6a9 1497 REXEC_FBC_TRYIT;
a0ed51b3 1498 }
4cadc6a9 1499 );
a0ed51b3 1500 }
24b23f37 1501 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1502 goto got_it;
1503 break;
bbce6d69 1504 case NBOUNDL:
3280af22 1505 PL_reg_flags |= RF_tainted;
bbce6d69 1506 /* FALL THROUGH */
a0d0e21e 1507 case NBOUND:
ffc61ed2 1508 if (do_utf8) {
12d33761 1509 if (s == PL_bostr)
ffc61ed2
JH
1510 tmp = '\n';
1511 else {
6136c704 1512 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1513 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1514 }
1515 tmp = ((OP(c) == NBOUND ?
9041c2e3 1516 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1517 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1518 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1519 if (tmp == !(OP(c) == NBOUND ?
f2338a2e 1520 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) :
ffc61ed2
JH
1521 isALNUM_LC_utf8((U8*)s)))
1522 tmp = !tmp;
4cadc6a9
YO
1523 else REXEC_FBC_TRYIT;
1524 );
a0d0e21e 1525 }
667bb95a 1526 else {
12d33761 1527 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1528 tmp = ((OP(c) == NBOUND ?
1529 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1530 REXEC_FBC_SCAN(
ffc61ed2
JH
1531 if (tmp ==
1532 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1533 tmp = !tmp;
4cadc6a9
YO
1534 else REXEC_FBC_TRYIT;
1535 );
a0ed51b3 1536 }
24b23f37 1537 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1538 goto got_it;
1539 break;
a0d0e21e 1540 case ALNUM:
4cadc6a9 1541 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1542 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1543 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1544 isALNUM(*s)
1545 );
bbce6d69 1546 case ALNUML:
4cadc6a9
YO
1547 REXEC_FBC_CSCAN_TAINT(
1548 isALNUM_LC_utf8((U8*)s),
1549 isALNUM_LC(*s)
1550 );
a0d0e21e 1551 case NALNUM:
4cadc6a9 1552 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1553 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1554 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1555 !isALNUM(*s)
1556 );
bbce6d69 1557 case NALNUML:
4cadc6a9
YO
1558 REXEC_FBC_CSCAN_TAINT(
1559 !isALNUM_LC_utf8((U8*)s),
1560 !isALNUM_LC(*s)
1561 );
a0d0e21e 1562 case SPACE:
4cadc6a9 1563 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1564 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1565 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
4cadc6a9
YO
1566 isSPACE(*s)
1567 );
bbce6d69 1568 case SPACEL:
4cadc6a9
YO
1569 REXEC_FBC_CSCAN_TAINT(
1570 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1571 isSPACE_LC(*s)
1572 );
a0d0e21e 1573 case NSPACE:
4cadc6a9 1574 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1575 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1576 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
4cadc6a9
YO
1577 !isSPACE(*s)
1578 );
bbce6d69 1579 case NSPACEL:
4cadc6a9
YO
1580 REXEC_FBC_CSCAN_TAINT(
1581 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1582 !isSPACE_LC(*s)
1583 );
a0d0e21e 1584 case DIGIT:
4cadc6a9 1585 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1586 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1587 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1588 isDIGIT(*s)
1589 );
b8c5462f 1590 case DIGITL:
4cadc6a9
YO
1591 REXEC_FBC_CSCAN_TAINT(
1592 isDIGIT_LC_utf8((U8*)s),
1593 isDIGIT_LC(*s)
1594 );
a0d0e21e 1595 case NDIGIT:
4cadc6a9 1596 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1597 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1598 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1599 !isDIGIT(*s)
1600 );
b8c5462f 1601 case NDIGITL:
4cadc6a9
YO
1602 REXEC_FBC_CSCAN_TAINT(
1603 !isDIGIT_LC_utf8((U8*)s),
1604 !isDIGIT_LC(*s)
1605 );
e1d1eefb
YO
1606 case LNBREAK:
1607 REXEC_FBC_CSCAN(
1608 is_LNBREAK_utf8(s),
1609 is_LNBREAK_latin1(s)
1610 );
1611 case VERTWS:
1612 REXEC_FBC_CSCAN(
1613 is_VERTWS_utf8(s),
1614 is_VERTWS_latin1(s)
1615 );
1616 case NVERTWS:
1617 REXEC_FBC_CSCAN(
1618 !is_VERTWS_utf8(s),
1619 !is_VERTWS_latin1(s)
1620 );
1621 case HORIZWS:
1622 REXEC_FBC_CSCAN(
1623 is_HORIZWS_utf8(s),
1624 is_HORIZWS_latin1(s)
1625 );
1626 case NHORIZWS:
1627 REXEC_FBC_CSCAN(
1628 !is_HORIZWS_utf8(s),
1629 !is_HORIZWS_latin1(s)
1630 );
1de06328
YO
1631 case AHOCORASICKC:
1632 case AHOCORASICK:
07be1b83 1633 {
a0a388a1 1634 DECL_TRIE_TYPE(c);
07be1b83
YO
1635 /* what trie are we using right now */
1636 reg_ac_data *aho
f8fc2ecf 1637 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1638 reg_trie_data *trie
1639 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1640 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1641
1642 const char *last_start = strend - trie->minlen;
6148ee25 1643#ifdef DEBUGGING
07be1b83 1644 const char *real_start = s;
6148ee25 1645#endif
07be1b83 1646 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1647 SV *sv_points;
1648 U8 **points; /* map of where we were in the input string
786e8c11 1649 when reading a given char. For ASCII this
be8e71aa 1650 is unnecessary overhead as the relationship
38a44b82
NC
1651 is always 1:1, but for Unicode, especially
1652 case folded Unicode this is not true. */
f9e705e8 1653 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1654 U8 *bitmap=NULL;
1655
07be1b83
YO
1656
1657 GET_RE_DEBUG_FLAGS_DECL;
1658
be8e71aa
YO
1659 /* We can't just allocate points here. We need to wrap it in
1660 * an SV so it gets freed properly if there is a croak while
1661 * running the match */
1662 ENTER;
1663 SAVETMPS;
1664 sv_points=newSV(maxlen * sizeof(U8 *));
1665 SvCUR_set(sv_points,
1666 maxlen * sizeof(U8 *));
1667 SvPOK_on(sv_points);
1668 sv_2mortal(sv_points);
1669 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1670 if ( trie_type != trie_utf8_fold
1671 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1672 {
786e8c11
YO
1673 if (trie->bitmap)
1674 bitmap=(U8*)trie->bitmap;
1675 else
1676 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1677 }
786e8c11
YO
1678 /* this is the Aho-Corasick algorithm modified a touch
1679 to include special handling for long "unknown char"
1680 sequences. The basic idea being that we use AC as long
1681 as we are dealing with a possible matching char, when
1682 we encounter an unknown char (and we have not encountered
1683 an accepting state) we scan forward until we find a legal
1684 starting char.
1685 AC matching is basically that of trie matching, except
1686 that when we encounter a failing transition, we fall back
1687 to the current states "fail state", and try the current char
1688 again, a process we repeat until we reach the root state,
1689 state 1, or a legal transition. If we fail on the root state
1690 then we can either terminate if we have reached an accepting
1691 state previously, or restart the entire process from the beginning
1692 if we have not.
1693
1694 */
07be1b83
YO
1695 while (s <= last_start) {
1696 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1697 U8 *uc = (U8*)s;
1698 U16 charid = 0;
1699 U32 base = 1;
1700 U32 state = 1;
1701 UV uvc = 0;
1702 STRLEN len = 0;
1703 STRLEN foldlen = 0;
1704 U8 *uscan = (U8*)NULL;
1705 U8 *leftmost = NULL;
786e8c11
YO
1706#ifdef DEBUGGING
1707 U32 accepted_word= 0;
1708#endif
07be1b83
YO
1709 U32 pointpos = 0;
1710
1711 while ( state && uc <= (U8*)strend ) {
1712 int failed=0;
786e8c11
YO
1713 U32 word = aho->states[ state ].wordnum;
1714
1de06328
YO
1715 if( state==1 ) {
1716 if ( bitmap ) {
1717 DEBUG_TRIE_EXECUTE_r(
1718 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1719 dump_exec_pos( (char *)uc, c, strend, real_start,
1720 (char *)uc, do_utf8 );
1721 PerlIO_printf( Perl_debug_log,
1722 " Scanning for legal start char...\n");
1723 }
1724 );
1725 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1726 uc++;
786e8c11 1727 }
1de06328 1728 s= (char *)uc;
786e8c11 1729 }
786e8c11
YO
1730 if (uc >(U8*)last_start) break;
1731 }
1732
1733 if ( word ) {
1734 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1735 if (!leftmost || lpos < leftmost) {
1736 DEBUG_r(accepted_word=word);
07be1b83 1737 leftmost= lpos;
786e8c11 1738 }
07be1b83 1739 if (base==0) break;
786e8c11 1740
07be1b83
YO
1741 }
1742 points[pointpos++ % maxlen]= uc;
55eed653
NC
1743 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1744 uscan, len, uvc, charid, foldlen,
1745 foldbuf, uniflags);
786e8c11
YO
1746 DEBUG_TRIE_EXECUTE_r({
1747 dump_exec_pos( (char *)uc, c, strend, real_start,
1748 s, do_utf8 );
07be1b83 1749 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1750 " Charid:%3u CP:%4"UVxf" ",
1751 charid, uvc);
1752 });
07be1b83
YO
1753
1754 do {
6148ee25 1755#ifdef DEBUGGING
786e8c11 1756 word = aho->states[ state ].wordnum;
6148ee25 1757#endif
07be1b83
YO
1758 base = aho->states[ state ].trans.base;
1759
786e8c11
YO
1760 DEBUG_TRIE_EXECUTE_r({
1761 if (failed)
1762 dump_exec_pos( (char *)uc, c, strend, real_start,
1763 s, do_utf8 );
07be1b83 1764 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1765 "%sState: %4"UVxf", word=%"UVxf,
1766 failed ? " Fail transition to " : "",
1767 (UV)state, (UV)word);
1768 });
07be1b83
YO
1769 if ( base ) {
1770 U32 tmp;
1771 if (charid &&
1772 (base + charid > trie->uniquecharcount )
1773 && (base + charid - 1 - trie->uniquecharcount
1774 < trie->lasttrans)
1775 && trie->trans[base + charid - 1 -
1776 trie->uniquecharcount].check == state
1777 && (tmp=trie->trans[base + charid - 1 -
1778 trie->uniquecharcount ].next))
1779 {
786e8c11
YO
1780 DEBUG_TRIE_EXECUTE_r(
1781 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1782 state = tmp;
1783 break;
1784 }
1785 else {
786e8c11
YO
1786 DEBUG_TRIE_EXECUTE_r(
1787 PerlIO_printf( Perl_debug_log," - fail\n"));
1788 failed = 1;
1789 state = aho->fail[state];
07be1b83
YO
1790 }
1791 }
1792 else {
1793 /* we must be accepting here */
786e8c11
YO
1794 DEBUG_TRIE_EXECUTE_r(
1795 PerlIO_printf( Perl_debug_log," - accepting\n"));
1796 failed = 1;
07be1b83
YO
1797 break;
1798 }
1799 } while(state);
786e8c11 1800 uc += len;
07be1b83
YO
1801 if (failed) {
1802 if (leftmost)
1803 break;
786e8c11 1804 if (!state) state = 1;
07be1b83
YO
1805 }
1806 }
1807 if ( aho->states[ state ].wordnum ) {
1808 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1809 if (!leftmost || lpos < leftmost) {
1810 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1811 leftmost = lpos;
786e8c11 1812 }
07be1b83 1813 }
07be1b83
YO
1814 if (leftmost) {
1815 s = (char*)leftmost;
786e8c11
YO
1816 DEBUG_TRIE_EXECUTE_r({
1817 PerlIO_printf(
70685ca0
JH
1818 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1819 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1820 );
1821 });
24b23f37 1822 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1823 FREETMPS;
1824 LEAVE;
07be1b83 1825 goto got_it;
be8e71aa 1826 }
07be1b83 1827 s = HOPc(s,1);
786e8c11
YO
1828 DEBUG_TRIE_EXECUTE_r({
1829 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1830 });
07be1b83 1831 } else {
786e8c11
YO
1832 DEBUG_TRIE_EXECUTE_r(
1833 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1834 break;
1835 }
1836 }
be8e71aa
YO
1837 FREETMPS;
1838 LEAVE;
07be1b83
YO
1839 }
1840 break;
b3c9acc1 1841 default:
3c3eec57
GS
1842 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1843 break;
d6a28714 1844 }
6eb5f6b9
JH
1845 return 0;
1846 got_it:
1847 return s;
1848}
1849
fae667d5 1850
6eb5f6b9
JH
1851/*
1852 - regexec_flags - match a regexp against a string
1853 */
1854I32
288b8c02 1855Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1856 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1857/* strend: pointer to null at end of string */
1858/* strbeg: real beginning of string */
1859/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1860/* data: May be used for some additional optimizations.
1861 Currently its only used, with a U32 cast, for transmitting
1862 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1863/* nosave: For optimizations. */
1864{
97aff369 1865 dVAR;
288b8c02 1866 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1867 /*register*/ char *s;
6eb5f6b9 1868 register regnode *c;
24b23f37 1869 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1870 I32 minlen; /* must match at least this many chars */
1871 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1872 I32 end_shift = 0; /* Same for the end. */ /* CC */
1873 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1874 char *scream_olds = NULL;
f2338a2e 1875 const bool do_utf8 = cBOOL(DO_UTF8(sv));
2757e526 1876 I32 multiline;
f8fc2ecf 1877 RXi_GET_DECL(prog,progi);
3b0527fe 1878 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1879 regexp_paren_pair *swap = NULL;
a3621e74
YO
1880 GET_RE_DEBUG_FLAGS_DECL;
1881
7918f24d 1882 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1883 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1884
1885 /* Be paranoid... */
1886 if (prog == NULL || startpos == NULL) {
1887 Perl_croak(aTHX_ "NULL regexp parameter");
1888 return 0;
1889 }
1890
bbe252da 1891 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1892 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1893
288b8c02 1894 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1895 DEBUG_EXECUTE_r(
efd26800 1896 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1897 "Matching");
1898 );
bac06658 1899
6eb5f6b9 1900 minlen = prog->minlen;
1de06328
YO
1901
1902 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1904 "String too short [regexec_flags]...\n"));
1905 goto phooey;
1aa99e6b 1906 }
6eb5f6b9 1907
1de06328 1908
6eb5f6b9 1909 /* Check validity of program. */
f8fc2ecf 1910 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1911 Perl_croak(aTHX_ "corrupted regexp program");
1912 }
1913
1914 PL_reg_flags = 0;
1915 PL_reg_eval_set = 0;
1916 PL_reg_maxiter = 0;
1917
3c8556c3 1918 if (RX_UTF8(rx))
6eb5f6b9
JH
1919 PL_reg_flags |= RF_utf8;
1920
1921 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1922 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1923 PL_bostr = strbeg;
3b0527fe 1924 reginfo.sv = sv;
6eb5f6b9
JH
1925
1926 /* Mark end of line for $ (and such) */
1927 PL_regeol = strend;
1928
1929 /* see how far we have to get to not match where we matched before */
3b0527fe 1930 reginfo.till = startpos+minend;
6eb5f6b9 1931
6eb5f6b9
JH
1932 /* If there is a "must appear" string, look for it. */
1933 s = startpos;
1934
bbe252da 1935 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 1936 MAGIC *mg;
2c296965 1937 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 1938 reginfo.ganch = startpos + prog->gofs;
2c296965 1939 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1940 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 1941 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 1942 && SvMAGIC(sv)
14befaf4
DM
1943 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1944 && mg->mg_len >= 0) {
3b0527fe 1945 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 1946 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1947 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 1948
bbe252da 1949 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1950 if (s > reginfo.ganch)
6eb5f6b9 1951 goto phooey;
58e23c8d 1952 s = reginfo.ganch - prog->gofs;
2c296965 1953 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1954 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
1955 if (s < strbeg)
1956 goto phooey;
6eb5f6b9
JH
1957 }
1958 }
58e23c8d 1959 else if (data) {
70685ca0 1960 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
1961 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1962 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1963
1964 } else { /* pos() not defined */
3b0527fe 1965 reginfo.ganch = strbeg;
2c296965
YO
1966 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1967 "GPOS: reginfo.ganch = strbeg\n"));
1968 }
6eb5f6b9 1969 }
288b8c02 1970 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
1971 /* We have to be careful. If the previous successful match
1972 was from this regex we don't want a subsequent partially
1973 successful match to clobber the old results.
1974 So when we detect this possibility we add a swap buffer
1975 to the re, and switch the buffer each match. If we fail
1976 we switch it back, otherwise we leave it swapped.
1977 */
1978 swap = prog->offs;
1979 /* do we need a save destructor here for eval dies? */
1980 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 1981 }
a0714e2c 1982 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1983 re_scream_pos_data d;
1984
1985 d.scream_olds = &scream_olds;
1986 d.scream_pos = &scream_pos;
288b8c02 1987 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1988 if (!s) {
a3621e74 1989 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1990 goto phooey; /* not present */
3fa9c3d7 1991 }
6eb5f6b9
JH
1992 }
1993
1de06328 1994
6eb5f6b9
JH
1995
1996 /* Simplest case: anchored match need be tried only once. */
1997 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1998 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1999 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2000 goto got_it;
bbe252da
YO
2001 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2002 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2003 {
2004 char *end;
2005
2006 if (minlen)
2007 dontbother = minlen - 1;
1aa99e6b 2008 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2009 /* for multiline we only have to try after newlines */
33b8afdf 2010 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
2011 if (s == startpos)
2012 goto after_try;
2013 while (1) {
24b23f37 2014 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2015 goto got_it;
2016 after_try:
5339e136 2017 if (s > end)
6eb5f6b9 2018 goto phooey;
bbe252da 2019 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 2020 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
2021 if (!s)
2022 goto phooey;
2023 }
2024 else
2025 s++;
2026 }
2027 } else {
2028 if (s > startpos)
2029 s--;
2030 while (s < end) {
2031 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2032 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2033 goto got_it;
2034 }
2035 }
2036 }
2037 }
2038 goto phooey;
bbe252da 2039 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
2040 {
2041 /* the warning about reginfo.ganch being used without intialization
bbe252da 2042 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2043 and we only enter this block when the same bit is set. */
58e23c8d 2044 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2045
2046 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2047 goto got_it;
2048 goto phooey;
2049 }
2050
2051 /* Messy cases: unanchored match. */
bbe252da 2052 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
2053 /* we have /x+whatever/ */
2054 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 2055 char ch;
bf93d4cc
GS
2056#ifdef DEBUGGING
2057 int did_match = 0;
2058#endif
33b8afdf
JH
2059 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2060 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 2061 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2062
1aa99e6b 2063 if (do_utf8) {
4cadc6a9 2064 REXEC_FBC_SCAN(
6eb5f6b9 2065 if (*s == ch) {
a3621e74 2066 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2067 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2068 s += UTF8SKIP(s);
2069 while (s < strend && *s == ch)
2070 s += UTF8SKIP(s);
2071 }
4cadc6a9 2072 );
6eb5f6b9
JH
2073 }
2074 else {
4cadc6a9 2075 REXEC_FBC_SCAN(
6eb5f6b9 2076 if (*s == ch) {
a3621e74 2077 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2078 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2079 s++;
2080 while (s < strend && *s == ch)
2081 s++;
2082 }
4cadc6a9 2083 );
6eb5f6b9 2084 }
a3621e74 2085 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2086 PerlIO_printf(Perl_debug_log,
b7953727
JH
2087 "Did not find anchored character...\n")
2088 );
6eb5f6b9 2089 }
a0714e2c
SS
2090 else if (prog->anchored_substr != NULL
2091 || prog->anchored_utf8 != NULL
2092 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2093 && prog->float_max_offset < strend - s)) {
2094 SV *must;
2095 I32 back_max;
2096 I32 back_min;
2097 char *last;
6eb5f6b9 2098 char *last1; /* Last position checked before */
bf93d4cc
GS
2099#ifdef DEBUGGING
2100 int did_match = 0;
2101#endif
33b8afdf
JH
2102 if (prog->anchored_substr || prog->anchored_utf8) {
2103 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2104 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2105 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2106 back_max = back_min = prog->anchored_offset;
2107 } else {
2108 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2109 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2110 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2111 back_max = prog->float_max_offset;
2112 back_min = prog->float_min_offset;
2113 }
1de06328
YO
2114
2115
33b8afdf
JH
2116 if (must == &PL_sv_undef)
2117 /* could not downgrade utf8 check substring, so must fail */
2118 goto phooey;
2119
1de06328
YO
2120 if (back_min<0) {
2121 last = strend;
2122 } else {
2123 last = HOP3c(strend, /* Cannot start after this */
2124 -(I32)(CHR_SVLEN(must)
2125 - (SvTAIL(must) != 0) + back_min), strbeg);
2126 }
6eb5f6b9
JH
2127 if (s > PL_bostr)
2128 last1 = HOPc(s, -1);
2129 else
2130 last1 = s - 1; /* bogus */
2131
a0288114 2132 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2133 check_substr==must. */
2134 scream_pos = -1;
2135 dontbother = end_shift;
2136 strend = HOPc(strend, -dontbother);
2137 while ( (s <= last) &&
9041c2e3 2138 ((flags & REXEC_SCREAM)
1de06328 2139 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2140 end_shift, &scream_pos, 0))
1de06328 2141 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2142 (unsigned char*)strend, must,
7fba1cd6 2143 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2144 /* we may be pointing at the wrong string */
07bc277f 2145 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2146 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2147 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2148 if (HOPc(s, -back_max) > last1) {
2149 last1 = HOPc(s, -back_min);
2150 s = HOPc(s, -back_max);
2151 }
2152 else {
52657f30 2153 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2154
2155 last1 = HOPc(s, -back_min);
52657f30 2156 s = t;
6eb5f6b9 2157 }
1aa99e6b 2158 if (do_utf8) {
6eb5f6b9 2159 while (s <= last1) {
24b23f37 2160 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2161 goto got_it;
2162 s += UTF8SKIP(s);
2163 }
2164 }
2165 else {
2166 while (s <= last1) {
24b23f37 2167 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2168 goto got_it;
2169 s++;
2170 }
2171 }
2172 }
ab3bbdeb
YO
2173 DEBUG_EXECUTE_r(if (!did_match) {
2174 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2175 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2176 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2177 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2178 ? "anchored" : "floating"),
ab3bbdeb
YO
2179 quoted, RE_SV_TAIL(must));
2180 });
6eb5f6b9
JH
2181 goto phooey;
2182 }
f8fc2ecf 2183 else if ( (c = progi->regstclass) ) {
f14c76ed 2184 if (minlen) {
f8fc2ecf 2185 const OPCODE op = OP(progi->regstclass);
66e933ab 2186 /* don't bother with what can't match */
786e8c11 2187 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2188 strend = HOPc(strend, -(minlen - 1));
2189 }
a3621e74 2190 DEBUG_EXECUTE_r({
be8e71aa 2191 SV * const prop = sv_newmortal();
32fc9b6a 2192 regprop(prog, prop, c);
0df25f3d 2193 {
02daf0ab 2194 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2195 s,strend-s,60);
0df25f3d 2196 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2197 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2198 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2199 quoted, (int)(strend - s));
0df25f3d 2200 }
ffc61ed2 2201 });
3b0527fe 2202 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2203 goto got_it;
07be1b83 2204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2205 }
2206 else {
2207 dontbother = 0;
a0714e2c 2208 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2209 /* Trim the end. */
d6a28714 2210 char *last;
33b8afdf
JH
2211 SV* float_real;
2212
2213 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2214 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2215 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2216
2217 if (flags & REXEC_SCREAM) {
33b8afdf 2218 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2219 end_shift, &scream_pos, 1); /* last one */
2220 if (!last)
ffc61ed2 2221 last = scream_olds; /* Only one occurrence. */
4addbd3b 2222 /* we may be pointing at the wrong string */
07bc277f 2223 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2224 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2225 }
d6a28714
JH
2226 else {
2227 STRLEN len;
cfd0369c 2228 const char * const little = SvPV_const(float_real, len);
d6a28714 2229
33b8afdf 2230 if (SvTAIL(float_real)) {
d6a28714
JH
2231 if (memEQ(strend - len + 1, little, len - 1))
2232 last = strend - len + 1;
7fba1cd6 2233 else if (!multiline)
9041c2e3 2234 last = memEQ(strend - len, little, len)
bd61b366 2235 ? strend - len : NULL;
b8c5462f 2236 else
d6a28714
JH
2237 goto find_last;
2238 } else {
2239 find_last:
9041c2e3 2240 if (len)
d6a28714 2241 last = rninstr(s, strend, little, little + len);
b8c5462f 2242 else
a0288114 2243 last = strend; /* matching "$" */
b8c5462f 2244 }
b8c5462f 2245 }
bf93d4cc 2246 if (last == NULL) {
6bda09f9
YO
2247 DEBUG_EXECUTE_r(
2248 PerlIO_printf(Perl_debug_log,
2249 "%sCan't trim the tail, match fails (should not happen)%s\n",
2250 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2251 goto phooey; /* Should not happen! */
2252 }
d6a28714
JH
2253 dontbother = strend - last + prog->float_min_offset;
2254 }
2255 if (minlen && (dontbother < minlen))
2256 dontbother = minlen - 1;
2257 strend -= dontbother; /* this one's always in bytes! */
2258 /* We don't know much -- general case. */
1aa99e6b 2259 if (do_utf8) {
d6a28714 2260 for (;;) {
24b23f37 2261 if (regtry(&reginfo, &s))
d6a28714
JH
2262 goto got_it;
2263 if (s >= strend)
2264 break;
b8c5462f 2265 s += UTF8SKIP(s);
d6a28714
JH
2266 };
2267 }
2268 else {
2269 do {
24b23f37 2270 if (regtry(&reginfo, &s))
d6a28714
JH
2271 goto got_it;
2272 } while (s++ < strend);
2273 }
2274 }
2275
2276 /* Failure. */
2277 goto phooey;
2278
2279got_it:
e9105d30 2280 Safefree(swap);
288b8c02 2281 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2282
19b95bf0 2283 if (PL_reg_eval_set)
4f639d21 2284 restore_pos(aTHX_ prog);
5daac39c
NC
2285 if (RXp_PAREN_NAMES(prog))
2286 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2287
2288 /* make sure $`, $&, $', and $digit will work later */
2289 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2290 RX_MATCH_COPY_FREE(rx);
d6a28714 2291 if (flags & REXEC_COPY_STR) {
be8e71aa 2292 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2293#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2294 if ((SvIsCOW(sv)
2295 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2296 if (DEBUG_C_TEST) {
2297 PerlIO_printf(Perl_debug_log,
2298 "Copy on write: regexp capture, type %d\n",
2299 (int) SvTYPE(sv));
2300 }
2301 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2302 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2303 assert (SvPOKp(prog->saved_copy));
2304 } else
2305#endif
2306 {
288b8c02 2307 RX_MATCH_COPIED_on(rx);
ed252734
NC
2308 s = savepvn(strbeg, i);
2309 prog->subbeg = s;
2310 }
d6a28714 2311 prog->sublen = i;
d6a28714
JH
2312 }
2313 else {
2314 prog->subbeg = strbeg;
2315 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2316 }
2317 }
9041c2e3 2318
d6a28714
JH
2319 return 1;
2320
2321phooey:
a3621e74 2322 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2323 PL_colors[4], PL_colors[5]));
d6a28714 2324 if (PL_reg_eval_set)
4f639d21 2325 restore_pos(aTHX_ prog);
e9105d30 2326 if (swap) {
c74340f9 2327 /* we failed :-( roll it back */
e9105d30
GG
2328 Safefree(prog->offs);
2329 prog->offs = swap;
2330 }
2331
d6a28714
JH
2332 return 0;
2333}
2334
6bda09f9 2335
d6a28714
JH
2336/*
2337 - regtry - try match at specific point
2338 */
2339STATIC I32 /* 0 failure, 1 success */
24b23f37 2340S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2341{
97aff369 2342 dVAR;
d6a28714 2343 CHECKPOINT lastcp;
288b8c02
NC
2344 REGEXP *const rx = reginfo->prog;
2345 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2346 RXi_GET_DECL(prog,progi);
a3621e74 2347 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2348
2349 PERL_ARGS_ASSERT_REGTRY;
2350
24b23f37 2351 reginfo->cutpoint=NULL;
d6a28714 2352
bbe252da 2353 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2354 MAGIC *mg;
2355
2356 PL_reg_eval_set = RS_init;
a3621e74 2357 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2358 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2359 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2360 ));
ea8d6ae1 2361 SAVESTACK_CXPOS();
d6a28714
JH
2362 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2363 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2364 SAVETMPS;
2365 /* Apparently this is not needed, judging by wantarray. */
e8347627 2366 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2367 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2368
3b0527fe 2369 if (reginfo->sv) {
d6a28714 2370 /* Make $_ available to executed code. */
3b0527fe 2371 if (reginfo->sv != DEFSV) {
59f00321 2372 SAVE_DEFSV;
414bf5ae 2373 DEFSV_set(reginfo->sv);
b8c5462f 2374 }
d6a28714 2375
3b0527fe
DM
2376 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2377 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2378 /* prepare for quick setting of pos */
d300d9fa 2379#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2380 if (SvIsCOW(reginfo->sv))
2381 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2382#endif
3dab1dad 2383 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2384 &PL_vtbl_mglob, NULL, 0);
d6a28714 2385 mg->mg_len = -1;
b8c5462f 2386 }
d6a28714
JH
2387 PL_reg_magic = mg;
2388 PL_reg_oldpos = mg->mg_len;
4f639d21 2389 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2390 }
09687e5a 2391 if (!PL_reg_curpm) {
a02a5408 2392 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2393#ifdef USE_ITHREADS
2394 {
14a49a24 2395 SV* const repointer = &PL_sv_undef;
92313705
NC
2396 /* this regexp is also owned by the new PL_reg_curpm, which
2397 will try to free it. */
d2ece331 2398 av_push(PL_regex_padav, repointer);
09687e5a
AB
2399 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2400 PL_regex_pad = AvARRAY(PL_regex_padav);
2401 }
2402#endif
2403 }
86c29d75
NC
2404#ifdef USE_ITHREADS
2405 /* It seems that non-ithreads works both with and without this code.
2406 So for efficiency reasons it seems best not to have the code
2407 compiled when it is not needed. */
92313705
NC
2408 /* This is safe against NULLs: */
2409 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2410 /* PM_reg_curpm owns a reference to this regexp. */
2411 ReREFCNT_inc(rx);
86c29d75 2412#endif
288b8c02 2413 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2414 PL_reg_oldcurpm = PL_curpm;
2415 PL_curpm = PL_reg_curpm;
07bc277f 2416 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2417 /* Here is a serious problem: we cannot rewrite subbeg,
2418 since it may be needed if this match fails. Thus
2419 $` inside (?{}) could fail... */
2420 PL_reg_oldsaved = prog->subbeg;
2421 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2422#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2423 PL_nrs = prog->saved_copy;
2424#endif
07bc277f 2425 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2426 }
2427 else
bd61b366 2428 PL_reg_oldsaved = NULL;
d6a28714
JH
2429 prog->subbeg = PL_bostr;
2430 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2431 }
24b23f37 2432 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2433 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2434 PL_reginput = *startpos;
d6a28714 2435 PL_reglastparen = &prog->lastparen;
a01268b5 2436 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2437 prog->lastparen = 0;
03994de8 2438 prog->lastcloseparen = 0;
d6a28714 2439 PL_regsize = 0;
f0ab9afb 2440 PL_regoffs = prog->offs;
d6a28714
JH
2441 if (PL_reg_start_tmpl <= prog->nparens) {
2442 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2443 if(PL_reg_start_tmp)
2444 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2445 else
a02a5408 2446 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2447 }
2448
2449 /* XXXX What this code is doing here?!!! There should be no need
2450 to do this again and again, PL_reglastparen should take care of
3dd2943c 2451 this! --ilya*/
dafc8851
JH
2452
2453 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2454 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2455 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2456 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2457 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2458 * Meanwhile, this code *is* needed for the
daf18116
JH
2459 * above-mentioned test suite tests to succeed. The common theme
2460 * on those tests seems to be returning null fields from matches.
225593e1 2461 * --jhi updated by dapm */
dafc8851 2462#if 1
d6a28714 2463 if (prog->nparens) {
f0ab9afb 2464 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2465 register I32 i;
eb160463 2466 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2467 ++pp;
2468 pp->start = -1;
2469 pp->end = -1;
d6a28714
JH
2470 }
2471 }
dafc8851 2472#endif
02db2b7b 2473 REGCP_SET(lastcp);
f8fc2ecf 2474 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2475 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2476 return 1;
2477 }
24b23f37
YO
2478 if (reginfo->cutpoint)
2479 *startpos= reginfo->cutpoint;
02db2b7b 2480 REGCP_UNWIND(lastcp);
d6a28714
JH
2481 return 0;
2482}
2483
02db2b7b 2484
8ba1375e
MJD
2485#define sayYES goto yes
2486#define sayNO goto no
262b90c4 2487#define sayNO_SILENT goto no_silent
8ba1375e 2488
f9f4320a
YO
2489/* we dont use STMT_START/END here because it leads to
2490 "unreachable code" warnings, which are bogus, but distracting. */
2491#define CACHEsayNO \
c476f425
DM
2492 if (ST.cache_mask) \
2493 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2494 sayNO
3298f257 2495
a3621e74 2496/* this is used to determine how far from the left messages like
265c4333
YO
2497 'failed...' are printed. It should be set such that messages
2498 are inline with the regop output that created them.
a3621e74 2499*/
265c4333 2500#define REPORT_CODE_OFF 32
a3621e74
YO
2501
2502
2503/* Make sure there is a test for this +1 options in re_tests */
2504#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2505
40a82448
DM
2506#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2507#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2508
86545054
DM
2509#define SLAB_FIRST(s) (&(s)->states[0])
2510#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2511
5d9a96ca
DM
2512/* grab a new slab and return the first slot in it */
2513
2514STATIC regmatch_state *
2515S_push_slab(pTHX)
2516{
a35a87e7 2517#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2518 dMY_CXT;
2519#endif
5d9a96ca
DM
2520 regmatch_slab *s = PL_regmatch_slab->next;
2521 if (!s) {
2522 Newx(s, 1, regmatch_slab);
2523 s->prev = PL_regmatch_slab;
2524 s->next = NULL;
2525 PL_regmatch_slab->next = s;
2526 }
2527 PL_regmatch_slab = s;
86545054 2528 return SLAB_FIRST(s);
5d9a96ca 2529}
5b47454d 2530
95b24440 2531
40a82448
DM
2532/* push a new state then goto it */
2533
2534#define PUSH_STATE_GOTO(state, node) \
2535 scan = node; \
2536 st->resume_state = state; \
2537 goto push_state;
2538
2539/* push a new state with success backtracking, then goto it */
2540
2541#define PUSH_YES_STATE_GOTO(state, node) \
2542 scan = node; \
2543 st->resume_state = state; \
2544 goto push_yes_state;
2545
aa283a38 2546
aa283a38 2547
d6a28714 2548/*
95b24440 2549
bf1f174e
DM
2550regmatch() - main matching routine
2551
2552This is basically one big switch statement in a loop. We execute an op,
2553set 'next' to point the next op, and continue. If we come to a point which
2554we may need to backtrack to on failure such as (A|B|C), we push a
2555backtrack state onto the backtrack stack. On failure, we pop the top
2556state, and re-enter the loop at the state indicated. If there are no more
2557states to pop, we return failure.
2558
2559Sometimes we also need to backtrack on success; for example /A+/, where
2560after successfully matching one A, we need to go back and try to
2561match another one; similarly for lookahead assertions: if the assertion
2562completes successfully, we backtrack to the state just before the assertion
2563and then carry on. In these cases, the pushed state is marked as
2564'backtrack on success too'. This marking is in fact done by a chain of
2565pointers, each pointing to the previous 'yes' state. On success, we pop to
2566the nearest yes state, discarding any intermediate failure-only states.
2567Sometimes a yes state is pushed just to force some cleanup code to be
2568called at the end of a successful match or submatch; e.g. (??{$re}) uses
2569it to free the inner regex.
2570
2571Note that failure backtracking rewinds the cursor position, while
2572success backtracking leaves it alone.
2573
2574A pattern is complete when the END op is executed, while a subpattern
2575such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2576ops trigger the "pop to last yes state if any, otherwise return true"
2577behaviour.
2578
2579A common convention in this function is to use A and B to refer to the two
2580subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2581the subpattern to be matched possibly multiple times, while B is the entire
2582rest of the pattern. Variable and state names reflect this convention.
2583
2584The states in the main switch are the union of ops and failure/success of
2585substates associated with with that op. For example, IFMATCH is the op
2586that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2587'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2588successfully matched A and IFMATCH_A_fail is a state saying that we have
2589just failed to match A. Resume states always come in pairs. The backtrack
2590state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2591at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2592on success or failure.
2593
2594The struct that holds a backtracking state is actually a big union, with
2595one variant for each major type of op. The variable st points to the
2596top-most backtrack struct. To make the code clearer, within each
2597block of code we #define ST to alias the relevant union.
2598
2599Here's a concrete example of a (vastly oversimplified) IFMATCH
2600implementation:
2601
2602 switch (state) {
2603 ....
2604
2605#define ST st->u.ifmatch
2606
2607 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2608 ST.foo = ...; // some state we wish to save
95b24440 2609 ...
bf1f174e
DM
2610 // push a yes backtrack state with a resume value of
2611 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2612 // first node of A:
2613 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2614 // NOTREACHED
2615
2616 case IFMATCH_A: // we have successfully executed A; now continue with B
2617 next = B;
2618 bar = ST.foo; // do something with the preserved value
2619 break;
2620
2621 case IFMATCH_A_fail: // A failed, so the assertion failed
2622 ...; // do some housekeeping, then ...
2623 sayNO; // propagate the failure
2624
2625#undef ST
95b24440 2626
bf1f174e
DM
2627 ...
2628 }
95b24440 2629
bf1f174e
DM
2630For any old-timers reading this who are familiar with the old recursive
2631approach, the code above is equivalent to:
95b24440 2632
bf1f174e
DM
2633 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2634 {
2635 int foo = ...
95b24440 2636 ...
bf1f174e
DM
2637 if (regmatch(A)) {
2638 next = B;
2639 bar = foo;
2640 break;
95b24440 2641 }
bf1f174e
DM
2642 ...; // do some housekeeping, then ...
2643 sayNO; // propagate the failure
95b24440 2644 }
bf1f174e
DM
2645
2646The topmost backtrack state, pointed to by st, is usually free. If you
2647want to claim it, populate any ST.foo fields in it with values you wish to
2648save, then do one of
2649
2650 PUSH_STATE_GOTO(resume_state, node);
2651 PUSH_YES_STATE_GOTO(resume_state, node);
2652
2653which sets that backtrack state's resume value to 'resume_state', pushes a
2654new free entry to the top of the backtrack stack, then goes to 'node'.
2655On backtracking, the free slot is popped, and the saved state becomes the
2656new free state. An ST.foo field in this new top state can be temporarily
2657accessed to retrieve values, but once the main loop is re-entered, it
2658becomes available for reuse.
2659
2660Note that the depth of the backtrack stack constantly increases during the
2661left-to-right execution of the pattern, rather than going up and down with
2662the pattern nesting. For example the stack is at its maximum at Z at the
2663end of the pattern, rather than at X in the following:
2664
2665 /(((X)+)+)+....(Y)+....Z/
2666
2667The only exceptions to this are lookahead/behind assertions and the cut,
2668(?>A), which pop all the backtrack states associated with A before
2669continuing.
2670
2671Bascktrack state structs are allocated in slabs of about 4K in size.
2672PL_regmatch_state and st always point to the currently active state,
2673and PL_regmatch_slab points to the slab currently containing
2674PL_regmatch_state. The first time regmatch() is called, the first slab is
2675allocated, and is never freed until interpreter destruction. When the slab
2676is full, a new one is allocated and chained to the end. At exit from
2677regmatch(), slabs allocated since entry are freed.
2678
2679*/
95b24440 2680
40a82448 2681
5bc10b2c 2682#define DEBUG_STATE_pp(pp) \
265c4333 2683 DEBUG_STATE_r({ \
5bc10b2c
DM
2684 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2685 PerlIO_printf(Perl_debug_log, \
5d458dd8 2686 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2687 depth*2, "", \
13d6edb4 2688 PL_reg_name[st->resume_state], \
5d458dd8
YO
2689 ((st==yes_state||st==mark_state) ? "[" : ""), \
2690 ((st==yes_state) ? "Y" : ""), \
2691 ((st==mark_state) ? "M" : ""), \
2692 ((st==yes_state||st==mark_state) ? "]" : "") \
2693 ); \
265c4333 2694 });
5bc10b2c 2695
40a82448 2696
3dab1dad 2697#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2698
3df15adc 2699#ifdef DEBUGGING
5bc10b2c 2700
ab3bbdeb 2701STATIC void
efd26800 2702S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2703 const char *start, const char *end, const char *blurb)
2704{
efd26800 2705 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2706
2707 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2708
ab3bbdeb
YO
2709 if (!PL_colorset)
2710 reginitcolors();
2711 {
2712 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2713 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2714
2715 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2716 start, end - start, 60);
2717
2718 PerlIO_printf(Perl_debug_log,
2719 "%s%s REx%s %s against %s\n",
2720 PL_colors[4], blurb, PL_colors[5], s0, s1);
2721
2722 if (do_utf8||utf8_pat)
1de06328
YO
2723 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2724 utf8_pat ? "pattern" : "",
2725 utf8_pat && do_utf8 ? " and " : "",
2726 do_utf8 ? "string" : ""
ab3bbdeb
YO
2727 );
2728 }
2729}
3df15adc
YO
2730
2731STATIC void
786e8c11
YO
2732S_dump_exec_pos(pTHX_ const char *locinput,
2733 const regnode *scan,
2734 const char *loc_regeol,
2735 const char *loc_bostr,
2736 const char *loc_reg_starttry,
2737 const bool do_utf8)
07be1b83 2738{
786e8c11 2739 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2740 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2741 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2742 /* The part of the string before starttry has one color
2743 (pref0_len chars), between starttry and current
2744 position another one (pref_len - pref0_len chars),
2745 after the current position the third one.
2746 We assume that pref0_len <= pref_len, otherwise we
2747 decrease pref0_len. */
786e8c11
YO
2748 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2749 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2750 int pref0_len;
2751
7918f24d
NC
2752 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2753
07be1b83
YO
2754 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2755 pref_len++;
786e8c11
YO
2756 pref0_len = pref_len - (locinput - loc_reg_starttry);
2757 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2758 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2759 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2760 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2761 l--;
2762 if (pref0_len < 0)
2763 pref0_len = 0;
2764 if (pref0_len > pref_len)
2765 pref0_len = pref_len;
2766 {
3df15adc 2767 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2768
ab3bbdeb 2769 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2770 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2771
ab3bbdeb 2772 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2773 (locinput - pref_len + pref0_len),
1de06328 2774 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2775
ab3bbdeb 2776 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2777 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2778
1de06328 2779 const STRLEN tlen=len0+len1+len2;
3df15adc 2780 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2781 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2782 (IV)(locinput - loc_bostr),
07be1b83 2783 len0, s0,
07be1b83 2784 len1, s1,
07be1b83 2785 (docolor ? "" : "> <"),
07be1b83 2786 len2, s2,
f9f4320a 2787 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2788 "");
2789 }
2790}
3df15adc 2791
07be1b83
YO
2792#endif
2793
0a4db386
YO
2794/* reg_check_named_buff_matched()
2795 * Checks to see if a named buffer has matched. The data array of
2796 * buffer numbers corresponding to the buffer is expected to reside
2797 * in the regexp->data->data array in the slot stored in the ARG() of
2798 * node involved. Note that this routine doesn't actually care about the
2799 * name, that information is not preserved from compilation to execution.
2800 * Returns the index of the leftmost defined buffer with the given name
2801 * or 0 if non of the buffers matched.
2802 */
2803STATIC I32
7918f24d
NC
2804S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2805{
0a4db386 2806 I32 n;
f8fc2ecf 2807 RXi_GET_DECL(rex,rexi);
ad64d0ec 2808 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2809 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2810
2811 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2812
0a4db386
YO
2813 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2814 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2815 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2816 {
2817 return nums[n];
2818 }
2819 }
2820 return 0;
2821}
2822
2f554ef7
DM
2823
2824/* free all slabs above current one - called during LEAVE_SCOPE */
2825
2826STATIC void
2827S_clear_backtrack_stack(pTHX_ void *p)
2828{
2829 regmatch_slab *s = PL_regmatch_slab->next;
2830 PERL_UNUSED_ARG(p);
2831
2832 if (!s)
2833 return;
2834 PL_regmatch_slab->next = NULL;
2835 while (s) {
2836 regmatch_slab * const osl = s;
2837 s = s->next;
2838 Safefree(osl);
2839 }
2840}
2841
2842
28d8d7f4
YO
2843#define SETREX(Re1,Re2) \
2844 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2845 Re1 = (Re2)
2846
d6a28714 2847STATIC I32 /* 0 failure, 1 success */
24b23f37 2848S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2849{
a35a87e7 2850#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2851 dMY_CXT;
2852#endif
27da23d5 2853 dVAR;
95b24440 2854 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2855 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2856 REGEXP *rex_sv = reginfo->prog;
2857 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2858 RXi_GET_DECL(rex,rexi);
2f554ef7 2859 I32 oldsave;
5d9a96ca
DM
2860 /* the current state. This is a cached copy of PL_regmatch_state */
2861 register regmatch_state *st;
5d9a96ca
DM
2862 /* cache heavy used fields of st in registers */
2863 register regnode *scan;
2864 register regnode *next;
438e9bae 2865 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2866 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2867 register char *locinput = PL_reginput;
5d9a96ca 2868 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2869
b69b0499 2870 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2871 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2872 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2873 const U32 max_nochange_depth =
2874 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2875 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2876 regmatch_state *yes_state = NULL; /* state to pop to on success of
2877 subpattern */
e2e6a0f1
YO
2878 /* mark_state piggy backs on the yes_state logic so that when we unwind
2879 the stack on success we can update the mark_state as we go */
2880 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2881 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2882 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2883 U32 state_num;
5d458dd8
YO
2884 bool no_final = 0; /* prevent failure from backtracking? */
2885 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2886 char *startpoint = PL_reginput;
5d458dd8
YO
2887 SV *popmark = NULL; /* are we looking for a mark? */
2888 SV *sv_commit = NULL; /* last mark name seen in failure */
2889 SV *sv_yes_mark = NULL; /* last mark name we have seen
2890 during a successfull match */
2891 U32 lastopen = 0; /* last open we saw */
2892 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2893 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2894 /* these three flags are set by various ops to signal information to
2895 * the very next op. They have a useful lifetime of exactly one loop
2896 * iteration, and are not preserved or restored by state pushes/pops
2897 */
2898 bool sw = 0; /* the condition value in (?(cond)a|b) */
2899 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2900 int logical = 0; /* the following EVAL is:
2901 0: (?{...})
2902 1: (?(?{...})X|Y)
2903 2: (??{...})
2904 or the following IFMATCH/UNLESSM is:
2905 false: plain (?=foo)
2906 true: used as a condition: (?(?=foo))
2907 */
95b24440 2908#ifdef DEBUGGING
e68ec53f 2909 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2910#endif
2911
7918f24d
NC
2912 PERL_ARGS_ASSERT_REGMATCH;
2913
3b57cd43 2914 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2915 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2916 }));
5d9a96ca
DM
2917 /* on first ever call to regmatch, allocate first slab */
2918 if (!PL_regmatch_slab) {
2919 Newx(PL_regmatch_slab, 1, regmatch_slab);
2920 PL_regmatch_slab->prev = NULL;
2921 PL_regmatch_slab->next = NULL;
86545054 2922 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2923 }
2924
2f554ef7
DM
2925 oldsave = PL_savestack_ix;
2926 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2927 SAVEVPTR(PL_regmatch_slab);
2928 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2929
2930 /* grab next free state slot */
2931 st = ++PL_regmatch_state;
86545054 2932 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2933 st = PL_regmatch_state = S_push_slab(aTHX);
2934
d6a28714
JH
2935 /* Note that nextchr is a byte even in UTF */
2936 nextchr = UCHARAT(locinput);
2937 scan = prog;
2938 while (scan != NULL) {
8ba1375e 2939
a3621e74 2940 DEBUG_EXECUTE_r( {
6136c704 2941 SV * const prop = sv_newmortal();
1de06328 2942 regnode *rnext=regnext(scan);
786e8c11 2943 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2944 regprop(rex, prop, scan);
07be1b83
YO
2945
2946 PerlIO_printf(Perl_debug_log,
2947 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2948 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2949 SvPVX_const(prop),
1de06328 2950 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2951 0 : (IV)(rnext - rexi->program));
2a782b5b 2952 });
d6a28714
JH
2953
2954 next = scan + NEXT_OFF(scan);
2955 if (next == scan)
2956 next = NULL;
40a82448 2957 state_num = OP(scan);
d6a28714 2958
40a82448 2959 reenter_switch:
34a81e2b
B
2960
2961 assert(PL_reglastparen == &rex->lastparen);
2962 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2963 assert(PL_regoffs == rex->offs);
2964
40a82448 2965 switch (state_num) {
d6a28714 2966 case BOL:
7fba1cd6 2967 if (locinput == PL_bostr)
d6a28714 2968 {
3b0527fe 2969 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2970 break;
2971 }
d6a28714
JH
2972 sayNO;
2973 case MBOL:
12d33761
HS
2974 if (locinput == PL_bostr ||
2975 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2976 {
b8c5462f
JH
2977 break;
2978 }
d6a28714
JH
2979 sayNO;
2980 case SBOL:
c2a73568 2981 if (locinput == PL_bostr)
b8c5462f 2982 break;
d6a28714
JH
2983 sayNO;
2984 case GPOS:
3b0527fe 2985 if (locinput == reginfo->ganch)
d6a28714
JH
2986 break;
2987 sayNO;
ee9b8eae
YO
2988
2989 case KEEPS:
2990 /* update the startpoint */
f0ab9afb 2991 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2992 PL_reginput = locinput;
f0ab9afb 2993 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2994 PUSH_STATE_GOTO(KEEPS_next, next);
2995 /*NOT-REACHED*/
2996 case KEEPS_next_fail:
2997 /* rollback the start point change */
f0ab9afb 2998 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2999 sayNO_SILENT;
3000 /*NOT-REACHED*/
d6a28714 3001 case EOL:
d6a28714
JH
3002 goto seol;
3003 case MEOL:
d6a28714 3004 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3005 sayNO;
b8c5462f 3006 break;
d6a28714
JH
3007 case SEOL:
3008 seol:
3009 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3010 sayNO;
d6a28714 3011 if (PL_regeol - locinput > 1)
b8c5462f 3012 sayNO;
b8c5462f 3013 break;
d6a28714
JH
3014 case EOS:
3015 if (PL_regeol != locinput)
b8c5462f 3016 sayNO;
d6a28714 3017 break;
ffc61ed2 3018 case SANY:
d6a28714 3019 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3020 sayNO;
f33976b4
DB
3021 if (do_utf8) {
3022 locinput += PL_utf8skip[nextchr];
3023 if (locinput > PL_regeol)
3024 sayNO;
3025 nextchr = UCHARAT(locinput);
3026 }
3027 else
3028 nextchr = UCHARAT(++locinput);
3029 break;
3030 case CANY:
3031 if (!nextchr && locinput >= PL_regeol)
3032 sayNO;
b8c5462f 3033 nextchr = UCHARAT(++locinput);
a0d0e21e 3034 break;
ffc61ed2 3035 case REG_ANY:
1aa99e6b
IH
3036 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3037 sayNO;
3038 if (do_utf8) {
b8c5462f 3039 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3040 if (locinput > PL_regeol)
3041 sayNO;
a0ed51b3 3042 nextchr = UCHARAT(locinput);
a0ed51b3 3043 }
1aa99e6b
IH
3044 else
3045 nextchr = UCHARAT(++locinput);
a0ed51b3 3046 break;
166ba7cd
DM
3047
3048#undef ST
3049#define ST st->u.trie
786e8c11
YO
3050 case TRIEC:
3051 /* In this case the charclass data is available inline so
3052 we can fail fast without a lot of extra overhead.
3053 */
3054 if (scan->flags == EXACT || !do_utf8) {
3055 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3056 DEBUG_EXECUTE_r(
3057 PerlIO_printf(Perl_debug_log,
3058 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3059 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3060 );
3061 sayNO_SILENT;
3062 /* NOTREACHED */
3063 }
3064 }
3065 /* FALL THROUGH */
5b47454d 3066 case TRIE:
3dab1dad 3067 {
07be1b83 3068 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3069 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3070
3071 /* what trie are we using right now */
be8e71aa 3072 reg_trie_data * const trie
f8fc2ecf 3073 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3074 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3075 U32 state = trie->startstate;
166ba7cd 3076
3dab1dad
YO
3077 if (trie->bitmap && trie_type != trie_utf8_fold &&
3078 !TRIE_BITMAP_TEST(trie,*locinput)
3079 ) {
3080 if (trie->states[ state ].wordnum) {
3081 DEBUG_EXECUTE_r(
3082 PerlIO_printf(Perl_debug_log,
3083 "%*s %smatched empty string...%s\n",
5bc10b2c 3084 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3085 );
3086 break;
3087 } else {
3088 DEBUG_EXECUTE_r(
3089 PerlIO_printf(Perl_debug_log,
786e8c11 3090 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3091 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3092 );
3093 sayNO_SILENT;
3094 }
3095 }
166ba7cd 3096
786e8c11
YO
3097 {
3098 U8 *uc = ( U8* )locinput;
3099
3100 STRLEN len = 0;
3101 STRLEN foldlen = 0;
3102 U8 *uscan = (U8*)NULL;
3103 STRLEN bufflen=0;
3104 SV *sv_accept_buff = NULL;
3105 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3106
3107 ST.accepted = 0; /* how many accepting states we have seen */
3108 ST.B = next;
3109 ST.jump = trie->jump;
786e8c11 3110 ST.me = scan;
07be1b83
YO
3111 /*
3112 traverse the TRIE keeping track of all accepting states
3113 we transition through until we get to a failing node.
3114 */
3115
a3621e74 3116 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3117 U32 base = trie->states[ state ].trans.base;
f9f4320a 3118 UV uvc = 0;
786e8c11
YO
3119 U16 charid;
3120 /* We use charid to hold the wordnum as we don't use it
3121 for charid until after we have done the wordnum logic.
3122 We define an alias just so that the wordnum logic reads
3123 more naturally. */
3124
3125#define got_wordnum charid
3126 got_wordnum = trie->states[ state ].wordnum;
3127
3128 if ( got_wordnum ) {
3129 if ( ! ST.accepted ) {
5b47454d 3130 ENTER;
6962fb1a 3131 SAVETMPS; /* XXX is this necessary? dmq */
5b47454d
DM
3132 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3133 sv_accept_buff=newSV(bufflen *
3134 sizeof(reg_trie_accepted) - 1);
786e8c11 3135 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3136 SvPOK_on(sv_accept_buff);
3137 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3138 SAVETMPS;
3139 ST.accept_buff =
5b47454d
DM
3140 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3141 }
786e8c11 3142 do {
166ba7cd 3143 if (ST.accepted >= bufflen) {
5b47454d 3144 bufflen *= 2;
166ba7cd 3145 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3146 SvGROW(sv_accept_buff,
3147 bufflen * sizeof(reg_trie_accepted));
3148 }
3149 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3150 + sizeof(reg_trie_accepted));
a3621e74 3151
786e8c11
YO
3152
3153 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3154 ST.accept_buff[ST.accepted].endpos = uc;
3155 ++ST.accepted;
3156 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3157 }
3158#undef got_wordnum
a3621e74 3159
07be1b83 3160 DEBUG_TRIE_EXECUTE_r({
786e8c11 3161 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3162 PerlIO_printf( Perl_debug_log,
786e8c11 3163 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3164 2+depth * 2, "", PL_colors[4],
786e8c11 3165 (UV)state, (UV)ST.accepted );
07be1b83 3166 });
a3621e74
YO
3167
3168 if ( base ) {
55eed653
NC
3169 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3170 uscan, len, uvc, charid, foldlen,
3171 foldbuf, uniflags);
a3621e74 3172
5b47454d
DM
3173 if (charid &&
3174 (base + charid > trie->uniquecharcount )
3175 && (base + charid - 1 - trie->uniquecharcount
3176 < trie->lasttrans)
3177 && trie->trans[base + charid - 1 -
3178 trie->uniquecharcount].check == state)
3179 {
3180 state = trie->trans[base + charid - 1 -
3181 trie->uniquecharcount ].next;
3182 }
3183 else {
3184 state = 0;
3185 }
3186 uc += len;
3187
3188 }
3189 else {
a3621e74
YO
3190 state = 0;
3191 }
3192 DEBUG_TRIE_EXECUTE_r(
e4584336 3193 PerlIO_printf( Perl_debug_log,
786e8c11 3194 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3195 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3196 );
3197 }
166ba7cd 3198 if (!ST.accepted )
a3621e74 3199 sayNO;
a3621e74 3200
166ba7cd
DM
3201 DEBUG_EXECUTE_r(
3202 PerlIO_printf( Perl_debug_log,
3203 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3204 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3205 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3206 );
786e8c11 3207 }}
fae667d5
YO
3208 goto trie_first_try; /* jump into the fail handler */
3209 /* NOTREACHED */
166ba7cd 3210 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3211 if ( ST.jump) {
3212 REGCP_UNWIND(ST.cp);
3213 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3214 PL_regoffs[n].end = -1;
fae667d5
YO
3215 *PL_reglastparen = n;
3216 }
3217 trie_first_try:
5d458dd8
YO
3218 if (do_cutgroup) {
3219 do_cutgroup = 0;
3220 no_final = 0;
3221 }
fae667d5
YO
3222
3223 if ( ST.jump) {
3224 ST.lastparen = *PL_reglastparen;
3225 REGCP_SET(ST.cp);
3226 }
166ba7cd
DM
3227 if ( ST.accepted == 1 ) {
3228 /* only one choice left - just continue */
3229 DEBUG_EXECUTE_r({
2b8b4781 3230 AV *const trie_words
502c6561 3231 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3232 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3233 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3234 SV *sv= tmp ? sv_newmortal() : NULL;
3235
166ba7cd
DM
3236 PerlIO_printf( Perl_debug_log,
3237 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3238 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3239 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3240 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3241 PL_colors[0], PL_colors[1],
3242 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3243 )
3244 : "not compiled under -Dr",
166ba7cd
DM
3245 PL_colors[5] );
3246 });
3247 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3248 /* in this case we free tmps/leave before we call regmatch
3249 as we wont be using accept_buff again. */
5d458dd8 3250
166ba7cd
DM
3251 locinput = PL_reginput;
3252 nextchr = UCHARAT(locinput);
5d458dd8
YO
3253 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3254 scan = ST.B;
3255 else
3256 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3257 if (!has_cutgroup) {
3258 FREETMPS;
3259 LEAVE;
3260 } else {
3261 ST.accepted--;
3262 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3263 }
786e8c11 3264
166ba7cd
DM
3265 continue; /* execute rest of RE */
3266 }
fae667d5
YO
3267
3268 if ( !ST.accepted-- ) {
5d458dd8
YO
3269 DEBUG_EXECUTE_r({
3270 PerlIO_printf( Perl_debug_log,
3271 "%*s %sTRIE failed...%s\n",
3272 REPORT_CODE_OFF+depth*2, "",
3273 PL_colors[4],
3274 PL_colors[5] );
3275 });
166ba7cd
DM
3276 FREETMPS;
3277 LEAVE;
5d458dd8 3278 sayNO_SILENT;
fae667d5
YO
3279 /*NOTREACHED*/
3280 }
166ba7cd 3281
a3621e74 3282 /*
166ba7cd
DM
3283 There are at least two accepting states left. Presumably
3284 the number of accepting states is going to be low,
3285 typically two. So we simply scan through to find the one
3286 with lowest wordnum. Once we find it, we swap the last
3287 state into its place and decrement the size. We then try to
3288 match the rest of the pattern at the point where the word
3289 ends. If we succeed, control just continues along the
3290 regex; if we fail we return here to try the next accepting
3291 state
3292 */
a3621e74 3293
166ba7cd
DM
3294 {
3295 U32 best = 0;
3296 U32 cur;
3297 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3298 DEBUG_TRIE_EXECUTE_r(
f2278c82 3299 PerlIO_printf( Perl_debug_log,
166ba7cd 3300 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3301 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3302 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3303 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3304 );
3305
3306 if (ST.accept_buff[cur].wordnum <
3307 ST.accept_buff[best].wordnum)
3308 best = cur;
a3621e74 3309 }
166ba7cd
DM
3310
3311 DEBUG_EXECUTE_r({
2b8b4781 3312 AV *const trie_words
502c6561 3313 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3314 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3315 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3316 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3317 ST.B :
7f69552c 3318 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3319 SV *sv= tmp ? sv_newmortal() : NULL;
3320
265c4333
YO
3321 PerlIO_printf( Perl_debug_log,
3322 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3323 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3324 ST.accept_buff[best].wordnum,
de734bd5
A
3325 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3326 PL_colors[0], PL_colors[1],
3327 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3328 ) : "not compiled under -Dr",
265c4333 3329 REG_NODE_NUM(nextop),
166ba7cd
DM
3330 PL_colors[5] );
3331 });
3332
3333 if ( best<ST.accepted ) {
3334 reg_trie_accepted tmp = ST.accept_buff[ best ];
3335 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3336 ST.accept_buff[ ST.accepted ] = tmp;
3337 best = ST.accepted;
a3621e74 3338 }
166ba7cd 3339 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3340 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3341 scan = ST.B;
786e8c11 3342 } else {
5d458dd8 3343 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3344 }
6b173516 3345 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3346 /* NOTREACHED */
166ba7cd 3347 }
166ba7cd 3348 /* NOTREACHED */
5d458dd8 3349 case TRIE_next:
6962fb1a
YO
3350 /* we dont want to throw this away, see bug 57042*/
3351 if (oreplsv != GvSV(PL_replgv))
3352 sv_setsv(oreplsv, GvSV(PL_replgv));
5d458dd8
YO
3353 FREETMPS;
3354 LEAVE;
3355 sayYES;
166ba7cd
DM
3356#undef ST
3357
95b24440
DM
3358 case EXACT: {
3359 char *s = STRING(scan);
24d3c4a9 3360 ln = STR_LEN(scan);
eb160463 3361 if (do_utf8 != UTF) {
bc517b45 3362 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3363 char *l = locinput;
24d3c4a9 3364 const char * const e = s + ln;
a72c7584 3365
5ff6fc6d
JH
3366 if (do_utf8) {
3367 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3368 while (s < e) {
a3b680e6 3369 STRLEN ulen;
1aa99e6b 3370 if (l >= PL_regeol)
5ff6fc6d
JH
3371 sayNO;
3372 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3373 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3374 uniflags))
5ff6fc6d 3375 sayNO;
bc517b45 3376 l += ulen;
5ff6fc6d 3377 s ++;
1aa99e6b 3378 }
5ff6fc6d
JH
3379 }
3380 else {
3381 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3382 while (s < e) {
a3b680e6 3383 STRLEN ulen;
1aa99e6b
IH
3384 if (l >= PL_regeol)
3385 sayNO;
5ff6fc6d 3386 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3387 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3388 uniflags))