This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add more internal files and directories to no_index in META.yml
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
c277df42
IZ
83#define RF_tainted 1 /* tainted information used? */
84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
eb160463 88#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
32fc9b6a 97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 98
c277df42
IZ
99/*
100 * Forwards.
101 */
102
33b8afdf 103#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 104#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 105
3dab1dad
YO
106#define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
52657f30 108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
109 : (U8*)(pos + off))
110#define HOPBACKc(pos, off) \
07be1b83
YO
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
8e11feef 114 ? (U8*)pos - off \
3dab1dad 115 : NULL)
efb30f32 116
e7409c1b 117#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 118#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 119
20d0b1e9 120/* these are unrolled below in the CCC_TRY_XXX defined */
1a4fad37
AL
121#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
37e2e78e
KW
123
124/* Doesn't do an assert to verify that is correct */
125#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
127
1a4fad37
AL
128#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
129#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
130#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 131
37e2e78e 132#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
133 LOAD_UTF8_CHARCLASS(X_begin, " "); \
134 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
135 /* These are utf8 constants, and not utf-ebcdic constants, so the \
136 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 137 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
138 \
139 /* No asserts are done for these, in case called on an early \
140 * Unicode version in which they map to nothing */ \
d275fa5e
CB
141 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 148
d1eb3177
YO
149/*
150 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
151 so that it is possible to override the option here without having to
152 rebuild the entire core. as we are required to do if we change regcomp.h
153 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
154*/
155#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
156#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
157#endif
158
159#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
160#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
161#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
162#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
163#define RE_utf8_perl_word PL_utf8_alnum
164#define RE_utf8_perl_space PL_utf8_space
165#define RE_utf8_posix_digit PL_utf8_digit
166#define perl_word alnum
167#define perl_space space
168#define posix_digit digit
169#else
170#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
171#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
172#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
173#define RE_utf8_perl_word PL_utf8_perl_word
174#define RE_utf8_perl_space PL_utf8_perl_space
175#define RE_utf8_posix_digit PL_utf8_posix_digit
176#endif
177
178
20d0b1e9
YO
179#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
180 case NAMEL: \
181 PL_reg_flags |= RF_tainted; \
182 /* FALL THROUGH */ \
183 case NAME: \
184 if (!nextchr) \
185 sayNO; \
186 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
187 if (!CAT2(PL_utf8_,CLASS)) { \
188 bool ok; \
189 ENTER; \
190 save_re_context(); \
191 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
192 assert(ok); \
193 LEAVE; \
194 } \
195 if (!(OP(scan) == NAME \
196 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
197 : LCFUNC_utf8((U8*)locinput))) \
198 { \
199 sayNO; \
200 } \
201 locinput += PL_utf8skip[nextchr]; \
202 nextchr = UCHARAT(locinput); \
203 break; \
204 } \
205 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
206 sayNO; \
207 nextchr = UCHARAT(++locinput); \
208 break
209
210#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
211 case NAMEL: \
212 PL_reg_flags |= RF_tainted; \
213 /* FALL THROUGH */ \
214 case NAME : \
215 if (!nextchr && locinput >= PL_regeol) \
216 sayNO; \
217 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
218 if (!CAT2(PL_utf8_,CLASS)) { \
219 bool ok; \
220 ENTER; \
221 save_re_context(); \
222 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
223 assert(ok); \
224 LEAVE; \
225 } \
226 if ((OP(scan) == NAME \
227 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
228 : LCFUNC_utf8((U8*)locinput))) \
229 { \
230 sayNO; \
231 } \
232 locinput += PL_utf8skip[nextchr]; \
233 nextchr = UCHARAT(locinput); \
234 break; \
235 } \
236 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
237 sayNO; \
238 nextchr = UCHARAT(++locinput); \
239 break
240
241
d1eb3177
YO
242
243
244
3dab1dad
YO
245/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
246
5f80c4cf 247/* for use after a quantifier and before an EXACT-like node -- japhy */
3e901dc0
YO
248/* it would be nice to rework regcomp.sym to generate this stuff. sigh */
249#define JUMPABLE(rn) ( \
250 OP(rn) == OPEN || \
251 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
252 OP(rn) == EVAL || \
cca55fe3
JP
253 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
254 OP(rn) == PLUS || OP(rn) == MINMOD || \
ee9b8eae 255 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
3dab1dad 256 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 257)
ee9b8eae 258#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 259
ee9b8eae
YO
260#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
261
262#if 0
263/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
264 we don't need this definition. */
265#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
266#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
267#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
268
269#else
270/* ... so we use this as its faster. */
271#define IS_TEXT(rn) ( OP(rn)==EXACT )
272#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
273#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
274
275#endif
e2d8ce26 276
a84d97b6
HS
277/*
278 Search for mandatory following text node; for lookahead, the text must
279 follow but for lookbehind (rn->flags != 0) we skip to the next step.
280*/
cca55fe3 281#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
282 while (JUMPABLE(rn)) { \
283 const OPCODE type = OP(rn); \
284 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 285 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 286 else if (type == PLUS) \
cca55fe3 287 rn = NEXTOPER(rn); \
3dab1dad 288 else if (type == IFMATCH) \
a84d97b6 289 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 290 else rn += NEXT_OFF(rn); \
3dab1dad 291 } \
5f80c4cf 292} STMT_END
74750237 293
c476f425 294
acfe0abc 295static void restore_pos(pTHX_ void *arg);
51371543 296
76e3520e 297STATIC CHECKPOINT
cea2e8a9 298S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 299{
97aff369 300 dVAR;
a3b680e6 301 const int retval = PL_savestack_ix;
b1ce53c5 302#define REGCP_PAREN_ELEMS 4
a3b680e6 303 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 304 int p;
40a82448 305 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 306
e49a9654
IH
307 if (paren_elems_to_push < 0)
308 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
309
f0ab9afb 310#define REGCP_OTHER_ELEMS 7
4b3c1a47 311 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
7f69552c 312
3280af22 313 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 314/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
315 SSPUSHINT(PL_regoffs[p].end);
316 SSPUSHINT(PL_regoffs[p].start);
3280af22 317 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 318 SSPUSHINT(p);
e7707071 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 320 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 321 (UV)p, (IV)PL_regoffs[p].start,
40a82448 322 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 323 (IV)PL_regoffs[p].end
40a82448 324 ));
a0d0e21e 325 }
b1ce53c5 326/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 327 SSPUSHPTR(PL_regoffs);
3280af22
NIS
328 SSPUSHINT(PL_regsize);
329 SSPUSHINT(*PL_reglastparen);
a01268b5 330 SSPUSHINT(*PL_reglastcloseparen);
3280af22 331 SSPUSHPTR(PL_reginput);
41123dfd
JH
332#define REGCP_FRAME_ELEMS 2
333/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
334 * are needed for the regexp context stack bookkeeping. */
335 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 336 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 337
a0d0e21e
LW
338 return retval;
339}
340
c277df42 341/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
342#define REGCP_SET(cp) \
343 DEBUG_STATE_r( \
ab3bbdeb 344 PerlIO_printf(Perl_debug_log, \
e4f74956 345 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
346 (IV)PL_savestack_ix)); \
347 cp = PL_savestack_ix
c3464db5 348
ab3bbdeb 349#define REGCP_UNWIND(cp) \
e4f74956 350 DEBUG_STATE_r( \
ab3bbdeb 351 if (cp != PL_savestack_ix) \
e4f74956
YO
352 PerlIO_printf(Perl_debug_log, \
353 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
354 (IV)(cp), (IV)PL_savestack_ix)); \
355 regcpblow(cp)
c277df42 356
76e3520e 357STATIC char *
097eb12c 358S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 359{
97aff369 360 dVAR;
3b6647e0 361 U32 i;
a0d0e21e 362 char *input;
a3621e74
YO
363 GET_RE_DEBUG_FLAGS_DECL;
364
7918f24d
NC
365 PERL_ARGS_ASSERT_REGCPPOP;
366
b1ce53c5 367 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 368 i = SSPOPINT;
b1ce53c5
JH
369 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
370 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 371 input = (char *) SSPOPPTR;
a01268b5 372 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
373 *PL_reglastparen = SSPOPINT;
374 PL_regsize = SSPOPINT;
f0ab9afb 375 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 376
6bda09f9 377
b1ce53c5 378 /* Now restore the parentheses context. */
41123dfd
JH
379 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
380 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 381 I32 tmps;
097eb12c 382 U32 paren = (U32)SSPOPINT;
3280af22 383 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 384 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 385 tmps = SSPOPINT;
3280af22 386 if (paren <= *PL_reglastparen)
f0ab9afb 387 PL_regoffs[paren].end = tmps;
e7707071 388 DEBUG_BUFFERS_r(
c3464db5 389 PerlIO_printf(Perl_debug_log,
b900a521 390 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 391 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 392 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 393 (IV)PL_regoffs[paren].end,
3280af22 394 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 395 );
a0d0e21e 396 }
e7707071 397 DEBUG_BUFFERS_r(
bb7a0f54 398 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 399 PerlIO_printf(Perl_debug_log,
faccc32b 400 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 401 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
402 }
403 );
daf18116 404#if 1
dafc8851
JH
405 /* It would seem that the similar code in regtry()
406 * already takes care of this, and in fact it is in
407 * a better location to since this code can #if 0-ed out
408 * but the code in regtry() is needed or otherwise tests
409 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
410 * (as of patchlevel 7877) will fail. Then again,
411 * this code seems to be necessary or otherwise
225593e1
DM
412 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
413 * --jhi updated by dapm */
3b6647e0 414 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 415 if (i > PL_regsize)
f0ab9afb
NC
416 PL_regoffs[i].start = -1;
417 PL_regoffs[i].end = -1;
a0d0e21e 418 }
dafc8851 419#endif
a0d0e21e
LW
420 return input;
421}
422
02db2b7b 423#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 424
a687059c 425/*
e50aee73 426 * pregexec and friends
a687059c
LW
427 */
428
76234dfb 429#ifndef PERL_IN_XSUB_RE
a687059c 430/*
c277df42 431 - pregexec - match a regexp against a string
a687059c 432 */
c277df42 433I32
49d7dfbc 434Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 435 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
436/* strend: pointer to null at end of string */
437/* strbeg: real beginning of string */
438/* minend: end of match must be >=minend after stringarg. */
439/* nosave: For optimizations. */
440{
7918f24d
NC
441 PERL_ARGS_ASSERT_PREGEXEC;
442
c277df42 443 return
9041c2e3 444 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
445 nosave ? 0 : REXEC_COPY_STR);
446}
76234dfb 447#endif
22e551b9 448
9041c2e3 449/*
cad2e5aa
JH
450 * Need to implement the following flags for reg_anch:
451 *
452 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
453 * USE_INTUIT_ML
454 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
455 * INTUIT_AUTORITATIVE_ML
456 * INTUIT_ONCE_NOML - Intuit can match in one location only.
457 * INTUIT_ONCE_ML
458 *
459 * Another flag for this function: SECOND_TIME (so that float substrs
460 * with giant delta may be not rechecked).
461 */
462
463/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
464
3f7c398e 465/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
466 Otherwise, only SvCUR(sv) is used to get strbeg. */
467
468/* XXXX We assume that strpos is strbeg unless sv. */
469
6eb5f6b9
JH
470/* XXXX Some places assume that there is a fixed substring.
471 An update may be needed if optimizer marks as "INTUITable"
472 RExen without fixed substrings. Similarly, it is assumed that
473 lengths of all the strings are no more than minlen, thus they
474 cannot come from lookahead.
40d049e4
YO
475 (Or minlen should take into account lookahead.)
476 NOTE: Some of this comment is not correct. minlen does now take account
477 of lookahead/behind. Further research is required. -- demerphq
478
479*/
6eb5f6b9 480
2c2d71f5
JH
481/* A failure to find a constant substring means that there is no need to make
482 an expensive call to REx engine, thus we celebrate a failure. Similarly,
483 finding a substring too deep into the string means that less calls to
30944b6d
IZ
484 regtry() should be needed.
485
486 REx compiler's optimizer found 4 possible hints:
487 a) Anchored substring;
488 b) Fixed substring;
489 c) Whether we are anchored (beginning-of-line or \G);
490 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 491 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
492 string which does not contradict any of them.
493 */
2c2d71f5 494
6eb5f6b9
JH
495/* Most of decisions we do here should have been done at compile time.
496 The nodes of the REx which we used for the search should have been
497 deleted from the finite automaton. */
498
cad2e5aa 499char *
288b8c02 500Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 501 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 502{
97aff369 503 dVAR;
288b8c02 504 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 505 register I32 start_shift = 0;
cad2e5aa 506 /* Should be nonnegative! */
b7953727 507 register I32 end_shift = 0;
2c2d71f5
JH
508 register char *s;
509 register SV *check;
a1933d95 510 char *strbeg;
cad2e5aa 511 char *t;
1de06328 512 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 513 I32 ml_anch;
bd61b366
SS
514 register char *other_last = NULL; /* other substr checked before this */
515 char *check_at = NULL; /* check substr found at this pos */
bbe252da 516 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 517 RXi_GET_DECL(prog,progi);
30944b6d 518#ifdef DEBUGGING
890ce7af 519 const char * const i_strpos = strpos;
30944b6d 520#endif
a3621e74
YO
521 GET_RE_DEBUG_FLAGS_DECL;
522
7918f24d
NC
523 PERL_ARGS_ASSERT_RE_INTUIT_START;
524
288b8c02 525 RX_MATCH_UTF8_set(rx,do_utf8);
cad2e5aa 526
3c8556c3 527 if (RX_UTF8(rx)) {
b8d68ded
JH
528 PL_reg_flags |= RF_utf8;
529 }
ab3bbdeb 530 DEBUG_EXECUTE_r(
efd26800 531 debug_start_match(rx, do_utf8, strpos, strend,
1de06328
YO
532 sv ? "Guessing start of match in sv for"
533 : "Guessing start of match in string for");
2a782b5b 534 );
cad2e5aa 535
c344f387
JH
536 /* CHR_DIST() would be more correct here but it makes things slow. */
537 if (prog->minlen > strend - strpos) {
a3621e74 538 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 539 "String too short... [re_intuit_start]\n"));
cad2e5aa 540 goto fail;
2c2d71f5 541 }
1de06328 542
a1933d95 543 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 544 PL_regeol = strend;
33b8afdf
JH
545 if (do_utf8) {
546 if (!prog->check_utf8 && prog->check_substr)
547 to_utf8_substr(prog);
548 check = prog->check_utf8;
549 } else {
550 if (!prog->check_substr && prog->check_utf8)
551 to_byte_substr(prog);
552 check = prog->check_substr;
553 }
1de06328 554 if (check == &PL_sv_undef) {
a3621e74 555 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 556 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
557 goto fail;
558 }
bbe252da
YO
559 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
560 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
561 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 562 && !multiline ) ); /* Check after \n? */
cad2e5aa 563
7e25d62c 564 if (!ml_anch) {
bbe252da
YO
565 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
566 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 567 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
568 && sv && !SvROK(sv)
569 && (strpos != strbeg)) {
a3621e74 570 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
571 goto fail;
572 }
573 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 574 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 575 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
576 I32 slen;
577
1aa99e6b 578 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 579
653099ff
GS
580 if (SvTAIL(check)) {
581 slen = SvCUR(check); /* >= 1 */
cad2e5aa 582
9041c2e3 583 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 584 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 585 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 586 goto fail_finish;
cad2e5aa
JH
587 }
588 /* Now should match s[0..slen-2] */
589 slen--;
3f7c398e 590 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 591 || (slen > 1
3f7c398e 592 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 593 report_neq:
a3621e74 594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
595 goto fail_finish;
596 }
cad2e5aa 597 }
3f7c398e 598 else if (*SvPVX_const(check) != *s
653099ff 599 || ((slen = SvCUR(check)) > 1
3f7c398e 600 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 601 goto report_neq;
c315bfe8 602 check_at = s;
2c2d71f5 603 goto success_at_start;
7e25d62c 604 }
cad2e5aa 605 }
2c2d71f5 606 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 607 s = strpos;
2c2d71f5 608 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
609 end_shift = prog->check_end_shift;
610
2c2d71f5 611 if (!ml_anch) {
a3b680e6 612 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 613 - (SvTAIL(check) != 0);
a3b680e6 614 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
615
616 if (end_shift < eshift)
617 end_shift = eshift;
618 }
cad2e5aa 619 }
2c2d71f5 620 else { /* Can match at random position */
cad2e5aa
JH
621 ml_anch = 0;
622 s = strpos;
1de06328
YO
623 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
624 end_shift = prog->check_end_shift;
625
626 /* end shift should be non negative here */
cad2e5aa
JH
627 }
628
bcdf7404 629#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 630 if (end_shift < 0)
1de06328 631 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 632 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
633#endif
634
2c2d71f5
JH
635 restart:
636 /* Find a possible match in the region s..strend by looking for
637 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
638
639 {
640 I32 srch_start_shift = start_shift;
641 I32 srch_end_shift = end_shift;
642 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
643 srch_end_shift -= ((strbeg - s) - srch_start_shift);
644 srch_start_shift = strbeg - s;
645 }
6bda09f9 646 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
647 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
648 (IV)prog->check_offset_min,
649 (IV)srch_start_shift,
650 (IV)srch_end_shift,
651 (IV)prog->check_end_shift);
652 });
653
cad2e5aa 654 if (flags & REXEC_SCREAM) {
cad2e5aa 655 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 656 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 657
2c2d71f5
JH
658 if (PL_screamfirst[BmRARE(check)] >= 0
659 || ( BmRARE(check) == '\n'
85c508c3 660 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 661 && SvTAIL(check) ))
9041c2e3 662 s = screaminstr(sv, check,
1de06328 663 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 664 else
2c2d71f5 665 goto fail_finish;
4addbd3b 666 /* we may be pointing at the wrong string */
07bc277f 667 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 668 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
669 if (data)
670 *data->scream_olds = s;
671 }
1de06328
YO
672 else {
673 U8* start_point;
674 U8* end_point;
bbe252da 675 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
676 start_point= (U8*)(s + srch_start_shift);
677 end_point= (U8*)(strend - srch_end_shift);
678 } else {
679 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
680 end_point= HOP3(strend, -srch_end_shift, strbeg);
681 }
6bda09f9 682 DEBUG_OPTIMISE_MORE_r({
56570a2c 683 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 684 (int)(end_point - start_point),
fc8cd66c 685 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
686 start_point);
687 });
688
689 s = fbm_instr( start_point, end_point,
7fba1cd6 690 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
691 }
692 }
cad2e5aa
JH
693 /* Update the count-of-usability, remove useless subpatterns,
694 unshift s. */
2c2d71f5 695
ab3bbdeb
YO
696 DEBUG_EXECUTE_r({
697 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
698 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
699 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 700 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
701 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
702 ? "anchored" : "floating"),
703 quoted,
704 RE_SV_TAIL(check),
705 (s ? " at offset " : "...\n") );
706 });
2c2d71f5
JH
707
708 if (!s)
709 goto fail_finish;
2c2d71f5 710 /* Finish the diagnostic message */
a3621e74 711 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 712
1de06328
YO
713 /* XXX dmq: first branch is for positive lookbehind...
714 Our check string is offset from the beginning of the pattern.
715 So we need to do any stclass tests offset forward from that
716 point. I think. :-(
717 */
718
719
720
721 check_at=s;
722
723
2c2d71f5
JH
724 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
725 Start with the other substr.
726 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 727 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
728 *always* match. Probably should be marked during compile...
729 Probably it is right to do no SCREAM here...
730 */
731
1de06328
YO
732 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
733 : (prog->float_substr && prog->anchored_substr))
734 {
30944b6d 735 /* Take into account the "other" substring. */
2c2d71f5
JH
736 /* XXXX May be hopelessly wrong for UTF... */
737 if (!other_last)
6eb5f6b9 738 other_last = strpos;
33b8afdf 739 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
740 do_other_anchored:
741 {
890ce7af
AL
742 char * const last = HOP3c(s, -start_shift, strbeg);
743 char *last1, *last2;
be8e71aa 744 char * const saved_s = s;
33b8afdf 745 SV* must;
2c2d71f5 746
2c2d71f5
JH
747 t = s - prog->check_offset_max;
748 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 749 && (!do_utf8
0ce71af7 750 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 751 && t > strpos)))
6f207bd3 752 NOOP;
2c2d71f5
JH
753 else
754 t = strpos;
1aa99e6b 755 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
756 if (t < other_last) /* These positions already checked */
757 t = other_last;
1aa99e6b 758 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
759 if (last < last1)
760 last1 = last;
1de06328
YO
761 /* XXXX It is not documented what units *_offsets are in.
762 We assume bytes, but this is clearly wrong.
763 Meaning this code needs to be carefully reviewed for errors.
764 dmq.
765 */
766
2c2d71f5 767 /* On end-of-str: see comment below. */
33b8afdf
JH
768 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
769 if (must == &PL_sv_undef) {
770 s = (char*)NULL;
1de06328 771 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
772 }
773 else
774 s = fbm_instr(
775 (unsigned char*)t,
776 HOP3(HOP3(last1, prog->anchored_offset, strend)
777 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
778 must,
7fba1cd6 779 multiline ? FBMrf_MULTILINE : 0
33b8afdf 780 );
ab3bbdeb
YO
781 DEBUG_EXECUTE_r({
782 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
783 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
784 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 785 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
786 quoted, RE_SV_TAIL(must));
787 });
788
789
2c2d71f5
JH
790 if (!s) {
791 if (last1 >= last2) {
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
793 ", giving up...\n"));
794 goto fail_finish;
795 }
a3621e74 796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 797 ", trying floating at offset %ld...\n",
be8e71aa 798 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
799 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
800 s = HOP3c(last, 1, strend);
2c2d71f5
JH
801 goto restart;
802 }
803 else {
a3621e74 804 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 805 (long)(s - i_strpos)));
1aa99e6b
IH
806 t = HOP3c(s, -prog->anchored_offset, strbeg);
807 other_last = HOP3c(s, 1, strend);
be8e71aa 808 s = saved_s;
2c2d71f5
JH
809 if (t == strpos)
810 goto try_at_start;
2c2d71f5
JH
811 goto try_at_offset;
812 }
30944b6d 813 }
2c2d71f5
JH
814 }
815 else { /* Take into account the floating substring. */
33b8afdf 816 char *last, *last1;
be8e71aa 817 char * const saved_s = s;
33b8afdf
JH
818 SV* must;
819
820 t = HOP3c(s, -start_shift, strbeg);
821 last1 = last =
822 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
823 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
824 last = HOP3c(t, prog->float_max_offset, strend);
825 s = HOP3c(t, prog->float_min_offset, strend);
826 if (s < other_last)
827 s = other_last;
2c2d71f5 828 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
829 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
830 /* fbm_instr() takes into account exact value of end-of-str
831 if the check is SvTAIL(ed). Since false positives are OK,
832 and end-of-str is not later than strend we are OK. */
833 if (must == &PL_sv_undef) {
834 s = (char*)NULL;
1de06328 835 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
836 }
837 else
2c2d71f5 838 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
839 (unsigned char*)last + SvCUR(must)
840 - (SvTAIL(must)!=0),
7fba1cd6 841 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
842 DEBUG_EXECUTE_r({
843 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
844 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
845 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 846 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
847 quoted, RE_SV_TAIL(must));
848 });
33b8afdf
JH
849 if (!s) {
850 if (last1 == last) {
a3621e74 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
852 ", giving up...\n"));
853 goto fail_finish;
2c2d71f5 854 }
a3621e74 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 856 ", trying anchored starting at offset %ld...\n",
be8e71aa 857 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
858 other_last = last;
859 s = HOP3c(t, 1, strend);
860 goto restart;
861 }
862 else {
a3621e74 863 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
864 (long)(s - i_strpos)));
865 other_last = s; /* Fix this later. --Hugo */
be8e71aa 866 s = saved_s;
33b8afdf
JH
867 if (t == strpos)
868 goto try_at_start;
869 goto try_at_offset;
870 }
2c2d71f5 871 }
cad2e5aa 872 }
2c2d71f5 873
1de06328 874
9ef43ace 875 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 876
6bda09f9 877 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
878 PerlIO_printf(Perl_debug_log,
879 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
880 (IV)prog->check_offset_min,
881 (IV)prog->check_offset_max,
882 (IV)(s-strpos),
883 (IV)(t-strpos),
884 (IV)(t-s),
885 (IV)(strend-strpos)
886 )
887 );
888
2c2d71f5 889 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 890 && (!do_utf8
9ef43ace 891 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
892 && t > strpos)))
893 {
2c2d71f5
JH
894 /* Fixed substring is found far enough so that the match
895 cannot start at strpos. */
896 try_at_offset:
cad2e5aa 897 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
898 /* Eventually fbm_*() should handle this, but often
899 anchored_offset is not 0, so this check will not be wasted. */
900 /* XXXX In the code below we prefer to look for "^" even in
901 presence of anchored substrings. And we search even
902 beyond the found float position. These pessimizations
903 are historical artefacts only. */
904 find_anchor:
2c2d71f5 905 while (t < strend - prog->minlen) {
cad2e5aa 906 if (*t == '\n') {
4ee3650e 907 if (t < check_at - prog->check_offset_min) {
33b8afdf 908 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
909 /* Since we moved from the found position,
910 we definitely contradict the found anchored
30944b6d
IZ
911 substr. Due to the above check we do not
912 contradict "check" substr.
913 Thus we can arrive here only if check substr
914 is float. Redo checking for "other"=="fixed".
915 */
9041c2e3 916 strpos = t + 1;
a3621e74 917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 918 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
919 goto do_other_anchored;
920 }
4ee3650e
GS
921 /* We don't contradict the found floating substring. */
922 /* XXXX Why not check for STCLASS? */
cad2e5aa 923 s = t + 1;
a3621e74 924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 925 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
926 goto set_useful;
927 }
4ee3650e
GS
928 /* Position contradicts check-string */
929 /* XXXX probably better to look for check-string
930 than for "\n", so one should lower the limit for t? */
a3621e74 931 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 932 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 933 other_last = strpos = s = t + 1;
cad2e5aa
JH
934 goto restart;
935 }
936 t++;
937 }
a3621e74 938 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 939 PL_colors[0], PL_colors[1]));
2c2d71f5 940 goto fail_finish;
cad2e5aa 941 }
f5952150 942 else {
a3621e74 943 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 944 PL_colors[0], PL_colors[1]));
f5952150 945 }
cad2e5aa
JH
946 s = t;
947 set_useful:
33b8afdf 948 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
949 }
950 else {
f5952150 951 /* The found string does not prohibit matching at strpos,
2c2d71f5 952 - no optimization of calling REx engine can be performed,
f5952150
GS
953 unless it was an MBOL and we are not after MBOL,
954 or a future STCLASS check will fail this. */
2c2d71f5
JH
955 try_at_start:
956 /* Even in this situation we may use MBOL flag if strpos is offset
957 wrt the start of the string. */
05b4157f 958 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 959 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 960 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 961 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 962 {
cad2e5aa
JH
963 t = strpos;
964 goto find_anchor;
965 }
a3621e74 966 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 967 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 968 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 969 );
2c2d71f5 970 success_at_start:
bbe252da 971 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
972 && (do_utf8 ? (
973 prog->check_utf8 /* Could be deleted already */
974 && --BmUSEFUL(prog->check_utf8) < 0
975 && (prog->check_utf8 == prog->float_utf8)
976 ) : (
977 prog->check_substr /* Could be deleted already */
978 && --BmUSEFUL(prog->check_substr) < 0
979 && (prog->check_substr == prog->float_substr)
980 )))
66e933ab 981 {
cad2e5aa 982 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
ef8d46e8 984 /* XXX Does the destruction order has to change with do_utf8? */
33b8afdf 985 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
ef8d46e8 986 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
987 prog->check_substr = prog->check_utf8 = NULL; /* disable */
988 prog->float_substr = prog->float_utf8 = NULL; /* clear */
989 check = NULL; /* abort */
cad2e5aa 990 s = strpos;
3cf5c195
IZ
991 /* XXXX This is a remnant of the old implementation. It
992 looks wasteful, since now INTUIT can use many
6eb5f6b9 993 other heuristics. */
bbe252da 994 prog->extflags &= ~RXf_USE_INTUIT;
cad2e5aa
JH
995 }
996 else
997 s = strpos;
998 }
999
6eb5f6b9
JH
1000 /* Last resort... */
1001 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1002 /* trie stclasses are too expensive to use here, we are better off to
1003 leave it to regmatch itself */
f8fc2ecf 1004 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1005 /* minlen == 0 is possible if regstclass is \b or \B,
1006 and the fixed substr is ''$.
1007 Since minlen is already taken into account, s+1 is before strend;
1008 accidentally, minlen >= 1 guaranties no false positives at s + 1
1009 even for \b or \B. But (minlen? 1 : 0) below assumes that
1010 regstclass does not come from lookahead... */
1011 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1012 This leaves EXACTF only, which is dealt with in find_byclass(). */
f8fc2ecf
YO
1013 const U8* const str = (U8*)STRING(progi->regstclass);
1014 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1015 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1016 : 1);
1de06328
YO
1017 char * endpos;
1018 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1019 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1020 else if (prog->float_substr || prog->float_utf8)
1021 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1022 else
1023 endpos= strend;
1024
70685ca0
JH
1025 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1026 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1027
6eb5f6b9 1028 t = s;
f8fc2ecf 1029 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1030 if (!s) {
1031#ifdef DEBUGGING
cbbf8932 1032 const char *what = NULL;
6eb5f6b9
JH
1033#endif
1034 if (endpos == strend) {
a3621e74 1035 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1036 "Could not match STCLASS...\n") );
1037 goto fail;
1038 }
a3621e74 1039 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1040 "This position contradicts STCLASS...\n") );
bbe252da 1041 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1042 goto fail;
6eb5f6b9 1043 /* Contradict one of substrings */
33b8afdf
JH
1044 if (prog->anchored_substr || prog->anchored_utf8) {
1045 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1046 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1047 hop_and_restart:
1aa99e6b 1048 s = HOP3c(t, 1, strend);
66e933ab
GS
1049 if (s + start_shift + end_shift > strend) {
1050 /* XXXX Should be taken into account earlier? */
a3621e74 1051 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1052 "Could not match STCLASS...\n") );
1053 goto fail;
1054 }
5e39e1e5
HS
1055 if (!check)
1056 goto giveup;
a3621e74 1057 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1058 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1059 what, (long)(s + start_shift - i_strpos)) );
1060 goto restart;
1061 }
66e933ab 1062 /* Have both, check_string is floating */
6eb5f6b9
JH
1063 if (t + start_shift >= check_at) /* Contradicts floating=check */
1064 goto retry_floating_check;
1065 /* Recheck anchored substring, but not floating... */
9041c2e3 1066 s = check_at;
5e39e1e5
HS
1067 if (!check)
1068 goto giveup;
a3621e74 1069 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1070 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1071 (long)(other_last - i_strpos)) );
1072 goto do_other_anchored;
1073 }
60e71179
GS
1074 /* Another way we could have checked stclass at the
1075 current position only: */
1076 if (ml_anch) {
1077 s = t = t + 1;
5e39e1e5
HS
1078 if (!check)
1079 goto giveup;
a3621e74 1080 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1081 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1082 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1083 goto try_at_offset;
66e933ab 1084 }
33b8afdf 1085 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1086 goto fail;
6eb5f6b9
JH
1087 /* Check is floating subtring. */
1088 retry_floating_check:
1089 t = check_at - start_shift;
a3621e74 1090 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1091 goto hop_and_restart;
1092 }
b7953727 1093 if (t != s) {
a3621e74 1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1095 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1096 (long)(t - i_strpos), (long)(s - i_strpos))
1097 );
1098 }
1099 else {
a3621e74 1100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1101 "Does not contradict STCLASS...\n");
1102 );
1103 }
6eb5f6b9 1104 }
5e39e1e5 1105 giveup:
a3621e74 1106 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1107 PL_colors[4], (check ? "Guessed" : "Giving up"),
1108 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1109 return s;
2c2d71f5
JH
1110
1111 fail_finish: /* Substring not found */
33b8afdf
JH
1112 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1113 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1114 fail:
a3621e74 1115 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1116 PL_colors[4], PL_colors[5]));
bd61b366 1117 return NULL;
cad2e5aa 1118}
9661b544 1119
a0a388a1
YO
1120#define DECL_TRIE_TYPE(scan) \
1121 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1122 trie_type = (scan->flags != EXACT) \
1123 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1124 : (do_utf8 ? trie_utf8 : trie_plain)
3b0527fe 1125
55eed653
NC
1126#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1127uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1128 switch (trie_type) { \
1129 case trie_utf8_fold: \
1130 if ( foldlen>0 ) { \
0abd0d78 1131 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1132 foldlen -= len; \
1133 uscan += len; \
1134 len=0; \
1135 } else { \
0abd0d78 1136 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1137 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1138 foldlen -= UNISKIP( uvc ); \
1139 uscan = foldbuf + UNISKIP( uvc ); \
1140 } \
1141 break; \
a0a388a1
YO
1142 case trie_latin_utf8_fold: \
1143 if ( foldlen>0 ) { \
1144 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1145 foldlen -= len; \
1146 uscan += len; \
1147 len=0; \
1148 } else { \
1149 len = 1; \
1150 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1151 foldlen -= UNISKIP( uvc ); \
1152 uscan = foldbuf + UNISKIP( uvc ); \
1153 } \
1154 break; \
4cadc6a9
YO
1155 case trie_utf8: \
1156 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1157 break; \
1158 case trie_plain: \
1159 uvc = (UV)*uc; \
1160 len = 1; \
1161 } \
4cadc6a9
YO
1162 if (uvc < 256) { \
1163 charid = trie->charmap[ uvc ]; \
1164 } \
1165 else { \
1166 charid = 0; \
55eed653
NC
1167 if (widecharmap) { \
1168 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1169 (char*)&uvc, sizeof(UV), 0); \
1170 if (svpp) \
1171 charid = (U16)SvIV(*svpp); \
1172 } \
1173 } \
1174} STMT_END
1175
a0a388a1
YO
1176#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1177{ \
1178 char *my_strend= (char *)strend; \
4cadc6a9
YO
1179 if ( (CoNd) \
1180 && (ln == len || \
a0a388a1 1181 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
4cadc6a9 1182 m, NULL, ln, (bool)UTF)) \
a0a388a1 1183 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1184 goto got_it; \
1185 else { \
1186 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1187 uvchr_to_utf8(tmpbuf, c); \
1188 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1189 if ( f != c \
1190 && (f == c1 || f == c2) \
a0a388a1
YO
1191 && (ln == len || \
1192 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1193 m, NULL, ln, (bool)UTF)) \
1194 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1195 goto got_it; \
1196 } \
a0a388a1
YO
1197} \
1198s += len
4cadc6a9
YO
1199
1200#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1201STMT_START { \
1202 while (s <= e) { \
1203 if ( (CoNd) \
1204 && (ln == 1 || !(OP(c) == EXACTF \
1205 ? ibcmp(s, m, ln) \
1206 : ibcmp_locale(s, m, ln))) \
24b23f37 1207 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1208 goto got_it; \
1209 s++; \
1210 } \
1211} STMT_END
1212
1213#define REXEC_FBC_UTF8_SCAN(CoDe) \
1214STMT_START { \
1215 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1216 CoDe \
1217 s += uskip; \
1218 } \
1219} STMT_END
1220
1221#define REXEC_FBC_SCAN(CoDe) \
1222STMT_START { \
1223 while (s < strend) { \
1224 CoDe \
1225 s++; \
1226 } \
1227} STMT_END
1228
1229#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1230REXEC_FBC_UTF8_SCAN( \
1231 if (CoNd) { \
24b23f37 1232 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1233 goto got_it; \
1234 else \
1235 tmp = doevery; \
1236 } \
1237 else \
1238 tmp = 1; \
1239)
1240
1241#define REXEC_FBC_CLASS_SCAN(CoNd) \
1242REXEC_FBC_SCAN( \
1243 if (CoNd) { \
24b23f37 1244 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1245 goto got_it; \
1246 else \
1247 tmp = doevery; \
1248 } \
1249 else \
1250 tmp = 1; \
1251)
1252
1253#define REXEC_FBC_TRYIT \
24b23f37 1254if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1255 goto got_it
1256
e1d1eefb
YO
1257#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1258 if (do_utf8) { \
1259 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1260 } \
1261 else { \
1262 REXEC_FBC_CLASS_SCAN(CoNd); \
1263 } \
1264 break
1265
4cadc6a9
YO
1266#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1267 if (do_utf8) { \
1268 UtFpReLoAd; \
1269 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1270 } \
1271 else { \
1272 REXEC_FBC_CLASS_SCAN(CoNd); \
1273 } \
1274 break
1275
1276#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1277 PL_reg_flags |= RF_tainted; \
1278 if (do_utf8) { \
1279 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1280 } \
1281 else { \
1282 REXEC_FBC_CLASS_SCAN(CoNd); \
1283 } \
1284 break
1285
786e8c11
YO
1286#define DUMP_EXEC_POS(li,s,doutf8) \
1287 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1288
1289/* We know what class REx starts with. Try to find this position... */
1290/* if reginfo is NULL, its a dryrun */
1291/* annoyingly all the vars in this routine have different names from their counterparts
1292 in regmatch. /grrr */
1293
3c3eec57 1294STATIC char *
07be1b83 1295S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1296 const char *strend, regmatch_info *reginfo)
a687059c 1297{
27da23d5 1298 dVAR;
bbe252da 1299 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
6eb5f6b9 1300 char *m;
d8093b23 1301 STRLEN ln;
5dab1207 1302 STRLEN lnc;
078c425b 1303 register STRLEN uskip;
d8093b23
G
1304 unsigned int c1;
1305 unsigned int c2;
6eb5f6b9
JH
1306 char *e;
1307 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1308 register const bool do_utf8 = PL_reg_match_utf8;
f8fc2ecf 1309 RXi_GET_DECL(prog,progi);
7918f24d
NC
1310
1311 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1312
6eb5f6b9
JH
1313 /* We know what class it must start with. */
1314 switch (OP(c)) {
6eb5f6b9 1315 case ANYOF:
388cc4de 1316 if (do_utf8) {
4cadc6a9 1317 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1318 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1319 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1320 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1321 }
1322 else {
1323 while (s < strend) {
1324 STRLEN skip = 1;
1325
32fc9b6a 1326 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1327 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1328 /* The assignment of 2 is intentional:
1329 * for the folded sharp s, the skip is 2. */
1330 (skip = SHARP_S_SKIP))) {
24b23f37 1331 if (tmp && (!reginfo || regtry(reginfo, &s)))
388cc4de
HS
1332 goto got_it;
1333 else
1334 tmp = doevery;
1335 }
1336 else
1337 tmp = 1;
1338 s += skip;
1339 }
a0d0e21e 1340 }
6eb5f6b9 1341 break;
f33976b4 1342 case CANY:
4cadc6a9 1343 REXEC_FBC_SCAN(
24b23f37 1344 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1345 goto got_it;
1346 else
1347 tmp = doevery;
4cadc6a9 1348 );
f33976b4 1349 break;
6eb5f6b9 1350 case EXACTF:
5dab1207
NIS
1351 m = STRING(c);
1352 ln = STR_LEN(c); /* length to match in octets/bytes */
1353 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1354 if (UTF) {
a2a2844f 1355 STRLEN ulen1, ulen2;
5dab1207 1356 U8 *sm = (U8 *) m;
89ebb4a3
JH
1357 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1358 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
97dc7d3e
RGS
1359 /* used by commented-out code below */
1360 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
a0a388a1
YO
1361
1362 /* XXX: Since the node will be case folded at compile
1363 time this logic is a little odd, although im not
1364 sure that its actually wrong. --dmq */
1365
1366 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1367 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1368
1369 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1370 codepoint of the first character in the converted
1371 form, yet originally we did the extra step.
1372 No tests fail by commenting this code out however
1373 so Ive left it out. -- dmq.
1374
89ebb4a3 1375 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1376 0, uniflags);
89ebb4a3 1377 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1378 0, uniflags);
a0a388a1
YO
1379 */
1380
5dab1207
NIS
1381 lnc = 0;
1382 while (sm < ((U8 *) m + ln)) {
1383 lnc++;
1384 sm += UTF8SKIP(sm);
1385 }
1aa99e6b
IH
1386 }
1387 else {
1388 c1 = *(U8*)m;
1389 c2 = PL_fold[c1];
1390 }
6eb5f6b9
JH
1391 goto do_exactf;
1392 case EXACTFL:
5dab1207
NIS
1393 m = STRING(c);
1394 ln = STR_LEN(c);
1395 lnc = (I32) ln;
d8093b23 1396 c1 = *(U8*)m;
6eb5f6b9
JH
1397 c2 = PL_fold_locale[c1];
1398 do_exactf:
db12adc6 1399 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1400
3b0527fe 1401 if (!reginfo && e < s)
6eb5f6b9 1402 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1403
60a8b682
JH
1404 /* The idea in the EXACTF* cases is to first find the
1405 * first character of the EXACTF* node and then, if
1406 * necessary, case-insensitively compare the full
1407 * text of the node. The c1 and c2 are the first
1408 * characters (though in Unicode it gets a bit
1409 * more complicated because there are more cases
7f16dd3d
JH
1410 * than just upper and lower: one needs to use
1411 * the so-called folding case for case-insensitive
1412 * matching (called "loose matching" in Unicode).
1413 * ibcmp_utf8() will do just that. */
60a8b682 1414
a0a388a1 1415 if (do_utf8 || UTF) {
575cac57 1416 UV c, f;
89ebb4a3 1417 U8 tmpbuf [UTF8_MAXBYTES+1];
a0a388a1
YO
1418 STRLEN len = 1;
1419 STRLEN foldlen;
4ad0818d 1420 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1421 if (c1 == c2) {
5dab1207
NIS
1422 /* Upper and lower of 1st char are equal -
1423 * probably not a "letter". */
1aa99e6b 1424 while (s <= e) {
a0a388a1
YO
1425 if (do_utf8) {
1426 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1427 uniflags);
a0a388a1
YO
1428 } else {
1429 c = *((U8*)s);
1430 }
4cadc6a9 1431 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1432 }
09091399
JH
1433 }
1434 else {
1aa99e6b 1435 while (s <= e) {
a0a388a1
YO
1436 if (do_utf8) {
1437 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1438 uniflags);
a0a388a1
YO
1439 } else {
1440 c = *((U8*)s);
1441 }
80aecb99 1442
60a8b682 1443 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1444 * Note that not all the possible combinations
1445 * are handled here: some of them are handled
1446 * by the standard folding rules, and some of
1447 * them (the character class or ANYOF cases)
1448 * are handled during compiletime in
1449 * regexec.c:S_regclass(). */
880bd946
JH
1450 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1451 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1452 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1453
4cadc6a9 1454 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1455 }
09091399 1456 }
1aa99e6b
IH
1457 }
1458 else {
a0a388a1 1459 /* Neither pattern nor string are UTF8 */
1aa99e6b 1460 if (c1 == c2)
4cadc6a9 1461 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1462 else
4cadc6a9 1463 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1464 }
1465 break;
bbce6d69 1466 case BOUNDL:
3280af22 1467 PL_reg_flags |= RF_tainted;
bbce6d69 1468 /* FALL THROUGH */
a0d0e21e 1469 case BOUND:
ffc61ed2 1470 if (do_utf8) {
12d33761 1471 if (s == PL_bostr)
ffc61ed2
JH
1472 tmp = '\n';
1473 else {
6136c704 1474 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1475 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1476 }
1477 tmp = ((OP(c) == BOUND ?
9041c2e3 1478 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1479 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1480 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1481 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1482 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1483 isALNUM_LC_utf8((U8*)s)))
1484 {
1485 tmp = !tmp;
4cadc6a9 1486 REXEC_FBC_TRYIT;
a687059c 1487 }
4cadc6a9 1488 );
a0d0e21e 1489 }
667bb95a 1490 else {
12d33761 1491 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1492 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1493 REXEC_FBC_SCAN(
ffc61ed2
JH
1494 if (tmp ==
1495 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1496 tmp = !tmp;
4cadc6a9 1497 REXEC_FBC_TRYIT;
a0ed51b3 1498 }
4cadc6a9 1499 );
a0ed51b3 1500 }
24b23f37 1501 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1502 goto got_it;
1503 break;
bbce6d69 1504 case NBOUNDL:
3280af22 1505 PL_reg_flags |= RF_tainted;
bbce6d69 1506 /* FALL THROUGH */
a0d0e21e 1507 case NBOUND:
ffc61ed2 1508 if (do_utf8) {
12d33761 1509 if (s == PL_bostr)
ffc61ed2
JH
1510 tmp = '\n';
1511 else {
6136c704 1512 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1513 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1514 }
1515 tmp = ((OP(c) == NBOUND ?
9041c2e3 1516 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1517 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1518 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1519 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1520 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1521 isALNUM_LC_utf8((U8*)s)))
1522 tmp = !tmp;
4cadc6a9
YO
1523 else REXEC_FBC_TRYIT;
1524 );
a0d0e21e 1525 }
667bb95a 1526 else {
12d33761 1527 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1528 tmp = ((OP(c) == NBOUND ?
1529 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1530 REXEC_FBC_SCAN(
ffc61ed2
JH
1531 if (tmp ==
1532 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1533 tmp = !tmp;
4cadc6a9
YO
1534 else REXEC_FBC_TRYIT;
1535 );
a0ed51b3 1536 }
24b23f37 1537 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
a0ed51b3
LW
1538 goto got_it;
1539 break;
a0d0e21e 1540 case ALNUM:
4cadc6a9 1541 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1542 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1543 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1544 isALNUM(*s)
1545 );
bbce6d69 1546 case ALNUML:
4cadc6a9
YO
1547 REXEC_FBC_CSCAN_TAINT(
1548 isALNUM_LC_utf8((U8*)s),
1549 isALNUM_LC(*s)
1550 );
a0d0e21e 1551 case NALNUM:
4cadc6a9 1552 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1553 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1554 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
4cadc6a9
YO
1555 !isALNUM(*s)
1556 );
bbce6d69 1557 case NALNUML:
4cadc6a9
YO
1558 REXEC_FBC_CSCAN_TAINT(
1559 !isALNUM_LC_utf8((U8*)s),
1560 !isALNUM_LC(*s)
1561 );
a0d0e21e 1562 case SPACE:
4cadc6a9 1563 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1564 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1565 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
4cadc6a9
YO
1566 isSPACE(*s)
1567 );
bbce6d69 1568 case SPACEL:
4cadc6a9
YO
1569 REXEC_FBC_CSCAN_TAINT(
1570 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1571 isSPACE_LC(*s)
1572 );
a0d0e21e 1573 case NSPACE:
4cadc6a9 1574 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1575 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1576 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
4cadc6a9
YO
1577 !isSPACE(*s)
1578 );
bbce6d69 1579 case NSPACEL:
4cadc6a9
YO
1580 REXEC_FBC_CSCAN_TAINT(
1581 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1582 !isSPACE_LC(*s)
1583 );
a0d0e21e 1584 case DIGIT:
4cadc6a9 1585 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1586 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1587 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1588 isDIGIT(*s)
1589 );
b8c5462f 1590 case DIGITL:
4cadc6a9
YO
1591 REXEC_FBC_CSCAN_TAINT(
1592 isDIGIT_LC_utf8((U8*)s),
1593 isDIGIT_LC(*s)
1594 );
a0d0e21e 1595 case NDIGIT:
4cadc6a9 1596 REXEC_FBC_CSCAN_PRELOAD(
d1eb3177
YO
1597 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1598 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
4cadc6a9
YO
1599 !isDIGIT(*s)
1600 );
b8c5462f 1601 case NDIGITL:
4cadc6a9
YO
1602 REXEC_FBC_CSCAN_TAINT(
1603 !isDIGIT_LC_utf8((U8*)s),
1604 !isDIGIT_LC(*s)
1605 );
e1d1eefb
YO
1606 case LNBREAK:
1607 REXEC_FBC_CSCAN(
1608 is_LNBREAK_utf8(s),
1609 is_LNBREAK_latin1(s)
1610 );
1611 case VERTWS:
1612 REXEC_FBC_CSCAN(
1613 is_VERTWS_utf8(s),
1614 is_VERTWS_latin1(s)
1615 );
1616 case NVERTWS:
1617 REXEC_FBC_CSCAN(
1618 !is_VERTWS_utf8(s),
1619 !is_VERTWS_latin1(s)
1620 );
1621 case HORIZWS:
1622 REXEC_FBC_CSCAN(
1623 is_HORIZWS_utf8(s),
1624 is_HORIZWS_latin1(s)
1625 );
1626 case NHORIZWS:
1627 REXEC_FBC_CSCAN(
1628 !is_HORIZWS_utf8(s),
1629 !is_HORIZWS_latin1(s)
1630 );
1de06328
YO
1631 case AHOCORASICKC:
1632 case AHOCORASICK:
07be1b83 1633 {
a0a388a1 1634 DECL_TRIE_TYPE(c);
07be1b83
YO
1635 /* what trie are we using right now */
1636 reg_ac_data *aho
f8fc2ecf 1637 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1638 reg_trie_data *trie
1639 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1640 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1641
1642 const char *last_start = strend - trie->minlen;
6148ee25 1643#ifdef DEBUGGING
07be1b83 1644 const char *real_start = s;
6148ee25 1645#endif
07be1b83 1646 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1647 SV *sv_points;
1648 U8 **points; /* map of where we were in the input string
786e8c11 1649 when reading a given char. For ASCII this
be8e71aa 1650 is unnecessary overhead as the relationship
38a44b82
NC
1651 is always 1:1, but for Unicode, especially
1652 case folded Unicode this is not true. */
f9e705e8 1653 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1654 U8 *bitmap=NULL;
1655
07be1b83
YO
1656
1657 GET_RE_DEBUG_FLAGS_DECL;
1658
be8e71aa
YO
1659 /* We can't just allocate points here. We need to wrap it in
1660 * an SV so it gets freed properly if there is a croak while
1661 * running the match */
1662 ENTER;
1663 SAVETMPS;
1664 sv_points=newSV(maxlen * sizeof(U8 *));
1665 SvCUR_set(sv_points,
1666 maxlen * sizeof(U8 *));
1667 SvPOK_on(sv_points);
1668 sv_2mortal(sv_points);
1669 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1670 if ( trie_type != trie_utf8_fold
1671 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1672 {
786e8c11
YO
1673 if (trie->bitmap)
1674 bitmap=(U8*)trie->bitmap;
1675 else
1676 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1677 }
786e8c11
YO
1678 /* this is the Aho-Corasick algorithm modified a touch
1679 to include special handling for long "unknown char"
1680 sequences. The basic idea being that we use AC as long
1681 as we are dealing with a possible matching char, when
1682 we encounter an unknown char (and we have not encountered
1683 an accepting state) we scan forward until we find a legal
1684 starting char.
1685 AC matching is basically that of trie matching, except
1686 that when we encounter a failing transition, we fall back
1687 to the current states "fail state", and try the current char
1688 again, a process we repeat until we reach the root state,
1689 state 1, or a legal transition. If we fail on the root state
1690 then we can either terminate if we have reached an accepting
1691 state previously, or restart the entire process from the beginning
1692 if we have not.
1693
1694 */
07be1b83
YO
1695 while (s <= last_start) {
1696 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1697 U8 *uc = (U8*)s;
1698 U16 charid = 0;
1699 U32 base = 1;
1700 U32 state = 1;
1701 UV uvc = 0;
1702 STRLEN len = 0;
1703 STRLEN foldlen = 0;
1704 U8 *uscan = (U8*)NULL;
1705 U8 *leftmost = NULL;
786e8c11
YO
1706#ifdef DEBUGGING
1707 U32 accepted_word= 0;
1708#endif
07be1b83
YO
1709 U32 pointpos = 0;
1710
1711 while ( state && uc <= (U8*)strend ) {
1712 int failed=0;
786e8c11
YO
1713 U32 word = aho->states[ state ].wordnum;
1714
1de06328
YO
1715 if( state==1 ) {
1716 if ( bitmap ) {
1717 DEBUG_TRIE_EXECUTE_r(
1718 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1719 dump_exec_pos( (char *)uc, c, strend, real_start,
1720 (char *)uc, do_utf8 );
1721 PerlIO_printf( Perl_debug_log,
1722 " Scanning for legal start char...\n");
1723 }
1724 );
1725 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1726 uc++;
786e8c11 1727 }
1de06328 1728 s= (char *)uc;
786e8c11 1729 }
786e8c11
YO
1730 if (uc >(U8*)last_start) break;
1731 }
1732
1733 if ( word ) {
1734 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1735 if (!leftmost || lpos < leftmost) {
1736 DEBUG_r(accepted_word=word);
07be1b83 1737 leftmost= lpos;
786e8c11 1738 }
07be1b83 1739 if (base==0) break;
786e8c11 1740
07be1b83
YO
1741 }
1742 points[pointpos++ % maxlen]= uc;
55eed653
NC
1743 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1744 uscan, len, uvc, charid, foldlen,
1745 foldbuf, uniflags);
786e8c11
YO
1746 DEBUG_TRIE_EXECUTE_r({
1747 dump_exec_pos( (char *)uc, c, strend, real_start,
1748 s, do_utf8 );
07be1b83 1749 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1750 " Charid:%3u CP:%4"UVxf" ",
1751 charid, uvc);
1752 });
07be1b83
YO
1753
1754 do {
6148ee25 1755#ifdef DEBUGGING
786e8c11 1756 word = aho->states[ state ].wordnum;
6148ee25 1757#endif
07be1b83
YO
1758 base = aho->states[ state ].trans.base;
1759
786e8c11
YO
1760 DEBUG_TRIE_EXECUTE_r({
1761 if (failed)
1762 dump_exec_pos( (char *)uc, c, strend, real_start,
1763 s, do_utf8 );
07be1b83 1764 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1765 "%sState: %4"UVxf", word=%"UVxf,
1766 failed ? " Fail transition to " : "",
1767 (UV)state, (UV)word);
1768 });
07be1b83
YO
1769 if ( base ) {
1770 U32 tmp;
1771 if (charid &&
1772 (base + charid > trie->uniquecharcount )
1773 && (base + charid - 1 - trie->uniquecharcount
1774 < trie->lasttrans)
1775 && trie->trans[base + charid - 1 -
1776 trie->uniquecharcount].check == state
1777 && (tmp=trie->trans[base + charid - 1 -
1778 trie->uniquecharcount ].next))
1779 {
786e8c11
YO
1780 DEBUG_TRIE_EXECUTE_r(
1781 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1782 state = tmp;
1783 break;
1784 }
1785 else {
786e8c11
YO
1786 DEBUG_TRIE_EXECUTE_r(
1787 PerlIO_printf( Perl_debug_log," - fail\n"));
1788 failed = 1;
1789 state = aho->fail[state];
07be1b83
YO
1790 }
1791 }
1792 else {
1793 /* we must be accepting here */
786e8c11
YO
1794 DEBUG_TRIE_EXECUTE_r(
1795 PerlIO_printf( Perl_debug_log," - accepting\n"));
1796 failed = 1;
07be1b83
YO
1797 break;
1798 }
1799 } while(state);
786e8c11 1800 uc += len;
07be1b83
YO
1801 if (failed) {
1802 if (leftmost)
1803 break;
786e8c11 1804 if (!state) state = 1;
07be1b83
YO
1805 }
1806 }
1807 if ( aho->states[ state ].wordnum ) {
1808 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
786e8c11
YO
1809 if (!leftmost || lpos < leftmost) {
1810 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1811 leftmost = lpos;
786e8c11 1812 }
07be1b83 1813 }
07be1b83
YO
1814 if (leftmost) {
1815 s = (char*)leftmost;
786e8c11
YO
1816 DEBUG_TRIE_EXECUTE_r({
1817 PerlIO_printf(
70685ca0
JH
1818 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1819 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
1820 );
1821 });
24b23f37 1822 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
1823 FREETMPS;
1824 LEAVE;
07be1b83 1825 goto got_it;
be8e71aa 1826 }
07be1b83 1827 s = HOPc(s,1);
786e8c11
YO
1828 DEBUG_TRIE_EXECUTE_r({
1829 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1830 });
07be1b83 1831 } else {
786e8c11
YO
1832 DEBUG_TRIE_EXECUTE_r(
1833 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
1834 break;
1835 }
1836 }
be8e71aa
YO
1837 FREETMPS;
1838 LEAVE;
07be1b83
YO
1839 }
1840 break;
b3c9acc1 1841 default:
3c3eec57
GS
1842 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1843 break;
d6a28714 1844 }
6eb5f6b9
JH
1845 return 0;
1846 got_it:
1847 return s;
1848}
1849
fae667d5 1850
6eb5f6b9
JH
1851/*
1852 - regexec_flags - match a regexp against a string
1853 */
1854I32
288b8c02 1855Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
1856 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1857/* strend: pointer to null at end of string */
1858/* strbeg: real beginning of string */
1859/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
1860/* data: May be used for some additional optimizations.
1861 Currently its only used, with a U32 cast, for transmitting
1862 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
1863/* nosave: For optimizations. */
1864{
97aff369 1865 dVAR;
288b8c02 1866 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 1867 /*register*/ char *s;
6eb5f6b9 1868 register regnode *c;
24b23f37 1869 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
1870 I32 minlen; /* must match at least this many chars */
1871 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1872 I32 end_shift = 0; /* Same for the end. */ /* CC */
1873 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1874 char *scream_olds = NULL;
f9f4320a 1875 const bool do_utf8 = (bool)DO_UTF8(sv);
2757e526 1876 I32 multiline;
f8fc2ecf 1877 RXi_GET_DECL(prog,progi);
3b0527fe 1878 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 1879 regexp_paren_pair *swap = NULL;
a3621e74
YO
1880 GET_RE_DEBUG_FLAGS_DECL;
1881
7918f24d 1882 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 1883 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1884
1885 /* Be paranoid... */
1886 if (prog == NULL || startpos == NULL) {
1887 Perl_croak(aTHX_ "NULL regexp parameter");
1888 return 0;
1889 }
1890
bbe252da 1891 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 1892 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 1893
288b8c02 1894 RX_MATCH_UTF8_set(rx, do_utf8);
1de06328 1895 DEBUG_EXECUTE_r(
efd26800 1896 debug_start_match(rx, do_utf8, startpos, strend,
1de06328
YO
1897 "Matching");
1898 );
bac06658 1899
6eb5f6b9 1900 minlen = prog->minlen;
1de06328
YO
1901
1902 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 1903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1904 "String too short [regexec_flags]...\n"));
1905 goto phooey;
1aa99e6b 1906 }
6eb5f6b9 1907
1de06328 1908
6eb5f6b9 1909 /* Check validity of program. */
f8fc2ecf 1910 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
1911 Perl_croak(aTHX_ "corrupted regexp program");
1912 }
1913
1914 PL_reg_flags = 0;
1915 PL_reg_eval_set = 0;
1916 PL_reg_maxiter = 0;
1917
3c8556c3 1918 if (RX_UTF8(rx))
6eb5f6b9
JH
1919 PL_reg_flags |= RF_utf8;
1920
1921 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1922 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1923 PL_bostr = strbeg;
3b0527fe 1924 reginfo.sv = sv;
6eb5f6b9
JH
1925
1926 /* Mark end of line for $ (and such) */
1927 PL_regeol = strend;
1928
1929 /* see how far we have to get to not match where we matched before */
3b0527fe 1930 reginfo.till = startpos+minend;
6eb5f6b9 1931
6eb5f6b9
JH
1932 /* If there is a "must appear" string, look for it. */
1933 s = startpos;
1934
bbe252da 1935 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 1936 MAGIC *mg;
2c296965 1937 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 1938 reginfo.ganch = startpos + prog->gofs;
2c296965 1939 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1940 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 1941 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 1942 && SvMAGIC(sv)
14befaf4
DM
1943 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1944 && mg->mg_len >= 0) {
3b0527fe 1945 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 1946 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1947 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 1948
bbe252da 1949 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 1950 if (s > reginfo.ganch)
6eb5f6b9 1951 goto phooey;
58e23c8d 1952 s = reginfo.ganch - prog->gofs;
2c296965 1953 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 1954 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
1955 if (s < strbeg)
1956 goto phooey;
6eb5f6b9
JH
1957 }
1958 }
58e23c8d 1959 else if (data) {
70685ca0 1960 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
1961 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1962 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1963
1964 } else { /* pos() not defined */
3b0527fe 1965 reginfo.ganch = strbeg;
2c296965
YO
1966 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1967 "GPOS: reginfo.ganch = strbeg\n"));
1968 }
6eb5f6b9 1969 }
288b8c02 1970 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
1971 /* We have to be careful. If the previous successful match
1972 was from this regex we don't want a subsequent partially
1973 successful match to clobber the old results.
1974 So when we detect this possibility we add a swap buffer
1975 to the re, and switch the buffer each match. If we fail
1976 we switch it back, otherwise we leave it swapped.
1977 */
1978 swap = prog->offs;
1979 /* do we need a save destructor here for eval dies? */
1980 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 1981 }
a0714e2c 1982 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1983 re_scream_pos_data d;
1984
1985 d.scream_olds = &scream_olds;
1986 d.scream_pos = &scream_pos;
288b8c02 1987 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 1988 if (!s) {
a3621e74 1989 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1990 goto phooey; /* not present */
3fa9c3d7 1991 }
6eb5f6b9
JH
1992 }
1993
1de06328 1994
6eb5f6b9
JH
1995
1996 /* Simplest case: anchored match need be tried only once. */
1997 /* [unless only anchor is BOL and multiline is set] */
bbe252da 1998 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 1999 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2000 goto got_it;
bbe252da
YO
2001 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2002 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2003 {
2004 char *end;
2005
2006 if (minlen)
2007 dontbother = minlen - 1;
1aa99e6b 2008 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2009 /* for multiline we only have to try after newlines */
33b8afdf 2010 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
2011 if (s == startpos)
2012 goto after_try;
2013 while (1) {
24b23f37 2014 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2015 goto got_it;
2016 after_try:
5339e136 2017 if (s > end)
6eb5f6b9 2018 goto phooey;
bbe252da 2019 if (prog->extflags & RXf_USE_INTUIT) {
288b8c02 2020 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
6eb5f6b9
JH
2021 if (!s)
2022 goto phooey;
2023 }
2024 else
2025 s++;
2026 }
2027 } else {
2028 if (s > startpos)
2029 s--;
2030 while (s < end) {
2031 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2032 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2033 goto got_it;
2034 }
2035 }
2036 }
2037 }
2038 goto phooey;
bbe252da 2039 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a
YO
2040 {
2041 /* the warning about reginfo.ganch being used without intialization
bbe252da 2042 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2043 and we only enter this block when the same bit is set. */
58e23c8d 2044 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2045
2046 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2047 goto got_it;
2048 goto phooey;
2049 }
2050
2051 /* Messy cases: unanchored match. */
bbe252da 2052 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9
JH
2053 /* we have /x+whatever/ */
2054 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 2055 char ch;
bf93d4cc
GS
2056#ifdef DEBUGGING
2057 int did_match = 0;
2058#endif
33b8afdf
JH
2059 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2060 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 2061 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2062
1aa99e6b 2063 if (do_utf8) {
4cadc6a9 2064 REXEC_FBC_SCAN(
6eb5f6b9 2065 if (*s == ch) {
a3621e74 2066 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2067 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2068 s += UTF8SKIP(s);
2069 while (s < strend && *s == ch)
2070 s += UTF8SKIP(s);
2071 }
4cadc6a9 2072 );
6eb5f6b9
JH
2073 }
2074 else {
4cadc6a9 2075 REXEC_FBC_SCAN(
6eb5f6b9 2076 if (*s == ch) {
a3621e74 2077 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2078 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2079 s++;
2080 while (s < strend && *s == ch)
2081 s++;
2082 }
4cadc6a9 2083 );
6eb5f6b9 2084 }
a3621e74 2085 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2086 PerlIO_printf(Perl_debug_log,
b7953727
JH
2087 "Did not find anchored character...\n")
2088 );
6eb5f6b9 2089 }
a0714e2c
SS
2090 else if (prog->anchored_substr != NULL
2091 || prog->anchored_utf8 != NULL
2092 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2093 && prog->float_max_offset < strend - s)) {
2094 SV *must;
2095 I32 back_max;
2096 I32 back_min;
2097 char *last;
6eb5f6b9 2098 char *last1; /* Last position checked before */
bf93d4cc
GS
2099#ifdef DEBUGGING
2100 int did_match = 0;
2101#endif
33b8afdf
JH
2102 if (prog->anchored_substr || prog->anchored_utf8) {
2103 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2104 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2105 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2106 back_max = back_min = prog->anchored_offset;
2107 } else {
2108 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2109 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2110 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2111 back_max = prog->float_max_offset;
2112 back_min = prog->float_min_offset;
2113 }
1de06328
YO
2114
2115
33b8afdf
JH
2116 if (must == &PL_sv_undef)
2117 /* could not downgrade utf8 check substring, so must fail */
2118 goto phooey;
2119
1de06328
YO
2120 if (back_min<0) {
2121 last = strend;
2122 } else {
2123 last = HOP3c(strend, /* Cannot start after this */
2124 -(I32)(CHR_SVLEN(must)
2125 - (SvTAIL(must) != 0) + back_min), strbeg);
2126 }
6eb5f6b9
JH
2127 if (s > PL_bostr)
2128 last1 = HOPc(s, -1);
2129 else
2130 last1 = s - 1; /* bogus */
2131
a0288114 2132 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2133 check_substr==must. */
2134 scream_pos = -1;
2135 dontbother = end_shift;
2136 strend = HOPc(strend, -dontbother);
2137 while ( (s <= last) &&
9041c2e3 2138 ((flags & REXEC_SCREAM)
1de06328 2139 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2140 end_shift, &scream_pos, 0))
1de06328 2141 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2142 (unsigned char*)strend, must,
7fba1cd6 2143 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2144 /* we may be pointing at the wrong string */
07bc277f 2145 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2146 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2147 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2148 if (HOPc(s, -back_max) > last1) {
2149 last1 = HOPc(s, -back_min);
2150 s = HOPc(s, -back_max);
2151 }
2152 else {
52657f30 2153 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2154
2155 last1 = HOPc(s, -back_min);
52657f30 2156 s = t;
6eb5f6b9 2157 }
1aa99e6b 2158 if (do_utf8) {
6eb5f6b9 2159 while (s <= last1) {
24b23f37 2160 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2161 goto got_it;
2162 s += UTF8SKIP(s);
2163 }
2164 }
2165 else {
2166 while (s <= last1) {
24b23f37 2167 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2168 goto got_it;
2169 s++;
2170 }
2171 }
2172 }
ab3bbdeb
YO
2173 DEBUG_EXECUTE_r(if (!did_match) {
2174 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2175 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2176 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2177 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2178 ? "anchored" : "floating"),
ab3bbdeb
YO
2179 quoted, RE_SV_TAIL(must));
2180 });
6eb5f6b9
JH
2181 goto phooey;
2182 }
f8fc2ecf 2183 else if ( (c = progi->regstclass) ) {
f14c76ed 2184 if (minlen) {
f8fc2ecf 2185 const OPCODE op = OP(progi->regstclass);
66e933ab 2186 /* don't bother with what can't match */
786e8c11 2187 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2188 strend = HOPc(strend, -(minlen - 1));
2189 }
a3621e74 2190 DEBUG_EXECUTE_r({
be8e71aa 2191 SV * const prop = sv_newmortal();
32fc9b6a 2192 regprop(prog, prop, c);
0df25f3d 2193 {
02daf0ab 2194 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2195 s,strend-s,60);
0df25f3d 2196 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2197 "Matching stclass %.*s against %s (%d chars)\n",
e4f74956 2198 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2199 quoted, (int)(strend - s));
0df25f3d 2200 }
ffc61ed2 2201 });
3b0527fe 2202 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2203 goto got_it;
07be1b83 2204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2205 }
2206 else {
2207 dontbother = 0;
a0714e2c 2208 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2209 /* Trim the end. */
d6a28714 2210 char *last;
33b8afdf
JH
2211 SV* float_real;
2212
2213 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2214 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2215 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
2216
2217 if (flags & REXEC_SCREAM) {
33b8afdf 2218 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2219 end_shift, &scream_pos, 1); /* last one */
2220 if (!last)
ffc61ed2 2221 last = scream_olds; /* Only one occurrence. */
4addbd3b 2222 /* we may be pointing at the wrong string */
07bc277f 2223 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2224 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2225 }
d6a28714
JH
2226 else {
2227 STRLEN len;
cfd0369c 2228 const char * const little = SvPV_const(float_real, len);
d6a28714 2229
33b8afdf 2230 if (SvTAIL(float_real)) {
d6a28714
JH
2231 if (memEQ(strend - len + 1, little, len - 1))
2232 last = strend - len + 1;
7fba1cd6 2233 else if (!multiline)
9041c2e3 2234 last = memEQ(strend - len, little, len)
bd61b366 2235 ? strend - len : NULL;
b8c5462f 2236 else
d6a28714
JH
2237 goto find_last;
2238 } else {
2239 find_last:
9041c2e3 2240 if (len)
d6a28714 2241 last = rninstr(s, strend, little, little + len);
b8c5462f 2242 else
a0288114 2243 last = strend; /* matching "$" */
b8c5462f 2244 }
b8c5462f 2245 }
bf93d4cc 2246 if (last == NULL) {
6bda09f9
YO
2247 DEBUG_EXECUTE_r(
2248 PerlIO_printf(Perl_debug_log,
2249 "%sCan't trim the tail, match fails (should not happen)%s\n",
2250 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2251 goto phooey; /* Should not happen! */
2252 }
d6a28714
JH
2253 dontbother = strend - last + prog->float_min_offset;
2254 }
2255 if (minlen && (dontbother < minlen))
2256 dontbother = minlen - 1;
2257 strend -= dontbother; /* this one's always in bytes! */
2258 /* We don't know much -- general case. */
1aa99e6b 2259 if (do_utf8) {
d6a28714 2260 for (;;) {
24b23f37 2261 if (regtry(&reginfo, &s))
d6a28714
JH
2262 goto got_it;
2263 if (s >= strend)
2264 break;
b8c5462f 2265 s += UTF8SKIP(s);
d6a28714
JH
2266 };
2267 }
2268 else {
2269 do {
24b23f37 2270 if (regtry(&reginfo, &s))
d6a28714
JH
2271 goto got_it;
2272 } while (s++ < strend);
2273 }
2274 }
2275
2276 /* Failure. */
2277 goto phooey;
2278
2279got_it:
e9105d30 2280 Safefree(swap);
288b8c02 2281 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2282
19b95bf0 2283 if (PL_reg_eval_set)
4f639d21 2284 restore_pos(aTHX_ prog);
5daac39c
NC
2285 if (RXp_PAREN_NAMES(prog))
2286 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2287
2288 /* make sure $`, $&, $', and $digit will work later */
2289 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2290 RX_MATCH_COPY_FREE(rx);
d6a28714 2291 if (flags & REXEC_COPY_STR) {
be8e71aa 2292 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2293#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2294 if ((SvIsCOW(sv)
2295 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2296 if (DEBUG_C_TEST) {
2297 PerlIO_printf(Perl_debug_log,
2298 "Copy on write: regexp capture, type %d\n",
2299 (int) SvTYPE(sv));
2300 }
2301 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2302 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2303 assert (SvPOKp(prog->saved_copy));
2304 } else
2305#endif
2306 {
288b8c02 2307 RX_MATCH_COPIED_on(rx);
ed252734
NC
2308 s = savepvn(strbeg, i);
2309 prog->subbeg = s;
2310 }
d6a28714 2311 prog->sublen = i;
d6a28714
JH
2312 }
2313 else {
2314 prog->subbeg = strbeg;
2315 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2316 }
2317 }
9041c2e3 2318
d6a28714
JH
2319 return 1;
2320
2321phooey:
a3621e74 2322 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2323 PL_colors[4], PL_colors[5]));
d6a28714 2324 if (PL_reg_eval_set)
4f639d21 2325 restore_pos(aTHX_ prog);
e9105d30 2326 if (swap) {
c74340f9 2327 /* we failed :-( roll it back */
e9105d30
GG
2328 Safefree(prog->offs);
2329 prog->offs = swap;
2330 }
2331
d6a28714
JH
2332 return 0;
2333}
2334
6bda09f9 2335
d6a28714
JH
2336/*
2337 - regtry - try match at specific point
2338 */
2339STATIC I32 /* 0 failure, 1 success */
24b23f37 2340S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2341{
97aff369 2342 dVAR;
d6a28714 2343 CHECKPOINT lastcp;
288b8c02
NC
2344 REGEXP *const rx = reginfo->prog;
2345 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2346 RXi_GET_DECL(prog,progi);
a3621e74 2347 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2348
2349 PERL_ARGS_ASSERT_REGTRY;
2350
24b23f37 2351 reginfo->cutpoint=NULL;
d6a28714 2352
bbe252da 2353 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2354 MAGIC *mg;
2355
2356 PL_reg_eval_set = RS_init;
a3621e74 2357 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2358 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2359 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2360 ));
ea8d6ae1 2361 SAVESTACK_CXPOS();
d6a28714
JH
2362 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2363 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2364 SAVETMPS;
2365 /* Apparently this is not needed, judging by wantarray. */
e8347627 2366 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2367 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2368
3b0527fe 2369 if (reginfo->sv) {
d6a28714 2370 /* Make $_ available to executed code. */
3b0527fe 2371 if (reginfo->sv != DEFSV) {
59f00321 2372 SAVE_DEFSV;
414bf5ae 2373 DEFSV_set(reginfo->sv);
b8c5462f 2374 }
d6a28714 2375
3b0527fe
DM
2376 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2377 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2378 /* prepare for quick setting of pos */
d300d9fa 2379#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2380 if (SvIsCOW(reginfo->sv))
2381 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2382#endif
3dab1dad 2383 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2384 &PL_vtbl_mglob, NULL, 0);
d6a28714 2385 mg->mg_len = -1;
b8c5462f 2386 }
d6a28714
JH
2387 PL_reg_magic = mg;
2388 PL_reg_oldpos = mg->mg_len;
4f639d21 2389 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2390 }
09687e5a 2391 if (!PL_reg_curpm) {
a02a5408 2392 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2393#ifdef USE_ITHREADS
2394 {
14a49a24 2395 SV* const repointer = &PL_sv_undef;
92313705
NC
2396 /* this regexp is also owned by the new PL_reg_curpm, which
2397 will try to free it. */
d2ece331 2398 av_push(PL_regex_padav, repointer);
09687e5a
AB
2399 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2400 PL_regex_pad = AvARRAY(PL_regex_padav);
2401 }
2402#endif
2403 }
86c29d75
NC
2404#ifdef USE_ITHREADS
2405 /* It seems that non-ithreads works both with and without this code.
2406 So for efficiency reasons it seems best not to have the code
2407 compiled when it is not needed. */
92313705
NC
2408 /* This is safe against NULLs: */
2409 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2410 /* PM_reg_curpm owns a reference to this regexp. */
2411 ReREFCNT_inc(rx);
86c29d75 2412#endif
288b8c02 2413 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2414 PL_reg_oldcurpm = PL_curpm;
2415 PL_curpm = PL_reg_curpm;
07bc277f 2416 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2417 /* Here is a serious problem: we cannot rewrite subbeg,
2418 since it may be needed if this match fails. Thus
2419 $` inside (?{}) could fail... */
2420 PL_reg_oldsaved = prog->subbeg;
2421 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2422#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2423 PL_nrs = prog->saved_copy;
2424#endif
07bc277f 2425 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2426 }
2427 else
bd61b366 2428 PL_reg_oldsaved = NULL;
d6a28714
JH
2429 prog->subbeg = PL_bostr;
2430 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2431 }
24b23f37 2432 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2433 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2434 PL_reginput = *startpos;
d6a28714 2435 PL_reglastparen = &prog->lastparen;
a01268b5 2436 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2437 prog->lastparen = 0;
03994de8 2438 prog->lastcloseparen = 0;
d6a28714 2439 PL_regsize = 0;
f0ab9afb 2440 PL_regoffs = prog->offs;
d6a28714
JH
2441 if (PL_reg_start_tmpl <= prog->nparens) {
2442 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2443 if(PL_reg_start_tmp)
2444 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2445 else
a02a5408 2446 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2447 }
2448
2449 /* XXXX What this code is doing here?!!! There should be no need
2450 to do this again and again, PL_reglastparen should take care of
3dd2943c 2451 this! --ilya*/
dafc8851
JH
2452
2453 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2454 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2455 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2456 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2457 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2458 * Meanwhile, this code *is* needed for the
daf18116
JH
2459 * above-mentioned test suite tests to succeed. The common theme
2460 * on those tests seems to be returning null fields from matches.
225593e1 2461 * --jhi updated by dapm */
dafc8851 2462#if 1
d6a28714 2463 if (prog->nparens) {
f0ab9afb 2464 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2465 register I32 i;
eb160463 2466 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2467 ++pp;
2468 pp->start = -1;
2469 pp->end = -1;
d6a28714
JH
2470 }
2471 }
dafc8851 2472#endif
02db2b7b 2473 REGCP_SET(lastcp);
f8fc2ecf 2474 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2475 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2476 return 1;
2477 }
24b23f37
YO
2478 if (reginfo->cutpoint)
2479 *startpos= reginfo->cutpoint;
02db2b7b 2480 REGCP_UNWIND(lastcp);
d6a28714
JH
2481 return 0;
2482}
2483
02db2b7b 2484
8ba1375e
MJD
2485#define sayYES goto yes
2486#define sayNO goto no
262b90c4 2487#define sayNO_SILENT goto no_silent
8ba1375e 2488
f9f4320a
YO
2489/* we dont use STMT_START/END here because it leads to
2490 "unreachable code" warnings, which are bogus, but distracting. */
2491#define CACHEsayNO \
c476f425
DM
2492 if (ST.cache_mask) \
2493 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2494 sayNO
3298f257 2495
a3621e74 2496/* this is used to determine how far from the left messages like
265c4333
YO
2497 'failed...' are printed. It should be set such that messages
2498 are inline with the regop output that created them.
a3621e74 2499*/
265c4333 2500#define REPORT_CODE_OFF 32
a3621e74
YO
2501
2502
2503/* Make sure there is a test for this +1 options in re_tests */
2504#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2505
40a82448
DM
2506#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2507#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2508
86545054
DM
2509#define SLAB_FIRST(s) (&(s)->states[0])
2510#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2511
5d9a96ca
DM
2512/* grab a new slab and return the first slot in it */
2513
2514STATIC regmatch_state *
2515S_push_slab(pTHX)
2516{
a35a87e7 2517#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2518 dMY_CXT;
2519#endif
5d9a96ca
DM
2520 regmatch_slab *s = PL_regmatch_slab->next;
2521 if (!s) {
2522 Newx(s, 1, regmatch_slab);
2523 s->prev = PL_regmatch_slab;
2524 s->next = NULL;
2525 PL_regmatch_slab->next = s;
2526 }
2527 PL_regmatch_slab = s;
86545054 2528 return SLAB_FIRST(s);
5d9a96ca 2529}
5b47454d 2530
95b24440 2531
40a82448
DM
2532/* push a new state then goto it */
2533
2534#define PUSH_STATE_GOTO(state, node) \
2535 scan = node; \
2536 st->resume_state = state; \
2537 goto push_state;
2538
2539/* push a new state with success backtracking, then goto it */
2540
2541#define PUSH_YES_STATE_GOTO(state, node) \
2542 scan = node; \
2543 st->resume_state = state; \
2544 goto push_yes_state;
2545
aa283a38 2546
aa283a38 2547
d6a28714 2548/*
95b24440 2549
bf1f174e
DM
2550regmatch() - main matching routine
2551
2552This is basically one big switch statement in a loop. We execute an op,
2553set 'next' to point the next op, and continue. If we come to a point which
2554we may need to backtrack to on failure such as (A|B|C), we push a
2555backtrack state onto the backtrack stack. On failure, we pop the top
2556state, and re-enter the loop at the state indicated. If there are no more
2557states to pop, we return failure.
2558
2559Sometimes we also need to backtrack on success; for example /A+/, where
2560after successfully matching one A, we need to go back and try to
2561match another one; similarly for lookahead assertions: if the assertion
2562completes successfully, we backtrack to the state just before the assertion
2563and then carry on. In these cases, the pushed state is marked as
2564'backtrack on success too'. This marking is in fact done by a chain of
2565pointers, each pointing to the previous 'yes' state. On success, we pop to
2566the nearest yes state, discarding any intermediate failure-only states.
2567Sometimes a yes state is pushed just to force some cleanup code to be
2568called at the end of a successful match or submatch; e.g. (??{$re}) uses
2569it to free the inner regex.
2570
2571Note that failure backtracking rewinds the cursor position, while
2572success backtracking leaves it alone.
2573
2574A pattern is complete when the END op is executed, while a subpattern
2575such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2576ops trigger the "pop to last yes state if any, otherwise return true"
2577behaviour.
2578
2579A common convention in this function is to use A and B to refer to the two
2580subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2581the subpattern to be matched possibly multiple times, while B is the entire
2582rest of the pattern. Variable and state names reflect this convention.
2583
2584The states in the main switch are the union of ops and failure/success of
2585substates associated with with that op. For example, IFMATCH is the op
2586that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2587'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2588successfully matched A and IFMATCH_A_fail is a state saying that we have
2589just failed to match A. Resume states always come in pairs. The backtrack
2590state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2591at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2592on success or failure.
2593
2594The struct that holds a backtracking state is actually a big union, with
2595one variant for each major type of op. The variable st points to the
2596top-most backtrack struct. To make the code clearer, within each
2597block of code we #define ST to alias the relevant union.
2598
2599Here's a concrete example of a (vastly oversimplified) IFMATCH
2600implementation:
2601
2602 switch (state) {
2603 ....
2604
2605#define ST st->u.ifmatch
2606
2607 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2608 ST.foo = ...; // some state we wish to save
95b24440 2609 ...
bf1f174e
DM
2610 // push a yes backtrack state with a resume value of
2611 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2612 // first node of A:
2613 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2614 // NOTREACHED
2615
2616 case IFMATCH_A: // we have successfully executed A; now continue with B
2617 next = B;
2618 bar = ST.foo; // do something with the preserved value
2619 break;
2620
2621 case IFMATCH_A_fail: // A failed, so the assertion failed
2622 ...; // do some housekeeping, then ...
2623 sayNO; // propagate the failure
2624
2625#undef ST
95b24440 2626
bf1f174e
DM
2627 ...
2628 }
95b24440 2629
bf1f174e
DM
2630For any old-timers reading this who are familiar with the old recursive
2631approach, the code above is equivalent to:
95b24440 2632
bf1f174e
DM
2633 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2634 {
2635 int foo = ...
95b24440 2636 ...
bf1f174e
DM
2637 if (regmatch(A)) {
2638 next = B;
2639 bar = foo;
2640 break;
95b24440 2641 }
bf1f174e
DM
2642 ...; // do some housekeeping, then ...
2643 sayNO; // propagate the failure
95b24440 2644 }
bf1f174e
DM
2645
2646The topmost backtrack state, pointed to by st, is usually free. If you
2647want to claim it, populate any ST.foo fields in it with values you wish to
2648save, then do one of
2649
2650 PUSH_STATE_GOTO(resume_state, node);
2651 PUSH_YES_STATE_GOTO(resume_state, node);
2652
2653which sets that backtrack state's resume value to 'resume_state', pushes a
2654new free entry to the top of the backtrack stack, then goes to 'node'.
2655On backtracking, the free slot is popped, and the saved state becomes the
2656new free state. An ST.foo field in this new top state can be temporarily
2657accessed to retrieve values, but once the main loop is re-entered, it
2658becomes available for reuse.
2659
2660Note that the depth of the backtrack stack constantly increases during the
2661left-to-right execution of the pattern, rather than going up and down with
2662the pattern nesting. For example the stack is at its maximum at Z at the
2663end of the pattern, rather than at X in the following:
2664
2665 /(((X)+)+)+....(Y)+....Z/
2666
2667The only exceptions to this are lookahead/behind assertions and the cut,
2668(?>A), which pop all the backtrack states associated with A before
2669continuing.
2670
2671Bascktrack state structs are allocated in slabs of about 4K in size.
2672PL_regmatch_state and st always point to the currently active state,
2673and PL_regmatch_slab points to the slab currently containing
2674PL_regmatch_state. The first time regmatch() is called, the first slab is
2675allocated, and is never freed until interpreter destruction. When the slab
2676is full, a new one is allocated and chained to the end. At exit from
2677regmatch(), slabs allocated since entry are freed.
2678
2679*/
95b24440 2680
40a82448 2681
5bc10b2c 2682#define DEBUG_STATE_pp(pp) \
265c4333 2683 DEBUG_STATE_r({ \
5bc10b2c
DM
2684 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2685 PerlIO_printf(Perl_debug_log, \
5d458dd8 2686 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2687 depth*2, "", \
13d6edb4 2688 PL_reg_name[st->resume_state], \
5d458dd8
YO
2689 ((st==yes_state||st==mark_state) ? "[" : ""), \
2690 ((st==yes_state) ? "Y" : ""), \
2691 ((st==mark_state) ? "M" : ""), \
2692 ((st==yes_state||st==mark_state) ? "]" : "") \
2693 ); \
265c4333 2694 });
5bc10b2c 2695
40a82448 2696
3dab1dad 2697#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2698
3df15adc 2699#ifdef DEBUGGING
5bc10b2c 2700
ab3bbdeb 2701STATIC void
efd26800 2702S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
ab3bbdeb
YO
2703 const char *start, const char *end, const char *blurb)
2704{
efd26800 2705 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2706
2707 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2708
ab3bbdeb
YO
2709 if (!PL_colorset)
2710 reginitcolors();
2711 {
2712 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2713 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb
YO
2714
2715 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2716 start, end - start, 60);
2717
2718 PerlIO_printf(Perl_debug_log,
2719 "%s%s REx%s %s against %s\n",
2720 PL_colors[4], blurb, PL_colors[5], s0, s1);
2721
2722 if (do_utf8||utf8_pat)
1de06328
YO
2723 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2724 utf8_pat ? "pattern" : "",
2725 utf8_pat && do_utf8 ? " and " : "",
2726 do_utf8 ? "string" : ""
ab3bbdeb
YO
2727 );
2728 }
2729}
3df15adc
YO
2730
2731STATIC void
786e8c11
YO
2732S_dump_exec_pos(pTHX_ const char *locinput,
2733 const regnode *scan,
2734 const char *loc_regeol,
2735 const char *loc_bostr,
2736 const char *loc_reg_starttry,
2737 const bool do_utf8)
07be1b83 2738{
786e8c11 2739 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2740 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2741 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2742 /* The part of the string before starttry has one color
2743 (pref0_len chars), between starttry and current
2744 position another one (pref_len - pref0_len chars),
2745 after the current position the third one.
2746 We assume that pref0_len <= pref_len, otherwise we
2747 decrease pref0_len. */
786e8c11
YO
2748 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2749 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2750 int pref0_len;
2751
7918f24d
NC
2752 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2753
07be1b83
YO
2754 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2755 pref_len++;
786e8c11
YO
2756 pref0_len = pref_len - (locinput - loc_reg_starttry);
2757 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2758 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2759 ? (5 + taill) - pref_len : loc_regeol - locinput);
07be1b83
YO
2760 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2761 l--;
2762 if (pref0_len < 0)
2763 pref0_len = 0;
2764 if (pref0_len > pref_len)
2765 pref0_len = pref_len;
2766 {
3df15adc 2767 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2768
ab3bbdeb 2769 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2770 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2771
ab3bbdeb 2772 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2773 (locinput - pref_len + pref0_len),
1de06328 2774 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2775
ab3bbdeb 2776 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2777 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2778
1de06328 2779 const STRLEN tlen=len0+len1+len2;
3df15adc 2780 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2781 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2782 (IV)(locinput - loc_bostr),
07be1b83 2783 len0, s0,
07be1b83 2784 len1, s1,
07be1b83 2785 (docolor ? "" : "> <"),
07be1b83 2786 len2, s2,
f9f4320a 2787 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
2788 "");
2789 }
2790}
3df15adc 2791
07be1b83
YO
2792#endif
2793
0a4db386
YO
2794/* reg_check_named_buff_matched()
2795 * Checks to see if a named buffer has matched. The data array of
2796 * buffer numbers corresponding to the buffer is expected to reside
2797 * in the regexp->data->data array in the slot stored in the ARG() of
2798 * node involved. Note that this routine doesn't actually care about the
2799 * name, that information is not preserved from compilation to execution.
2800 * Returns the index of the leftmost defined buffer with the given name
2801 * or 0 if non of the buffers matched.
2802 */
2803STATIC I32
7918f24d
NC
2804S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2805{
0a4db386 2806 I32 n;
f8fc2ecf 2807 RXi_GET_DECL(rex,rexi);
ad64d0ec 2808 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 2809 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
2810
2811 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2812
0a4db386
YO
2813 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2814 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 2815 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
2816 {
2817 return nums[n];
2818 }
2819 }
2820 return 0;
2821}
2822
2f554ef7
DM
2823
2824/* free all slabs above current one - called during LEAVE_SCOPE */
2825
2826STATIC void
2827S_clear_backtrack_stack(pTHX_ void *p)
2828{
2829 regmatch_slab *s = PL_regmatch_slab->next;
2830 PERL_UNUSED_ARG(p);
2831
2832 if (!s)
2833 return;
2834 PL_regmatch_slab->next = NULL;
2835 while (s) {
2836 regmatch_slab * const osl = s;
2837 s = s->next;
2838 Safefree(osl);
2839 }
2840}
2841
2842
28d8d7f4
YO
2843#define SETREX(Re1,Re2) \
2844 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2845 Re1 = (Re2)
2846
d6a28714 2847STATIC I32 /* 0 failure, 1 success */
24b23f37 2848S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 2849{
a35a87e7 2850#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2851 dMY_CXT;
2852#endif
27da23d5 2853 dVAR;
95b24440 2854 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2855 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
2856 REGEXP *rex_sv = reginfo->prog;
2857 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 2858 RXi_GET_DECL(rex,rexi);
2f554ef7 2859 I32 oldsave;
5d9a96ca
DM
2860 /* the current state. This is a cached copy of PL_regmatch_state */
2861 register regmatch_state *st;
5d9a96ca
DM
2862 /* cache heavy used fields of st in registers */
2863 register regnode *scan;
2864 register regnode *next;
438e9bae 2865 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 2866 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 2867 register char *locinput = PL_reginput;
5d9a96ca 2868 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 2869
b69b0499 2870 bool result = 0; /* return value of S_regmatch */
24d3c4a9 2871 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
2872 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2873 const U32 max_nochange_depth =
2874 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2875 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
2876 regmatch_state *yes_state = NULL; /* state to pop to on success of
2877 subpattern */
e2e6a0f1
YO
2878 /* mark_state piggy backs on the yes_state logic so that when we unwind
2879 the stack on success we can update the mark_state as we go */
2880 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 2881 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 2882 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 2883 U32 state_num;
5d458dd8
YO
2884 bool no_final = 0; /* prevent failure from backtracking? */
2885 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 2886 char *startpoint = PL_reginput;
5d458dd8
YO
2887 SV *popmark = NULL; /* are we looking for a mark? */
2888 SV *sv_commit = NULL; /* last mark name seen in failure */
2889 SV *sv_yes_mark = NULL; /* last mark name we have seen
2890 during a successfull match */
2891 U32 lastopen = 0; /* last open we saw */
2892 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 2893 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
2894 /* these three flags are set by various ops to signal information to
2895 * the very next op. They have a useful lifetime of exactly one loop
2896 * iteration, and are not preserved or restored by state pushes/pops
2897 */
2898 bool sw = 0; /* the condition value in (?(cond)a|b) */
2899 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2900 int logical = 0; /* the following EVAL is:
2901 0: (?{...})
2902 1: (?(?{...})X|Y)
2903 2: (??{...})
2904 or the following IFMATCH/UNLESSM is:
2905 false: plain (?=foo)
2906 true: used as a condition: (?(?=foo))
2907 */
95b24440 2908#ifdef DEBUGGING
e68ec53f 2909 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2910#endif
2911
7918f24d
NC
2912 PERL_ARGS_ASSERT_REGMATCH;
2913
3b57cd43 2914 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 2915 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 2916 }));
5d9a96ca
DM
2917 /* on first ever call to regmatch, allocate first slab */
2918 if (!PL_regmatch_slab) {
2919 Newx(PL_regmatch_slab, 1, regmatch_slab);
2920 PL_regmatch_slab->prev = NULL;
2921 PL_regmatch_slab->next = NULL;
86545054 2922 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2923 }
2924
2f554ef7
DM
2925 oldsave = PL_savestack_ix;
2926 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2927 SAVEVPTR(PL_regmatch_slab);
2928 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
2929
2930 /* grab next free state slot */
2931 st = ++PL_regmatch_state;
86545054 2932 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2933 st = PL_regmatch_state = S_push_slab(aTHX);
2934
d6a28714
JH
2935 /* Note that nextchr is a byte even in UTF */
2936 nextchr = UCHARAT(locinput);
2937 scan = prog;
2938 while (scan != NULL) {
8ba1375e 2939
a3621e74 2940 DEBUG_EXECUTE_r( {
6136c704 2941 SV * const prop = sv_newmortal();
1de06328 2942 regnode *rnext=regnext(scan);
786e8c11 2943 DUMP_EXEC_POS( locinput, scan, do_utf8 );
32fc9b6a 2944 regprop(rex, prop, scan);
07be1b83
YO
2945
2946 PerlIO_printf(Perl_debug_log,
2947 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 2948 (IV)(scan - rexi->program), depth*2, "",
07be1b83 2949 SvPVX_const(prop),
1de06328 2950 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 2951 0 : (IV)(rnext - rexi->program));
2a782b5b 2952 });
d6a28714
JH
2953
2954 next = scan + NEXT_OFF(scan);
2955 if (next == scan)
2956 next = NULL;
40a82448 2957 state_num = OP(scan);
d6a28714 2958
40a82448 2959 reenter_switch:
34a81e2b
B
2960
2961 assert(PL_reglastparen == &rex->lastparen);
2962 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2963 assert(PL_regoffs == rex->offs);
2964
40a82448 2965 switch (state_num) {
d6a28714 2966 case BOL:
7fba1cd6 2967 if (locinput == PL_bostr)
d6a28714 2968 {
3b0527fe 2969 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2970 break;
2971 }
d6a28714
JH
2972 sayNO;
2973 case MBOL:
12d33761
HS
2974 if (locinput == PL_bostr ||
2975 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2976 {
b8c5462f
JH
2977 break;
2978 }
d6a28714
JH
2979 sayNO;
2980 case SBOL:
c2a73568 2981 if (locinput == PL_bostr)
b8c5462f 2982 break;
d6a28714
JH
2983 sayNO;
2984 case GPOS:
3b0527fe 2985 if (locinput == reginfo->ganch)
d6a28714
JH
2986 break;
2987 sayNO;
ee9b8eae
YO
2988
2989 case KEEPS:
2990 /* update the startpoint */
f0ab9afb 2991 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 2992 PL_reginput = locinput;
f0ab9afb 2993 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
2994 PUSH_STATE_GOTO(KEEPS_next, next);
2995 /*NOT-REACHED*/
2996 case KEEPS_next_fail:
2997 /* rollback the start point change */
f0ab9afb 2998 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
2999 sayNO_SILENT;
3000 /*NOT-REACHED*/
d6a28714 3001 case EOL:
d6a28714
JH
3002 goto seol;
3003 case MEOL:
d6a28714 3004 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3005 sayNO;
b8c5462f 3006 break;
d6a28714
JH
3007 case SEOL:
3008 seol:
3009 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3010 sayNO;
d6a28714 3011 if (PL_regeol - locinput > 1)
b8c5462f 3012 sayNO;
b8c5462f 3013 break;
d6a28714
JH
3014 case EOS:
3015 if (PL_regeol != locinput)
b8c5462f 3016 sayNO;
d6a28714 3017 break;
ffc61ed2 3018 case SANY:
d6a28714 3019 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3020 sayNO;
f33976b4
DB
3021 if (do_utf8) {
3022 locinput += PL_utf8skip[nextchr];
3023 if (locinput > PL_regeol)
3024 sayNO;
3025 nextchr = UCHARAT(locinput);
3026 }
3027 else
3028 nextchr = UCHARAT(++locinput);
3029 break;
3030 case CANY:
3031 if (!nextchr && locinput >= PL_regeol)
3032 sayNO;
b8c5462f 3033 nextchr = UCHARAT(++locinput);
a0d0e21e 3034 break;
ffc61ed2 3035 case REG_ANY:
1aa99e6b
IH
3036 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3037 sayNO;
3038 if (do_utf8) {
b8c5462f 3039 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3040 if (locinput > PL_regeol)
3041 sayNO;
a0ed51b3 3042 nextchr = UCHARAT(locinput);
a0ed51b3 3043 }
1aa99e6b
IH
3044 else
3045 nextchr = UCHARAT(++locinput);
a0ed51b3 3046 break;
166ba7cd
DM
3047
3048#undef ST
3049#define ST st->u.trie
786e8c11
YO
3050 case TRIEC:
3051 /* In this case the charclass data is available inline so
3052 we can fail fast without a lot of extra overhead.
3053 */
3054 if (scan->flags == EXACT || !do_utf8) {
3055 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3056 DEBUG_EXECUTE_r(
3057 PerlIO_printf(Perl_debug_log,
3058 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3059 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3060 );
3061 sayNO_SILENT;
3062 /* NOTREACHED */
3063 }
3064 }
3065 /* FALL THROUGH */
5b47454d 3066 case TRIE:
3dab1dad 3067 {
07be1b83 3068 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3069 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3070
3071 /* what trie are we using right now */
be8e71aa 3072 reg_trie_data * const trie
f8fc2ecf 3073 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3074 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3075 U32 state = trie->startstate;
166ba7cd 3076
3dab1dad
YO
3077 if (trie->bitmap && trie_type != trie_utf8_fold &&
3078 !TRIE_BITMAP_TEST(trie,*locinput)
3079 ) {
3080 if (trie->states[ state ].wordnum) {
3081 DEBUG_EXECUTE_r(
3082 PerlIO_printf(Perl_debug_log,
3083 "%*s %smatched empty string...%s\n",
5bc10b2c 3084 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3085 );
3086 break;
3087 } else {
3088 DEBUG_EXECUTE_r(
3089 PerlIO_printf(Perl_debug_log,
786e8c11 3090 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3091 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3092 );
3093 sayNO_SILENT;
3094 }
3095 }
166ba7cd 3096
786e8c11
YO
3097 {
3098 U8 *uc = ( U8* )locinput;
3099
3100 STRLEN len = 0;
3101 STRLEN foldlen = 0;
3102 U8 *uscan = (U8*)NULL;
3103 STRLEN bufflen=0;
3104 SV *sv_accept_buff = NULL;
3105 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3106
3107 ST.accepted = 0; /* how many accepting states we have seen */
3108 ST.B = next;
3109 ST.jump = trie->jump;
786e8c11 3110 ST.me = scan;
07be1b83
YO
3111 /*
3112 traverse the TRIE keeping track of all accepting states
3113 we transition through until we get to a failing node.
3114 */
3115
a3621e74 3116 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3117 U32 base = trie->states[ state ].trans.base;
f9f4320a 3118 UV uvc = 0;
786e8c11
YO
3119 U16 charid;
3120 /* We use charid to hold the wordnum as we don't use it
3121 for charid until after we have done the wordnum logic.
3122 We define an alias just so that the wordnum logic reads
3123 more naturally. */
3124
3125#define got_wordnum charid
3126 got_wordnum = trie->states[ state ].wordnum;
3127
3128 if ( got_wordnum ) {
3129 if ( ! ST.accepted ) {
5b47454d 3130 ENTER;
6962fb1a 3131 SAVETMPS; /* XXX is this necessary? dmq */
5b47454d
DM
3132 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3133 sv_accept_buff=newSV(bufflen *
3134 sizeof(reg_trie_accepted) - 1);
786e8c11 3135 SvCUR_set(sv_accept_buff, 0);
5b47454d
DM
3136 SvPOK_on(sv_accept_buff);
3137 sv_2mortal(sv_accept_buff);
166ba7cd
DM
3138 SAVETMPS;
3139 ST.accept_buff =
5b47454d
DM
3140 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3141 }
786e8c11 3142 do {
166ba7cd 3143 if (ST.accepted >= bufflen) {
5b47454d 3144 bufflen *= 2;
166ba7cd 3145 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
3146 SvGROW(sv_accept_buff,
3147 bufflen * sizeof(reg_trie_accepted));
3148 }
3149 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3150 + sizeof(reg_trie_accepted));
a3621e74 3151
786e8c11
YO
3152
3153 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3154 ST.accept_buff[ST.accepted].endpos = uc;
3155 ++ST.accepted;
3156 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3157 }
3158#undef got_wordnum
a3621e74 3159
07be1b83 3160 DEBUG_TRIE_EXECUTE_r({
786e8c11 3161 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
a3621e74 3162 PerlIO_printf( Perl_debug_log,
786e8c11 3163 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
5bc10b2c 3164 2+depth * 2, "", PL_colors[4],
786e8c11 3165 (UV)state, (UV)ST.accepted );
07be1b83 3166 });
a3621e74
YO
3167
3168 if ( base ) {
55eed653
NC
3169 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3170 uscan, len, uvc, charid, foldlen,
3171 foldbuf, uniflags);
a3621e74 3172
5b47454d
DM
3173 if (charid &&
3174 (base + charid > trie->uniquecharcount )
3175 && (base + charid - 1 - trie->uniquecharcount
3176 < trie->lasttrans)
3177 && trie->trans[base + charid - 1 -
3178 trie->uniquecharcount].check == state)
3179 {
3180 state = trie->trans[base + charid - 1 -
3181 trie->uniquecharcount ].next;
3182 }
3183 else {
3184 state = 0;
3185 }
3186 uc += len;
3187
3188 }
3189 else {
a3621e74
YO
3190 state = 0;
3191 }
3192 DEBUG_TRIE_EXECUTE_r(
e4584336 3193 PerlIO_printf( Perl_debug_log,
786e8c11 3194 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3195 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3196 );
3197 }
166ba7cd 3198 if (!ST.accepted )
a3621e74 3199 sayNO;
a3621e74 3200
166ba7cd
DM
3201 DEBUG_EXECUTE_r(
3202 PerlIO_printf( Perl_debug_log,
3203 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3204 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3205 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3206 );
786e8c11 3207 }}
fae667d5
YO
3208 goto trie_first_try; /* jump into the fail handler */
3209 /* NOTREACHED */
166ba7cd 3210 case TRIE_next_fail: /* we failed - try next alterative */
fae667d5
YO
3211 if ( ST.jump) {
3212 REGCP_UNWIND(ST.cp);
3213 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3214 PL_regoffs[n].end = -1;
fae667d5
YO
3215 *PL_reglastparen = n;
3216 }
3217 trie_first_try:
5d458dd8
YO
3218 if (do_cutgroup) {
3219 do_cutgroup = 0;
3220 no_final = 0;
3221 }
fae667d5
YO
3222
3223 if ( ST.jump) {
3224 ST.lastparen = *PL_reglastparen;
3225 REGCP_SET(ST.cp);
3226 }
166ba7cd
DM
3227 if ( ST.accepted == 1 ) {
3228 /* only one choice left - just continue */
3229 DEBUG_EXECUTE_r({
2b8b4781 3230 AV *const trie_words
502c6561 3231 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3232 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3233 ST.accept_buff[ 0 ].wordnum-1, 0 );
de734bd5
A
3234 SV *sv= tmp ? sv_newmortal() : NULL;
3235
166ba7cd
DM
3236 PerlIO_printf( Perl_debug_log,
3237 "%*s %sonly one match left: #%d <%s>%s\n",
5bc10b2c 3238 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3239 ST.accept_buff[ 0 ].wordnum,
de734bd5
A
3240 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3241 PL_colors[0], PL_colors[1],
3242 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3243 )
3244 : "not compiled under -Dr",
166ba7cd
DM
3245 PL_colors[5] );
3246 });
3247 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3248 /* in this case we free tmps/leave before we call regmatch
3249 as we wont be using accept_buff again. */
5d458dd8 3250
166ba7cd
DM
3251 locinput = PL_reginput;
3252 nextchr = UCHARAT(locinput);
5d458dd8
YO
3253 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3254 scan = ST.B;
3255 else
3256 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3257 if (!has_cutgroup) {
3258 FREETMPS;
3259 LEAVE;
3260 } else {
3261 ST.accepted--;
3262 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3263 }
786e8c11 3264
166ba7cd
DM
3265 continue; /* execute rest of RE */
3266 }
fae667d5
YO
3267
3268 if ( !ST.accepted-- ) {
5d458dd8
YO
3269 DEBUG_EXECUTE_r({
3270 PerlIO_printf( Perl_debug_log,
3271 "%*s %sTRIE failed...%s\n",
3272 REPORT_CODE_OFF+depth*2, "",
3273 PL_colors[4],
3274 PL_colors[5] );
3275 });
166ba7cd
DM
3276 FREETMPS;
3277 LEAVE;
5d458dd8 3278 sayNO_SILENT;
fae667d5
YO
3279 /*NOTREACHED*/
3280 }
166ba7cd 3281
a3621e74 3282 /*
166ba7cd
DM
3283 There are at least two accepting states left. Presumably
3284 the number of accepting states is going to be low,
3285 typically two. So we simply scan through to find the one
3286 with lowest wordnum. Once we find it, we swap the last
3287 state into its place and decrement the size. We then try to
3288 match the rest of the pattern at the point where the word
3289 ends. If we succeed, control just continues along the
3290 regex; if we fail we return here to try the next accepting
3291 state
3292 */
a3621e74 3293
166ba7cd
DM
3294 {
3295 U32 best = 0;
3296 U32 cur;
3297 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3298 DEBUG_TRIE_EXECUTE_r(
f2278c82 3299 PerlIO_printf( Perl_debug_log,
166ba7cd 3300 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
5bc10b2c 3301 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
166ba7cd
DM
3302 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3303 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3304 );
3305
3306 if (ST.accept_buff[cur].wordnum <
3307 ST.accept_buff[best].wordnum)
3308 best = cur;
a3621e74 3309 }
166ba7cd
DM
3310
3311 DEBUG_EXECUTE_r({
2b8b4781 3312 AV *const trie_words
502c6561 3313 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
2b8b4781 3314 SV ** const tmp = av_fetch( trie_words,
5c9f2f80 3315 ST.accept_buff[ best ].wordnum - 1, 0 );
7f69552c 3316 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
265c4333 3317 ST.B :
7f69552c 3318 ST.me + ST.jump[ST.accept_buff[best].wordnum];
de734bd5
A
3319 SV *sv= tmp ? sv_newmortal() : NULL;
3320
265c4333
YO
3321 PerlIO_printf( Perl_debug_log,
3322 "%*s %strying alternation #%d <%s> at node #%d %s\n",
5bc10b2c 3323 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
166ba7cd 3324 ST.accept_buff[best].wordnum,
de734bd5
A
3325 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3326 PL_colors[0], PL_colors[1],
3327 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3328 ) : "not compiled under -Dr",
265c4333 3329 REG_NODE_NUM(nextop),
166ba7cd
DM
3330 PL_colors[5] );
3331 });
3332
3333 if ( best<ST.accepted ) {
3334 reg_trie_accepted tmp = ST.accept_buff[ best ];
3335 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3336 ST.accept_buff[ ST.accepted ] = tmp;
3337 best = ST.accepted;
a3621e74 3338 }
166ba7cd 3339 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
7f69552c 3340 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
5d458dd8 3341 scan = ST.B;
786e8c11 3342 } else {
5d458dd8 3343 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
5d458dd8 3344 }
6b173516 3345 PUSH_YES_STATE_GOTO(TRIE_next, scan);
786e8c11 3346 /* NOTREACHED */
166ba7cd 3347 }
166ba7cd 3348 /* NOTREACHED */
5d458dd8 3349 case TRIE_next:
6962fb1a
YO
3350 /* we dont want to throw this away, see bug 57042*/
3351 if (oreplsv != GvSV(PL_replgv))
3352 sv_setsv(oreplsv, GvSV(PL_replgv));
5d458dd8
YO
3353 FREETMPS;
3354 LEAVE;
3355 sayYES;
166ba7cd
DM
3356#undef ST
3357
95b24440
DM
3358 case EXACT: {
3359 char *s = STRING(scan);
24d3c4a9 3360 ln = STR_LEN(scan);
eb160463 3361 if (do_utf8 != UTF) {
bc517b45 3362 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3363 char *l = locinput;
24d3c4a9 3364 const char * const e = s + ln;
a72c7584 3365
5ff6fc6d
JH
3366 if (do_utf8) {
3367 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3368 while (s < e) {
a3b680e6 3369 STRLEN ulen;
1aa99e6b 3370 if (l >= PL_regeol)
5ff6fc6d
JH
3371 sayNO;
3372 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3373 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3374 uniflags))
5ff6fc6d 3375 sayNO;
bc517b45 3376 l += ulen;
5ff6fc6d 3377 s ++;
1aa99e6b 3378 }
5ff6fc6d
JH
3379 }
3380 else {
3381 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3382 while (s < e) {
a3b680e6 3383 STRLEN ulen;
1aa99e6b
IH
3384 if (l >= PL_regeol)
3385 sayNO;
5ff6fc6d 3386 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3387 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3388 uniflags))
1aa99e6b 3389 sayNO;
bc517b45 3390 s += ulen;
a72c7584 3391 l ++;
1aa99e6b 3392 }
5ff6fc6d 3393 }
1aa99e6b
IH
3394 locinput = l;
3395 nextchr = UCHARAT(locinput);
3396 break;
3397 }
bc517b45 3398 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3399 /* Inline the first character, for speed. */
3400 if (UCHARAT(s) != nextchr)
3401 sayNO;
24d3c4a9 3402 if (PL_regeol - locinput < ln)
d6a28714 3403 sayNO;
24d3c4a9 3404 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3405 sayNO;
24d3c4a9 3406 locinput += ln;
d6a28714
JH
3407 nextchr = UCHARAT(locinput);
3408 break;
95b24440 3409 }
d6a28714 3410 case EXACTFL:
b8c5462f
JH
3411 PL_reg_flags |= RF_tainted;
3412 /* FALL THROUGH */
95b24440 3413 case EXACTF: {
be8e71aa 3414 char * const s = STRING(scan);
24d3c4a9 3415 ln = STR_LEN(scan);
d6a28714 3416
d07ddd77
JH
3417 if (do_utf8 || UTF) {
3418 /* Either target or the pattern are utf8. */
be8e71aa 3419 const char * const l = locinput;
d07ddd77 3420 char *e = PL_regeol;
bc517b45 3421
24d3c4a9 3422 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 3423 l, &e, 0, do_utf8)) {
5486206c
JH
3424 /* One more case for the sharp s:
3425 * pack("U0U*", 0xDF) =~ /ss/i,
3426 * the 0xC3 0x9F are the UTF-8
3427 * byte sequence for the U+00DF. */
e1d1eefb 3428
5486206c 3429 if (!(do_utf8 &&
e1d1eefb 3430 toLOWER(s[0]) == 's' &&
24d3c4a9 3431 ln >= 2 &&
5486206c
JH
3432 toLOWER(s[1]) == 's' &&
3433 (U8)l[0] == 0xC3 &&
3434 e - l >= 2 &&
3435 (U8)l[1] == 0x9F))
3436 sayNO;
3437 }
d07ddd77
JH
3438 locinput = e;
3439 nextchr = UCHARAT(locinput);
3440 break;
a0ed51b3 3441 }
d6a28714 3442
bc517b45
JH
3443 /* Neither the target and the pattern are utf8. */
3444
d6a28714
JH
3445 /* Inline the first character, for speed. */
3446 if (UCHARAT(s) != nextchr &&
3447 UCHARAT(s) != ((OP(scan) == EXACTF)
3448 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 3449 sayNO;
24d3c4a9 3450 if (PL_regeol - locinput < ln)
b8c5462f 3451 sayNO;
24d3c4a9
DM
3452 if (ln > 1 && (OP(scan) == EXACTF
3453 ? ibcmp(s, locinput, ln)
3454 : ibcmp_locale(s, locinput, ln)))
4633a7c4 3455 sayNO;
24d3c4a9 3456 locinput += ln;
d6a28714 3457 nextchr = UCHARAT(locinput);
a0d0e21e 3458 break;
95b24440 3459 }
b2680017
YO
3460 case BOUNDL:
3461 case NBOUNDL:
3462 PL_reg_flags |= RF_tainted;
3463 /* FALL THROUGH */
3464 case BOUND:
3465 case NBOUND:
3466 /* was last char in word? */
3467 if (do_utf8) {
3468 if (locinput == PL_bostr)
3469 ln = '\n';
3470 else {
3471 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3472
3473 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3474 }
3475 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3476 ln = isALNUM_uni(ln);
3477 LOAD_UTF8_CHARCLASS_ALNUM();
3478 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3479 }
3480 else {
3481 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3482 n = isALNUM_LC_utf8((U8*)locinput);
3483 }
3484 }
3485 else {
3486 ln = (locinput != PL_bostr) ?
3487 UCHARAT(locinput - 1) : '\n';
3488 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3489 ln = isALNUM(ln);
3490 n = isALNUM(nextchr);
3491 }
3492 else {
3493 ln = isALNUM_LC(ln);
3494 n = isALNUM_LC(nextchr);
3495 }
3496 }
3497 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3498 OP(scan) == BOUNDL))
3499 sayNO;
3500 break;
d6a28714 3501 case ANYOF:
ffc61ed2 3502 if (do_utf8) {
9e55ce06
JH
3503 STRLEN inclasslen = PL_regeol - locinput;
3504
32fc9b6a 3505 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
262b90c4 3506 goto anyof_fail;
ffc61ed2
JH
3507 if (locinput >= PL_regeol)
3508 sayNO;
0f0076b4 3509 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 3510 nextchr = UCHARAT(locinput);
e0f9d4a8 3511 break;
ffc61ed2
JH
3512 }
3513 else {
3514 if (nextchr < 0)
3515 nextchr = UCHARAT(locinput);
32fc9b6a 3516 if (!REGINCLASS(rex, scan, (U8*)locinput))
262b90c4 3517 goto anyof_fail;
ffc61ed2
JH
3518 if (!nextchr && locinput >= PL_regeol)
3519 sayNO;
3520 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3521 break;
3522 }
262b90c4 3523 anyof_fail:
e0f9d4a8
JH
3524 /* If we might have the case of the German sharp s
3525 * in a casefolding Unicode character class. */
3526
ebc501f0
JH
3527 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3528 locinput += SHARP_S_SKIP;
e0f9d4a8 3529 nextchr = UCHARAT(locinput);
ffc61ed2 3530 }
e0f9d4a8
JH
3531 else
3532 sayNO;
b8c5462f 3533 break;
20d0b1e9 3534 /* Special char classes - The defines start on line 129 or so */
d1eb3177
YO
3535 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3536 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
20d0b1e9 3537
d1eb3177
YO
3538 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3539 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
20d0b1e9 3540
d1eb3177
YO
3541 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3542 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
20d0b1e9 3543
37e2e78e
KW
3544 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3545 a Unicode extended Grapheme Cluster */
3546 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3547 extended Grapheme Cluster is:
3548
3549 CR LF
3550 | Prepend* Begin Extend*
3551 | .
3552
3553 Begin is (Hangul-syllable | ! Control)
3554 Extend is (Grapheme_Extend | Spacing_Mark)
3555 Control is [ GCB_Control CR LF ]
3556
3557 The discussion below shows how the code for CLUMP is derived
3558 from this regex. Note that most of these concepts are from
3559 property values of the Grapheme Cluster Boundary (GCB) property.
3560 No code point can have multiple property values for a given
3561 property. Thus a code point in Prepend can't be in Control, but
3562 it must be in !Control. This is why Control above includes
3563 GCB_Control plus CR plus LF. The latter two are used in the GCB
3564 property separately, and so can't be in GCB_Control, even though
3565 they logically are controls. Control is not the same as gc=cc,
3566 but includes format and other characters as well.
3567
3568 The Unicode definition of Hangul-syllable is:
3569 L+
3570 | (L* ( ( V | LV ) V* | LVT ) T*)
3571 | T+
3572 )
3573 Each of these is a value for the GCB property, and hence must be
3574 disjoint, so the order they are tested is immaterial, so the
3575 above can safely be changed to
3576 T+
3577 | L+
3578 | (L* ( LVT | ( V | LV ) V*) T*)
3579
3580 The last two terms can be combined like this:
3581 L* ( L
3582 | (( LVT | ( V | LV ) V*) T*))
3583
3584 And refactored into this:
3585 L* (L | LVT T* | V V* T* | LV V* T*)
3586
3587 That means that if we have seen any L's at all we can quit
3588 there, but if the next character is a LVT, a V or and LV we
3589 should keep going.
3590
3591 There is a subtlety with Prepend* which showed up in testing.
3592 Note that the Begin, and only the Begin is required in:
3593 | Prepend* Begin Extend*
3594 Also, Begin contains '! Control'. A Prepend must be a '!
3595 Control', which means it must be a Begin. What it comes down to
3596 is that if we match Prepend* and then find no suitable Begin
3597 afterwards, that if we backtrack the last Prepend, that one will
3598 be a suitable Begin.
3599 */
3600
b7c83a7e 3601 if (locinput >= PL_regeol)
a0ed51b3 3602 sayNO;
37e2e78e
KW
3603 if (! do_utf8) {
3604
3605 /* Match either CR LF or '.', as all the other possibilities
3606 * require utf8 */
3607 locinput++; /* Match the . or CR */
3608 if (nextchr == '\r'
3609 && locinput < PL_regeol
3610 && UCHARAT(locinput) == '\n') locinput++;
3611 }
3612 else {
3613
3614 /* Utf8: See if is ( CR LF ); already know that locinput <
3615 * PL_regeol, so locinput+1 is in bounds */
3616 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3617 locinput += 2;
3618 }
3619 else {
3620 /* In case have to backtrack to beginning, then match '.' */
3621 char *starting = locinput;
3622
3623 /* In case have to backtrack the last prepend */
3624 char *previous_prepend = 0;
3625
3626 LOAD_UTF8_CHARCLASS_GCB();
3627
3628 /* Match (prepend)* */
3629 while (locinput < PL_regeol
3630 && swash_fetch(PL_utf8_X_prepend,
3631 (U8*)locinput, do_utf8))
3632 {
3633 previous_prepend = locinput;
3634 locinput += UTF8SKIP(locinput);
3635 }
3636
3637 /* As noted above, if we matched a prepend character, but
3638 * the next thing won't match, back off the last prepend we
3639 * matched, as it is guaranteed to match the begin */
3640 if (previous_prepend
3641 && (locinput >= PL_regeol
3642 || ! swash_fetch(PL_utf8_X_begin,
3643 (U8*)locinput, do_utf8)))
3644 {
3645 locinput = previous_prepend;
3646 }
3647
3648 /* Note that here we know PL_regeol > locinput, as we
3649 * tested that upon input to this switch case, and if we
3650 * moved locinput forward, we tested the result just above
3651 * and it either passed, or we backed off so that it will
3652 * now pass */
3653 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3654
3655 /* Here did not match the required 'Begin' in the
3656 * second term. So just match the very first
3657 * character, the '.' of the final term of the regex */
3658 locinput = starting + UTF8SKIP(starting);
3659 } else {
3660
3661 /* Here is the beginning of a character that can have
3662 * an extender. It is either a hangul syllable, or a
3663 * non-control */
3664 if (swash_fetch(PL_utf8_X_non_hangul,
3665 (U8*)locinput, do_utf8))
3666 {
3667
3668 /* Here not a Hangul syllable, must be a
3669 * ('! * Control') */
3670 locinput += UTF8SKIP(locinput);
3671 } else {
3672
3673 /* Here is a Hangul syllable. It can be composed
3674 * of several individual characters. One
3675 * possibility is T+ */
3676 if (swash_fetch(PL_utf8_X_T,
3677 (U8*)locinput, do_utf8))
3678 {
3679 while (locinput < PL_regeol
3680 && swash_fetch(PL_utf8_X_T,
3681 (U8*)locinput, do_utf8))
3682 {
3683 locinput += UTF8SKIP(locinput);
3684 }
3685 } else {
3686
3687 /* Here, not T+, but is a Hangul. That means
3688 * it is one of the others: L, LV, LVT or V,
3689 * and matches:
3690 * L* (L | LVT T* | V V* T* | LV V* T*) */
3691
3692 /* Match L* */
3693 while (locinput < PL_regeol
3694 && swash_fetch(PL_utf8_X_L,
3695 (U8*)locinput, do_utf8))
3696 {
3697 locinput += UTF8SKIP(locinput);
3698 }
3699
3700 /* Here, have exhausted L*. If the next
3701 * character is not an LV, LVT nor V, it means
3702 * we had to have at least one L, so matches L+
3703 * in the original equation, we have a complete
3704 * hangul syllable. Are done. */
3705
3706 if (locinput < PL_regeol
3707 && swash_fetch(PL_utf8_X_LV_LVT_V,
3708 (U8*)locinput, do_utf8))
3709 {
3710
3711 /* Otherwise keep going. Must be LV, LVT
3712 * or V. See if LVT */
3713 if (swash_fetch(PL_utf8_X_LVT,
3714 (U8*)locinput, do_utf8))
3715 {
3716 locinput += UTF8SKIP(locinput);
3717 } else {
3718
3719 /* Must be V or LV. Take it, then
3720 * match V* */
3721 locinput += UTF8SKIP(locinput);
3722 while (locinput < PL_regeol
3723 && swash_fetch(PL_utf8_X_V,
3724 (U8*)locinput, do_utf8))
3725 {
3726 locinput += UTF8SKIP(locinput);
3727 }
3728 }
3729
3730 /* And any of LV, LVT, or V can be followed
3731 * by T* */
3732 while (locinput < PL_regeol
3733 && swash_fetch(PL_utf8_X_T,
3734 (U8*)locinput,
3735 do_utf8))
3736 {
3737 locinput += UTF8SKIP(locinput);
3738 }
3739 }
3740 }
3741 }
3742
3743 /* Match any extender */
3744 while (locinput < PL_regeol
3745 && swash_fetch(PL_utf8_X_extend,
3746 (U8*)locinput, do_utf8))
3747 {
3748 locinput += UTF8SKIP(locinput);
3749 }
3750 }
3751 }
3752 if (locinput > PL_regeol) sayNO;
3753 }
a0ed51b3
LW
3754 nextchr = UCHARAT(locinput);
3755 break;
81714fb9
YO
3756
3757 case NREFFL:
3758 {
3759 char *s;
ff1157ca 3760 char type;
81714fb9
YO
3761 PL_reg_flags |= RF_tainted;
3762 /* FALL THROUGH */
3763 case NREF:
3764 case NREFF:
ff1157ca 3765 type = OP(scan);
0a4db386
YO
3766 n = reg_check_named_buff_matched(rex,scan);
3767
3768 if ( n ) {
3769 type = REF + ( type - NREF );
3770 goto do_ref;
3771 } else {
81714fb9 3772 sayNO;
0a4db386
YO
3773 }
3774 /* unreached */
c8756f30 3775 case REFFL:
3280af22 3776 PL_reg_flags |= RF_tainted;
c8756f30 3777 /* FALL THROUGH */
c277df42 3778 case REF:
81714fb9 3779 case REFF:
c277df42 3780 n = ARG(scan); /* which paren pair */
81714fb9
YO
3781 type = OP(scan);
3782 do_ref:
f0ab9afb 3783 ln = PL_regoffs[n].start;
2c2d71f5 3784 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 3785 if (*PL_reglastparen < n || ln == -1)
af3f8c16 3786 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 3787 if (ln == PL_regoffs[n].end)
a0d0e21e 3788 break;
a0ed51b3 3789
24d3c4a9 3790 s = PL_bostr + ln;
81714fb9 3791 if (do_utf8 && type != REF) { /* REF can do byte comparison */
a0ed51b3 3792 char *l = locinput;
f0ab9afb 3793 const char *e = PL_bostr + PL_regoffs[n].end;
a0ed51b3
LW
3794 /*
3795 * Note that we can't do the "other character" lookup trick as
3796 * in the 8-bit case (no pun intended) because in Unicode we
3797 * have to map both upper and title case to lower case.
3798 */
81714fb9 3799 if (type == REFF) {
a0ed51b3 3800 while (s < e) {
a3b680e6
AL
3801 STRLEN ulen1, ulen2;
3802 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3803 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3804
a0ed51b3
LW
3805 if (l >= PL_regeol)
3806 sayNO;
a2a2844f
JH
3807 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3808 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3809 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3810 sayNO;
a2a2844f
JH
3811 s += ulen1;
3812 l += ulen2;
a0ed51b3
LW
3813 }
3814 }
3815 locinput = l;
3816 nextchr = UCHARAT(locinput);
3817 break;
3818 }
3819
a0d0e21e 3820 /* Inline the first character, for speed. */
76e3520e 3821 if (UCHARAT(s) != nextchr &&
81714fb9
YO
3822 (type == REF ||
3823 (UCHARAT(s) != (type == REFF
3824 ? PL_fold : PL_fold_locale)[nextchr])))
4633a7c4 3825 sayNO;
f0ab9afb 3826 ln = PL_regoffs[n].end - ln;
24d3c4a9 3827 if (locinput + ln > PL_regeol)
4633a7c4 3828 sayNO;
81714fb9 3829 if (ln > 1 && (type == REF
24d3c4a9 3830 ? memNE(s, locinput, ln)
81714fb9 3831 : (type == REFF
24d3c4a9
DM
3832 ? ibcmp(s, locinput, ln)
3833 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3834 sayNO;
24d3c4a9 3835 locinput += ln;
76e3520e 3836 nextchr = UCHARAT(locinput);
a0d0e21e 3837 break;
81714fb9 3838 }
a0d0e21e 3839 case NOTHING:
c277df42 3840 case TAIL:
a0d0e21e
LW
3841 break;
3842 case BACK:
3843 break;
40a82448
DM
3844
3845#undef ST
3846#define ST st->u.eval
c277df42 3847 {
c277df42 3848 SV *ret;
d2f13c59 3849 REGEXP *re_sv;
6bda09f9 3850 regexp *re;
f8fc2ecf 3851 regexp_internal *rei;
1a147d38
YO
3852 regnode *startpoint;
3853
3854 case GOSTART:
e7707071
YO
3855 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3856 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 3857 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 3858 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 3859 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
3860 Perl_croak(aTHX_
3861 "Pattern subroutine nesting without pos change"
3862 " exceeded limit in regex");
6bda09f9
YO
3863 } else {
3864 nochange_depth = 0;
1a147d38 3865 }
288b8c02 3866 re_sv = rex_sv;
6bda09f9 3867 re = rex;
f8fc2ecf 3868 rei = rexi;
288b8c02 3869 (void)ReREFCNT_inc(rex_sv);
1a147d38 3870 if (OP(scan)==GOSUB) {
6bda09f9
YO
3871 startpoint = scan + ARG2L(scan);
3872 ST.close_paren = ARG(scan);
3873 } else {
f8fc2ecf 3874 startpoint = rei->program+1;
6bda09f9
YO
3875 ST.close_paren = 0;
3876 }
3877 goto eval_recurse_doit;
3878 /* NOTREACHED */
3879 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3880 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 3881 if ( ++nochange_depth > max_nochange_depth )
1a147d38 3882 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
3883 } else {
3884 nochange_depth = 0;
3885 }
8e5e9ebe 3886 {
4aabdb9b
DM
3887 /* execute the code in the {...} */
3888 dSP;
6136c704 3889 SV ** const before = SP;
4aabdb9b
DM
3890 OP_4tree * const oop = PL_op;
3891 COP * const ocurcop = PL_curcop;
3892 PAD *old_comppad;
d80618d2 3893 char *saved_regeol = PL_regeol;
4aabdb9b
DM
3894
3895 n = ARG(scan);
f8fc2ecf 3896 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
3897 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3898 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f8fc2ecf 3899 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 3900 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 3901
2bf803e2
YO
3902 if (sv_yes_mark) {
3903 SV *sv_mrk = get_sv("REGMARK", 1);
3904 sv_setsv(sv_mrk, sv_yes_mark);
3905 }
3906
8e5e9ebe
RGS
3907 CALLRUNOPS(aTHX); /* Scalar context. */
3908 SPAGAIN;
3909 if (SP == before)
075aa684 3910 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3911 else {
3912 ret = POPs;
3913 PUTBACK;
3914 }
4aabdb9b
DM
3915
3916 PL_op = oop;
3917 PAD_RESTORE_LOCAL(old_comppad);
3918 PL_curcop = ocurcop;
d80618d2 3919 PL_regeol = saved_regeol;
24d3c4a9 3920 if (!logical) {
4aabdb9b
DM
3921 /* /(?{...})/ */
3922 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3923 break;
3924 }
8e5e9ebe 3925 }
24d3c4a9
DM
3926 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3927 logical = 0;
4aabdb9b 3928 {
4f639d21
DM
3929 /* extract RE object from returned value; compiling if
3930 * necessary */
6136c704 3931 MAGIC *mg = NULL;
288b8c02 3932 REGEXP *rx = NULL;
5c35adbb
NC
3933
3934 if (SvROK(ret)) {
288b8c02 3935 SV *const sv = SvRV(ret);
5c35adbb
NC
3936
3937 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 3938 rx = (REGEXP*) sv;
5c35adbb
NC
3939 } else if (SvSMAGICAL(sv)) {
3940 mg = mg_find(sv, PERL_MAGIC_qr);
3941 assert(mg);
3942 }
3943 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 3944 rx = (REGEXP*) ret;
5c35adbb 3945 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
3946 if (SvGMAGICAL(ret)) {
3947 /* I don't believe that there is ever qr magic
3948 here. */
3949 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 3950 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
3951 }
3952 else {
faf82a0b 3953 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
3954 /* testing suggests mg only ends up non-NULL for
3955 scalars who were upgraded and compiled in the
3956 else block below. In turn, this is only
3957 triggered in the "postponed utf8 string" tests
3958 in t/op/pat.t */
3959 }
0f5d15d6 3960 }
faf82a0b 3961
0f5d15d6 3962 if (mg) {
d2f13c59 3963 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 3964 assert(rx);
0f5d15d6 3965 }
288b8c02 3966 if (rx) {
f0826785 3967 rx = reg_temp_copy(NULL, rx);
288b8c02 3968 }
0f5d15d6 3969 else {
c737faaf 3970 U32 pm_flags = 0;
a3b680e6 3971 const I32 osize = PL_regsize;
0f5d15d6 3972
b9ad30b4
NC
3973 if (DO_UTF8(ret)) {
3974 assert (SvUTF8(ret));
3975 } else if (SvUTF8(ret)) {
3976 /* Not doing UTF-8, despite what the SV says. Is
3977 this only if we're trapped in use 'bytes'? */
3978 /* Make a copy of the octet sequence, but without
3979 the flag on, as the compiler now honours the
3980 SvUTF8 flag on ret. */
3981 STRLEN len;
3982 const char *const p = SvPV(ret, len);
3983 ret = newSVpvn_flags(p, len, SVs_TEMP);
3984 }
288b8c02 3985 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 3986 if (!(SvFLAGS(ret)
faf82a0b 3987 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 3988 | SVs_GMG))) {
a2794585
NC
3989 /* This isn't a first class regexp. Instead, it's
3990 caching a regexp onto an existing, Perl visible
3991 scalar. */
ad64d0ec 3992 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 3993 }
0f5d15d6 3994 PL_regsize = osize;
0f5d15d6 3995 }
288b8c02
NC
3996 re_sv = rx;
3997 re = (struct regexp *)SvANY(rx);
4aabdb9b 3998 }
07bc277f 3999 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4000 re->subbeg = rex->subbeg;
4001 re->sublen = rex->sublen;
f8fc2ecf 4002 rei = RXi_GET(re);
6bda09f9 4003 DEBUG_EXECUTE_r(
efd26800 4004 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
6bda09f9
YO
4005 "Matching embedded");
4006 );
f8fc2ecf 4007 startpoint = rei->program + 1;
1a147d38 4008 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
4009 /* borrowed from regtry */
4010 if (PL_reg_start_tmpl <= re->nparens) {
4011 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4012 if(PL_reg_start_tmp)
4013 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4014 else
4015 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 4016 }
aa283a38 4017
1a147d38 4018 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4019 /* run the pattern returned from (??{...}) */
40a82448
DM
4020 ST.cp = regcppush(0); /* Save *all* the positions. */
4021 REGCP_SET(ST.lastcp);
6bda09f9 4022
f0ab9afb 4023 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 4024
0357f1fd
ML
4025 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4026 PL_reglastparen = &re->lastparen;
4027 PL_reglastcloseparen = &re->lastcloseparen;
4028 re->lastparen = 0;
4029 re->lastcloseparen = 0;
4030
4aabdb9b 4031 PL_reginput = locinput;
ae0beba1 4032 PL_regsize = 0;
4aabdb9b
DM
4033
4034 /* XXXX This is too dramatic a measure... */
4035 PL_reg_maxiter = 0;
4036
faec1544 4037 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4038 if (RX_UTF8(re_sv))
faec1544
DM
4039 PL_reg_flags |= RF_utf8;
4040 else
4041 PL_reg_flags &= ~RF_utf8;
4042 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4043
288b8c02 4044 ST.prev_rex = rex_sv;
faec1544 4045 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
4046 SETREX(rex_sv,re_sv);
4047 rex = re;
f8fc2ecf 4048 rexi = rei;
faec1544 4049 cur_curlyx = NULL;
40a82448 4050 ST.B = next;
faec1544
DM
4051 ST.prev_eval = cur_eval;
4052 cur_eval = st;
faec1544 4053 /* now continue from first node in postoned RE */
6bda09f9 4054 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 4055 /* NOTREACHED */
a0ed51b3 4056 }
24d3c4a9
DM
4057 /* logical is 1, /(?(?{...})X|Y)/ */
4058 sw = (bool)SvTRUE(ret);
4059 logical = 0;
c277df42
IZ
4060 break;
4061 }
40a82448 4062
faec1544
DM
4063 case EVAL_AB: /* cleanup after a successful (??{A})B */
4064 /* note: this is called twice; first after popping B, then A */
4065 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4066 ReREFCNT_dec(rex_sv);
4067 SETREX(rex_sv,ST.prev_rex);
4068 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4069 rexi = RXi_GET(rex);
faec1544
DM
4070 regcpblow(ST.cp);
4071 cur_eval = ST.prev_eval;
4072 cur_curlyx = ST.prev_curlyx;
34a81e2b
B
4073
4074 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4075 PL_reglastparen = &rex->lastparen;
4076 PL_reglastcloseparen = &rex->lastcloseparen;
34a81e2b
B
4077 /* also update PL_regoffs */
4078 PL_regoffs = rex->offs;
0357f1fd 4079
40a82448
DM
4080 /* XXXX This is too dramatic a measure... */
4081 PL_reg_maxiter = 0;
e7707071 4082 if ( nochange_depth )
4b196cd4 4083 nochange_depth--;
262b90c4 4084 sayYES;
40a82448 4085
40a82448 4086
faec1544
DM
4087 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4088 /* note: this is called twice; first after popping B, then A */
4089 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4090 ReREFCNT_dec(rex_sv);
4091 SETREX(rex_sv,ST.prev_rex);
4092 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4093 rexi = RXi_GET(rex);
34a81e2b 4094 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4095 PL_reglastparen = &rex->lastparen;
4096 PL_reglastcloseparen = &rex->lastcloseparen;
4097
40a82448
DM
4098 PL_reginput = locinput;
4099 REGCP_UNWIND(ST.lastcp);
4100 regcppop(rex);
faec1544
DM
4101 cur_eval = ST.prev_eval;
4102 cur_curlyx = ST.prev_curlyx;
4103 /* XXXX This is too dramatic a measure... */
4104 PL_reg_maxiter = 0;
e7707071 4105 if ( nochange_depth )
4b196cd4 4106 nochange_depth--;
40a82448 4107 sayNO_SILENT;
40a82448
DM
4108#undef ST
4109
a0d0e21e 4110 case OPEN:
c277df42 4111 n = ARG(scan); /* which paren pair */
3280af22
NIS
4112 PL_reg_start_tmp[n] = locinput;
4113 if (n > PL_regsize)
4114 PL_regsize = n;
e2e6a0f1 4115 lastopen = n;
a0d0e21e
LW
4116 break;
4117 case CLOSE:
c277df42 4118 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
4119 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4120 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
4121 /*if (n > PL_regsize)
4122 PL_regsize = n;*/
3b6647e0 4123 if (n > *PL_reglastparen)
3280af22 4124 *PL_reglastparen = n;
a01268b5 4125 *PL_reglastcloseparen = n;
3b6647e0 4126 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4127 goto fake_end;
4128 }
a0d0e21e 4129 break;
e2e6a0f1
YO
4130 case ACCEPT:
4131 if (ARG(scan)){
4132 regnode *cursor;
4133 for (cursor=scan;
4134 cursor && OP(cursor)!=END;
4135 cursor=regnext(cursor))
4136 {
4137 if ( OP(cursor)==CLOSE ){
4138 n = ARG(cursor);
4139 if ( n <= lastopen ) {
f0ab9afb
NC
4140 PL_regoffs[n].start
4141 = PL_reg_start_tmp[n] - PL_bostr;
4142 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
4143 /*if (n > PL_regsize)
4144 PL_regsize = n;*/
3b6647e0 4145 if (n > *PL_reglastparen)
e2e6a0f1
YO
4146 *PL_reglastparen = n;
4147 *PL_reglastcloseparen = n;
3b6647e0
RB
4148 if ( n == ARG(scan) || (cur_eval &&
4149 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4150 break;
4151 }
4152 }
4153 }
4154 }
4155 goto fake_end;
4156 /*NOTREACHED*/
c277df42
IZ
4157 case GROUPP:
4158 n = ARG(scan); /* which paren pair */
f0ab9afb 4159 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 4160 break;
0a4db386
YO
4161 case NGROUPP:
4162 /* reg_check_named_buff_matched returns 0 for no match */
4163 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4164 break;
1a147d38 4165 case INSUBP:
0a4db386 4166 n = ARG(scan);
3b6647e0 4167 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4168 break;
4169 case DEFINEP:
4170 sw = 0;
4171 break;
c277df42 4172 case IFTHEN:
2c2d71f5 4173 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4174 if (sw)
c277df42
IZ
4175 next = NEXTOPER(NEXTOPER(scan));
4176 else {
4177 next = scan + ARG(scan);
4178 if (OP(next) == IFTHEN) /* Fake one. */
4179 next = NEXTOPER(NEXTOPER(next));
4180 }
4181 break;
4182 case LOGICAL:
24d3c4a9 4183 logical = scan->flags;
c277df42 4184 break;
c476f425 4185
2ab05381 4186/*******************************************************************
2ab05381 4187
c476f425
DM
4188The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4189pattern, where A and B are subpatterns. (For simple A, CURLYM or
4190STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4191
c476f425 4192A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4193
c476f425
DM
4194On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4195state, which contains the current count, initialised to -1. It also sets
4196cur_curlyx to point to this state, with any previous value saved in the
4197state block.
2ab05381 4198
c476f425
DM
4199CURLYX then jumps straight to the WHILEM op, rather than executing A,
4200since the pattern may possibly match zero times (i.e. it's a while {} loop
4201rather than a do {} while loop).
2ab05381 4202
c476f425
DM
4203Each entry to WHILEM represents a successful match of A. The count in the
4204CURLYX block is incremented, another WHILEM state is pushed, and execution
4205passes to A or B depending on greediness and the current count.
2ab05381 4206
c476f425
DM
4207For example, if matching against the string a1a2a3b (where the aN are
4208substrings that match /A/), then the match progresses as follows: (the
4209pushed states are interspersed with the bits of strings matched so far):
2ab05381 4210
c476f425
DM
4211 <CURLYX cnt=-1>
4212 <CURLYX cnt=0><WHILEM>
4213 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4214 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4215 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4216 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4217
c476f425
DM
4218(Contrast this with something like CURLYM, which maintains only a single
4219backtrack state:
2ab05381 4220
c476f425
DM
4221 <CURLYM cnt=0> a1
4222 a1 <CURLYM cnt=1> a2
4223 a1 a2 <CURLYM cnt=2> a3
4224 a1 a2 a3 <CURLYM cnt=3> b
4225)
2ab05381 4226
c476f425
DM
4227Each WHILEM state block marks a point to backtrack to upon partial failure
4228of A or B, and also contains some minor state data related to that
4229iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4230overall state, such as the count, and pointers to the A and B ops.
2ab05381 4231
c476f425
DM
4232This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4233must always point to the *current* CURLYX block, the rules are:
2ab05381 4234
c476f425
DM
4235When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4236and set cur_curlyx to point the new block.
2ab05381 4237
c476f425
DM
4238When popping the CURLYX block after a successful or unsuccessful match,
4239restore the previous cur_curlyx.
2ab05381 4240
c476f425
DM
4241When WHILEM is about to execute B, save the current cur_curlyx, and set it
4242to the outer one saved in the CURLYX block.
2ab05381 4243
c476f425
DM
4244When popping the WHILEM block after a successful or unsuccessful B match,
4245restore the previous cur_curlyx.
2ab05381 4246
c476f425
DM
4247Here's an example for the pattern (AI* BI)*BO
4248I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4249
c476f425
DM
4250cur_
4251curlyx backtrack stack
4252------ ---------------
4253NULL
4254CO <CO prev=NULL> <WO>
4255CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4256CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4257NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4258
c476f425
DM
4259At this point the pattern succeeds, and we work back down the stack to
4260clean up, restoring as we go:
95b24440 4261
c476f425
DM
4262CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4263CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4264CO <CO prev=NULL> <WO>
4265NULL
a0374537 4266
c476f425
DM
4267*******************************************************************/
4268
4269#define ST st->u.curlyx
4270
4271 case CURLYX: /* start of /A*B/ (for complex A) */
4272 {
4273 /* No need to save/restore up to this paren */
4274 I32 parenfloor = scan->flags;
4275
4276 assert(next); /* keep Coverity happy */
4277 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4278 next += ARG(next);
4279
4280 /* XXXX Probably it is better to teach regpush to support
4281 parenfloor > PL_regsize... */
4282 if (parenfloor > (I32)*PL_reglastparen)
4283 parenfloor = *PL_reglastparen; /* Pessimization... */
4284
4285 ST.prev_curlyx= cur_curlyx;
4286 cur_curlyx = st;
4287 ST.cp = PL_savestack_ix;
4288
4289 /* these fields contain the state of the current curly.
4290 * they are accessed by subsequent WHILEMs */
4291 ST.parenfloor = parenfloor;
4292 ST.min = ARG1(scan);
4293 ST.max = ARG2(scan);
4294 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4295 ST.B = next;
24d3c4a9
DM
4296 ST.minmod = minmod;
4297 minmod = 0;
c476f425
DM
4298 ST.count = -1; /* this will be updated by WHILEM */
4299 ST.lastloc = NULL; /* this will be updated by WHILEM */
4300
4301 PL_reginput = locinput;
4302 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4303 /* NOTREACHED */
c476f425 4304 }
a0d0e21e 4305
c476f425 4306 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4307 cur_curlyx = ST.prev_curlyx;
4308 sayYES;
4309 /* NOTREACHED */
a0d0e21e 4310
c476f425
DM
4311 case CURLYX_end_fail: /* just failed to match all of A*B */
4312 regcpblow(ST.cp);
4313 cur_curlyx = ST.prev_curlyx;
4314 sayNO;
4315 /* NOTREACHED */
4633a7c4 4316
a0d0e21e 4317
c476f425
DM
4318#undef ST
4319#define ST st->u.whilem
4320
4321 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4322 {
4323 /* see the discussion above about CURLYX/WHILEM */
c476f425
DM
4324 I32 n;
4325 assert(cur_curlyx); /* keep Coverity happy */
4326 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4327 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4328 ST.cache_offset = 0;
4329 ST.cache_mask = 0;
4330
4331 PL_reginput = locinput;
4332
4333 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4334 "%*s whilem: matched %ld out of %ld..%ld\n",
4335 REPORT_CODE_OFF+depth*2, "", (long)n,
4336 (long)cur_curlyx->u.curlyx.min,
4337 (long)cur_curlyx->u.curlyx.max)
4338 );
a0d0e21e 4339
c476f425 4340 /* First just match a string of min A's. */
a0d0e21e 4341
c476f425
DM
4342 if (n < cur_curlyx->u.curlyx.min) {
4343 cur_curlyx->u.curlyx.lastloc = locinput;
4344 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4345 /* NOTREACHED */
4346 }
4347
4348 /* If degenerate A matches "", assume A done. */
4349
4350 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4351 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4352 "%*s whilem: empty match detected, trying continuation...\n",
4353 REPORT_CODE_OFF+depth*2, "")
4354 );
4355 goto do_whilem_B_max;
4356 }
4357
4358 /* super-linear cache processing */
4359
4360 if (scan->flags) {
a0d0e21e 4361
2c2d71f5 4362 if (!PL_reg_maxiter) {
c476f425
DM
4363 /* start the countdown: Postpone detection until we
4364 * know the match is not *that* much linear. */
2c2d71f5 4365 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4366 /* possible overflow for long strings and many CURLYX's */
4367 if (PL_reg_maxiter < 0)
4368 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4369 PL_reg_leftiter = PL_reg_maxiter;
4370 }
c476f425 4371
2c2d71f5 4372 if (PL_reg_leftiter-- == 0) {
c476f425 4373 /* initialise cache */
3298f257 4374 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4375 if (PL_reg_poscache) {
eb160463 4376 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4377 Renew(PL_reg_poscache, size, char);
4378 PL_reg_poscache_size = size;
4379 }
4380 Zero(PL_reg_poscache, size, char);
4381 }
4382 else {
4383 PL_reg_poscache_size = size;
a02a5408 4384 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4385 }
c476f425
DM
4386 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4387 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4388 PL_colors[4], PL_colors[5])
4389 );
2c2d71f5 4390 }
c476f425 4391
2c2d71f5 4392 if (PL_reg_leftiter < 0) {
c476f425
DM
4393 /* have we already failed at this position? */
4394 I32 offset, mask;
4395 offset = (scan->flags & 0xf) - 1
4396 + (locinput - PL_bostr) * (scan->flags>>4);
4397 mask = 1 << (offset % 8);
4398 offset /= 8;
4399 if (PL_reg_poscache[offset] & mask) {
4400 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4401 "%*s whilem: (cache) already tried at this position...\n",
4402 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4403 );
3298f257 4404 sayNO; /* cache records failure */
2c2d71f5 4405 }
c476f425
DM
4406 ST.cache_offset = offset;
4407 ST.cache_mask = mask;
2c2d71f5 4408 }
c476f425 4409 }
2c2d71f5 4410
c476f425 4411 /* Prefer B over A for minimal matching. */
a687059c 4412
c476f425
DM
4413 if (cur_curlyx->u.curlyx.minmod) {
4414 ST.save_curlyx = cur_curlyx;
4415 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4416 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4417 REGCP_SET(ST.lastcp);
4418 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4419 /* NOTREACHED */
4420 }
a0d0e21e 4421
c476f425
DM
4422 /* Prefer A over B for maximal matching. */
4423
4424 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4425 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4426 cur_curlyx->u.curlyx.lastloc = locinput;
4427 REGCP_SET(ST.lastcp);
4428 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4429 /* NOTREACHED */
4430 }
4431 goto do_whilem_B_max;
4432 }
4433 /* NOTREACHED */
4434
4435 case WHILEM_B_min: /* just matched B in a minimal match */
4436 case WHILEM_B_max: /* just matched B in a maximal match */
4437 cur_curlyx = ST.save_curlyx;
4438 sayYES;
4439 /* NOTREACHED */
4440
4441 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4442 cur_curlyx = ST.save_curlyx;
4443 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4444 cur_curlyx->u.curlyx.count--;
4445 CACHEsayNO;
4446 /* NOTREACHED */
4447
4448 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4449 REGCP_UNWIND(ST.lastcp);
4450 regcppop(rex);
4451 /* FALL THROUGH */
4452 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4453 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4454 cur_curlyx->u.curlyx.count--;
4455 CACHEsayNO;
4456 /* NOTREACHED */
4457
4458 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4459 REGCP_UNWIND(ST.lastcp);
4460 regcppop(rex); /* Restore some previous $<digit>s? */
4461 PL_reginput = locinput;
4462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4463 "%*s whilem: failed, trying continuation...\n",
4464 REPORT_CODE_OFF+depth*2, "")
4465 );
4466 do_whilem_B_max:
4467 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4468 && ckWARN(WARN_REGEXP)
4469 && !(PL_reg_flags & RF_warned))
4470 {
4471 PL_reg_flags |= RF_warned;
4472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4473 "Complex regular subexpression recursion",
4474 REG_INFTY - 1);
4475 }
4476
4477 /* now try B */
4478 ST.save_curlyx = cur_curlyx;
4479 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4480 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4481 /* NOTREACHED */
4482
4483 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4484 cur_curlyx = ST.save_curlyx;
4485 REGCP_UNWIND(ST.lastcp);
4486 regcppop(rex);
4487
4488 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4489 /* Maximum greed exceeded */
4490 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4491 && ckWARN(WARN_REGEXP)
4492 && !(PL_reg_flags & RF_warned))
4493 {
3280af22 4494 PL_reg_flags |= RF_warned;
c476f425
DM
4495 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4496 "%s limit (%d) exceeded",
4497 "Complex regular subexpression recursion",
4498 REG_INFTY - 1);
a0d0e21e 4499 }
c476f425 4500 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4501 CACHEsayNO;
a0d0e21e 4502 }
c476f425
DM
4503
4504 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4505 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4506 );
4507 /* Try grabbing another A and see if it helps. */
4508 PL_reginput = locinput;
4509 cur_curlyx->u.curlyx.lastloc = locinput;
4510 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4511 REGCP_SET(ST.lastcp);
4512 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
5f66b61c 4513 /* NOTREACHED */
40a82448
DM
4514
4515#undef ST
4516#define ST st->u.branch
4517
4518 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4519 next = scan + ARG(scan);
4520 if (next == scan)
4521 next = NULL;
40a82448
DM
4522 scan = NEXTOPER(scan);
4523 /* FALL THROUGH */
c277df42 4524
40a82448
DM
4525 case BRANCH: /* /(...|A|...)/ */
4526 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4527 ST.lastparen = *PL_reglastparen;
4528 ST.next_branch = next;
4529 REGCP_SET(ST.cp);
4530 PL_reginput = locinput;
02db2b7b 4531
40a82448 4532 /* Now go into the branch */
5d458dd8
YO
4533 if (has_cutgroup) {
4534 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4535 } else {
4536 PUSH_STATE_GOTO(BRANCH_next, scan);
4537 }
40a82448 4538 /* NOTREACHED */
5d458dd8
YO
4539 case CUTGROUP:
4540 PL_reginput = locinput;
4541 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4542 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4543 PUSH_STATE_GOTO(CUTGROUP_next,next);
4544 /* NOTREACHED */
4545 case CUTGROUP_next_fail:
4546 do_cutgroup = 1;
4547 no_final = 1;
4548 if (st->u.mark.mark_name)
4549 sv_commit = st->u.mark.mark_name;
4550 sayNO;
4551 /* NOTREACHED */
4552 case BRANCH_next:
4553 sayYES;
4554 /* NOTREACHED */
40a82448 4555 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4556 if (do_cutgroup) {
4557 do_cutgroup = 0;
4558 no_final = 0;
4559 }
40a82448
DM
4560 REGCP_UNWIND(ST.cp);
4561 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4562 PL_regoffs[n].end = -1;
40a82448 4563 *PL_reglastparen = n;
0a4db386 4564 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4565 scan = ST.next_branch;
4566 /* no more branches? */
5d458dd8
YO
4567 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4568 DEBUG_EXECUTE_r({
4569 PerlIO_printf( Perl_debug_log,
4570 "%*s %sBRANCH failed...%s\n",
4571 REPORT_CODE_OFF+depth*2, "",
4572 PL_colors[4],
4573 PL_colors[5] );
4574 });
4575 sayNO_SILENT;
4576 }
40a82448
DM
4577 continue; /* execute next BRANCH[J] op */
4578 /* NOTREACHED */
4579
a0d0e21e 4580 case MINMOD:
24d3c4a9 4581 minmod = 1;
a0d0e21e 4582 break;
40a82448
DM
4583
4584#undef ST
4585#define ST st->u.curlym
4586
4587 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4588
4589 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 4590 * only a single backtracking state, no matter how many matches
40a82448
DM
4591 * there are in {m,n}. It relies on the pattern being constant
4592 * length, with no parens to influence future backrefs
4593 */
4594
4595 ST.me = scan;
dc45a647 4596 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4597
4598 /* if paren positive, emulate an OPEN/CLOSE around A */
4599 if (ST.me->flags) {
3b6647e0 4600 U32 paren = ST.me->flags;
40a82448
DM
4601 if (paren > PL_regsize)
4602 PL_regsize = paren;
3b6647e0 4603 if (paren > *PL_reglastparen)
40a82448 4604 *PL_reglastparen = paren;
c277df42 4605 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4606 }
40a82448
DM
4607 ST.A = scan;
4608 ST.B = next;
4609 ST.alen = 0;
4610 ST.count = 0;
24d3c4a9
DM
4611 ST.minmod = minmod;
4612 minmod = 0;
40a82448
DM
4613 ST.c1 = CHRTEST_UNINIT;
4614 REGCP_SET(ST.cp);
6407bf3b 4615
40a82448
DM
4616 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4617 goto curlym_do_B;
4618
4619 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 4620 PL_reginput = locinput;
40a82448
DM
4621 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4622 /* NOTREACHED */
5f80c4cf 4623
40a82448
DM
4624 case CURLYM_A: /* we've just matched an A */
4625 locinput = st->locinput;
4626 nextchr = UCHARAT(locinput);
4627
4628 ST.count++;
4629 /* after first match, determine A's length: u.curlym.alen */
4630 if (ST.count == 1) {
4631 if (PL_reg_match_utf8) {
4632 char *s = locinput;
4633 while (s < PL_reginput) {
4634 ST.alen++;
4635 s += UTF8SKIP(s);
4636 }
4637 }
4638 else {
4639 ST.alen = PL_reginput - locinput;
4640 }
4641 if (ST.alen == 0)
4642 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4643 }
0cadcf80
DM
4644 DEBUG_EXECUTE_r(
4645 PerlIO_printf(Perl_debug_log,
40a82448 4646 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 4647 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 4648 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
4649 );
4650
40a82448 4651 locinput = PL_reginput;
0a4db386
YO
4652
4653 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4654 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4655 goto fake_end;
4656
c966426a
DM
4657 {
4658 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4659 if ( max == REG_INFTY || ST.count < max )
4660 goto curlym_do_A; /* try to match another A */
4661 }
40a82448 4662 goto curlym_do_B; /* try to match B */
5f80c4cf 4663
40a82448
DM
4664 case CURLYM_A_fail: /* just failed to match an A */
4665 REGCP_UNWIND(ST.cp);
0a4db386
YO
4666
4667 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4668 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4669 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 4670 sayNO;
0cadcf80 4671
40a82448
DM
4672 curlym_do_B: /* execute the B in /A{m,n}B/ */
4673 PL_reginput = locinput;
4674 if (ST.c1 == CHRTEST_UNINIT) {
4675 /* calculate c1 and c2 for possible match of 1st char
4676 * following curly */
4677 ST.c1 = ST.c2 = CHRTEST_VOID;
4678 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4679 regnode *text_node = ST.B;
4680 if (! HAS_TEXT(text_node))
4681 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
4682 /* this used to be
4683
4684 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4685
4686 But the former is redundant in light of the latter.
4687
4688 if this changes back then the macro for
4689 IS_TEXT and friends need to change.
4690 */
4691 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 4692 {
ee9b8eae 4693
40a82448
DM
4694 ST.c1 = (U8)*STRING(text_node);
4695 ST.c2 =
ee9b8eae 4696 (IS_TEXTF(text_node))
40a82448 4697 ? PL_fold[ST.c1]
ee9b8eae 4698 : (IS_TEXTFL(text_node))
40a82448
DM
4699 ? PL_fold_locale[ST.c1]
4700 : ST.c1;
c277df42 4701 }
c277df42 4702 }
40a82448
DM
4703 }
4704
4705 DEBUG_EXECUTE_r(
4706 PerlIO_printf(Perl_debug_log,
4707 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 4708 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
4709 "", (IV)ST.count)
4710 );
4711 if (ST.c1 != CHRTEST_VOID
4712 && UCHARAT(PL_reginput) != ST.c1
4713 && UCHARAT(PL_reginput) != ST.c2)
4714 {
4715 /* simulate B failing */
3e901dc0
YO
4716 DEBUG_OPTIMISE_r(
4717 PerlIO_printf(Perl_debug_log,
4718 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4719 (int)(REPORT_CODE_OFF+(depth*2)),"",
4720 (IV)ST.c1,(IV)ST.c2
4721 ));
40a82448
DM
4722 state_num = CURLYM_B_fail;
4723 goto reenter_switch;
4724 }
4725
4726 if (ST.me->flags) {
4727 /* mark current A as captured */
4728 I32 paren = ST.me->flags;
4729 if (ST.count) {
f0ab9afb 4730 PL_regoffs[paren].start
40a82448 4731 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 4732 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 4733 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 4734 }
40a82448 4735 else
f0ab9afb 4736 PL_regoffs[paren].end = -1;
0a4db386 4737 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 4738 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
4739 {
4740 if (ST.count)
4741 goto fake_end;
4742 else
4743 sayNO;
4744 }
c277df42 4745 }
0a4db386 4746
40a82448 4747 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 4748 /* NOTREACHED */
40a82448
DM
4749
4750 case CURLYM_B_fail: /* just failed to match a B */
4751 REGCP_UNWIND(ST.cp);
4752 if (ST.minmod) {
84d2fa14
HS
4753 I32 max = ARG2(ST.me);
4754 if (max != REG_INFTY && ST.count == max)
40a82448
DM
4755 sayNO;
4756 goto curlym_do_A; /* try to match a further A */
4757 }
4758 /* backtrack one A */
4759 if (ST.count == ARG1(ST.me) /* min */)
4760 sayNO;
4761 ST.count--;
4762 locinput = HOPc(locinput, -ST.alen);
4763 goto curlym_do_B; /* try to match B */
4764
c255a977
DM
4765#undef ST
4766#define ST st->u.curly
40a82448 4767
c255a977
DM
4768#define CURLY_SETPAREN(paren, success) \
4769 if (paren) { \
4770 if (success) { \
f0ab9afb
NC
4771 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4772 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 4773 *PL_reglastcloseparen = paren; \
c255a977
DM
4774 } \
4775 else \
f0ab9afb 4776 PL_regoffs[paren].end = -1; \
c255a977
DM
4777 }
4778
4779 case STAR: /* /A*B/ where A is width 1 */
4780 ST.paren = 0;
4781 ST.min = 0;
4782 ST.max = REG_INFTY;
a0d0e21e
LW
4783 scan = NEXTOPER(scan);
4784 goto repeat;
c255a977
DM
4785 case PLUS: /* /A+B/ where A is width 1 */
4786 ST.paren = 0;
4787 ST.min = 1;
4788 ST.max = REG_INFTY;
c277df42 4789 scan = NEXTOPER(scan);
c255a977
DM
4790 goto repeat;
4791 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4792 ST.paren = scan->flags; /* Which paren to set */
4793 if (ST.paren > PL_regsize)
4794 PL_regsize = ST.paren;
3b6647e0 4795 if (ST.paren > *PL_reglastparen)
c255a977
DM
4796 *PL_reglastparen = ST.paren;
4797 ST.min = ARG1(scan); /* min to match */
4798 ST.max = ARG2(scan); /* max to match */
0a4db386 4799 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 4800 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
4801 ST.min=1;
4802 ST.max=1;
4803 }
c255a977
DM
4804 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4805 goto repeat;
4806 case CURLY: /* /A{m,n}B/ where A is width 1 */
4807 ST.paren = 0;
4808 ST.min = ARG1(scan); /* min to match */
4809 ST.max = ARG2(scan); /* max to match */
4810 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 4811 repeat:
a0d0e21e
LW
4812 /*
4813 * Lookahead to avoid useless match attempts
4814 * when we know what character comes next.
c255a977 4815 *
5f80c4cf
JP
4816 * Used to only do .*x and .*?x, but now it allows
4817 * for )'s, ('s and (?{ ... })'s to be in the way
4818 * of the quantifier and the EXACT-like node. -- japhy
4819 */
4820
c255a977
DM
4821 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4822 sayNO;
cca55fe3 4823 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
4824 U8 *s;
4825 regnode *text_node = next;
4826
3dab1dad
YO
4827 if (! HAS_TEXT(text_node))
4828 FIND_NEXT_IMPT(text_node);
5f80c4cf 4829
9e137952 4830 if (! HAS_TEXT(text_node))
c255a977 4831 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 4832 else {
ee9b8eae 4833 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 4834 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 4835 goto assume_ok_easy;
cca55fe3 4836 }
be8e71aa
YO
4837 else
4838 s = (U8*)STRING(text_node);
ee9b8eae
YO
4839
4840 /* Currently we only get here when
4841
4842 PL_rekind[OP(text_node)] == EXACT
4843
4844 if this changes back then the macro for IS_TEXT and
4845 friends need to change. */
5f80c4cf 4846 if (!UTF) {
c255a977 4847 ST.c2 = ST.c1 = *s;
ee9b8eae 4848 if (IS_TEXTF(text_node))
c255a977 4849 ST.c2 = PL_fold[ST.c1];
ee9b8eae 4850 else if (IS_TEXTFL(text_node))
c255a977 4851 ST.c2 = PL_fold_locale[ST.c1];
1aa99e6b 4852 }
5f80c4cf 4853 else { /* UTF */
ee9b8eae 4854 if (IS_TEXTF(text_node)) {
a2a2844f 4855 STRLEN ulen1, ulen2;
89ebb4a3
JH
4856 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4857 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
4858
4859 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4860 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
4861#ifdef EBCDIC
4862 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4863 ckWARN(WARN_UTF8) ?
4864 0 : UTF8_ALLOW_ANY);
4865 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4866 ckWARN(WARN_UTF8) ?
4867 0 : UTF8_ALLOW_ANY);
4868#else
c255a977 4869 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 4870 uniflags);
c255a977 4871 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
4872 uniflags);
4873#endif
5f80c4cf
JP
4874 }
4875 else {
c255a977 4876 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 4877 uniflags);
5f80c4cf 4878 }
1aa99e6b
IH
4879 }
4880 }
bbce6d69 4881 }
a0d0e21e 4882 else
c255a977 4883 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 4884 assume_ok_easy:
c255a977
DM
4885
4886 ST.A = scan;
4887 ST.B = next;
3280af22 4888 PL_reginput = locinput;
24d3c4a9
DM
4889 if (minmod) {
4890 minmod = 0;
e2e6a0f1 4891 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 4892 sayNO;
c255a977 4893 ST.count = ST.min;
a0ed51b3 4894 locinput = PL_reginput;
c255a977
DM
4895 REGCP_SET(ST.cp);
4896 if (ST.c1 == CHRTEST_VOID)
4897 goto curly_try_B_min;
4898
4899 ST.oldloc = locinput;
4900
4901 /* set ST.maxpos to the furthest point along the
4902 * string that could possibly match */
4903 if (ST.max == REG_INFTY) {
4904 ST.maxpos = PL_regeol - 1;
4905 if (do_utf8)
4906 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4907 ST.maxpos--;
4908 }
4909 else if (do_utf8) {
4910 int m = ST.max - ST.min;
4911 for (ST.maxpos = locinput;
4912 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4913 ST.maxpos += UTF8SKIP(ST.maxpos);
4914 }
4915 else {
4916 ST.maxpos = locinput + ST.max - ST.min;
4917 if (ST.maxpos >= PL_regeol)
4918 ST.maxpos = PL_regeol - 1;
4919 }
4920 goto curly_try_B_min_known;
4921
4922 }
4923 else {
e2e6a0f1 4924 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
4925 locinput = PL_reginput;
4926 if (ST.count < ST.min)
4927 sayNO;
4928 if ((ST.count > ST.min)
4929 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4930 {
4931 /* A{m,n} must come at the end of the string, there's
4932 * no point in backing off ... */
4933 ST.min = ST.count;
4934 /* ...except that $ and \Z can match before *and* after
4935 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4936 We may back off by one in this case. */
4937 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4938 ST.min--;
4939 }
4940 REGCP_SET(ST.cp);
4941 goto curly_try_B_max;
4942 }
4943 /* NOTREACHED */
4944
4945
4946 case CURLY_B_min_known_fail:
4947 /* failed to find B in a non-greedy match where c1,c2 valid */
4948 if (ST.paren && ST.count)
f0ab9afb 4949 PL_regoffs[ST.paren].end = -1;
c255a977
DM
4950
4951 PL_reginput = locinput; /* Could be reset... */
4952 REGCP_UNWIND(ST.cp);
4953 /* Couldn't or didn't -- move forward. */
4954 ST.oldloc = locinput;
4955 if (do_utf8)
4956 locinput += UTF8SKIP(locinput);
4957 else
4958 locinput++;
4959 ST.count++;
4960 curly_try_B_min_known:
4961 /* find the next place where 'B' could work, then call B */
4962 {
4963 int n;
4964 if (do_utf8) {
4965 n = (ST.oldloc == locinput) ? 0 : 1;
4966 if (ST.c1 == ST.c2) {
4967 STRLEN len;
4968 /* set n to utf8_distance(oldloc, locinput) */
4969 while (locinput <= ST.maxpos &&
4970 utf8n_to_uvchr((U8*)locinput,
4971 UTF8_MAXBYTES, &len,
4972 uniflags) != (UV)ST.c1) {
4973 locinput += len;
4974 n++;
4975 }
1aa99e6b
IH
4976 }
4977 else {
c255a977
DM
4978 /* set n to utf8_distance(oldloc, locinput) */
4979 while (locinput <= ST.maxpos) {
4980 STRLEN len;
4981 const UV c = utf8n_to_uvchr((U8*)locinput,
4982 UTF8_MAXBYTES, &len,
4983 uniflags);
4984 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4985 break;
4986 locinput += len;
4987 n++;
1aa99e6b 4988 }
0fe9bf95
IZ
4989 }
4990 }
c255a977
DM
4991 else {
4992 if (ST.c1 == ST.c2) {
4993 while (locinput <= ST.maxpos &&
4994 UCHARAT(locinput) != ST.c1)
4995 locinput++;
bbce6d69 4996 }
c255a977
DM
4997 else {
4998 while (locinput <= ST.maxpos
4999 && UCHARAT(locinput) != ST.c1
5000 && UCHARAT(locinput) != ST.c2)
5001 locinput++;
a0ed51b3 5002 }
c255a977
DM
5003 n = locinput - ST.oldloc;
5004 }
5005 if (locinput > ST.maxpos)
5006 sayNO;
5007 /* PL_reginput == oldloc now */
5008 if (n) {
5009 ST.count += n;
e2e6a0f1 5010 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5011 sayNO;
a0d0e21e 5012 }
c255a977
DM
5013 PL_reginput = locinput;
5014 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5015 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5016 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5017 goto fake_end;
5018 }
c255a977 5019 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5020 }
c255a977
DM
5021 /* NOTREACHED */
5022
5023
5024 case CURLY_B_min_fail:
5025 /* failed to find B in a non-greedy match where c1,c2 invalid */
5026 if (ST.paren && ST.count)
f0ab9afb 5027 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5028
5029 REGCP_UNWIND(ST.cp);
5030 /* failed -- move forward one */
5031 PL_reginput = locinput;
e2e6a0f1 5032 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5033 ST.count++;
a0ed51b3 5034 locinput = PL_reginput;
c255a977
DM
5035 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5036 ST.count > 0)) /* count overflow ? */
15272685 5037 {
c255a977
DM
5038 curly_try_B_min:
5039 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5040 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5041 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5042 goto fake_end;
5043 }
c255a977 5044 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5045 }
5046 }
4633a7c4 5047 sayNO;
c255a977
DM
5048 /* NOTREACHED */
5049
5050
5051 curly_try_B_max:
5052 /* a successful greedy match: now try to match B */
40d049e4 5053 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5054 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5055 goto fake_end;
5056 }
c255a977
DM
5057 {
5058 UV c = 0;
5059 if (ST.c1 != CHRTEST_VOID)
5060 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5061 UTF8_MAXBYTES, 0, uniflags)
466787eb 5062 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5063 /* If it could work, try it. */
5064 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5065 CURLY_SETPAREN(ST.paren, ST.count);
5066 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5067 /* NOTREACHED */
5068 }
5069 }
5070 /* FALL THROUGH */
5071 case CURLY_B_max_fail:
5072 /* failed to find B in a greedy match */
5073 if (ST.paren && ST.count)
f0ab9afb 5074 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5075
5076 REGCP_UNWIND(ST.cp);
5077 /* back up. */
5078 if (--ST.count < ST.min)
5079 sayNO;
5080 PL_reginput = locinput = HOPc(locinput, -1);
5081 goto curly_try_B_max;
5082
5083#undef ST
5084
a0d0e21e 5085 case END:
6bda09f9 5086 fake_end:
faec1544
DM
5087 if (cur_eval) {
5088 /* we've just finished A in /(??{A})B/; now continue with B */
5089 I32 tmpix;
faec1544
DM
5090 st->u.eval.toggle_reg_flags
5091 = cur_eval->u.eval.toggle_reg_flags;
5092 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5093
288b8c02
NC
5094 st->u.eval.prev_rex = rex_sv; /* inner */
5095 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5096 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5097 rexi = RXi_GET(rex);
faec1544 5098 cur_curlyx = cur_eval->u.eval.prev_curlyx;
288b8c02 5099 ReREFCNT_inc(rex_sv);
faec1544 5100 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
34a81e2b
B
5101
5102 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5103 PL_reglastparen = &rex->lastparen;
5104 PL_reglastcloseparen = &rex->lastcloseparen;
5105
faec1544
DM
5106 REGCP_SET(st->u.eval.lastcp);
5107 PL_reginput = locinput;
5108
5109 /* Restore parens of the outer rex without popping the
5110 * savestack */
5111 tmpix = PL_savestack_ix;
5112 PL_savestack_ix = cur_eval->u.eval.lastcp;
5113 regcppop(rex);
5114 PL_savestack_ix = tmpix;
5115
5116 st->u.eval.prev_eval = cur_eval;
5117 cur_eval = cur_eval->u.eval.prev_eval;
5118 DEBUG_EXECUTE_r(
2a49f0f5
JH
5119 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5120 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5121 if ( nochange_depth )
5122 nochange_depth--;
5123
5124 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5125 st->u.eval.prev_eval->u.eval.B); /* match B */
5126 }
5127
3b0527fe 5128 if (locinput < reginfo->till) {
a3621e74 5129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5130 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5131 PL_colors[4],
5132 (long)(locinput - PL_reg_starttry),
3b0527fe 5133 (long)(reginfo->till - PL_reg_starttry),
7821416a 5134 PL_colors[5]));
58e23c8d 5135
262b90c4 5136 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5137 }
5138 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5139 sayYES; /* Success! */
dad79028
DM
5140
5141 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5142 DEBUG_EXECUTE_r(
5143 PerlIO_printf(Perl_debug_log,
5144 "%*s %ssubpattern success...%s\n",
5bc10b2c 5145 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5146 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5147 sayYES; /* Success! */
dad79028 5148
40a82448
DM
5149#undef ST
5150#define ST st->u.ifmatch
5151
5152 case SUSPEND: /* (?>A) */
5153 ST.wanted = 1;
9fe1d20c 5154 PL_reginput = locinput;
9041c2e3 5155 goto do_ifmatch;
dad79028 5156
40a82448
DM
5157 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5158 ST.wanted = 0;
dad79028
DM
5159 goto ifmatch_trivial_fail_test;
5160
40a82448
DM
5161 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5162 ST.wanted = 1;
dad79028 5163 ifmatch_trivial_fail_test:
a0ed51b3 5164 if (scan->flags) {
52657f30 5165 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5166 if (!s) {
5167 /* trivial fail */
24d3c4a9
DM
5168 if (logical) {
5169 logical = 0;
5170 sw = 1 - (bool)ST.wanted;
dad79028 5171 }
40a82448 5172 else if (ST.wanted)
dad79028
DM
5173 sayNO;
5174 next = scan + ARG(scan);
5175 if (next == scan)
5176 next = NULL;
5177 break;
5178 }
efb30f32 5179 PL_reginput = s;
a0ed51b3
LW
5180 }
5181 else
5182 PL_reginput = locinput;
5183
c277df42 5184 do_ifmatch:
40a82448 5185 ST.me = scan;
24d3c4a9 5186 ST.logical = logical;
24d786f4
YO
5187 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5188
40a82448
DM
5189 /* execute body of (?...A) */
5190 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5191 /* NOTREACHED */
5192
5193 case IFMATCH_A_fail: /* body of (?...A) failed */
5194 ST.wanted = !ST.wanted;
5195 /* FALL THROUGH */
5196
5197 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9
DM
5198 if (ST.logical) {
5199 sw = (bool)ST.wanted;
40a82448
DM
5200 }
5201 else if (!ST.wanted)
5202 sayNO;
5203
5204 if (OP(ST.me) == SUSPEND)
5205 locinput = PL_reginput;
5206 else {
5207 locinput = PL_reginput = st->locinput;
5208 nextchr = UCHARAT(locinput);
5209 }
5210 scan = ST.me + ARG(ST.me);
5211 if (scan == ST.me)
5212 scan = NULL;
5213 continue; /* execute B */
5214
5215#undef ST
dad79028 5216
c277df42 5217 case LONGJMP:
c277df42
IZ
5218 next = scan + ARG(scan);
5219 if (next == scan)
5220 next = NULL;
a0d0e21e 5221 break;
54612592 5222 case COMMIT:
e2e6a0f1
YO
5223 reginfo->cutpoint = PL_regeol;
5224 /* FALLTHROUGH */
5d458dd8 5225 case PRUNE:
24b23f37 5226 PL_reginput = locinput;
e2e6a0f1 5227 if (!scan->flags)
ad64d0ec 5228 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5229 PUSH_STATE_GOTO(COMMIT_next,next);
5230 /* NOTREACHED */
5231 case COMMIT_next_fail:
5232 no_final = 1;
5233 /* FALLTHROUGH */
7f69552c
YO
5234 case OPFAIL:
5235 sayNO;
e2e6a0f1
YO
5236 /* NOTREACHED */
5237
5238#define ST st->u.mark
5239 case MARKPOINT:
5240 ST.prev_mark = mark_state;
5d458dd8 5241 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5242 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5243 mark_state = st;
5244 ST.mark_loc = PL_reginput = locinput;
5245 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5246 /* NOTREACHED */
5247 case MARKPOINT_next:
5248 mark_state = ST.prev_mark;
5249 sayYES;
5250 /* NOTREACHED */
5251 case MARKPOINT_next_fail:
5d458dd8 5252 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5253 {
5254 if (ST.mark_loc > startpoint)
5255 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5256 popmark = NULL; /* we found our mark */
5257 sv_commit = ST.mark_name;
5258
5259 DEBUG_EXECUTE_r({
5d458dd8 5260 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5261 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5262 REPORT_CODE_OFF+depth*2, "",
be2597df 5263 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5264 });
5265 }
5266 mark_state = ST.prev_mark;
5d458dd8
YO
5267 sv_yes_mark = mark_state ?
5268 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5269 sayNO;
5270 /* NOTREACHED */
5d458dd8
YO
5271 case SKIP:
5272 PL_reginput = locinput;
5273 if (scan->flags) {
2bf803e2 5274 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5275 ST.mark_name = NULL;
e2e6a0f1 5276 ST.mark_loc = locinput;
5d458dd8
YO
5277 PUSH_STATE_GOTO(SKIP_next,next);
5278 } else {
2bf803e2 5279 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5280 otherwise do nothing. Meaning we need to scan
5281 */
5282 regmatch_state *cur = mark_state;
ad64d0ec 5283 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5284
5285 while (cur) {
5286 if ( sv_eq( cur->u.mark.mark_name,
5287 find ) )
5288 {
5289 ST.mark_name = find;
5290 PUSH_STATE_GOTO( SKIP_next, next );
5291 }
5292 cur = cur->u.mark.prev_mark;
5293 }
e2e6a0f1 5294 }
2bf803e2 5295 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5296 break;
5297 case SKIP_next_fail:
5298 if (ST.mark_name) {
5299 /* (*CUT:NAME) - Set up to search for the name as we
5300 collapse the stack*/
5301 popmark = ST.mark_name;
5302 } else {
5303 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5304 if (ST.mark_loc > startpoint)
5305 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5306 /* but we set sv_commit to latest mark_name if there
5307 is one so they can test to see how things lead to this
5308 cut */
5309 if (mark_state)
5310 sv_commit=mark_state->u.mark.mark_name;
5311 }
e2e6a0f1
YO
5312 no_final = 1;
5313 sayNO;
5314 /* NOTREACHED */
5315#undef ST
32e6a07c
YO
5316 case FOLDCHAR:
5317 n = ARG(scan);
81d4fa0f 5318 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
e64b1bd1
YO
5319 locinput += ln;
5320 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5321 sayNO;
5322 } else {
5323 U8 folded[UTF8_MAXBYTES_CASE+1];
5324 STRLEN foldlen;
5325 const char * const l = locinput;
5326 char *e = PL_regeol;
5327 to_uni_fold(n, folded, &foldlen);
5328
59fe32ea 5329 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
e64b1bd1 5330 l, &e, 0, do_utf8)) {
32e6a07c 5331 sayNO;
e64b1bd1
YO
5332 }
5333 locinput = e;
32e6a07c
YO
5334 }
5335 nextchr = UCHARAT(locinput);
5336 break;
e1d1eefb
YO
5337 case LNBREAK:
5338 if ((n=is_LNBREAK(locinput,do_utf8))) {
5339 locinput += n;
5340 nextchr = UCHARAT(locinput);
5341 } else
5342 sayNO;
5343 break;
5344
5345#define CASE_CLASS(nAmE) \
5346 case nAmE: \
5347 if ((n=is_##nAmE(locinput,do_utf8))) { \
5348 locinput += n; \
5349 nextchr = UCHARAT(locinput); \
5350 } else \
5351 sayNO; \
5352 break; \
5353 case N##nAmE: \
5354 if ((n=is_##nAmE(locinput,do_utf8))) { \
5355 sayNO; \
5356 } else { \
5357 locinput += UTF8SKIP(locinput); \
5358 nextchr = UCHARAT(locinput); \
5359 } \
5360 break
5361
5362 CASE_CLASS(VERTWS);
5363 CASE_CLASS(HORIZWS);
5364#undef CASE_CLASS
5365
a0d0e21e 5366 default:
b900a521 5367 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5368 PTR2UV(scan), OP(scan));
cea2e8a9 5369 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5370
5371 } /* end switch */
95b24440 5372
5d458dd8
YO
5373 /* switch break jumps here */
5374 scan = next; /* prepare to execute the next op and ... */
5375 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5376 /* NOTREACHED */
5377
40a82448
DM
5378 push_yes_state:
5379 /* push a state that backtracks on success */
5380 st->u.yes.prev_yes_state = yes_state;
5381 yes_state = st;
5382 /* FALL THROUGH */
5383 push_state:
5384 /* push a new regex state, then continue at scan */
5385 {
5386 regmatch_state *newst;
5387
24b23f37
YO
5388 DEBUG_STACK_r({
5389 regmatch_state *cur = st;
5390 regmatch_state *curyes = yes_state;
5391 int curd = depth;
5392 regmatch_slab *slab = PL_regmatch_slab;
5393 for (;curd > -1;cur--,curd--) {
5394 if (cur < SLAB_FIRST(slab)) {
5395 slab = slab->prev;
5396 cur = SLAB_LAST(slab);
5397 }
5398 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5399 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5400 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5401 (curyes == cur) ? "yes" : ""
5402 );
5403 if (curyes == cur)
5404 curyes = cur->u.yes.prev_yes_state;
5405 }
5406 } else
5407 DEBUG_STATE_pp("push")
5408 );
40a82448 5409 depth++;
40a82448
DM
5410 st->locinput = locinput;
5411 newst = st+1;
5412 if (newst > SLAB_LAST(PL_regmatch_slab))
5413 newst = S_push_slab(aTHX);
5414 PL_regmatch_state = newst;
786e8c11 5415
40a82448
DM
5416 locinput = PL_reginput;
5417 nextchr = UCHARAT(locinput);
5418 st = newst;
5419 continue;
5420 /* NOTREACHED */
5421 }
a0d0e21e 5422 }
a687059c 5423
a0d0e21e
LW
5424 /*
5425 * We get here only if there's trouble -- normally "case END" is
5426 * the terminating point.
5427 */
cea2e8a9 5428 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5429 /*NOTREACHED*/
4633a7c4
LW
5430 sayNO;
5431
262b90c4 5432yes:
77cb431f
DM
5433 if (yes_state) {
5434 /* we have successfully completed a subexpression, but we must now
5435 * pop to the state marked by yes_state and continue from there */
77cb431f 5436 assert(st != yes_state);
5bc10b2c
DM
5437#ifdef DEBUGGING
5438 while (st != yes_state) {
5439 st--;
5440 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5441 PL_regmatch_slab = PL_regmatch_slab->prev;
5442 st = SLAB_LAST(PL_regmatch_slab);
5443 }
e2e6a0f1 5444 DEBUG_STATE_r({
54612592
YO
5445 if (no_final) {
5446 DEBUG_STATE_pp("pop (no final)");
5447 } else {
5448 DEBUG_STATE_pp("pop (yes)");
5449 }
e2e6a0f1 5450 });
5bc10b2c
DM
5451 depth--;
5452 }
5453#else
77cb431f
DM
5454 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5455 || yes_state > SLAB_LAST(PL_regmatch_slab))
5456 {
5457 /* not in this slab, pop slab */
5458 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5459 PL_regmatch_slab = PL_regmatch_slab->prev;
5460 st = SLAB_LAST(PL_regmatch_slab);
5461 }
5462 depth -= (st - yes_state);
5bc10b2c 5463#endif
77cb431f
DM
5464 st = yes_state;
5465 yes_state = st->u.yes.prev_yes_state;
5466 PL_regmatch_state = st;
24b23f37 5467
5d458dd8
YO
5468 if (no_final) {
5469 locinput= st->locinput;
5470 nextchr = UCHARAT(locinput);
5471 }
54612592 5472 state_num = st->resume_state + no_final;
24d3c4a9 5473 goto reenter_switch;
77cb431f
DM
5474 }
5475
a3621e74 5476 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5477 PL_colors[4], PL_colors[5]));
02db2b7b 5478
19b95bf0
DM
5479 if (PL_reg_eval_set) {
5480 /* each successfully executed (?{...}) block does the equivalent of
5481 * local $^R = do {...}
5482 * When popping the save stack, all these locals would be undone;
5483 * bypass this by setting the outermost saved $^R to the latest
5484 * value */
5485 if (oreplsv != GvSV(PL_replgv))
5486 sv_setsv(oreplsv, GvSV(PL_replgv));
5487 }
95b24440 5488 result = 1;
aa283a38 5489 goto final_exit;
4633a7c4
LW
5490
5491no:
a3621e74 5492 DEBUG_EXECUTE_r(
7821416a 5493 PerlIO_printf(Perl_debug_log,
786e8c11 5494 "%*s %sfailed...%s\n",
5bc10b2c 5495 REPORT_CODE_OFF+depth*2, "",
786e8c11 5496 PL_colors[4], PL_colors[5])
7821416a 5497 );
aa283a38 5498
262b90c4 5499no_silent:
54612592
YO
5500 if (no_final) {
5501 if (yes_state) {
5502 goto yes;
5503 } else {
5504 goto final_exit;
5505 }
5506 }
aa283a38
DM
5507 if (depth) {
5508 /* there's a previous state to backtrack to */
40a82448
DM
5509 st--;
5510 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5511 PL_regmatch_slab = PL_regmatch_slab->prev;
5512 st = SLAB_LAST(PL_regmatch_slab);
5513 }
5514 PL_regmatch_state = st;
40a82448
DM
5515 locinput= st->locinput;
5516 nextchr = UCHARAT(locinput);
5517
5bc10b2c
DM
5518 DEBUG_STATE_pp("pop");
5519 depth--;
262b90c4
DM
5520 if (yes_state == st)
5521 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5522
24d3c4a9
DM
5523 state_num = st->resume_state + 1; /* failure = success + 1 */
5524 goto reenter_switch;
95b24440 5525 }
24d3c4a9 5526 result = 0;
aa283a38 5527
262b90c4 5528 final_exit:
bbe252da 5529 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5530 SV *sv_err = get_sv("REGERROR", 1);
5531 SV *sv_mrk = get_sv("REGMARK", 1);
5532 if (result) {
e2e6a0f1 5533 sv_commit = &PL_sv_no;
5d458dd8
YO
5534 if (!sv_yes_mark)
5535 sv_yes_mark = &PL_sv_yes;
5536 } else {
5537 if (!sv_commit)
5538 sv_commit = &PL_sv_yes;
5539 sv_yes_mark = &PL_sv_no;
5540 }
5541 sv_setsv(sv_err, sv_commit);
5542 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5543 }
19b95bf0 5544
2f554ef7
DM
5545 /* clean up; in particular, free all slabs above current one */
5546 LEAVE_SCOPE(oldsave);
5d9a96ca 5547
95b24440 5548 return result;
a687059c
LW
5549}
5550
5551/*
5552 - regrepeat - repeatedly match something simple, report how many
5553 */
5554/*
5555 * [This routine now assumes that it will only match on things of length 1.
5556 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5557 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5558 */
76e3520e 5559STATIC I32
e2e6a0f1 5560S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5561{
27da23d5 5562 dVAR;
a0d0e21e 5563 register char *scan;
a0d0e21e 5564 register I32 c;
3280af22 5565 register char *loceol = PL_regeol;
a0ed51b3 5566 register I32 hardcount = 0;
53c4c00c 5567 register bool do_utf8 = PL_reg_match_utf8;
4f55667c
SP
5568#ifndef DEBUGGING
5569 PERL_UNUSED_ARG(depth);
5570#endif
a0d0e21e 5571
7918f24d
NC
5572 PERL_ARGS_ASSERT_REGREPEAT;
5573
3280af22 5574 scan = PL_reginput;
faf11cac
HS
5575 if (max == REG_INFTY)
5576 max = I32_MAX;
5577 else if (max < loceol - scan)
7f596f4c 5578 loceol = scan + max;
a0d0e21e 5579 switch (OP(p)) {
22c35a8c 5580 case REG_ANY:
1aa99e6b 5581 if (do_utf8) {
ffc61ed2 5582 loceol = PL_regeol;
1aa99e6b 5583 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5584 scan += UTF8SKIP(scan);
5585 hardcount++;
5586 }
5587 } else {
5588 while (scan < loceol && *scan != '\n')
5589 scan++;
a0ed51b3
LW
5590 }
5591 break;
ffc61ed2 5592 case SANY:
def8e4ea
JH
5593 if (do_utf8) {
5594 loceol = PL_regeol;
a0804c9e 5595 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5596 scan += UTF8SKIP(scan);
5597 hardcount++;
5598 }
5599 }
5600 else
5601 scan = loceol;
a0ed51b3 5602 break;
f33976b4
DB
5603 case CANY:
5604 scan = loceol;
5605 break;
090f7165
JH
5606 case EXACT: /* length of string is 1 */
5607 c = (U8)*STRING(p);
5608 while (scan < loceol && UCHARAT(scan) == c)
5609 scan++;
bbce6d69 5610 break;
5611 case EXACTF: /* length of string is 1 */
cd439c50 5612 c = (U8)*STRING(p);
bbce6d69 5613 while (scan < loceol &&
22c35a8c 5614 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 5615 scan++;
5616 break;
5617 case EXACTFL: /* length of string is 1 */
3280af22 5618 PL_reg_flags |= RF_tainted;
cd439c50 5619 c = (U8)*STRING(p);
bbce6d69 5620 while (scan < loceol &&
22c35a8c 5621 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
5622 scan++;
5623 break;
5624 case ANYOF:
ffc61ed2
JH
5625 if (do_utf8) {
5626 loceol = PL_regeol;
cfc92286 5627 while (hardcount < max && scan < loceol &&
32fc9b6a 5628 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
5629 scan += UTF8SKIP(scan);
5630 hardcount++;
5631 }
5632 } else {
32fc9b6a 5633 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
5634 scan++;
5635 }
a0d0e21e
LW
5636 break;
5637 case ALNUM:
1aa99e6b 5638 if (do_utf8) {
ffc61ed2 5639 loceol = PL_regeol;
1a4fad37 5640 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5641 while (hardcount < max && scan < loceol &&
3568d838 5642 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5643 scan += UTF8SKIP(scan);
5644 hardcount++;
5645 }
5646 } else {
5647 while (scan < loceol && isALNUM(*scan))
5648 scan++;
a0ed51b3
LW
5649 }
5650 break;
bbce6d69 5651 case ALNUML:
3280af22 5652 PL_reg_flags |= RF_tainted;
1aa99e6b 5653 if (do_utf8) {
ffc61ed2 5654 loceol = PL_regeol;
1aa99e6b
IH
5655 while (hardcount < max && scan < loceol &&
5656 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5657 scan += UTF8SKIP(scan);
5658 hardcount++;
5659 }
5660 } else {
5661 while (scan < loceol && isALNUM_LC(*scan))
5662 scan++;
a0ed51b3
LW
5663 }
5664 break;
a0d0e21e 5665 case NALNUM:
1aa99e6b 5666 if (do_utf8) {
ffc61ed2 5667 loceol = PL_regeol;
1a4fad37 5668 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 5669 while (hardcount < max && scan < loceol &&
3568d838 5670 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5671 scan += UTF8SKIP(scan);
5672 hardcount++;
5673 }
5674 } else {
5675 while (scan < loceol && !isALNUM(*scan))
5676 scan++;
a0ed51b3
LW
5677 }
5678 break;
bbce6d69 5679 case NALNUML:
3280af22 5680 PL_reg_flags |= RF_tainted;
1aa99e6b 5681 if (do_utf8) {
ffc61ed2 5682 loceol = PL_regeol;
1aa99e6b
IH
5683 while (hardcount < max && scan < loceol &&
5684 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
5685 scan += UTF8SKIP(scan);
5686 hardcount++;
5687 }
5688 } else {
5689 while (scan < loceol && !isALNUM_LC(*scan))
5690 scan++;
a0ed51b3
LW
5691 }
5692 break;
a0d0e21e 5693 case SPACE:
1aa99e6b 5694 if (do_utf8) {
ffc61ed2 5695 loceol = PL_regeol;
1a4fad37 5696 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5697 while (hardcount < max && scan < loceol &&
3568d838
JH
5698 (*scan == ' ' ||
5699 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5700 scan += UTF8SKIP(scan);
5701 hardcount++;
5702 }
5703 } else {
5704 while (scan < loceol && isSPACE(*scan))
5705 scan++;
a0ed51b3
LW
5706 }
5707 break;
bbce6d69 5708 case SPACEL:
3280af22 5709 PL_reg_flags |= RF_tainted;
1aa99e6b 5710 if (do_utf8) {
ffc61ed2 5711 loceol = PL_regeol;
1aa99e6b 5712 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5713 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5714 scan += UTF8SKIP(scan);
5715 hardcount++;
5716 }
5717 } else {
5718 while (scan < loceol && isSPACE_LC(*scan))
5719 scan++;
a0ed51b3
LW
5720 }
5721 break;
a0d0e21e 5722 case NSPACE:
1aa99e6b 5723 if (do_utf8) {
ffc61ed2 5724 loceol = PL_regeol;
1a4fad37 5725 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 5726 while (hardcount < max && scan < loceol &&
3568d838
JH
5727 !(*scan == ' ' ||
5728 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
5729 scan += UTF8SKIP(scan);
5730 hardcount++;
5731 }
5732 } else {
5733 while (scan < loceol && !isSPACE(*scan))
5734 scan++;
a0ed51b3 5735 }
0008a298 5736 break;
bbce6d69 5737 case NSPACEL:
3280af22 5738 PL_reg_flags |= RF_tainted;
1aa99e6b 5739 if (do_utf8) {
ffc61ed2 5740 loceol = PL_regeol;
1aa99e6b 5741 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
5742 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5743 scan += UTF8SKIP(scan);
5744 hardcount++;
5745 }
5746 } else {
5747 while (scan < loceol && !isSPACE_LC(*scan))
5748 scan++;
a0ed51b3
LW
5749 }
5750 break;
a0d0e21e 5751 case DIGIT:
1aa99e6b 5752 if (do_utf8) {
ffc61ed2 5753 loceol = PL_regeol;
1a4fad37 5754 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5755 while (hardcount < max && scan < loceol &&
3568d838 5756 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5757 scan += UTF8SKIP(scan);
5758 hardcount++;
5759 }
5760 } else {
5761 while (scan < loceol && isDIGIT(*scan))
5762 scan++;
a0ed51b3
LW
5763 }
5764 break;
a0d0e21e 5765 case NDIGIT:
1aa99e6b 5766 if (do_utf8) {
ffc61ed2 5767 loceol = PL_regeol;
1a4fad37 5768 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 5769 while (hardcount < max && scan < loceol &&
3568d838 5770 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
5771 scan += UTF8SKIP(scan);
5772 hardcount++;
5773 }
5774 } else {
5775 while (scan < loceol && !isDIGIT(*scan))
5776 scan++;
a0ed51b3 5777 }
e1d1eefb
YO
5778 case LNBREAK:
5779 if (do_utf8) {
5780 loceol = PL_regeol;
5781 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5782 scan += c;
5783 hardcount++;
5784 }
5785 } else {
5786 /*
5787 LNBREAK can match two latin chars, which is ok,
5788 because we have a null terminated string, but we
5789 have to use hardcount in this situation
5790 */
5791 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5792 scan+=c;
5793 hardcount++;
5794 }
5795 }
5796 break;
5797 case HORIZWS:
5798 if (do_utf8) {
5799 loceol = PL_regeol;
5800 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5801 scan += c;
5802 hardcount++;
5803 }
5804 } else {
5805 while (scan < loceol && is_HORIZWS_latin1(scan))
5806 scan++;
5807 }
a0ed51b3 5808 break;
e1d1eefb
YO
5809 case NHORIZWS:
5810 if (do_utf8) {
5811 loceol = PL_regeol;
5812 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5813 scan += UTF8SKIP(scan);
5814 hardcount++;
5815 }
5816 } else {
5817 while (scan < loceol && !is_HORIZWS_latin1(scan))
5818 scan++;
5819
5820 }
5821 break;
5822 case VERTWS:
5823 if (do_utf8) {
5824 loceol = PL_regeol;
5825 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5826 scan += c;
5827 hardcount++;
5828 }
5829 } else {
5830 while (scan < loceol && is_VERTWS_latin1(scan))
5831 scan++;
5832
5833 }
5834 break;
5835 case NVERTWS:
5836 if (do_utf8) {
5837 loceol = PL_regeol;
5838 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5839 scan += UTF8SKIP(scan);
5840 hardcount++;
5841 }
5842 } else {
5843 while (scan < loceol && !is_VERTWS_latin1(scan))
5844 scan++;
5845
5846 }
5847 break;
5848
a0d0e21e
LW
5849 default: /* Called on something of 0 width. */
5850 break; /* So match right here or not at all. */
5851 }
a687059c 5852
a0ed51b3
LW
5853 if (hardcount)
5854 c = hardcount;
5855 else
5856 c = scan - PL_reginput;
3280af22 5857 PL_reginput = scan;
a687059c 5858
a3621e74 5859 DEBUG_r({
e68ec53f 5860 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 5861 DEBUG_EXECUTE_r({
e68ec53f
YO
5862 SV * const prop = sv_newmortal();
5863 regprop(prog, prop, p);
5864 PerlIO_printf(Perl_debug_log,
be8e71aa 5865 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 5866 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 5867 });
be8e71aa 5868 });
9041c2e3 5869
a0d0e21e 5870 return(c);
a687059c
LW
5871}
5872
c277df42 5873
be8e71aa 5874#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 5875/*
ffc61ed2
JH
5876- regclass_swash - prepare the utf8 swash
5877*/
5878
5879SV *
32fc9b6a 5880Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 5881{
97aff369 5882 dVAR;
9e55ce06
JH
5883 SV *sw = NULL;
5884 SV *si = NULL;
5885 SV *alt = NULL;
f8fc2ecf
YO
5886 RXi_GET_DECL(prog,progi);
5887 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 5888
7918f24d
NC
5889 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5890
4f639d21 5891 if (data && data->count) {
a3b680e6 5892 const U32 n = ARG(node);
ffc61ed2 5893
4f639d21 5894 if (data->what[n] == 's') {
ad64d0ec
NC
5895 SV * const rv = MUTABLE_SV(data->data[n]);
5896 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 5897 SV **const ary = AvARRAY(av);
9e55ce06 5898 SV **a, **b;
9041c2e3 5899
711a919c 5900 /* See the end of regcomp.c:S_regclass() for
9e55ce06
JH
5901 * documentation of these array elements. */
5902
b11f357e 5903 si = *ary;
fe5bfecd
JH
5904 a = SvROK(ary[1]) ? &ary[1] : NULL;
5905 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
b11f357e 5906
ffc61ed2
JH
5907 if (a)
5908 sw = *a;
5909 else if (si && doinit) {
5910 sw = swash_init("utf8", "", si, 1, 0);
5911 (void)av_store(av, 1, sw);
5912 }
9e55ce06
JH
5913 if (b)
5914 alt = *b;
ffc61ed2
JH
5915 }
5916 }
5917
9e55ce06
JH
5918 if (listsvp)
5919 *listsvp = si;
5920 if (altsvp)
5921 *altsvp = alt;
ffc61ed2
JH
5922
5923 return sw;
5924}
76234dfb 5925#endif
ffc61ed2
JH
5926
5927/*
ba7b4546 5928 - reginclass - determine if a character falls into a character class
832705d4
JH
5929
5930 The n is the ANYOF regnode, the p is the target string, lenp
5931 is pointer to the maximum length of how far to go in the p
5932 (if the lenp is zero, UTF8SKIP(p) is used),
5933 do_utf8 tells whether the target string is in UTF-8.
5934
bbce6d69 5935 */
5936
76e3520e 5937STATIC bool
32fc9b6a 5938S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 5939{
27da23d5 5940 dVAR;
a3b680e6 5941 const char flags = ANYOF_FLAGS(n);
bbce6d69 5942 bool match = FALSE;
cc07378b 5943 UV c = *p;
ae9ddab8 5944 STRLEN len = 0;
9e55ce06 5945 STRLEN plen;
1aa99e6b 5946
7918f24d
NC
5947 PERL_ARGS_ASSERT_REGINCLASS;
5948
19f67299
TS
5949 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5950 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
6182169b
KW
5951 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5952 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5953 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5954 * UTF8_ALLOW_FFFF */
e8a70c6f
SP
5955 if (len == (STRLEN)-1)
5956 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 5957 }
bbce6d69 5958
0f0076b4 5959 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 5960 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
5961 if (lenp)
5962 *lenp = 0;
ffc61ed2 5963 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
5964 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5965 match = TRUE;
bbce6d69 5966 }
3568d838 5967 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 5968 match = TRUE;
ffc61ed2 5969 if (!match) {
9e55ce06 5970 AV *av;
32fc9b6a 5971 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
5972
5973 if (sw) {
3f0c5693
KW
5974 U8 * utf8_p;
5975 if (do_utf8) {
5976 utf8_p = (U8 *) p;
5977 } else {
5978 STRLEN len = 1;
5979 utf8_p = bytes_to_utf8(p, &len);
5980 }
5981 if (swash_fetch(sw, utf8_p, 1))
ffc61ed2
JH
5982 match = TRUE;
5983 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
5984 if (!match && lenp && av) {
5985 I32 i;
9e55ce06 5986 for (i = 0; i <= av_len(av); i++) {
890ce7af 5987 SV* const sv = *av_fetch(av, i, FALSE);
9e55ce06 5988 STRLEN len;
890ce7af 5989 const char * const s = SvPV_const(sv, len);
3f0c5693 5990 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
9e55ce06
JH
5991 *lenp = len;
5992 match = TRUE;
5993 break;
5994 }
5995 }
5996 }
5997 if (!match) {
89ebb4a3 5998 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43 5999
3f0c5693
KW
6000 STRLEN tmplen;
6001 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6002 if (swash_fetch(sw, tmpbuf, 1))
9e55ce06
JH
6003 match = TRUE;
6004 }
ffc61ed2 6005 }
b3a04dd3
KW
6006
6007 /* If we allocated a string above, free it */
6008 if (! do_utf8) Safefree(utf8_p);
ffc61ed2 6009 }
bbce6d69 6010 }
9e55ce06 6011 if (match && lenp && *lenp == 0)
0f0076b4 6012 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 6013 }
1aa99e6b 6014 if (!match && c < 256) {
ffc61ed2
JH
6015 if (ANYOF_BITMAP_TEST(n, c))
6016 match = TRUE;
6017 else if (flags & ANYOF_FOLD) {
eb160463 6018 U8 f;
a0ed51b3 6019
ffc61ed2
JH
6020 if (flags & ANYOF_LOCALE) {
6021 PL_reg_flags |= RF_tainted;
6022 f = PL_fold_locale[c];
6023 }
6024 else
6025 f = PL_fold[c];
6026 if (f != c && ANYOF_BITMAP_TEST(n, f))
6027 match = TRUE;
6028 }
6029
6030 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 6031 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
6032 if (
6033 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6034 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6035 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6036 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6037 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6038 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6039 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6040 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6041 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6042 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6043 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6044 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6045 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6046 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6047 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6048 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6049 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6050 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6051 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6052 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6053 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6054 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6055 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6056 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6057 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6058 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6059 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6060 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6061 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6062 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6063 ) /* How's that for a conditional? */
6064 {
6065 match = TRUE;
6066 }
a0ed51b3 6067 }
a0ed51b3
LW
6068 }
6069
a0ed51b3
LW
6070 return (flags & ANYOF_INVERT) ? !match : match;
6071}
161b471a 6072
dfe13c55 6073STATIC U8 *
0ce71af7 6074S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 6075{
97aff369 6076 dVAR;
7918f24d
NC
6077
6078 PERL_ARGS_ASSERT_REGHOP3;
6079
a0ed51b3 6080 if (off >= 0) {
1aa99e6b 6081 while (off-- && s < lim) {
ffc61ed2 6082 /* XXX could check well-formedness here */
a0ed51b3 6083 s += UTF8SKIP(s);
ffc61ed2 6084 }
a0ed51b3
LW
6085 }
6086 else {
1de06328
YO
6087 while (off++ && s > lim) {
6088 s--;
6089 if (UTF8_IS_CONTINUED(*s)) {
6090 while (s > lim && UTF8_IS_CONTINUATION(*s))
6091 s--;
a0ed51b3 6092 }
1de06328 6093 /* XXX could check well-formedness here */
a0ed51b3
LW
6094 }
6095 }
6096 return s;
6097}
161b471a 6098
f9f4320a
YO
6099#ifdef XXX_dmq
6100/* there are a bunch of places where we use two reghop3's that should
6101 be replaced with this routine. but since thats not done yet
6102 we ifdef it out - dmq
6103*/
dfe13c55 6104STATIC U8 *
1de06328
YO
6105S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6106{
6107 dVAR;
7918f24d
NC
6108
6109 PERL_ARGS_ASSERT_REGHOP4;
6110
1de06328
YO
6111 if (off >= 0) {
6112 while (off-- && s < rlim) {
6113 /* XXX could check well-formedness here */
6114 s += UTF8SKIP(s);
6115 }
6116 }
6117 else {
6118 while (off++ && s > llim) {
6119 s--;
6120 if (UTF8_IS_CONTINUED(*s)) {
6121 while (s > llim && UTF8_IS_CONTINUATION(*s))
6122 s--;
6123 }
6124 /* XXX could check well-formedness here */
6125 }
6126 }
6127 return s;
6128}
f9f4320a 6129#endif
1de06328
YO
6130
6131STATIC U8 *
0ce71af7 6132S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 6133{
97aff369 6134 dVAR;
7918f24d
NC
6135
6136 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6137
a0ed51b3 6138 if (off >= 0) {
1aa99e6b 6139 while (off-- && s < lim) {
ffc61ed2 6140 /* XXX could check well-formedness here */
a0ed51b3 6141 s += UTF8SKIP(s);
ffc61ed2 6142 }
a0ed51b3 6143 if (off >= 0)
3dab1dad 6144 return NULL;
a0ed51b3
LW
6145 }
6146 else {
1de06328
YO
6147 while (off++ && s > lim) {
6148 s--;
6149 if (UTF8_IS_CONTINUED(*s)) {
6150 while (s > lim && UTF8_IS_CONTINUATION(*s))
6151 s--;
a0ed51b3 6152 }
1de06328 6153 /* XXX could check well-formedness here */
a0ed51b3
LW
6154 }
6155 if (off <= 0)
3dab1dad 6156 return NULL;
a0ed51b3
LW
6157 }
6158 return s;
6159}
51371543 6160
51371543 6161static void
acfe0abc 6162restore_pos(pTHX_ void *arg)
51371543 6163{
97aff369 6164 dVAR;
097eb12c 6165 regexp * const rex = (regexp *)arg;
51371543
GS
6166 if (PL_reg_eval_set) {
6167 if (PL_reg_oldsaved) {
4f639d21
DM
6168 rex->subbeg = PL_reg_oldsaved;
6169 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 6170#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 6171 rex->saved_copy = PL_nrs;
ed252734 6172#endif
07bc277f 6173 RXp_MATCH_COPIED_on(rex);
51371543
GS
6174 }
6175 PL_reg_magic->mg_len = PL_reg_oldpos;
6176 PL_reg_eval_set = 0;
6177 PL_curpm = PL_reg_oldcurpm;
6178 }
6179}
33b8afdf
JH
6180
6181STATIC void
6182S_to_utf8_substr(pTHX_ register regexp *prog)
6183{
a1cac82e 6184 int i = 1;
7918f24d
NC
6185
6186 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6187
a1cac82e
NC
6188 do {
6189 if (prog->substrs->data[i].substr
6190 && !prog->substrs->data[i].utf8_substr) {
6191 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6192 prog->substrs->data[i].utf8_substr = sv;
6193 sv_utf8_upgrade(sv);
610460f9
NC
6194 if (SvVALID(prog->substrs->data[i].substr)) {
6195 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6196 if (flags & FBMcf_TAIL) {
6197 /* Trim the trailing \n that fbm_compile added last
6198 time. */
6199 SvCUR_set(sv, SvCUR(sv) - 1);
6200 /* Whilst this makes the SV technically "invalid" (as its
6201 buffer is no longer followed by "\0") when fbm_compile()
6202 adds the "\n" back, a "\0" is restored. */
6203 }
6204 fbm_compile(sv, flags);
6205 }
a1cac82e
NC
6206 if (prog->substrs->data[i].substr == prog->check_substr)
6207 prog->check_utf8 = sv;
6208 }
6209 } while (i--);
33b8afdf
JH
6210}
6211
6212STATIC void
6213S_to_byte_substr(pTHX_ register regexp *prog)
6214{
97aff369 6215 dVAR;
a1cac82e 6216 int i = 1;
7918f24d
NC
6217
6218 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6219
a1cac82e
NC
6220 do {
6221 if (prog->substrs->data[i].utf8_substr
6222 && !prog->substrs->data[i].substr) {
6223 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6224 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9
NC
6225 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6226 const U8 flags
6227 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6228 if (flags & FBMcf_TAIL) {
6229 /* Trim the trailing \n that fbm_compile added last
6230 time. */
6231 SvCUR_set(sv, SvCUR(sv) - 1);
6232 }
6233 fbm_compile(sv, flags);
6234 }
a1cac82e
NC
6235 } else {
6236 SvREFCNT_dec(sv);
6237 sv = &PL_sv_undef;
6238 }
6239 prog->substrs->data[i].substr = sv;
6240 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6241 prog->check_substr = sv;
33b8afdf 6242 }
a1cac82e 6243 } while (i--);
33b8afdf 6244}
66610fdd
RGS
6245
6246/*
6247 * Local variables:
6248 * c-indentation-style: bsd
6249 * c-basic-offset: 4
6250 * indent-tabs-mode: t
6251 * End:
6252 *
37442d52
RGS
6253 * ex: set ts=8 sts=4 sw=4 noet:
6254 */