This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add &CORE::undef
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
125#ifdef EBCDIC
126 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
127 * skip the check on EBCDIC platforms */
128# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
129#else
130# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
131 if (!CAT2(PL_utf8_,class)) { \
132 bool ok; \
133 ENTER; save_re_context(); \
134 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 135 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 136#endif
37e2e78e
KW
137
138/* Doesn't do an assert to verify that is correct */
139#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 140 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 141 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
142 ENTER; save_re_context(); \
143 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
144 LEAVE; } } STMT_END
37e2e78e 145
1a4fad37
AL
146#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
147#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
148#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 149
37e2e78e 150#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
151 LOAD_UTF8_CHARCLASS(X_begin, " "); \
152 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
153 /* These are utf8 constants, and not utf-ebcdic constants, so the \
154 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 155 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
156 \
157 /* No asserts are done for these, in case called on an early \
158 * Unicode version in which they map to nothing */ \
d275fa5e
CB
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
163 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
164 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 165 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 166
1dcf4a1b 167#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 168
ee9a90b8
KW
169/* The actual code for CCC_TRY, which uses several variables from the routine
170 * it's callable from. It is designed to be the bulk of a case statement.
171 * FUNC is the macro or function to call on non-utf8 targets that indicate if
172 * nextchr matches the class.
173 * UTF8_TEST is the whole test string to use for utf8 targets
174 * LOAD is what to use to test, and if not present to load in the swash for the
175 * class
176 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
177 * UTF8_TEST test.
178 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
179 * utf8 and a variant, load the swash if necessary and test using the utf8
180 * test. Advance to the next character if test is ok, otherwise fail; If not
181 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
182 * fails, or advance to the next character */
183
184#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
185 if (locinput >= PL_regeol) { \
186 sayNO; \
187 } \
188 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
189 LOAD_UTF8_CHARCLASS(CLASS, STR); \
190 if (POS_OR_NEG (UTF8_TEST)) { \
191 sayNO; \
192 } \
193 locinput += PL_utf8skip[nextchr]; \
194 nextchr = UCHARAT(locinput); \
195 break; \
196 } \
197 if (POS_OR_NEG (FUNC(nextchr))) { \
198 sayNO; \
199 } \
200 nextchr = UCHARAT(++locinput); \
980866de
KW
201 break;
202
ee9a90b8
KW
203/* Handle the non-locale cases for a character class and its complement. It
204 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
205 * This is because that code fails when the test succeeds, so we want to have
206 * the test fail so that the code succeeds. The swash is stored in a
207 * predictable PL_ place */
cfaf538b
KW
208#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
209 CLASS, STR) \
ee9a90b8
KW
210 case NAME: \
211 _CCC_TRY_CODE( !, FUNC, \
212 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
213 (U8*)locinput, TRUE)), \
214 CLASS, STR) \
215 case NNAME: \
1dcf4a1b 216 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
217 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
218 (U8*)locinput, TRUE)), \
219 CLASS, STR) \
220
221/* Generate the case statements for both locale and non-locale character
222 * classes in regmatch for classes that don't have special unicode semantics.
223 * Locales don't use an immediate swash, but an intermediary special locale
224 * function that is called on the pointer to the current place in the input
225 * string. That function will resolve to needing the same swash. One might
226 * think that because we don't know what the locale will match, we shouldn't
227 * check with the swash loading function that it loaded properly; ie, that we
228 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
229 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
230 * irrelevant here */
231#define CCC_TRY(NAME, NNAME, FUNC, \
232 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 233 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
234 CLASS, STR) \
235 case NAMEL: \
236 PL_reg_flags |= RF_tainted; \
237 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
238 case NNAMEL: \
239 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
240 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
241 CLASS, STR) \
cfaf538b
KW
242 case NAMEA: \
243 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
244 sayNO; \
245 } \
246 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
247 nextchr = UCHARAT(++locinput); \
248 break; \
249 case NNAMEA: \
250 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
251 sayNO; \
252 } \
253 if (utf8_target) { \
254 locinput += PL_utf8skip[nextchr]; \
255 nextchr = UCHARAT(locinput); \
256 } \
257 else { \
258 nextchr = UCHARAT(++locinput); \
259 } \
260 break; \
ee9a90b8
KW
261 /* Generate the non-locale cases */ \
262 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
263
264/* This is like CCC_TRY, but has an extra set of parameters for generating case
265 * statements to handle separate Unicode semantics nodes */
266#define CCC_TRY_U(NAME, NNAME, FUNC, \
267 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
268 NAMEU, NNAMEU, FUNCU, \
cfaf538b 269 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 270 CLASS, STR) \
cfaf538b
KW
271 CCC_TRY(NAME, NNAME, FUNC, \
272 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
273 NAMEA, NNAMEA, FUNCA, \
274 CLASS, STR) \
ee9a90b8 275 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 276
3dab1dad
YO
277/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
278
5f80c4cf 279/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
280/* it would be nice to rework regcomp.sym to generate this stuff. sigh
281 *
282 * NOTE that *nothing* that affects backtracking should be in here, specifically
283 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
284 * node that is in between two EXACT like nodes when ascertaining what the required
285 * "follow" character is. This should probably be moved to regex compile time
286 * although it may be done at run time beause of the REF possibility - more
287 * investigation required. -- demerphq
288*/
3e901dc0
YO
289#define JUMPABLE(rn) ( \
290 OP(rn) == OPEN || \
291 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
292 OP(rn) == EVAL || \
cca55fe3
JP
293 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
294 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 295 OP(rn) == KEEPS || \
3dab1dad 296 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 297)
ee9b8eae 298#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 299
ee9b8eae
YO
300#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
301
302#if 0
303/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
304 we don't need this definition. */
305#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 306#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
307#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
308
309#else
310/* ... so we use this as its faster. */
311#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 312#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
313#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
314#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
315
316#endif
e2d8ce26 317
a84d97b6
HS
318/*
319 Search for mandatory following text node; for lookahead, the text must
320 follow but for lookbehind (rn->flags != 0) we skip to the next step.
321*/
cca55fe3 322#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
323 while (JUMPABLE(rn)) { \
324 const OPCODE type = OP(rn); \
325 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 326 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 327 else if (type == PLUS) \
cca55fe3 328 rn = NEXTOPER(rn); \
3dab1dad 329 else if (type == IFMATCH) \
a84d97b6 330 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 331 else rn += NEXT_OFF(rn); \
3dab1dad 332 } \
5f80c4cf 333} STMT_END
74750237 334
c476f425 335
acfe0abc 336static void restore_pos(pTHX_ void *arg);
51371543 337
620d5b66
NC
338#define REGCP_PAREN_ELEMS 4
339#define REGCP_OTHER_ELEMS 5
e0fa7e2b 340#define REGCP_FRAME_ELEMS 1
620d5b66
NC
341/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
342 * are needed for the regexp context stack bookkeeping. */
343
76e3520e 344STATIC CHECKPOINT
cea2e8a9 345S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 346{
97aff369 347 dVAR;
a3b680e6 348 const int retval = PL_savestack_ix;
a3b680e6 349 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
350 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
351 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 352 int p;
40a82448 353 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 354
e49a9654 355 if (paren_elems_to_push < 0)
5637ef5b
NC
356 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
357 paren_elems_to_push);
e49a9654 358
e0fa7e2b
NC
359 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
360 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
361 " out of range (%lu-%ld)",
362 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 363
620d5b66 364 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 365
3280af22 366 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
368 SSPUSHINT(PL_regoffs[p].end);
369 SSPUSHINT(PL_regoffs[p].start);
3280af22 370 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 371 SSPUSHINT(p);
e7707071 372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 373 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 374 (UV)p, (IV)PL_regoffs[p].start,
40a82448 375 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 376 (IV)PL_regoffs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 380 SSPUSHPTR(PL_regoffs);
3280af22
NIS
381 SSPUSHINT(PL_regsize);
382 SSPUSHINT(*PL_reglastparen);
a01268b5 383 SSPUSHINT(*PL_reglastcloseparen);
3280af22 384 SSPUSHPTR(PL_reginput);
e0fa7e2b 385 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 386
a0d0e21e
LW
387 return retval;
388}
389
c277df42 390/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
391#define REGCP_SET(cp) \
392 DEBUG_STATE_r( \
ab3bbdeb 393 PerlIO_printf(Perl_debug_log, \
e4f74956 394 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
395 (IV)PL_savestack_ix)); \
396 cp = PL_savestack_ix
c3464db5 397
ab3bbdeb 398#define REGCP_UNWIND(cp) \
e4f74956 399 DEBUG_STATE_r( \
ab3bbdeb 400 if (cp != PL_savestack_ix) \
e4f74956
YO
401 PerlIO_printf(Perl_debug_log, \
402 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
403 (IV)(cp), (IV)PL_savestack_ix)); \
404 regcpblow(cp)
c277df42 405
76e3520e 406STATIC char *
097eb12c 407S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 408{
97aff369 409 dVAR;
e0fa7e2b 410 UV i;
a0d0e21e 411 char *input;
a3621e74
YO
412 GET_RE_DEBUG_FLAGS_DECL;
413
7918f24d
NC
414 PERL_ARGS_ASSERT_REGCPPOP;
415
b1ce53c5 416 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 417 i = SSPOPUV;
e0fa7e2b
NC
418 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
419 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 420 input = (char *) SSPOPPTR;
a01268b5 421 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
422 *PL_reglastparen = SSPOPINT;
423 PL_regsize = SSPOPINT;
f0ab9afb 424 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 425
620d5b66 426 i -= REGCP_OTHER_ELEMS;
b1ce53c5 427 /* Now restore the parentheses context. */
620d5b66 428 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 429 I32 tmps;
097eb12c 430 U32 paren = (U32)SSPOPINT;
3280af22 431 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 432 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 433 tmps = SSPOPINT;
3280af22 434 if (paren <= *PL_reglastparen)
f0ab9afb 435 PL_regoffs[paren].end = tmps;
e7707071 436 DEBUG_BUFFERS_r(
c3464db5 437 PerlIO_printf(Perl_debug_log,
b900a521 438 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 439 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 440 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 441 (IV)PL_regoffs[paren].end,
3280af22 442 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 443 );
a0d0e21e 444 }
e7707071 445 DEBUG_BUFFERS_r(
bb7a0f54 446 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 447 PerlIO_printf(Perl_debug_log,
faccc32b 448 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 449 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
450 }
451 );
daf18116 452#if 1
dafc8851
JH
453 /* It would seem that the similar code in regtry()
454 * already takes care of this, and in fact it is in
455 * a better location to since this code can #if 0-ed out
456 * but the code in regtry() is needed or otherwise tests
457 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
458 * (as of patchlevel 7877) will fail. Then again,
459 * this code seems to be necessary or otherwise
225593e1
DM
460 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
461 * --jhi updated by dapm */
3b6647e0 462 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 463 if (i > PL_regsize)
f0ab9afb
NC
464 PL_regoffs[i].start = -1;
465 PL_regoffs[i].end = -1;
a0d0e21e 466 }
dafc8851 467#endif
a0d0e21e
LW
468 return input;
469}
470
02db2b7b 471#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 472
a687059c 473/*
e50aee73 474 * pregexec and friends
a687059c
LW
475 */
476
76234dfb 477#ifndef PERL_IN_XSUB_RE
a687059c 478/*
c277df42 479 - pregexec - match a regexp against a string
a687059c 480 */
c277df42 481I32
49d7dfbc 482Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 483 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
484/* strend: pointer to null at end of string */
485/* strbeg: real beginning of string */
486/* minend: end of match must be >=minend after stringarg. */
487/* nosave: For optimizations. */
488{
7918f24d
NC
489 PERL_ARGS_ASSERT_PREGEXEC;
490
c277df42 491 return
9041c2e3 492 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
493 nosave ? 0 : REXEC_COPY_STR);
494}
76234dfb 495#endif
22e551b9 496
9041c2e3 497/*
cad2e5aa
JH
498 * Need to implement the following flags for reg_anch:
499 *
500 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
501 * USE_INTUIT_ML
502 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
503 * INTUIT_AUTORITATIVE_ML
504 * INTUIT_ONCE_NOML - Intuit can match in one location only.
505 * INTUIT_ONCE_ML
506 *
507 * Another flag for this function: SECOND_TIME (so that float substrs
508 * with giant delta may be not rechecked).
509 */
510
511/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
512
3f7c398e 513/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
514 Otherwise, only SvCUR(sv) is used to get strbeg. */
515
516/* XXXX We assume that strpos is strbeg unless sv. */
517
6eb5f6b9
JH
518/* XXXX Some places assume that there is a fixed substring.
519 An update may be needed if optimizer marks as "INTUITable"
520 RExen without fixed substrings. Similarly, it is assumed that
521 lengths of all the strings are no more than minlen, thus they
522 cannot come from lookahead.
40d049e4
YO
523 (Or minlen should take into account lookahead.)
524 NOTE: Some of this comment is not correct. minlen does now take account
525 of lookahead/behind. Further research is required. -- demerphq
526
527*/
6eb5f6b9 528
2c2d71f5
JH
529/* A failure to find a constant substring means that there is no need to make
530 an expensive call to REx engine, thus we celebrate a failure. Similarly,
531 finding a substring too deep into the string means that less calls to
30944b6d
IZ
532 regtry() should be needed.
533
534 REx compiler's optimizer found 4 possible hints:
535 a) Anchored substring;
536 b) Fixed substring;
537 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 538 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 539 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
540 string which does not contradict any of them.
541 */
2c2d71f5 542
6eb5f6b9
JH
543/* Most of decisions we do here should have been done at compile time.
544 The nodes of the REx which we used for the search should have been
545 deleted from the finite automaton. */
546
cad2e5aa 547char *
288b8c02 548Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 549 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 550{
97aff369 551 dVAR;
288b8c02 552 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 553 register I32 start_shift = 0;
cad2e5aa 554 /* Should be nonnegative! */
b7953727 555 register I32 end_shift = 0;
2c2d71f5
JH
556 register char *s;
557 register SV *check;
a1933d95 558 char *strbeg;
cad2e5aa 559 char *t;
f2ed9b32 560 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 561 I32 ml_anch;
bd61b366
SS
562 register char *other_last = NULL; /* other substr checked before this */
563 char *check_at = NULL; /* check substr found at this pos */
bbe252da 564 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 565 RXi_GET_DECL(prog,progi);
30944b6d 566#ifdef DEBUGGING
890ce7af 567 const char * const i_strpos = strpos;
30944b6d 568#endif
a3621e74
YO
569 GET_RE_DEBUG_FLAGS_DECL;
570
7918f24d
NC
571 PERL_ARGS_ASSERT_RE_INTUIT_START;
572
f2ed9b32 573 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 574
3c8556c3 575 if (RX_UTF8(rx)) {
b8d68ded
JH
576 PL_reg_flags |= RF_utf8;
577 }
ab3bbdeb 578 DEBUG_EXECUTE_r(
f2ed9b32 579 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
580 sv ? "Guessing start of match in sv for"
581 : "Guessing start of match in string for");
2a782b5b 582 );
cad2e5aa 583
c344f387
JH
584 /* CHR_DIST() would be more correct here but it makes things slow. */
585 if (prog->minlen > strend - strpos) {
a3621e74 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 587 "String too short... [re_intuit_start]\n"));
cad2e5aa 588 goto fail;
2c2d71f5 589 }
1de06328 590
a1933d95 591 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 592 PL_regeol = strend;
f2ed9b32 593 if (utf8_target) {
33b8afdf
JH
594 if (!prog->check_utf8 && prog->check_substr)
595 to_utf8_substr(prog);
596 check = prog->check_utf8;
597 } else {
598 if (!prog->check_substr && prog->check_utf8)
599 to_byte_substr(prog);
600 check = prog->check_substr;
601 }
1de06328 602 if (check == &PL_sv_undef) {
a3621e74 603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 604 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
605 goto fail;
606 }
bbe252da
YO
607 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
608 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
609 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 610 && !multiline ) ); /* Check after \n? */
cad2e5aa 611
7e25d62c 612 if (!ml_anch) {
bbe252da
YO
613 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
614 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 615 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
616 && sv && !SvROK(sv)
617 && (strpos != strbeg)) {
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
619 goto fail;
620 }
621 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 622 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 623 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
624 I32 slen;
625
1aa99e6b 626 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 627
653099ff
GS
628 if (SvTAIL(check)) {
629 slen = SvCUR(check); /* >= 1 */
cad2e5aa 630
9041c2e3 631 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 632 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 634 goto fail_finish;
cad2e5aa
JH
635 }
636 /* Now should match s[0..slen-2] */
637 slen--;
3f7c398e 638 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 639 || (slen > 1
3f7c398e 640 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 641 report_neq:
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
643 goto fail_finish;
644 }
cad2e5aa 645 }
3f7c398e 646 else if (*SvPVX_const(check) != *s
653099ff 647 || ((slen = SvCUR(check)) > 1
3f7c398e 648 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 649 goto report_neq;
c315bfe8 650 check_at = s;
2c2d71f5 651 goto success_at_start;
7e25d62c 652 }
cad2e5aa 653 }
2c2d71f5 654 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 655 s = strpos;
2c2d71f5 656 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
657 end_shift = prog->check_end_shift;
658
2c2d71f5 659 if (!ml_anch) {
a3b680e6 660 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 661 - (SvTAIL(check) != 0);
a3b680e6 662 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
663
664 if (end_shift < eshift)
665 end_shift = eshift;
666 }
cad2e5aa 667 }
2c2d71f5 668 else { /* Can match at random position */
cad2e5aa
JH
669 ml_anch = 0;
670 s = strpos;
1de06328
YO
671 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
672 end_shift = prog->check_end_shift;
673
674 /* end shift should be non negative here */
cad2e5aa
JH
675 }
676
bcdf7404 677#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 678 if (end_shift < 0)
1de06328 679 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 680 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
681#endif
682
2c2d71f5
JH
683 restart:
684 /* Find a possible match in the region s..strend by looking for
685 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
686
687 {
688 I32 srch_start_shift = start_shift;
689 I32 srch_end_shift = end_shift;
690 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
691 srch_end_shift -= ((strbeg - s) - srch_start_shift);
692 srch_start_shift = strbeg - s;
693 }
6bda09f9 694 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
695 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
696 (IV)prog->check_offset_min,
697 (IV)srch_start_shift,
698 (IV)srch_end_shift,
699 (IV)prog->check_end_shift);
700 });
701
f1905e1b 702 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
cad2e5aa 703 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 704 I32 * const pp = data ? data->scream_pos : &p;
4185c919 705 const MAGIC *mg;
72de20cd 706 bool found = FALSE;
cad2e5aa 707
4185c919
NC
708 assert(SvMAGICAL(sv));
709 mg = mg_find(sv, PERL_MAGIC_study);
710 assert(mg);
711
72de20cd
NC
712 if (mg->mg_private == 1) {
713 found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
714 } else if (mg->mg_private == 2) {
715 found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
716 } else {
717 assert (mg->mg_private == 4);
718 found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
719 }
720
721 if (found
2c2d71f5 722 || ( BmRARE(check) == '\n'
85c508c3 723 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 724 && SvTAIL(check) ))
9041c2e3 725 s = screaminstr(sv, check,
1de06328 726 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 727 else
2c2d71f5 728 goto fail_finish;
4addbd3b 729 /* we may be pointing at the wrong string */
07bc277f 730 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 731 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
732 if (data)
733 *data->scream_olds = s;
734 }
1de06328
YO
735 else {
736 U8* start_point;
737 U8* end_point;
bbe252da 738 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
739 start_point= (U8*)(s + srch_start_shift);
740 end_point= (U8*)(strend - srch_end_shift);
741 } else {
742 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
743 end_point= HOP3(strend, -srch_end_shift, strbeg);
744 }
6bda09f9 745 DEBUG_OPTIMISE_MORE_r({
56570a2c 746 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 747 (int)(end_point - start_point),
fc8cd66c 748 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
749 start_point);
750 });
751
752 s = fbm_instr( start_point, end_point,
7fba1cd6 753 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
754 }
755 }
cad2e5aa
JH
756 /* Update the count-of-usability, remove useless subpatterns,
757 unshift s. */
2c2d71f5 758
ab3bbdeb 759 DEBUG_EXECUTE_r({
f2ed9b32 760 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
761 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
762 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 763 (s ? "Found" : "Did not find"),
f2ed9b32 764 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
765 ? "anchored" : "floating"),
766 quoted,
767 RE_SV_TAIL(check),
768 (s ? " at offset " : "...\n") );
769 });
2c2d71f5
JH
770
771 if (!s)
772 goto fail_finish;
2c2d71f5 773 /* Finish the diagnostic message */
a3621e74 774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 775
1de06328
YO
776 /* XXX dmq: first branch is for positive lookbehind...
777 Our check string is offset from the beginning of the pattern.
778 So we need to do any stclass tests offset forward from that
779 point. I think. :-(
780 */
781
782
783
784 check_at=s;
785
786
2c2d71f5
JH
787 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
788 Start with the other substr.
789 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 790 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
791 *always* match. Probably should be marked during compile...
792 Probably it is right to do no SCREAM here...
793 */
794
f2ed9b32 795 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
796 : (prog->float_substr && prog->anchored_substr))
797 {
30944b6d 798 /* Take into account the "other" substring. */
2c2d71f5
JH
799 /* XXXX May be hopelessly wrong for UTF... */
800 if (!other_last)
6eb5f6b9 801 other_last = strpos;
f2ed9b32 802 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
803 do_other_anchored:
804 {
890ce7af
AL
805 char * const last = HOP3c(s, -start_shift, strbeg);
806 char *last1, *last2;
be8e71aa 807 char * const saved_s = s;
33b8afdf 808 SV* must;
2c2d71f5 809
2c2d71f5
JH
810 t = s - prog->check_offset_max;
811 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 812 && (!utf8_target
0ce71af7 813 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 814 && t > strpos)))
6f207bd3 815 NOOP;
2c2d71f5
JH
816 else
817 t = strpos;
1aa99e6b 818 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
819 if (t < other_last) /* These positions already checked */
820 t = other_last;
1aa99e6b 821 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
822 if (last < last1)
823 last1 = last;
1de06328
YO
824 /* XXXX It is not documented what units *_offsets are in.
825 We assume bytes, but this is clearly wrong.
826 Meaning this code needs to be carefully reviewed for errors.
827 dmq.
828 */
829
2c2d71f5 830 /* On end-of-str: see comment below. */
f2ed9b32 831 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
832 if (must == &PL_sv_undef) {
833 s = (char*)NULL;
1de06328 834 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
835 }
836 else
837 s = fbm_instr(
838 (unsigned char*)t,
839 HOP3(HOP3(last1, prog->anchored_offset, strend)
840 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
841 must,
7fba1cd6 842 multiline ? FBMrf_MULTILINE : 0
33b8afdf 843 );
ab3bbdeb 844 DEBUG_EXECUTE_r({
f2ed9b32 845 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
846 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
847 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 848 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
849 quoted, RE_SV_TAIL(must));
850 });
851
852
2c2d71f5
JH
853 if (!s) {
854 if (last1 >= last2) {
a3621e74 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
856 ", giving up...\n"));
857 goto fail_finish;
858 }
a3621e74 859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 860 ", trying floating at offset %ld...\n",
be8e71aa 861 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
862 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
863 s = HOP3c(last, 1, strend);
2c2d71f5
JH
864 goto restart;
865 }
866 else {
a3621e74 867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 868 (long)(s - i_strpos)));
1aa99e6b
IH
869 t = HOP3c(s, -prog->anchored_offset, strbeg);
870 other_last = HOP3c(s, 1, strend);
be8e71aa 871 s = saved_s;
2c2d71f5
JH
872 if (t == strpos)
873 goto try_at_start;
2c2d71f5
JH
874 goto try_at_offset;
875 }
30944b6d 876 }
2c2d71f5
JH
877 }
878 else { /* Take into account the floating substring. */
33b8afdf 879 char *last, *last1;
be8e71aa 880 char * const saved_s = s;
33b8afdf
JH
881 SV* must;
882
883 t = HOP3c(s, -start_shift, strbeg);
884 last1 = last =
885 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
886 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
887 last = HOP3c(t, prog->float_max_offset, strend);
888 s = HOP3c(t, prog->float_min_offset, strend);
889 if (s < other_last)
890 s = other_last;
2c2d71f5 891 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 892 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
893 /* fbm_instr() takes into account exact value of end-of-str
894 if the check is SvTAIL(ed). Since false positives are OK,
895 and end-of-str is not later than strend we are OK. */
896 if (must == &PL_sv_undef) {
897 s = (char*)NULL;
1de06328 898 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
899 }
900 else
2c2d71f5 901 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
902 (unsigned char*)last + SvCUR(must)
903 - (SvTAIL(must)!=0),
7fba1cd6 904 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 905 DEBUG_EXECUTE_r({
f2ed9b32 906 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
907 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
908 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 909 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
910 quoted, RE_SV_TAIL(must));
911 });
33b8afdf
JH
912 if (!s) {
913 if (last1 == last) {
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
915 ", giving up...\n"));
916 goto fail_finish;
2c2d71f5 917 }
a3621e74 918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 919 ", trying anchored starting at offset %ld...\n",
be8e71aa 920 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
921 other_last = last;
922 s = HOP3c(t, 1, strend);
923 goto restart;
924 }
925 else {
a3621e74 926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
927 (long)(s - i_strpos)));
928 other_last = s; /* Fix this later. --Hugo */
be8e71aa 929 s = saved_s;
33b8afdf
JH
930 if (t == strpos)
931 goto try_at_start;
932 goto try_at_offset;
933 }
2c2d71f5 934 }
cad2e5aa 935 }
2c2d71f5 936
1de06328 937
9ef43ace 938 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 939
6bda09f9 940 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
941 PerlIO_printf(Perl_debug_log,
942 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
943 (IV)prog->check_offset_min,
944 (IV)prog->check_offset_max,
945 (IV)(s-strpos),
946 (IV)(t-strpos),
947 (IV)(t-s),
948 (IV)(strend-strpos)
949 )
950 );
951
2c2d71f5 952 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 953 && (!utf8_target
9ef43ace 954 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
955 && t > strpos)))
956 {
2c2d71f5
JH
957 /* Fixed substring is found far enough so that the match
958 cannot start at strpos. */
959 try_at_offset:
cad2e5aa 960 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
961 /* Eventually fbm_*() should handle this, but often
962 anchored_offset is not 0, so this check will not be wasted. */
963 /* XXXX In the code below we prefer to look for "^" even in
964 presence of anchored substrings. And we search even
965 beyond the found float position. These pessimizations
966 are historical artefacts only. */
967 find_anchor:
2c2d71f5 968 while (t < strend - prog->minlen) {
cad2e5aa 969 if (*t == '\n') {
4ee3650e 970 if (t < check_at - prog->check_offset_min) {
f2ed9b32 971 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
972 /* Since we moved from the found position,
973 we definitely contradict the found anchored
30944b6d
IZ
974 substr. Due to the above check we do not
975 contradict "check" substr.
976 Thus we can arrive here only if check substr
977 is float. Redo checking for "other"=="fixed".
978 */
9041c2e3 979 strpos = t + 1;
a3621e74 980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 981 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
982 goto do_other_anchored;
983 }
4ee3650e
GS
984 /* We don't contradict the found floating substring. */
985 /* XXXX Why not check for STCLASS? */
cad2e5aa 986 s = t + 1;
a3621e74 987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 988 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
989 goto set_useful;
990 }
4ee3650e
GS
991 /* Position contradicts check-string */
992 /* XXXX probably better to look for check-string
993 than for "\n", so one should lower the limit for t? */
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 996 other_last = strpos = s = t + 1;
cad2e5aa
JH
997 goto restart;
998 }
999 t++;
1000 }
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1002 PL_colors[0], PL_colors[1]));
2c2d71f5 1003 goto fail_finish;
cad2e5aa 1004 }
f5952150 1005 else {
a3621e74 1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1007 PL_colors[0], PL_colors[1]));
f5952150 1008 }
cad2e5aa
JH
1009 s = t;
1010 set_useful:
f2ed9b32 1011 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1012 }
1013 else {
f5952150 1014 /* The found string does not prohibit matching at strpos,
2c2d71f5 1015 - no optimization of calling REx engine can be performed,
f5952150
GS
1016 unless it was an MBOL and we are not after MBOL,
1017 or a future STCLASS check will fail this. */
2c2d71f5
JH
1018 try_at_start:
1019 /* Even in this situation we may use MBOL flag if strpos is offset
1020 wrt the start of the string. */
05b4157f 1021 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1022 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1023 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1024 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1025 {
cad2e5aa
JH
1026 t = strpos;
1027 goto find_anchor;
1028 }
a3621e74 1029 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1030 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1031 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1032 );
2c2d71f5 1033 success_at_start:
bbe252da 1034 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1035 && (utf8_target ? (
33b8afdf
JH
1036 prog->check_utf8 /* Could be deleted already */
1037 && --BmUSEFUL(prog->check_utf8) < 0
1038 && (prog->check_utf8 == prog->float_utf8)
1039 ) : (
1040 prog->check_substr /* Could be deleted already */
1041 && --BmUSEFUL(prog->check_substr) < 0
1042 && (prog->check_substr == prog->float_substr)
1043 )))
66e933ab 1044 {
cad2e5aa 1045 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1046 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1047 /* XXX Does the destruction order has to change with utf8_target? */
1048 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1050 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1051 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1052 check = NULL; /* abort */
cad2e5aa 1053 s = strpos;
486ec47a 1054 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1055 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056 if (prog->intflags & PREGf_IMPLICIT)
1057 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1058 /* XXXX This is a remnant of the old implementation. It
1059 looks wasteful, since now INTUIT can use many
6eb5f6b9 1060 other heuristics. */
bbe252da 1061 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1062 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1063 }
1064 else
1065 s = strpos;
1066 }
1067
6eb5f6b9
JH
1068 /* Last resort... */
1069 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1070 /* trie stclasses are too expensive to use here, we are better off to
1071 leave it to regmatch itself */
f8fc2ecf 1072 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1073 /* minlen == 0 is possible if regstclass is \b or \B,
1074 and the fixed substr is ''$.
1075 Since minlen is already taken into account, s+1 is before strend;
1076 accidentally, minlen >= 1 guaranties no false positives at s + 1
1077 even for \b or \B. But (minlen? 1 : 0) below assumes that
1078 regstclass does not come from lookahead... */
1079 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1080 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1081 const U8* const str = (U8*)STRING(progi->regstclass);
1082 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1084 : 1);
1de06328
YO
1085 char * endpos;
1086 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088 else if (prog->float_substr || prog->float_utf8)
1089 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090 else
1091 endpos= strend;
1092
e8b06a55
YO
1093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1095
6eb5f6b9 1096 t = s;
e8b06a55
YO
1097 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1098 if (!s) {
6eb5f6b9 1099#ifdef DEBUGGING
cbbf8932 1100 const char *what = NULL;
6eb5f6b9
JH
1101#endif
1102 if (endpos == strend) {
a3621e74 1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1104 "Could not match STCLASS...\n") );
1105 goto fail;
1106 }
a3621e74 1107 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1108 "This position contradicts STCLASS...\n") );
bbe252da 1109 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1110 goto fail;
6eb5f6b9 1111 /* Contradict one of substrings */
33b8afdf 1112 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1113 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1114 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1115 hop_and_restart:
1aa99e6b 1116 s = HOP3c(t, 1, strend);
66e933ab
GS
1117 if (s + start_shift + end_shift > strend) {
1118 /* XXXX Should be taken into account earlier? */
a3621e74 1119 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1120 "Could not match STCLASS...\n") );
1121 goto fail;
1122 }
5e39e1e5
HS
1123 if (!check)
1124 goto giveup;
a3621e74 1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1126 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1127 what, (long)(s + start_shift - i_strpos)) );
1128 goto restart;
1129 }
66e933ab 1130 /* Have both, check_string is floating */
6eb5f6b9
JH
1131 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132 goto retry_floating_check;
1133 /* Recheck anchored substring, but not floating... */
9041c2e3 1134 s = check_at;
5e39e1e5
HS
1135 if (!check)
1136 goto giveup;
a3621e74 1137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1138 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1139 (long)(other_last - i_strpos)) );
1140 goto do_other_anchored;
1141 }
60e71179
GS
1142 /* Another way we could have checked stclass at the
1143 current position only: */
1144 if (ml_anch) {
1145 s = t = t + 1;
5e39e1e5
HS
1146 if (!check)
1147 goto giveup;
a3621e74 1148 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1149 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1150 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1151 goto try_at_offset;
66e933ab 1152 }
f2ed9b32 1153 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1154 goto fail;
486ec47a 1155 /* Check is floating substring. */
6eb5f6b9
JH
1156 retry_floating_check:
1157 t = check_at - start_shift;
a3621e74 1158 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1159 goto hop_and_restart;
1160 }
b7953727 1161 if (t != s) {
a3621e74 1162 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1163 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1164 (long)(t - i_strpos), (long)(s - i_strpos))
1165 );
1166 }
1167 else {
a3621e74 1168 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1169 "Does not contradict STCLASS...\n");
1170 );
1171 }
6eb5f6b9 1172 }
5e39e1e5 1173 giveup:
a3621e74 1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1175 PL_colors[4], (check ? "Guessed" : "Giving up"),
1176 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1177 return s;
2c2d71f5
JH
1178
1179 fail_finish: /* Substring not found */
33b8afdf 1180 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1181 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1182 fail:
a3621e74 1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1184 PL_colors[4], PL_colors[5]));
bd61b366 1185 return NULL;
cad2e5aa 1186}
9661b544 1187
a0a388a1
YO
1188#define DECL_TRIE_TYPE(scan) \
1189 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1190 trie_type = ((scan->flags == EXACT) \
1191 ? (utf8_target ? trie_utf8 : trie_plain) \
1192 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1193
1194#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1195uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1196 STRLEN skiplen; \
1197 switch (trie_type) { \
1198 case trie_utf8_fold: \
1199 if ( foldlen>0 ) { \
1200 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1201 foldlen -= len; \
1202 uscan += len; \
1203 len=0; \
1204 } else { \
1205 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1206 len = UTF8SKIP(uc); \
1207 skiplen = UNISKIP( uvc ); \
1208 foldlen -= skiplen; \
1209 uscan = foldbuf + skiplen; \
1210 } \
1211 break; \
1212 case trie_latin_utf8_fold: \
1213 if ( foldlen>0 ) { \
1214 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1215 foldlen -= len; \
1216 uscan += len; \
1217 len=0; \
1218 } else { \
1219 len = 1; \
1220 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1221 skiplen = UNISKIP( uvc ); \
1222 foldlen -= skiplen; \
1223 uscan = foldbuf + skiplen; \
1224 } \
1225 break; \
1226 case trie_utf8: \
1227 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1228 break; \
1229 case trie_plain: \
1230 uvc = (UV)*uc; \
1231 len = 1; \
1232 } \
1233 if (uvc < 256) { \
1234 charid = trie->charmap[ uvc ]; \
1235 } \
1236 else { \
1237 charid = 0; \
1238 if (widecharmap) { \
1239 SV** const svpp = hv_fetch(widecharmap, \
1240 (char*)&uvc, sizeof(UV), 0); \
1241 if (svpp) \
1242 charid = (U16)SvIV(*svpp); \
1243 } \
1244 } \
4cadc6a9
YO
1245} STMT_END
1246
4cadc6a9
YO
1247#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1248STMT_START { \
1249 while (s <= e) { \
1250 if ( (CoNd) \
fac1af77 1251 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1252 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1253 goto got_it; \
1254 s++; \
1255 } \
1256} STMT_END
1257
1258#define REXEC_FBC_UTF8_SCAN(CoDe) \
1259STMT_START { \
1260 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1261 CoDe \
1262 s += uskip; \
1263 } \
1264} STMT_END
1265
1266#define REXEC_FBC_SCAN(CoDe) \
1267STMT_START { \
1268 while (s < strend) { \
1269 CoDe \
1270 s++; \
1271 } \
1272} STMT_END
1273
1274#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1275REXEC_FBC_UTF8_SCAN( \
1276 if (CoNd) { \
24b23f37 1277 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1278 goto got_it; \
1279 else \
1280 tmp = doevery; \
1281 } \
1282 else \
1283 tmp = 1; \
1284)
1285
1286#define REXEC_FBC_CLASS_SCAN(CoNd) \
1287REXEC_FBC_SCAN( \
1288 if (CoNd) { \
24b23f37 1289 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1290 goto got_it; \
1291 else \
1292 tmp = doevery; \
1293 } \
1294 else \
1295 tmp = 1; \
1296)
1297
1298#define REXEC_FBC_TRYIT \
24b23f37 1299if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1300 goto got_it
1301
e1d1eefb 1302#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1303 if (utf8_target) { \
e1d1eefb
YO
1304 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1305 } \
1306 else { \
1307 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1308 }
e1d1eefb 1309
4cadc6a9 1310#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1311 if (utf8_target) { \
4cadc6a9
YO
1312 UtFpReLoAd; \
1313 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1314 } \
1315 else { \
1316 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1317 }
4cadc6a9
YO
1318
1319#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1320 PL_reg_flags |= RF_tainted; \
f2ed9b32 1321 if (utf8_target) { \
4cadc6a9
YO
1322 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1323 } \
1324 else { \
1325 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1326 }
4cadc6a9 1327
786e8c11
YO
1328#define DUMP_EXEC_POS(li,s,doutf8) \
1329 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1330
cfaf538b
KW
1331
1332#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1333 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1334 tmp = TEST_NON_UTF8(tmp); \
1335 REXEC_FBC_UTF8_SCAN( \
1336 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1337 tmp = !tmp; \
1338 IF_SUCCESS; \
1339 } \
1340 else { \
1341 IF_FAIL; \
1342 } \
1343 ); \
1344
1345#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1346 if (s == PL_bostr) { \
1347 tmp = '\n'; \
1348 } \
1349 else { \
1350 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1351 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1352 } \
1353 tmp = TeSt1_UtF8; \
1354 LOAD_UTF8_CHARCLASS_ALNUM(); \
1355 REXEC_FBC_UTF8_SCAN( \
1356 if (tmp == ! (TeSt2_UtF8)) { \
1357 tmp = !tmp; \
1358 IF_SUCCESS; \
1359 } \
1360 else { \
1361 IF_FAIL; \
1362 } \
1363 ); \
1364
63ac0dad
KW
1365/* The only difference between the BOUND and NBOUND cases is that
1366 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1367 * NBOUND. This is accomplished by passing it in either the if or else clause,
1368 * with the other one being empty */
1369#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1370 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1371
1372#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1373 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1374
1375#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1376 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1377
1378#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1379 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1380
63ac0dad
KW
1381
1382/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1383 * be passed in completely with the variable name being tested, which isn't
1384 * such a clean interface, but this is easier to read than it was before. We
1385 * are looking for the boundary (or non-boundary between a word and non-word
1386 * character. The utf8 and non-utf8 cases have the same logic, but the details
1387 * must be different. Find the "wordness" of the character just prior to this
1388 * one, and compare it with the wordness of this one. If they differ, we have
1389 * a boundary. At the beginning of the string, pretend that the previous
1390 * character was a new-line */
cfaf538b 1391#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1392 if (utf8_target) { \
cfaf538b 1393 UTF8_CODE \
63ac0dad
KW
1394 } \
1395 else { /* Not utf8 */ \
1396 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1397 tmp = TEST_NON_UTF8(tmp); \
1398 REXEC_FBC_SCAN( \
1399 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1400 tmp = !tmp; \
1401 IF_SUCCESS; \
1402 } \
1403 else { \
1404 IF_FAIL; \
1405 } \
1406 ); \
1407 } \
1408 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1409 goto got_it;
1410
786e8c11
YO
1411/* We know what class REx starts with. Try to find this position... */
1412/* if reginfo is NULL, its a dryrun */
1413/* annoyingly all the vars in this routine have different names from their counterparts
1414 in regmatch. /grrr */
1415
3c3eec57 1416STATIC char *
07be1b83 1417S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1418 const char *strend, regmatch_info *reginfo)
a687059c 1419{
27da23d5 1420 dVAR;
bbe252da 1421 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1422 char *pat_string; /* The pattern's exactish string */
1423 char *pat_end; /* ptr to end char of pat_string */
1424 re_fold_t folder; /* Function for computing non-utf8 folds */
1425 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1426 STRLEN ln;
5dab1207 1427 STRLEN lnc;
078c425b 1428 register STRLEN uskip;
fac1af77
KW
1429 U8 c1;
1430 U8 c2;
6eb5f6b9
JH
1431 char *e;
1432 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1433 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1434 UV utf8_fold_flags = 0;
f8fc2ecf 1435 RXi_GET_DECL(prog,progi);
7918f24d
NC
1436
1437 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1438
6eb5f6b9
JH
1439 /* We know what class it must start with. */
1440 switch (OP(c)) {
f56b6394 1441 case ANYOFV:
6eb5f6b9 1442 case ANYOF:
f56b6394 1443 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1444 STRLEN inclasslen = strend - s;
1445 REXEC_FBC_UTF8_CLASS_SCAN(
1446 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1447 }
1448 else {
6ef69d56 1449 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1450 }
6eb5f6b9 1451 break;
f33976b4 1452 case CANY:
4cadc6a9 1453 REXEC_FBC_SCAN(
24b23f37 1454 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1455 goto got_it;
1456 else
1457 tmp = doevery;
4cadc6a9 1458 );
f33976b4 1459 break;
2f7f8cb1
KW
1460
1461 case EXACTFA:
1462 if (UTF_PATTERN || utf8_target) {
1463 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1464 goto do_exactf_utf8;
1465 }
1466 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1467 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1468 goto do_exactf_non_utf8; /* isn't dealt with by these */
1469
6eb5f6b9 1470 case EXACTF:
62bf7766 1471 if (utf8_target) {
77a6d856
KW
1472
1473 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1474 utf8_fold_flags = 0;
fac1af77
KW
1475 goto do_exactf_utf8;
1476 }
1477 fold_array = PL_fold;
1478 folder = foldEQ;
1479 goto do_exactf_non_utf8;
1480
1481 case EXACTFL:
1482 if (UTF_PATTERN || utf8_target) {
17580e7a 1483 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1484 goto do_exactf_utf8;
1485 }
1486 fold_array = PL_fold_locale;
1487 folder = foldEQ_locale;
16d951b7
KW
1488 goto do_exactf_non_utf8;
1489
3c760661
KW
1490 case EXACTFU_SS:
1491 if (UTF_PATTERN) {
1492 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1493 }
1494 goto do_exactf_utf8;
1495
fab2782b 1496 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1497 case EXACTFU:
1498 if (UTF_PATTERN || utf8_target) {
77a6d856 1499 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1500 goto do_exactf_utf8;
1501 }
1502
1503 /* Any 'ss' in the pattern should have been replaced by regcomp,
1504 * so we don't have to worry here about this single special case
1505 * in the Latin1 range */
1506 fold_array = PL_fold_latin1;
1507 folder = foldEQ_latin1;
fac1af77
KW
1508
1509 /* FALL THROUGH */
1510
62bf7766
KW
1511 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1512 are no glitches with fold-length differences
1513 between the target string and pattern */
fac1af77
KW
1514
1515 /* The idea in the non-utf8 EXACTF* cases is to first find the
1516 * first character of the EXACTF* node and then, if necessary,
1517 * case-insensitively compare the full text of the node. c1 is the
1518 * first character. c2 is its fold. This logic will not work for
1519 * Unicode semantics and the german sharp ss, which hence should
1520 * not be compiled into a node that gets here. */
1521 pat_string = STRING(c);
1522 ln = STR_LEN(c); /* length to match in octets/bytes */
1523
8a90a8fe
KW
1524 /* We know that we have to match at least 'ln' bytes (which is the
1525 * same as characters, since not utf8). If we have to match 3
1526 * characters, and there are only 2 availabe, we know without
1527 * trying that it will fail; so don't start a match past the
1528 * required minimum number from the far end */
fac1af77
KW
1529 e = HOP3c(strend, -((I32)ln), s);
1530
1531 if (!reginfo && e < s) {
1532 e = s; /* Due to minlen logic of intuit() */
1533 }
1534
1535 c1 = *pat_string;
1536 c2 = fold_array[c1];
1537 if (c1 == c2) { /* If char and fold are the same */
1538 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1539 }
1540 else {
1541 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1542 }
1543 break;
1544
1545 do_exactf_utf8:
e067297c
KW
1546 {
1547 unsigned expansion;
1548
fac1af77
KW
1549
1550 /* If one of the operands is in utf8, we can't use the simpler
1551 * folding above, due to the fact that many different characters
1552 * can have the same fold, or portion of a fold, or different-
1553 * length fold */
1554 pat_string = STRING(c);
1555 ln = STR_LEN(c); /* length to match in octets/bytes */
1556 pat_end = pat_string + ln;
1557 lnc = (UTF_PATTERN) /* length to match in characters */
1558 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1559 : ln;
1560
e067297c
KW
1561 /* We have 'lnc' characters to match in the pattern, but because of
1562 * multi-character folding, each character in the target can match
1563 * up to 3 characters (Unicode guarantees it will never exceed
1564 * this) if it is utf8-encoded; and up to 2 if not (based on the
1565 * fact that the Latin 1 folds are already determined, and the
1566 * only multi-char fold in that range is the sharp-s folding to
1567 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1568 * string character. Adjust lnc accordingly, rounding up, so that
1569 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1570 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1571 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1572
1573 /* As in the non-UTF8 case, if we have to match 3 characters, and
1574 * only 2 are left, it's guaranteed to fail, so don't start a
1575 * match that would require us to go beyond the end of the string
1576 */
1577 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1578
1579 if (!reginfo && e < s) {
1580 e = s; /* Due to minlen logic of intuit() */
1581 }
1582
b33105db
KW
1583 /* XXX Note that we could recalculate e to stop the loop earlier,
1584 * as the worst case expansion above will rarely be met, and as we
1585 * go along we would usually find that e moves further to the left.
1586 * This would happen only after we reached the point in the loop
1587 * where if there were no expansion we should fail. Unclear if
1588 * worth the expense */
e067297c 1589
fac1af77
KW
1590 while (s <= e) {
1591 char *my_strend= (char *)strend;
1592 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1593 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1594 && (!reginfo || regtry(reginfo, &s)) )
1595 {
1596 goto got_it;
1597 }
bbdd8bad 1598 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1599 }
1600 break;
e067297c 1601 }
bbce6d69 1602 case BOUNDL:
3280af22 1603 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1604 FBC_BOUND(isALNUM_LC,
1605 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1606 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1607 break;
bbce6d69 1608 case NBOUNDL:
3280af22 1609 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1610 FBC_NBOUND(isALNUM_LC,
1611 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1612 isALNUM_LC_utf8((U8*)s));
1613 break;
1614 case BOUND:
1615 FBC_BOUND(isWORDCHAR,
1616 isALNUM_uni(tmp),
1617 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1618 break;
cfaf538b
KW
1619 case BOUNDA:
1620 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1621 isWORDCHAR_A(tmp),
1622 isWORDCHAR_A((U8*)s));
1623 break;
a0d0e21e 1624 case NBOUND:
63ac0dad
KW
1625 FBC_NBOUND(isWORDCHAR,
1626 isALNUM_uni(tmp),
1627 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1628 break;
cfaf538b
KW
1629 case NBOUNDA:
1630 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1631 isWORDCHAR_A(tmp),
1632 isWORDCHAR_A((U8*)s));
1633 break;
63ac0dad
KW
1634 case BOUNDU:
1635 FBC_BOUND(isWORDCHAR_L1,
1636 isALNUM_uni(tmp),
1637 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1638 break;
1639 case NBOUNDU:
1640 FBC_NBOUND(isWORDCHAR_L1,
1641 isALNUM_uni(tmp),
1642 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1643 break;
bbce6d69 1644 case ALNUML:
4cadc6a9
YO
1645 REXEC_FBC_CSCAN_TAINT(
1646 isALNUM_LC_utf8((U8*)s),
1647 isALNUM_LC(*s)
1648 );
6895a8aa 1649 break;
980866de
KW
1650 case ALNUMU:
1651 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1652 LOAD_UTF8_CHARCLASS_ALNUM(),
1653 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1654 isWORDCHAR_L1((U8) *s)
1655 );
6895a8aa 1656 break;
980866de
KW
1657 case ALNUM:
1658 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1659 LOAD_UTF8_CHARCLASS_ALNUM(),
1660 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1661 isWORDCHAR((U8) *s)
1662 );
6895a8aa 1663 break;
cfaf538b 1664 case ALNUMA:
8e9da4d4
KW
1665 /* Don't need to worry about utf8, as it can match only a single
1666 * byte invariant character */
cfaf538b 1667 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1668 break;
980866de
KW
1669 case NALNUMU:
1670 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1671 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1672 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1673 ! isWORDCHAR_L1((U8) *s)
1674 );
6895a8aa 1675 break;
a0d0e21e 1676 case NALNUM:
4cadc6a9 1677 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1680 ! isALNUM(*s)
4cadc6a9 1681 );
6895a8aa 1682 break;
cfaf538b 1683 case NALNUMA:
8e9da4d4
KW
1684 REXEC_FBC_CSCAN(
1685 !isWORDCHAR_A(*s),
1686 !isWORDCHAR_A(*s)
1687 );
1688 break;
bbce6d69 1689 case NALNUML:
4cadc6a9
YO
1690 REXEC_FBC_CSCAN_TAINT(
1691 !isALNUM_LC_utf8((U8*)s),
1692 !isALNUM_LC(*s)
1693 );
6895a8aa 1694 break;
980866de
KW
1695 case SPACEU:
1696 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1697 LOAD_UTF8_CHARCLASS_SPACE(),
1698 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1699 isSPACE_L1((U8) *s)
1700 );
6895a8aa 1701 break;
a0d0e21e 1702 case SPACE:
4cadc6a9 1703 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1704 LOAD_UTF8_CHARCLASS_SPACE(),
1705 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1706 isSPACE((U8) *s)
4cadc6a9 1707 );
6895a8aa 1708 break;
cfaf538b 1709 case SPACEA:
8e9da4d4
KW
1710 /* Don't need to worry about utf8, as it can match only a single
1711 * byte invariant character */
cfaf538b 1712 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1713 break;
bbce6d69 1714 case SPACEL:
4cadc6a9 1715 REXEC_FBC_CSCAN_TAINT(
6bbba904 1716 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1717 isSPACE_LC(*s)
1718 );
6895a8aa 1719 break;
980866de
KW
1720 case NSPACEU:
1721 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1722 LOAD_UTF8_CHARCLASS_SPACE(),
1723 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1724 ! isSPACE_L1((U8) *s)
1725 );
6895a8aa 1726 break;
a0d0e21e 1727 case NSPACE:
4cadc6a9 1728 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1729 LOAD_UTF8_CHARCLASS_SPACE(),
1730 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1731 ! isSPACE((U8) *s)
4cadc6a9 1732 );
6895a8aa 1733 break;
cfaf538b 1734 case NSPACEA:
8e9da4d4
KW
1735 REXEC_FBC_CSCAN(
1736 !isSPACE_A(*s),
1737 !isSPACE_A(*s)
1738 );
1739 break;
bbce6d69 1740 case NSPACEL:
4cadc6a9 1741 REXEC_FBC_CSCAN_TAINT(
6bbba904 1742 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1743 !isSPACE_LC(*s)
1744 );
6895a8aa 1745 break;
a0d0e21e 1746 case DIGIT:
4cadc6a9 1747 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1748 LOAD_UTF8_CHARCLASS_DIGIT(),
1749 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1750 isDIGIT(*s)
1751 );
6895a8aa 1752 break;
cfaf538b 1753 case DIGITA:
8e9da4d4
KW
1754 /* Don't need to worry about utf8, as it can match only a single
1755 * byte invariant character */
cfaf538b 1756 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1757 break;
b8c5462f 1758 case DIGITL:
4cadc6a9
YO
1759 REXEC_FBC_CSCAN_TAINT(
1760 isDIGIT_LC_utf8((U8*)s),
1761 isDIGIT_LC(*s)
1762 );
6895a8aa 1763 break;
a0d0e21e 1764 case NDIGIT:
4cadc6a9 1765 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1766 LOAD_UTF8_CHARCLASS_DIGIT(),
1767 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1768 !isDIGIT(*s)
1769 );
6895a8aa 1770 break;
cfaf538b 1771 case NDIGITA:
8e9da4d4
KW
1772 REXEC_FBC_CSCAN(
1773 !isDIGIT_A(*s),
1774 !isDIGIT_A(*s)
1775 );
1776 break;
b8c5462f 1777 case NDIGITL:
4cadc6a9
YO
1778 REXEC_FBC_CSCAN_TAINT(
1779 !isDIGIT_LC_utf8((U8*)s),
1780 !isDIGIT_LC(*s)
1781 );
6895a8aa 1782 break;
e1d1eefb
YO
1783 case LNBREAK:
1784 REXEC_FBC_CSCAN(
1785 is_LNBREAK_utf8(s),
1786 is_LNBREAK_latin1(s)
1787 );
6895a8aa 1788 break;
e1d1eefb
YO
1789 case VERTWS:
1790 REXEC_FBC_CSCAN(
1791 is_VERTWS_utf8(s),
1792 is_VERTWS_latin1(s)
1793 );
6895a8aa 1794 break;
e1d1eefb
YO
1795 case NVERTWS:
1796 REXEC_FBC_CSCAN(
1797 !is_VERTWS_utf8(s),
1798 !is_VERTWS_latin1(s)
1799 );
6895a8aa 1800 break;
e1d1eefb
YO
1801 case HORIZWS:
1802 REXEC_FBC_CSCAN(
1803 is_HORIZWS_utf8(s),
1804 is_HORIZWS_latin1(s)
1805 );
6895a8aa 1806 break;
e1d1eefb
YO
1807 case NHORIZWS:
1808 REXEC_FBC_CSCAN(
1809 !is_HORIZWS_utf8(s),
1810 !is_HORIZWS_latin1(s)
1811 );
6895a8aa 1812 break;
1de06328
YO
1813 case AHOCORASICKC:
1814 case AHOCORASICK:
07be1b83 1815 {
a0a388a1 1816 DECL_TRIE_TYPE(c);
07be1b83
YO
1817 /* what trie are we using right now */
1818 reg_ac_data *aho
f8fc2ecf 1819 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1820 reg_trie_data *trie
1821 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1822 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1823
1824 const char *last_start = strend - trie->minlen;
6148ee25 1825#ifdef DEBUGGING
07be1b83 1826 const char *real_start = s;
6148ee25 1827#endif
07be1b83 1828 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1829 SV *sv_points;
1830 U8 **points; /* map of where we were in the input string
786e8c11 1831 when reading a given char. For ASCII this
be8e71aa 1832 is unnecessary overhead as the relationship
38a44b82
NC
1833 is always 1:1, but for Unicode, especially
1834 case folded Unicode this is not true. */
f9e705e8 1835 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1836 U8 *bitmap=NULL;
1837
07be1b83
YO
1838
1839 GET_RE_DEBUG_FLAGS_DECL;
1840
be8e71aa
YO
1841 /* We can't just allocate points here. We need to wrap it in
1842 * an SV so it gets freed properly if there is a croak while
1843 * running the match */
1844 ENTER;
1845 SAVETMPS;
1846 sv_points=newSV(maxlen * sizeof(U8 *));
1847 SvCUR_set(sv_points,
1848 maxlen * sizeof(U8 *));
1849 SvPOK_on(sv_points);
1850 sv_2mortal(sv_points);
1851 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1852 if ( trie_type != trie_utf8_fold
1853 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1854 {
786e8c11
YO
1855 if (trie->bitmap)
1856 bitmap=(U8*)trie->bitmap;
1857 else
1858 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1859 }
786e8c11
YO
1860 /* this is the Aho-Corasick algorithm modified a touch
1861 to include special handling for long "unknown char"
1862 sequences. The basic idea being that we use AC as long
1863 as we are dealing with a possible matching char, when
1864 we encounter an unknown char (and we have not encountered
1865 an accepting state) we scan forward until we find a legal
1866 starting char.
1867 AC matching is basically that of trie matching, except
1868 that when we encounter a failing transition, we fall back
1869 to the current states "fail state", and try the current char
1870 again, a process we repeat until we reach the root state,
1871 state 1, or a legal transition. If we fail on the root state
1872 then we can either terminate if we have reached an accepting
1873 state previously, or restart the entire process from the beginning
1874 if we have not.
1875
1876 */
07be1b83
YO
1877 while (s <= last_start) {
1878 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1879 U8 *uc = (U8*)s;
1880 U16 charid = 0;
1881 U32 base = 1;
1882 U32 state = 1;
1883 UV uvc = 0;
1884 STRLEN len = 0;
1885 STRLEN foldlen = 0;
1886 U8 *uscan = (U8*)NULL;
1887 U8 *leftmost = NULL;
786e8c11
YO
1888#ifdef DEBUGGING
1889 U32 accepted_word= 0;
1890#endif
07be1b83
YO
1891 U32 pointpos = 0;
1892
1893 while ( state && uc <= (U8*)strend ) {
1894 int failed=0;
786e8c11
YO
1895 U32 word = aho->states[ state ].wordnum;
1896
1de06328
YO
1897 if( state==1 ) {
1898 if ( bitmap ) {
1899 DEBUG_TRIE_EXECUTE_r(
1900 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1901 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1902 (char *)uc, utf8_target );
1de06328
YO
1903 PerlIO_printf( Perl_debug_log,
1904 " Scanning for legal start char...\n");
1905 }
d085b490
YO
1906 );
1907 if (utf8_target) {
1908 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1909 uc += UTF8SKIP(uc);
1910 }
1911 } else {
1912 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1913 uc++;
1914 }
1915 }
1de06328 1916 s= (char *)uc;
786e8c11 1917 }
786e8c11
YO
1918 if (uc >(U8*)last_start) break;
1919 }
1920
1921 if ( word ) {
2e64971a 1922 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1923 if (!leftmost || lpos < leftmost) {
1924 DEBUG_r(accepted_word=word);
07be1b83 1925 leftmost= lpos;
786e8c11 1926 }
07be1b83 1927 if (base==0) break;
786e8c11 1928
07be1b83
YO
1929 }
1930 points[pointpos++ % maxlen]= uc;
55eed653
NC
1931 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1932 uscan, len, uvc, charid, foldlen,
1933 foldbuf, uniflags);
786e8c11
YO
1934 DEBUG_TRIE_EXECUTE_r({
1935 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1936 s, utf8_target );
07be1b83 1937 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1938 " Charid:%3u CP:%4"UVxf" ",
1939 charid, uvc);
1940 });
07be1b83
YO
1941
1942 do {
6148ee25 1943#ifdef DEBUGGING
786e8c11 1944 word = aho->states[ state ].wordnum;
6148ee25 1945#endif
07be1b83
YO
1946 base = aho->states[ state ].trans.base;
1947
786e8c11
YO
1948 DEBUG_TRIE_EXECUTE_r({
1949 if (failed)
1950 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1951 s, utf8_target );
07be1b83 1952 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1953 "%sState: %4"UVxf", word=%"UVxf,
1954 failed ? " Fail transition to " : "",
1955 (UV)state, (UV)word);
1956 });
07be1b83
YO
1957 if ( base ) {
1958 U32 tmp;
6dd2be57 1959 I32 offset;
07be1b83 1960 if (charid &&
6dd2be57
DM
1961 ( ((offset = base + charid
1962 - 1 - trie->uniquecharcount)) >= 0)
1963 && ((U32)offset < trie->lasttrans)
1964 && trie->trans[offset].check == state
1965 && (tmp=trie->trans[offset].next))
07be1b83 1966 {
786e8c11
YO
1967 DEBUG_TRIE_EXECUTE_r(
1968 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1969 state = tmp;
1970 break;
1971 }
1972 else {
786e8c11
YO
1973 DEBUG_TRIE_EXECUTE_r(
1974 PerlIO_printf( Perl_debug_log," - fail\n"));
1975 failed = 1;
1976 state = aho->fail[state];
07be1b83
YO
1977 }
1978 }
1979 else {
1980 /* we must be accepting here */
786e8c11
YO
1981 DEBUG_TRIE_EXECUTE_r(
1982 PerlIO_printf( Perl_debug_log," - accepting\n"));
1983 failed = 1;
07be1b83
YO
1984 break;
1985 }
1986 } while(state);
786e8c11 1987 uc += len;
07be1b83
YO
1988 if (failed) {
1989 if (leftmost)
1990 break;
786e8c11 1991 if (!state) state = 1;
07be1b83
YO
1992 }
1993 }
1994 if ( aho->states[ state ].wordnum ) {
2e64971a 1995 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1996 if (!leftmost || lpos < leftmost) {
1997 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1998 leftmost = lpos;
786e8c11 1999 }
07be1b83 2000 }
07be1b83
YO
2001 if (leftmost) {
2002 s = (char*)leftmost;
786e8c11
YO
2003 DEBUG_TRIE_EXECUTE_r({
2004 PerlIO_printf(
70685ca0
JH
2005 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2006 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2007 );
2008 });
24b23f37 2009 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2010 FREETMPS;
2011 LEAVE;
07be1b83 2012 goto got_it;
be8e71aa 2013 }
07be1b83 2014 s = HOPc(s,1);
786e8c11
YO
2015 DEBUG_TRIE_EXECUTE_r({
2016 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2017 });
07be1b83 2018 } else {
786e8c11
YO
2019 DEBUG_TRIE_EXECUTE_r(
2020 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2021 break;
2022 }
2023 }
be8e71aa
YO
2024 FREETMPS;
2025 LEAVE;
07be1b83
YO
2026 }
2027 break;
b3c9acc1 2028 default:
3c3eec57
GS
2029 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2030 break;
d6a28714 2031 }
6eb5f6b9
JH
2032 return 0;
2033 got_it:
2034 return s;
2035}
2036
fae667d5 2037
6eb5f6b9
JH
2038/*
2039 - regexec_flags - match a regexp against a string
2040 */
2041I32
288b8c02 2042Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2043 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2044/* strend: pointer to null at end of string */
2045/* strbeg: real beginning of string */
2046/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2047/* data: May be used for some additional optimizations.
2048 Currently its only used, with a U32 cast, for transmitting
2049 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2050/* nosave: For optimizations. */
2051{
97aff369 2052 dVAR;
288b8c02 2053 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2054 /*register*/ char *s;
6eb5f6b9 2055 register regnode *c;
24b23f37 2056 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2057 I32 minlen; /* must match at least this many chars */
2058 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2059 I32 end_shift = 0; /* Same for the end. */ /* CC */
2060 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2061 char *scream_olds = NULL;
f2ed9b32 2062 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2063 I32 multiline;
f8fc2ecf 2064 RXi_GET_DECL(prog,progi);
3b0527fe 2065 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2066 regexp_paren_pair *swap = NULL;
a3621e74
YO
2067 GET_RE_DEBUG_FLAGS_DECL;
2068
7918f24d 2069 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2070 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2071
2072 /* Be paranoid... */
2073 if (prog == NULL || startpos == NULL) {
2074 Perl_croak(aTHX_ "NULL regexp parameter");
2075 return 0;
2076 }
2077
bbe252da 2078 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2079 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2080
f2ed9b32 2081 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2082 DEBUG_EXECUTE_r(
f2ed9b32 2083 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2084 "Matching");
2085 );
bac06658 2086
6eb5f6b9 2087 minlen = prog->minlen;
1de06328
YO
2088
2089 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2091 "String too short [regexec_flags]...\n"));
2092 goto phooey;
1aa99e6b 2093 }
6eb5f6b9 2094
1de06328 2095
6eb5f6b9 2096 /* Check validity of program. */
f8fc2ecf 2097 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2098 Perl_croak(aTHX_ "corrupted regexp program");
2099 }
2100
2101 PL_reg_flags = 0;
2102 PL_reg_eval_set = 0;
2103 PL_reg_maxiter = 0;
2104
3c8556c3 2105 if (RX_UTF8(rx))
6eb5f6b9
JH
2106 PL_reg_flags |= RF_utf8;
2107
2108 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2109 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2110 PL_bostr = strbeg;
3b0527fe 2111 reginfo.sv = sv;
6eb5f6b9
JH
2112
2113 /* Mark end of line for $ (and such) */
2114 PL_regeol = strend;
2115
2116 /* see how far we have to get to not match where we matched before */
3b0527fe 2117 reginfo.till = startpos+minend;
6eb5f6b9 2118
6eb5f6b9
JH
2119 /* If there is a "must appear" string, look for it. */
2120 s = startpos;
2121
bbe252da 2122 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2123 MAGIC *mg;
2c296965 2124 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2125 reginfo.ganch = startpos + prog->gofs;
2c296965 2126 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2127 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2128 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2129 && SvMAGIC(sv)
14befaf4
DM
2130 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2131 && mg->mg_len >= 0) {
3b0527fe 2132 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2133 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2134 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2135
bbe252da 2136 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2137 if (s > reginfo.ganch)
6eb5f6b9 2138 goto phooey;
58e23c8d 2139 s = reginfo.ganch - prog->gofs;
2c296965 2140 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2141 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2142 if (s < strbeg)
2143 goto phooey;
6eb5f6b9
JH
2144 }
2145 }
58e23c8d 2146 else if (data) {
70685ca0 2147 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2149 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2150
2151 } else { /* pos() not defined */
3b0527fe 2152 reginfo.ganch = strbeg;
2c296965
YO
2153 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154 "GPOS: reginfo.ganch = strbeg\n"));
2155 }
6eb5f6b9 2156 }
288b8c02 2157 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2158 /* We have to be careful. If the previous successful match
2159 was from this regex we don't want a subsequent partially
2160 successful match to clobber the old results.
2161 So when we detect this possibility we add a swap buffer
2162 to the re, and switch the buffer each match. If we fail
2163 we switch it back, otherwise we leave it swapped.
2164 */
2165 swap = prog->offs;
2166 /* do we need a save destructor here for eval dies? */
2167 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2168 }
a0714e2c 2169 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2170 re_scream_pos_data d;
2171
2172 d.scream_olds = &scream_olds;
2173 d.scream_pos = &scream_pos;
288b8c02 2174 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2175 if (!s) {
a3621e74 2176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2177 goto phooey; /* not present */
3fa9c3d7 2178 }
6eb5f6b9
JH
2179 }
2180
1de06328 2181
6eb5f6b9
JH
2182
2183 /* Simplest case: anchored match need be tried only once. */
2184 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2185 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2186 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2187 goto got_it;
bbe252da
YO
2188 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2189 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2190 {
2191 char *end;
2192
2193 if (minlen)
2194 dontbother = minlen - 1;
1aa99e6b 2195 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2196 /* for multiline we only have to try after newlines */
33b8afdf 2197 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2198 /* because of the goto we can not easily reuse the macros for bifurcating the
2199 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2200 if (utf8_target) {
2201 if (s == startpos)
2202 goto after_try_utf8;
2203 while (1) {
2204 if (regtry(&reginfo, &s)) {
2205 goto got_it;
2206 }
2207 after_try_utf8:
2208 if (s > end) {
2209 goto phooey;
2210 }
2211 if (prog->extflags & RXf_USE_INTUIT) {
2212 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2213 if (!s) {
2214 goto phooey;
2215 }
2216 }
2217 else {
2218 s += UTF8SKIP(s);
2219 }
2220 }
2221 } /* end search for check string in unicode */
2222 else {
2223 if (s == startpos) {
2224 goto after_try_latin;
2225 }
2226 while (1) {
2227 if (regtry(&reginfo, &s)) {
2228 goto got_it;
2229 }
2230 after_try_latin:
2231 if (s > end) {
2232 goto phooey;
2233 }
2234 if (prog->extflags & RXf_USE_INTUIT) {
2235 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2236 if (!s) {
2237 goto phooey;
2238 }
2239 }
2240 else {
2241 s++;
2242 }
2243 }
2244 } /* end search for check string in latin*/
2245 } /* end search for check string */
2246 else { /* search for newline */
2247 if (s > startpos) {
2248 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2249 s--;
92f3d482 2250 }
21eede78
YO
2251 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2252 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2253 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2254 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2255 goto got_it;
2256 }
92f3d482
YO
2257 }
2258 } /* end search for newline */
2259 } /* end anchored/multiline check string search */
6eb5f6b9 2260 goto phooey;
bbe252da 2261 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2262 {
486ec47a 2263 /* the warning about reginfo.ganch being used without initialization
bbe252da 2264 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2265 and we only enter this block when the same bit is set. */
58e23c8d 2266 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2267
2268 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2269 goto got_it;
2270 goto phooey;
2271 }
2272
2273 /* Messy cases: unanchored match. */
bbe252da 2274 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2275 /* we have /x+whatever/ */
f2ed9b32 2276 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2277 char ch;
bf93d4cc
GS
2278#ifdef DEBUGGING
2279 int did_match = 0;
2280#endif
f2ed9b32
KW
2281 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2282 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2283 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2284
f2ed9b32 2285 if (utf8_target) {
4cadc6a9 2286 REXEC_FBC_SCAN(
6eb5f6b9 2287 if (*s == ch) {
a3621e74 2288 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2289 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2290 s += UTF8SKIP(s);
2291 while (s < strend && *s == ch)
2292 s += UTF8SKIP(s);
2293 }
4cadc6a9 2294 );
6eb5f6b9
JH
2295 }
2296 else {
4cadc6a9 2297 REXEC_FBC_SCAN(
6eb5f6b9 2298 if (*s == ch) {
a3621e74 2299 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2300 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2301 s++;
2302 while (s < strend && *s == ch)
2303 s++;
2304 }
4cadc6a9 2305 );
6eb5f6b9 2306 }
a3621e74 2307 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2308 PerlIO_printf(Perl_debug_log,
b7953727
JH
2309 "Did not find anchored character...\n")
2310 );
6eb5f6b9 2311 }
a0714e2c
SS
2312 else if (prog->anchored_substr != NULL
2313 || prog->anchored_utf8 != NULL
2314 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2315 && prog->float_max_offset < strend - s)) {
2316 SV *must;
2317 I32 back_max;
2318 I32 back_min;
2319 char *last;
6eb5f6b9 2320 char *last1; /* Last position checked before */
bf93d4cc
GS
2321#ifdef DEBUGGING
2322 int did_match = 0;
2323#endif
33b8afdf 2324 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2325 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2326 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2327 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2328 back_max = back_min = prog->anchored_offset;
2329 } else {
f2ed9b32
KW
2330 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2331 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2332 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2333 back_max = prog->float_max_offset;
2334 back_min = prog->float_min_offset;
2335 }
1de06328
YO
2336
2337
33b8afdf
JH
2338 if (must == &PL_sv_undef)
2339 /* could not downgrade utf8 check substring, so must fail */
2340 goto phooey;
2341
1de06328
YO
2342 if (back_min<0) {
2343 last = strend;
2344 } else {
2345 last = HOP3c(strend, /* Cannot start after this */
2346 -(I32)(CHR_SVLEN(must)
2347 - (SvTAIL(must) != 0) + back_min), strbeg);
2348 }
6eb5f6b9
JH
2349 if (s > PL_bostr)
2350 last1 = HOPc(s, -1);
2351 else
2352 last1 = s - 1; /* bogus */
2353
a0288114 2354 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2355 check_substr==must. */
2356 scream_pos = -1;
2357 dontbother = end_shift;
2358 strend = HOPc(strend, -dontbother);
2359 while ( (s <= last) &&
f1905e1b 2360 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
1de06328 2361 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2362 end_shift, &scream_pos, 0))
1de06328 2363 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2364 (unsigned char*)strend, must,
7fba1cd6 2365 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2366 /* we may be pointing at the wrong string */
07bc277f 2367 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2368 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2369 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2370 if (HOPc(s, -back_max) > last1) {
2371 last1 = HOPc(s, -back_min);
2372 s = HOPc(s, -back_max);
2373 }
2374 else {
52657f30 2375 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2376
2377 last1 = HOPc(s, -back_min);
52657f30 2378 s = t;
6eb5f6b9 2379 }
f2ed9b32 2380 if (utf8_target) {
6eb5f6b9 2381 while (s <= last1) {
24b23f37 2382 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2383 goto got_it;
2384 s += UTF8SKIP(s);
2385 }
2386 }
2387 else {
2388 while (s <= last1) {
24b23f37 2389 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2390 goto got_it;
2391 s++;
2392 }
2393 }
2394 }
ab3bbdeb 2395 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2396 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2397 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2398 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2399 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2400 ? "anchored" : "floating"),
ab3bbdeb
YO
2401 quoted, RE_SV_TAIL(must));
2402 });
6eb5f6b9
JH
2403 goto phooey;
2404 }
f8fc2ecf 2405 else if ( (c = progi->regstclass) ) {
f14c76ed 2406 if (minlen) {
f8fc2ecf 2407 const OPCODE op = OP(progi->regstclass);
66e933ab 2408 /* don't bother with what can't match */
786e8c11 2409 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2410 strend = HOPc(strend, -(minlen - 1));
2411 }
a3621e74 2412 DEBUG_EXECUTE_r({
be8e71aa 2413 SV * const prop = sv_newmortal();
32fc9b6a 2414 regprop(prog, prop, c);
0df25f3d 2415 {
f2ed9b32 2416 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2417 s,strend-s,60);
0df25f3d 2418 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2419 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2420 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2421 quoted, (int)(strend - s));
0df25f3d 2422 }
ffc61ed2 2423 });
3b0527fe 2424 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2425 goto got_it;
07be1b83 2426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2427 }
2428 else {
2429 dontbother = 0;
a0714e2c 2430 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2431 /* Trim the end. */
6af40bd7 2432 char *last= NULL;
33b8afdf
JH
2433 SV* float_real;
2434
f2ed9b32
KW
2435 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2436 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2437 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2438
f1905e1b 2439 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
33b8afdf 2440 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2441 end_shift, &scream_pos, 1); /* last one */
2442 if (!last)
ffc61ed2 2443 last = scream_olds; /* Only one occurrence. */
4addbd3b 2444 /* we may be pointing at the wrong string */
07bc277f 2445 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2446 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2447 }
d6a28714
JH
2448 else {
2449 STRLEN len;
cfd0369c 2450 const char * const little = SvPV_const(float_real, len);
33b8afdf 2451 if (SvTAIL(float_real)) {
1a13b075
YO
2452 /* This means that float_real contains an artificial \n on the end
2453 * due to the presence of something like this: /foo$/
2454 * where we can match both "foo" and "foo\n" at the end of the string.
2455 * So we have to compare the end of the string first against the float_real
2456 * without the \n and then against the full float_real with the string.
2457 * We have to watch out for cases where the string might be smaller
2458 * than the float_real or the float_real without the \n.
2459 */
2460 char *checkpos= strend - len;
2461 DEBUG_OPTIMISE_r(
2462 PerlIO_printf(Perl_debug_log,
2463 "%sChecking for float_real.%s\n",
2464 PL_colors[4], PL_colors[5]));
2465 if (checkpos + 1 < strbeg) {
2466 /* can't match, even if we remove the trailing \n string is too short to match */
2467 DEBUG_EXECUTE_r(
2468 PerlIO_printf(Perl_debug_log,
2469 "%sString shorter than required trailing substring, cannot match.%s\n",
2470 PL_colors[4], PL_colors[5]));
2471 goto phooey;
2472 } else if (memEQ(checkpos + 1, little, len - 1)) {
2473 /* can match, the end of the string matches without the "\n" */
2474 last = checkpos + 1;
2475 } else if (checkpos < strbeg) {
2476 /* cant match, string is too short when the "\n" is included */
2477 DEBUG_EXECUTE_r(
2478 PerlIO_printf(Perl_debug_log,
2479 "%sString does not contain required trailing substring, cannot match.%s\n",
2480 PL_colors[4], PL_colors[5]));
2481 goto phooey;
2482 } else if (!multiline) {
2483 /* non multiline match, so compare with the "\n" at the end of the string */
2484 if (memEQ(checkpos, little, len)) {
2485 last= checkpos;
2486 } else {
2487 DEBUG_EXECUTE_r(
2488 PerlIO_printf(Perl_debug_log,
2489 "%sString does not contain required trailing substring, cannot match.%s\n",
2490 PL_colors[4], PL_colors[5]));
2491 goto phooey;
2492 }
2493 } else {
2494 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2495 goto find_last;
1a13b075 2496 }
d6a28714
JH
2497 } else {
2498 find_last:
9041c2e3 2499 if (len)
d6a28714 2500 last = rninstr(s, strend, little, little + len);
b8c5462f 2501 else
a0288114 2502 last = strend; /* matching "$" */
b8c5462f 2503 }
b8c5462f 2504 }
6af40bd7
YO
2505 if (!last) {
2506 /* at one point this block contained a comment which was probably
2507 * incorrect, which said that this was a "should not happen" case.
2508 * Even if it was true when it was written I am pretty sure it is
2509 * not anymore, so I have removed the comment and replaced it with
2510 * this one. Yves */
6bda09f9
YO
2511 DEBUG_EXECUTE_r(
2512 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2513 "String does not contain required substring, cannot match.\n"
2514 ));
2515 goto phooey;
bf93d4cc 2516 }
d6a28714
JH
2517 dontbother = strend - last + prog->float_min_offset;
2518 }
2519 if (minlen && (dontbother < minlen))
2520 dontbother = minlen - 1;
2521 strend -= dontbother; /* this one's always in bytes! */
2522 /* We don't know much -- general case. */
f2ed9b32 2523 if (utf8_target) {
d6a28714 2524 for (;;) {
24b23f37 2525 if (regtry(&reginfo, &s))
d6a28714
JH
2526 goto got_it;
2527 if (s >= strend)
2528 break;
b8c5462f 2529 s += UTF8SKIP(s);
d6a28714
JH
2530 };
2531 }
2532 else {
2533 do {
24b23f37 2534 if (regtry(&reginfo, &s))
d6a28714
JH
2535 goto got_it;
2536 } while (s++ < strend);
2537 }
2538 }
2539
2540 /* Failure. */
2541 goto phooey;
2542
2543got_it:
e9105d30 2544 Safefree(swap);
288b8c02 2545 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2546
19b95bf0 2547 if (PL_reg_eval_set)
4f639d21 2548 restore_pos(aTHX_ prog);
5daac39c
NC
2549 if (RXp_PAREN_NAMES(prog))
2550 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2551
2552 /* make sure $`, $&, $', and $digit will work later */
2553 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2554 RX_MATCH_COPY_FREE(rx);
d6a28714 2555 if (flags & REXEC_COPY_STR) {
be8e71aa 2556 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2557#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2558 if ((SvIsCOW(sv)
2559 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2560 if (DEBUG_C_TEST) {
2561 PerlIO_printf(Perl_debug_log,
2562 "Copy on write: regexp capture, type %d\n",
2563 (int) SvTYPE(sv));
2564 }
2565 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2566 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2567 assert (SvPOKp(prog->saved_copy));
2568 } else
2569#endif
2570 {
288b8c02 2571 RX_MATCH_COPIED_on(rx);
ed252734
NC
2572 s = savepvn(strbeg, i);
2573 prog->subbeg = s;
2574 }
d6a28714 2575 prog->sublen = i;
d6a28714
JH
2576 }
2577 else {
2578 prog->subbeg = strbeg;
2579 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2580 }
2581 }
9041c2e3 2582
d6a28714
JH
2583 return 1;
2584
2585phooey:
a3621e74 2586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2587 PL_colors[4], PL_colors[5]));
d6a28714 2588 if (PL_reg_eval_set)
4f639d21 2589 restore_pos(aTHX_ prog);
e9105d30 2590 if (swap) {
c74340f9 2591 /* we failed :-( roll it back */
e9105d30
GG
2592 Safefree(prog->offs);
2593 prog->offs = swap;
2594 }
2595
d6a28714
JH
2596 return 0;
2597}
2598
6bda09f9 2599
d6a28714
JH
2600/*
2601 - regtry - try match at specific point
2602 */
2603STATIC I32 /* 0 failure, 1 success */
24b23f37 2604S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2605{
97aff369 2606 dVAR;
d6a28714 2607 CHECKPOINT lastcp;
288b8c02
NC
2608 REGEXP *const rx = reginfo->prog;
2609 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2610 RXi_GET_DECL(prog,progi);
a3621e74 2611 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2612
2613 PERL_ARGS_ASSERT_REGTRY;
2614
24b23f37 2615 reginfo->cutpoint=NULL;
d6a28714 2616
bbe252da 2617 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2618 MAGIC *mg;
2619
2620 PL_reg_eval_set = RS_init;
a3621e74 2621 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2622 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2623 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2624 ));
ea8d6ae1 2625 SAVESTACK_CXPOS();
d6a28714
JH
2626 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2627 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2628 SAVETMPS;
2629 /* Apparently this is not needed, judging by wantarray. */
e8347627 2630 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2631 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2632
3b0527fe 2633 if (reginfo->sv) {
d6a28714 2634 /* Make $_ available to executed code. */
3b0527fe 2635 if (reginfo->sv != DEFSV) {
59f00321 2636 SAVE_DEFSV;
414bf5ae 2637 DEFSV_set(reginfo->sv);
b8c5462f 2638 }
d6a28714 2639
3b0527fe
DM
2640 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2641 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2642 /* prepare for quick setting of pos */
d300d9fa 2643#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2644 if (SvIsCOW(reginfo->sv))
2645 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2646#endif
3dab1dad 2647 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2648 &PL_vtbl_mglob, NULL, 0);
d6a28714 2649 mg->mg_len = -1;
b8c5462f 2650 }
d6a28714
JH
2651 PL_reg_magic = mg;
2652 PL_reg_oldpos = mg->mg_len;
4f639d21 2653 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2654 }
09687e5a 2655 if (!PL_reg_curpm) {
a02a5408 2656 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2657#ifdef USE_ITHREADS
2658 {
14a49a24 2659 SV* const repointer = &PL_sv_undef;
92313705
NC
2660 /* this regexp is also owned by the new PL_reg_curpm, which
2661 will try to free it. */
d2ece331 2662 av_push(PL_regex_padav, repointer);
09687e5a
AB
2663 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2664 PL_regex_pad = AvARRAY(PL_regex_padav);
2665 }
2666#endif
2667 }
86c29d75
NC
2668#ifdef USE_ITHREADS
2669 /* It seems that non-ithreads works both with and without this code.
2670 So for efficiency reasons it seems best not to have the code
2671 compiled when it is not needed. */
92313705
NC
2672 /* This is safe against NULLs: */
2673 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2674 /* PM_reg_curpm owns a reference to this regexp. */
cd733337 2675 (void)ReREFCNT_inc(rx);
86c29d75 2676#endif
288b8c02 2677 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2678 PL_reg_oldcurpm = PL_curpm;
2679 PL_curpm = PL_reg_curpm;
07bc277f 2680 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2681 /* Here is a serious problem: we cannot rewrite subbeg,
2682 since it may be needed if this match fails. Thus
2683 $` inside (?{}) could fail... */
2684 PL_reg_oldsaved = prog->subbeg;
2685 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2686#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2687 PL_nrs = prog->saved_copy;
2688#endif
07bc277f 2689 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2690 }
2691 else
bd61b366 2692 PL_reg_oldsaved = NULL;
d6a28714
JH
2693 prog->subbeg = PL_bostr;
2694 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2695 }
24b23f37 2696 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2697 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2698 PL_reginput = *startpos;
d6a28714 2699 PL_reglastparen = &prog->lastparen;
a01268b5 2700 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2701 prog->lastparen = 0;
03994de8 2702 prog->lastcloseparen = 0;
d6a28714 2703 PL_regsize = 0;
f0ab9afb 2704 PL_regoffs = prog->offs;
d6a28714
JH
2705 if (PL_reg_start_tmpl <= prog->nparens) {
2706 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2707 if(PL_reg_start_tmp)
2708 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2709 else
a02a5408 2710 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2711 }
2712
2713 /* XXXX What this code is doing here?!!! There should be no need
2714 to do this again and again, PL_reglastparen should take care of
3dd2943c 2715 this! --ilya*/
dafc8851
JH
2716
2717 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2718 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2719 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2720 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2721 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2722 * Meanwhile, this code *is* needed for the
daf18116
JH
2723 * above-mentioned test suite tests to succeed. The common theme
2724 * on those tests seems to be returning null fields from matches.
225593e1 2725 * --jhi updated by dapm */
dafc8851 2726#if 1
d6a28714 2727 if (prog->nparens) {
f0ab9afb 2728 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2729 register I32 i;
eb160463 2730 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2731 ++pp;
2732 pp->start = -1;
2733 pp->end = -1;
d6a28714
JH
2734 }
2735 }
dafc8851 2736#endif
02db2b7b 2737 REGCP_SET(lastcp);
f8fc2ecf 2738 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2739 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2740 return 1;
2741 }
24b23f37
YO
2742 if (reginfo->cutpoint)
2743 *startpos= reginfo->cutpoint;
02db2b7b 2744 REGCP_UNWIND(lastcp);
d6a28714
JH
2745 return 0;
2746}
2747
02db2b7b 2748
8ba1375e
MJD
2749#define sayYES goto yes
2750#define sayNO goto no
262b90c4 2751#define sayNO_SILENT goto no_silent
8ba1375e 2752
f9f4320a
YO
2753/* we dont use STMT_START/END here because it leads to
2754 "unreachable code" warnings, which are bogus, but distracting. */
2755#define CACHEsayNO \
c476f425
DM
2756 if (ST.cache_mask) \
2757 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2758 sayNO
3298f257 2759
a3621e74 2760/* this is used to determine how far from the left messages like
265c4333
YO
2761 'failed...' are printed. It should be set such that messages
2762 are inline with the regop output that created them.
a3621e74 2763*/
265c4333 2764#define REPORT_CODE_OFF 32
a3621e74
YO
2765
2766
40a82448
DM
2767#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2768#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2769
86545054
DM
2770#define SLAB_FIRST(s) (&(s)->states[0])
2771#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2772
5d9a96ca
DM
2773/* grab a new slab and return the first slot in it */
2774
2775STATIC regmatch_state *
2776S_push_slab(pTHX)
2777{
a35a87e7 2778#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2779 dMY_CXT;
2780#endif
5d9a96ca
DM
2781 regmatch_slab *s = PL_regmatch_slab->next;
2782 if (!s) {
2783 Newx(s, 1, regmatch_slab);
2784 s->prev = PL_regmatch_slab;
2785 s->next = NULL;
2786 PL_regmatch_slab->next = s;
2787 }
2788 PL_regmatch_slab = s;
86545054 2789 return SLAB_FIRST(s);
5d9a96ca 2790}
5b47454d 2791
95b24440 2792
40a82448
DM
2793/* push a new state then goto it */
2794
2795#define PUSH_STATE_GOTO(state, node) \
2796 scan = node; \
2797 st->resume_state = state; \
2798 goto push_state;
2799
2800/* push a new state with success backtracking, then goto it */
2801
2802#define PUSH_YES_STATE_GOTO(state, node) \
2803 scan = node; \
2804 st->resume_state = state; \
2805 goto push_yes_state;
2806
aa283a38 2807
aa283a38 2808
d6a28714 2809/*
95b24440 2810
bf1f174e
DM
2811regmatch() - main matching routine
2812
2813This is basically one big switch statement in a loop. We execute an op,
2814set 'next' to point the next op, and continue. If we come to a point which
2815we may need to backtrack to on failure such as (A|B|C), we push a
2816backtrack state onto the backtrack stack. On failure, we pop the top
2817state, and re-enter the loop at the state indicated. If there are no more
2818states to pop, we return failure.
2819
2820Sometimes we also need to backtrack on success; for example /A+/, where
2821after successfully matching one A, we need to go back and try to
2822match another one; similarly for lookahead assertions: if the assertion
2823completes successfully, we backtrack to the state just before the assertion
2824and then carry on. In these cases, the pushed state is marked as
2825'backtrack on success too'. This marking is in fact done by a chain of
2826pointers, each pointing to the previous 'yes' state. On success, we pop to
2827the nearest yes state, discarding any intermediate failure-only states.
2828Sometimes a yes state is pushed just to force some cleanup code to be
2829called at the end of a successful match or submatch; e.g. (??{$re}) uses
2830it to free the inner regex.
2831
2832Note that failure backtracking rewinds the cursor position, while
2833success backtracking leaves it alone.
2834
2835A pattern is complete when the END op is executed, while a subpattern
2836such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2837ops trigger the "pop to last yes state if any, otherwise return true"
2838behaviour.
2839
2840A common convention in this function is to use A and B to refer to the two
2841subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2842the subpattern to be matched possibly multiple times, while B is the entire
2843rest of the pattern. Variable and state names reflect this convention.
2844
2845The states in the main switch are the union of ops and failure/success of
2846substates associated with with that op. For example, IFMATCH is the op
2847that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2848'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2849successfully matched A and IFMATCH_A_fail is a state saying that we have
2850just failed to match A. Resume states always come in pairs. The backtrack
2851state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2852at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2853on success or failure.
2854
2855The struct that holds a backtracking state is actually a big union, with
2856one variant for each major type of op. The variable st points to the
2857top-most backtrack struct. To make the code clearer, within each
2858block of code we #define ST to alias the relevant union.
2859
2860Here's a concrete example of a (vastly oversimplified) IFMATCH
2861implementation:
2862
2863 switch (state) {
2864 ....
2865
2866#define ST st->u.ifmatch
2867
2868 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2869 ST.foo = ...; // some state we wish to save
95b24440 2870 ...
bf1f174e
DM
2871 // push a yes backtrack state with a resume value of
2872 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2873 // first node of A:
2874 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2875 // NOTREACHED
2876
2877 case IFMATCH_A: // we have successfully executed A; now continue with B
2878 next = B;
2879 bar = ST.foo; // do something with the preserved value
2880 break;
2881
2882 case IFMATCH_A_fail: // A failed, so the assertion failed
2883 ...; // do some housekeeping, then ...
2884 sayNO; // propagate the failure
2885
2886#undef ST
95b24440 2887
bf1f174e
DM
2888 ...
2889 }
95b24440 2890
bf1f174e
DM
2891For any old-timers reading this who are familiar with the old recursive
2892approach, the code above is equivalent to:
95b24440 2893
bf1f174e
DM
2894 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2895 {
2896 int foo = ...
95b24440 2897 ...
bf1f174e
DM
2898 if (regmatch(A)) {
2899 next = B;
2900 bar = foo;
2901 break;
95b24440 2902 }
bf1f174e
DM
2903 ...; // do some housekeeping, then ...
2904 sayNO; // propagate the failure
95b24440 2905 }
bf1f174e
DM
2906
2907The topmost backtrack state, pointed to by st, is usually free. If you
2908want to claim it, populate any ST.foo fields in it with values you wish to
2909save, then do one of
2910
2911 PUSH_STATE_GOTO(resume_state, node);
2912 PUSH_YES_STATE_GOTO(resume_state, node);
2913
2914which sets that backtrack state's resume value to 'resume_state', pushes a
2915new free entry to the top of the backtrack stack, then goes to 'node'.
2916On backtracking, the free slot is popped, and the saved state becomes the
2917new free state. An ST.foo field in this new top state can be temporarily
2918accessed to retrieve values, but once the main loop is re-entered, it
2919becomes available for reuse.
2920
2921Note that the depth of the backtrack stack constantly increases during the
2922left-to-right execution of the pattern, rather than going up and down with
2923the pattern nesting. For example the stack is at its maximum at Z at the
2924end of the pattern, rather than at X in the following:
2925
2926 /(((X)+)+)+....(Y)+....Z/
2927
2928The only exceptions to this are lookahead/behind assertions and the cut,
2929(?>A), which pop all the backtrack states associated with A before
2930continuing.
2931
486ec47a 2932Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2933PL_regmatch_state and st always point to the currently active state,
2934and PL_regmatch_slab points to the slab currently containing
2935PL_regmatch_state. The first time regmatch() is called, the first slab is
2936allocated, and is never freed until interpreter destruction. When the slab
2937is full, a new one is allocated and chained to the end. At exit from
2938regmatch(), slabs allocated since entry are freed.
2939
2940*/
95b24440 2941
40a82448 2942
5bc10b2c 2943#define DEBUG_STATE_pp(pp) \
265c4333 2944 DEBUG_STATE_r({ \
f2ed9b32 2945 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2946 PerlIO_printf(Perl_debug_log, \
5d458dd8 2947 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2948 depth*2, "", \
13d6edb4 2949 PL_reg_name[st->resume_state], \
5d458dd8
YO
2950 ((st==yes_state||st==mark_state) ? "[" : ""), \
2951 ((st==yes_state) ? "Y" : ""), \
2952 ((st==mark_state) ? "M" : ""), \
2953 ((st==yes_state||st==mark_state) ? "]" : "") \
2954 ); \
265c4333 2955 });
5bc10b2c 2956
40a82448 2957
3dab1dad 2958#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2959
3df15adc 2960#ifdef DEBUGGING
5bc10b2c 2961
ab3bbdeb 2962STATIC void
f2ed9b32 2963S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2964 const char *start, const char *end, const char *blurb)
2965{
efd26800 2966 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2967
2968 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2969
ab3bbdeb
YO
2970 if (!PL_colorset)
2971 reginitcolors();
2972 {
2973 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2974 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2975
f2ed9b32 2976 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2977 start, end - start, 60);
2978
2979 PerlIO_printf(Perl_debug_log,
2980 "%s%s REx%s %s against %s\n",
2981 PL_colors[4], blurb, PL_colors[5], s0, s1);
2982
f2ed9b32 2983 if (utf8_target||utf8_pat)
1de06328
YO
2984 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2985 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2986 utf8_pat && utf8_target ? " and " : "",
2987 utf8_target ? "string" : ""
ab3bbdeb
YO
2988 );
2989 }
2990}
3df15adc
YO
2991
2992STATIC void
786e8c11
YO
2993S_dump_exec_pos(pTHX_ const char *locinput,
2994 const regnode *scan,
2995 const char *loc_regeol,
2996 const char *loc_bostr,
2997 const char *loc_reg_starttry,
f2ed9b32 2998 const bool utf8_target)
07be1b83 2999{
786e8c11 3000 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3001 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3002 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3003 /* The part of the string before starttry has one color
3004 (pref0_len chars), between starttry and current
3005 position another one (pref_len - pref0_len chars),
3006 after the current position the third one.
3007 We assume that pref0_len <= pref_len, otherwise we
3008 decrease pref0_len. */
786e8c11
YO
3009 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3010 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3011 int pref0_len;
3012
7918f24d
NC
3013 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3014
f2ed9b32 3015 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3016 pref_len++;
786e8c11
YO
3017 pref0_len = pref_len - (locinput - loc_reg_starttry);
3018 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3019 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3020 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3021 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3022 l--;
3023 if (pref0_len < 0)
3024 pref0_len = 0;
3025 if (pref0_len > pref_len)
3026 pref0_len = pref_len;
3027 {
f2ed9b32 3028 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3029
ab3bbdeb 3030 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3031 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3032
ab3bbdeb 3033 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3034 (locinput - pref_len + pref0_len),
1de06328 3035 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3036
ab3bbdeb 3037 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3038 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3039
1de06328 3040 const STRLEN tlen=len0+len1+len2;
3df15adc 3041 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3042 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3043 (IV)(locinput - loc_bostr),
07be1b83 3044 len0, s0,
07be1b83 3045 len1, s1,
07be1b83 3046 (docolor ? "" : "> <"),
07be1b83 3047 len2, s2,
f9f4320a 3048 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3049 "");
3050 }
3051}
3df15adc 3052
07be1b83
YO
3053#endif
3054
0a4db386
YO
3055/* reg_check_named_buff_matched()
3056 * Checks to see if a named buffer has matched. The data array of
3057 * buffer numbers corresponding to the buffer is expected to reside
3058 * in the regexp->data->data array in the slot stored in the ARG() of
3059 * node involved. Note that this routine doesn't actually care about the
3060 * name, that information is not preserved from compilation to execution.
3061 * Returns the index of the leftmost defined buffer with the given name
3062 * or 0 if non of the buffers matched.
3063 */
3064STATIC I32
7918f24d
NC
3065S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3066{
0a4db386 3067 I32 n;
f8fc2ecf 3068 RXi_GET_DECL(rex,rexi);
ad64d0ec 3069 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3070 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3071
3072 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3073
0a4db386
YO
3074 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3075 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 3076 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
3077 {
3078 return nums[n];
3079 }
3080 }
3081 return 0;
3082}
3083
2f554ef7
DM
3084
3085/* free all slabs above current one - called during LEAVE_SCOPE */
3086
3087STATIC void
3088S_clear_backtrack_stack(pTHX_ void *p)
3089{
3090 regmatch_slab *s = PL_regmatch_slab->next;
3091 PERL_UNUSED_ARG(p);
3092
3093 if (!s)
3094 return;
3095 PL_regmatch_slab->next = NULL;
3096 while (s) {
3097 regmatch_slab * const osl = s;
3098 s = s->next;
3099 Safefree(osl);
3100 }
3101}
3102
3103
28d8d7f4
YO
3104#define SETREX(Re1,Re2) \
3105 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3106 Re1 = (Re2)
3107
d6a28714 3108STATIC I32 /* 0 failure, 1 success */
24b23f37 3109S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3110{
a35a87e7 3111#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3112 dMY_CXT;
3113#endif
27da23d5 3114 dVAR;
f2ed9b32 3115 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3116 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3117 REGEXP *rex_sv = reginfo->prog;
3118 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3119 RXi_GET_DECL(rex,rexi);
2f554ef7 3120 I32 oldsave;
5d9a96ca
DM
3121 /* the current state. This is a cached copy of PL_regmatch_state */
3122 register regmatch_state *st;
5d9a96ca
DM
3123 /* cache heavy used fields of st in registers */
3124 register regnode *scan;
3125 register regnode *next;
438e9bae 3126 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3127 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3128 register char *locinput = PL_reginput;
5d9a96ca 3129 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3130
b69b0499 3131 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3132 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3133 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3134 const U32 max_nochange_depth =
3135 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3136 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3137 regmatch_state *yes_state = NULL; /* state to pop to on success of
3138 subpattern */
e2e6a0f1
YO
3139 /* mark_state piggy backs on the yes_state logic so that when we unwind
3140 the stack on success we can update the mark_state as we go */
3141 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3142 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3143 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3144 U32 state_num;
5d458dd8
YO
3145 bool no_final = 0; /* prevent failure from backtracking? */
3146 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3147 char *startpoint = PL_reginput;
5d458dd8
YO
3148 SV *popmark = NULL; /* are we looking for a mark? */
3149 SV *sv_commit = NULL; /* last mark name seen in failure */
3150 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3151 during a successful match */
5d458dd8
YO
3152 U32 lastopen = 0; /* last open we saw */
3153 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3154 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3155 /* these three flags are set by various ops to signal information to
3156 * the very next op. They have a useful lifetime of exactly one loop
3157 * iteration, and are not preserved or restored by state pushes/pops
3158 */
3159 bool sw = 0; /* the condition value in (?(cond)a|b) */
3160 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3161 int logical = 0; /* the following EVAL is:
3162 0: (?{...})
3163 1: (?(?{...})X|Y)
3164 2: (??{...})
3165 or the following IFMATCH/UNLESSM is:
3166 false: plain (?=foo)
3167 true: used as a condition: (?(?=foo))
3168 */
95b24440 3169#ifdef DEBUGGING
e68ec53f 3170 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3171#endif
3172
7918f24d
NC
3173 PERL_ARGS_ASSERT_REGMATCH;
3174
3b57cd43 3175 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3176 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3177 }));
5d9a96ca
DM
3178 /* on first ever call to regmatch, allocate first slab */
3179 if (!PL_regmatch_slab) {
3180 Newx(PL_regmatch_slab, 1, regmatch_slab);
3181 PL_regmatch_slab->prev = NULL;
3182 PL_regmatch_slab->next = NULL;
86545054 3183 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3184 }
3185
2f554ef7
DM
3186 oldsave = PL_savestack_ix;
3187 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3188 SAVEVPTR(PL_regmatch_slab);
3189 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3190
3191 /* grab next free state slot */
3192 st = ++PL_regmatch_state;
86545054 3193 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3194 st = PL_regmatch_state = S_push_slab(aTHX);
3195
d6a28714
JH
3196 /* Note that nextchr is a byte even in UTF */
3197 nextchr = UCHARAT(locinput);
3198 scan = prog;
3199 while (scan != NULL) {
8ba1375e 3200
a3621e74 3201 DEBUG_EXECUTE_r( {
6136c704 3202 SV * const prop = sv_newmortal();
1de06328 3203 regnode *rnext=regnext(scan);
f2ed9b32 3204 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3205 regprop(rex, prop, scan);
07be1b83
YO
3206
3207 PerlIO_printf(Perl_debug_log,
3208 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3209 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3210 SvPVX_const(prop),
1de06328 3211 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3212 0 : (IV)(rnext - rexi->program));
2a782b5b 3213 });
d6a28714
JH
3214
3215 next = scan + NEXT_OFF(scan);
3216 if (next == scan)
3217 next = NULL;
40a82448 3218 state_num = OP(scan);
d6a28714 3219
40a82448 3220 reenter_switch:
34a81e2b
B
3221
3222 assert(PL_reglastparen == &rex->lastparen);
3223 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3224 assert(PL_regoffs == rex->offs);
3225
40a82448 3226 switch (state_num) {
d6a28714 3227 case BOL:
7fba1cd6 3228 if (locinput == PL_bostr)
d6a28714 3229 {
3b0527fe 3230 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3231 break;
3232 }
d6a28714
JH
3233 sayNO;
3234 case MBOL:
12d33761
HS
3235 if (locinput == PL_bostr ||
3236 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3237 {
b8c5462f
JH
3238 break;
3239 }
d6a28714
JH
3240 sayNO;
3241 case SBOL:
c2a73568 3242 if (locinput == PL_bostr)
b8c5462f 3243 break;
d6a28714
JH
3244 sayNO;
3245 case GPOS:
3b0527fe 3246 if (locinput == reginfo->ganch)
d6a28714
JH
3247 break;
3248 sayNO;
ee9b8eae
YO
3249
3250 case KEEPS:
3251 /* update the startpoint */
f0ab9afb 3252 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3253 PL_reginput = locinput;
f0ab9afb 3254 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3255 PUSH_STATE_GOTO(KEEPS_next, next);
3256 /*NOT-REACHED*/
3257 case KEEPS_next_fail:
3258 /* rollback the start point change */
f0ab9afb 3259 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3260 sayNO_SILENT;
3261 /*NOT-REACHED*/
d6a28714 3262 case EOL:
d6a28714
JH
3263 goto seol;
3264 case MEOL:
d6a28714 3265 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3266 sayNO;
b8c5462f 3267 break;
d6a28714
JH
3268 case SEOL:
3269 seol:
3270 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3271 sayNO;
d6a28714 3272 if (PL_regeol - locinput > 1)
b8c5462f 3273 sayNO;
b8c5462f 3274 break;
d6a28714
JH
3275 case EOS:
3276 if (PL_regeol != locinput)
b8c5462f 3277 sayNO;
d6a28714 3278 break;
ffc61ed2 3279 case SANY:
d6a28714 3280 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3281 sayNO;
f2ed9b32 3282 if (utf8_target) {
f33976b4
DB
3283 locinput += PL_utf8skip[nextchr];
3284 if (locinput > PL_regeol)
3285 sayNO;
3286 nextchr = UCHARAT(locinput);
3287 }
3288 else
3289 nextchr = UCHARAT(++locinput);
3290 break;
3291 case CANY:
3292 if (!nextchr && locinput >= PL_regeol)
3293 sayNO;
b8c5462f 3294 nextchr = UCHARAT(++locinput);
a0d0e21e 3295 break;
ffc61ed2 3296 case REG_ANY:
1aa99e6b
IH
3297 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3298 sayNO;
f2ed9b32 3299 if (utf8_target) {
b8c5462f 3300 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3301 if (locinput > PL_regeol)
3302 sayNO;
a0ed51b3 3303 nextchr = UCHARAT(locinput);
a0ed51b3 3304 }
1aa99e6b
IH
3305 else
3306 nextchr = UCHARAT(++locinput);
a0ed51b3 3307 break;
166ba7cd
DM
3308
3309#undef ST
3310#define ST st->u.trie
786e8c11
YO
3311 case TRIEC:
3312 /* In this case the charclass data is available inline so
3313 we can fail fast without a lot of extra overhead.
3314 */
fab2782b
YO
3315 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3316 DEBUG_EXECUTE_r(
3317 PerlIO_printf(Perl_debug_log,
3318 "%*s %sfailed to match trie start class...%s\n",
3319 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3320 );
3321 sayNO_SILENT;
3322 /* NOTREACHED */
786e8c11
YO
3323 }
3324 /* FALL THROUGH */
5b47454d 3325 case TRIE:
2e64971a
DM
3326 /* the basic plan of execution of the trie is:
3327 * At the beginning, run though all the states, and
3328 * find the longest-matching word. Also remember the position
3329 * of the shortest matching word. For example, this pattern:
3330 * 1 2 3 4 5
3331 * ab|a|x|abcd|abc
3332 * when matched against the string "abcde", will generate
3333 * accept states for all words except 3, with the longest
3334 * matching word being 4, and the shortest being 1 (with
3335 * the position being after char 1 of the string).
3336 *
3337 * Then for each matching word, in word order (i.e. 1,2,4,5),
3338 * we run the remainder of the pattern; on each try setting
3339 * the current position to the character following the word,
3340 * returning to try the next word on failure.
3341 *
3342 * We avoid having to build a list of words at runtime by
3343 * using a compile-time structure, wordinfo[].prev, which
3344 * gives, for each word, the previous accepting word (if any).
3345 * In the case above it would contain the mappings 1->2, 2->0,
3346 * 3->0, 4->5, 5->1. We can use this table to generate, from
3347 * the longest word (4 above), a list of all words, by
3348 * following the list of prev pointers; this gives us the
3349 * unordered list 4,5,1,2. Then given the current word we have
3350 * just tried, we can go through the list and find the
3351 * next-biggest word to try (so if we just failed on word 2,
3352 * the next in the list is 4).
3353 *
3354 * Since at runtime we don't record the matching position in
3355 * the string for each word, we have to work that out for
3356 * each word we're about to process. The wordinfo table holds
3357 * the character length of each word; given that we recorded
3358 * at the start: the position of the shortest word and its
3359 * length in chars, we just need to move the pointer the
3360 * difference between the two char lengths. Depending on
3361 * Unicode status and folding, that's cheap or expensive.
3362 *
3363 * This algorithm is optimised for the case where are only a
3364 * small number of accept states, i.e. 0,1, or maybe 2.
3365 * With lots of accepts states, and having to try all of them,
3366 * it becomes quadratic on number of accept states to find all
3367 * the next words.
3368 */
3369
3dab1dad 3370 {
07be1b83 3371 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3372 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3373
3374 /* what trie are we using right now */
be8e71aa 3375 reg_trie_data * const trie
f8fc2ecf 3376 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3377 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3378 U32 state = trie->startstate;
166ba7cd 3379
fab2782b 3380 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3381 if (trie->states[ state ].wordnum) {
3382 DEBUG_EXECUTE_r(
3383 PerlIO_printf(Perl_debug_log,
3384 "%*s %smatched empty string...%s\n",
5bc10b2c 3385 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3386 );
20dbff7c
YO
3387 if (!trie->jump)
3388 break;
3dab1dad
YO
3389 } else {
3390 DEBUG_EXECUTE_r(
3391 PerlIO_printf(Perl_debug_log,
786e8c11 3392 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3393 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3394 );
3395 sayNO_SILENT;
3396 }
3397 }
166ba7cd 3398
786e8c11
YO
3399 {
3400 U8 *uc = ( U8* )locinput;
3401
3402 STRLEN len = 0;
3403 STRLEN foldlen = 0;
3404 U8 *uscan = (U8*)NULL;
786e8c11 3405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3406 U32 charcount = 0; /* how many input chars we have matched */
3407 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3408
786e8c11
YO
3409 ST.B = next;
3410 ST.jump = trie->jump;
786e8c11 3411 ST.me = scan;
2e64971a
DM
3412 ST.firstpos = NULL;
3413 ST.longfold = FALSE; /* char longer if folded => it's harder */
3414 ST.nextword = 0;
3415
3416 /* fully traverse the TRIE; note the position of the
3417 shortest accept state and the wordnum of the longest
3418 accept state */
07be1b83 3419
a3621e74 3420 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3421 U32 base = trie->states[ state ].trans.base;
f9f4320a 3422 UV uvc = 0;
acb909b4 3423 U16 charid = 0;
2e64971a
DM
3424 U16 wordnum;
3425 wordnum = trie->states[ state ].wordnum;
3426
3427 if (wordnum) { /* it's an accept state */
3428 if (!accepted) {
3429 accepted = 1;
3430 /* record first match position */
3431 if (ST.longfold) {
3432 ST.firstpos = (U8*)locinput;
3433 ST.firstchars = 0;
5b47454d 3434 }
2e64971a
DM
3435 else {
3436 ST.firstpos = uc;
3437 ST.firstchars = charcount;
3438 }
3439 }
3440 if (!ST.nextword || wordnum < ST.nextword)
3441 ST.nextword = wordnum;
3442 ST.topword = wordnum;
786e8c11 3443 }
a3621e74 3444
07be1b83 3445 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3446 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3447 PerlIO_printf( Perl_debug_log,
2e64971a 3448 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3449 2+depth * 2, "", PL_colors[4],
2e64971a 3450 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3451 });
a3621e74 3452
2e64971a 3453 /* read a char and goto next state */
a3621e74 3454 if ( base ) {
6dd2be57 3455 I32 offset;
55eed653
NC
3456 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3457 uscan, len, uvc, charid, foldlen,
3458 foldbuf, uniflags);
2e64971a
DM
3459 charcount++;
3460 if (foldlen>0)
3461 ST.longfold = TRUE;
5b47454d 3462 if (charid &&
6dd2be57
DM
3463 ( ((offset =
3464 base + charid - 1 - trie->uniquecharcount)) >= 0)
3465
3466 && ((U32)offset < trie->lasttrans)
3467 && trie->trans[offset].check == state)
5b47454d 3468 {
6dd2be57 3469 state = trie->trans[offset].next;
5b47454d
DM
3470 }
3471 else {
3472 state = 0;
3473 }
3474 uc += len;
3475
3476 }
3477 else {
a3621e74
YO
3478 state = 0;
3479 }
3480 DEBUG_TRIE_EXECUTE_r(
e4584336 3481 PerlIO_printf( Perl_debug_log,
786e8c11 3482 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3483 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3484 );
3485 }
2e64971a 3486 if (!accepted)
a3621e74 3487 sayNO;
a3621e74 3488
2e64971a
DM
3489 /* calculate total number of accept states */
3490 {
3491 U16 w = ST.topword;
3492 accepted = 0;
3493 while (w) {
3494 w = trie->wordinfo[w].prev;
3495 accepted++;
3496 }
3497 ST.accepted = accepted;
3498 }
3499
166ba7cd
DM
3500 DEBUG_EXECUTE_r(
3501 PerlIO_printf( Perl_debug_log,
3502 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3503 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3504 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3505 );
2e64971a 3506 goto trie_first_try; /* jump into the fail handler */
786e8c11 3507 }}
fae667d5 3508 /* NOTREACHED */
2e64971a
DM
3509
3510 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3511 if ( ST.jump) {
3512 REGCP_UNWIND(ST.cp);
3513 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3514 PL_regoffs[n].end = -1;
fae667d5
YO
3515 *PL_reglastparen = n;
3516 }
2e64971a
DM
3517 if (!--ST.accepted) {
3518 DEBUG_EXECUTE_r({
3519 PerlIO_printf( Perl_debug_log,
3520 "%*s %sTRIE failed...%s\n",
3521 REPORT_CODE_OFF+depth*2, "",
3522 PL_colors[4],
3523 PL_colors[5] );
3524 });
3525 sayNO_SILENT;
3526 }
3527 {
3528 /* Find next-highest word to process. Note that this code
3529 * is O(N^2) per trie run (O(N) per branch), so keep tight */
d9a396a3
DM
3530 register U16 min = 0;
3531 register U16 word;
2e64971a
DM
3532 register U16 const nextword = ST.nextword;
3533 register reg_trie_wordinfo * const wordinfo
3534 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3535 for (word=ST.topword; word; word=wordinfo[word].prev) {
3536 if (word > nextword && (!min || word < min))
3537 min = word;
3538 }
3539 ST.nextword = min;
3540 }
3541
fae667d5 3542 trie_first_try:
5d458dd8
YO
3543 if (do_cutgroup) {
3544 do_cutgroup = 0;
3545 no_final = 0;
3546 }
fae667d5
YO
3547
3548 if ( ST.jump) {
3549 ST.lastparen = *PL_reglastparen;
3550 REGCP_SET(ST.cp);
2e64971a 3551 }
a3621e74 3552
2e64971a 3553 /* find start char of end of current word */
166ba7cd 3554 {
2e64971a
DM
3555 U32 chars; /* how many chars to skip */
3556 U8 *uc = ST.firstpos;
3557 reg_trie_data * const trie
3558 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3559
3560 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3561 >= ST.firstchars);
3562 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3563 - ST.firstchars;
3564
3565 if (ST.longfold) {
3566 /* the hard option - fold each char in turn and find
3567 * its folded length (which may be different */
3568 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3569 STRLEN foldlen;
3570 STRLEN len;
d9a396a3 3571 UV uvc;
2e64971a
DM
3572 U8 *uscan;
3573
3574 while (chars) {
f2ed9b32 3575 if (utf8_target) {
2e64971a
DM
3576 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3577 uniflags);
3578 uc += len;
3579 }
3580 else {
3581 uvc = *uc;
3582 uc++;
3583 }
3584 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3585 uscan = foldbuf;
3586 while (foldlen) {
3587 if (!--chars)
3588 break;
3589 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3590 uniflags);
3591 uscan += len;
3592 foldlen -= len;
3593 }
3594 }
a3621e74 3595 }
2e64971a 3596 else {
f2ed9b32 3597 if (utf8_target)
2e64971a
DM
3598 while (chars--)
3599 uc += UTF8SKIP(uc);
3600 else
3601 uc += chars;
3602 }
3603 PL_reginput = (char *)uc;
3604 }
166ba7cd 3605
2e64971a
DM
3606 scan = (ST.jump && ST.jump[ST.nextword])
3607 ? ST.me + ST.jump[ST.nextword]
3608 : ST.B;
166ba7cd 3609
2e64971a
DM
3610 DEBUG_EXECUTE_r({
3611 PerlIO_printf( Perl_debug_log,
3612 "%*s %sTRIE matched word #%d, continuing%s\n",
3613 REPORT_CODE_OFF+depth*2, "",
3614 PL_colors[4],
3615 ST.nextword,
3616 PL_colors[5]
3617 );
3618 });
3619
3620 if (ST.accepted > 1 || has_cutgroup) {
3621 PUSH_STATE_GOTO(TRIE_next, scan);
3622 /* NOTREACHED */
166ba7cd 3623 }
2e64971a
DM
3624 /* only one choice left - just continue */
3625 DEBUG_EXECUTE_r({
3626 AV *const trie_words
3627 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3628 SV ** const tmp = av_fetch( trie_words,
3629 ST.nextword-1, 0 );
3630 SV *sv= tmp ? sv_newmortal() : NULL;
3631
3632 PerlIO_printf( Perl_debug_log,
3633 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3634 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3635 ST.nextword,
3636 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3637 PL_colors[0], PL_colors[1],
c89df6cf 3638 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3639 )
3640 : "not compiled under -Dr",
3641 PL_colors[5] );
3642 });
3643
3644 locinput = PL_reginput;
3645 nextchr = UCHARAT(locinput);
3646 continue; /* execute rest of RE */
166ba7cd 3647 /* NOTREACHED */
166ba7cd
DM
3648#undef ST
3649
95b24440
DM
3650 case EXACT: {
3651 char *s = STRING(scan);
24d3c4a9 3652 ln = STR_LEN(scan);
f2ed9b32 3653 if (utf8_target != UTF_PATTERN) {
bc517b45 3654 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3655 char *l = locinput;
24d3c4a9 3656 const char * const e = s + ln;
a72c7584 3657
f2ed9b32 3658 if (utf8_target) {
5ff6fc6d 3659 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3660 while (s < e) {
a3b680e6 3661 STRLEN ulen;
1aa99e6b 3662 if (l >= PL_regeol)
5ff6fc6d
JH
3663 sayNO;
3664 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3665 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3666 uniflags))
5ff6fc6d 3667 sayNO;
bc517b45 3668 l += ulen;
5ff6fc6d 3669 s ++;
1aa99e6b 3670 }
5ff6fc6d
JH
3671 }
3672 else {
3673 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3674 while (s < e) {
a3b680e6 3675 STRLEN ulen;
1aa99e6b
IH
3676 if (l >= PL_regeol)
3677 sayNO;
5ff6fc6d 3678 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3679 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3680 uniflags))
1aa99e6b 3681 sayNO;
bc517b45 3682 s += ulen;
a72c7584 3683 l ++;
1aa99e6b 3684 }
5ff6fc6d 3685 }
1aa99e6b
IH
3686 locinput = l;
3687 nextchr = UCHARAT(locinput);
3688 break;
3689 }
bc517b45 3690 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3691 /* Inline the first character, for speed. */
3692 if (UCHARAT(s) != nextchr)
3693 sayNO;
24d3c4a9 3694 if (PL_regeol - locinput < ln)
d6a28714 3695 sayNO;
24d3c4a9 3696 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3697 sayNO;
24d3c4a9 3698 locinput += ln;
d6a28714
JH
3699 nextchr = UCHARAT(locinput);
3700 break;
95b24440 3701 }
9a5a5549 3702 case EXACTFL: {
a932d541 3703 re_fold_t folder;
9a5a5549
KW
3704 const U8 * fold_array;
3705 const char * s;
d513472c 3706 U32 fold_utf8_flags;
9a5a5549 3707
b8c5462f 3708 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3709 folder = foldEQ_locale;
3710 fold_array = PL_fold_locale;
17580e7a 3711 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3712 goto do_exactf;
3713
3c760661 3714 case EXACTFU_SS:
fab2782b 3715 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
3716 case EXACTFU:
3717 folder = foldEQ_latin1;
3718 fold_array = PL_fold_latin1;
2daa8fee 3719 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3720 goto do_exactf;
3721
2f7f8cb1
KW
3722 case EXACTFA:
3723 folder = foldEQ_latin1;
3724 fold_array = PL_fold_latin1;
57014d77 3725 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3726 goto do_exactf;
3727
9a5a5549
KW
3728 case EXACTF:
3729 folder = foldEQ;
3730 fold_array = PL_fold;
62bf7766 3731 fold_utf8_flags = 0;
9a5a5549
KW
3732
3733 do_exactf:
3734 s = STRING(scan);
24d3c4a9 3735 ln = STR_LEN(scan);
d6a28714 3736
3c760661
KW
3737 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3738 /* Either target or the pattern are utf8, or has the issue where
3739 * the fold lengths may differ. */
be8e71aa 3740 const char * const l = locinput;
d07ddd77 3741 char *e = PL_regeol;
bc517b45 3742
d513472c 3743 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3744 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3745 {
3746 sayNO;
5486206c 3747 }
d07ddd77
JH
3748 locinput = e;
3749 nextchr = UCHARAT(locinput);
3750 break;
a0ed51b3 3751 }
d6a28714 3752
0a138b74 3753 /* Neither the target nor the pattern are utf8 */
d6a28714 3754 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3755 UCHARAT(s) != fold_array[nextchr])
3756 {
a0ed51b3 3757 sayNO;
9a5a5549 3758 }
24d3c4a9 3759 if (PL_regeol - locinput < ln)
b8c5462f 3760 sayNO;
9a5a5549 3761 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3762 sayNO;
24d3c4a9 3763 locinput += ln;
d6a28714 3764 nextchr = UCHARAT(locinput);
a0d0e21e 3765 break;
9a5a5549 3766 }
63ac0dad
KW
3767
3768 /* XXX Could improve efficiency by separating these all out using a
3769 * macro or in-line function. At that point regcomp.c would no longer
3770 * have to set the FLAGS fields of these */
b2680017
YO
3771 case BOUNDL:
3772 case NBOUNDL:
3773 PL_reg_flags |= RF_tainted;
3774 /* FALL THROUGH */
3775 case BOUND:
63ac0dad 3776 case BOUNDU:
cfaf538b 3777 case BOUNDA:
b2680017 3778 case NBOUND:
63ac0dad 3779 case NBOUNDU:
cfaf538b 3780 case NBOUNDA:
b2680017 3781 /* was last char in word? */
f2e96b5d
KW
3782 if (utf8_target
3783 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3784 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3785 {
b2680017
YO
3786 if (locinput == PL_bostr)
3787 ln = '\n';
3788 else {
3789 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3790
3791 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3792 }
63ac0dad 3793 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3794 ln = isALNUM_uni(ln);
3795 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3796 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3797 }
3798 else {
3799 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3800 n = isALNUM_LC_utf8((U8*)locinput);
3801 }
3802 }
3803 else {
cfaf538b
KW
3804
3805 /* Here the string isn't utf8, or is utf8 and only ascii
3806 * characters are to match \w. In the latter case looking at
3807 * the byte just prior to the current one may be just the final
3808 * byte of a multi-byte character. This is ok. There are two
3809 * cases:
3810 * 1) it is a single byte character, and then the test is doing
3811 * just what it's supposed to.
3812 * 2) it is a multi-byte character, in which case the final
3813 * byte is never mistakable for ASCII, and so the test
3814 * will say it is not a word character, which is the
3815 * correct answer. */
b2680017
YO
3816 ln = (locinput != PL_bostr) ?
3817 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3818 switch (FLAGS(scan)) {
3819 case REGEX_UNICODE_CHARSET:
3820 ln = isWORDCHAR_L1(ln);
3821 n = isWORDCHAR_L1(nextchr);
3822 break;
3823 case REGEX_LOCALE_CHARSET:
3824 ln = isALNUM_LC(ln);
3825 n = isALNUM_LC(nextchr);
3826 break;
3827 case REGEX_DEPENDS_CHARSET:
3828 ln = isALNUM(ln);
3829 n = isALNUM(nextchr);
3830 break;
cfaf538b 3831 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3832 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3833 ln = isWORDCHAR_A(ln);
3834 n = isWORDCHAR_A(nextchr);
3835 break;
63ac0dad
KW
3836 default:
3837 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3838 break;
b2680017
YO
3839 }
3840 }
63ac0dad
KW
3841 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3842 * regcomp.sym */
3843 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3844 sayNO;
3845 break;
f56b6394 3846 case ANYOFV:
d6a28714 3847 case ANYOF:
f56b6394 3848 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3849 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3850 if (locinput >= PL_regeol)
3851 sayNO;
9e55ce06 3852
f2ed9b32 3853 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3854 sayNO;
b32d7d3e 3855 locinput += inclasslen;
b8c5462f 3856 nextchr = UCHARAT(locinput);
e0f9d4a8 3857 break;
ffc61ed2
JH
3858 }
3859 else {
3860 if (nextchr < 0)
3861 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3862 if (!nextchr && locinput >= PL_regeol)
3863 sayNO;
20ed0b26 3864 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3865 sayNO;
ffc61ed2 3866 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3867 break;
3868 }
b8c5462f 3869 break;
20d0b1e9 3870 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3871 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3872 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3873 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3874 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 3875 alnum, "a");
ee9a90b8
KW
3876
3877 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3878 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3879 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 3880 SPACEA, NSPACEA, isSPACE_A,
779d7b58 3881 space, " ");
ee9a90b8
KW
3882
3883 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3884 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 3885 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 3886 digit, "0");
20d0b1e9 3887
37e2e78e
KW
3888 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3889 a Unicode extended Grapheme Cluster */
3890 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3891 extended Grapheme Cluster is:
3892
3893 CR LF
3894 | Prepend* Begin Extend*
3895 | .
3896
3897 Begin is (Hangul-syllable | ! Control)
3898 Extend is (Grapheme_Extend | Spacing_Mark)
3899 Control is [ GCB_Control CR LF ]
3900
3901 The discussion below shows how the code for CLUMP is derived
3902 from this regex. Note that most of these concepts are from
3903 property values of the Grapheme Cluster Boundary (GCB) property.
3904 No code point can have multiple property values for a given
3905 property. Thus a code point in Prepend can't be in Control, but
3906 it must be in !Control. This is why Control above includes
3907 GCB_Control plus CR plus LF. The latter two are used in the GCB
3908 property separately, and so can't be in GCB_Control, even though
3909 they logically are controls. Control is not the same as gc=cc,
3910 but includes format and other characters as well.
3911
3912 The Unicode definition of Hangul-syllable is:
3913 L+
3914 | (L* ( ( V | LV ) V* | LVT ) T*)
3915 | T+
3916 )
3917 Each of these is a value for the GCB property, and hence must be
3918 disjoint, so the order they are tested is immaterial, so the
3919 above can safely be changed to
3920 T+
3921 | L+
3922 | (L* ( LVT | ( V | LV ) V*) T*)
3923
3924 The last two terms can be combined like this:
3925 L* ( L
3926 | (( LVT | ( V | LV ) V*) T*))
3927
3928 And refactored into this:
3929 L* (L | LVT T* | V V* T* | LV V* T*)
3930
3931 That means that if we have seen any L's at all we can quit
cc3b396d 3932 there, but if the next character is an LVT, a V, or an LV we
37e2e78e
KW
3933 should keep going.
3934
3935 There is a subtlety with Prepend* which showed up in testing.
3936 Note that the Begin, and only the Begin is required in:
3937 | Prepend* Begin Extend*
cc3b396d
KW
3938 Also, Begin contains '! Control'. A Prepend must be a
3939 '! Control', which means it must also be a Begin. What it
3940 comes down to is that if we match Prepend* and then find no
3941 suitable Begin afterwards, that if we backtrack the last
3942 Prepend, that one will be a suitable Begin.
37e2e78e
KW
3943 */
3944
b7c83a7e 3945 if (locinput >= PL_regeol)
a0ed51b3 3946 sayNO;
f2ed9b32 3947 if (! utf8_target) {
37e2e78e
KW
3948
3949 /* Match either CR LF or '.', as all the other possibilities
3950 * require utf8 */
3951 locinput++; /* Match the . or CR */
cc3b396d
KW
3952 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3953 match the LF */
37e2e78e
KW
3954 && locinput < PL_regeol
3955 && UCHARAT(locinput) == '\n') locinput++;
3956 }
3957 else {
3958
3959 /* Utf8: See if is ( CR LF ); already know that locinput <
3960 * PL_regeol, so locinput+1 is in bounds */
3961 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3962 locinput += 2;
3963 }
3964 else {
3965 /* In case have to backtrack to beginning, then match '.' */
3966 char *starting = locinput;
3967
3968 /* In case have to backtrack the last prepend */
3969 char *previous_prepend = 0;
3970
3971 LOAD_UTF8_CHARCLASS_GCB();
3972
3973 /* Match (prepend)* */
3974 while (locinput < PL_regeol
3975 && swash_fetch(PL_utf8_X_prepend,
f2ed9b32 3976 (U8*)locinput, utf8_target))
37e2e78e
KW
3977 {
3978 previous_prepend = locinput;
3979 locinput += UTF8SKIP(locinput);
3980 }
3981
3982 /* As noted above, if we matched a prepend character, but
3983 * the next thing won't match, back off the last prepend we
3984 * matched, as it is guaranteed to match the begin */
3985 if (previous_prepend
3986 && (locinput >= PL_regeol
3987 || ! swash_fetch(PL_utf8_X_begin,
f2ed9b32 3988 (U8*)locinput, utf8_target)))
37e2e78e
KW
3989 {
3990 locinput = previous_prepend;
3991 }
3992
3993 /* Note that here we know PL_regeol > locinput, as we
3994 * tested that upon input to this switch case, and if we
3995 * moved locinput forward, we tested the result just above
3996 * and it either passed, or we backed off so that it will
3997 * now pass */
f2ed9b32 3998 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
37e2e78e
KW
3999
4000 /* Here did not match the required 'Begin' in the
4001 * second term. So just match the very first
4002 * character, the '.' of the final term of the regex */
4003 locinput = starting + UTF8SKIP(starting);
4004 } else {
4005
4006 /* Here is the beginning of a character that can have
4007 * an extender. It is either a hangul syllable, or a
4008 * non-control */
4009 if (swash_fetch(PL_utf8_X_non_hangul,
f2ed9b32 4010 (U8*)locinput, utf8_target))
37e2e78e
KW
4011 {
4012
4013 /* Here not a Hangul syllable, must be a
4014 * ('! * Control') */
4015 locinput += UTF8SKIP(locinput);
4016 } else {
4017
4018 /* Here is a Hangul syllable. It can be composed
4019 * of several individual characters. One
4020 * possibility is T+ */
4021 if (swash_fetch(PL_utf8_X_T,
f2ed9b32 4022 (U8*)locinput, utf8_target))
37e2e78e
KW
4023 {
4024 while (locinput < PL_regeol
4025 && swash_fetch(PL_utf8_X_T,
f2ed9b32 4026 (U8*)locinput, utf8_target))
37e2e78e
KW
4027 {
4028 locinput += UTF8SKIP(locinput);
4029 }
4030 } else {
4031
4032 /* Here, not T+, but is a Hangul. That means
4033 * it is one of the others: L, LV, LVT or V,
4034 * and matches:
4035 * L* (L | LVT T* | V V* T* | LV V* T*) */
4036
4037 /* Match L* */
4038 while (locinput < PL_regeol
4039 && swash_fetch(PL_utf8_X_L,
f2ed9b32 4040 (U8*)locinput, utf8_target))
37e2e78e
KW
4041 {
4042 locinput += UTF8SKIP(locinput);
4043 }
4044
4045 /* Here, have exhausted L*. If the next
4046 * character is not an LV, LVT nor V, it means
4047 * we had to have at least one L, so matches L+
4048 * in the original equation, we have a complete
4049 * hangul syllable. Are done. */
4050
4051 if (locinput < PL_regeol
4052 && swash_fetch(PL_utf8_X_LV_LVT_V,
f2ed9b32 4053 (U8*)locinput, utf8_target))
37e2e78e
KW
4054 {
4055
4056 /* Otherwise keep going. Must be LV, LVT
4057 * or V. See if LVT */
4058 if (swash_fetch(PL_utf8_X_LVT,
f2ed9b32 4059 (U8*)locinput, utf8_target))
37e2e78e
KW
4060 {
4061 locinput += UTF8SKIP(locinput);
4062 } else {
4063
4064 /* Must be V or LV. Take it, then
4065 * match V* */
4066 locinput += UTF8SKIP(locinput);
4067 while (locinput < PL_regeol
4068 && swash_fetch(PL_utf8_X_V,
f2ed9b32 4069 (U8*)locinput, utf8_target))
37e2e78e
KW
4070 {
4071 locinput += UTF8SKIP(locinput);
4072 }
4073 }
4074
4075 /* And any of LV, LVT, or V can be followed
4076 * by T* */
4077 while (locinput < PL_regeol
4078 && swash_fetch(PL_utf8_X_T,
4079 (U8*)locinput,
f2ed9b32 4080 utf8_target))
37e2e78e
KW
4081 {
4082 locinput += UTF8SKIP(locinput);
4083 }
4084 }
4085 }
4086 }
4087
4088 /* Match any extender */
4089 while (locinput < PL_regeol
4090 && swash_fetch(PL_utf8_X_extend,
f2ed9b32 4091 (U8*)locinput, utf8_target))
37e2e78e
KW
4092 {
4093 locinput += UTF8SKIP(locinput);
4094 }
4095 }
4096 }
4097 if (locinput > PL_regeol) sayNO;
4098 }
a0ed51b3
LW
4099 nextchr = UCHARAT(locinput);
4100 break;
81714fb9
YO
4101
4102 case NREFFL:
d7ef4b73
KW
4103 { /* The capture buffer cases. The ones beginning with N for the
4104 named buffers just convert to the equivalent numbered and
4105 pretend they were called as the corresponding numbered buffer
4106 op. */
26ecd678
TC
4107 /* don't initialize these in the declaration, it makes C++
4108 unhappy */
81714fb9 4109 char *s;
ff1157ca 4110 char type;
8368298a
TC
4111 re_fold_t folder;
4112 const U8 *fold_array;
26ecd678 4113 UV utf8_fold_flags;
8368298a 4114
81714fb9 4115 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4116 folder = foldEQ_locale;
4117 fold_array = PL_fold_locale;
4118 type = REFFL;
17580e7a 4119 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4120 goto do_nref;
4121
2f7f8cb1
KW
4122 case NREFFA:
4123 folder = foldEQ_latin1;
4124 fold_array = PL_fold_latin1;
4125 type = REFFA;
4126 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4127 goto do_nref;
4128
d7ef4b73
KW
4129 case NREFFU:
4130 folder = foldEQ_latin1;
4131 fold_array = PL_fold_latin1;
4132 type = REFFU;
d513472c 4133 utf8_fold_flags = 0;
d7ef4b73
KW
4134 goto do_nref;
4135
81714fb9 4136 case NREFF:
d7ef4b73
KW
4137 folder = foldEQ;
4138 fold_array = PL_fold;
4139 type = REFF;
d513472c 4140 utf8_fold_flags = 0;
d7ef4b73
KW
4141 goto do_nref;
4142
4143 case NREF:
4144 type = REF;
83d7b90b
KW
4145 folder = NULL;
4146 fold_array = NULL;
d513472c 4147 utf8_fold_flags = 0;
d7ef4b73
KW
4148 do_nref:
4149
4150 /* For the named back references, find the corresponding buffer
4151 * number */
0a4db386
YO
4152 n = reg_check_named_buff_matched(rex,scan);
4153
d7ef4b73 4154 if ( ! n ) {
81714fb9 4155 sayNO;
d7ef4b73
KW
4156 }
4157 goto do_nref_ref_common;
4158
c8756f30 4159 case REFFL:
3280af22 4160 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4161 folder = foldEQ_locale;
4162 fold_array = PL_fold_locale;
17580e7a 4163 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4164 goto do_ref;
4165
2f7f8cb1
KW
4166 case REFFA:
4167 folder = foldEQ_latin1;
4168 fold_array = PL_fold_latin1;
4169 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4170 goto do_ref;
4171
d7ef4b73
KW
4172 case REFFU:
4173 folder = foldEQ_latin1;
4174 fold_array = PL_fold_latin1;
d513472c 4175 utf8_fold_flags = 0;
d7ef4b73
KW
4176 goto do_ref;
4177
4178 case REFF:
4179 folder = foldEQ;
4180 fold_array = PL_fold;
d513472c 4181 utf8_fold_flags = 0;
83d7b90b 4182 goto do_ref;
d7ef4b73 4183
c277df42 4184 case REF:
83d7b90b
KW
4185 folder = NULL;
4186 fold_array = NULL;
d513472c 4187 utf8_fold_flags = 0;
83d7b90b 4188
d7ef4b73 4189 do_ref:
81714fb9 4190 type = OP(scan);
d7ef4b73
KW
4191 n = ARG(scan); /* which paren pair */
4192
4193 do_nref_ref_common:
f0ab9afb 4194 ln = PL_regoffs[n].start;
2c2d71f5 4195 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 4196 if (*PL_reglastparen < n || ln == -1)
af3f8c16 4197 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 4198 if (ln == PL_regoffs[n].end)
a0d0e21e 4199 break;
a0ed51b3 4200
24d3c4a9 4201 s = PL_bostr + ln;
d7ef4b73 4202 if (type != REF /* REF can do byte comparison */
2f65c56d 4203 && (utf8_target || type == REFFU))
d7ef4b73
KW
4204 { /* XXX handle REFFL better */
4205 char * limit = PL_regeol;
4206
4207 /* This call case insensitively compares the entire buffer
4208 * at s, with the current input starting at locinput, but
4209 * not going off the end given by PL_regeol, and returns in
4210 * limit upon success, how much of the current input was
4211 * matched */
d513472c
KW
4212 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4213 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4214 {
4215 sayNO;
a0ed51b3 4216 }
d7ef4b73 4217 locinput = limit;
a0ed51b3
LW
4218 nextchr = UCHARAT(locinput);
4219 break;
4220 }
4221
d7ef4b73 4222 /* Not utf8: Inline the first character, for speed. */
76e3520e 4223 if (UCHARAT(s) != nextchr &&
81714fb9 4224 (type == REF ||
d7ef4b73 4225 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4226 sayNO;
f0ab9afb 4227 ln = PL_regoffs[n].end - ln;
24d3c4a9 4228 if (locinput + ln > PL_regeol)
4633a7c4 4229 sayNO;
81714fb9 4230 if (ln > 1 && (type == REF
24d3c4a9 4231 ? memNE(s, locinput, ln)
d7ef4b73 4232 : ! folder(s, locinput, ln)))
4633a7c4 4233 sayNO;
24d3c4a9 4234 locinput += ln;
76e3520e 4235 nextchr = UCHARAT(locinput);
a0d0e21e 4236 break;
81714fb9 4237 }
a0d0e21e 4238 case NOTHING:
c277df42 4239 case TAIL:
a0d0e21e
LW
4240 break;
4241 case BACK:
4242 break;
40a82448
DM
4243
4244#undef ST
4245#define ST st->u.eval
c277df42 4246 {
c277df42 4247 SV *ret;
d2f13c59 4248 REGEXP *re_sv;
6bda09f9 4249 regexp *re;
f8fc2ecf 4250 regexp_internal *rei;
1a147d38
YO
4251 regnode *startpoint;
4252
4253 case GOSTART:
e7707071
YO
4254 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4255 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4256 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4257 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4258 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4259 Perl_croak(aTHX_
4260 "Pattern subroutine nesting without pos change"
4261 " exceeded limit in regex");
6bda09f9
YO
4262 } else {
4263 nochange_depth = 0;
1a147d38 4264 }
288b8c02 4265 re_sv = rex_sv;
6bda09f9 4266 re = rex;
f8fc2ecf 4267 rei = rexi;
288b8c02 4268 (void)ReREFCNT_inc(rex_sv);
1a147d38 4269 if (OP(scan)==GOSUB) {
6bda09f9
YO
4270 startpoint = scan + ARG2L(scan);
4271 ST.close_paren = ARG(scan);
4272 } else {
f8fc2ecf 4273 startpoint = rei->program+1;
6bda09f9
YO
4274 ST.close_paren = 0;
4275 }
4276 goto eval_recurse_doit;
4277 /* NOTREACHED */
4278 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4279 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4280 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4281 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4282 } else {
4283 nochange_depth = 0;
4284 }
8e5e9ebe 4285 {
4aabdb9b
DM
4286 /* execute the code in the {...} */
4287 dSP;
6136c704 4288 SV ** const before = SP;
4aabdb9b
DM
4289 OP_4tree * const oop = PL_op;
4290 COP * const ocurcop = PL_curcop;
4291 PAD *old_comppad;
d80618d2 4292 char *saved_regeol = PL_regeol;
91332126
FR
4293 struct re_save_state saved_state;
4294
6562f1c4 4295 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4296 * eval we would normally put it on the save stack, like with
4297 * save_re_context. However, re-evals have a weird scoping so we
4298 * can't just add ENTER/LEAVE here. With that, things like
4299 *
4300 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4301 *
4302 * would break, as they expect the localisation to be unwound
4303 * only when the re-engine backtracks through the bit that
4304 * localised it.
4305 *
4306 * What we do instead is just saving the state in a local c
4307 * variable.
4308 */
91332126
FR
4309 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4310
4aabdb9b 4311 n = ARG(scan);
f8fc2ecf 4312 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
4313 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4314 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
9e103e26
DM
4315 /* wrap the call in two SAVECOMPPADs. This ensures that
4316 * when the save stack is eventually unwound, all the
4317 * accumulated SAVEt_CLEARSV's will be processed with
4318 * interspersed SAVEt_COMPPAD's to ensure that lexicals
4319 * are cleared in the right pad */
4320 SAVECOMPPAD();
f8fc2ecf 4321 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 4322 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4323
2bf803e2
YO
4324 if (sv_yes_mark) {
4325 SV *sv_mrk = get_sv("REGMARK", 1);
4326 sv_setsv(sv_mrk, sv_yes_mark);
4327 }
4328
8e5e9ebe
RGS
4329 CALLRUNOPS(aTHX); /* Scalar context. */
4330 SPAGAIN;
4331 if (SP == before)
075aa684 4332 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4333 else {
4334 ret = POPs;
4335 PUTBACK;
4336 }
4aabdb9b 4337
91332126
FR
4338 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4339
4aabdb9b 4340 PL_op = oop;
9e103e26 4341 SAVECOMPPAD();
4aabdb9b
DM
4342 PAD_RESTORE_LOCAL(old_comppad);
4343 PL_curcop = ocurcop;
d80618d2 4344 PL_regeol = saved_regeol;
24d3c4a9 4345 if (!logical) {
4aabdb9b
DM
4346 /* /(?{...})/ */
4347 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
4348 break;
4349 }
8e5e9ebe 4350 }
24d3c4a9
DM
4351 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4352 logical = 0;
4aabdb9b 4353 {
4f639d21
DM
4354 /* extract RE object from returned value; compiling if
4355 * necessary */
6136c704 4356 MAGIC *mg = NULL;
288b8c02 4357 REGEXP *rx = NULL;
5c35adbb
NC
4358
4359 if (SvROK(ret)) {
288b8c02 4360 SV *const sv = SvRV(ret);
5c35adbb
NC
4361
4362 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 4363 rx = (REGEXP*) sv;
5c35adbb
NC
4364 } else if (SvSMAGICAL(sv)) {
4365 mg = mg_find(sv, PERL_MAGIC_qr);
4366 assert(mg);
4367 }
4368 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 4369 rx = (REGEXP*) ret;
5c35adbb 4370 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
4371 if (SvGMAGICAL(ret)) {
4372 /* I don't believe that there is ever qr magic
4373 here. */
4374 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 4375 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
4376 }
4377 else {
faf82a0b 4378 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
4379 /* testing suggests mg only ends up non-NULL for
4380 scalars who were upgraded and compiled in the
4381 else block below. In turn, this is only
4382 triggered in the "postponed utf8 string" tests
4383 in t/op/pat.t */
4384 }
0f5d15d6 4385 }
faf82a0b 4386
0f5d15d6 4387 if (mg) {
d2f13c59 4388 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 4389 assert(rx);
0f5d15d6 4390 }
288b8c02 4391 if (rx) {
f0826785 4392 rx = reg_temp_copy(NULL, rx);
288b8c02 4393 }
0f5d15d6 4394 else {
c737faaf 4395 U32 pm_flags = 0;
a3b680e6 4396 const I32 osize = PL_regsize;
0f5d15d6 4397
b9ad30b4
NC
4398 if (DO_UTF8(ret)) {
4399 assert (SvUTF8(ret));
4400 } else if (SvUTF8(ret)) {
4401 /* Not doing UTF-8, despite what the SV says. Is
4402 this only if we're trapped in use 'bytes'? */
4403 /* Make a copy of the octet sequence, but without
4404 the flag on, as the compiler now honours the
4405 SvUTF8 flag on ret. */
4406 STRLEN len;
4407 const char *const p = SvPV(ret, len);
4408 ret = newSVpvn_flags(p, len, SVs_TEMP);
4409 }
288b8c02 4410 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 4411 if (!(SvFLAGS(ret)
faf82a0b 4412 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4413 | SVs_GMG))) {
a2794585
NC
4414 /* This isn't a first class regexp. Instead, it's
4415 caching a regexp onto an existing, Perl visible
4416 scalar. */
ad64d0ec 4417 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4418 }
0f5d15d6 4419 PL_regsize = osize;
0f5d15d6 4420 }
288b8c02
NC
4421 re_sv = rx;
4422 re = (struct regexp *)SvANY(rx);
4aabdb9b 4423 }
07bc277f 4424 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4425 re->subbeg = rex->subbeg;
4426 re->sublen = rex->sublen;
f8fc2ecf 4427 rei = RXi_GET(re);
6bda09f9 4428 DEBUG_EXECUTE_r(
f2ed9b32 4429 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4430 "Matching embedded");
4431 );
f8fc2ecf 4432 startpoint = rei->program + 1;
1a147d38 4433 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
4434 /* borrowed from regtry */
4435 if (PL_reg_start_tmpl <= re->nparens) {
4436 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4437 if(PL_reg_start_tmp)
4438 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4439 else
4440 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 4441 }
aa283a38 4442
1a147d38 4443 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4444 /* run the pattern returned from (??{...}) */
40a82448
DM
4445 ST.cp = regcppush(0); /* Save *all* the positions. */
4446 REGCP_SET(ST.lastcp);
6bda09f9 4447
f0ab9afb 4448 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 4449
0357f1fd
ML
4450 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4451 PL_reglastparen = &re->lastparen;
4452 PL_reglastcloseparen = &re->lastcloseparen;
4453 re->lastparen = 0;
4454 re->lastcloseparen = 0;
4455
4aabdb9b 4456 PL_reginput = locinput;
ae0beba1 4457 PL_regsize = 0;
4aabdb9b
DM
4458
4459 /* XXXX This is too dramatic a measure... */
4460 PL_reg_maxiter = 0;
4461
faec1544 4462 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4463 if (RX_UTF8(re_sv))
faec1544
DM
4464 PL_reg_flags |= RF_utf8;
4465 else
4466 PL_reg_flags &= ~RF_utf8;
4467 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4468
288b8c02 4469 ST.prev_rex = rex_sv;
faec1544 4470 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
4471 SETREX(rex_sv,re_sv);
4472 rex = re;
f8fc2ecf 4473 rexi = rei;
faec1544 4474 cur_curlyx = NULL;
40a82448 4475 ST.B = next;
faec1544
DM
4476 ST.prev_eval = cur_eval;
4477 cur_eval = st;
faec1544 4478 /* now continue from first node in postoned RE */
6bda09f9 4479 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 4480 /* NOTREACHED */
a0ed51b3 4481 }
24d3c4a9 4482 /* logical is 1, /(?(?{...})X|Y)/ */
f2338a2e 4483 sw = cBOOL(SvTRUE(ret));
24d3c4a9 4484 logical = 0;
c277df42
IZ
4485 break;
4486 }
40a82448 4487
faec1544
DM
4488 case EVAL_AB: /* cleanup after a successful (??{A})B */
4489 /* note: this is called twice; first after popping B, then A */
4490 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4491 ReREFCNT_dec(rex_sv);
4492 SETREX(rex_sv,ST.prev_rex);
4493 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4494 rexi = RXi_GET(rex);
faec1544
DM
4495 regcpblow(ST.cp);
4496 cur_eval = ST.prev_eval;
4497 cur_curlyx = ST.prev_curlyx;
34a81e2b
B
4498
4499 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4500 PL_reglastparen = &rex->lastparen;
4501 PL_reglastcloseparen = &rex->lastcloseparen;
34a81e2b
B
4502 /* also update PL_regoffs */
4503 PL_regoffs = rex->offs;
0357f1fd 4504
40a82448
DM
4505 /* XXXX This is too dramatic a measure... */
4506 PL_reg_maxiter = 0;
e7707071 4507 if ( nochange_depth )
4b196cd4 4508 nochange_depth--;
262b90c4 4509 sayYES;
40a82448 4510
40a82448 4511
faec1544
DM
4512 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4513 /* note: this is called twice; first after popping B, then A */
4514 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4515 ReREFCNT_dec(rex_sv);
4516 SETREX(rex_sv,ST.prev_rex);
4517 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4518 rexi = RXi_GET(rex);
34a81e2b 4519 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4520 PL_reglastparen = &rex->lastparen;
4521 PL_reglastcloseparen = &rex->lastcloseparen;
4522
40a82448
DM
4523 PL_reginput = locinput;
4524 REGCP_UNWIND(ST.lastcp);
4525 regcppop(rex);
faec1544
DM
4526 cur_eval = ST.prev_eval;
4527 cur_curlyx = ST.prev_curlyx;
4528 /* XXXX This is too dramatic a measure... */
4529 PL_reg_maxiter = 0;
e7707071 4530 if ( nochange_depth )
4b196cd4 4531 nochange_depth--;
40a82448 4532 sayNO_SILENT;
40a82448
DM
4533#undef ST
4534
a0d0e21e 4535 case OPEN:
c277df42 4536 n = ARG(scan); /* which paren pair */
3280af22
NIS
4537 PL_reg_start_tmp[n] = locinput;
4538 if (n > PL_regsize)
4539 PL_regsize = n;
e2e6a0f1 4540 lastopen = n;
a0d0e21e
LW
4541 break;
4542 case CLOSE:
c277df42 4543 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
4544 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4545 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
4546 /*if (n > PL_regsize)
4547 PL_regsize = n;*/
3b6647e0 4548 if (n > *PL_reglastparen)
3280af22 4549 *PL_reglastparen = n;
a01268b5 4550 *PL_reglastcloseparen = n;
3b6647e0 4551 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4552 goto fake_end;
4553 }
a0d0e21e 4554 break;
e2e6a0f1
YO
4555 case ACCEPT:
4556 if (ARG(scan)){
4557 regnode *cursor;
4558 for (cursor=scan;
4559 cursor && OP(cursor)!=END;
4560 cursor=regnext(cursor))
4561 {
4562 if ( OP(cursor)==CLOSE ){
4563 n = ARG(cursor);
4564 if ( n <= lastopen ) {
f0ab9afb
NC
4565 PL_regoffs[n].start
4566 = PL_reg_start_tmp[n] - PL_bostr;
4567 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
4568 /*if (n > PL_regsize)
4569 PL_regsize = n;*/
3b6647e0 4570 if (n > *PL_reglastparen)
e2e6a0f1
YO
4571 *PL_reglastparen = n;
4572 *PL_reglastcloseparen = n;
3b6647e0
RB
4573 if ( n == ARG(scan) || (cur_eval &&
4574 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4575 break;
4576 }
4577 }
4578 }
4579 }
4580 goto fake_end;
4581 /*NOTREACHED*/
c277df42
IZ
4582 case GROUPP:
4583 n = ARG(scan); /* which paren pair */
f2338a2e 4584 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 4585 break;
0a4db386
YO
4586 case NGROUPP:
4587 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4588 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4589 break;
1a147d38 4590 case INSUBP:
0a4db386 4591 n = ARG(scan);
3b6647e0 4592 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4593 break;
4594 case DEFINEP:
4595 sw = 0;
4596 break;
c277df42 4597 case IFTHEN:
2c2d71f5 4598 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4599 if (sw)
c277df42
IZ
4600 next = NEXTOPER(NEXTOPER(scan));
4601 else {
4602 next = scan + ARG(scan);
4603 if (OP(next) == IFTHEN) /* Fake one. */
4604 next = NEXTOPER(NEXTOPER(next));
4605 }
4606 break;
4607 case LOGICAL:
24d3c4a9 4608 logical = scan->flags;
c277df42 4609 break;
c476f425 4610
2ab05381 4611/*******************************************************************
2ab05381 4612
c476f425
DM
4613The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4614pattern, where A and B are subpatterns. (For simple A, CURLYM or
4615STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4616
c476f425 4617A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4618
c476f425
DM
4619On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4620state, which contains the current count, initialised to -1. It also sets
4621cur_curlyx to point to this state, with any previous value saved in the
4622state block.
2ab05381 4623
c476f425
DM
4624CURLYX then jumps straight to the WHILEM op, rather than executing A,
4625since the pattern may possibly match zero times (i.e. it's a while {} loop
4626rather than a do {} while loop).
2ab05381 4627
c476f425
DM
4628Each entry to WHILEM represents a successful match of A. The count in the
4629CURLYX block is incremented, another WHILEM state is pushed, and execution
4630passes to A or B depending on greediness and the current count.
2ab05381 4631
c476f425
DM
4632For example, if matching against the string a1a2a3b (where the aN are
4633substrings that match /A/), then the match progresses as follows: (the
4634pushed states are interspersed with the bits of strings matched so far):
2ab05381 4635
c476f425
DM
4636 <CURLYX cnt=-1>
4637 <CURLYX cnt=0><WHILEM>
4638 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4639 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4640 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4641 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4642
c476f425
DM
4643(Contrast this with something like CURLYM, which maintains only a single
4644backtrack state:
2ab05381 4645
c476f425
DM
4646 <CURLYM cnt=0> a1
4647 a1 <CURLYM cnt=1> a2
4648 a1 a2 <CURLYM cnt=2> a3
4649 a1 a2 a3 <CURLYM cnt=3> b
4650)
2ab05381 4651
c476f425
DM
4652Each WHILEM state block marks a point to backtrack to upon partial failure
4653of A or B, and also contains some minor state data related to that
4654iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4655overall state, such as the count, and pointers to the A and B ops.
2ab05381 4656
c476f425
DM
4657This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4658must always point to the *current* CURLYX block, the rules are:
2ab05381 4659
c476f425
DM
4660When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4661and set cur_curlyx to point the new block.
2ab05381 4662
c476f425
DM
4663When popping the CURLYX block after a successful or unsuccessful match,
4664restore the previous cur_curlyx.
2ab05381 4665
c476f425
DM
4666When WHILEM is about to execute B, save the current cur_curlyx, and set it
4667to the outer one saved in the CURLYX block.
2ab05381 4668
c476f425
DM
4669When popping the WHILEM block after a successful or unsuccessful B match,
4670restore the previous cur_curlyx.
2ab05381 4671
c476f425
DM
4672Here's an example for the pattern (AI* BI)*BO
4673I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4674
c476f425
DM
4675cur_
4676curlyx backtrack stack
4677------ ---------------
4678NULL
4679CO <CO prev=NULL> <WO>
4680CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4681CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4682NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4683
c476f425
DM
4684At this point the pattern succeeds, and we work back down the stack to
4685clean up, restoring as we go:
95b24440 4686
c476f425
DM
4687CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4688CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4689CO <CO prev=NULL> <WO>
4690NULL
a0374537 4691
c476f425
DM
4692*******************************************************************/
4693
4694#define ST st->u.curlyx
4695
4696 case CURLYX: /* start of /A*B/ (for complex A) */
4697 {
4698 /* No need to save/restore up to this paren */
4699 I32 parenfloor = scan->flags;
4700
4701 assert(next); /* keep Coverity happy */
4702 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4703 next += ARG(next);
4704
4705 /* XXXX Probably it is better to teach regpush to support
4706 parenfloor > PL_regsize... */
4707 if (parenfloor > (I32)*PL_reglastparen)
4708 parenfloor = *PL_reglastparen; /* Pessimization... */
4709
4710 ST.prev_curlyx= cur_curlyx;
4711 cur_curlyx = st;
4712 ST.cp = PL_savestack_ix;
4713
4714 /* these fields contain the state of the current curly.
4715 * they are accessed by subsequent WHILEMs */
4716 ST.parenfloor = parenfloor;
d02d6d97 4717 ST.me = scan;
c476f425 4718 ST.B = next;
24d3c4a9
DM
4719 ST.minmod = minmod;
4720 minmod = 0;
c476f425
DM
4721 ST.count = -1; /* this will be updated by WHILEM */
4722 ST.lastloc = NULL; /* this will be updated by WHILEM */
4723
4724 PL_reginput = locinput;
4725 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4726 /* NOTREACHED */
c476f425 4727 }
a0d0e21e 4728
c476f425 4729 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4730 cur_curlyx = ST.prev_curlyx;
4731 sayYES;
4732 /* NOTREACHED */
a0d0e21e 4733
c476f425
DM
4734 case CURLYX_end_fail: /* just failed to match all of A*B */
4735 regcpblow(ST.cp);
4736 cur_curlyx = ST.prev_curlyx;
4737 sayNO;
4738 /* NOTREACHED */
4633a7c4 4739
a0d0e21e 4740
c476f425
DM
4741#undef ST
4742#define ST st->u.whilem
4743
4744 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4745 {
4746 /* see the discussion above about CURLYX/WHILEM */
c476f425 4747 I32 n;
d02d6d97
DM
4748 int min = ARG1(cur_curlyx->u.curlyx.me);
4749 int max = ARG2(cur_curlyx->u.curlyx.me);
4750 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4751
c476f425
DM
4752 assert(cur_curlyx); /* keep Coverity happy */
4753 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4754 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4755 ST.cache_offset = 0;
4756 ST.cache_mask = 0;
4757
4758 PL_reginput = locinput;
4759
4760 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4761 "%*s whilem: matched %ld out of %d..%d\n",
4762 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4763 );
a0d0e21e 4764
c476f425 4765 /* First just match a string of min A's. */
a0d0e21e 4766
d02d6d97 4767 if (n < min) {
92e82afa 4768 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
c476f425 4769 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4770 REGCP_SET(ST.lastcp);
4771
d02d6d97 4772 PUSH_STATE_GOTO(WHILEM_A_pre, A);
c476f425
DM
4773 /* NOTREACHED */
4774 }
4775
4776 /* If degenerate A matches "", assume A done. */
4777
4778 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4779 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4780 "%*s whilem: empty match detected, trying continuation...\n",
4781 REPORT_CODE_OFF+depth*2, "")
4782 );
4783 goto do_whilem_B_max;
4784 }
4785
4786 /* super-linear cache processing */
4787
4788 if (scan->flags) {
a0d0e21e 4789
2c2d71f5 4790 if (!PL_reg_maxiter) {
c476f425
DM
4791 /* start the countdown: Postpone detection until we
4792 * know the match is not *that* much linear. */
2c2d71f5 4793 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4794 /* possible overflow for long strings and many CURLYX's */
4795 if (PL_reg_maxiter < 0)
4796 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4797 PL_reg_leftiter = PL_reg_maxiter;
4798 }
c476f425 4799
2c2d71f5 4800 if (PL_reg_leftiter-- == 0) {
c476f425 4801 /* initialise cache */
3298f257 4802 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4803 if (PL_reg_poscache) {
eb160463 4804 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4805 Renew(PL_reg_poscache, size, char);
4806 PL_reg_poscache_size = size;
4807 }
4808 Zero(PL_reg_poscache, size, char);
4809 }
4810 else {
4811 PL_reg_poscache_size = size;
a02a5408 4812 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4813 }
c476f425
DM
4814 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4815 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4816 PL_colors[4], PL_colors[5])
4817 );
2c2d71f5 4818 }
c476f425 4819
2c2d71f5 4820 if (PL_reg_leftiter < 0) {
c476f425
DM
4821 /* have we already failed at this position? */
4822 I32 offset, mask;
4823 offset = (scan->flags & 0xf) - 1
4824 + (locinput - PL_bostr) * (scan->flags>>4);
4825 mask = 1 << (offset % 8);
4826 offset /= 8;
4827 if (PL_reg_poscache[offset] & mask) {
4828 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4829 "%*s whilem: (cache) already tried at this position...\n",
4830 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4831 );
3298f257 4832 sayNO; /* cache records failure */
2c2d71f5 4833 }
c476f425
DM
4834 ST.cache_offset = offset;
4835 ST.cache_mask = mask;
2c2d71f5 4836 }
c476f425 4837 }
2c2d71f5 4838
c476f425 4839 /* Prefer B over A for minimal matching. */
a687059c 4840
c476f425
DM
4841 if (cur_curlyx->u.curlyx.minmod) {
4842 ST.save_curlyx = cur_curlyx;
4843 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4844 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4845 REGCP_SET(ST.lastcp);
4846 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4847 /* NOTREACHED */
4848 }
a0d0e21e 4849
c476f425
DM
4850 /* Prefer A over B for maximal matching. */
4851
d02d6d97 4852 if (n < max) { /* More greed allowed? */
c476f425
DM
4853 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4854 cur_curlyx->u.curlyx.lastloc = locinput;
4855 REGCP_SET(ST.lastcp);
d02d6d97 4856 PUSH_STATE_GOTO(WHILEM_A_max, A);
c476f425
DM
4857 /* NOTREACHED */
4858 }
4859 goto do_whilem_B_max;
4860 }
4861 /* NOTREACHED */
4862
4863 case WHILEM_B_min: /* just matched B in a minimal match */
4864 case WHILEM_B_max: /* just matched B in a maximal match */
4865 cur_curlyx = ST.save_curlyx;
4866 sayYES;
4867 /* NOTREACHED */
4868
4869 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4870 cur_curlyx = ST.save_curlyx;
4871 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4872 cur_curlyx->u.curlyx.count--;
4873 CACHEsayNO;
4874 /* NOTREACHED */
4875
4876 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
4877 /* FALL THROUGH */
4878 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
4879 REGCP_UNWIND(ST.lastcp);
4880 regcppop(rex);
c476f425
DM
4881 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4882 cur_curlyx->u.curlyx.count--;
4883 CACHEsayNO;
4884 /* NOTREACHED */
4885
4886 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4887 REGCP_UNWIND(ST.lastcp);
4888 regcppop(rex); /* Restore some previous $<digit>s? */
4889 PL_reginput = locinput;
4890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4891 "%*s whilem: failed, trying continuation...\n",
4892 REPORT_CODE_OFF+depth*2, "")
4893 );
4894 do_whilem_B_max:
4895 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4896 && ckWARN(WARN_REGEXP)
4897 && !(PL_reg_flags & RF_warned))
4898 {
4899 PL_reg_flags |= RF_warned;
dcbac5bb
FC
4900 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4901 "Complex regular subexpression recursion limit (%d) "
4902 "exceeded",
c476f425
DM
4903 REG_INFTY - 1);
4904 }
4905
4906 /* now try B */
4907 ST.save_curlyx = cur_curlyx;
4908 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4909 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4910 /* NOTREACHED */
4911
4912 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4913 cur_curlyx = ST.save_curlyx;
4914 REGCP_UNWIND(ST.lastcp);
4915 regcppop(rex);
4916
d02d6d97 4917 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
4918 /* Maximum greed exceeded */
4919 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4920 && ckWARN(WARN_REGEXP)
4921 && !(PL_reg_flags & RF_warned))
4922 {
3280af22 4923 PL_reg_flags |= RF_warned;
c476f425 4924 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
4925 "Complex regular subexpression recursion "
4926 "limit (%d) exceeded",
c476f425 4927 REG_INFTY - 1);
a0d0e21e 4928 }
c476f425 4929 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4930 CACHEsayNO;
a0d0e21e 4931 }
c476f425
DM
4932
4933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4934 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4935 );
4936 /* Try grabbing another A and see if it helps. */
4937 PL_reginput = locinput;
4938 cur_curlyx->u.curlyx.lastloc = locinput;
4939 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4940 REGCP_SET(ST.lastcp);
d02d6d97
DM
4941 PUSH_STATE_GOTO(WHILEM_A_min,
4942 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5f66b61c 4943 /* NOTREACHED */
40a82448
DM
4944
4945#undef ST
4946#define ST st->u.branch
4947
4948 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4949 next = scan + ARG(scan);
4950 if (next == scan)
4951 next = NULL;
40a82448
DM
4952 scan = NEXTOPER(scan);
4953 /* FALL THROUGH */
c277df42 4954
40a82448
DM
4955 case BRANCH: /* /(...|A|...)/ */
4956 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4957 ST.lastparen = *PL_reglastparen;
4958 ST.next_branch = next;
4959 REGCP_SET(ST.cp);
4960 PL_reginput = locinput;
02db2b7b 4961
40a82448 4962 /* Now go into the branch */
5d458dd8
YO
4963 if (has_cutgroup) {
4964 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4965 } else {
4966 PUSH_STATE_GOTO(BRANCH_next, scan);
4967 }
40a82448 4968 /* NOTREACHED */
5d458dd8
YO
4969 case CUTGROUP:
4970 PL_reginput = locinput;
4971 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4972 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4973 PUSH_STATE_GOTO(CUTGROUP_next,next);
4974 /* NOTREACHED */
4975 case CUTGROUP_next_fail:
4976 do_cutgroup = 1;
4977 no_final = 1;
4978 if (st->u.mark.mark_name)
4979 sv_commit = st->u.mark.mark_name;
4980 sayNO;
4981 /* NOTREACHED */
4982 case BRANCH_next:
4983 sayYES;
4984 /* NOTREACHED */
40a82448 4985 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4986 if (do_cutgroup) {
4987 do_cutgroup = 0;
4988 no_final = 0;
4989 }
40a82448
DM
4990 REGCP_UNWIND(ST.cp);
4991 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4992 PL_regoffs[n].end = -1;
40a82448 4993 *PL_reglastparen = n;
0a4db386 4994 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4995 scan = ST.next_branch;
4996 /* no more branches? */
5d458dd8
YO
4997 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4998 DEBUG_EXECUTE_r({
4999 PerlIO_printf( Perl_debug_log,
5000 "%*s %sBRANCH failed...%s\n",
5001 REPORT_CODE_OFF+depth*2, "",
5002 PL_colors[4],
5003 PL_colors[5] );
5004 });
5005 sayNO_SILENT;
5006 }
40a82448
DM
5007 continue; /* execute next BRANCH[J] op */
5008 /* NOTREACHED */
5009
a0d0e21e 5010 case MINMOD:
24d3c4a9 5011 minmod = 1;
a0d0e21e 5012 break;
40a82448
DM
5013
5014#undef ST
5015#define ST st->u.curlym
5016
5017 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5018
5019 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5020 * only a single backtracking state, no matter how many matches
40a82448
DM
5021 * there are in {m,n}. It relies on the pattern being constant
5022 * length, with no parens to influence future backrefs
5023 */
5024
5025 ST.me = scan;
dc45a647 5026 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
5027
5028 /* if paren positive, emulate an OPEN/CLOSE around A */
5029 if (ST.me->flags) {
3b6647e0 5030 U32 paren = ST.me->flags;
40a82448
DM
5031 if (paren > PL_regsize)
5032 PL_regsize = paren;
3b6647e0 5033 if (paren > *PL_reglastparen)
40a82448 5034 *PL_reglastparen = paren;
c277df42 5035 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5036 }
40a82448
DM
5037 ST.A = scan;
5038 ST.B = next;
5039 ST.alen = 0;
5040 ST.count = 0;
24d3c4a9
DM
5041 ST.minmod = minmod;
5042 minmod = 0;
40a82448
DM
5043 ST.c1 = CHRTEST_UNINIT;
5044 REGCP_SET(ST.cp);
6407bf3b 5045
40a82448
DM
5046 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5047 goto curlym_do_B;
5048
5049 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 5050 PL_reginput = locinput;
40a82448
DM
5051 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5052 /* NOTREACHED */
5f80c4cf 5053
40a82448
DM
5054 case CURLYM_A: /* we've just matched an A */
5055 locinput = st->locinput;
5056 nextchr = UCHARAT(locinput);
5057
5058 ST.count++;
5059 /* after first match, determine A's length: u.curlym.alen */
5060 if (ST.count == 1) {
5061 if (PL_reg_match_utf8) {
5062 char *s = locinput;
5063 while (s < PL_reginput) {
5064 ST.alen++;
5065 s += UTF8SKIP(s);
5066 }
5067 }
5068 else {
5069 ST.alen = PL_reginput - locinput;
5070 }
5071 if (ST.alen == 0)
5072 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5073 }
0cadcf80
DM
5074 DEBUG_EXECUTE_r(
5075 PerlIO_printf(Perl_debug_log,
40a82448 5076 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5077 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5078 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5079 );
5080
40a82448 5081 locinput = PL_reginput;
0a4db386
YO
5082
5083 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5084 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5085 goto fake_end;
5086
c966426a
DM
5087 {
5088 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5089 if ( max == REG_INFTY || ST.count < max )
5090 goto curlym_do_A; /* try to match another A */
5091 }
40a82448 5092 goto curlym_do_B; /* try to match B */
5f80c4cf 5093
40a82448
DM
5094 case CURLYM_A_fail: /* just failed to match an A */
5095 REGCP_UNWIND(ST.cp);
0a4db386
YO
5096
5097 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5098 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5099 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5100 sayNO;
0cadcf80 5101
40a82448
DM
5102 curlym_do_B: /* execute the B in /A{m,n}B/ */
5103 PL_reginput = locinput;
5104 if (ST.c1 == CHRTEST_UNINIT) {
5105 /* calculate c1 and c2 for possible match of 1st char
5106 * following curly */
5107 ST.c1 = ST.c2 = CHRTEST_VOID;
5108 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5109 regnode *text_node = ST.B;
5110 if (! HAS_TEXT(text_node))
5111 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5112 /* this used to be
5113
5114 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5115
5116 But the former is redundant in light of the latter.
5117
5118 if this changes back then the macro for
5119 IS_TEXT and friends need to change.
5120 */
5121 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5122 {
ee9b8eae 5123
40a82448 5124 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5125 switch (OP(text_node)) {
5126 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5127 case EXACTFA:
3c760661 5128 case EXACTFU_SS:
fab2782b 5129 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5130 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5131 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5132 default: ST.c2 = ST.c1;
5133 }
c277df42 5134 }
c277df42 5135 }
40a82448
DM
5136 }
5137
5138 DEBUG_EXECUTE_r(
5139 PerlIO_printf(Perl_debug_log,
5140 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5141 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5142 "", (IV)ST.count)
5143 );
5144 if (ST.c1 != CHRTEST_VOID
5145 && UCHARAT(PL_reginput) != ST.c1
5146 && UCHARAT(PL_reginput) != ST.c2)
5147 {
5148 /* simulate B failing */
3e901dc0
YO
5149 DEBUG_OPTIMISE_r(
5150 PerlIO_printf(Perl_debug_log,
5151 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5152 (int)(REPORT_CODE_OFF+(depth*2)),"",
5153 (IV)ST.c1,(IV)ST.c2
5154 ));
40a82448
DM
5155 state_num = CURLYM_B_fail;
5156 goto reenter_switch;
5157 }
5158
5159 if (ST.me->flags) {
5160 /* mark current A as captured */
5161 I32 paren = ST.me->flags;
5162 if (ST.count) {
f0ab9afb 5163 PL_regoffs[paren].start
40a82448 5164 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 5165 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 5166 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 5167 }
40a82448 5168 else
f0ab9afb 5169 PL_regoffs[paren].end = -1;
0a4db386 5170 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5171 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5172 {
5173 if (ST.count)
5174 goto fake_end;
5175 else
5176 sayNO;
5177 }
c277df42 5178 }
0a4db386 5179
40a82448 5180 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 5181 /* NOTREACHED */
40a82448
DM
5182
5183 case CURLYM_B_fail: /* just failed to match a B */
5184 REGCP_UNWIND(ST.cp);
5185 if (ST.minmod) {
84d2fa14
HS
5186 I32 max = ARG2(ST.me);
5187 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5188 sayNO;
5189 goto curlym_do_A; /* try to match a further A */
5190 }
5191 /* backtrack one A */
5192 if (ST.count == ARG1(ST.me) /* min */)
5193 sayNO;
5194 ST.count--;
5195 locinput = HOPc(locinput, -ST.alen);
5196 goto curlym_do_B; /* try to match B */
5197
c255a977
DM
5198#undef ST
5199#define ST st->u.curly
40a82448 5200
c255a977
DM
5201#define CURLY_SETPAREN(paren, success) \
5202 if (paren) { \
5203 if (success) { \
f0ab9afb
NC
5204 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5205 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 5206 *PL_reglastcloseparen = paren; \
c255a977
DM
5207 } \
5208 else \
f0ab9afb 5209 PL_regoffs[paren].end = -1; \
c255a977
DM
5210 }
5211
5212 case STAR: /* /A*B/ where A is width 1 */
5213 ST.paren = 0;
5214 ST.min = 0;
5215 ST.max = REG_INFTY;
a0d0e21e
LW
5216 scan = NEXTOPER(scan);
5217 goto repeat;
c255a977
DM
5218 case PLUS: /* /A+B/ where A is width 1 */
5219 ST.paren = 0;
5220 ST.min = 1;
5221 ST.max = REG_INFTY;
c277df42 5222 scan = NEXTOPER(scan);
c255a977
DM
5223 goto repeat;
5224 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5225 ST.paren = scan->flags; /* Which paren to set */
5226 if (ST.paren > PL_regsize)
5227 PL_regsize = ST.paren;
3b6647e0 5228 if (ST.paren > *PL_reglastparen)
c255a977
DM
5229 *PL_reglastparen = ST.paren;
5230 ST.min = ARG1(scan); /* min to match */
5231 ST.max = ARG2(scan); /* max to match */
0a4db386 5232 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5233 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5234 ST.min=1;
5235 ST.max=1;
5236 }
c255a977
DM
5237 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5238 goto repeat;
5239 case CURLY: /* /A{m,n}B/ where A is width 1 */
5240 ST.paren = 0;
5241 ST.min = ARG1(scan); /* min to match */
5242 ST.max = ARG2(scan); /* max to match */
5243 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5244 repeat:
a0d0e21e
LW
5245 /*
5246 * Lookahead to avoid useless match attempts
5247 * when we know what character comes next.
c255a977 5248 *
5f80c4cf
JP
5249 * Used to only do .*x and .*?x, but now it allows
5250 * for )'s, ('s and (?{ ... })'s to be in the way
5251 * of the quantifier and the EXACT-like node. -- japhy
5252 */
5253
c255a977
DM
5254 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5255 sayNO;
cca55fe3 5256 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5257 U8 *s;
5258 regnode *text_node = next;
5259
3dab1dad
YO
5260 if (! HAS_TEXT(text_node))
5261 FIND_NEXT_IMPT(text_node);
5f80c4cf 5262
9e137952 5263 if (! HAS_TEXT(text_node))
c255a977 5264 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5265 else {
ee9b8eae 5266 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5267 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5268 goto assume_ok_easy;
cca55fe3 5269 }
be8e71aa
YO
5270 else
5271 s = (U8*)STRING(text_node);
ee9b8eae
YO
5272
5273 /* Currently we only get here when
5274
5275 PL_rekind[OP(text_node)] == EXACT
5276
5277 if this changes back then the macro for IS_TEXT and
5278 friends need to change. */
f2ed9b32 5279 if (!UTF_PATTERN) {
9a5a5549
KW
5280 ST.c1 = *s;
5281 switch (OP(text_node)) {
5282 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5283 case EXACTFA:
3c760661 5284 case EXACTFU_SS:
fab2782b 5285 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5286 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5287 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5288 default: ST.c2 = ST.c1; break;
5289 }
1aa99e6b 5290 }
f2ed9b32 5291 else { /* UTF_PATTERN */
9a5a5549 5292 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
b0573d8b
KW
5293 STRLEN ulen;
5294 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5295
5296 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5297 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
e294cc5d 5298 uniflags);
5f80c4cf
JP
5299 }
5300 else {
c255a977 5301 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5302 uniflags);
5f80c4cf 5303 }
1aa99e6b
IH
5304 }
5305 }
bbce6d69 5306 }
a0d0e21e 5307 else
c255a977 5308 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5309 assume_ok_easy:
c255a977
DM
5310
5311 ST.A = scan;
5312 ST.B = next;
3280af22 5313 PL_reginput = locinput;
24d3c4a9
DM
5314 if (minmod) {
5315 minmod = 0;
e2e6a0f1 5316 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 5317 sayNO;
c255a977 5318 ST.count = ST.min;
a0ed51b3 5319 locinput = PL_reginput;
c255a977
DM
5320 REGCP_SET(ST.cp);
5321 if (ST.c1 == CHRTEST_VOID)
5322 goto curly_try_B_min;
5323
5324 ST.oldloc = locinput;
5325
5326 /* set ST.maxpos to the furthest point along the
5327 * string that could possibly match */
5328 if (ST.max == REG_INFTY) {
5329 ST.maxpos = PL_regeol - 1;
f2ed9b32 5330 if (utf8_target)
c255a977
DM
5331 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5332 ST.maxpos--;
5333 }
f2ed9b32 5334 else if (utf8_target) {
c255a977
DM
5335 int m = ST.max - ST.min;
5336 for (ST.maxpos = locinput;
5337 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5338 ST.maxpos += UTF8SKIP(ST.maxpos);
5339 }
5340 else {
5341 ST.maxpos = locinput + ST.max - ST.min;
5342 if (ST.maxpos >= PL_regeol)
5343 ST.maxpos = PL_regeol - 1;
5344 }
5345 goto curly_try_B_min_known;
5346
5347 }
5348 else {
e2e6a0f1 5349 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
5350 locinput = PL_reginput;
5351 if (ST.count < ST.min)
5352 sayNO;
5353 if ((ST.count > ST.min)
5354 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5355 {
5356 /* A{m,n} must come at the end of the string, there's
5357 * no point in backing off ... */
5358 ST.min = ST.count;
5359 /* ...except that $ and \Z can match before *and* after
5360 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5361 We may back off by one in this case. */
5362 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5363 ST.min--;
5364 }
5365 REGCP_SET(ST.cp);
5366 goto curly_try_B_max;
5367 }
5368 /* NOTREACHED */
5369
5370
5371 case CURLY_B_min_known_fail:
5372 /* failed to find B in a non-greedy match where c1,c2 valid */
5373 if (ST.paren && ST.count)
f0ab9afb 5374 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5375
5376 PL_reginput = locinput; /* Could be reset... */
5377 REGCP_UNWIND(ST.cp);
5378 /* Couldn't or didn't -- move forward. */
5379 ST.oldloc = locinput;
f2ed9b32 5380 if (utf8_target)
c255a977
DM
5381 locinput += UTF8SKIP(locinput);
5382 else
5383 locinput++;
5384 ST.count++;
5385 curly_try_B_min_known:
5386 /* find the next place where 'B' could work, then call B */
5387 {
5388 int n;
f2ed9b32 5389 if (utf8_target) {
c255a977
DM
5390 n = (ST.oldloc == locinput) ? 0 : 1;
5391 if (ST.c1 == ST.c2) {
5392 STRLEN len;
5393 /* set n to utf8_distance(oldloc, locinput) */
5394 while (locinput <= ST.maxpos &&
5395 utf8n_to_uvchr((U8*)locinput,
5396 UTF8_MAXBYTES, &len,
5397 uniflags) != (UV)ST.c1) {
5398 locinput += len;
5399 n++;
5400 }
1aa99e6b
IH
5401 }
5402 else {
c255a977
DM
5403 /* set n to utf8_distance(oldloc, locinput) */
5404 while (locinput <= ST.maxpos) {
5405 STRLEN len;
5406 const UV c = utf8n_to_uvchr((U8*)locinput,
5407 UTF8_MAXBYTES, &len,
5408 uniflags);
5409 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5410 break;
5411 locinput += len;
5412 n++;
1aa99e6b 5413 }
0fe9bf95
IZ
5414 }
5415 }
c255a977
DM
5416 else {
5417 if (ST.c1 == ST.c2) {
5418 while (locinput <= ST.maxpos &&
5419 UCHARAT(locinput) != ST.c1)
5420 locinput++;
bbce6d69 5421 }
c255a977
DM
5422 else {
5423 while (locinput <= ST.maxpos
5424 && UCHARAT(locinput) != ST.c1
5425 && UCHARAT(locinput) != ST.c2)
5426 locinput++;
a0ed51b3 5427 }
c255a977
DM
5428 n = locinput - ST.oldloc;
5429 }
5430 if (locinput > ST.maxpos)
5431 sayNO;
5432 /* PL_reginput == oldloc now */
5433 if (n) {
5434 ST.count += n;
e2e6a0f1 5435 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5436 sayNO;
a0d0e21e 5437 }
c255a977
DM
5438 PL_reginput = locinput;
5439 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5440 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5441 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5442 goto fake_end;
5443 }
c255a977 5444 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5445 }
c255a977
DM
5446 /* NOTREACHED */
5447
5448
5449 case CURLY_B_min_fail:
5450 /* failed to find B in a non-greedy match where c1,c2 invalid */
5451 if (ST.paren && ST.count)
f0ab9afb 5452 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5453
5454 REGCP_UNWIND(ST.cp);
5455 /* failed -- move forward one */
5456 PL_reginput = locinput;
e2e6a0f1 5457 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5458 ST.count++;
a0ed51b3 5459 locinput = PL_reginput;
c255a977
DM
5460 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5461 ST.count > 0)) /* count overflow ? */
15272685 5462 {
c255a977
DM
5463 curly_try_B_min:
5464 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5465 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5466 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5467 goto fake_end;
5468 }
c255a977 5469 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5470 }
5471 }
4633a7c4 5472 sayNO;
c255a977
DM
5473 /* NOTREACHED */
5474
5475
5476 curly_try_B_max:
5477 /* a successful greedy match: now try to match B */
40d049e4 5478 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5479 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5480 goto fake_end;
5481 }
c255a977
DM
5482 {
5483 UV c = 0;
5484 if (ST.c1 != CHRTEST_VOID)
f2ed9b32 5485 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
c255a977 5486 UTF8_MAXBYTES, 0, uniflags)
466787eb 5487 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5488 /* If it could work, try it. */
5489 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5490 CURLY_SETPAREN(ST.paren, ST.count);
5491 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5492 /* NOTREACHED */
5493 }
5494 }
5495 /* FALL THROUGH */
5496 case CURLY_B_max_fail:
5497 /* failed to find B in a greedy match */
5498 if (ST.paren && ST.count)
f0ab9afb 5499 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5500
5501 REGCP_UNWIND(ST.cp);
5502 /* back up. */
5503 if (--ST.count < ST.min)
5504 sayNO;
5505 PL_reginput = locinput = HOPc(locinput, -1);
5506 goto curly_try_B_max;
5507
5508#undef ST
5509
a0d0e21e 5510 case END:
6bda09f9 5511 fake_end:
faec1544
DM
5512 if (cur_eval) {
5513 /* we've just finished A in /(??{A})B/; now continue with B */
5514 I32 tmpix;
faec1544
DM
5515 st->u.eval.toggle_reg_flags
5516 = cur_eval->u.eval.toggle_reg_flags;
5517 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5518
288b8c02
NC
5519 st->u.eval.prev_rex = rex_sv; /* inner */
5520 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5521 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5522 rexi = RXi_GET(rex);
faec1544 5523 cur_curlyx = cur_eval->u.eval.prev_curlyx;
cd733337 5524 (void)ReREFCNT_inc(rex_sv);
faec1544 5525 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
34a81e2b
B
5526
5527 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5528 PL_reglastparen = &rex->lastparen;
5529 PL_reglastcloseparen = &rex->lastcloseparen;
5530
faec1544
DM
5531 REGCP_SET(st->u.eval.lastcp);
5532 PL_reginput = locinput;
5533
5534 /* Restore parens of the outer rex without popping the
5535 * savestack */
5536 tmpix = PL_savestack_ix;
5537 PL_savestack_ix = cur_eval->u.eval.lastcp;
5538 regcppop(rex);
5539 PL_savestack_ix = tmpix;
5540
5541 st->u.eval.prev_eval = cur_eval;
5542 cur_eval = cur_eval->u.eval.prev_eval;
5543 DEBUG_EXECUTE_r(
2a49f0f5
JH
5544 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5545 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5546 if ( nochange_depth )
5547 nochange_depth--;
5548
5549 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5550 st->u.eval.prev_eval->u.eval.B); /* match B */
5551 }
5552
3b0527fe 5553 if (locinput < reginfo->till) {
a3621e74 5554 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5555 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5556 PL_colors[4],
5557 (long)(locinput - PL_reg_starttry),
3b0527fe 5558 (long)(reginfo->till - PL_reg_starttry),
7821416a 5559 PL_colors[5]));
58e23c8d 5560
262b90c4 5561 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5562 }
5563 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5564 sayYES; /* Success! */
dad79028
DM
5565
5566 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5567 DEBUG_EXECUTE_r(
5568 PerlIO_printf(Perl_debug_log,
5569 "%*s %ssubpattern success...%s\n",
5bc10b2c 5570 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5571 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5572 sayYES; /* Success! */
dad79028 5573
40a82448
DM
5574#undef ST
5575#define ST st->u.ifmatch
5576
5577 case SUSPEND: /* (?>A) */
5578 ST.wanted = 1;
9fe1d20c 5579 PL_reginput = locinput;
9041c2e3 5580 goto do_ifmatch;
dad79028 5581
40a82448
DM
5582 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5583 ST.wanted = 0;
dad79028
DM
5584 goto ifmatch_trivial_fail_test;
5585
40a82448
DM
5586 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5587 ST.wanted = 1;
dad79028 5588 ifmatch_trivial_fail_test:
a0ed51b3 5589 if (scan->flags) {
52657f30 5590 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5591 if (!s) {
5592 /* trivial fail */
24d3c4a9
DM
5593 if (logical) {
5594 logical = 0;
f2338a2e 5595 sw = 1 - cBOOL(ST.wanted);
dad79028 5596 }
40a82448 5597 else if (ST.wanted)
dad79028
DM
5598 sayNO;
5599 next = scan + ARG(scan);
5600 if (next == scan)
5601 next = NULL;
5602 break;
5603 }
efb30f32 5604 PL_reginput = s;
a0ed51b3
LW
5605 }
5606 else
5607 PL_reginput = locinput;
5608
c277df42 5609 do_ifmatch:
40a82448 5610 ST.me = scan;
24d3c4a9 5611 ST.logical = logical;
24d786f4
YO
5612 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5613
40a82448
DM
5614 /* execute body of (?...A) */
5615 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5616 /* NOTREACHED */
5617
5618 case IFMATCH_A_fail: /* body of (?...A) failed */
5619 ST.wanted = !ST.wanted;
5620 /* FALL THROUGH */
5621
5622 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5623 if (ST.logical) {
f2338a2e 5624 sw = cBOOL(ST.wanted);
40a82448
DM
5625 }
5626 else if (!ST.wanted)
5627 sayNO;
5628
5629 if (OP(ST.me) == SUSPEND)
5630 locinput = PL_reginput;
5631 else {
5632 locinput = PL_reginput = st->locinput;
5633 nextchr = UCHARAT(locinput);
5634 }
5635 scan = ST.me + ARG(ST.me);
5636 if (scan == ST.me)
5637 scan = NULL;
5638 continue; /* execute B */
5639
5640#undef ST
dad79028 5641
c277df42 5642 case LONGJMP:
c277df42
IZ
5643 next = scan + ARG(scan);
5644 if (next == scan)
5645 next = NULL;
a0d0e21e 5646 break;
54612592 5647 case COMMIT:
e2e6a0f1
YO
5648 reginfo->cutpoint = PL_regeol;
5649 /* FALLTHROUGH */
5d458dd8 5650 case PRUNE:
24b23f37 5651 PL_reginput = locinput;
e2e6a0f1 5652 if (!scan->flags)
ad64d0ec 5653 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5654 PUSH_STATE_GOTO(COMMIT_next,next);
5655 /* NOTREACHED */
5656 case COMMIT_next_fail:
5657 no_final = 1;
5658 /* FALLTHROUGH */
7f69552c
YO
5659 case OPFAIL:
5660 sayNO;
e2e6a0f1
YO
5661 /* NOTREACHED */
5662
5663#define ST st->u.mark
5664 case MARKPOINT:
5665 ST.prev_mark = mark_state;
5d458dd8 5666 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5667 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5668 mark_state = st;
5669 ST.mark_loc = PL_reginput = locinput;
5670 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5671 /* NOTREACHED */
5672 case MARKPOINT_next:
5673 mark_state = ST.prev_mark;
5674 sayYES;
5675 /* NOTREACHED */
5676 case MARKPOINT_next_fail:
5d458dd8 5677 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5678 {
5679 if (ST.mark_loc > startpoint)
5680 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5681 popmark = NULL; /* we found our mark */
5682 sv_commit = ST.mark_name;
5683
5684 DEBUG_EXECUTE_r({
5d458dd8 5685 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5686 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5687 REPORT_CODE_OFF+depth*2, "",
be2597df 5688 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5689 });
5690 }
5691 mark_state = ST.prev_mark;
5d458dd8
YO
5692 sv_yes_mark = mark_state ?
5693 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5694 sayNO;
5695 /* NOTREACHED */
5d458dd8
YO
5696 case SKIP:
5697 PL_reginput = locinput;
5698 if (scan->flags) {
2bf803e2 5699 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5700 ST.mark_name = NULL;
e2e6a0f1 5701 ST.mark_loc = locinput;
5d458dd8
YO
5702 PUSH_STATE_GOTO(SKIP_next,next);
5703 } else {
2bf803e2 5704 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5705 otherwise do nothing. Meaning we need to scan
5706 */
5707 regmatch_state *cur = mark_state;
ad64d0ec 5708 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5709
5710 while (cur) {
5711 if ( sv_eq( cur->u.mark.mark_name,
5712 find ) )
5713 {
5714 ST.mark_name = find;
5715 PUSH_STATE_GOTO( SKIP_next, next );
5716 }
5717 cur = cur->u.mark.prev_mark;
5718 }
e2e6a0f1 5719 }
2bf803e2 5720 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5721 break;
5722 case SKIP_next_fail:
5723 if (ST.mark_name) {
5724 /* (*CUT:NAME) - Set up to search for the name as we
5725 collapse the stack*/
5726 popmark = ST.mark_name;
5727 } else {
5728 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5729 if (ST.mark_loc > startpoint)
5730 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5731 /* but we set sv_commit to latest mark_name if there
5732 is one so they can test to see how things lead to this
5733 cut */
5734 if (mark_state)
5735 sv_commit=mark_state->u.mark.mark_name;
5736 }
e2e6a0f1
YO
5737 no_final = 1;
5738 sayNO;
5739 /* NOTREACHED */
5740#undef ST
e1d1eefb 5741 case LNBREAK:
f2ed9b32 5742 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5743 locinput += n;
5744 nextchr = UCHARAT(locinput);
5745 } else
5746 sayNO;
5747 break;
5748
5749#define CASE_CLASS(nAmE) \
5750 case nAmE: \
1380b51d
DM
5751 if (locinput >= PL_regeol) \
5752 sayNO; \
f2ed9b32 5753 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5754 locinput += n; \
5755 nextchr = UCHARAT(locinput); \
5756 } else \
5757 sayNO; \
5758 break; \
5759 case N##nAmE: \
1380b51d
DM
5760 if (locinput >= PL_regeol) \
5761 sayNO; \
f2ed9b32 5762 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5763 sayNO; \
5764 } else { \
5765 locinput += UTF8SKIP(locinput); \
5766 nextchr = UCHARAT(locinput); \
5767 } \
5768 break
5769
5770 CASE_CLASS(VERTWS);
5771 CASE_CLASS(HORIZWS);
5772#undef CASE_CLASS
5773
a0d0e21e 5774 default:
b900a521 5775 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5776 PTR2UV(scan), OP(scan));
cea2e8a9 5777 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5778
5779 } /* end switch */
95b24440 5780
5d458dd8
YO
5781 /* switch break jumps here */
5782 scan = next; /* prepare to execute the next op and ... */
5783 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5784 /* NOTREACHED */
5785
40a82448
DM
5786 push_yes_state:
5787 /* push a state that backtracks on success */
5788 st->u.yes.prev_yes_state = yes_state;
5789 yes_state = st;
5790 /* FALL THROUGH */
5791 push_state:
5792 /* push a new regex state, then continue at scan */
5793 {
5794 regmatch_state *newst;
5795
24b23f37
YO
5796 DEBUG_STACK_r({
5797 regmatch_state *cur = st;
5798 regmatch_state *curyes = yes_state;
5799 int curd = depth;
5800 regmatch_slab *slab = PL_regmatch_slab;
5801 for (;curd > -1;cur--,curd--) {
5802 if (cur < SLAB_FIRST(slab)) {
5803 slab = slab->prev;
5804 cur = SLAB_LAST(slab);
5805 }
5806 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5807 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5808 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5809 (curyes == cur) ? "yes" : ""
5810 );
5811 if (curyes == cur)
5812 curyes = cur->u.yes.prev_yes_state;
5813 }
5814 } else
5815 DEBUG_STATE_pp("push")
5816 );
40a82448 5817 depth++;
40a82448
DM
5818 st->locinput = locinput;
5819 newst = st+1;
5820 if (newst > SLAB_LAST(PL_regmatch_slab))
5821 newst = S_push_slab(aTHX);
5822 PL_regmatch_state = newst;
786e8c11 5823
40a82448
DM
5824 locinput = PL_reginput;
5825 nextchr = UCHARAT(locinput);
5826 st = newst;
5827 continue;
5828 /* NOTREACHED */
5829 }
a0d0e21e 5830 }
a687059c 5831
a0d0e21e
LW
5832 /*
5833 * We get here only if there's trouble -- normally "case END" is
5834 * the terminating point.
5835 */
cea2e8a9 5836 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5837 /*NOTREACHED*/
4633a7c4
LW
5838 sayNO;
5839
262b90c4 5840yes:
77cb431f
DM
5841 if (yes_state) {
5842 /* we have successfully completed a subexpression, but we must now
5843 * pop to the state marked by yes_state and continue from there */
77cb431f 5844 assert(st != yes_state);
5bc10b2c
DM
5845#ifdef DEBUGGING
5846 while (st != yes_state) {
5847 st--;
5848 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5849 PL_regmatch_slab = PL_regmatch_slab->prev;
5850 st = SLAB_LAST(PL_regmatch_slab);
5851 }
e2e6a0f1 5852 DEBUG_STATE_r({
54612592
YO
5853 if (no_final) {
5854 DEBUG_STATE_pp("pop (no final)");
5855 } else {
5856 DEBUG_STATE_pp("pop (yes)");
5857 }
e2e6a0f1 5858 });
5bc10b2c
DM
5859 depth--;
5860 }
5861#else
77cb431f
DM
5862 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5863 || yes_state > SLAB_LAST(PL_regmatch_slab))
5864 {
5865 /* not in this slab, pop slab */
5866 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5867 PL_regmatch_slab = PL_regmatch_slab->prev;
5868 st = SLAB_LAST(PL_regmatch_slab);
5869 }
5870 depth -= (st - yes_state);
5bc10b2c 5871#endif
77cb431f
DM
5872 st = yes_state;
5873 yes_state = st->u.yes.prev_yes_state;
5874 PL_regmatch_state = st;
24b23f37 5875
5d458dd8
YO
5876 if (no_final) {
5877 locinput= st->locinput;
5878 nextchr = UCHARAT(locinput);
5879 }
54612592 5880 state_num = st->resume_state + no_final;
24d3c4a9 5881 goto reenter_switch;
77cb431f
DM
5882 }
5883
a3621e74 5884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5885 PL_colors[4], PL_colors[5]));
02db2b7b 5886
19b95bf0
DM
5887 if (PL_reg_eval_set) {
5888 /* each successfully executed (?{...}) block does the equivalent of
5889 * local $^R = do {...}
5890 * When popping the save stack, all these locals would be undone;
5891 * bypass this by setting the outermost saved $^R to the latest
5892 * value */
5893 if (oreplsv != GvSV(PL_replgv))
5894 sv_setsv(oreplsv, GvSV(PL_replgv));
5895 }
95b24440 5896 result = 1;
aa283a38 5897 goto final_exit;
4633a7c4
LW
5898
5899no:
a3621e74 5900 DEBUG_EXECUTE_r(
7821416a 5901 PerlIO_printf(Perl_debug_log,
786e8c11 5902 "%*s %sfailed...%s\n",
5bc10b2c 5903 REPORT_CODE_OFF+depth*2, "",
786e8c11 5904 PL_colors[4], PL_colors[5])
7821416a 5905 );
aa283a38 5906
262b90c4 5907no_silent:
54612592
YO
5908 if (no_final) {
5909 if (yes_state) {
5910 goto yes;
5911 } else {
5912 goto final_exit;
5913 }
5914 }
aa283a38
DM
5915 if (depth) {
5916 /* there's a previous state to backtrack to */
40a82448
DM
5917 st--;
5918 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5919 PL_regmatch_slab = PL_regmatch_slab->prev;
5920 st = SLAB_LAST(PL_regmatch_slab);
5921 }
5922 PL_regmatch_state = st;
40a82448
DM
5923 locinput= st->locinput;
5924 nextchr = UCHARAT(locinput);
5925
5bc10b2c
DM
5926 DEBUG_STATE_pp("pop");
5927 depth--;
262b90c4
DM
5928 if (yes_state == st)
5929 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5930
24d3c4a9
DM
5931 state_num = st->resume_state + 1; /* failure = success + 1 */
5932 goto reenter_switch;
95b24440 5933 }
24d3c4a9 5934 result = 0;
aa283a38 5935
262b90c4 5936 final_exit:
bbe252da 5937 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5938 SV *sv_err = get_sv("REGERROR", 1);
5939 SV *sv_mrk = get_sv("REGMARK", 1);
5940 if (result) {
e2e6a0f1 5941 sv_commit = &PL_sv_no;
5d458dd8
YO
5942 if (!sv_yes_mark)
5943 sv_yes_mark = &PL_sv_yes;
5944 } else {
5945 if (!sv_commit)
5946 sv_commit = &PL_sv_yes;
5947 sv_yes_mark = &PL_sv_no;
5948 }
5949 sv_setsv(sv_err, sv_commit);
5950 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5951 }
19b95bf0 5952
2f554ef7
DM
5953 /* clean up; in particular, free all slabs above current one */
5954 LEAVE_SCOPE(oldsave);
5d9a96ca 5955
95b24440 5956 return result;
a687059c
LW
5957}
5958
5959/*
5960 - regrepeat - repeatedly match something simple, report how many
5961 */
5962/*
5963 * [This routine now assumes that it will only match on things of length 1.
5964 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5965 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5966 */
76e3520e 5967STATIC I32
e2e6a0f1 5968S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5969{
27da23d5 5970 dVAR;
a0d0e21e 5971 register char *scan;
a0d0e21e 5972 register I32 c;
3280af22 5973 register char *loceol = PL_regeol;
a0ed51b3 5974 register I32 hardcount = 0;
f2ed9b32 5975 register bool utf8_target = PL_reg_match_utf8;
d513472c 5976 UV utf8_flags;
4f55667c
SP
5977#ifndef DEBUGGING
5978 PERL_UNUSED_ARG(depth);
5979#endif
a0d0e21e 5980
7918f24d
NC
5981 PERL_ARGS_ASSERT_REGREPEAT;
5982
3280af22 5983 scan = PL_reginput;
faf11cac
HS
5984 if (max == REG_INFTY)
5985 max = I32_MAX;
5986 else if (max < loceol - scan)
7f596f4c 5987 loceol = scan + max;
a0d0e21e 5988 switch (OP(p)) {
22c35a8c 5989 case REG_ANY:
f2ed9b32 5990 if (utf8_target) {
ffc61ed2 5991 loceol = PL_regeol;
1aa99e6b 5992 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5993 scan += UTF8SKIP(scan);
5994 hardcount++;
5995 }
5996 } else {
5997 while (scan < loceol && *scan != '\n')
5998 scan++;
a0ed51b3
LW
5999 }
6000 break;
ffc61ed2 6001 case SANY:
f2ed9b32 6002 if (utf8_target) {
def8e4ea 6003 loceol = PL_regeol;
a0804c9e 6004 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6005 scan += UTF8SKIP(scan);
6006 hardcount++;
6007 }
6008 }
6009 else
6010 scan = loceol;
a0ed51b3 6011 break;
f33976b4
DB
6012 case CANY:
6013 scan = loceol;
6014 break;
59d32103
KW
6015 case EXACT:
6016 /* To get here, EXACTish nodes must have *byte* length == 1. That
6017 * means they match only characters in the string that can be expressed
6018 * as a single byte. For non-utf8 strings, that means a simple match.
6019 * For utf8 strings, the character matched must be an invariant, or
6020 * downgradable to a single byte. The pattern's utf8ness is
6021 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6022 * it is, it's an invariant */
6023
6024 c = (U8)*STRING(p);
6025 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6026
6027 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6028 while (scan < loceol && UCHARAT(scan) == c) {
6029 scan++;
6030 }
6031 }
6032 else {
6033
6034 /* Here, the string is utf8, and the pattern char is different
6035 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6036 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6037 * look for those in sequence in the utf8 string */
6038 U8 high = UTF8_TWO_BYTE_HI(c);
6039 U8 low = UTF8_TWO_BYTE_LO(c);
6040 loceol = PL_regeol;
6041
6042 while (hardcount < max
6043 && scan + 1 < loceol
6044 && UCHARAT(scan) == high
6045 && UCHARAT(scan + 1) == low)
6046 {
6047 scan += 2;
6048 hardcount++;
6049 }
6050 }
6051 break;
2f7f8cb1
KW
6052 case EXACTFA:
6053 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6054 goto do_exactf;
6055
d4e0b827
KW
6056 case EXACTFL:
6057 PL_reg_flags |= RF_tainted;
17580e7a
KW
6058 utf8_flags = FOLDEQ_UTF8_LOCALE;
6059 goto do_exactf;
6060
d4e0b827 6061 case EXACTF:
62bf7766
KW
6062 utf8_flags = 0;
6063 goto do_exactf;
6064
3c760661 6065 case EXACTFU_SS:
fab2782b 6066 case EXACTFU_TRICKYFOLD:
9a5a5549 6067 case EXACTFU:
05f861a2 6068 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6069
2be3e190
KW
6070 /* The comments for the EXACT case above apply as well to these fold
6071 * ones */
634c83a2 6072
2f7f8cb1 6073 do_exactf:
090f7165 6074 c = (U8)*STRING(p);
634c83a2 6075 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6076
3c760661 6077 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6078 char *tmpeol = loceol;
6079 while (hardcount < max
d513472c
KW
6080 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6081 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6082 {
6083 scan = tmpeol;
6084 tmpeol = loceol;
6085 hardcount++;
6086 }
634c83a2 6087
59d32103
KW
6088 /* XXX Note that the above handles properly the German sharp s in
6089 * the pattern matching ss in the string. But it doesn't handle
6090 * properly cases where the string contains say 'LIGATURE ff' and
6091 * the pattern is 'f+'. This would require, say, a new function or
6092 * revised interface to foldEQ_utf8(), in which the maximum number
6093 * of characters to match could be passed and it would return how
6094 * many actually did. This is just one of many cases where
6095 * multi-char folds don't work properly, and so the fix is being
6096 * deferred */
6097 }
6098 else {
87381386 6099 U8 folded;
59d32103 6100
2be3e190
KW
6101 /* Here, the string isn't utf8 and c is a single byte; and either
6102 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6103 * doesn't affect c. Can just do simple comparisons for exact or
6104 * fold matching. */
d4e0b827 6105 switch (OP(p)) {
87381386 6106 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6107 case EXACTFA:
fab2782b 6108 case EXACTFU_TRICKYFOLD:
9a5a5549 6109 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6110 case EXACTFL: folded = PL_fold_locale[c]; break;
6111 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6112 }
6113 while (scan < loceol &&
6114 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6115 {
6116 scan++;
634c83a2
KW
6117 }
6118 }
bbce6d69 6119 break;
f56b6394 6120 case ANYOFV:
a0d0e21e 6121 case ANYOF:
4e8910e0
KW
6122 if (utf8_target || OP(p) == ANYOFV) {
6123 STRLEN inclasslen;
ffc61ed2 6124 loceol = PL_regeol;
4e8910e0
KW
6125 inclasslen = loceol - scan;
6126 while (hardcount < max
6127 && ((inclasslen = loceol - scan) > 0)
6128 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6129 {
6130 scan += inclasslen;
ffc61ed2
JH
6131 hardcount++;
6132 }
6133 } else {
32fc9b6a 6134 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6135 scan++;
6136 }
a0d0e21e 6137 break;
980866de 6138 case ALNUMU:
f2ed9b32 6139 if (utf8_target) {
980866de 6140 utf8_wordchar:
ffc61ed2 6141 loceol = PL_regeol;
1a4fad37 6142 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6143 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6144 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6145 {
ffc61ed2
JH
6146 scan += UTF8SKIP(scan);
6147 hardcount++;
6148 }
980866de 6149 } else {
a12cf05f
KW
6150 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6151 scan++;
6152 }
980866de
KW
6153 }
6154 break;
6155 case ALNUM:
6156 if (utf8_target)
6157 goto utf8_wordchar;
6158 while (scan < loceol && isALNUM((U8) *scan)) {
6159 scan++;
a0ed51b3
LW
6160 }
6161 break;
cfaf538b
KW
6162 case ALNUMA:
6163 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6164 scan++;
6165 }
6166 break;
bbce6d69 6167 case ALNUML:
3280af22 6168 PL_reg_flags |= RF_tainted;
f2ed9b32 6169 if (utf8_target) {
ffc61ed2 6170 loceol = PL_regeol;
1aa99e6b
IH
6171 while (hardcount < max && scan < loceol &&
6172 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6173 scan += UTF8SKIP(scan);
6174 hardcount++;
6175 }
6176 } else {
6177 while (scan < loceol && isALNUM_LC(*scan))
6178 scan++;
a0ed51b3
LW
6179 }
6180 break;
980866de 6181 case NALNUMU:
f2ed9b32 6182 if (utf8_target) {
980866de
KW
6183
6184 utf8_Nwordchar:
6185
ffc61ed2 6186 loceol = PL_regeol;
1a4fad37 6187 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6188 while (hardcount < max && scan < loceol &&
980866de 6189 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6190 {
ffc61ed2
JH
6191 scan += UTF8SKIP(scan);
6192 hardcount++;
6193 }
980866de 6194 } else {
a12cf05f
KW
6195 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6196 scan++;
6197 }
980866de
KW
6198 }
6199 break;
6200 case NALNUM:
6201 if (utf8_target)
6202 goto utf8_Nwordchar;
6203 while (scan < loceol && ! isALNUM((U8) *scan)) {
6204 scan++;
a0ed51b3
LW
6205 }
6206 break;
cfaf538b
KW
6207 case NALNUMA:
6208 if (utf8_target) {
6209 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6210 scan += UTF8SKIP(scan);
6211 }
6212 }
6213 else {
6214 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6215 scan++;
6216 }
6217 }
6218 break;
bbce6d69 6219 case NALNUML:
3280af22 6220 PL_reg_flags |= RF_tainted;
f2ed9b32 6221 if (utf8_target) {
ffc61ed2 6222 loceol = PL_regeol;
1aa99e6b
IH
6223 while (hardcount < max && scan < loceol &&
6224 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6225 scan += UTF8SKIP(scan);
6226 hardcount++;
6227 }
6228 } else {
6229 while (scan < loceol && !isALNUM_LC(*scan))
6230 scan++;
a0ed51b3
LW
6231 }
6232 break;
980866de 6233 case SPACEU:
f2ed9b32 6234 if (utf8_target) {
980866de
KW
6235
6236 utf8_space:
6237
ffc61ed2 6238 loceol = PL_regeol;
1a4fad37 6239 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6240 while (hardcount < max && scan < loceol &&
3568d838 6241 (*scan == ' ' ||
a12cf05f
KW
6242 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6243 {
ffc61ed2
JH
6244 scan += UTF8SKIP(scan);
6245 hardcount++;
6246 }
980866de
KW
6247 break;
6248 }
6249 else {
a12cf05f
KW
6250 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6251 scan++;
6252 }
980866de
KW
6253 break;
6254 }
6255 case SPACE:
6256 if (utf8_target)
6257 goto utf8_space;
6258
6259 while (scan < loceol && isSPACE((U8) *scan)) {
6260 scan++;
a0ed51b3
LW
6261 }
6262 break;
cfaf538b
KW
6263 case SPACEA:
6264 while (scan < loceol && isSPACE_A((U8) *scan)) {
6265 scan++;
6266 }
6267 break;
bbce6d69 6268 case SPACEL:
3280af22 6269 PL_reg_flags |= RF_tainted;
f2ed9b32 6270 if (utf8_target) {
ffc61ed2 6271 loceol = PL_regeol;
1aa99e6b 6272 while (hardcount < max && scan < loceol &&
6bbba904 6273 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6274 scan += UTF8SKIP(scan);
6275 hardcount++;
6276 }
6277 } else {
6278 while (scan < loceol && isSPACE_LC(*scan))
6279 scan++;
a0ed51b3
LW
6280 }
6281 break;
980866de 6282 case NSPACEU:
f2ed9b32 6283 if (utf8_target) {
980866de
KW
6284
6285 utf8_Nspace:
6286
ffc61ed2 6287 loceol = PL_regeol;
1a4fad37 6288 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6289 while (hardcount < max && scan < loceol &&
980866de
KW
6290 ! (*scan == ' ' ||
6291 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6292 {
ffc61ed2
JH
6293 scan += UTF8SKIP(scan);
6294 hardcount++;
6295 }
980866de
KW
6296 break;
6297 }
6298 else {
a12cf05f
KW
6299 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6300 scan++;
6301 }
980866de
KW
6302 }
6303 break;
6304 case NSPACE:
6305 if (utf8_target)
6306 goto utf8_Nspace;
6307
6308 while (scan < loceol && ! isSPACE((U8) *scan)) {
6309 scan++;
a0ed51b3 6310 }
0008a298 6311 break;
cfaf538b
KW
6312 case NSPACEA:
6313 if (utf8_target) {
6314 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6315 scan += UTF8SKIP(scan);
6316 }
6317 }
6318 else {
6319 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6320 scan++;
6321 }
6322 }
6323 break;
bbce6d69 6324 case NSPACEL:
3280af22 6325 PL_reg_flags |= RF_tainted;
f2ed9b32 6326 if (utf8_target) {
ffc61ed2 6327 loceol = PL_regeol;
1aa99e6b 6328 while (hardcount < max && scan < loceol &&
6bbba904 6329 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6330 scan += UTF8SKIP(scan);
6331 hardcount++;
6332 }
6333 } else {
6334 while (scan < loceol && !isSPACE_LC(*scan))
6335 scan++;
a0ed51b3
LW
6336 }
6337 break;
a0d0e21e 6338 case DIGIT:
f2ed9b32 6339 if (utf8_target) {
ffc61ed2 6340 loceol = PL_regeol;
1a4fad37 6341 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6342 while (hardcount < max && scan < loceol &&
f2ed9b32 6343 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6344 scan += UTF8SKIP(scan);
6345 hardcount++;
6346 }
6347 } else {
6348 while (scan < loceol && isDIGIT(*scan))
6349 scan++;
a0ed51b3
LW
6350 }
6351 break;
cfaf538b
KW
6352 case DIGITA:
6353 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6354 scan++;
6355 }
6356 break;
b77393f6
KW
6357 case DIGITL:
6358 PL_reg_flags |= RF_tainted;
6359 if (utf8_target) {
6360 loceol = PL_regeol;
6361 while (hardcount < max && scan < loceol &&
6362 isDIGIT_LC_utf8((U8*)scan)) {
6363 scan += UTF8SKIP(scan);
6364 hardcount++;
6365 }
6366 } else {
6367 while (scan < loceol && isDIGIT_LC(*scan))
6368 scan++;
6369 }
6370 break;
a0d0e21e 6371 case NDIGIT:
f2ed9b32 6372 if (utf8_target) {
ffc61ed2 6373 loceol = PL_regeol;
1a4fad37 6374 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6375 while (hardcount < max && scan < loceol &&
f2ed9b32 6376 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6377 scan += UTF8SKIP(scan);
6378 hardcount++;
6379 }
6380 } else {
6381 while (scan < loceol && !isDIGIT(*scan))
6382 scan++;
a0ed51b3 6383 }
cfaf538b
KW
6384 break;
6385 case NDIGITA:
6386 if (utf8_target) {
6387 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6388 scan += UTF8SKIP(scan);
6389 }
6390 }
6391 else {
6392 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6393 scan++;
6394 }
6395 }
6396 break;
b77393f6
KW
6397 case NDIGITL:
6398 PL_reg_flags |= RF_tainted;
6399 if (utf8_target) {
6400 loceol = PL_regeol;
6401 while (hardcount < max && scan < loceol &&
6402 !isDIGIT_LC_utf8((U8*)scan)) {
6403 scan += UTF8SKIP(scan);
6404 hardcount++;
6405 }
6406 } else {
6407 while (scan < loceol && !isDIGIT_LC(*scan))
6408 scan++;
6409 }
6410 break;
e1d1eefb 6411 case LNBREAK:
f2ed9b32 6412 if (utf8_target) {
e1d1eefb
YO
6413 loceol = PL_regeol;
6414 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6415 scan += c;
6416 hardcount++;
6417 }
6418 } else {
6419 /*
6420 LNBREAK can match two latin chars, which is ok,
6421 because we have a null terminated string, but we
6422 have to use hardcount in this situation
6423 */
6424 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6425 scan+=c;
6426 hardcount++;
6427 }
6428 }
6429 break;
6430 case HORIZWS:
f2ed9b32 6431 if (utf8_target) {
e1d1eefb
YO
6432 loceol = PL_regeol;
6433 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6434 scan += c;
6435 hardcount++;
6436 }
6437 } else {
6438 while (scan < loceol && is_HORIZWS_latin1(scan))
6439 scan++;
6440 }
a0ed51b3 6441 break;
e1d1eefb 6442 case NHORIZWS:
f2ed9b32 6443 if (utf8_target) {
e1d1eefb
YO
6444 loceol = PL_regeol;
6445 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6446 scan += UTF8SKIP(scan);
6447 hardcount++;
6448 }
6449 } else {
6450 while (scan < loceol && !is_HORIZWS_latin1(scan))
6451 scan++;
6452
6453 }
6454 break;
6455 case VERTWS:
f2ed9b32 6456 if (utf8_target) {
e1d1eefb
YO
6457 loceol = PL_regeol;
6458 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6459 scan += c;
6460 hardcount++;
6461 }
6462 } else {
6463 while (scan < loceol && is_VERTWS_latin1(scan))
6464 scan++;
6465
6466 }
6467 break;
6468 case NVERTWS:
f2ed9b32 6469 if (utf8_target) {
e1d1eefb
YO
6470 loceol = PL_regeol;
6471 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6472 scan += UTF8SKIP(scan);
6473 hardcount++;
6474 }
6475 } else {
6476 while (scan < loceol && !is_VERTWS_latin1(scan))
6477 scan++;
6478
6479 }
6480 break;
6481
a0d0e21e
LW
6482 default: /* Called on something of 0 width. */
6483 break; /* So match right here or not at all. */
6484 }
a687059c 6485
a0ed51b3
LW
6486 if (hardcount)
6487 c = hardcount;
6488 else
6489 c = scan - PL_reginput;
3280af22 6490 PL_reginput = scan;
a687059c 6491
a3621e74 6492 DEBUG_r({
e68ec53f 6493 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6494 DEBUG_EXECUTE_r({
e68ec53f
YO
6495 SV * const prop = sv_newmortal();
6496 regprop(prog, prop, p);
6497 PerlIO_printf(Perl_debug_log,
be8e71aa 6498 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6499 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6500 });
be8e71aa 6501 });
9041c2e3 6502
a0d0e21e 6503 return(c);
a687059c
LW
6504}
6505
c277df42 6506
be8e71aa 6507#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6508/*
6c6525b8
KW
6509- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6510create a copy so that changes the caller makes won't change the shared one
6511 */
ffc61ed2 6512SV *
32fc9b6a 6513Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6514{
6c6525b8
KW
6515 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6516 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6517}
6518#endif
6519
6520STATIC SV *
6521S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6522{
8c9eb58f
KW
6523 /* Returns the swash for the input 'node' in the regex 'prog'.
6524 * If <doinit> is true, will attempt to create the swash if not already
6525 * done.
6526 * If <listsvp> is non-null, will return the swash initialization string in
6527 * it.
6528 * If <altsvp> is non-null, will return the alternates to the regular swash
6529 * in it
6530 * Tied intimately to how regcomp.c sets up the data structure */
6531
97aff369 6532 dVAR;
9e55ce06
JH
6533 SV *sw = NULL;
6534 SV *si = NULL;
6535 SV *alt = NULL;
7a6c6baa
KW
6536 SV* invlist = NULL;
6537
f8fc2ecf
YO
6538 RXi_GET_DECL(prog,progi);
6539 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6540
6c6525b8 6541 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6542
ccb2541c
KW
6543 assert(ANYOF_NONBITMAP(node));
6544
4f639d21 6545 if (data && data->count) {
a3b680e6 6546 const U32 n = ARG(node);
ffc61ed2 6547
4f639d21 6548 if (data->what[n] == 's') {
ad64d0ec
NC
6549 SV * const rv = MUTABLE_SV(data->data[n]);
6550 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6551 SV **const ary = AvARRAY(av);
7a6c6baa 6552 bool invlist_has_user_defined_property;
9041c2e3 6553
8c9eb58f 6554 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6555
7a6c6baa
KW
6556 /* Elements 3 and 4 are either both present or both absent. [3] is
6557 * any inversion list generated at compile time; [4] indicates if
6558 * that inversion list has any user-defined properties in it. */
6559 if (av_len(av) >= 3) {
6560 invlist = ary[3];
6561 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6562 }
6563 else {
6564 invlist = NULL;
6565 invlist_has_user_defined_property = FALSE;
6566 }
6567
8c9eb58f
KW
6568 /* Element [1] is reserved for the set-up swash. If already there,
6569 * return it; if not, create it and store it there */
f192cf32
KW
6570 if (SvROK(ary[1])) {
6571 sw = ary[1];
6572 }
ffc61ed2 6573 else if (si && doinit) {
7a6c6baa
KW
6574
6575 sw = _core_swash_init("utf8", /* the utf8 package */
6576 "", /* nameless */
6577 si,
6578 1, /* binary */
6579 0, /* not from tr/// */
6580 FALSE, /* is error if can't find
6581 property */
6582 invlist,
6583 invlist_has_user_defined_property);
ffc61ed2
JH
6584 (void)av_store(av, 1, sw);
6585 }
8c9eb58f
KW
6586
6587 /* Element [2] is for any multi-char folds. Note that is a
6588 * fundamentally flawed design, because can't backtrack and try
6589 * again. See [perl #89774] */
f192cf32
KW
6590 if (SvTYPE(ary[2]) == SVt_PVAV) {
6591 alt = ary[2];
6592 }
ffc61ed2
JH
6593 }
6594 }
6595
7a6c6baa
KW
6596 if (listsvp) {
6597 SV* matches_string = newSVpvn("", 0);
6598 SV** invlistsvp;
6599
6600 /* Use the swash, if any, which has to have incorporated into it all
6601 * possibilities */
6602 if ( sw
6603 && SvROK(sw)
6604 && SvTYPE(SvRV(sw)) == SVt_PVHV
6605 && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6606 {
6607 invlist = *invlistsvp;
6608 }
6609 else if (si && si != &PL_sv_undef) {
6610
6611 /* If no swash, use the input nitialization string, if available */
6612 sv_catsv(matches_string, si);
6613 }
6614
6615 /* Add the inversion list to whatever we have. This may have come from
6616 * the swash, or from an input parameter */
6617 if (invlist) {
6618 sv_catsv(matches_string, _invlist_contents(invlist));
6619 }
6620 *listsvp = matches_string;
6621 }
6622
9e55ce06
JH
6623 if (altsvp)
6624 *altsvp = alt;
ffc61ed2
JH
6625
6626 return sw;
6627}
6628
6629/*
ba7b4546 6630 - reginclass - determine if a character falls into a character class
832705d4 6631
6698fab5
KW
6632 n is the ANYOF regnode
6633 p is the target string
6634 lenp is pointer to the maximum number of bytes of how far to go in p
6635 (This is assumed wthout checking to always be at least the current
6636 character's size)
6637 utf8_target tells whether p is in UTF-8.
832705d4 6638
4b3cda86
KW
6639 Returns true if matched; false otherwise. If lenp is not NULL, on return
6640 from a successful match, the value it points to will be updated to how many
6641 bytes in p were matched. If there was no match, the value is undefined,
6642 possibly changed from the input.
eba1359e 6643
d5788240
KW
6644 Note that this can be a synthetic start class, a combination of various
6645 nodes, so things you think might be mutually exclusive, such as locale,
6646 aren't. It can match both locale and non-locale
6647
bbce6d69 6648 */
6649
76e3520e 6650STATIC bool
f6ad78d8 6651S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6652{
27da23d5 6653 dVAR;
a3b680e6 6654 const char flags = ANYOF_FLAGS(n);
bbce6d69 6655 bool match = FALSE;
cc07378b 6656 UV c = *p;
f7ab54c6 6657 STRLEN c_len = 0;
6698fab5 6658 STRLEN maxlen;
1aa99e6b 6659
7918f24d
NC
6660 PERL_ARGS_ASSERT_REGINCLASS;
6661
4b3cda86 6662 /* If c is not already the code point, get it */
f2ed9b32 6663 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6664 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6665 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6666 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6667 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6668 * UTF8_ALLOW_FFFF */
f7ab54c6 6669 if (c_len == (STRLEN)-1)
e8a70c6f 6670 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6671 }
a5a291f5
KW
6672 else {
6673 c_len = 1;
6674 }
bbce6d69 6675
a5a291f5
KW
6676 /* Use passed in max length, or one character if none passed in or less
6677 * than one character. And assume will match just one character. This is
6678 * overwritten later if matched more. */
4b3cda86 6679 if (lenp) {
a5a291f5
KW
6680 maxlen = (*lenp > c_len) ? *lenp : c_len;
6681 *lenp = c_len;
4b3cda86
KW
6682
6683 }
6684 else {
a5a291f5 6685 maxlen = c_len;
4b3cda86
KW
6686 }
6687
7cdde544
KW
6688 /* If this character is potentially in the bitmap, check it */
6689 if (c < 256) {
ffc61ed2
JH
6690 if (ANYOF_BITMAP_TEST(n, c))
6691 match = TRUE;
11454c59
KW
6692 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6693 && ! utf8_target
6694 && ! isASCII(c))
6695 {
6696 match = TRUE;
6697 }
a0ed51b3 6698
78969a98
KW
6699 else if (flags & ANYOF_LOCALE) {
6700 PL_reg_flags |= RF_tainted;
6701
6702 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6703 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6704 {
ffc61ed2 6705 match = TRUE;
78969a98 6706 }
040aea3a
KW
6707 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6708 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6709 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6710 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6711 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6712 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6713 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6714 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6715 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6716 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6717 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
6718 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6719 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
6720 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6721 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6722 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6723 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6724 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6725 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6726 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6727 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6728 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6729 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6730 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6731 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6732 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6733 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6734 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6735 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
6736 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6737 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 6738 ) /* How's that for a conditional? */
78969a98 6739 ) {
ffc61ed2
JH
6740 match = TRUE;
6741 }
a0ed51b3 6742 }
a0ed51b3
LW
6743 }
6744
7cdde544 6745 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6746 * bitmap could match, try that. Locale nodes specifiy completely the
6747 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6748 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6749 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6750 * could be a combination of locale and non-locale nodes. So allow locale
6751 * to match for the synthetic start class, which will give a false
6752 * positive that will be resolved when the match is done again as not part
6753 * of the synthetic start class */
ef87b810 6754 if (!match) {
10ee90d2
KW
6755 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6756 match = TRUE; /* Everything above 255 matches */
e051a21d 6757 }
6f8d7d0d
KW
6758 else if (ANYOF_NONBITMAP(n)
6759 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6760 || (utf8_target
6761 && (c >=256
6762 || (! (flags & ANYOF_LOCALE))
6763 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6764 {
7cdde544 6765 AV *av;
210e6c47 6766 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6767
6768 if (sw) {
6769 U8 * utf8_p;
6770 if (utf8_target) {
6771 utf8_p = (U8 *) p;
6772 } else {
f56b6394
KW
6773
6774 /* Not utf8. Convert as much of the string as available up
6775 * to the limit of how far the (single) character in the
6776 * pattern can possibly match (no need to go further). If
6777 * the node is a straight ANYOF or not folding, it can't
6778 * match more than one. Otherwise, It can match up to how
6779 * far a single char can fold to. Since not utf8, each
6780 * character is a single byte, so the max it can be in
6781 * bytes is the same as the max it can be in characters */
6782 STRLEN len = (OP(n) == ANYOF
6783 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6784 ? 1
6785 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6786 ? maxlen
6787 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
6788 utf8_p = bytes_to_utf8(p, &len);
6789 }
f56b6394 6790
f7eb7452 6791 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 6792 match = TRUE;
39065660 6793 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
6794
6795 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
6796 * matches. The non-multi char folds have all been moved to
6797 * the compilation phase, and the multi-char folds have
6798 * been stored by regcomp into 'av'; we linearly check to
6799 * see if any match the target string (folded). We know
6800 * that the originals were each one character, but we don't
6801 * currently know how many characters/bytes each folded to,
6802 * except we do know that there are small limits imposed by
6803 * Unicode. XXX A performance enhancement would be to have
6804 * regcomp.c store the max number of chars/bytes that are
6805 * in an av entry, as, say the 0th element. Even better
6806 * would be to have a hash of the few characters that can
6807 * start a multi-char fold to the max number of chars of
6808 * those folds.
f56b6394
KW
6809 *
6810 * If there is a match, we will need to advance (if lenp is
6811 * specified) the match pointer in the target string. But
6812 * what we are comparing here isn't that string directly,
6813 * but its fold, whose length may differ from the original.
6814 * As we go along in constructing the fold, therefore, we
6815 * create a map so that we know how many bytes in the
6816 * source to advance given that we have matched a certain
6817 * number of bytes in the fold. This map is stored in
6438af90
KW
6818 * 'map_fold_len_back'. Let n mean the number of bytes in
6819 * the fold of the first character that we are folding.
6820 * Then map_fold_len_back[n] is set to the number of bytes
6821 * in that first character. Similarly let m be the
6822 * corresponding number for the second character to be
6823 * folded. Then map_fold_len_back[n+m] is set to the
6824 * number of bytes occupied by the first two source
6825 * characters. ... */
6826 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
6827 U8 folded[UTF8_MAXBYTES_CASE+1];
6828 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 6829 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 6830 chars */
f56b6394
KW
6831
6832 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6833
6834 /* Here, only need to fold the first char of the target
6438af90 6835 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 6836 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 6837 total_foldlen = foldlen;
6438af90
KW
6838 map_fold_len_back[foldlen] = (utf8_target)
6839 ? UTF8SKIP(utf8_p)
6840 : 1;
f56b6394
KW
6841 }
6842 else {
6843
6844 /* Here, need to fold more than the first char. Do so
6845 * up to the limits */
f56b6394
KW
6846 U8* source_ptr = utf8_p; /* The source for the fold
6847 is the regex target
6848 string */
6849 U8* folded_ptr = folded;
6850 U8* e = utf8_p + maxlen; /* Can't go beyond last
6851 available byte in the
6852 target string */
6438af90
KW
6853 U8 i;
6854 for (i = 0;
6855 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6856 i++)
f56b6394
KW
6857 {
6858
6859 /* Fold the next character */
6860 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6861 STRLEN this_char_foldlen;
6862 to_utf8_fold(source_ptr,
6863 this_char_folded,
6864 &this_char_foldlen);
6865
6866 /* Bail if it would exceed the byte limit for
6867 * folding a single char. */
6868 if (this_char_foldlen + folded_ptr - folded >
6869 UTF8_MAXBYTES_CASE)
6870 {
6871 break;
6872 }
6873
6438af90 6874 /* Add the fold of this character */
f56b6394
KW
6875 Copy(this_char_folded,
6876 folded_ptr,
6877 this_char_foldlen,
6878 U8);
f56b6394 6879 source_ptr += UTF8SKIP(source_ptr);
6438af90 6880 folded_ptr += this_char_foldlen;
deba3d96 6881 total_foldlen = folded_ptr - folded;
6438af90
KW
6882
6883 /* Create map from the number of bytes in the fold
6884 * back to the number of bytes in the source. If
6885 * the source isn't utf8, the byte count is just
6886 * the number of characters so far */
deba3d96 6887 map_fold_len_back[total_foldlen]
6438af90
KW
6888 = (utf8_target)
6889 ? source_ptr - utf8_p
6890 : i + 1;
f56b6394
KW
6891 }
6892 *folded_ptr = '\0';
f56b6394
KW
6893 }
6894
6895
6896 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
6897 * of multi-char folds. */
6898 if (av) {
7cdde544
KW
6899 I32 i;
6900 for (i = 0; i <= av_len(av); i++) {
6901 SV* const sv = *av_fetch(av, i, FALSE);
6902 STRLEN len;
6903 const char * const s = SvPV_const(sv, len);
6438af90 6904
f9126265
KW
6905 if (len <= total_foldlen
6906 && memEQ(s, (char*)folded, len)
6907
6908 /* If 0, means matched a partial char. See
6909 * [perl #90536] */
6910 && map_fold_len_back[len])
f56b6394
KW
6911 {
6912
6913 /* Advance the target string ptr to account for
6914 * this fold, but have to translate from the
6915 * folded length to the corresponding source
6438af90 6916 * length. */
8f2655f7
KW
6917 if (lenp) {
6918 *lenp = map_fold_len_back[len];
8f2655f7 6919 }
7cdde544
KW
6920 match = TRUE;
6921 break;
6922 }
6923 }
6924 }
7cdde544
KW
6925 }
6926
6927 /* If we allocated a string above, free it */
6928 if (! utf8_target) Safefree(utf8_p);
6929 }
6930 }
6931 }
6932
a0ed51b3
LW
6933 return (flags & ANYOF_INVERT) ? !match : match;
6934}
161b471a 6935
dfe13c55 6936STATIC U8 *
0ce71af7 6937S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 6938{
6af86488
KW
6939 /* return the position 'off' UTF-8 characters away from 's', forward if
6940 * 'off' >= 0, backwards if negative. But don't go outside of position
6941 * 'lim', which better be < s if off < 0 */
6942
97aff369 6943 dVAR;
7918f24d
NC
6944
6945 PERL_ARGS_ASSERT_REGHOP3;
6946
a0ed51b3 6947 if (off >= 0) {
1aa99e6b 6948 while (off-- && s < lim) {
ffc61ed2 6949 /* XXX could check well-formedness here */
a0ed51b3 6950 s += UTF8SKIP(s);
ffc61ed2 6951 }
a0ed51b3
LW
6952 }
6953 else {
1de06328
YO
6954 while (off++ && s > lim) {
6955 s--;
6956 if (UTF8_IS_CONTINUED(*s)) {
6957 while (s > lim && UTF8_IS_CONTINUATION(*s))
6958 s--;
a0ed51b3 6959 }
1de06328 6960 /* XXX could check well-formedness here */
a0ed51b3
LW
6961 }
6962 }
6963 return s;
6964}
161b471a 6965
f9f4320a
YO
6966#ifdef XXX_dmq
6967/* there are a bunch of places where we use two reghop3's that should
6968 be replaced with this routine. but since thats not done yet
6969 we ifdef it out - dmq
6970*/
dfe13c55 6971STATIC U8 *
1de06328
YO
6972S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6973{
6974 dVAR;
7918f24d
NC
6975
6976 PERL_ARGS_ASSERT_REGHOP4;
6977
1de06328
YO
6978 if (off >= 0) {
6979 while (off-- && s < rlim) {
6980 /* XXX could check well-formedness here */
6981 s += UTF8SKIP(s);
6982 }
6983 }
6984 else {
6985 while (off++ && s > llim) {
6986 s--;
6987 if (UTF8_IS_CONTINUED(*s)) {
6988 while (s > llim && UTF8_IS_CONTINUATION(*s))
6989 s--;
6990 }
6991 /* XXX could check well-formedness here */
6992 }
6993 }
6994 return s;
6995}
f9f4320a 6996#endif
1de06328
YO
6997
6998STATIC U8 *
0ce71af7 6999S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7000{
97aff369 7001 dVAR;
7918f24d
NC
7002
7003 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7004
a0ed51b3 7005 if (off >= 0) {
1aa99e6b 7006 while (off-- && s < lim) {
ffc61ed2 7007 /* XXX could check well-formedness here */
a0ed51b3 7008 s += UTF8SKIP(s);
ffc61ed2 7009 }
a0ed51b3 7010 if (off >= 0)
3dab1dad 7011 return NULL;
a0ed51b3
LW
7012 }
7013 else {
1de06328
YO
7014 while (off++ && s > lim) {
7015 s--;
7016 if (UTF8_IS_CONTINUED(*s)) {
7017 while (s > lim && UTF8_IS_CONTINUATION(*s))
7018 s--;
a0ed51b3 7019 }
1de06328 7020 /* XXX could check well-formedness here */
a0ed51b3
LW
7021 }
7022 if (off <= 0)
3dab1dad 7023 return NULL;
a0ed51b3
LW
7024 }
7025 return s;
7026}
51371543 7027
51371543 7028static void
acfe0abc 7029restore_pos(pTHX_ void *arg)
51371543 7030{
97aff369 7031 dVAR;
097eb12c 7032 regexp * const rex = (regexp *)arg;
51371543
GS
7033 if (PL_reg_eval_set) {
7034 if (PL_reg_oldsaved) {
4f639d21
DM
7035 rex->subbeg = PL_reg_oldsaved;
7036 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 7037#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7038 rex->saved_copy = PL_nrs;
ed252734 7039#endif
07bc277f 7040 RXp_MATCH_COPIED_on(rex);
51371543
GS
7041 }
7042 PL_reg_magic->mg_len = PL_reg_oldpos;
7043 PL_reg_eval_set = 0;
7044 PL_curpm = PL_reg_oldcurpm;
7045 }
7046}
33b8afdf
JH
7047
7048STATIC void
7049S_to_utf8_substr(pTHX_ register regexp *prog)
7050{
a1cac82e 7051 int i = 1;
7918f24d
NC
7052
7053 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7054
a1cac82e
NC
7055 do {
7056 if (prog->substrs->data[i].substr
7057 && !prog->substrs->data[i].utf8_substr) {
7058 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7059 prog->substrs->data[i].utf8_substr = sv;
7060 sv_utf8_upgrade(sv);
610460f9 7061 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7062 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7063 /* Trim the trailing \n that fbm_compile added last
7064 time. */
7065 SvCUR_set(sv, SvCUR(sv) - 1);
7066 /* Whilst this makes the SV technically "invalid" (as its
7067 buffer is no longer followed by "\0") when fbm_compile()
7068 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7069 fbm_compile(sv, FBMcf_TAIL);
7070 } else
7071 fbm_compile(sv, 0);
610460f9 7072 }
a1cac82e
NC
7073 if (prog->substrs->data[i].substr == prog->check_substr)
7074 prog->check_utf8 = sv;
7075 }
7076 } while (i--);
33b8afdf
JH
7077}
7078
7079STATIC void
7080S_to_byte_substr(pTHX_ register regexp *prog)
7081{
97aff369 7082 dVAR;
a1cac82e 7083 int i = 1;
7918f24d
NC
7084
7085 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7086
a1cac82e
NC
7087 do {
7088 if (prog->substrs->data[i].utf8_substr
7089 && !prog->substrs->data[i].substr) {
7090 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7091 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7092 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7093 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7094 /* Trim the trailing \n that fbm_compile added last
7095 time. */
7096 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7097 fbm_compile(sv, FBMcf_TAIL);
7098 } else
7099 fbm_compile(sv, 0);
7100 }
a1cac82e
NC
7101 } else {
7102 SvREFCNT_dec(sv);
7103 sv = &PL_sv_undef;
7104 }
7105 prog->substrs->data[i].substr = sv;
7106 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7107 prog->check_substr = sv;
33b8afdf 7108 }
a1cac82e 7109 } while (i--);
33b8afdf 7110}
66610fdd
RGS
7111
7112/*
7113 * Local variables:
7114 * c-indentation-style: bsd
7115 * c-basic-offset: 4
7116 * indent-tabs-mode: t
7117 * End:
7118 *
37442d52
RGS
7119 * ex: set ts=8 sts=4 sw=4 noet:
7120 */