This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
todo
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 89
a687059c
LW
90#ifndef STATIC
91#define STATIC static
92#endif
93
7e2509c1
KW
94/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
95 * call if there are no complications: i.e., if everything matchable is
96 * straight forward in the bitmap */
af364d03
KW
97#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
98 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 99
c277df42
IZ
100/*
101 * Forwards.
102 */
103
f2ed9b32 104#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 105#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 106
3dab1dad
YO
107#define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
52657f30 109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
110 : (U8*)(pos + off))
111#define HOPBACKc(pos, off) \
07be1b83
YO
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
8e11feef 115 ? (U8*)pos - off \
3dab1dad 116 : NULL)
efb30f32 117
e7409c1b 118#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 119#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 120
20d0b1e9 121/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
122#ifdef EBCDIC
123 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
124 * skip the check on EBCDIC platforms */
125# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
126#else
127# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
128 if (!CAT2(PL_utf8_,class)) { \
129 bool ok; \
130 ENTER; save_re_context(); \
131 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 133#endif
37e2e78e
KW
134
135/* Doesn't do an assert to verify that is correct */
136#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 137 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 138 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
139 ENTER; save_re_context(); \
140 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 LEAVE; } } STMT_END
37e2e78e 142
1a4fad37
AL
143#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
144#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
145#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 146
37e2e78e 147#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
148 LOAD_UTF8_CHARCLASS(X_begin, " "); \
149 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
150 /* These are utf8 constants, and not utf-ebcdic constants, so the \
151 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 152 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
153 \
154 /* No asserts are done for these, in case called on an early \
155 * Unicode version in which they map to nothing */ \
d275fa5e
CB
156 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
157 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
158 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 163
1dcf4a1b 164#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 165
ee9a90b8
KW
166/* The actual code for CCC_TRY, which uses several variables from the routine
167 * it's callable from. It is designed to be the bulk of a case statement.
168 * FUNC is the macro or function to call on non-utf8 targets that indicate if
169 * nextchr matches the class.
170 * UTF8_TEST is the whole test string to use for utf8 targets
171 * LOAD is what to use to test, and if not present to load in the swash for the
172 * class
173 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
174 * UTF8_TEST test.
175 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
176 * utf8 and a variant, load the swash if necessary and test using the utf8
177 * test. Advance to the next character if test is ok, otherwise fail; If not
178 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
179 * fails, or advance to the next character */
180
181#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
182 if (locinput >= PL_regeol) { \
183 sayNO; \
184 } \
185 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
186 LOAD_UTF8_CHARCLASS(CLASS, STR); \
187 if (POS_OR_NEG (UTF8_TEST)) { \
188 sayNO; \
189 } \
190 locinput += PL_utf8skip[nextchr]; \
191 nextchr = UCHARAT(locinput); \
192 break; \
193 } \
194 if (POS_OR_NEG (FUNC(nextchr))) { \
195 sayNO; \
196 } \
197 nextchr = UCHARAT(++locinput); \
980866de
KW
198 break;
199
ee9a90b8
KW
200/* Handle the non-locale cases for a character class and its complement. It
201 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
202 * This is because that code fails when the test succeeds, so we want to have
203 * the test fail so that the code succeeds. The swash is stored in a
204 * predictable PL_ place */
cfaf538b
KW
205#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
206 CLASS, STR) \
ee9a90b8
KW
207 case NAME: \
208 _CCC_TRY_CODE( !, FUNC, \
209 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
210 (U8*)locinput, TRUE)), \
211 CLASS, STR) \
212 case NNAME: \
1dcf4a1b 213 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
214 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
215 (U8*)locinput, TRUE)), \
216 CLASS, STR) \
217
218/* Generate the case statements for both locale and non-locale character
219 * classes in regmatch for classes that don't have special unicode semantics.
220 * Locales don't use an immediate swash, but an intermediary special locale
221 * function that is called on the pointer to the current place in the input
222 * string. That function will resolve to needing the same swash. One might
223 * think that because we don't know what the locale will match, we shouldn't
224 * check with the swash loading function that it loaded properly; ie, that we
225 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
226 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
227 * irrelevant here */
228#define CCC_TRY(NAME, NNAME, FUNC, \
229 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 230 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
231 CLASS, STR) \
232 case NAMEL: \
233 PL_reg_flags |= RF_tainted; \
234 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
235 case NNAMEL: \
236 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
237 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
238 CLASS, STR) \
cfaf538b
KW
239 case NAMEA: \
240 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
244 nextchr = UCHARAT(++locinput); \
245 break; \
246 case NNAMEA: \
247 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
248 sayNO; \
249 } \
250 if (utf8_target) { \
251 locinput += PL_utf8skip[nextchr]; \
252 nextchr = UCHARAT(locinput); \
253 } \
254 else { \
255 nextchr = UCHARAT(++locinput); \
256 } \
257 break; \
ee9a90b8
KW
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
260
261/* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263#define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
cfaf538b 266 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 267 CLASS, STR) \
cfaf538b
KW
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
271 CLASS, STR) \
ee9a90b8 272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 273
3dab1dad
YO
274/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
275
5f80c4cf 276/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
277/* it would be nice to rework regcomp.sym to generate this stuff. sigh
278 *
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
285*/
3e901dc0
YO
286#define JUMPABLE(rn) ( \
287 OP(rn) == OPEN || \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
289 OP(rn) == EVAL || \
cca55fe3
JP
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 292 OP(rn) == KEEPS || \
3dab1dad 293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 294)
ee9b8eae 295#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 296
ee9b8eae
YO
297#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298
299#if 0
300/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 303#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305
306#else
307/* ... so we use this as its faster. */
308#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 309#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
310#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
312
313#endif
e2d8ce26 314
a84d97b6
HS
315/*
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
318*/
cca55fe3 319#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 323 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 324 else if (type == PLUS) \
cca55fe3 325 rn = NEXTOPER(rn); \
3dab1dad 326 else if (type == IFMATCH) \
a84d97b6 327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 328 else rn += NEXT_OFF(rn); \
3dab1dad 329 } \
5f80c4cf 330} STMT_END
74750237 331
c476f425 332
acfe0abc 333static void restore_pos(pTHX_ void *arg);
51371543 334
87c0511b 335#define REGCP_PAREN_ELEMS 3
f067efbf 336#define REGCP_OTHER_ELEMS 3
e0fa7e2b 337#define REGCP_FRAME_ELEMS 1
620d5b66
NC
338/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
339 * are needed for the regexp context stack bookkeeping. */
340
76e3520e 341STATIC CHECKPOINT
b93070ed 342S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 343{
97aff369 344 dVAR;
a3b680e6 345 const int retval = PL_savestack_ix;
a3b680e6 346 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
347 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
348 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 349 I32 p;
40a82448 350 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 351
b93070ed
DM
352 PERL_ARGS_ASSERT_REGCPPUSH;
353
e49a9654 354 if (paren_elems_to_push < 0)
5637ef5b
NC
355 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
356 paren_elems_to_push);
e49a9654 357
e0fa7e2b
NC
358 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
359 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
360 " out of range (%lu-%ld)",
361 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 362
620d5b66 363 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 364
495f47a5
DM
365 DEBUG_BUFFERS_r(
366 if ((int)PL_regsize > (int)parenfloor)
367 PerlIO_printf(Perl_debug_log,
368 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
369 PTR2UV(rex),
370 PTR2UV(rex->offs)
371 );
372 );
87c0511b 373 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 374/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
375 SSPUSHINT(rex->offs[p].end);
376 SSPUSHINT(rex->offs[p].start);
1ca2007e 377 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 378 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
379 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
380 (UV)p,
381 (IV)rex->offs[p].start,
382 (IV)rex->offs[p].start_tmp,
383 (IV)rex->offs[p].end
40a82448 384 ));
a0d0e21e 385 }
b1ce53c5 386/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 387 SSPUSHINT(PL_regsize);
b93070ed
DM
388 SSPUSHINT(rex->lastparen);
389 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 390 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 391
a0d0e21e
LW
392 return retval;
393}
394
c277df42 395/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
396#define REGCP_SET(cp) \
397 DEBUG_STATE_r( \
ab3bbdeb 398 PerlIO_printf(Perl_debug_log, \
e4f74956 399 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
400 (IV)PL_savestack_ix)); \
401 cp = PL_savestack_ix
c3464db5 402
ab3bbdeb 403#define REGCP_UNWIND(cp) \
e4f74956 404 DEBUG_STATE_r( \
ab3bbdeb 405 if (cp != PL_savestack_ix) \
e4f74956
YO
406 PerlIO_printf(Perl_debug_log, \
407 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
408 (IV)(cp), (IV)PL_savestack_ix)); \
409 regcpblow(cp)
c277df42 410
a8d1f4b4
DM
411#define UNWIND_PAREN(lp, lcp) \
412 for (n = rex->lastparen; n > lp; n--) \
413 rex->offs[n].end = -1; \
414 rex->lastparen = n; \
415 rex->lastcloseparen = lcp;
416
417
f067efbf 418STATIC void
b93070ed 419S_regcppop(pTHX_ regexp *rex)
a0d0e21e 420{
97aff369 421 dVAR;
e0fa7e2b 422 UV i;
87c0511b 423 U32 paren;
a3621e74
YO
424 GET_RE_DEBUG_FLAGS_DECL;
425
7918f24d
NC
426 PERL_ARGS_ASSERT_REGCPPOP;
427
b1ce53c5 428 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 429 i = SSPOPUV;
e0fa7e2b
NC
430 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
431 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
432 rex->lastcloseparen = SSPOPINT;
433 rex->lastparen = SSPOPINT;
3280af22 434 PL_regsize = SSPOPINT;
b1ce53c5 435
620d5b66 436 i -= REGCP_OTHER_ELEMS;
b1ce53c5 437 /* Now restore the parentheses context. */
495f47a5
DM
438 DEBUG_BUFFERS_r(
439 if (i || rex->lastparen + 1 <= rex->nparens)
440 PerlIO_printf(Perl_debug_log,
441 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
442 PTR2UV(rex),
443 PTR2UV(rex->offs)
444 );
445 );
87c0511b 446 paren = PL_regsize;
620d5b66 447 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 448 I32 tmps;
1ca2007e 449 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 450 rex->offs[paren].start = SSPOPINT;
cf93c79d 451 tmps = SSPOPINT;
b93070ed
DM
452 if (paren <= rex->lastparen)
453 rex->offs[paren].end = tmps;
495f47a5
DM
454 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
455 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
456 (UV)paren,
457 (IV)rex->offs[paren].start,
458 (IV)rex->offs[paren].start_tmp,
459 (IV)rex->offs[paren].end,
460 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 461 );
87c0511b 462 paren--;
a0d0e21e 463 }
daf18116 464#if 1
dafc8851
JH
465 /* It would seem that the similar code in regtry()
466 * already takes care of this, and in fact it is in
467 * a better location to since this code can #if 0-ed out
468 * but the code in regtry() is needed or otherwise tests
469 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
470 * (as of patchlevel 7877) will fail. Then again,
471 * this code seems to be necessary or otherwise
225593e1
DM
472 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
473 * --jhi updated by dapm */
b93070ed 474 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 475 if (i > PL_regsize)
b93070ed
DM
476 rex->offs[i].start = -1;
477 rex->offs[i].end = -1;
495f47a5
DM
478 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
479 " \\%"UVuf": %s ..-1 undeffing\n",
480 (UV)i,
481 (i > PL_regsize) ? "-1" : " "
482 ));
a0d0e21e 483 }
dafc8851 484#endif
a0d0e21e
LW
485}
486
74088413
DM
487/* restore the parens and associated vars at savestack position ix,
488 * but without popping the stack */
489
490STATIC void
491S_regcp_restore(pTHX_ regexp *rex, I32 ix)
492{
493 I32 tmpix = PL_savestack_ix;
494 PL_savestack_ix = ix;
495 regcppop(rex);
496 PL_savestack_ix = tmpix;
497}
498
02db2b7b 499#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 500
a687059c 501/*
e50aee73 502 * pregexec and friends
a687059c
LW
503 */
504
76234dfb 505#ifndef PERL_IN_XSUB_RE
a687059c 506/*
c277df42 507 - pregexec - match a regexp against a string
a687059c 508 */
c277df42 509I32
49d7dfbc 510Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 511 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
512/* strend: pointer to null at end of string */
513/* strbeg: real beginning of string */
514/* minend: end of match must be >=minend after stringarg. */
515/* nosave: For optimizations. */
516{
7918f24d
NC
517 PERL_ARGS_ASSERT_PREGEXEC;
518
c277df42 519 return
9041c2e3 520 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
521 nosave ? 0 : REXEC_COPY_STR);
522}
76234dfb 523#endif
22e551b9 524
9041c2e3 525/*
cad2e5aa
JH
526 * Need to implement the following flags for reg_anch:
527 *
528 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
529 * USE_INTUIT_ML
530 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
531 * INTUIT_AUTORITATIVE_ML
532 * INTUIT_ONCE_NOML - Intuit can match in one location only.
533 * INTUIT_ONCE_ML
534 *
535 * Another flag for this function: SECOND_TIME (so that float substrs
536 * with giant delta may be not rechecked).
537 */
538
539/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
540
3f7c398e 541/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
542 Otherwise, only SvCUR(sv) is used to get strbeg. */
543
544/* XXXX We assume that strpos is strbeg unless sv. */
545
6eb5f6b9
JH
546/* XXXX Some places assume that there is a fixed substring.
547 An update may be needed if optimizer marks as "INTUITable"
548 RExen without fixed substrings. Similarly, it is assumed that
549 lengths of all the strings are no more than minlen, thus they
550 cannot come from lookahead.
40d049e4
YO
551 (Or minlen should take into account lookahead.)
552 NOTE: Some of this comment is not correct. minlen does now take account
553 of lookahead/behind. Further research is required. -- demerphq
554
555*/
6eb5f6b9 556
2c2d71f5
JH
557/* A failure to find a constant substring means that there is no need to make
558 an expensive call to REx engine, thus we celebrate a failure. Similarly,
559 finding a substring too deep into the string means that less calls to
30944b6d
IZ
560 regtry() should be needed.
561
562 REx compiler's optimizer found 4 possible hints:
563 a) Anchored substring;
564 b) Fixed substring;
565 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 566 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 567 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
568 string which does not contradict any of them.
569 */
2c2d71f5 570
6eb5f6b9
JH
571/* Most of decisions we do here should have been done at compile time.
572 The nodes of the REx which we used for the search should have been
573 deleted from the finite automaton. */
574
cad2e5aa 575char *
288b8c02 576Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 577 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 578{
97aff369 579 dVAR;
288b8c02 580 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 581 register I32 start_shift = 0;
cad2e5aa 582 /* Should be nonnegative! */
b7953727 583 register I32 end_shift = 0;
2c2d71f5
JH
584 register char *s;
585 register SV *check;
a1933d95 586 char *strbeg;
cad2e5aa 587 char *t;
f2ed9b32 588 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 589 I32 ml_anch;
bd61b366
SS
590 register char *other_last = NULL; /* other substr checked before this */
591 char *check_at = NULL; /* check substr found at this pos */
d8080198 592 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 593 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 594 RXi_GET_DECL(prog,progi);
30944b6d 595#ifdef DEBUGGING
890ce7af 596 const char * const i_strpos = strpos;
30944b6d 597#endif
a3621e74
YO
598 GET_RE_DEBUG_FLAGS_DECL;
599
7918f24d 600 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
601 PERL_UNUSED_ARG(flags);
602 PERL_UNUSED_ARG(data);
7918f24d 603
f2ed9b32 604 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 605
3c8556c3 606 if (RX_UTF8(rx)) {
b8d68ded
JH
607 PL_reg_flags |= RF_utf8;
608 }
ab3bbdeb 609 DEBUG_EXECUTE_r(
f2ed9b32 610 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
611 sv ? "Guessing start of match in sv for"
612 : "Guessing start of match in string for");
2a782b5b 613 );
cad2e5aa 614
c344f387
JH
615 /* CHR_DIST() would be more correct here but it makes things slow. */
616 if (prog->minlen > strend - strpos) {
a3621e74 617 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 618 "String too short... [re_intuit_start]\n"));
cad2e5aa 619 goto fail;
2c2d71f5 620 }
1de06328 621
a1933d95 622 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 623 PL_regeol = strend;
f2ed9b32 624 if (utf8_target) {
33b8afdf
JH
625 if (!prog->check_utf8 && prog->check_substr)
626 to_utf8_substr(prog);
627 check = prog->check_utf8;
628 } else {
629 if (!prog->check_substr && prog->check_utf8)
630 to_byte_substr(prog);
631 check = prog->check_substr;
632 }
1de06328 633 if (check == &PL_sv_undef) {
a3621e74 634 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 635 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
636 goto fail;
637 }
bbe252da
YO
638 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
639 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
640 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 641 && !multiline ) ); /* Check after \n? */
cad2e5aa 642
7e25d62c 643 if (!ml_anch) {
bbe252da
YO
644 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
645 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 646 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
647 && sv && !SvROK(sv)
648 && (strpos != strbeg)) {
a3621e74 649 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
650 goto fail;
651 }
652 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 653 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 654 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
655 I32 slen;
656
1aa99e6b 657 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 658
653099ff
GS
659 if (SvTAIL(check)) {
660 slen = SvCUR(check); /* >= 1 */
cad2e5aa 661
9041c2e3 662 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 663 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 664 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 665 goto fail_finish;
cad2e5aa
JH
666 }
667 /* Now should match s[0..slen-2] */
668 slen--;
3f7c398e 669 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 670 || (slen > 1
3f7c398e 671 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 672 report_neq:
a3621e74 673 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
674 goto fail_finish;
675 }
cad2e5aa 676 }
3f7c398e 677 else if (*SvPVX_const(check) != *s
653099ff 678 || ((slen = SvCUR(check)) > 1
3f7c398e 679 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 680 goto report_neq;
c315bfe8 681 check_at = s;
2c2d71f5 682 goto success_at_start;
7e25d62c 683 }
cad2e5aa 684 }
2c2d71f5 685 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 686 s = strpos;
2c2d71f5 687 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
688 end_shift = prog->check_end_shift;
689
2c2d71f5 690 if (!ml_anch) {
a3b680e6 691 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 692 - (SvTAIL(check) != 0);
a3b680e6 693 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
694
695 if (end_shift < eshift)
696 end_shift = eshift;
697 }
cad2e5aa 698 }
2c2d71f5 699 else { /* Can match at random position */
cad2e5aa
JH
700 ml_anch = 0;
701 s = strpos;
1de06328
YO
702 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
703 end_shift = prog->check_end_shift;
704
705 /* end shift should be non negative here */
cad2e5aa
JH
706 }
707
bcdf7404 708#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 709 if (end_shift < 0)
1de06328 710 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 711 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
712#endif
713
2c2d71f5
JH
714 restart:
715 /* Find a possible match in the region s..strend by looking for
716 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
717
718 {
719 I32 srch_start_shift = start_shift;
720 I32 srch_end_shift = end_shift;
c33e64f0
FC
721 U8* start_point;
722 U8* end_point;
1de06328
YO
723 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
724 srch_end_shift -= ((strbeg - s) - srch_start_shift);
725 srch_start_shift = strbeg - s;
726 }
6bda09f9 727 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
728 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
729 (IV)prog->check_offset_min,
730 (IV)srch_start_shift,
731 (IV)srch_end_shift,
732 (IV)prog->check_end_shift);
733 });
734
bbe252da 735 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
736 start_point= (U8*)(s + srch_start_shift);
737 end_point= (U8*)(strend - srch_end_shift);
738 } else {
739 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
740 end_point= HOP3(strend, -srch_end_shift, strbeg);
741 }
6bda09f9 742 DEBUG_OPTIMISE_MORE_r({
56570a2c 743 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 744 (int)(end_point - start_point),
fc8cd66c 745 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
746 start_point);
747 });
748
749 s = fbm_instr( start_point, end_point,
7fba1cd6 750 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 751 }
cad2e5aa
JH
752 /* Update the count-of-usability, remove useless subpatterns,
753 unshift s. */
2c2d71f5 754
ab3bbdeb 755 DEBUG_EXECUTE_r({
f2ed9b32 756 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
757 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
758 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 759 (s ? "Found" : "Did not find"),
f2ed9b32 760 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
761 ? "anchored" : "floating"),
762 quoted,
763 RE_SV_TAIL(check),
764 (s ? " at offset " : "...\n") );
765 });
2c2d71f5
JH
766
767 if (!s)
768 goto fail_finish;
2c2d71f5 769 /* Finish the diagnostic message */
a3621e74 770 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 771
1de06328
YO
772 /* XXX dmq: first branch is for positive lookbehind...
773 Our check string is offset from the beginning of the pattern.
774 So we need to do any stclass tests offset forward from that
775 point. I think. :-(
776 */
777
778
779
780 check_at=s;
781
782
2c2d71f5
JH
783 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
784 Start with the other substr.
785 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 786 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
787 *always* match. Probably should be marked during compile...
788 Probably it is right to do no SCREAM here...
789 */
790
f2ed9b32 791 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
792 : (prog->float_substr && prog->anchored_substr))
793 {
30944b6d 794 /* Take into account the "other" substring. */
2c2d71f5
JH
795 /* XXXX May be hopelessly wrong for UTF... */
796 if (!other_last)
6eb5f6b9 797 other_last = strpos;
f2ed9b32 798 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
799 do_other_anchored:
800 {
890ce7af
AL
801 char * const last = HOP3c(s, -start_shift, strbeg);
802 char *last1, *last2;
be8e71aa 803 char * const saved_s = s;
33b8afdf 804 SV* must;
2c2d71f5 805
2c2d71f5
JH
806 t = s - prog->check_offset_max;
807 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 808 && (!utf8_target
0ce71af7 809 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 810 && t > strpos)))
6f207bd3 811 NOOP;
2c2d71f5
JH
812 else
813 t = strpos;
1aa99e6b 814 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
815 if (t < other_last) /* These positions already checked */
816 t = other_last;
1aa99e6b 817 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
818 if (last < last1)
819 last1 = last;
1de06328
YO
820 /* XXXX It is not documented what units *_offsets are in.
821 We assume bytes, but this is clearly wrong.
822 Meaning this code needs to be carefully reviewed for errors.
823 dmq.
824 */
825
2c2d71f5 826 /* On end-of-str: see comment below. */
f2ed9b32 827 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
828 if (must == &PL_sv_undef) {
829 s = (char*)NULL;
1de06328 830 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
831 }
832 else
833 s = fbm_instr(
834 (unsigned char*)t,
835 HOP3(HOP3(last1, prog->anchored_offset, strend)
836 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
837 must,
7fba1cd6 838 multiline ? FBMrf_MULTILINE : 0
33b8afdf 839 );
ab3bbdeb 840 DEBUG_EXECUTE_r({
f2ed9b32 841 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
842 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
843 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 844 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
845 quoted, RE_SV_TAIL(must));
846 });
847
848
2c2d71f5
JH
849 if (!s) {
850 if (last1 >= last2) {
a3621e74 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
852 ", giving up...\n"));
853 goto fail_finish;
854 }
a3621e74 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 856 ", trying floating at offset %ld...\n",
be8e71aa 857 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
858 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
859 s = HOP3c(last, 1, strend);
2c2d71f5
JH
860 goto restart;
861 }
862 else {
a3621e74 863 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 864 (long)(s - i_strpos)));
1aa99e6b
IH
865 t = HOP3c(s, -prog->anchored_offset, strbeg);
866 other_last = HOP3c(s, 1, strend);
be8e71aa 867 s = saved_s;
2c2d71f5
JH
868 if (t == strpos)
869 goto try_at_start;
2c2d71f5
JH
870 goto try_at_offset;
871 }
30944b6d 872 }
2c2d71f5
JH
873 }
874 else { /* Take into account the floating substring. */
33b8afdf 875 char *last, *last1;
be8e71aa 876 char * const saved_s = s;
33b8afdf
JH
877 SV* must;
878
879 t = HOP3c(s, -start_shift, strbeg);
880 last1 = last =
881 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
882 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
883 last = HOP3c(t, prog->float_max_offset, strend);
884 s = HOP3c(t, prog->float_min_offset, strend);
885 if (s < other_last)
886 s = other_last;
2c2d71f5 887 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 888 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
889 /* fbm_instr() takes into account exact value of end-of-str
890 if the check is SvTAIL(ed). Since false positives are OK,
891 and end-of-str is not later than strend we are OK. */
892 if (must == &PL_sv_undef) {
893 s = (char*)NULL;
1de06328 894 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
895 }
896 else
2c2d71f5 897 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
898 (unsigned char*)last + SvCUR(must)
899 - (SvTAIL(must)!=0),
7fba1cd6 900 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 901 DEBUG_EXECUTE_r({
f2ed9b32 902 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
903 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
904 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 905 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
906 quoted, RE_SV_TAIL(must));
907 });
33b8afdf
JH
908 if (!s) {
909 if (last1 == last) {
a3621e74 910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
911 ", giving up...\n"));
912 goto fail_finish;
2c2d71f5 913 }
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 915 ", trying anchored starting at offset %ld...\n",
be8e71aa 916 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
917 other_last = last;
918 s = HOP3c(t, 1, strend);
919 goto restart;
920 }
921 else {
a3621e74 922 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
923 (long)(s - i_strpos)));
924 other_last = s; /* Fix this later. --Hugo */
be8e71aa 925 s = saved_s;
33b8afdf
JH
926 if (t == strpos)
927 goto try_at_start;
928 goto try_at_offset;
929 }
2c2d71f5 930 }
cad2e5aa 931 }
2c2d71f5 932
1de06328 933
9ef43ace 934 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 935
6bda09f9 936 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
937 PerlIO_printf(Perl_debug_log,
938 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
939 (IV)prog->check_offset_min,
940 (IV)prog->check_offset_max,
941 (IV)(s-strpos),
942 (IV)(t-strpos),
943 (IV)(t-s),
944 (IV)(strend-strpos)
945 )
946 );
947
2c2d71f5 948 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 949 && (!utf8_target
9ef43ace 950 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
951 && t > strpos)))
952 {
2c2d71f5
JH
953 /* Fixed substring is found far enough so that the match
954 cannot start at strpos. */
955 try_at_offset:
cad2e5aa 956 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
957 /* Eventually fbm_*() should handle this, but often
958 anchored_offset is not 0, so this check will not be wasted. */
959 /* XXXX In the code below we prefer to look for "^" even in
960 presence of anchored substrings. And we search even
961 beyond the found float position. These pessimizations
962 are historical artefacts only. */
963 find_anchor:
2c2d71f5 964 while (t < strend - prog->minlen) {
cad2e5aa 965 if (*t == '\n') {
4ee3650e 966 if (t < check_at - prog->check_offset_min) {
f2ed9b32 967 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
968 /* Since we moved from the found position,
969 we definitely contradict the found anchored
30944b6d
IZ
970 substr. Due to the above check we do not
971 contradict "check" substr.
972 Thus we can arrive here only if check substr
973 is float. Redo checking for "other"=="fixed".
974 */
9041c2e3 975 strpos = t + 1;
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 977 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
978 goto do_other_anchored;
979 }
4ee3650e
GS
980 /* We don't contradict the found floating substring. */
981 /* XXXX Why not check for STCLASS? */
cad2e5aa 982 s = t + 1;
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 984 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
985 goto set_useful;
986 }
4ee3650e
GS
987 /* Position contradicts check-string */
988 /* XXXX probably better to look for check-string
989 than for "\n", so one should lower the limit for t? */
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 991 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 992 other_last = strpos = s = t + 1;
cad2e5aa
JH
993 goto restart;
994 }
995 t++;
996 }
a3621e74 997 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 998 PL_colors[0], PL_colors[1]));
2c2d71f5 999 goto fail_finish;
cad2e5aa 1000 }
f5952150 1001 else {
a3621e74 1002 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1003 PL_colors[0], PL_colors[1]));
f5952150 1004 }
cad2e5aa
JH
1005 s = t;
1006 set_useful:
f2ed9b32 1007 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1008 }
1009 else {
f5952150 1010 /* The found string does not prohibit matching at strpos,
2c2d71f5 1011 - no optimization of calling REx engine can be performed,
f5952150
GS
1012 unless it was an MBOL and we are not after MBOL,
1013 or a future STCLASS check will fail this. */
2c2d71f5
JH
1014 try_at_start:
1015 /* Even in this situation we may use MBOL flag if strpos is offset
1016 wrt the start of the string. */
05b4157f 1017 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1018 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1019 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1020 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1021 {
cad2e5aa
JH
1022 t = strpos;
1023 goto find_anchor;
1024 }
a3621e74 1025 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1026 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1027 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1028 );
2c2d71f5 1029 success_at_start:
bbe252da 1030 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1031 && (utf8_target ? (
33b8afdf
JH
1032 prog->check_utf8 /* Could be deleted already */
1033 && --BmUSEFUL(prog->check_utf8) < 0
1034 && (prog->check_utf8 == prog->float_utf8)
1035 ) : (
1036 prog->check_substr /* Could be deleted already */
1037 && --BmUSEFUL(prog->check_substr) < 0
1038 && (prog->check_substr == prog->float_substr)
1039 )))
66e933ab 1040 {
cad2e5aa 1041 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1042 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1043 /* XXX Does the destruction order has to change with utf8_target? */
1044 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1045 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1046 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1047 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1048 check = NULL; /* abort */
cad2e5aa 1049 s = strpos;
486ec47a 1050 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1051 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1052 if (prog->intflags & PREGf_IMPLICIT)
1053 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1054 /* XXXX This is a remnant of the old implementation. It
1055 looks wasteful, since now INTUIT can use many
6eb5f6b9 1056 other heuristics. */
bbe252da 1057 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1058 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1059 }
1060 else
1061 s = strpos;
1062 }
1063
6eb5f6b9
JH
1064 /* Last resort... */
1065 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1066 /* trie stclasses are too expensive to use here, we are better off to
1067 leave it to regmatch itself */
f8fc2ecf 1068 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1069 /* minlen == 0 is possible if regstclass is \b or \B,
1070 and the fixed substr is ''$.
1071 Since minlen is already taken into account, s+1 is before strend;
1072 accidentally, minlen >= 1 guaranties no false positives at s + 1
1073 even for \b or \B. But (minlen? 1 : 0) below assumes that
1074 regstclass does not come from lookahead... */
1075 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1076 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1077 const U8* const str = (U8*)STRING(progi->regstclass);
1078 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1079 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1080 : 1);
1de06328
YO
1081 char * endpos;
1082 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1083 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1084 else if (prog->float_substr || prog->float_utf8)
1085 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1086 else
1087 endpos= strend;
1088
d8080198
YO
1089 if (checked_upto < s)
1090 checked_upto = s;
1091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1092 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1093
6eb5f6b9 1094 t = s;
d8080198
YO
1095 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1096 if (s) {
1097 checked_upto = s;
1098 } else {
6eb5f6b9 1099#ifdef DEBUGGING
cbbf8932 1100 const char *what = NULL;
6eb5f6b9
JH
1101#endif
1102 if (endpos == strend) {
a3621e74 1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1104 "Could not match STCLASS...\n") );
1105 goto fail;
1106 }
a3621e74 1107 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1108 "This position contradicts STCLASS...\n") );
bbe252da 1109 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1110 goto fail;
d8080198
YO
1111 checked_upto = HOPBACKc(endpos, start_shift);
1112 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1113 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1114 /* Contradict one of substrings */
33b8afdf 1115 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1116 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1117 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1118 hop_and_restart:
1aa99e6b 1119 s = HOP3c(t, 1, strend);
66e933ab
GS
1120 if (s + start_shift + end_shift > strend) {
1121 /* XXXX Should be taken into account earlier? */
a3621e74 1122 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1123 "Could not match STCLASS...\n") );
1124 goto fail;
1125 }
5e39e1e5
HS
1126 if (!check)
1127 goto giveup;
a3621e74 1128 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1129 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1130 what, (long)(s + start_shift - i_strpos)) );
1131 goto restart;
1132 }
66e933ab 1133 /* Have both, check_string is floating */
6eb5f6b9
JH
1134 if (t + start_shift >= check_at) /* Contradicts floating=check */
1135 goto retry_floating_check;
1136 /* Recheck anchored substring, but not floating... */
9041c2e3 1137 s = check_at;
5e39e1e5
HS
1138 if (!check)
1139 goto giveup;
a3621e74 1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1141 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1142 (long)(other_last - i_strpos)) );
1143 goto do_other_anchored;
1144 }
60e71179
GS
1145 /* Another way we could have checked stclass at the
1146 current position only: */
1147 if (ml_anch) {
1148 s = t = t + 1;
5e39e1e5
HS
1149 if (!check)
1150 goto giveup;
a3621e74 1151 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1152 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1153 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1154 goto try_at_offset;
66e933ab 1155 }
f2ed9b32 1156 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1157 goto fail;
486ec47a 1158 /* Check is floating substring. */
6eb5f6b9
JH
1159 retry_floating_check:
1160 t = check_at - start_shift;
a3621e74 1161 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1162 goto hop_and_restart;
1163 }
b7953727 1164 if (t != s) {
a3621e74 1165 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1166 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1167 (long)(t - i_strpos), (long)(s - i_strpos))
1168 );
1169 }
1170 else {
a3621e74 1171 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1172 "Does not contradict STCLASS...\n");
1173 );
1174 }
6eb5f6b9 1175 }
5e39e1e5 1176 giveup:
a3621e74 1177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1178 PL_colors[4], (check ? "Guessed" : "Giving up"),
1179 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1180 return s;
2c2d71f5
JH
1181
1182 fail_finish: /* Substring not found */
33b8afdf 1183 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1184 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1185 fail:
a3621e74 1186 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1187 PL_colors[4], PL_colors[5]));
bd61b366 1188 return NULL;
cad2e5aa 1189}
9661b544 1190
a0a388a1
YO
1191#define DECL_TRIE_TYPE(scan) \
1192 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1193 trie_type = ((scan->flags == EXACT) \
1194 ? (utf8_target ? trie_utf8 : trie_plain) \
1195 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1196
1197#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1198uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1199 STRLEN skiplen; \
1200 switch (trie_type) { \
1201 case trie_utf8_fold: \
1202 if ( foldlen>0 ) { \
1203 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1204 foldlen -= len; \
1205 uscan += len; \
1206 len=0; \
1207 } else { \
1208 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1209 len = UTF8SKIP(uc); \
1210 skiplen = UNISKIP( uvc ); \
1211 foldlen -= skiplen; \
1212 uscan = foldbuf + skiplen; \
1213 } \
1214 break; \
1215 case trie_latin_utf8_fold: \
1216 if ( foldlen>0 ) { \
1217 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1218 foldlen -= len; \
1219 uscan += len; \
1220 len=0; \
1221 } else { \
1222 len = 1; \
1223 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1224 skiplen = UNISKIP( uvc ); \
1225 foldlen -= skiplen; \
1226 uscan = foldbuf + skiplen; \
1227 } \
1228 break; \
1229 case trie_utf8: \
1230 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1231 break; \
1232 case trie_plain: \
1233 uvc = (UV)*uc; \
1234 len = 1; \
1235 } \
1236 if (uvc < 256) { \
1237 charid = trie->charmap[ uvc ]; \
1238 } \
1239 else { \
1240 charid = 0; \
1241 if (widecharmap) { \
1242 SV** const svpp = hv_fetch(widecharmap, \
1243 (char*)&uvc, sizeof(UV), 0); \
1244 if (svpp) \
1245 charid = (U16)SvIV(*svpp); \
1246 } \
1247 } \
4cadc6a9
YO
1248} STMT_END
1249
4cadc6a9
YO
1250#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1251STMT_START { \
1252 while (s <= e) { \
1253 if ( (CoNd) \
fac1af77 1254 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1255 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1256 goto got_it; \
1257 s++; \
1258 } \
1259} STMT_END
1260
1261#define REXEC_FBC_UTF8_SCAN(CoDe) \
1262STMT_START { \
1263 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1264 CoDe \
1265 s += uskip; \
1266 } \
1267} STMT_END
1268
1269#define REXEC_FBC_SCAN(CoDe) \
1270STMT_START { \
1271 while (s < strend) { \
1272 CoDe \
1273 s++; \
1274 } \
1275} STMT_END
1276
1277#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1278REXEC_FBC_UTF8_SCAN( \
1279 if (CoNd) { \
24b23f37 1280 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1281 goto got_it; \
1282 else \
1283 tmp = doevery; \
1284 } \
1285 else \
1286 tmp = 1; \
1287)
1288
1289#define REXEC_FBC_CLASS_SCAN(CoNd) \
1290REXEC_FBC_SCAN( \
1291 if (CoNd) { \
24b23f37 1292 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1293 goto got_it; \
1294 else \
1295 tmp = doevery; \
1296 } \
1297 else \
1298 tmp = 1; \
1299)
1300
1301#define REXEC_FBC_TRYIT \
24b23f37 1302if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1303 goto got_it
1304
e1d1eefb 1305#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1306 if (utf8_target) { \
e1d1eefb
YO
1307 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1308 } \
1309 else { \
1310 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1311 }
e1d1eefb 1312
4cadc6a9 1313#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1314 if (utf8_target) { \
4cadc6a9
YO
1315 UtFpReLoAd; \
1316 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1317 } \
1318 else { \
1319 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1320 }
4cadc6a9
YO
1321
1322#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1323 PL_reg_flags |= RF_tainted; \
f2ed9b32 1324 if (utf8_target) { \
4cadc6a9
YO
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1326 } \
1327 else { \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1329 }
4cadc6a9 1330
786e8c11
YO
1331#define DUMP_EXEC_POS(li,s,doutf8) \
1332 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1333
cfaf538b
KW
1334
1335#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1336 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1337 tmp = TEST_NON_UTF8(tmp); \
1338 REXEC_FBC_UTF8_SCAN( \
1339 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1340 tmp = !tmp; \
1341 IF_SUCCESS; \
1342 } \
1343 else { \
1344 IF_FAIL; \
1345 } \
1346 ); \
1347
1348#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1349 if (s == PL_bostr) { \
1350 tmp = '\n'; \
1351 } \
1352 else { \
1353 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1354 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1355 } \
1356 tmp = TeSt1_UtF8; \
1357 LOAD_UTF8_CHARCLASS_ALNUM(); \
1358 REXEC_FBC_UTF8_SCAN( \
1359 if (tmp == ! (TeSt2_UtF8)) { \
1360 tmp = !tmp; \
1361 IF_SUCCESS; \
1362 } \
1363 else { \
1364 IF_FAIL; \
1365 } \
1366 ); \
1367
63ac0dad
KW
1368/* The only difference between the BOUND and NBOUND cases is that
1369 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1370 * NBOUND. This is accomplished by passing it in either the if or else clause,
1371 * with the other one being empty */
1372#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1373 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1374
1375#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1376 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1377
1378#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1379 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1380
1381#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1382 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1383
63ac0dad
KW
1384
1385/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1386 * be passed in completely with the variable name being tested, which isn't
1387 * such a clean interface, but this is easier to read than it was before. We
1388 * are looking for the boundary (or non-boundary between a word and non-word
1389 * character. The utf8 and non-utf8 cases have the same logic, but the details
1390 * must be different. Find the "wordness" of the character just prior to this
1391 * one, and compare it with the wordness of this one. If they differ, we have
1392 * a boundary. At the beginning of the string, pretend that the previous
1393 * character was a new-line */
cfaf538b 1394#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1395 if (utf8_target) { \
cfaf538b 1396 UTF8_CODE \
63ac0dad
KW
1397 } \
1398 else { /* Not utf8 */ \
1399 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1400 tmp = TEST_NON_UTF8(tmp); \
1401 REXEC_FBC_SCAN( \
1402 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1403 tmp = !tmp; \
1404 IF_SUCCESS; \
1405 } \
1406 else { \
1407 IF_FAIL; \
1408 } \
1409 ); \
1410 } \
1411 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1412 goto got_it;
1413
786e8c11
YO
1414/* We know what class REx starts with. Try to find this position... */
1415/* if reginfo is NULL, its a dryrun */
1416/* annoyingly all the vars in this routine have different names from their counterparts
1417 in regmatch. /grrr */
1418
3c3eec57 1419STATIC char *
07be1b83 1420S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1421 const char *strend, regmatch_info *reginfo)
a687059c 1422{
27da23d5 1423 dVAR;
bbe252da 1424 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1425 char *pat_string; /* The pattern's exactish string */
1426 char *pat_end; /* ptr to end char of pat_string */
1427 re_fold_t folder; /* Function for computing non-utf8 folds */
1428 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1429 STRLEN ln;
5dab1207 1430 STRLEN lnc;
078c425b 1431 register STRLEN uskip;
fac1af77
KW
1432 U8 c1;
1433 U8 c2;
6eb5f6b9
JH
1434 char *e;
1435 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1436 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1437 UV utf8_fold_flags = 0;
f8fc2ecf 1438 RXi_GET_DECL(prog,progi);
7918f24d
NC
1439
1440 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1441
6eb5f6b9
JH
1442 /* We know what class it must start with. */
1443 switch (OP(c)) {
f56b6394 1444 case ANYOFV:
6eb5f6b9 1445 case ANYOF:
f56b6394 1446 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1447 STRLEN inclasslen = strend - s;
1448 REXEC_FBC_UTF8_CLASS_SCAN(
1449 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1450 }
1451 else {
6ef69d56 1452 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1453 }
6eb5f6b9 1454 break;
f33976b4 1455 case CANY:
4cadc6a9 1456 REXEC_FBC_SCAN(
24b23f37 1457 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1458 goto got_it;
1459 else
1460 tmp = doevery;
4cadc6a9 1461 );
f33976b4 1462 break;
2f7f8cb1
KW
1463
1464 case EXACTFA:
1465 if (UTF_PATTERN || utf8_target) {
1466 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1467 goto do_exactf_utf8;
1468 }
1469 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1470 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1471 goto do_exactf_non_utf8; /* isn't dealt with by these */
1472
6eb5f6b9 1473 case EXACTF:
62bf7766 1474 if (utf8_target) {
77a6d856
KW
1475
1476 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1477 utf8_fold_flags = 0;
fac1af77
KW
1478 goto do_exactf_utf8;
1479 }
1480 fold_array = PL_fold;
1481 folder = foldEQ;
1482 goto do_exactf_non_utf8;
1483
1484 case EXACTFL:
1485 if (UTF_PATTERN || utf8_target) {
17580e7a 1486 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1487 goto do_exactf_utf8;
1488 }
1489 fold_array = PL_fold_locale;
1490 folder = foldEQ_locale;
16d951b7
KW
1491 goto do_exactf_non_utf8;
1492
3c760661
KW
1493 case EXACTFU_SS:
1494 if (UTF_PATTERN) {
1495 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1496 }
1497 goto do_exactf_utf8;
1498
fab2782b 1499 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1500 case EXACTFU:
1501 if (UTF_PATTERN || utf8_target) {
77a6d856 1502 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1503 goto do_exactf_utf8;
1504 }
1505
1506 /* Any 'ss' in the pattern should have been replaced by regcomp,
1507 * so we don't have to worry here about this single special case
1508 * in the Latin1 range */
1509 fold_array = PL_fold_latin1;
1510 folder = foldEQ_latin1;
fac1af77
KW
1511
1512 /* FALL THROUGH */
1513
62bf7766
KW
1514 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1515 are no glitches with fold-length differences
1516 between the target string and pattern */
fac1af77
KW
1517
1518 /* The idea in the non-utf8 EXACTF* cases is to first find the
1519 * first character of the EXACTF* node and then, if necessary,
1520 * case-insensitively compare the full text of the node. c1 is the
1521 * first character. c2 is its fold. This logic will not work for
1522 * Unicode semantics and the german sharp ss, which hence should
1523 * not be compiled into a node that gets here. */
1524 pat_string = STRING(c);
1525 ln = STR_LEN(c); /* length to match in octets/bytes */
1526
8a90a8fe
KW
1527 /* We know that we have to match at least 'ln' bytes (which is the
1528 * same as characters, since not utf8). If we have to match 3
1529 * characters, and there are only 2 availabe, we know without
1530 * trying that it will fail; so don't start a match past the
1531 * required minimum number from the far end */
fac1af77
KW
1532 e = HOP3c(strend, -((I32)ln), s);
1533
1534 if (!reginfo && e < s) {
1535 e = s; /* Due to minlen logic of intuit() */
1536 }
1537
1538 c1 = *pat_string;
1539 c2 = fold_array[c1];
1540 if (c1 == c2) { /* If char and fold are the same */
1541 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1542 }
1543 else {
1544 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1545 }
1546 break;
1547
1548 do_exactf_utf8:
e067297c
KW
1549 {
1550 unsigned expansion;
1551
fac1af77
KW
1552
1553 /* If one of the operands is in utf8, we can't use the simpler
1554 * folding above, due to the fact that many different characters
1555 * can have the same fold, or portion of a fold, or different-
1556 * length fold */
1557 pat_string = STRING(c);
1558 ln = STR_LEN(c); /* length to match in octets/bytes */
1559 pat_end = pat_string + ln;
1560 lnc = (UTF_PATTERN) /* length to match in characters */
1561 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1562 : ln;
1563
e067297c
KW
1564 /* We have 'lnc' characters to match in the pattern, but because of
1565 * multi-character folding, each character in the target can match
1566 * up to 3 characters (Unicode guarantees it will never exceed
1567 * this) if it is utf8-encoded; and up to 2 if not (based on the
1568 * fact that the Latin 1 folds are already determined, and the
1569 * only multi-char fold in that range is the sharp-s folding to
1570 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1571 * string character. Adjust lnc accordingly, rounding up, so that
1572 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1573 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1574 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1575
1576 /* As in the non-UTF8 case, if we have to match 3 characters, and
1577 * only 2 are left, it's guaranteed to fail, so don't start a
1578 * match that would require us to go beyond the end of the string
1579 */
1580 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1581
1582 if (!reginfo && e < s) {
1583 e = s; /* Due to minlen logic of intuit() */
1584 }
1585
b33105db
KW
1586 /* XXX Note that we could recalculate e to stop the loop earlier,
1587 * as the worst case expansion above will rarely be met, and as we
1588 * go along we would usually find that e moves further to the left.
1589 * This would happen only after we reached the point in the loop
1590 * where if there were no expansion we should fail. Unclear if
1591 * worth the expense */
e067297c 1592
fac1af77
KW
1593 while (s <= e) {
1594 char *my_strend= (char *)strend;
1595 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1596 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1597 && (!reginfo || regtry(reginfo, &s)) )
1598 {
1599 goto got_it;
1600 }
bbdd8bad 1601 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1602 }
1603 break;
e067297c 1604 }
bbce6d69 1605 case BOUNDL:
3280af22 1606 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1607 FBC_BOUND(isALNUM_LC,
1608 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1609 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1610 break;
bbce6d69 1611 case NBOUNDL:
3280af22 1612 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1613 FBC_NBOUND(isALNUM_LC,
1614 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1615 isALNUM_LC_utf8((U8*)s));
1616 break;
1617 case BOUND:
1618 FBC_BOUND(isWORDCHAR,
1619 isALNUM_uni(tmp),
1620 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1621 break;
cfaf538b
KW
1622 case BOUNDA:
1623 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1624 isWORDCHAR_A(tmp),
1625 isWORDCHAR_A((U8*)s));
1626 break;
a0d0e21e 1627 case NBOUND:
63ac0dad
KW
1628 FBC_NBOUND(isWORDCHAR,
1629 isALNUM_uni(tmp),
1630 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631 break;
cfaf538b
KW
1632 case NBOUNDA:
1633 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1634 isWORDCHAR_A(tmp),
1635 isWORDCHAR_A((U8*)s));
1636 break;
63ac0dad
KW
1637 case BOUNDU:
1638 FBC_BOUND(isWORDCHAR_L1,
1639 isALNUM_uni(tmp),
1640 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1641 break;
1642 case NBOUNDU:
1643 FBC_NBOUND(isWORDCHAR_L1,
1644 isALNUM_uni(tmp),
1645 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1646 break;
bbce6d69 1647 case ALNUML:
4cadc6a9
YO
1648 REXEC_FBC_CSCAN_TAINT(
1649 isALNUM_LC_utf8((U8*)s),
1650 isALNUM_LC(*s)
1651 );
6895a8aa 1652 break;
980866de
KW
1653 case ALNUMU:
1654 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1655 LOAD_UTF8_CHARCLASS_ALNUM(),
1656 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1657 isWORDCHAR_L1((U8) *s)
1658 );
6895a8aa 1659 break;
980866de
KW
1660 case ALNUM:
1661 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1662 LOAD_UTF8_CHARCLASS_ALNUM(),
1663 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1664 isWORDCHAR((U8) *s)
1665 );
6895a8aa 1666 break;
cfaf538b 1667 case ALNUMA:
8e9da4d4
KW
1668 /* Don't need to worry about utf8, as it can match only a single
1669 * byte invariant character */
cfaf538b 1670 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1671 break;
980866de
KW
1672 case NALNUMU:
1673 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1674 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1675 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1676 ! isWORDCHAR_L1((U8) *s)
1677 );
6895a8aa 1678 break;
a0d0e21e 1679 case NALNUM:
4cadc6a9 1680 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1681 LOAD_UTF8_CHARCLASS_ALNUM(),
1682 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1683 ! isALNUM(*s)
4cadc6a9 1684 );
6895a8aa 1685 break;
cfaf538b 1686 case NALNUMA:
8e9da4d4
KW
1687 REXEC_FBC_CSCAN(
1688 !isWORDCHAR_A(*s),
1689 !isWORDCHAR_A(*s)
1690 );
1691 break;
bbce6d69 1692 case NALNUML:
4cadc6a9
YO
1693 REXEC_FBC_CSCAN_TAINT(
1694 !isALNUM_LC_utf8((U8*)s),
1695 !isALNUM_LC(*s)
1696 );
6895a8aa 1697 break;
980866de
KW
1698 case SPACEU:
1699 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1700 LOAD_UTF8_CHARCLASS_SPACE(),
1701 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1702 isSPACE_L1((U8) *s)
1703 );
6895a8aa 1704 break;
a0d0e21e 1705 case SPACE:
4cadc6a9 1706 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1707 LOAD_UTF8_CHARCLASS_SPACE(),
1708 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1709 isSPACE((U8) *s)
4cadc6a9 1710 );
6895a8aa 1711 break;
cfaf538b 1712 case SPACEA:
8e9da4d4
KW
1713 /* Don't need to worry about utf8, as it can match only a single
1714 * byte invariant character */
cfaf538b 1715 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1716 break;
bbce6d69 1717 case SPACEL:
4cadc6a9 1718 REXEC_FBC_CSCAN_TAINT(
6bbba904 1719 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1720 isSPACE_LC(*s)
1721 );
6895a8aa 1722 break;
980866de
KW
1723 case NSPACEU:
1724 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1725 LOAD_UTF8_CHARCLASS_SPACE(),
1726 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1727 ! isSPACE_L1((U8) *s)
1728 );
6895a8aa 1729 break;
a0d0e21e 1730 case NSPACE:
4cadc6a9 1731 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1732 LOAD_UTF8_CHARCLASS_SPACE(),
1733 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1734 ! isSPACE((U8) *s)
4cadc6a9 1735 );
6895a8aa 1736 break;
cfaf538b 1737 case NSPACEA:
8e9da4d4
KW
1738 REXEC_FBC_CSCAN(
1739 !isSPACE_A(*s),
1740 !isSPACE_A(*s)
1741 );
1742 break;
bbce6d69 1743 case NSPACEL:
4cadc6a9 1744 REXEC_FBC_CSCAN_TAINT(
6bbba904 1745 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1746 !isSPACE_LC(*s)
1747 );
6895a8aa 1748 break;
a0d0e21e 1749 case DIGIT:
4cadc6a9 1750 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1751 LOAD_UTF8_CHARCLASS_DIGIT(),
1752 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1753 isDIGIT(*s)
1754 );
6895a8aa 1755 break;
cfaf538b 1756 case DIGITA:
8e9da4d4
KW
1757 /* Don't need to worry about utf8, as it can match only a single
1758 * byte invariant character */
cfaf538b 1759 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1760 break;
b8c5462f 1761 case DIGITL:
4cadc6a9
YO
1762 REXEC_FBC_CSCAN_TAINT(
1763 isDIGIT_LC_utf8((U8*)s),
1764 isDIGIT_LC(*s)
1765 );
6895a8aa 1766 break;
a0d0e21e 1767 case NDIGIT:
4cadc6a9 1768 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1769 LOAD_UTF8_CHARCLASS_DIGIT(),
1770 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1771 !isDIGIT(*s)
1772 );
6895a8aa 1773 break;
cfaf538b 1774 case NDIGITA:
8e9da4d4
KW
1775 REXEC_FBC_CSCAN(
1776 !isDIGIT_A(*s),
1777 !isDIGIT_A(*s)
1778 );
1779 break;
b8c5462f 1780 case NDIGITL:
4cadc6a9
YO
1781 REXEC_FBC_CSCAN_TAINT(
1782 !isDIGIT_LC_utf8((U8*)s),
1783 !isDIGIT_LC(*s)
1784 );
6895a8aa 1785 break;
e1d1eefb
YO
1786 case LNBREAK:
1787 REXEC_FBC_CSCAN(
1788 is_LNBREAK_utf8(s),
1789 is_LNBREAK_latin1(s)
1790 );
6895a8aa 1791 break;
e1d1eefb
YO
1792 case VERTWS:
1793 REXEC_FBC_CSCAN(
1794 is_VERTWS_utf8(s),
1795 is_VERTWS_latin1(s)
1796 );
6895a8aa 1797 break;
e1d1eefb
YO
1798 case NVERTWS:
1799 REXEC_FBC_CSCAN(
1800 !is_VERTWS_utf8(s),
1801 !is_VERTWS_latin1(s)
1802 );
6895a8aa 1803 break;
e1d1eefb
YO
1804 case HORIZWS:
1805 REXEC_FBC_CSCAN(
1806 is_HORIZWS_utf8(s),
1807 is_HORIZWS_latin1(s)
1808 );
6895a8aa 1809 break;
e1d1eefb
YO
1810 case NHORIZWS:
1811 REXEC_FBC_CSCAN(
1812 !is_HORIZWS_utf8(s),
1813 !is_HORIZWS_latin1(s)
1814 );
6895a8aa 1815 break;
1de06328
YO
1816 case AHOCORASICKC:
1817 case AHOCORASICK:
07be1b83 1818 {
a0a388a1 1819 DECL_TRIE_TYPE(c);
07be1b83
YO
1820 /* what trie are we using right now */
1821 reg_ac_data *aho
f8fc2ecf 1822 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1823 reg_trie_data *trie
1824 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1825 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1826
1827 const char *last_start = strend - trie->minlen;
6148ee25 1828#ifdef DEBUGGING
07be1b83 1829 const char *real_start = s;
6148ee25 1830#endif
07be1b83 1831 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1832 SV *sv_points;
1833 U8 **points; /* map of where we were in the input string
786e8c11 1834 when reading a given char. For ASCII this
be8e71aa 1835 is unnecessary overhead as the relationship
38a44b82
NC
1836 is always 1:1, but for Unicode, especially
1837 case folded Unicode this is not true. */
f9e705e8 1838 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1839 U8 *bitmap=NULL;
1840
07be1b83
YO
1841
1842 GET_RE_DEBUG_FLAGS_DECL;
1843
be8e71aa
YO
1844 /* We can't just allocate points here. We need to wrap it in
1845 * an SV so it gets freed properly if there is a croak while
1846 * running the match */
1847 ENTER;
1848 SAVETMPS;
1849 sv_points=newSV(maxlen * sizeof(U8 *));
1850 SvCUR_set(sv_points,
1851 maxlen * sizeof(U8 *));
1852 SvPOK_on(sv_points);
1853 sv_2mortal(sv_points);
1854 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1855 if ( trie_type != trie_utf8_fold
1856 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1857 {
786e8c11
YO
1858 if (trie->bitmap)
1859 bitmap=(U8*)trie->bitmap;
1860 else
1861 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1862 }
786e8c11
YO
1863 /* this is the Aho-Corasick algorithm modified a touch
1864 to include special handling for long "unknown char"
1865 sequences. The basic idea being that we use AC as long
1866 as we are dealing with a possible matching char, when
1867 we encounter an unknown char (and we have not encountered
1868 an accepting state) we scan forward until we find a legal
1869 starting char.
1870 AC matching is basically that of trie matching, except
1871 that when we encounter a failing transition, we fall back
1872 to the current states "fail state", and try the current char
1873 again, a process we repeat until we reach the root state,
1874 state 1, or a legal transition. If we fail on the root state
1875 then we can either terminate if we have reached an accepting
1876 state previously, or restart the entire process from the beginning
1877 if we have not.
1878
1879 */
07be1b83
YO
1880 while (s <= last_start) {
1881 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1882 U8 *uc = (U8*)s;
1883 U16 charid = 0;
1884 U32 base = 1;
1885 U32 state = 1;
1886 UV uvc = 0;
1887 STRLEN len = 0;
1888 STRLEN foldlen = 0;
1889 U8 *uscan = (U8*)NULL;
1890 U8 *leftmost = NULL;
786e8c11
YO
1891#ifdef DEBUGGING
1892 U32 accepted_word= 0;
1893#endif
07be1b83
YO
1894 U32 pointpos = 0;
1895
1896 while ( state && uc <= (U8*)strend ) {
1897 int failed=0;
786e8c11
YO
1898 U32 word = aho->states[ state ].wordnum;
1899
1de06328
YO
1900 if( state==1 ) {
1901 if ( bitmap ) {
1902 DEBUG_TRIE_EXECUTE_r(
1903 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1904 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1905 (char *)uc, utf8_target );
1de06328
YO
1906 PerlIO_printf( Perl_debug_log,
1907 " Scanning for legal start char...\n");
1908 }
d085b490
YO
1909 );
1910 if (utf8_target) {
1911 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1912 uc += UTF8SKIP(uc);
1913 }
1914 } else {
1915 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1916 uc++;
1917 }
1918 }
1de06328 1919 s= (char *)uc;
786e8c11 1920 }
786e8c11
YO
1921 if (uc >(U8*)last_start) break;
1922 }
1923
1924 if ( word ) {
2e64971a 1925 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1926 if (!leftmost || lpos < leftmost) {
1927 DEBUG_r(accepted_word=word);
07be1b83 1928 leftmost= lpos;
786e8c11 1929 }
07be1b83 1930 if (base==0) break;
786e8c11 1931
07be1b83
YO
1932 }
1933 points[pointpos++ % maxlen]= uc;
55eed653
NC
1934 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1935 uscan, len, uvc, charid, foldlen,
1936 foldbuf, uniflags);
786e8c11
YO
1937 DEBUG_TRIE_EXECUTE_r({
1938 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1939 s, utf8_target );
07be1b83 1940 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1941 " Charid:%3u CP:%4"UVxf" ",
1942 charid, uvc);
1943 });
07be1b83
YO
1944
1945 do {
6148ee25 1946#ifdef DEBUGGING
786e8c11 1947 word = aho->states[ state ].wordnum;
6148ee25 1948#endif
07be1b83
YO
1949 base = aho->states[ state ].trans.base;
1950
786e8c11
YO
1951 DEBUG_TRIE_EXECUTE_r({
1952 if (failed)
1953 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1954 s, utf8_target );
07be1b83 1955 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1956 "%sState: %4"UVxf", word=%"UVxf,
1957 failed ? " Fail transition to " : "",
1958 (UV)state, (UV)word);
1959 });
07be1b83
YO
1960 if ( base ) {
1961 U32 tmp;
6dd2be57 1962 I32 offset;
07be1b83 1963 if (charid &&
6dd2be57
DM
1964 ( ((offset = base + charid
1965 - 1 - trie->uniquecharcount)) >= 0)
1966 && ((U32)offset < trie->lasttrans)
1967 && trie->trans[offset].check == state
1968 && (tmp=trie->trans[offset].next))
07be1b83 1969 {
786e8c11
YO
1970 DEBUG_TRIE_EXECUTE_r(
1971 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1972 state = tmp;
1973 break;
1974 }
1975 else {
786e8c11
YO
1976 DEBUG_TRIE_EXECUTE_r(
1977 PerlIO_printf( Perl_debug_log," - fail\n"));
1978 failed = 1;
1979 state = aho->fail[state];
07be1b83
YO
1980 }
1981 }
1982 else {
1983 /* we must be accepting here */
786e8c11
YO
1984 DEBUG_TRIE_EXECUTE_r(
1985 PerlIO_printf( Perl_debug_log," - accepting\n"));
1986 failed = 1;
07be1b83
YO
1987 break;
1988 }
1989 } while(state);
786e8c11 1990 uc += len;
07be1b83
YO
1991 if (failed) {
1992 if (leftmost)
1993 break;
786e8c11 1994 if (!state) state = 1;
07be1b83
YO
1995 }
1996 }
1997 if ( aho->states[ state ].wordnum ) {
2e64971a 1998 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1999 if (!leftmost || lpos < leftmost) {
2000 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2001 leftmost = lpos;
786e8c11 2002 }
07be1b83 2003 }
07be1b83
YO
2004 if (leftmost) {
2005 s = (char*)leftmost;
786e8c11
YO
2006 DEBUG_TRIE_EXECUTE_r({
2007 PerlIO_printf(
70685ca0
JH
2008 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2009 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2010 );
2011 });
24b23f37 2012 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2013 FREETMPS;
2014 LEAVE;
07be1b83 2015 goto got_it;
be8e71aa 2016 }
07be1b83 2017 s = HOPc(s,1);
786e8c11
YO
2018 DEBUG_TRIE_EXECUTE_r({
2019 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2020 });
07be1b83 2021 } else {
786e8c11
YO
2022 DEBUG_TRIE_EXECUTE_r(
2023 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2024 break;
2025 }
2026 }
be8e71aa
YO
2027 FREETMPS;
2028 LEAVE;
07be1b83
YO
2029 }
2030 break;
b3c9acc1 2031 default:
3c3eec57
GS
2032 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2033 break;
d6a28714 2034 }
6eb5f6b9
JH
2035 return 0;
2036 got_it:
2037 return s;
2038}
2039
fae667d5 2040
6eb5f6b9
JH
2041/*
2042 - regexec_flags - match a regexp against a string
2043 */
2044I32
288b8c02 2045Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2046 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2047/* strend: pointer to null at end of string */
2048/* strbeg: real beginning of string */
2049/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2050/* data: May be used for some additional optimizations.
2051 Currently its only used, with a U32 cast, for transmitting
2052 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2053/* nosave: For optimizations. */
2054{
97aff369 2055 dVAR;
288b8c02 2056 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2057 /*register*/ char *s;
6eb5f6b9 2058 register regnode *c;
24b23f37 2059 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2060 I32 minlen; /* must match at least this many chars */
2061 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2062 I32 end_shift = 0; /* Same for the end. */ /* CC */
2063 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2064 char *scream_olds = NULL;
f2ed9b32 2065 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2066 I32 multiline;
f8fc2ecf 2067 RXi_GET_DECL(prog,progi);
3b0527fe 2068 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2069 regexp_paren_pair *swap = NULL;
a3621e74
YO
2070 GET_RE_DEBUG_FLAGS_DECL;
2071
7918f24d 2072 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2073 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2074
2075 /* Be paranoid... */
2076 if (prog == NULL || startpos == NULL) {
2077 Perl_croak(aTHX_ "NULL regexp parameter");
2078 return 0;
2079 }
2080
bbe252da 2081 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2082 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2083
f2ed9b32 2084 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2085 DEBUG_EXECUTE_r(
f2ed9b32 2086 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2087 "Matching");
2088 );
bac06658 2089
6eb5f6b9 2090 minlen = prog->minlen;
1de06328
YO
2091
2092 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2094 "String too short [regexec_flags]...\n"));
2095 goto phooey;
1aa99e6b 2096 }
6eb5f6b9 2097
1de06328 2098
6eb5f6b9 2099 /* Check validity of program. */
f8fc2ecf 2100 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2101 Perl_croak(aTHX_ "corrupted regexp program");
2102 }
2103
2104 PL_reg_flags = 0;
ed301438 2105 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2106 PL_reg_maxiter = 0;
2107
3c8556c3 2108 if (RX_UTF8(rx))
6eb5f6b9
JH
2109 PL_reg_flags |= RF_utf8;
2110
2111 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2112 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2113 PL_bostr = strbeg;
3b0527fe 2114 reginfo.sv = sv;
6eb5f6b9
JH
2115
2116 /* Mark end of line for $ (and such) */
2117 PL_regeol = strend;
2118
2119 /* see how far we have to get to not match where we matched before */
3b0527fe 2120 reginfo.till = startpos+minend;
6eb5f6b9 2121
6eb5f6b9
JH
2122 /* If there is a "must appear" string, look for it. */
2123 s = startpos;
2124
bbe252da 2125 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2126 MAGIC *mg;
2c296965 2127 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2128 reginfo.ganch = startpos + prog->gofs;
2c296965 2129 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2130 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2131 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2132 && SvMAGIC(sv)
14befaf4
DM
2133 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2134 && mg->mg_len >= 0) {
3b0527fe 2135 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2138
bbe252da 2139 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2140 if (s > reginfo.ganch)
6eb5f6b9 2141 goto phooey;
58e23c8d 2142 s = reginfo.ganch - prog->gofs;
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2145 if (s < strbeg)
2146 goto phooey;
6eb5f6b9
JH
2147 }
2148 }
58e23c8d 2149 else if (data) {
70685ca0 2150 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2151 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2153
2154 } else { /* pos() not defined */
3b0527fe 2155 reginfo.ganch = strbeg;
2c296965
YO
2156 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2157 "GPOS: reginfo.ganch = strbeg\n"));
2158 }
6eb5f6b9 2159 }
288b8c02 2160 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2161 /* We have to be careful. If the previous successful match
2162 was from this regex we don't want a subsequent partially
2163 successful match to clobber the old results.
2164 So when we detect this possibility we add a swap buffer
2165 to the re, and switch the buffer each match. If we fail
2166 we switch it back, otherwise we leave it swapped.
2167 */
2168 swap = prog->offs;
2169 /* do we need a save destructor here for eval dies? */
2170 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2171 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2172 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2173 PTR2UV(prog),
2174 PTR2UV(swap),
2175 PTR2UV(prog->offs)
2176 ));
c74340f9 2177 }
a0714e2c 2178 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2179 re_scream_pos_data d;
2180
2181 d.scream_olds = &scream_olds;
2182 d.scream_pos = &scream_pos;
288b8c02 2183 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2184 if (!s) {
a3621e74 2185 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2186 goto phooey; /* not present */
3fa9c3d7 2187 }
6eb5f6b9
JH
2188 }
2189
1de06328 2190
6eb5f6b9
JH
2191
2192 /* Simplest case: anchored match need be tried only once. */
2193 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2194 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2195 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2196 goto got_it;
bbe252da
YO
2197 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2198 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2199 {
2200 char *end;
2201
2202 if (minlen)
2203 dontbother = minlen - 1;
1aa99e6b 2204 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2205 /* for multiline we only have to try after newlines */
33b8afdf 2206 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2207 /* because of the goto we can not easily reuse the macros for bifurcating the
2208 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2209 if (utf8_target) {
2210 if (s == startpos)
2211 goto after_try_utf8;
2212 while (1) {
2213 if (regtry(&reginfo, &s)) {
2214 goto got_it;
2215 }
2216 after_try_utf8:
2217 if (s > end) {
2218 goto phooey;
2219 }
2220 if (prog->extflags & RXf_USE_INTUIT) {
2221 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2222 if (!s) {
2223 goto phooey;
2224 }
2225 }
2226 else {
2227 s += UTF8SKIP(s);
2228 }
2229 }
2230 } /* end search for check string in unicode */
2231 else {
2232 if (s == startpos) {
2233 goto after_try_latin;
2234 }
2235 while (1) {
2236 if (regtry(&reginfo, &s)) {
2237 goto got_it;
2238 }
2239 after_try_latin:
2240 if (s > end) {
2241 goto phooey;
2242 }
2243 if (prog->extflags & RXf_USE_INTUIT) {
2244 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2245 if (!s) {
2246 goto phooey;
2247 }
2248 }
2249 else {
2250 s++;
2251 }
2252 }
2253 } /* end search for check string in latin*/
2254 } /* end search for check string */
2255 else { /* search for newline */
2256 if (s > startpos) {
2257 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2258 s--;
92f3d482 2259 }
21eede78
YO
2260 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2261 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2262 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2263 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2264 goto got_it;
2265 }
92f3d482
YO
2266 }
2267 } /* end search for newline */
2268 } /* end anchored/multiline check string search */
6eb5f6b9 2269 goto phooey;
bbe252da 2270 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2271 {
486ec47a 2272 /* the warning about reginfo.ganch being used without initialization
bbe252da 2273 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2274 and we only enter this block when the same bit is set. */
58e23c8d 2275 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2276
2277 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2278 goto got_it;
2279 goto phooey;
2280 }
2281
2282 /* Messy cases: unanchored match. */
bbe252da 2283 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2284 /* we have /x+whatever/ */
f2ed9b32 2285 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2286 char ch;
bf93d4cc
GS
2287#ifdef DEBUGGING
2288 int did_match = 0;
2289#endif
f2ed9b32
KW
2290 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2291 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2292 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2293
f2ed9b32 2294 if (utf8_target) {
4cadc6a9 2295 REXEC_FBC_SCAN(
6eb5f6b9 2296 if (*s == ch) {
a3621e74 2297 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2298 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2299 s += UTF8SKIP(s);
2300 while (s < strend && *s == ch)
2301 s += UTF8SKIP(s);
2302 }
4cadc6a9 2303 );
6eb5f6b9
JH
2304 }
2305 else {
4cadc6a9 2306 REXEC_FBC_SCAN(
6eb5f6b9 2307 if (*s == ch) {
a3621e74 2308 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2309 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2310 s++;
2311 while (s < strend && *s == ch)
2312 s++;
2313 }
4cadc6a9 2314 );
6eb5f6b9 2315 }
a3621e74 2316 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2317 PerlIO_printf(Perl_debug_log,
b7953727
JH
2318 "Did not find anchored character...\n")
2319 );
6eb5f6b9 2320 }
a0714e2c
SS
2321 else if (prog->anchored_substr != NULL
2322 || prog->anchored_utf8 != NULL
2323 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2324 && prog->float_max_offset < strend - s)) {
2325 SV *must;
2326 I32 back_max;
2327 I32 back_min;
2328 char *last;
6eb5f6b9 2329 char *last1; /* Last position checked before */
bf93d4cc
GS
2330#ifdef DEBUGGING
2331 int did_match = 0;
2332#endif
33b8afdf 2333 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2334 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2335 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2336 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2337 back_max = back_min = prog->anchored_offset;
2338 } else {
f2ed9b32
KW
2339 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2340 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2341 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2342 back_max = prog->float_max_offset;
2343 back_min = prog->float_min_offset;
2344 }
1de06328
YO
2345
2346
33b8afdf
JH
2347 if (must == &PL_sv_undef)
2348 /* could not downgrade utf8 check substring, so must fail */
2349 goto phooey;
2350
1de06328
YO
2351 if (back_min<0) {
2352 last = strend;
2353 } else {
2354 last = HOP3c(strend, /* Cannot start after this */
2355 -(I32)(CHR_SVLEN(must)
2356 - (SvTAIL(must) != 0) + back_min), strbeg);
2357 }
6eb5f6b9
JH
2358 if (s > PL_bostr)
2359 last1 = HOPc(s, -1);
2360 else
2361 last1 = s - 1; /* bogus */
2362
a0288114 2363 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2364 check_substr==must. */
2365 scream_pos = -1;
2366 dontbother = end_shift;
2367 strend = HOPc(strend, -dontbother);
2368 while ( (s <= last) &&
c33e64f0 2369 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2370 (unsigned char*)strend, must,
c33e64f0 2371 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2372 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2373 if (HOPc(s, -back_max) > last1) {
2374 last1 = HOPc(s, -back_min);
2375 s = HOPc(s, -back_max);
2376 }
2377 else {
52657f30 2378 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2379
2380 last1 = HOPc(s, -back_min);
52657f30 2381 s = t;
6eb5f6b9 2382 }
f2ed9b32 2383 if (utf8_target) {
6eb5f6b9 2384 while (s <= last1) {
24b23f37 2385 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2386 goto got_it;
2387 s += UTF8SKIP(s);
2388 }
2389 }
2390 else {
2391 while (s <= last1) {
24b23f37 2392 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2393 goto got_it;
2394 s++;
2395 }
2396 }
2397 }
ab3bbdeb 2398 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2399 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2400 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2401 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2402 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2403 ? "anchored" : "floating"),
ab3bbdeb
YO
2404 quoted, RE_SV_TAIL(must));
2405 });
6eb5f6b9
JH
2406 goto phooey;
2407 }
f8fc2ecf 2408 else if ( (c = progi->regstclass) ) {
f14c76ed 2409 if (minlen) {
f8fc2ecf 2410 const OPCODE op = OP(progi->regstclass);
66e933ab 2411 /* don't bother with what can't match */
786e8c11 2412 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2413 strend = HOPc(strend, -(minlen - 1));
2414 }
a3621e74 2415 DEBUG_EXECUTE_r({
be8e71aa 2416 SV * const prop = sv_newmortal();
32fc9b6a 2417 regprop(prog, prop, c);
0df25f3d 2418 {
f2ed9b32 2419 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2420 s,strend-s,60);
0df25f3d 2421 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2422 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2423 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2424 quoted, (int)(strend - s));
0df25f3d 2425 }
ffc61ed2 2426 });
3b0527fe 2427 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2428 goto got_it;
07be1b83 2429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2430 }
2431 else {
2432 dontbother = 0;
a0714e2c 2433 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2434 /* Trim the end. */
6af40bd7 2435 char *last= NULL;
33b8afdf 2436 SV* float_real;
c33e64f0
FC
2437 STRLEN len;
2438 const char *little;
33b8afdf 2439
f2ed9b32
KW
2440 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2441 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2442 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2443
c33e64f0
FC
2444 little = SvPV_const(float_real, len);
2445 if (SvTAIL(float_real)) {
1a13b075
YO
2446 /* This means that float_real contains an artificial \n on the end
2447 * due to the presence of something like this: /foo$/
2448 * where we can match both "foo" and "foo\n" at the end of the string.
2449 * So we have to compare the end of the string first against the float_real
2450 * without the \n and then against the full float_real with the string.
2451 * We have to watch out for cases where the string might be smaller
2452 * than the float_real or the float_real without the \n.
2453 */
2454 char *checkpos= strend - len;
2455 DEBUG_OPTIMISE_r(
2456 PerlIO_printf(Perl_debug_log,
2457 "%sChecking for float_real.%s\n",
2458 PL_colors[4], PL_colors[5]));
2459 if (checkpos + 1 < strbeg) {
2460 /* can't match, even if we remove the trailing \n string is too short to match */
2461 DEBUG_EXECUTE_r(
2462 PerlIO_printf(Perl_debug_log,
2463 "%sString shorter than required trailing substring, cannot match.%s\n",
2464 PL_colors[4], PL_colors[5]));
2465 goto phooey;
2466 } else if (memEQ(checkpos + 1, little, len - 1)) {
2467 /* can match, the end of the string matches without the "\n" */
2468 last = checkpos + 1;
2469 } else if (checkpos < strbeg) {
2470 /* cant match, string is too short when the "\n" is included */
2471 DEBUG_EXECUTE_r(
2472 PerlIO_printf(Perl_debug_log,
2473 "%sString does not contain required trailing substring, cannot match.%s\n",
2474 PL_colors[4], PL_colors[5]));
2475 goto phooey;
2476 } else if (!multiline) {
2477 /* non multiline match, so compare with the "\n" at the end of the string */
2478 if (memEQ(checkpos, little, len)) {
2479 last= checkpos;
2480 } else {
2481 DEBUG_EXECUTE_r(
2482 PerlIO_printf(Perl_debug_log,
2483 "%sString does not contain required trailing substring, cannot match.%s\n",
2484 PL_colors[4], PL_colors[5]));
2485 goto phooey;
2486 }
2487 } else {
2488 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2489 goto find_last;
1a13b075 2490 }
c33e64f0 2491 } else {
d6a28714 2492 find_last:
9041c2e3 2493 if (len)
d6a28714 2494 last = rninstr(s, strend, little, little + len);
b8c5462f 2495 else
a0288114 2496 last = strend; /* matching "$" */
b8c5462f 2497 }
6af40bd7
YO
2498 if (!last) {
2499 /* at one point this block contained a comment which was probably
2500 * incorrect, which said that this was a "should not happen" case.
2501 * Even if it was true when it was written I am pretty sure it is
2502 * not anymore, so I have removed the comment and replaced it with
2503 * this one. Yves */
6bda09f9
YO
2504 DEBUG_EXECUTE_r(
2505 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2506 "String does not contain required substring, cannot match.\n"
2507 ));
2508 goto phooey;
bf93d4cc 2509 }
d6a28714
JH
2510 dontbother = strend - last + prog->float_min_offset;
2511 }
2512 if (minlen && (dontbother < minlen))
2513 dontbother = minlen - 1;
2514 strend -= dontbother; /* this one's always in bytes! */
2515 /* We don't know much -- general case. */
f2ed9b32 2516 if (utf8_target) {
d6a28714 2517 for (;;) {
24b23f37 2518 if (regtry(&reginfo, &s))
d6a28714
JH
2519 goto got_it;
2520 if (s >= strend)
2521 break;
b8c5462f 2522 s += UTF8SKIP(s);
d6a28714
JH
2523 };
2524 }
2525 else {
2526 do {
24b23f37 2527 if (regtry(&reginfo, &s))
d6a28714
JH
2528 goto got_it;
2529 } while (s++ < strend);
2530 }
2531 }
2532
2533 /* Failure. */
2534 goto phooey;
2535
2536got_it:
495f47a5
DM
2537 DEBUG_BUFFERS_r(
2538 if (swap)
2539 PerlIO_printf(Perl_debug_log,
2540 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2541 PTR2UV(prog),
2542 PTR2UV(swap)
2543 );
2544 );
e9105d30 2545 Safefree(swap);
288b8c02 2546 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2547
ed301438 2548 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2549 restore_pos(aTHX_ prog);
5daac39c
NC
2550 if (RXp_PAREN_NAMES(prog))
2551 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2552
2553 /* make sure $`, $&, $', and $digit will work later */
2554 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2555 RX_MATCH_COPY_FREE(rx);
d6a28714 2556 if (flags & REXEC_COPY_STR) {
be8e71aa 2557 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2558#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2559 if ((SvIsCOW(sv)
2560 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2561 if (DEBUG_C_TEST) {
2562 PerlIO_printf(Perl_debug_log,
2563 "Copy on write: regexp capture, type %d\n",
2564 (int) SvTYPE(sv));
2565 }
2566 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2567 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2568 assert (SvPOKp(prog->saved_copy));
2569 } else
2570#endif
2571 {
288b8c02 2572 RX_MATCH_COPIED_on(rx);
ed252734
NC
2573 s = savepvn(strbeg, i);
2574 prog->subbeg = s;
2575 }
d6a28714 2576 prog->sublen = i;
d6a28714
JH
2577 }
2578 else {
2579 prog->subbeg = strbeg;
2580 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2581 }
2582 }
9041c2e3 2583
d6a28714
JH
2584 return 1;
2585
2586phooey:
a3621e74 2587 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2588 PL_colors[4], PL_colors[5]));
ed301438 2589 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2590 restore_pos(aTHX_ prog);
e9105d30 2591 if (swap) {
c74340f9 2592 /* we failed :-( roll it back */
495f47a5
DM
2593 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2594 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2595 PTR2UV(prog),
2596 PTR2UV(prog->offs),
2597 PTR2UV(swap)
2598 ));
e9105d30
GG
2599 Safefree(prog->offs);
2600 prog->offs = swap;
2601 }
2602
d6a28714
JH
2603 return 0;
2604}
2605
6bda09f9 2606
ec43f78b
DM
2607/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2608 * Do inc before dec, in case old and new rex are the same */
2609#define SET_reg_curpm(Re2) \
2610 if (PL_reg_state.re_state_eval_setup_done) { \
2611 (void)ReREFCNT_inc(Re2); \
2612 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2613 PM_SETRE((PL_reg_curpm), (Re2)); \
2614 }
2615
2616
d6a28714
JH
2617/*
2618 - regtry - try match at specific point
2619 */
2620STATIC I32 /* 0 failure, 1 success */
24b23f37 2621S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2622{
97aff369 2623 dVAR;
d6a28714 2624 CHECKPOINT lastcp;
288b8c02
NC
2625 REGEXP *const rx = reginfo->prog;
2626 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2627 RXi_GET_DECL(prog,progi);
a3621e74 2628 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2629
2630 PERL_ARGS_ASSERT_REGTRY;
2631
24b23f37 2632 reginfo->cutpoint=NULL;
d6a28714 2633
ed301438
DM
2634 if ((prog->extflags & RXf_EVAL_SEEN)
2635 && !PL_reg_state.re_state_eval_setup_done)
2636 {
d6a28714
JH
2637 MAGIC *mg;
2638
ed301438 2639 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2640 if (reginfo->sv) {
d6a28714 2641 /* Make $_ available to executed code. */
3b0527fe 2642 if (reginfo->sv != DEFSV) {
59f00321 2643 SAVE_DEFSV;
414bf5ae 2644 DEFSV_set(reginfo->sv);
b8c5462f 2645 }
d6a28714 2646
3b0527fe
DM
2647 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2648 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2649 /* prepare for quick setting of pos */
d300d9fa 2650#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2651 if (SvIsCOW(reginfo->sv))
2652 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2653#endif
3dab1dad 2654 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2655 &PL_vtbl_mglob, NULL, 0);
d6a28714 2656 mg->mg_len = -1;
b8c5462f 2657 }
d6a28714
JH
2658 PL_reg_magic = mg;
2659 PL_reg_oldpos = mg->mg_len;
4f639d21 2660 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2661 }
09687e5a 2662 if (!PL_reg_curpm) {
a02a5408 2663 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2664#ifdef USE_ITHREADS
2665 {
14a49a24 2666 SV* const repointer = &PL_sv_undef;
92313705
NC
2667 /* this regexp is also owned by the new PL_reg_curpm, which
2668 will try to free it. */
d2ece331 2669 av_push(PL_regex_padav, repointer);
09687e5a
AB
2670 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2671 PL_regex_pad = AvARRAY(PL_regex_padav);
2672 }
2673#endif
2674 }
ec43f78b 2675 SET_reg_curpm(rx);
d6a28714
JH
2676 PL_reg_oldcurpm = PL_curpm;
2677 PL_curpm = PL_reg_curpm;
07bc277f 2678 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2679 /* Here is a serious problem: we cannot rewrite subbeg,
2680 since it may be needed if this match fails. Thus
2681 $` inside (?{}) could fail... */
2682 PL_reg_oldsaved = prog->subbeg;
2683 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2684#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2685 PL_nrs = prog->saved_copy;
2686#endif
07bc277f 2687 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2688 }
2689 else
bd61b366 2690 PL_reg_oldsaved = NULL;
d6a28714
JH
2691 prog->subbeg = PL_bostr;
2692 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2693 }
97ca13b7
DM
2694#ifdef DEBUGGING
2695 PL_reg_starttry = *startpos;
2696#endif
f0ab9afb 2697 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2698 PL_reginput = *startpos;
d6a28714 2699 prog->lastparen = 0;
03994de8 2700 prog->lastcloseparen = 0;
d6a28714 2701 PL_regsize = 0;
d6a28714
JH
2702
2703 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2704 to do this again and again, prog->lastparen should take care of
3dd2943c 2705 this! --ilya*/
dafc8851
JH
2706
2707 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2708 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2709 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2710 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2711 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2712 * Meanwhile, this code *is* needed for the
daf18116
JH
2713 * above-mentioned test suite tests to succeed. The common theme
2714 * on those tests seems to be returning null fields from matches.
225593e1 2715 * --jhi updated by dapm */
dafc8851 2716#if 1
d6a28714 2717 if (prog->nparens) {
b93070ed 2718 regexp_paren_pair *pp = prog->offs;
097eb12c 2719 register I32 i;
b93070ed 2720 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2721 ++pp;
2722 pp->start = -1;
2723 pp->end = -1;
d6a28714
JH
2724 }
2725 }
dafc8851 2726#endif
02db2b7b 2727 REGCP_SET(lastcp);
f8fc2ecf 2728 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2729 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2730 return 1;
2731 }
24b23f37
YO
2732 if (reginfo->cutpoint)
2733 *startpos= reginfo->cutpoint;
02db2b7b 2734 REGCP_UNWIND(lastcp);
d6a28714
JH
2735 return 0;
2736}
2737
02db2b7b 2738
8ba1375e
MJD
2739#define sayYES goto yes
2740#define sayNO goto no
262b90c4 2741#define sayNO_SILENT goto no_silent
8ba1375e 2742
f9f4320a
YO
2743/* we dont use STMT_START/END here because it leads to
2744 "unreachable code" warnings, which are bogus, but distracting. */
2745#define CACHEsayNO \
c476f425
DM
2746 if (ST.cache_mask) \
2747 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2748 sayNO
3298f257 2749
a3621e74 2750/* this is used to determine how far from the left messages like
265c4333
YO
2751 'failed...' are printed. It should be set such that messages
2752 are inline with the regop output that created them.
a3621e74 2753*/
265c4333 2754#define REPORT_CODE_OFF 32
a3621e74
YO
2755
2756
40a82448
DM
2757#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2758#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2759
86545054
DM
2760#define SLAB_FIRST(s) (&(s)->states[0])
2761#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2762
5d9a96ca
DM
2763/* grab a new slab and return the first slot in it */
2764
2765STATIC regmatch_state *
2766S_push_slab(pTHX)
2767{
a35a87e7 2768#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2769 dMY_CXT;
2770#endif
5d9a96ca
DM
2771 regmatch_slab *s = PL_regmatch_slab->next;
2772 if (!s) {
2773 Newx(s, 1, regmatch_slab);
2774 s->prev = PL_regmatch_slab;
2775 s->next = NULL;
2776 PL_regmatch_slab->next = s;
2777 }
2778 PL_regmatch_slab = s;
86545054 2779 return SLAB_FIRST(s);
5d9a96ca 2780}
5b47454d 2781
95b24440 2782
40a82448
DM
2783/* push a new state then goto it */
2784
2785#define PUSH_STATE_GOTO(state, node) \
2786 scan = node; \
2787 st->resume_state = state; \
2788 goto push_state;
2789
2790/* push a new state with success backtracking, then goto it */
2791
2792#define PUSH_YES_STATE_GOTO(state, node) \
2793 scan = node; \
2794 st->resume_state = state; \
2795 goto push_yes_state;
2796
aa283a38 2797
aa283a38 2798
d6a28714 2799/*
95b24440 2800
bf1f174e
DM
2801regmatch() - main matching routine
2802
2803This is basically one big switch statement in a loop. We execute an op,
2804set 'next' to point the next op, and continue. If we come to a point which
2805we may need to backtrack to on failure such as (A|B|C), we push a
2806backtrack state onto the backtrack stack. On failure, we pop the top
2807state, and re-enter the loop at the state indicated. If there are no more
2808states to pop, we return failure.
2809
2810Sometimes we also need to backtrack on success; for example /A+/, where
2811after successfully matching one A, we need to go back and try to
2812match another one; similarly for lookahead assertions: if the assertion
2813completes successfully, we backtrack to the state just before the assertion
2814and then carry on. In these cases, the pushed state is marked as
2815'backtrack on success too'. This marking is in fact done by a chain of
2816pointers, each pointing to the previous 'yes' state. On success, we pop to
2817the nearest yes state, discarding any intermediate failure-only states.
2818Sometimes a yes state is pushed just to force some cleanup code to be
2819called at the end of a successful match or submatch; e.g. (??{$re}) uses
2820it to free the inner regex.
2821
2822Note that failure backtracking rewinds the cursor position, while
2823success backtracking leaves it alone.
2824
2825A pattern is complete when the END op is executed, while a subpattern
2826such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2827ops trigger the "pop to last yes state if any, otherwise return true"
2828behaviour.
2829
2830A common convention in this function is to use A and B to refer to the two
2831subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2832the subpattern to be matched possibly multiple times, while B is the entire
2833rest of the pattern. Variable and state names reflect this convention.
2834
2835The states in the main switch are the union of ops and failure/success of
2836substates associated with with that op. For example, IFMATCH is the op
2837that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2838'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2839successfully matched A and IFMATCH_A_fail is a state saying that we have
2840just failed to match A. Resume states always come in pairs. The backtrack
2841state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2842at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2843on success or failure.
2844
2845The struct that holds a backtracking state is actually a big union, with
2846one variant for each major type of op. The variable st points to the
2847top-most backtrack struct. To make the code clearer, within each
2848block of code we #define ST to alias the relevant union.
2849
2850Here's a concrete example of a (vastly oversimplified) IFMATCH
2851implementation:
2852
2853 switch (state) {
2854 ....
2855
2856#define ST st->u.ifmatch
2857
2858 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2859 ST.foo = ...; // some state we wish to save
95b24440 2860 ...
bf1f174e
DM
2861 // push a yes backtrack state with a resume value of
2862 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2863 // first node of A:
2864 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2865 // NOTREACHED
2866
2867 case IFMATCH_A: // we have successfully executed A; now continue with B
2868 next = B;
2869 bar = ST.foo; // do something with the preserved value
2870 break;
2871
2872 case IFMATCH_A_fail: // A failed, so the assertion failed
2873 ...; // do some housekeeping, then ...
2874 sayNO; // propagate the failure
2875
2876#undef ST
95b24440 2877
bf1f174e
DM
2878 ...
2879 }
95b24440 2880
bf1f174e
DM
2881For any old-timers reading this who are familiar with the old recursive
2882approach, the code above is equivalent to:
95b24440 2883
bf1f174e
DM
2884 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2885 {
2886 int foo = ...
95b24440 2887 ...
bf1f174e
DM
2888 if (regmatch(A)) {
2889 next = B;
2890 bar = foo;
2891 break;
95b24440 2892 }
bf1f174e
DM
2893 ...; // do some housekeeping, then ...
2894 sayNO; // propagate the failure
95b24440 2895 }
bf1f174e
DM
2896
2897The topmost backtrack state, pointed to by st, is usually free. If you
2898want to claim it, populate any ST.foo fields in it with values you wish to
2899save, then do one of
2900
2901 PUSH_STATE_GOTO(resume_state, node);
2902 PUSH_YES_STATE_GOTO(resume_state, node);
2903
2904which sets that backtrack state's resume value to 'resume_state', pushes a
2905new free entry to the top of the backtrack stack, then goes to 'node'.
2906On backtracking, the free slot is popped, and the saved state becomes the
2907new free state. An ST.foo field in this new top state can be temporarily
2908accessed to retrieve values, but once the main loop is re-entered, it
2909becomes available for reuse.
2910
2911Note that the depth of the backtrack stack constantly increases during the
2912left-to-right execution of the pattern, rather than going up and down with
2913the pattern nesting. For example the stack is at its maximum at Z at the
2914end of the pattern, rather than at X in the following:
2915
2916 /(((X)+)+)+....(Y)+....Z/
2917
2918The only exceptions to this are lookahead/behind assertions and the cut,
2919(?>A), which pop all the backtrack states associated with A before
2920continuing.
2921
486ec47a 2922Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2923PL_regmatch_state and st always point to the currently active state,
2924and PL_regmatch_slab points to the slab currently containing
2925PL_regmatch_state. The first time regmatch() is called, the first slab is
2926allocated, and is never freed until interpreter destruction. When the slab
2927is full, a new one is allocated and chained to the end. At exit from
2928regmatch(), slabs allocated since entry are freed.
2929
2930*/
95b24440 2931
40a82448 2932
5bc10b2c 2933#define DEBUG_STATE_pp(pp) \
265c4333 2934 DEBUG_STATE_r({ \
f2ed9b32 2935 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2936 PerlIO_printf(Perl_debug_log, \
5d458dd8 2937 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2938 depth*2, "", \
13d6edb4 2939 PL_reg_name[st->resume_state], \
5d458dd8
YO
2940 ((st==yes_state||st==mark_state) ? "[" : ""), \
2941 ((st==yes_state) ? "Y" : ""), \
2942 ((st==mark_state) ? "M" : ""), \
2943 ((st==yes_state||st==mark_state) ? "]" : "") \
2944 ); \
265c4333 2945 });
5bc10b2c 2946
40a82448 2947
3dab1dad 2948#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2949
3df15adc 2950#ifdef DEBUGGING
5bc10b2c 2951
ab3bbdeb 2952STATIC void
f2ed9b32 2953S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2954 const char *start, const char *end, const char *blurb)
2955{
efd26800 2956 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2957
2958 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2959
ab3bbdeb
YO
2960 if (!PL_colorset)
2961 reginitcolors();
2962 {
2963 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2964 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2965
f2ed9b32 2966 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2967 start, end - start, 60);
2968
2969 PerlIO_printf(Perl_debug_log,
2970 "%s%s REx%s %s against %s\n",
2971 PL_colors[4], blurb, PL_colors[5], s0, s1);
2972
f2ed9b32 2973 if (utf8_target||utf8_pat)
1de06328
YO
2974 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2975 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2976 utf8_pat && utf8_target ? " and " : "",
2977 utf8_target ? "string" : ""
ab3bbdeb
YO
2978 );
2979 }
2980}
3df15adc
YO
2981
2982STATIC void
786e8c11
YO
2983S_dump_exec_pos(pTHX_ const char *locinput,
2984 const regnode *scan,
2985 const char *loc_regeol,
2986 const char *loc_bostr,
2987 const char *loc_reg_starttry,
f2ed9b32 2988 const bool utf8_target)
07be1b83 2989{
786e8c11 2990 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2991 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2992 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2993 /* The part of the string before starttry has one color
2994 (pref0_len chars), between starttry and current
2995 position another one (pref_len - pref0_len chars),
2996 after the current position the third one.
2997 We assume that pref0_len <= pref_len, otherwise we
2998 decrease pref0_len. */
786e8c11
YO
2999 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3000 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3001 int pref0_len;
3002
7918f24d
NC
3003 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3004
f2ed9b32 3005 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3006 pref_len++;
786e8c11
YO
3007 pref0_len = pref_len - (locinput - loc_reg_starttry);
3008 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3009 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3010 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3011 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3012 l--;
3013 if (pref0_len < 0)
3014 pref0_len = 0;
3015 if (pref0_len > pref_len)
3016 pref0_len = pref_len;
3017 {
f2ed9b32 3018 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3019
ab3bbdeb 3020 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3021 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3022
ab3bbdeb 3023 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3024 (locinput - pref_len + pref0_len),
1de06328 3025 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3026
ab3bbdeb 3027 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3028 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3029
1de06328 3030 const STRLEN tlen=len0+len1+len2;
3df15adc 3031 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3032 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3033 (IV)(locinput - loc_bostr),
07be1b83 3034 len0, s0,
07be1b83 3035 len1, s1,
07be1b83 3036 (docolor ? "" : "> <"),
07be1b83 3037 len2, s2,
f9f4320a 3038 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3039 "");
3040 }
3041}
3df15adc 3042
07be1b83
YO
3043#endif
3044
0a4db386
YO
3045/* reg_check_named_buff_matched()
3046 * Checks to see if a named buffer has matched. The data array of
3047 * buffer numbers corresponding to the buffer is expected to reside
3048 * in the regexp->data->data array in the slot stored in the ARG() of
3049 * node involved. Note that this routine doesn't actually care about the
3050 * name, that information is not preserved from compilation to execution.
3051 * Returns the index of the leftmost defined buffer with the given name
3052 * or 0 if non of the buffers matched.
3053 */
3054STATIC I32
7918f24d
NC
3055S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3056{
0a4db386 3057 I32 n;
f8fc2ecf 3058 RXi_GET_DECL(rex,rexi);
ad64d0ec 3059 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3060 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3061
3062 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3063
0a4db386 3064 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3065 if ((I32)rex->lastparen >= nums[n] &&
3066 rex->offs[nums[n]].end != -1)
0a4db386
YO
3067 {
3068 return nums[n];
3069 }
3070 }
3071 return 0;
3072}
3073
2f554ef7
DM
3074
3075/* free all slabs above current one - called during LEAVE_SCOPE */
3076
3077STATIC void
3078S_clear_backtrack_stack(pTHX_ void *p)
3079{
3080 regmatch_slab *s = PL_regmatch_slab->next;
3081 PERL_UNUSED_ARG(p);
3082
3083 if (!s)
3084 return;
3085 PL_regmatch_slab->next = NULL;
3086 while (s) {
3087 regmatch_slab * const osl = s;
3088 s = s->next;
3089 Safefree(osl);
3090 }
3091}
3092
3093
d6a28714 3094STATIC I32 /* 0 failure, 1 success */
24b23f37 3095S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3096{
a35a87e7 3097#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3098 dMY_CXT;
3099#endif
27da23d5 3100 dVAR;
f2ed9b32 3101 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3102 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3103 REGEXP *rex_sv = reginfo->prog;
3104 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3105 RXi_GET_DECL(rex,rexi);
2f554ef7 3106 I32 oldsave;
5d9a96ca
DM
3107 /* the current state. This is a cached copy of PL_regmatch_state */
3108 register regmatch_state *st;
5d9a96ca
DM
3109 /* cache heavy used fields of st in registers */
3110 register regnode *scan;
3111 register regnode *next;
438e9bae 3112 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3113 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3114 register char *locinput = PL_reginput;
5d9a96ca 3115 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3116
b69b0499 3117 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3118 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3119 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3120 const U32 max_nochange_depth =
3121 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3122 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3123 regmatch_state *yes_state = NULL; /* state to pop to on success of
3124 subpattern */
e2e6a0f1
YO
3125 /* mark_state piggy backs on the yes_state logic so that when we unwind
3126 the stack on success we can update the mark_state as we go */
3127 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3128 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3129 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3130 U32 state_num;
5d458dd8
YO
3131 bool no_final = 0; /* prevent failure from backtracking? */
3132 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3133 char *startpoint = PL_reginput;
5d458dd8
YO
3134 SV *popmark = NULL; /* are we looking for a mark? */
3135 SV *sv_commit = NULL; /* last mark name seen in failure */
3136 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3137 during a successful match */
5d458dd8
YO
3138 U32 lastopen = 0; /* last open we saw */
3139 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3140 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3141 /* these three flags are set by various ops to signal information to
3142 * the very next op. They have a useful lifetime of exactly one loop
3143 * iteration, and are not preserved or restored by state pushes/pops
3144 */
3145 bool sw = 0; /* the condition value in (?(cond)a|b) */
3146 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3147 int logical = 0; /* the following EVAL is:
3148 0: (?{...})
3149 1: (?(?{...})X|Y)
3150 2: (??{...})
3151 or the following IFMATCH/UNLESSM is:
3152 false: plain (?=foo)
3153 true: used as a condition: (?(?=foo))
3154 */
81ed78b2
DM
3155 PAD* last_pad = NULL;
3156 dMULTICALL;
3157 I32 gimme = G_SCALAR;
3158 CV *caller_cv = NULL; /* who called us */
3159 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3160 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3161
95b24440 3162#ifdef DEBUGGING
e68ec53f 3163 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3164#endif
3165
81ed78b2
DM
3166 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3167 multicall_oldcatch = 0;
3168 multicall_cv = NULL;
3169 cx = NULL;
3170
3171
7918f24d
NC
3172 PERL_ARGS_ASSERT_REGMATCH;
3173
3b57cd43 3174 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3175 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3176 }));
5d9a96ca
DM
3177 /* on first ever call to regmatch, allocate first slab */
3178 if (!PL_regmatch_slab) {
3179 Newx(PL_regmatch_slab, 1, regmatch_slab);
3180 PL_regmatch_slab->prev = NULL;
3181 PL_regmatch_slab->next = NULL;
86545054 3182 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3183 }
3184
2f554ef7
DM
3185 oldsave = PL_savestack_ix;
3186 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3187 SAVEVPTR(PL_regmatch_slab);
3188 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3189
3190 /* grab next free state slot */
3191 st = ++PL_regmatch_state;
86545054 3192 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3193 st = PL_regmatch_state = S_push_slab(aTHX);
3194
d6a28714
JH
3195 /* Note that nextchr is a byte even in UTF */
3196 nextchr = UCHARAT(locinput);
3197 scan = prog;
3198 while (scan != NULL) {
8ba1375e 3199
a3621e74 3200 DEBUG_EXECUTE_r( {
6136c704 3201 SV * const prop = sv_newmortal();
1de06328 3202 regnode *rnext=regnext(scan);
f2ed9b32 3203 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3204 regprop(rex, prop, scan);
07be1b83
YO
3205
3206 PerlIO_printf(Perl_debug_log,
3207 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3208 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3209 SvPVX_const(prop),
1de06328 3210 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3211 0 : (IV)(rnext - rexi->program));
2a782b5b 3212 });
d6a28714
JH
3213
3214 next = scan + NEXT_OFF(scan);
3215 if (next == scan)
3216 next = NULL;
40a82448 3217 state_num = OP(scan);
d6a28714 3218
40a82448 3219 reenter_switch:
34a81e2b 3220
40a82448 3221 switch (state_num) {
d6a28714 3222 case BOL:
7fba1cd6 3223 if (locinput == PL_bostr)
d6a28714 3224 {
3b0527fe 3225 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3226 break;
3227 }
d6a28714
JH
3228 sayNO;
3229 case MBOL:
12d33761
HS
3230 if (locinput == PL_bostr ||
3231 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3232 {
b8c5462f
JH
3233 break;
3234 }
d6a28714
JH
3235 sayNO;
3236 case SBOL:
c2a73568 3237 if (locinput == PL_bostr)
b8c5462f 3238 break;
d6a28714
JH
3239 sayNO;
3240 case GPOS:
3b0527fe 3241 if (locinput == reginfo->ganch)
d6a28714
JH
3242 break;
3243 sayNO;
ee9b8eae
YO
3244
3245 case KEEPS:
3246 /* update the startpoint */
b93070ed 3247 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3248 PL_reginput = locinput;
b93070ed 3249 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3250 PUSH_STATE_GOTO(KEEPS_next, next);
3251 /*NOT-REACHED*/
3252 case KEEPS_next_fail:
3253 /* rollback the start point change */
b93070ed 3254 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3255 sayNO_SILENT;
3256 /*NOT-REACHED*/
d6a28714 3257 case EOL:
d6a28714
JH
3258 goto seol;
3259 case MEOL:
d6a28714 3260 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3261 sayNO;
b8c5462f 3262 break;
d6a28714
JH
3263 case SEOL:
3264 seol:
3265 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3266 sayNO;
d6a28714 3267 if (PL_regeol - locinput > 1)
b8c5462f 3268 sayNO;
b8c5462f 3269 break;
d6a28714
JH
3270 case EOS:
3271 if (PL_regeol != locinput)
b8c5462f 3272 sayNO;
d6a28714 3273 break;
ffc61ed2 3274 case SANY:
d6a28714 3275 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3276 sayNO;
f2ed9b32 3277 if (utf8_target) {
f33976b4
DB
3278 locinput += PL_utf8skip[nextchr];
3279 if (locinput > PL_regeol)
3280 sayNO;
3281 nextchr = UCHARAT(locinput);
3282 }
3283 else
3284 nextchr = UCHARAT(++locinput);
3285 break;
3286 case CANY:
3287 if (!nextchr && locinput >= PL_regeol)
3288 sayNO;
b8c5462f 3289 nextchr = UCHARAT(++locinput);
a0d0e21e 3290 break;
ffc61ed2 3291 case REG_ANY:
1aa99e6b
IH
3292 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3293 sayNO;
f2ed9b32 3294 if (utf8_target) {
b8c5462f 3295 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3296 if (locinput > PL_regeol)
3297 sayNO;
a0ed51b3 3298 nextchr = UCHARAT(locinput);
a0ed51b3 3299 }
1aa99e6b
IH
3300 else
3301 nextchr = UCHARAT(++locinput);
a0ed51b3 3302 break;
166ba7cd
DM
3303
3304#undef ST
3305#define ST st->u.trie
786e8c11
YO
3306 case TRIEC:
3307 /* In this case the charclass data is available inline so
3308 we can fail fast without a lot of extra overhead.
3309 */
fab2782b
YO
3310 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3311 DEBUG_EXECUTE_r(
3312 PerlIO_printf(Perl_debug_log,
3313 "%*s %sfailed to match trie start class...%s\n",
3314 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3315 );
3316 sayNO_SILENT;
118e2215 3317 assert(0); /* NOTREACHED */
786e8c11
YO
3318 }
3319 /* FALL THROUGH */
5b47454d 3320 case TRIE:
2e64971a
DM
3321 /* the basic plan of execution of the trie is:
3322 * At the beginning, run though all the states, and
3323 * find the longest-matching word. Also remember the position
3324 * of the shortest matching word. For example, this pattern:
3325 * 1 2 3 4 5
3326 * ab|a|x|abcd|abc
3327 * when matched against the string "abcde", will generate
3328 * accept states for all words except 3, with the longest
3329 * matching word being 4, and the shortest being 1 (with
3330 * the position being after char 1 of the string).
3331 *
3332 * Then for each matching word, in word order (i.e. 1,2,4,5),
3333 * we run the remainder of the pattern; on each try setting
3334 * the current position to the character following the word,
3335 * returning to try the next word on failure.
3336 *
3337 * We avoid having to build a list of words at runtime by
3338 * using a compile-time structure, wordinfo[].prev, which
3339 * gives, for each word, the previous accepting word (if any).
3340 * In the case above it would contain the mappings 1->2, 2->0,
3341 * 3->0, 4->5, 5->1. We can use this table to generate, from
3342 * the longest word (4 above), a list of all words, by
3343 * following the list of prev pointers; this gives us the
3344 * unordered list 4,5,1,2. Then given the current word we have
3345 * just tried, we can go through the list and find the
3346 * next-biggest word to try (so if we just failed on word 2,
3347 * the next in the list is 4).
3348 *
3349 * Since at runtime we don't record the matching position in
3350 * the string for each word, we have to work that out for
3351 * each word we're about to process. The wordinfo table holds
3352 * the character length of each word; given that we recorded
3353 * at the start: the position of the shortest word and its
3354 * length in chars, we just need to move the pointer the
3355 * difference between the two char lengths. Depending on
3356 * Unicode status and folding, that's cheap or expensive.
3357 *
3358 * This algorithm is optimised for the case where are only a
3359 * small number of accept states, i.e. 0,1, or maybe 2.
3360 * With lots of accepts states, and having to try all of them,
3361 * it becomes quadratic on number of accept states to find all
3362 * the next words.
3363 */
3364
3dab1dad 3365 {
07be1b83 3366 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3367 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3368
3369 /* what trie are we using right now */
be8e71aa 3370 reg_trie_data * const trie
f8fc2ecf 3371 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3372 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3373 U32 state = trie->startstate;
166ba7cd 3374
fab2782b 3375 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3376 if (trie->states[ state ].wordnum) {
3377 DEBUG_EXECUTE_r(
3378 PerlIO_printf(Perl_debug_log,
3379 "%*s %smatched empty string...%s\n",
5bc10b2c 3380 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3381 );
20dbff7c
YO
3382 if (!trie->jump)
3383 break;
3dab1dad
YO
3384 } else {
3385 DEBUG_EXECUTE_r(
3386 PerlIO_printf(Perl_debug_log,
786e8c11 3387 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3388 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3389 );
3390 sayNO_SILENT;
3391 }
3392 }
166ba7cd 3393
786e8c11
YO
3394 {
3395 U8 *uc = ( U8* )locinput;
3396
3397 STRLEN len = 0;
3398 STRLEN foldlen = 0;
3399 U8 *uscan = (U8*)NULL;
786e8c11 3400 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3401 U32 charcount = 0; /* how many input chars we have matched */
3402 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3403
786e8c11 3404 ST.jump = trie->jump;
786e8c11 3405 ST.me = scan;
2e64971a
DM
3406 ST.firstpos = NULL;
3407 ST.longfold = FALSE; /* char longer if folded => it's harder */
3408 ST.nextword = 0;
3409
3410 /* fully traverse the TRIE; note the position of the
3411 shortest accept state and the wordnum of the longest
3412 accept state */
07be1b83 3413
a3621e74 3414 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3415 U32 base = trie->states[ state ].trans.base;
f9f4320a 3416 UV uvc = 0;
acb909b4 3417 U16 charid = 0;
2e64971a
DM
3418 U16 wordnum;
3419 wordnum = trie->states[ state ].wordnum;
3420
3421 if (wordnum) { /* it's an accept state */
3422 if (!accepted) {
3423 accepted = 1;
3424 /* record first match position */
3425 if (ST.longfold) {
3426 ST.firstpos = (U8*)locinput;
3427 ST.firstchars = 0;
5b47454d 3428 }
2e64971a
DM
3429 else {
3430 ST.firstpos = uc;
3431 ST.firstchars = charcount;
3432 }
3433 }
3434 if (!ST.nextword || wordnum < ST.nextword)
3435 ST.nextword = wordnum;
3436 ST.topword = wordnum;
786e8c11 3437 }
a3621e74 3438
07be1b83 3439 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3440 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3441 PerlIO_printf( Perl_debug_log,
2e64971a 3442 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3443 2+depth * 2, "", PL_colors[4],
2e64971a 3444 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3445 });
a3621e74 3446
2e64971a 3447 /* read a char and goto next state */
a3621e74 3448 if ( base ) {
6dd2be57 3449 I32 offset;
55eed653
NC
3450 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3451 uscan, len, uvc, charid, foldlen,
3452 foldbuf, uniflags);
2e64971a
DM
3453 charcount++;
3454 if (foldlen>0)
3455 ST.longfold = TRUE;
5b47454d 3456 if (charid &&
6dd2be57
DM
3457 ( ((offset =
3458 base + charid - 1 - trie->uniquecharcount)) >= 0)
3459
3460 && ((U32)offset < trie->lasttrans)
3461 && trie->trans[offset].check == state)
5b47454d 3462 {
6dd2be57 3463 state = trie->trans[offset].next;
5b47454d
DM
3464 }
3465 else {
3466 state = 0;
3467 }
3468 uc += len;
3469
3470 }
3471 else {
a3621e74
YO
3472 state = 0;
3473 }
3474 DEBUG_TRIE_EXECUTE_r(
e4584336 3475 PerlIO_printf( Perl_debug_log,
786e8c11 3476 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3477 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3478 );
3479 }
2e64971a 3480 if (!accepted)
a3621e74 3481 sayNO;
a3621e74 3482
2e64971a
DM
3483 /* calculate total number of accept states */
3484 {
3485 U16 w = ST.topword;
3486 accepted = 0;
3487 while (w) {
3488 w = trie->wordinfo[w].prev;
3489 accepted++;
3490 }
3491 ST.accepted = accepted;
3492 }
3493
166ba7cd
DM
3494 DEBUG_EXECUTE_r(
3495 PerlIO_printf( Perl_debug_log,
3496 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3497 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3498 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3499 );
2e64971a 3500 goto trie_first_try; /* jump into the fail handler */
786e8c11 3501 }}
118e2215 3502 assert(0); /* NOTREACHED */
2e64971a
DM
3503
3504 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3505 if ( ST.jump) {
3506 REGCP_UNWIND(ST.cp);
a8d1f4b4 3507 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3508 }
2e64971a
DM
3509 if (!--ST.accepted) {
3510 DEBUG_EXECUTE_r({
3511 PerlIO_printf( Perl_debug_log,
3512 "%*s %sTRIE failed...%s\n",
3513 REPORT_CODE_OFF+depth*2, "",
3514 PL_colors[4],
3515 PL_colors[5] );
3516 });
3517 sayNO_SILENT;
3518 }
3519 {
3520 /* Find next-highest word to process. Note that this code
3521 * is O(N^2) per trie run (O(N) per branch), so keep tight */
d9a396a3
DM
3522 register U16 min = 0;
3523 register U16 word;
2e64971a
DM
3524 register U16 const nextword = ST.nextword;
3525 register reg_trie_wordinfo * const wordinfo
3526 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3527 for (word=ST.topword; word; word=wordinfo[word].prev) {
3528 if (word > nextword && (!min || word < min))
3529 min = word;
3530 }
3531 ST.nextword = min;
3532 }
3533
fae667d5 3534 trie_first_try:
5d458dd8
YO
3535 if (do_cutgroup) {
3536 do_cutgroup = 0;
3537 no_final = 0;
3538 }
fae667d5
YO
3539
3540 if ( ST.jump) {
b93070ed 3541 ST.lastparen = rex->lastparen;
f6033a9d 3542 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3543 REGCP_SET(ST.cp);
2e64971a 3544 }
a3621e74 3545
2e64971a 3546 /* find start char of end of current word */
166ba7cd 3547 {
2e64971a
DM
3548 U32 chars; /* how many chars to skip */
3549 U8 *uc = ST.firstpos;
3550 reg_trie_data * const trie
3551 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3552
3553 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3554 >= ST.firstchars);
3555 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3556 - ST.firstchars;
3557
3558 if (ST.longfold) {
3559 /* the hard option - fold each char in turn and find
3560 * its folded length (which may be different */
3561 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3562 STRLEN foldlen;
3563 STRLEN len;
d9a396a3 3564 UV uvc;
2e64971a
DM
3565 U8 *uscan;
3566
3567 while (chars) {
f2ed9b32 3568 if (utf8_target) {
2e64971a
DM
3569 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3570 uniflags);
3571 uc += len;
3572 }
3573 else {
3574 uvc = *uc;
3575 uc++;
3576 }
3577 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3578 uscan = foldbuf;
3579 while (foldlen) {
3580 if (!--chars)
3581 break;
3582 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3583 uniflags);
3584 uscan += len;
3585 foldlen -= len;
3586 }
3587 }
a3621e74 3588 }
2e64971a 3589 else {
f2ed9b32 3590 if (utf8_target)
2e64971a
DM
3591 while (chars--)
3592 uc += UTF8SKIP(uc);
3593 else
3594 uc += chars;
3595 }
3596 PL_reginput = (char *)uc;
3597 }
166ba7cd 3598
6603fe3e
DM
3599 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3600 ? ST.jump[ST.nextword]
3601 : NEXT_OFF(ST.me));
166ba7cd 3602
2e64971a
DM
3603 DEBUG_EXECUTE_r({
3604 PerlIO_printf( Perl_debug_log,
3605 "%*s %sTRIE matched word #%d, continuing%s\n",
3606 REPORT_CODE_OFF+depth*2, "",
3607 PL_colors[4],
3608 ST.nextword,
3609 PL_colors[5]
3610 );
3611 });
3612
3613 if (ST.accepted > 1 || has_cutgroup) {
3614 PUSH_STATE_GOTO(TRIE_next, scan);
118e2215 3615 assert(0); /* NOTREACHED */
166ba7cd 3616 }
2e64971a
DM
3617 /* only one choice left - just continue */
3618 DEBUG_EXECUTE_r({
3619 AV *const trie_words
3620 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3621 SV ** const tmp = av_fetch( trie_words,
3622 ST.nextword-1, 0 );
3623 SV *sv= tmp ? sv_newmortal() : NULL;
3624
3625 PerlIO_printf( Perl_debug_log,
3626 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3627 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3628 ST.nextword,
3629 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3630 PL_colors[0], PL_colors[1],
c89df6cf 3631 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3632 )
3633 : "not compiled under -Dr",
3634 PL_colors[5] );
3635 });
3636
3637 locinput = PL_reginput;
3638 nextchr = UCHARAT(locinput);
3639 continue; /* execute rest of RE */
118e2215 3640 assert(0); /* NOTREACHED */
166ba7cd
DM
3641#undef ST
3642
95b24440
DM
3643 case EXACT: {
3644 char *s = STRING(scan);
24d3c4a9 3645 ln = STR_LEN(scan);
f2ed9b32 3646 if (utf8_target != UTF_PATTERN) {
bc517b45 3647 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3648 char *l = locinput;
24d3c4a9 3649 const char * const e = s + ln;
a72c7584 3650
f2ed9b32 3651 if (utf8_target) {
5ff6fc6d 3652 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3653 while (s < e) {
a3b680e6 3654 STRLEN ulen;
1aa99e6b 3655 if (l >= PL_regeol)
5ff6fc6d
JH
3656 sayNO;
3657 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3658 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3659 uniflags))
5ff6fc6d 3660 sayNO;
bc517b45 3661 l += ulen;
5ff6fc6d 3662 s ++;
1aa99e6b 3663 }
5ff6fc6d
JH
3664 }
3665 else {
3666 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3667 while (s < e) {
a3b680e6 3668 STRLEN ulen;
1aa99e6b
IH
3669 if (l >= PL_regeol)
3670 sayNO;
5ff6fc6d 3671 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3672 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3673 uniflags))
1aa99e6b 3674 sayNO;
bc517b45 3675 s += ulen;
a72c7584 3676 l ++;
1aa99e6b 3677 }
5ff6fc6d 3678 }
1aa99e6b
IH
3679 locinput = l;
3680 nextchr = UCHARAT(locinput);
3681 break;
3682 }
bc517b45 3683 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3684 /* Inline the first character, for speed. */
3685 if (UCHARAT(s) != nextchr)
3686 sayNO;
24d3c4a9 3687 if (PL_regeol - locinput < ln)
d6a28714 3688 sayNO;
24d3c4a9 3689 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3690 sayNO;
24d3c4a9 3691 locinput += ln;
d6a28714
JH
3692 nextchr = UCHARAT(locinput);
3693 break;
95b24440 3694 }
9a5a5549 3695 case EXACTFL: {
a932d541 3696 re_fold_t folder;
9a5a5549
KW
3697 const U8 * fold_array;
3698 const char * s;
d513472c 3699 U32 fold_utf8_flags;
9a5a5549 3700
b8c5462f 3701 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3702 folder = foldEQ_locale;
3703 fold_array = PL_fold_locale;
17580e7a 3704 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3705 goto do_exactf;
3706
3c760661 3707 case EXACTFU_SS:
fab2782b 3708 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
3709 case EXACTFU:
3710 folder = foldEQ_latin1;
3711 fold_array = PL_fold_latin1;
2daa8fee 3712 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3713 goto do_exactf;
3714
2f7f8cb1
KW
3715 case EXACTFA:
3716 folder = foldEQ_latin1;
3717 fold_array = PL_fold_latin1;
57014d77 3718 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3719 goto do_exactf;
3720
9a5a5549
KW
3721 case EXACTF:
3722 folder = foldEQ;
3723 fold_array = PL_fold;
62bf7766 3724 fold_utf8_flags = 0;
9a5a5549
KW
3725
3726 do_exactf:
3727 s = STRING(scan);
24d3c4a9 3728 ln = STR_LEN(scan);
d6a28714 3729
3c760661
KW
3730 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3731 /* Either target or the pattern are utf8, or has the issue where
3732 * the fold lengths may differ. */
be8e71aa 3733 const char * const l = locinput;
d07ddd77 3734 char *e = PL_regeol;
bc517b45 3735
d513472c 3736 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3737 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3738 {
3739 sayNO;
5486206c 3740 }
d07ddd77
JH
3741 locinput = e;
3742 nextchr = UCHARAT(locinput);
3743 break;
a0ed51b3 3744 }
d6a28714 3745
0a138b74 3746 /* Neither the target nor the pattern are utf8 */
d6a28714 3747 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3748 UCHARAT(s) != fold_array[nextchr])
3749 {
a0ed51b3 3750 sayNO;
9a5a5549 3751 }
24d3c4a9 3752 if (PL_regeol - locinput < ln)
b8c5462f 3753 sayNO;
9a5a5549 3754 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3755 sayNO;
24d3c4a9 3756 locinput += ln;
d6a28714 3757 nextchr = UCHARAT(locinput);
a0d0e21e 3758 break;
9a5a5549 3759 }
63ac0dad
KW
3760
3761 /* XXX Could improve efficiency by separating these all out using a
3762 * macro or in-line function. At that point regcomp.c would no longer
3763 * have to set the FLAGS fields of these */
b2680017
YO
3764 case BOUNDL:
3765 case NBOUNDL:
3766 PL_reg_flags |= RF_tainted;
3767 /* FALL THROUGH */
3768 case BOUND:
63ac0dad 3769 case BOUNDU:
cfaf538b 3770 case BOUNDA:
b2680017 3771 case NBOUND:
63ac0dad 3772 case NBOUNDU:
cfaf538b 3773 case NBOUNDA:
b2680017 3774 /* was last char in word? */
f2e96b5d
KW
3775 if (utf8_target
3776 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3777 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3778 {
b2680017
YO
3779 if (locinput == PL_bostr)
3780 ln = '\n';
3781 else {
3782 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3783
3784 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3785 }
63ac0dad 3786 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3787 ln = isALNUM_uni(ln);
3788 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3789 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3790 }
3791 else {
3792 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3793 n = isALNUM_LC_utf8((U8*)locinput);
3794 }
3795 }
3796 else {
cfaf538b
KW
3797
3798 /* Here the string isn't utf8, or is utf8 and only ascii
3799 * characters are to match \w. In the latter case looking at
3800 * the byte just prior to the current one may be just the final
3801 * byte of a multi-byte character. This is ok. There are two
3802 * cases:
3803 * 1) it is a single byte character, and then the test is doing
3804 * just what it's supposed to.
3805 * 2) it is a multi-byte character, in which case the final
3806 * byte is never mistakable for ASCII, and so the test
3807 * will say it is not a word character, which is the
3808 * correct answer. */
b2680017
YO
3809 ln = (locinput != PL_bostr) ?
3810 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3811 switch (FLAGS(scan)) {
3812 case REGEX_UNICODE_CHARSET:
3813 ln = isWORDCHAR_L1(ln);
3814 n = isWORDCHAR_L1(nextchr);
3815 break;
3816 case REGEX_LOCALE_CHARSET:
3817 ln = isALNUM_LC(ln);
3818 n = isALNUM_LC(nextchr);
3819 break;
3820 case REGEX_DEPENDS_CHARSET:
3821 ln = isALNUM(ln);
3822 n = isALNUM(nextchr);
3823 break;
cfaf538b 3824 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3825 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3826 ln = isWORDCHAR_A(ln);
3827 n = isWORDCHAR_A(nextchr);
3828 break;
63ac0dad
KW
3829 default:
3830 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3831 break;
b2680017
YO
3832 }
3833 }
63ac0dad
KW
3834 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3835 * regcomp.sym */
3836 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3837 sayNO;
3838 break;
f56b6394 3839 case ANYOFV:
d6a28714 3840 case ANYOF:
f56b6394 3841 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3842 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3843 if (locinput >= PL_regeol)
3844 sayNO;
9e55ce06 3845
f2ed9b32 3846 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3847 sayNO;
b32d7d3e 3848 locinput += inclasslen;
b8c5462f 3849 nextchr = UCHARAT(locinput);
e0f9d4a8 3850 break;
ffc61ed2
JH
3851 }
3852 else {
3853 if (nextchr < 0)
3854 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3855 if (!nextchr && locinput >= PL_regeol)
3856 sayNO;
20ed0b26 3857 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3858 sayNO;
ffc61ed2 3859 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3860 break;
3861 }
b8c5462f 3862 break;
20d0b1e9 3863 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3864 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3865 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3866 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3867 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 3868 alnum, "a");
ee9a90b8
KW
3869
3870 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3871 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3872 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 3873 SPACEA, NSPACEA, isSPACE_A,
779d7b58 3874 space, " ");
ee9a90b8
KW
3875
3876 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3877 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 3878 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 3879 digit, "0");
20d0b1e9 3880
37e2e78e
KW
3881 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3882 a Unicode extended Grapheme Cluster */
3883 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3884 extended Grapheme Cluster is:
3885
3886 CR LF
3887 | Prepend* Begin Extend*
3888 | .
3889
3890 Begin is (Hangul-syllable | ! Control)
3891 Extend is (Grapheme_Extend | Spacing_Mark)
3892 Control is [ GCB_Control CR LF ]
3893
3894 The discussion below shows how the code for CLUMP is derived
3895 from this regex. Note that most of these concepts are from
3896 property values of the Grapheme Cluster Boundary (GCB) property.
3897 No code point can have multiple property values for a given
3898 property. Thus a code point in Prepend can't be in Control, but
3899 it must be in !Control. This is why Control above includes
3900 GCB_Control plus CR plus LF. The latter two are used in the GCB
3901 property separately, and so can't be in GCB_Control, even though
3902 they logically are controls. Control is not the same as gc=cc,
3903 but includes format and other characters as well.
3904
3905 The Unicode definition of Hangul-syllable is:
3906 L+
3907 | (L* ( ( V | LV ) V* | LVT ) T*)
3908 | T+
3909 )
3910 Each of these is a value for the GCB property, and hence must be
3911 disjoint, so the order they are tested is immaterial, so the
3912 above can safely be changed to
3913 T+
3914 | L+
3915 | (L* ( LVT | ( V | LV ) V*) T*)
3916
3917 The last two terms can be combined like this:
3918 L* ( L
3919 | (( LVT | ( V | LV ) V*) T*))
3920
3921 And refactored into this:
3922 L* (L | LVT T* | V V* T* | LV V* T*)
3923
3924 That means that if we have seen any L's at all we can quit
cc3b396d 3925 there, but if the next character is an LVT, a V, or an LV we
37e2e78e
KW
3926 should keep going.
3927
3928 There is a subtlety with Prepend* which showed up in testing.
3929 Note that the Begin, and only the Begin is required in:
3930 | Prepend* Begin Extend*
cc3b396d
KW
3931 Also, Begin contains '! Control'. A Prepend must be a
3932 '! Control', which means it must also be a Begin. What it
3933 comes down to is that if we match Prepend* and then find no
3934 suitable Begin afterwards, that if we backtrack the last
3935 Prepend, that one will be a suitable Begin.
37e2e78e
KW
3936 */
3937
b7c83a7e 3938 if (locinput >= PL_regeol)
a0ed51b3 3939 sayNO;
f2ed9b32 3940 if (! utf8_target) {
37e2e78e
KW
3941
3942 /* Match either CR LF or '.', as all the other possibilities
3943 * require utf8 */
3944 locinput++; /* Match the . or CR */
cc3b396d
KW
3945 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3946 match the LF */
37e2e78e
KW
3947 && locinput < PL_regeol
3948 && UCHARAT(locinput) == '\n') locinput++;
3949 }
3950 else {
3951
3952 /* Utf8: See if is ( CR LF ); already know that locinput <
3953 * PL_regeol, so locinput+1 is in bounds */
3954 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3955 locinput += 2;
3956 }
3957 else {
3958 /* In case have to backtrack to beginning, then match '.' */
3959 char *starting = locinput;
3960
3961 /* In case have to backtrack the last prepend */
3962 char *previous_prepend = 0;
3963
3964 LOAD_UTF8_CHARCLASS_GCB();
3965
3966 /* Match (prepend)* */
3967 while (locinput < PL_regeol
3968 && swash_fetch(PL_utf8_X_prepend,
f2ed9b32 3969 (U8*)locinput, utf8_target))
37e2e78e
KW
3970 {
3971 previous_prepend = locinput;
3972 locinput += UTF8SKIP(locinput);
3973 }
3974
3975 /* As noted above, if we matched a prepend character, but
3976 * the next thing won't match, back off the last prepend we
3977 * matched, as it is guaranteed to match the begin */
3978 if (previous_prepend
3979 && (locinput >= PL_regeol
3980 || ! swash_fetch(PL_utf8_X_begin,
f2ed9b32 3981 (U8*)locinput, utf8_target)))
37e2e78e
KW
3982 {
3983 locinput = previous_prepend;
3984 }
3985
3986 /* Note that here we know PL_regeol > locinput, as we
3987 * tested that upon input to this switch case, and if we
3988 * moved locinput forward, we tested the result just above
3989 * and it either passed, or we backed off so that it will
3990 * now pass */
f2ed9b32 3991 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
37e2e78e
KW
3992
3993 /* Here did not match the required 'Begin' in the
3994 * second term. So just match the very first
3995 * character, the '.' of the final term of the regex */
3996 locinput = starting + UTF8SKIP(starting);
3997 } else {
3998
3999 /* Here is the beginning of a character that can have
4000 * an extender. It is either a hangul syllable, or a
4001 * non-control */
4002 if (swash_fetch(PL_utf8_X_non_hangul,
f2ed9b32 4003 (U8*)locinput, utf8_target))
37e2e78e
KW
4004 {
4005
4006 /* Here not a Hangul syllable, must be a
4007 * ('! * Control') */
4008 locinput += UTF8SKIP(locinput);
4009 } else {
4010
4011 /* Here is a Hangul syllable. It can be composed
4012 * of several individual characters. One
4013 * possibility is T+ */
4014 if (swash_fetch(PL_utf8_X_T,
f2ed9b32 4015 (U8*)locinput, utf8_target))
37e2e78e
KW
4016 {
4017 while (locinput < PL_regeol
4018 && swash_fetch(PL_utf8_X_T,
f2ed9b32 4019 (U8*)locinput, utf8_target))
37e2e78e
KW
4020 {
4021 locinput += UTF8SKIP(locinput);
4022 }
4023 } else {
4024
4025 /* Here, not T+, but is a Hangul. That means
4026 * it is one of the others: L, LV, LVT or V,
4027 * and matches:
4028 * L* (L | LVT T* | V V* T* | LV V* T*) */
4029
4030 /* Match L* */
4031 while (locinput < PL_regeol
4032 && swash_fetch(PL_utf8_X_L,
f2ed9b32 4033 (U8*)locinput, utf8_target))
37e2e78e
KW
4034 {
4035 locinput += UTF8SKIP(locinput);
4036 }
4037
4038 /* Here, have exhausted L*. If the next
4039 * character is not an LV, LVT nor V, it means
4040 * we had to have at least one L, so matches L+
4041 * in the original equation, we have a complete
4042 * hangul syllable. Are done. */
4043
4044 if (locinput < PL_regeol
4045 && swash_fetch(PL_utf8_X_LV_LVT_V,
f2ed9b32 4046 (U8*)locinput, utf8_target))
37e2e78e
KW
4047 {
4048
4049 /* Otherwise keep going. Must be LV, LVT
4050 * or V. See if LVT */
4051 if (swash_fetch(PL_utf8_X_LVT,
f2ed9b32 4052 (U8*)locinput, utf8_target))
37e2e78e
KW
4053 {
4054 locinput += UTF8SKIP(locinput);
4055 } else {
4056
4057 /* Must be V or LV. Take it, then
4058 * match V* */
4059 locinput += UTF8SKIP(locinput);
4060 while (locinput < PL_regeol
4061 && swash_fetch(PL_utf8_X_V,
f2ed9b32 4062 (U8*)locinput, utf8_target))
37e2e78e
KW
4063 {
4064 locinput += UTF8SKIP(locinput);
4065 }
4066 }
4067
4068 /* And any of LV, LVT, or V can be followed
4069 * by T* */
4070 while (locinput < PL_regeol
4071 && swash_fetch(PL_utf8_X_T,
4072 (U8*)locinput,
f2ed9b32 4073 utf8_target))
37e2e78e
KW
4074 {
4075 locinput += UTF8SKIP(locinput);
4076 }
4077 }
4078 }
4079 }
4080
4081 /* Match any extender */
4082 while (locinput < PL_regeol
4083 && swash_fetch(PL_utf8_X_extend,
f2ed9b32 4084 (U8*)locinput, utf8_target))
37e2e78e
KW
4085 {
4086 locinput += UTF8SKIP(locinput);
4087 }
4088 }
4089 }
4090 if (locinput > PL_regeol) sayNO;
4091 }
a0ed51b3
LW
4092 nextchr = UCHARAT(locinput);
4093 break;
81714fb9
YO
4094
4095 case NREFFL:
d7ef4b73
KW
4096 { /* The capture buffer cases. The ones beginning with N for the
4097 named buffers just convert to the equivalent numbered and
4098 pretend they were called as the corresponding numbered buffer
4099 op. */
26ecd678
TC
4100 /* don't initialize these in the declaration, it makes C++
4101 unhappy */
81714fb9 4102 char *s;
ff1157ca 4103 char type;
8368298a
TC
4104 re_fold_t folder;
4105 const U8 *fold_array;
26ecd678 4106 UV utf8_fold_flags;
8368298a 4107
81714fb9 4108 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4109 folder = foldEQ_locale;
4110 fold_array = PL_fold_locale;
4111 type = REFFL;
17580e7a 4112 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4113 goto do_nref;
4114
2f7f8cb1
KW
4115 case NREFFA:
4116 folder = foldEQ_latin1;
4117 fold_array = PL_fold_latin1;
4118 type = REFFA;
4119 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4120 goto do_nref;
4121
d7ef4b73
KW
4122 case NREFFU:
4123 folder = foldEQ_latin1;
4124 fold_array = PL_fold_latin1;
4125 type = REFFU;
d513472c 4126 utf8_fold_flags = 0;
d7ef4b73
KW
4127 goto do_nref;
4128
81714fb9 4129 case NREFF:
d7ef4b73
KW
4130 folder = foldEQ;
4131 fold_array = PL_fold;
4132 type = REFF;
d513472c 4133 utf8_fold_flags = 0;
d7ef4b73
KW
4134 goto do_nref;
4135
4136 case NREF:
4137 type = REF;
83d7b90b
KW
4138 folder = NULL;
4139 fold_array = NULL;
d513472c 4140 utf8_fold_flags = 0;
d7ef4b73
KW
4141 do_nref:
4142
4143 /* For the named back references, find the corresponding buffer
4144 * number */
0a4db386
YO
4145 n = reg_check_named_buff_matched(rex,scan);
4146
d7ef4b73 4147 if ( ! n ) {
81714fb9 4148 sayNO;
d7ef4b73
KW
4149 }
4150 goto do_nref_ref_common;
4151
c8756f30 4152 case REFFL:
3280af22 4153 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4154 folder = foldEQ_locale;
4155 fold_array = PL_fold_locale;
17580e7a 4156 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4157 goto do_ref;
4158
2f7f8cb1
KW
4159 case REFFA:
4160 folder = foldEQ_latin1;
4161 fold_array = PL_fold_latin1;
4162 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4163 goto do_ref;
4164
d7ef4b73
KW
4165 case REFFU:
4166 folder = foldEQ_latin1;
4167 fold_array = PL_fold_latin1;
d513472c 4168 utf8_fold_flags = 0;
d7ef4b73
KW
4169 goto do_ref;
4170
4171 case REFF:
4172 folder = foldEQ;
4173 fold_array = PL_fold;
d513472c 4174 utf8_fold_flags = 0;
83d7b90b 4175 goto do_ref;
d7ef4b73 4176
c277df42 4177 case REF:
83d7b90b
KW
4178 folder = NULL;
4179 fold_array = NULL;
d513472c 4180 utf8_fold_flags = 0;
83d7b90b 4181
d7ef4b73 4182 do_ref:
81714fb9 4183 type = OP(scan);
d7ef4b73
KW
4184 n = ARG(scan); /* which paren pair */
4185
4186 do_nref_ref_common:
b93070ed 4187 ln = rex->offs[n].start;
2c2d71f5 4188 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4189 if (rex->lastparen < n || ln == -1)
af3f8c16 4190 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4191 if (ln == rex->offs[n].end)
a0d0e21e 4192 break;
a0ed51b3 4193
24d3c4a9 4194 s = PL_bostr + ln;
d7ef4b73 4195 if (type != REF /* REF can do byte comparison */
2f65c56d 4196 && (utf8_target || type == REFFU))
d7ef4b73
KW
4197 { /* XXX handle REFFL better */
4198 char * limit = PL_regeol;
4199
4200 /* This call case insensitively compares the entire buffer
4201 * at s, with the current input starting at locinput, but
4202 * not going off the end given by PL_regeol, and returns in
4203 * limit upon success, how much of the current input was
4204 * matched */
b93070ed 4205 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4206 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4207 {
4208 sayNO;
a0ed51b3 4209 }
d7ef4b73 4210 locinput = limit;
a0ed51b3
LW
4211 nextchr = UCHARAT(locinput);
4212 break;
4213 }
4214
d7ef4b73 4215 /* Not utf8: Inline the first character, for speed. */
76e3520e 4216 if (UCHARAT(s) != nextchr &&
81714fb9 4217 (type == REF ||
d7ef4b73 4218 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4219 sayNO;
b93070ed 4220 ln = rex->offs[n].end - ln;
24d3c4a9 4221 if (locinput + ln > PL_regeol)
4633a7c4 4222 sayNO;
81714fb9 4223 if (ln > 1 && (type == REF
24d3c4a9 4224 ? memNE(s, locinput, ln)
d7ef4b73 4225 : ! folder(s, locinput, ln)))
4633a7c4 4226 sayNO;
24d3c4a9 4227 locinput += ln;
76e3520e 4228 nextchr = UCHARAT(locinput);
a0d0e21e 4229 break;
81714fb9 4230 }
a0d0e21e 4231 case NOTHING:
c277df42 4232 case TAIL:
a0d0e21e
LW
4233 break;
4234 case BACK:
4235 break;
40a82448
DM
4236
4237#undef ST
4238#define ST st->u.eval
c277df42 4239 {
c277df42 4240 SV *ret;
d2f13c59 4241 REGEXP *re_sv;
6bda09f9 4242 regexp *re;
f8fc2ecf 4243 regexp_internal *rei;
1a147d38
YO
4244 regnode *startpoint;
4245
4246 case GOSTART:
e7707071
YO
4247 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4248 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4249 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4250 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4251 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4252 Perl_croak(aTHX_
4253 "Pattern subroutine nesting without pos change"
4254 " exceeded limit in regex");
6bda09f9
YO
4255 } else {
4256 nochange_depth = 0;
1a147d38 4257 }
288b8c02 4258 re_sv = rex_sv;
6bda09f9 4259 re = rex;
f8fc2ecf 4260 rei = rexi;
1a147d38 4261 if (OP(scan)==GOSUB) {
6bda09f9
YO
4262 startpoint = scan + ARG2L(scan);
4263 ST.close_paren = ARG(scan);
4264 } else {
f8fc2ecf 4265 startpoint = rei->program+1;
6bda09f9
YO
4266 ST.close_paren = 0;
4267 }
4268 goto eval_recurse_doit;
118e2215 4269 assert(0); /* NOTREACHED */
6bda09f9
YO
4270 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4271 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4272 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4273 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4274 } else {
4275 nochange_depth = 0;
4276 }
8e5e9ebe 4277 {
4aabdb9b 4278 /* execute the code in the {...} */
81ed78b2 4279
4aabdb9b 4280 dSP;
81ed78b2 4281 SV ** before;
1f4d1a1e 4282 OP * const oop = PL_op;
4aabdb9b 4283 COP * const ocurcop = PL_curcop;
81ed78b2 4284 OP *nop;
d80618d2 4285 char *saved_regeol = PL_regeol;
91332126 4286 struct re_save_state saved_state;
81ed78b2 4287 CV *newcv;
91332126 4288
74088413
DM
4289 /* save *all* paren positions */
4290 regcppush(rex, 0);
4291 REGCP_SET(runops_cp);
4292
6562f1c4 4293 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4294 * eval we would normally put it on the save stack, like with
4295 * save_re_context. However, re-evals have a weird scoping so we
4296 * can't just add ENTER/LEAVE here. With that, things like
4297 *
4298 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4299 *
4300 * would break, as they expect the localisation to be unwound
4301 * only when the re-engine backtracks through the bit that
4302 * localised it.
4303 *
4304 * What we do instead is just saving the state in a local c
4305 * variable.
4306 */
91332126 4307 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4308
d24ca0c5 4309 PL_reg_state.re_reparsing = FALSE;
91332126 4310
81ed78b2
DM
4311 if (!caller_cv)
4312 caller_cv = find_runcv(NULL);
4313
4aabdb9b 4314 n = ARG(scan);
81ed78b2 4315
b30fcab9 4316 if (rexi->data->what[n] == 'r') { /* code from an external qr */
81ed78b2 4317 newcv = ((struct regexp *)SvANY(
b30fcab9
DM
4318 (REGEXP*)(rexi->data->data[n])
4319 ))->qr_anoncv
81ed78b2
DM
4320 ;
4321 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4322 }
4323 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4324 newcv = caller_cv;
4325 nop = (OP*)rexi->data->data[n];
4326 assert(CvDEPTH(newcv));
68e2671b
DM
4327 }
4328 else {
d24ca0c5
DM
4329 /* literal with own CV */
4330 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4331 newcv = rex->qr_anoncv;
4332 nop = (OP*)rexi->data->data[n];
68e2671b 4333 }
81ed78b2 4334
0e458318
DM
4335 /* normally if we're about to execute code from the same
4336 * CV that we used previously, we just use the existing
4337 * CX stack entry. However, its possible that in the
4338 * meantime we may have backtracked, popped from the save
4339 * stack, and undone the SAVECOMPPAD(s) associated with
4340 * PUSH_MULTICALL; in which case PL_comppad no longer
4341 * points to newcv's pad. */
4342 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4343 {
4344 I32 depth = (newcv == caller_cv) ? 0 : 1;
4345 if (last_pushed_cv) {
4346 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4347 }
4348 else {
4349 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4350 }
4351 last_pushed_cv = newcv;
4352 }
4353 last_pad = PL_comppad;
4354
2e2e3f36
DM
4355 /* the initial nextstate you would normally execute
4356 * at the start of an eval (which would cause error
4357 * messages to come from the eval), may be optimised
4358 * away from the execution path in the regex code blocks;
4359 * so manually set PL_curcop to it initially */
4360 {
81ed78b2 4361 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4362 assert(o->op_type == OP_NULL);
4363 if (o->op_targ == OP_SCOPE) {
4364 o = cUNOPo->op_first;
4365 }
4366 else {
4367 assert(o->op_targ == OP_LEAVE);
4368 o = cUNOPo->op_first;
4369 assert(o->op_type == OP_ENTER);
4370 o = o->op_sibling;
4371 }
4372
4373 if (o->op_type != OP_STUB) {
4374 assert( o->op_type == OP_NEXTSTATE
4375 || o->op_type == OP_DBSTATE
4376 || (o->op_type == OP_NULL
4377 && ( o->op_targ == OP_NEXTSTATE
4378 || o->op_targ == OP_DBSTATE
4379 )
4380 )
4381 );
4382 PL_curcop = (COP*)o;
4383 }
4384 }
81ed78b2 4385 nop = nop->op_next;
2e2e3f36 4386
24b23f37 4387 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4388 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4389
b93070ed 4390 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4391
2bf803e2
YO
4392 if (sv_yes_mark) {
4393 SV *sv_mrk = get_sv("REGMARK", 1);
4394 sv_setsv(sv_mrk, sv_yes_mark);
4395 }
4396
81ed78b2
DM
4397 /* we don't use MULTICALL here as we want to call the
4398 * first op of the block of interest, rather than the
4399 * first op of the sub */
4400 before = SP;
4401 PL_op = nop;
8e5e9ebe
RGS
4402 CALLRUNOPS(aTHX); /* Scalar context. */
4403 SPAGAIN;
4404 if (SP == before)
075aa684 4405 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4406 else {
4407 ret = POPs;
4408 PUTBACK;
4409 }
4aabdb9b 4410
e4bfbed3
DM
4411 /* before restoring everything, evaluate the returned
4412 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4413 * PL_op or pad. Also need to process any magic vars
4414 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4415
4416 PL_op = NULL;
4417
5e98dac2 4418 re_sv = NULL;
e4bfbed3
DM
4419 if (logical == 0) /* (?{})/ */
4420 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4421 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4422 sw = cBOOL(SvTRUE(ret));
4423 logical = 0;
4424 }
4425 else { /* /(??{}) */
497d0a96
DM
4426 /* if its overloaded, let the regex compiler handle
4427 * it; otherwise extract regex, or stringify */
4428 if (!SvAMAGIC(ret)) {
4429 SV *sv = ret;
4430 if (SvROK(sv))
4431 sv = SvRV(sv);
4432 if (SvTYPE(sv) == SVt_REGEXP)
4433 re_sv = (REGEXP*) sv;
4434 else if (SvSMAGICAL(sv)) {
4435 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4436 if (mg)
4437 re_sv = (REGEXP *) mg->mg_obj;
4438 }
e4bfbed3 4439
497d0a96
DM
4440 /* force any magic, undef warnings here */
4441 if (!re_sv) {
4442 ret = sv_mortalcopy(ret);
4443 (void) SvPV_force_nolen(ret);
4444 }
e4bfbed3
DM
4445 }
4446
4447 }
4448
91332126
FR
4449 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4450
81ed78b2
DM
4451 /* *** Note that at this point we don't restore
4452 * PL_comppad, (or pop the CxSUB) on the assumption it may
4453 * be used again soon. This is safe as long as nothing
4454 * in the regexp code uses the pad ! */
4aabdb9b 4455 PL_op = oop;
4aabdb9b 4456 PL_curcop = ocurcop;
d80618d2 4457 PL_regeol = saved_regeol;
e4bfbed3
DM
4458 S_regcp_restore(aTHX_ rex, runops_cp);
4459
4460 if (logical != 2)
4aabdb9b 4461 break;
8e5e9ebe 4462 }
e4bfbed3
DM
4463
4464 /* only /(??{})/ from now on */
24d3c4a9 4465 logical = 0;
4aabdb9b 4466 {
4f639d21
DM
4467 /* extract RE object from returned value; compiling if
4468 * necessary */
5c35adbb 4469
575c37f6
DM
4470 if (re_sv) {
4471 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4472 }
0f5d15d6 4473 else {
c737faaf 4474 U32 pm_flags = 0;
a3b680e6 4475 const I32 osize = PL_regsize;
0f5d15d6 4476
9753d940
DM
4477 if (SvUTF8(ret) && IN_BYTES) {
4478 /* In use 'bytes': make a copy of the octet
4479 * sequence, but without the flag on */
b9ad30b4
NC
4480 STRLEN len;
4481 const char *const p = SvPV(ret, len);
4482 ret = newSVpvn_flags(p, len, SVs_TEMP);
4483 }
732caac7
DM
4484 if (rex->intflags & PREGf_USE_RE_EVAL)
4485 pm_flags |= PMf_USE_RE_EVAL;
4486
4487 /* if we got here, it should be an engine which
4488 * supports compiling code blocks and stuff */
4489 assert(rex->engine && rex->engine->op_comp);
ec841a27 4490 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4491 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4492 rex->engine, NULL, NULL,
4493 /* copy /msix etc to inner pattern */
4494 scan->flags,
4495 pm_flags);
732caac7 4496
9041c2e3 4497 if (!(SvFLAGS(ret)
faf82a0b 4498 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4499 | SVs_GMG))) {
a2794585
NC
4500 /* This isn't a first class regexp. Instead, it's
4501 caching a regexp onto an existing, Perl visible
4502 scalar. */
575c37f6 4503 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4504 }
0f5d15d6 4505 PL_regsize = osize;
74088413
DM
4506 /* safe to do now that any $1 etc has been
4507 * interpolated into the new pattern string and
4508 * compiled */
4509 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4510 }
575c37f6 4511 re = (struct regexp *)SvANY(re_sv);
4aabdb9b 4512 }
07bc277f 4513 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4514 re->subbeg = rex->subbeg;
4515 re->sublen = rex->sublen;
f8fc2ecf 4516 rei = RXi_GET(re);
6bda09f9 4517 DEBUG_EXECUTE_r(
f2ed9b32 4518 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4519 "Matching embedded");
4520 );
f8fc2ecf 4521 startpoint = rei->program + 1;
1a147d38 4522 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4523
1a147d38 4524 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4525 /* run the pattern returned from (??{...}) */
b93070ed 4526 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4527 REGCP_SET(ST.lastcp);
6bda09f9 4528
0357f1fd
ML
4529 re->lastparen = 0;
4530 re->lastcloseparen = 0;
4531
4aabdb9b 4532 PL_reginput = locinput;
ae0beba1 4533 PL_regsize = 0;
4aabdb9b
DM
4534
4535 /* XXXX This is too dramatic a measure... */
4536 PL_reg_maxiter = 0;
4537
faec1544 4538 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4539 if (RX_UTF8(re_sv))
faec1544
DM
4540 PL_reg_flags |= RF_utf8;
4541 else
4542 PL_reg_flags &= ~RF_utf8;
4543 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4544
288b8c02 4545 ST.prev_rex = rex_sv;
faec1544 4546 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
4547 rex_sv = re_sv;
4548 SET_reg_curpm(rex_sv);
288b8c02 4549 rex = re;
f8fc2ecf 4550 rexi = rei;
faec1544 4551 cur_curlyx = NULL;
40a82448 4552 ST.B = next;
faec1544
DM
4553 ST.prev_eval = cur_eval;
4554 cur_eval = st;
faec1544 4555 /* now continue from first node in postoned RE */
6bda09f9 4556 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
118e2215 4557 assert(0); /* NOTREACHED */
c277df42 4558 }
40a82448 4559
faec1544
DM
4560 case EVAL_AB: /* cleanup after a successful (??{A})B */
4561 /* note: this is called twice; first after popping B, then A */
4562 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4563 rex_sv = ST.prev_rex;
4564 SET_reg_curpm(rex_sv);
288b8c02 4565 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4566 rexi = RXi_GET(rex);
faec1544
DM
4567 regcpblow(ST.cp);
4568 cur_eval = ST.prev_eval;
4569 cur_curlyx = ST.prev_curlyx;
34a81e2b 4570
40a82448
DM
4571 /* XXXX This is too dramatic a measure... */
4572 PL_reg_maxiter = 0;
e7707071 4573 if ( nochange_depth )
4b196cd4 4574 nochange_depth--;
262b90c4 4575 sayYES;
40a82448 4576
40a82448 4577
faec1544
DM
4578 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4579 /* note: this is called twice; first after popping B, then A */
4580 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4581 rex_sv = ST.prev_rex;
4582 SET_reg_curpm(rex_sv);
288b8c02 4583 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4584 rexi = RXi_GET(rex);
0357f1fd 4585
40a82448
DM
4586 PL_reginput = locinput;
4587 REGCP_UNWIND(ST.lastcp);
4588 regcppop(rex);
faec1544
DM
4589 cur_eval = ST.prev_eval;
4590 cur_curlyx = ST.prev_curlyx;
4591 /* XXXX This is too dramatic a measure... */
4592 PL_reg_maxiter = 0;
e7707071 4593 if ( nochange_depth )
4b196cd4 4594 nochange_depth--;
40a82448 4595 sayNO_SILENT;
40a82448
DM
4596#undef ST
4597
a0d0e21e 4598 case OPEN:
c277df42 4599 n = ARG(scan); /* which paren pair */
1ca2007e 4600 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
4601 if (n > PL_regsize)
4602 PL_regsize = n;
495f47a5
DM
4603 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4604 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4605 PTR2UV(rex),
4606 PTR2UV(rex->offs),
4607 (UV)n,
4608 (IV)rex->offs[n].start_tmp,
4609 (UV)PL_regsize
4610 ));
e2e6a0f1 4611 lastopen = n;
a0d0e21e 4612 break;
495f47a5
DM
4613
4614/* XXX really need to log other places start/end are set too */
4615#define CLOSE_CAPTURE \
4616 rex->offs[n].start = rex->offs[n].start_tmp; \
4617 rex->offs[n].end = locinput - PL_bostr; \
4618 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4619 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4620 PTR2UV(rex), \
4621 PTR2UV(rex->offs), \
4622 (UV)n, \
4623 (IV)rex->offs[n].start, \
4624 (IV)rex->offs[n].end \
4625 ))
4626
a0d0e21e 4627 case CLOSE:
c277df42 4628 n = ARG(scan); /* which paren pair */
495f47a5 4629 CLOSE_CAPTURE;
7f69552c
YO
4630 /*if (n > PL_regsize)
4631 PL_regsize = n;*/
b93070ed
DM
4632 if (n > rex->lastparen)
4633 rex->lastparen = n;
4634 rex->lastcloseparen = n;
3b6647e0 4635 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4636 goto fake_end;
4637 }
a0d0e21e 4638 break;
e2e6a0f1
YO
4639 case ACCEPT:
4640 if (ARG(scan)){
4641 regnode *cursor;
4642 for (cursor=scan;
4643 cursor && OP(cursor)!=END;
4644 cursor=regnext(cursor))
4645 {
4646 if ( OP(cursor)==CLOSE ){
4647 n = ARG(cursor);
4648 if ( n <= lastopen ) {
495f47a5 4649 CLOSE_CAPTURE;
e2e6a0f1
YO
4650 /*if (n > PL_regsize)
4651 PL_regsize = n;*/
b93070ed
DM
4652 if (n > rex->lastparen)
4653 rex->lastparen = n;
4654 rex->lastcloseparen = n;
3b6647e0
RB
4655 if ( n == ARG(scan) || (cur_eval &&
4656 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4657 break;
4658 }
4659 }
4660 }
4661 }
4662 goto fake_end;
4663 /*NOTREACHED*/
c277df42
IZ
4664 case GROUPP:
4665 n = ARG(scan); /* which paren pair */
b93070ed 4666 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 4667 break;
0a4db386
YO
4668 case NGROUPP:
4669 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4670 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4671 break;
1a147d38 4672 case INSUBP:
0a4db386 4673 n = ARG(scan);
3b6647e0 4674 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4675 break;
4676 case DEFINEP:
4677 sw = 0;
4678 break;
c277df42 4679 case IFTHEN:
2c2d71f5 4680 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4681 if (sw)
c277df42
IZ
4682 next = NEXTOPER(NEXTOPER(scan));
4683 else {
4684 next = scan + ARG(scan);
4685 if (OP(next) == IFTHEN) /* Fake one. */
4686 next = NEXTOPER(NEXTOPER(next));
4687 }
4688 break;
4689 case LOGICAL:
24d3c4a9 4690 logical = scan->flags;
c277df42 4691 break;
c476f425 4692
2ab05381 4693/*******************************************************************
2ab05381 4694
c476f425
DM
4695The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4696pattern, where A and B are subpatterns. (For simple A, CURLYM or
4697STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4698
c476f425 4699A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4700
c476f425
DM
4701On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4702state, which contains the current count, initialised to -1. It also sets
4703cur_curlyx to point to this state, with any previous value saved in the
4704state block.
2ab05381 4705
c476f425
DM
4706CURLYX then jumps straight to the WHILEM op, rather than executing A,
4707since the pattern may possibly match zero times (i.e. it's a while {} loop
4708rather than a do {} while loop).
2ab05381 4709
c476f425
DM
4710Each entry to WHILEM represents a successful match of A. The count in the
4711CURLYX block is incremented, another WHILEM state is pushed, and execution
4712passes to A or B depending on greediness and the current count.
2ab05381 4713
c476f425
DM
4714For example, if matching against the string a1a2a3b (where the aN are
4715substrings that match /A/), then the match progresses as follows: (the
4716pushed states are interspersed with the bits of strings matched so far):
2ab05381 4717
c476f425
DM
4718 <CURLYX cnt=-1>
4719 <CURLYX cnt=0><WHILEM>
4720 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4721 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4722 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4723 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4724
c476f425
DM
4725(Contrast this with something like CURLYM, which maintains only a single
4726backtrack state:
2ab05381 4727
c476f425
DM
4728 <CURLYM cnt=0> a1
4729 a1 <CURLYM cnt=1> a2
4730 a1 a2 <CURLYM cnt=2> a3
4731 a1 a2 a3 <CURLYM cnt=3> b
4732)
2ab05381 4733
c476f425
DM
4734Each WHILEM state block marks a point to backtrack to upon partial failure
4735of A or B, and also contains some minor state data related to that
4736iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4737overall state, such as the count, and pointers to the A and B ops.
2ab05381 4738
c476f425
DM
4739This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4740must always point to the *current* CURLYX block, the rules are:
2ab05381 4741
c476f425
DM
4742When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4743and set cur_curlyx to point the new block.
2ab05381 4744
c476f425
DM
4745When popping the CURLYX block after a successful or unsuccessful match,
4746restore the previous cur_curlyx.
2ab05381 4747
c476f425
DM
4748When WHILEM is about to execute B, save the current cur_curlyx, and set it
4749to the outer one saved in the CURLYX block.
2ab05381 4750
c476f425
DM
4751When popping the WHILEM block after a successful or unsuccessful B match,
4752restore the previous cur_curlyx.
2ab05381 4753
c476f425
DM
4754Here's an example for the pattern (AI* BI)*BO
4755I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4756
c476f425
DM
4757cur_
4758curlyx backtrack stack
4759------ ---------------
4760NULL
4761CO <CO prev=NULL> <WO>
4762CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4763CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4764NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4765
c476f425
DM
4766At this point the pattern succeeds, and we work back down the stack to
4767clean up, restoring as we go:
95b24440 4768
c476f425
DM
4769CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4770CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4771CO <CO prev=NULL> <WO>
4772NULL
a0374537 4773
c476f425
DM
4774*******************************************************************/
4775
4776#define ST st->u.curlyx
4777
4778 case CURLYX: /* start of /A*B/ (for complex A) */
4779 {
4780 /* No need to save/restore up to this paren */
4781 I32 parenfloor = scan->flags;
4782
4783 assert(next); /* keep Coverity happy */
4784 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4785 next += ARG(next);
4786
4787 /* XXXX Probably it is better to teach regpush to support
4788 parenfloor > PL_regsize... */
b93070ed
DM
4789 if (parenfloor > (I32)rex->lastparen)
4790 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
4791
4792 ST.prev_curlyx= cur_curlyx;
4793 cur_curlyx = st;
4794 ST.cp = PL_savestack_ix;
4795
4796 /* these fields contain the state of the current curly.
4797 * they are accessed by subsequent WHILEMs */
4798 ST.parenfloor = parenfloor;
d02d6d97 4799 ST.me = scan;
c476f425 4800 ST.B = next;
24d3c4a9
DM
4801 ST.minmod = minmod;
4802 minmod = 0;
c476f425
DM
4803 ST.count = -1; /* this will be updated by WHILEM */
4804 ST.lastloc = NULL; /* this will be updated by WHILEM */
4805
4806 PL_reginput = locinput;
4807 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
118e2215 4808 assert(0); /* NOTREACHED */
c476f425 4809 }
a0d0e21e 4810
c476f425 4811 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4812 cur_curlyx = ST.prev_curlyx;
4813 sayYES;
118e2215 4814 assert(0); /* NOTREACHED */
a0d0e21e 4815
c476f425
DM
4816 case CURLYX_end_fail: /* just failed to match all of A*B */
4817 regcpblow(ST.cp);
4818 cur_curlyx = ST.prev_curlyx;
4819 sayNO;
118e2215 4820 assert(0); /* NOTREACHED */
4633a7c4 4821
a0d0e21e 4822
c476f425
DM
4823#undef ST
4824#define ST st->u.whilem
4825
4826 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4827 {
4828 /* see the discussion above about CURLYX/WHILEM */
c476f425 4829 I32 n;
d02d6d97
DM
4830 int min = ARG1(cur_curlyx->u.curlyx.me);
4831 int max = ARG2(cur_curlyx->u.curlyx.me);
4832 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4833
c476f425
DM
4834 assert(cur_curlyx); /* keep Coverity happy */
4835 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4836 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4837 ST.cache_offset = 0;
4838 ST.cache_mask = 0;
4839
4840 PL_reginput = locinput;
4841
4842 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4843 "%*s whilem: matched %ld out of %d..%d\n",
4844 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4845 );
a0d0e21e 4846
c476f425 4847 /* First just match a string of min A's. */
a0d0e21e 4848
d02d6d97 4849 if (n < min) {
b93070ed 4850 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 4851 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4852 REGCP_SET(ST.lastcp);
4853
d02d6d97 4854 PUSH_STATE_GOTO(WHILEM_A_pre, A);
118e2215 4855 assert(0); /* NOTREACHED */
c476f425
DM
4856 }
4857
4858 /* If degenerate A matches "", assume A done. */
4859
4860 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4861 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4862 "%*s whilem: empty match detected, trying continuation...\n",
4863 REPORT_CODE_OFF+depth*2, "")
4864 );
4865 goto do_whilem_B_max;
4866 }
4867
4868 /* super-linear cache processing */
4869
4870 if (scan->flags) {
a0d0e21e 4871
2c2d71f5 4872 if (!PL_reg_maxiter) {
c476f425
DM
4873 /* start the countdown: Postpone detection until we
4874 * know the match is not *that* much linear. */
2c2d71f5 4875 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4876 /* possible overflow for long strings and many CURLYX's */
4877 if (PL_reg_maxiter < 0)
4878 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4879 PL_reg_leftiter = PL_reg_maxiter;
4880 }
c476f425 4881
2c2d71f5 4882 if (PL_reg_leftiter-- == 0) {
c476f425 4883 /* initialise cache */
3298f257 4884 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4885 if (PL_reg_poscache) {
eb160463 4886 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4887 Renew(PL_reg_poscache, size, char);
4888 PL_reg_poscache_size = size;
4889 }
4890 Zero(PL_reg_poscache, size, char);
4891 }
4892 else {
4893 PL_reg_poscache_size = size;
a02a5408 4894 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4895 }
c476f425
DM
4896 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4897 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4898 PL_colors[4], PL_colors[5])
4899 );
2c2d71f5 4900 }
c476f425 4901
2c2d71f5 4902 if (PL_reg_leftiter < 0) {
c476f425
DM
4903 /* have we already failed at this position? */
4904 I32 offset, mask;
4905 offset = (scan->flags & 0xf) - 1
4906 + (locinput - PL_bostr) * (scan->flags>>4);
4907 mask = 1 << (offset % 8);
4908 offset /= 8;
4909 if (PL_reg_poscache[offset] & mask) {
4910 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4911 "%*s whilem: (cache) already tried at this position...\n",
4912 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4913 );
3298f257 4914 sayNO; /* cache records failure */
2c2d71f5 4915 }
c476f425
DM
4916 ST.cache_offset = offset;
4917 ST.cache_mask = mask;
2c2d71f5 4918 }
c476f425 4919 }
2c2d71f5 4920
c476f425 4921 /* Prefer B over A for minimal matching. */
a687059c 4922
c476f425
DM
4923 if (cur_curlyx->u.curlyx.minmod) {
4924 ST.save_curlyx = cur_curlyx;
4925 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 4926 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425
DM
4927 REGCP_SET(ST.lastcp);
4928 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
118e2215 4929 assert(0); /* NOTREACHED */
c476f425 4930 }
a0d0e21e 4931
c476f425
DM
4932 /* Prefer A over B for maximal matching. */
4933
d02d6d97 4934 if (n < max) { /* More greed allowed? */
b93070ed 4935 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
4936 cur_curlyx->u.curlyx.lastloc = locinput;
4937 REGCP_SET(ST.lastcp);
d02d6d97 4938 PUSH_STATE_GOTO(WHILEM_A_max, A);
118e2215 4939 assert(0); /* NOTREACHED */
c476f425
DM
4940 }
4941 goto do_whilem_B_max;
4942 }
118e2215 4943 assert(0); /* NOTREACHED */
c476f425
DM
4944
4945 case WHILEM_B_min: /* just matched B in a minimal match */
4946 case WHILEM_B_max: /* just matched B in a maximal match */
4947 cur_curlyx = ST.save_curlyx;
4948 sayYES;
118e2215 4949 assert(0); /* NOTREACHED */
c476f425
DM
4950
4951 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4952 cur_curlyx = ST.save_curlyx;
4953 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4954 cur_curlyx->u.curlyx.count--;
4955 CACHEsayNO;
118e2215 4956 assert(0); /* NOTREACHED */
c476f425
DM
4957
4958 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
4959 /* FALL THROUGH */
4960 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
4961 REGCP_UNWIND(ST.lastcp);
4962 regcppop(rex);
c476f425
DM
4963 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4964 cur_curlyx->u.curlyx.count--;
4965 CACHEsayNO;
118e2215 4966 assert(0); /* NOTREACHED */
c476f425
DM
4967
4968 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4969 REGCP_UNWIND(ST.lastcp);
4970 regcppop(rex); /* Restore some previous $<digit>s? */
4971 PL_reginput = locinput;
4972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4973 "%*s whilem: failed, trying continuation...\n",
4974 REPORT_CODE_OFF+depth*2, "")
4975 );
4976 do_whilem_B_max:
4977 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4978 && ckWARN(WARN_REGEXP)
4979 && !(PL_reg_flags & RF_warned))
4980 {
4981 PL_reg_flags |= RF_warned;
dcbac5bb
FC
4982 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4983 "Complex regular subexpression recursion limit (%d) "
4984 "exceeded",
c476f425
DM
4985 REG_INFTY - 1);
4986 }
4987
4988 /* now try B */
4989 ST.save_curlyx = cur_curlyx;
4990 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4991 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
118e2215 4992 assert(0); /* NOTREACHED */
c476f425
DM
4993
4994 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4995 cur_curlyx = ST.save_curlyx;
4996 REGCP_UNWIND(ST.lastcp);
4997 regcppop(rex);
4998
d02d6d97 4999 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5000 /* Maximum greed exceeded */
5001 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5002 && ckWARN(WARN_REGEXP)
5003 && !(PL_reg_flags & RF_warned))
5004 {
3280af22 5005 PL_reg_flags |= RF_warned;
c476f425 5006 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5007 "Complex regular subexpression recursion "
5008 "limit (%d) exceeded",
c476f425 5009 REG_INFTY - 1);
a0d0e21e 5010 }
c476f425 5011 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5012 CACHEsayNO;
a0d0e21e 5013 }
c476f425
DM
5014
5015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5016 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5017 );
5018 /* Try grabbing another A and see if it helps. */
5019 PL_reginput = locinput;
5020 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5021 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5022 REGCP_SET(ST.lastcp);
d02d6d97
DM
5023 PUSH_STATE_GOTO(WHILEM_A_min,
5024 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
118e2215 5025 assert(0); /* NOTREACHED */
40a82448
DM
5026
5027#undef ST
5028#define ST st->u.branch
5029
5030 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5031 next = scan + ARG(scan);
5032 if (next == scan)
5033 next = NULL;
40a82448
DM
5034 scan = NEXTOPER(scan);
5035 /* FALL THROUGH */
c277df42 5036
40a82448
DM
5037 case BRANCH: /* /(...|A|...)/ */
5038 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5039 ST.lastparen = rex->lastparen;
f6033a9d 5040 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5041 ST.next_branch = next;
5042 REGCP_SET(ST.cp);
5043 PL_reginput = locinput;
02db2b7b 5044
40a82448 5045 /* Now go into the branch */
5d458dd8
YO
5046 if (has_cutgroup) {
5047 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5048 } else {
5049 PUSH_STATE_GOTO(BRANCH_next, scan);
5050 }
118e2215 5051 assert(0); /* NOTREACHED */
5d458dd8
YO
5052 case CUTGROUP:
5053 PL_reginput = locinput;
5054 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5055 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8 5056 PUSH_STATE_GOTO(CUTGROUP_next,next);
118e2215 5057 assert(0); /* NOTREACHED */
5d458dd8
YO
5058 case CUTGROUP_next_fail:
5059 do_cutgroup = 1;
5060 no_final = 1;
5061 if (st->u.mark.mark_name)
5062 sv_commit = st->u.mark.mark_name;
5063 sayNO;
118e2215 5064 assert(0); /* NOTREACHED */
5d458dd8
YO
5065 case BRANCH_next:
5066 sayYES;
118e2215 5067 assert(0); /* NOTREACHED */
40a82448 5068 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5069 if (do_cutgroup) {
5070 do_cutgroup = 0;
5071 no_final = 0;
5072 }
40a82448 5073 REGCP_UNWIND(ST.cp);
a8d1f4b4 5074 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5075 scan = ST.next_branch;
5076 /* no more branches? */
5d458dd8
YO
5077 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5078 DEBUG_EXECUTE_r({
5079 PerlIO_printf( Perl_debug_log,
5080 "%*s %sBRANCH failed...%s\n",
5081 REPORT_CODE_OFF+depth*2, "",
5082 PL_colors[4],
5083 PL_colors[5] );
5084 });
5085 sayNO_SILENT;
5086 }
40a82448 5087 continue; /* execute next BRANCH[J] op */
118e2215 5088 assert(0); /* NOTREACHED */
40a82448 5089
a0d0e21e 5090 case MINMOD:
24d3c4a9 5091 minmod = 1;
a0d0e21e 5092 break;
40a82448
DM
5093
5094#undef ST
5095#define ST st->u.curlym
5096
5097 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5098
5099 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5100 * only a single backtracking state, no matter how many matches
40a82448
DM
5101 * there are in {m,n}. It relies on the pattern being constant
5102 * length, with no parens to influence future backrefs
5103 */
5104
5105 ST.me = scan;
dc45a647 5106 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5107
f6033a9d
DM
5108 ST.lastparen = rex->lastparen;
5109 ST.lastcloseparen = rex->lastcloseparen;
5110
40a82448
DM
5111 /* if paren positive, emulate an OPEN/CLOSE around A */
5112 if (ST.me->flags) {
3b6647e0 5113 U32 paren = ST.me->flags;
40a82448
DM
5114 if (paren > PL_regsize)
5115 PL_regsize = paren;
c277df42 5116 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5117 }
40a82448
DM
5118 ST.A = scan;
5119 ST.B = next;
5120 ST.alen = 0;
5121 ST.count = 0;
24d3c4a9
DM
5122 ST.minmod = minmod;
5123 minmod = 0;
40a82448
DM
5124 ST.c1 = CHRTEST_UNINIT;
5125 REGCP_SET(ST.cp);
6407bf3b 5126
40a82448
DM
5127 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5128 goto curlym_do_B;
5129
5130 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 5131 PL_reginput = locinput;
40a82448 5132 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
118e2215 5133 assert(0); /* NOTREACHED */
5f80c4cf 5134
40a82448
DM
5135 case CURLYM_A: /* we've just matched an A */
5136 locinput = st->locinput;
5137 nextchr = UCHARAT(locinput);
5138
5139 ST.count++;
5140 /* after first match, determine A's length: u.curlym.alen */
5141 if (ST.count == 1) {
5142 if (PL_reg_match_utf8) {
5143 char *s = locinput;
5144 while (s < PL_reginput) {
5145 ST.alen++;
5146 s += UTF8SKIP(s);
5147 }
5148 }
5149 else {
5150 ST.alen = PL_reginput - locinput;
5151 }
5152 if (ST.alen == 0)
5153 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5154 }
0cadcf80
DM
5155 DEBUG_EXECUTE_r(
5156 PerlIO_printf(Perl_debug_log,
40a82448 5157 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5158 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5159 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5160 );
5161
40a82448 5162 locinput = PL_reginput;
0a4db386
YO
5163
5164 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5165 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5166 goto fake_end;
5167
c966426a
DM
5168 {
5169 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5170 if ( max == REG_INFTY || ST.count < max )
5171 goto curlym_do_A; /* try to match another A */
5172 }
40a82448 5173 goto curlym_do_B; /* try to match B */
5f80c4cf 5174
40a82448
DM
5175 case CURLYM_A_fail: /* just failed to match an A */
5176 REGCP_UNWIND(ST.cp);
0a4db386
YO
5177
5178 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5179 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5180 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5181 sayNO;
0cadcf80 5182
40a82448
DM
5183 curlym_do_B: /* execute the B in /A{m,n}B/ */
5184 PL_reginput = locinput;
5185 if (ST.c1 == CHRTEST_UNINIT) {
5186 /* calculate c1 and c2 for possible match of 1st char
5187 * following curly */
5188 ST.c1 = ST.c2 = CHRTEST_VOID;
5189 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5190 regnode *text_node = ST.B;
5191 if (! HAS_TEXT(text_node))
5192 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5193 /* this used to be
5194
5195 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5196
5197 But the former is redundant in light of the latter.
5198
5199 if this changes back then the macro for
5200 IS_TEXT and friends need to change.
5201 */
5202 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5203 {
ee9b8eae 5204
40a82448 5205 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5206 switch (OP(text_node)) {
5207 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5208 case EXACTFA:
3c760661 5209 case EXACTFU_SS:
fab2782b 5210 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5211 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5212 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5213 default: ST.c2 = ST.c1;
5214 }
c277df42 5215 }
c277df42 5216 }
40a82448
DM
5217 }
5218
5219 DEBUG_EXECUTE_r(
5220 PerlIO_printf(Perl_debug_log,
5221 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5222 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5223 "", (IV)ST.count)
5224 );
5225 if (ST.c1 != CHRTEST_VOID
5226 && UCHARAT(PL_reginput) != ST.c1
5227 && UCHARAT(PL_reginput) != ST.c2)
5228 {
5229 /* simulate B failing */
3e901dc0
YO
5230 DEBUG_OPTIMISE_r(
5231 PerlIO_printf(Perl_debug_log,
5232 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5233 (int)(REPORT_CODE_OFF+(depth*2)),"",
5234 (IV)ST.c1,(IV)ST.c2
5235 ));
40a82448
DM
5236 state_num = CURLYM_B_fail;
5237 goto reenter_switch;
5238 }
5239
5240 if (ST.me->flags) {
f6033a9d 5241 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5242 I32 paren = ST.me->flags;
5243 if (ST.count) {
b93070ed 5244 rex->offs[paren].start
40a82448 5245 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
b93070ed 5246 rex->offs[paren].end = PL_reginput - PL_bostr;
f6033a9d
DM
5247 if ((U32)paren > rex->lastparen)
5248 rex->lastparen = paren;
5249 rex->lastcloseparen = paren;
c277df42 5250 }
40a82448 5251 else
b93070ed 5252 rex->offs[paren].end = -1;
0a4db386 5253 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5254 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5255 {
5256 if (ST.count)
5257 goto fake_end;
5258 else
5259 sayNO;
5260 }
c277df42 5261 }
0a4db386 5262
40a82448 5263 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
118e2215 5264 assert(0); /* NOTREACHED */
40a82448
DM
5265
5266 case CURLYM_B_fail: /* just failed to match a B */
5267 REGCP_UNWIND(ST.cp);
a8d1f4b4 5268 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5269 if (ST.minmod) {
84d2fa14
HS
5270 I32 max = ARG2(ST.me);
5271 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5272 sayNO;
5273 goto curlym_do_A; /* try to match a further A */
5274 }
5275 /* backtrack one A */
5276 if (ST.count == ARG1(ST.me) /* min */)
5277 sayNO;
5278 ST.count--;
5279 locinput = HOPc(locinput, -ST.alen);
5280 goto curlym_do_B; /* try to match B */
5281
c255a977
DM
5282#undef ST
5283#define ST st->u.curly
40a82448 5284
c255a977
DM
5285#define CURLY_SETPAREN(paren, success) \
5286 if (paren) { \
5287 if (success) { \
b93070ed
DM
5288 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5289 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5290 if (paren > rex->lastparen) \
5291 rex->lastparen = paren; \
b93070ed 5292 rex->lastcloseparen = paren; \
c255a977 5293 } \
f6033a9d 5294 else { \
b93070ed 5295 rex->offs[paren].end = -1; \
f6033a9d
DM
5296 rex->lastparen = ST.lastparen; \
5297 rex->lastcloseparen = ST.lastcloseparen; \
5298 } \
c255a977
DM
5299 }
5300
5301 case STAR: /* /A*B/ where A is width 1 */
5302 ST.paren = 0;
5303 ST.min = 0;
5304 ST.max = REG_INFTY;
a0d0e21e
LW
5305 scan = NEXTOPER(scan);
5306 goto repeat;
c255a977
DM
5307 case PLUS: /* /A+B/ where A is width 1 */
5308 ST.paren = 0;
5309 ST.min = 1;
5310 ST.max = REG_INFTY;
c277df42 5311 scan = NEXTOPER(scan);
c255a977
DM
5312 goto repeat;
5313 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5314 ST.paren = scan->flags; /* Which paren to set */
f6033a9d
DM
5315 ST.lastparen = rex->lastparen;
5316 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5317 if (ST.paren > PL_regsize)
5318 PL_regsize = ST.paren;
c255a977
DM
5319 ST.min = ARG1(scan); /* min to match */
5320 ST.max = ARG2(scan); /* max to match */
0a4db386 5321 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5322 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5323 ST.min=1;
5324 ST.max=1;
5325 }
c255a977
DM
5326 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5327 goto repeat;
5328 case CURLY: /* /A{m,n}B/ where A is width 1 */
5329 ST.paren = 0;
5330 ST.min = ARG1(scan); /* min to match */
5331 ST.max = ARG2(scan); /* max to match */
5332 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5333 repeat:
a0d0e21e
LW
5334 /*
5335 * Lookahead to avoid useless match attempts
5336 * when we know what character comes next.
c255a977 5337 *
5f80c4cf
JP
5338 * Used to only do .*x and .*?x, but now it allows
5339 * for )'s, ('s and (?{ ... })'s to be in the way
5340 * of the quantifier and the EXACT-like node. -- japhy
5341 */
5342
c255a977
DM
5343 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5344 sayNO;
cca55fe3 5345 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5346 U8 *s;
5347 regnode *text_node = next;
5348
3dab1dad
YO
5349 if (! HAS_TEXT(text_node))
5350 FIND_NEXT_IMPT(text_node);
5f80c4cf 5351
9e137952 5352 if (! HAS_TEXT(text_node))
c255a977 5353 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5354 else {
ee9b8eae 5355 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5356 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5357 goto assume_ok_easy;
cca55fe3 5358 }
be8e71aa
YO
5359 else
5360 s = (U8*)STRING(text_node);
ee9b8eae
YO
5361
5362 /* Currently we only get here when
5363
5364 PL_rekind[OP(text_node)] == EXACT
5365
5366 if this changes back then the macro for IS_TEXT and
5367 friends need to change. */
f2ed9b32 5368 if (!UTF_PATTERN) {
9a5a5549
KW
5369 ST.c1 = *s;
5370 switch (OP(text_node)) {
5371 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5372 case EXACTFA:
3c760661 5373 case EXACTFU_SS:
fab2782b 5374 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5375 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5376 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5377 default: ST.c2 = ST.c1; break;
5378 }
1aa99e6b 5379 }
f2ed9b32 5380 else { /* UTF_PATTERN */
9a5a5549 5381 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
b0573d8b
KW
5382 STRLEN ulen;
5383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5384
5385 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5386 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
e294cc5d 5387 uniflags);
5f80c4cf
JP
5388 }
5389 else {
c255a977 5390 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5391 uniflags);
5f80c4cf 5392 }
1aa99e6b
IH
5393 }
5394 }
bbce6d69 5395 }
a0d0e21e 5396 else
c255a977 5397 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5398 assume_ok_easy:
c255a977
DM
5399
5400 ST.A = scan;
5401 ST.B = next;
3280af22 5402 PL_reginput = locinput;
24d3c4a9
DM
5403 if (minmod) {
5404 minmod = 0;
e2e6a0f1 5405 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 5406 sayNO;
c255a977 5407 ST.count = ST.min;
a0ed51b3 5408 locinput = PL_reginput;
c255a977
DM
5409 REGCP_SET(ST.cp);
5410 if (ST.c1 == CHRTEST_VOID)
5411 goto curly_try_B_min;
5412
5413 ST.oldloc = locinput;
5414
5415 /* set ST.maxpos to the furthest point along the
5416 * string that could possibly match */
5417 if (ST.max == REG_INFTY) {
5418 ST.maxpos = PL_regeol - 1;
f2ed9b32 5419 if (utf8_target)
c255a977
DM
5420 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5421 ST.maxpos--;
5422 }
f2ed9b32 5423 else if (utf8_target) {
c255a977
DM
5424 int m = ST.max - ST.min;
5425 for (ST.maxpos = locinput;
5426 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5427 ST.maxpos += UTF8SKIP(ST.maxpos);
5428 }
5429 else {
5430 ST.maxpos = locinput + ST.max - ST.min;
5431 if (ST.maxpos >= PL_regeol)
5432 ST.maxpos = PL_regeol - 1;
5433 }
5434 goto curly_try_B_min_known;
5435
5436 }
5437 else {
e2e6a0f1 5438 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
5439 locinput = PL_reginput;
5440 if (ST.count < ST.min)
5441 sayNO;
5442 if ((ST.count > ST.min)
5443 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5444 {
5445 /* A{m,n} must come at the end of the string, there's
5446 * no point in backing off ... */
5447 ST.min = ST.count;
5448 /* ...except that $ and \Z can match before *and* after
5449 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5450 We may back off by one in this case. */
5451 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5452 ST.min--;
5453 }
5454 REGCP_SET(ST.cp);
5455 goto curly_try_B_max;
5456 }
118e2215 5457 assert(0); /* NOTREACHED */
c255a977
DM
5458
5459
5460 case CURLY_B_min_known_fail:
5461 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977
DM
5462
5463 PL_reginput = locinput; /* Could be reset... */
5464 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5465 if (ST.paren) {
5466 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5467 }
c255a977
DM
5468 /* Couldn't or didn't -- move forward. */
5469 ST.oldloc = locinput;
f2ed9b32 5470 if (utf8_target)
c255a977
DM
5471 locinput += UTF8SKIP(locinput);
5472 else
5473 locinput++;
5474 ST.count++;
5475 curly_try_B_min_known:
5476 /* find the next place where 'B' could work, then call B */
5477 {
5478 int n;
f2ed9b32 5479 if (utf8_target) {
c255a977
DM
5480 n = (ST.oldloc == locinput) ? 0 : 1;
5481 if (ST.c1 == ST.c2) {
5482 STRLEN len;
5483 /* set n to utf8_distance(oldloc, locinput) */
5484 while (locinput <= ST.maxpos &&
5485 utf8n_to_uvchr((U8*)locinput,
5486 UTF8_MAXBYTES, &len,
5487 uniflags) != (UV)ST.c1) {
5488 locinput += len;
5489 n++;
5490 }
1aa99e6b
IH
5491 }
5492 else {
c255a977
DM
5493 /* set n to utf8_distance(oldloc, locinput) */
5494 while (locinput <= ST.maxpos) {
5495 STRLEN len;
5496 const UV c = utf8n_to_uvchr((U8*)locinput,
5497 UTF8_MAXBYTES, &len,
5498 uniflags);
5499 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5500 break;
5501 locinput += len;
5502 n++;
1aa99e6b 5503 }
0fe9bf95
IZ
5504 }
5505 }
c255a977
DM
5506 else {
5507 if (ST.c1 == ST.c2) {
5508 while (locinput <= ST.maxpos &&
5509 UCHARAT(locinput) != ST.c1)
5510 locinput++;
bbce6d69 5511 }
c255a977
DM
5512 else {
5513 while (locinput <= ST.maxpos
5514 && UCHARAT(locinput) != ST.c1
5515 && UCHARAT(locinput) != ST.c2)
5516 locinput++;
a0ed51b3 5517 }
c255a977
DM
5518 n = locinput - ST.oldloc;
5519 }
5520 if (locinput > ST.maxpos)
5521 sayNO;
5522 /* PL_reginput == oldloc now */
5523 if (n) {
5524 ST.count += n;
e2e6a0f1 5525 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5526 sayNO;
a0d0e21e 5527 }
c255a977
DM
5528 PL_reginput = locinput;
5529 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5530 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5531 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5532 goto fake_end;
5533 }
c255a977 5534 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5535 }
118e2215 5536 assert(0); /* NOTREACHED */
c255a977
DM
5537
5538
5539 case CURLY_B_min_fail:
5540 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5541
5542 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5543 if (ST.paren) {
5544 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5545 }
c255a977
DM
5546 /* failed -- move forward one */
5547 PL_reginput = locinput;
e2e6a0f1 5548 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5549 ST.count++;
a0ed51b3 5550 locinput = PL_reginput;
c255a977
DM
5551 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5552 ST.count > 0)) /* count overflow ? */
15272685 5553 {
c255a977
DM
5554 curly_try_B_min:
5555 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5556 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5557 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5558 goto fake_end;
5559 }
c255a977 5560 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5561 }
5562 }
4633a7c4 5563 sayNO;
118e2215 5564 assert(0); /* NOTREACHED */
c255a977
DM
5565
5566
5567 curly_try_B_max:
5568 /* a successful greedy match: now try to match B */
40d049e4 5569 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5570 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5571 goto fake_end;
5572 }
c255a977
DM
5573 {
5574 UV c = 0;
5575 if (ST.c1 != CHRTEST_VOID)
f2ed9b32 5576 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
c255a977 5577 UTF8_MAXBYTES, 0, uniflags)
466787eb 5578 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5579 /* If it could work, try it. */
5580 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5581 CURLY_SETPAREN(ST.paren, ST.count);
5582 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
118e2215 5583 assert(0); /* NOTREACHED */
c255a977
DM
5584 }
5585 }
5586 /* FALL THROUGH */
5587 case CURLY_B_max_fail:
5588 /* failed to find B in a greedy match */
c255a977
DM
5589
5590 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5591 if (ST.paren) {
5592 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5593 }
c255a977
DM
5594 /* back up. */
5595 if (--ST.count < ST.min)
5596 sayNO;
5597 PL_reginput = locinput = HOPc(locinput, -1);
5598 goto curly_try_B_max;
5599
5600#undef ST
5601
a0d0e21e 5602 case END:
6bda09f9 5603 fake_end:
faec1544
DM
5604 if (cur_eval) {
5605 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
5606 st->u.eval.toggle_reg_flags
5607 = cur_eval->u.eval.toggle_reg_flags;
5608 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5609
288b8c02 5610 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 5611 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
5612 rex_sv = cur_eval->u.eval.prev_rex;
5613 SET_reg_curpm(rex_sv);
288b8c02 5614 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5615 rexi = RXi_GET(rex);
faec1544 5616 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 5617
faec1544
DM
5618 REGCP_SET(st->u.eval.lastcp);
5619 PL_reginput = locinput;
5620
5621 /* Restore parens of the outer rex without popping the
5622 * savestack */
74088413 5623 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
5624
5625 st->u.eval.prev_eval = cur_eval;
5626 cur_eval = cur_eval->u.eval.prev_eval;
5627 DEBUG_EXECUTE_r(
2a49f0f5
JH
5628 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5629 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5630 if ( nochange_depth )
5631 nochange_depth--;
5632
5633 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5634 st->u.eval.prev_eval->u.eval.B); /* match B */
5635 }
5636
3b0527fe 5637 if (locinput < reginfo->till) {
a3621e74 5638 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5639 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5640 PL_colors[4],
5641 (long)(locinput - PL_reg_starttry),
3b0527fe 5642 (long)(reginfo->till - PL_reg_starttry),
7821416a 5643 PL_colors[5]));
58e23c8d 5644
262b90c4 5645 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5646 }
5647 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5648 sayYES; /* Success! */
dad79028
DM
5649
5650 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5651 DEBUG_EXECUTE_r(
5652 PerlIO_printf(Perl_debug_log,
5653 "%*s %ssubpattern success...%s\n",
5bc10b2c 5654 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5655 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5656 sayYES; /* Success! */
dad79028 5657
40a82448
DM
5658#undef ST
5659#define ST st->u.ifmatch
5660
5661 case SUSPEND: /* (?>A) */
5662 ST.wanted = 1;
9fe1d20c 5663 PL_reginput = locinput;
9041c2e3 5664 goto do_ifmatch;
dad79028 5665
40a82448
DM
5666 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5667 ST.wanted = 0;
dad79028
DM
5668 goto ifmatch_trivial_fail_test;
5669
40a82448
DM
5670 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5671 ST.wanted = 1;
dad79028 5672 ifmatch_trivial_fail_test:
a0ed51b3 5673 if (scan->flags) {
52657f30 5674 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5675 if (!s) {
5676 /* trivial fail */
24d3c4a9
DM
5677 if (logical) {
5678 logical = 0;
f2338a2e 5679 sw = 1 - cBOOL(ST.wanted);
dad79028 5680 }
40a82448 5681 else if (ST.wanted)
dad79028
DM
5682 sayNO;
5683 next = scan + ARG(scan);
5684 if (next == scan)
5685 next = NULL;
5686 break;
5687 }
efb30f32 5688 PL_reginput = s;
a0ed51b3
LW
5689 }
5690 else
5691 PL_reginput = locinput;
5692
c277df42 5693 do_ifmatch:
40a82448 5694 ST.me = scan;
24d3c4a9 5695 ST.logical = logical;
24d786f4
YO
5696 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5697
40a82448
DM
5698 /* execute body of (?...A) */
5699 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
118e2215 5700 assert(0); /* NOTREACHED */
40a82448
DM
5701
5702 case IFMATCH_A_fail: /* body of (?...A) failed */
5703 ST.wanted = !ST.wanted;
5704 /* FALL THROUGH */
5705
5706 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5707 if (ST.logical) {
f2338a2e 5708 sw = cBOOL(ST.wanted);
40a82448
DM
5709 }
5710 else if (!ST.wanted)
5711 sayNO;
5712
5713 if (OP(ST.me) == SUSPEND)
5714 locinput = PL_reginput;
5715 else {
5716 locinput = PL_reginput = st->locinput;
5717 nextchr = UCHARAT(locinput);
5718 }
5719 scan = ST.me + ARG(ST.me);
5720 if (scan == ST.me)
5721 scan = NULL;
5722 continue; /* execute B */
5723
5724#undef ST
dad79028 5725
c277df42 5726 case LONGJMP:
c277df42
IZ
5727 next = scan + ARG(scan);
5728 if (next == scan)
5729 next = NULL;
a0d0e21e 5730 break;
54612592 5731 case COMMIT:
e2e6a0f1
YO
5732 reginfo->cutpoint = PL_regeol;
5733 /* FALLTHROUGH */
5d458dd8 5734 case PRUNE:
24b23f37 5735 PL_reginput = locinput;
e2e6a0f1 5736 if (!scan->flags)
ad64d0ec 5737 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592 5738 PUSH_STATE_GOTO(COMMIT_next,next);
118e2215 5739 assert(0); /* NOTREACHED */
54612592
YO
5740 case COMMIT_next_fail:
5741 no_final = 1;
5742 /* FALLTHROUGH */
7f69552c
YO
5743 case OPFAIL:
5744 sayNO;
118e2215 5745 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5746
5747#define ST st->u.mark
5748 case MARKPOINT:
5749 ST.prev_mark = mark_state;
5d458dd8 5750 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5751 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5752 mark_state = st;
5753 ST.mark_loc = PL_reginput = locinput;
5754 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
118e2215 5755 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5756 case MARKPOINT_next:
5757 mark_state = ST.prev_mark;
5758 sayYES;
118e2215 5759 assert(0); /* NOTREACHED */
e2e6a0f1 5760 case MARKPOINT_next_fail:
5d458dd8 5761 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5762 {
5763 if (ST.mark_loc > startpoint)
5764 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5765 popmark = NULL; /* we found our mark */
5766 sv_commit = ST.mark_name;
5767
5768 DEBUG_EXECUTE_r({
5d458dd8 5769 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5770 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5771 REPORT_CODE_OFF+depth*2, "",
be2597df 5772 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5773 });
5774 }
5775 mark_state = ST.prev_mark;
5d458dd8
YO
5776 sv_yes_mark = mark_state ?
5777 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 5778 sayNO;
118e2215 5779 assert(0); /* NOTREACHED */
5d458dd8
YO
5780 case SKIP:
5781 PL_reginput = locinput;
5782 if (scan->flags) {
2bf803e2 5783 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5784 ST.mark_name = NULL;
e2e6a0f1 5785 ST.mark_loc = locinput;
5d458dd8
YO
5786 PUSH_STATE_GOTO(SKIP_next,next);
5787 } else {
2bf803e2 5788 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5789 otherwise do nothing. Meaning we need to scan
5790 */
5791 regmatch_state *cur = mark_state;
ad64d0ec 5792 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5793
5794 while (cur) {
5795 if ( sv_eq( cur->u.mark.mark_name,
5796 find ) )
5797 {
5798 ST.mark_name = find;
5799 PUSH_STATE_GOTO( SKIP_next, next );
5800 }
5801 cur = cur->u.mark.prev_mark;
5802 }
e2e6a0f1 5803 }
2bf803e2 5804 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5805 break;
5806 case SKIP_next_fail:
5807 if (ST.mark_name) {
5808 /* (*CUT:NAME) - Set up to search for the name as we
5809 collapse the stack*/
5810 popmark = ST.mark_name;
5811 } else {
5812 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5813 if (ST.mark_loc > startpoint)
5814 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5815 /* but we set sv_commit to latest mark_name if there
5816 is one so they can test to see how things lead to this
5817 cut */
5818 if (mark_state)
5819 sv_commit=mark_state->u.mark.mark_name;
5820 }
e2e6a0f1
YO
5821 no_final = 1;
5822 sayNO;
118e2215 5823 assert(0); /* NOTREACHED */
e2e6a0f1 5824#undef ST
e1d1eefb 5825 case LNBREAK:
f2ed9b32 5826 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5827 locinput += n;
5828 nextchr = UCHARAT(locinput);
5829 } else
5830 sayNO;
5831 break;
5832
5833#define CASE_CLASS(nAmE) \
5834 case nAmE: \
1380b51d
DM
5835 if (locinput >= PL_regeol) \
5836 sayNO; \
f2ed9b32 5837 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5838 locinput += n; \
5839 nextchr = UCHARAT(locinput); \
5840 } else \
5841 sayNO; \
5842 break; \
5843 case N##nAmE: \
1380b51d
DM
5844 if (locinput >= PL_regeol) \
5845 sayNO; \
f2ed9b32 5846 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5847 sayNO; \
5848 } else { \
5849 locinput += UTF8SKIP(locinput); \
5850 nextchr = UCHARAT(locinput); \
5851 } \
5852 break
5853
5854 CASE_CLASS(VERTWS);
5855 CASE_CLASS(HORIZWS);
5856#undef CASE_CLASS
5857
a0d0e21e 5858 default:
b900a521 5859 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5860 PTR2UV(scan), OP(scan));
cea2e8a9 5861 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5862
5863 } /* end switch */
95b24440 5864
5d458dd8
YO
5865 /* switch break jumps here */
5866 scan = next; /* prepare to execute the next op and ... */
5867 continue; /* ... jump back to the top, reusing st */
118e2215 5868 assert(0); /* NOTREACHED */
95b24440 5869
40a82448
DM
5870 push_yes_state:
5871 /* push a state that backtracks on success */
5872 st->u.yes.prev_yes_state = yes_state;
5873 yes_state = st;
5874 /* FALL THROUGH */
5875 push_state:
5876 /* push a new regex state, then continue at scan */
5877 {
5878 regmatch_state *newst;
5879
24b23f37
YO
5880 DEBUG_STACK_r({
5881 regmatch_state *cur = st;
5882 regmatch_state *curyes = yes_state;
5883 int curd = depth;
5884 regmatch_slab *slab = PL_regmatch_slab;
5885 for (;curd > -1;cur--,curd--) {
5886 if (cur < SLAB_FIRST(slab)) {
5887 slab = slab->prev;
5888 cur = SLAB_LAST(slab);
5889 }
5890 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5891 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5892 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5893 (curyes == cur) ? "yes" : ""
5894 );
5895 if (curyes == cur)
5896 curyes = cur->u.yes.prev_yes_state;
5897 }
5898 } else
5899 DEBUG_STATE_pp("push")
5900 );
40a82448 5901 depth++;
40a82448
DM
5902 st->locinput = locinput;
5903 newst = st+1;
5904 if (newst > SLAB_LAST(PL_regmatch_slab))
5905 newst = S_push_slab(aTHX);
5906 PL_regmatch_state = newst;
786e8c11 5907
40a82448
DM
5908 locinput = PL_reginput;
5909 nextchr = UCHARAT(locinput);
5910 st = newst;
5911 continue;
118e2215 5912 assert(0); /* NOTREACHED */
40a82448 5913 }
a0d0e21e 5914 }
a687059c 5915
a0d0e21e
LW
5916 /*
5917 * We get here only if there's trouble -- normally "case END" is
5918 * the terminating point.
5919 */
cea2e8a9 5920 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5921 /*NOTREACHED*/
4633a7c4
LW
5922 sayNO;
5923
262b90c4 5924yes:
77cb431f
DM
5925 if (yes_state) {
5926 /* we have successfully completed a subexpression, but we must now
5927 * pop to the state marked by yes_state and continue from there */
77cb431f 5928 assert(st != yes_state);
5bc10b2c
DM
5929#ifdef DEBUGGING
5930 while (st != yes_state) {
5931 st--;
5932 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5933 PL_regmatch_slab = PL_regmatch_slab->prev;
5934 st = SLAB_LAST(PL_regmatch_slab);
5935 }
e2e6a0f1 5936 DEBUG_STATE_r({
54612592
YO
5937 if (no_final) {
5938 DEBUG_STATE_pp("pop (no final)");
5939 } else {
5940 DEBUG_STATE_pp("pop (yes)");
5941 }
e2e6a0f1 5942 });
5bc10b2c
DM
5943 depth--;
5944 }
5945#else
77cb431f
DM
5946 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5947 || yes_state > SLAB_LAST(PL_regmatch_slab))
5948 {
5949 /* not in this slab, pop slab */
5950 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5951 PL_regmatch_slab = PL_regmatch_slab->prev;
5952 st = SLAB_LAST(PL_regmatch_slab);
5953 }
5954 depth -= (st - yes_state);
5bc10b2c 5955#endif
77cb431f
DM
5956 st = yes_state;
5957 yes_state = st->u.yes.prev_yes_state;
5958 PL_regmatch_state = st;
24b23f37 5959
5d458dd8
YO
5960 if (no_final) {
5961 locinput= st->locinput;
5962 nextchr = UCHARAT(locinput);
5963 }
54612592 5964 state_num = st->resume_state + no_final;
24d3c4a9 5965 goto reenter_switch;
77cb431f
DM
5966 }
5967
a3621e74 5968 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5969 PL_colors[4], PL_colors[5]));
02db2b7b 5970
ed301438 5971 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
5972 /* each successfully executed (?{...}) block does the equivalent of
5973 * local $^R = do {...}
5974 * When popping the save stack, all these locals would be undone;
5975 * bypass this by setting the outermost saved $^R to the latest
5976 * value */
5977 if (oreplsv != GvSV(PL_replgv))
5978 sv_setsv(oreplsv, GvSV(PL_replgv));
5979 }
95b24440 5980 result = 1;
aa283a38 5981 goto final_exit;
4633a7c4
LW
5982
5983no:
a3621e74 5984 DEBUG_EXECUTE_r(
7821416a 5985 PerlIO_printf(Perl_debug_log,
786e8c11 5986 "%*s %sfailed...%s\n",
5bc10b2c 5987 REPORT_CODE_OFF+depth*2, "",
786e8c11 5988 PL_colors[4], PL_colors[5])
7821416a 5989 );
aa283a38 5990
262b90c4 5991no_silent:
54612592
YO
5992 if (no_final) {
5993 if (yes_state) {
5994 goto yes;
5995 } else {
5996 goto final_exit;
5997 }
5998 }
aa283a38
DM
5999 if (depth) {
6000 /* there's a previous state to backtrack to */
40a82448
DM
6001 st--;
6002 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6003 PL_regmatch_slab = PL_regmatch_slab->prev;
6004 st = SLAB_LAST(PL_regmatch_slab);
6005 }
6006 PL_regmatch_state = st;
40a82448
DM
6007 locinput= st->locinput;
6008 nextchr = UCHARAT(locinput);
6009
5bc10b2c
DM
6010 DEBUG_STATE_pp("pop");
6011 depth--;
262b90c4
DM
6012 if (yes_state == st)
6013 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6014
24d3c4a9
DM
6015 state_num = st->resume_state + 1; /* failure = success + 1 */
6016 goto reenter_switch;
95b24440 6017 }
24d3c4a9 6018 result = 0;
aa283a38 6019
262b90c4 6020 final_exit:
bbe252da 6021 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6022 SV *sv_err = get_sv("REGERROR", 1);
6023 SV *sv_mrk = get_sv("REGMARK", 1);
6024 if (result) {
e2e6a0f1 6025 sv_commit = &PL_sv_no;
5d458dd8
YO
6026 if (!sv_yes_mark)
6027 sv_yes_mark = &PL_sv_yes;
6028 } else {
6029 if (!sv_commit)
6030 sv_commit = &PL_sv_yes;
6031 sv_yes_mark = &PL_sv_no;
6032 }
6033 sv_setsv(sv_err, sv_commit);
6034 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6035 }
19b95bf0 6036
81ed78b2
DM
6037
6038 if (last_pushed_cv) {
6039 dSP;
6040 POP_MULTICALL;
6041 }
6042
2f554ef7
DM
6043 /* clean up; in particular, free all slabs above current one */
6044 LEAVE_SCOPE(oldsave);
5d9a96ca 6045
95b24440 6046 return result;
a687059c
LW
6047}
6048
6049/*
6050 - regrepeat - repeatedly match something simple, report how many
6051 */
6052/*
6053 * [This routine now assumes that it will only match on things of length 1.
6054 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 6055 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 6056 */
76e3520e 6057STATIC I32
e2e6a0f1 6058S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 6059{
27da23d5 6060 dVAR;
a0d0e21e 6061 register char *scan;
a0d0e21e 6062 register I32 c;
3280af22 6063 register char *loceol = PL_regeol;
a0ed51b3 6064 register I32 hardcount = 0;
f2ed9b32 6065 register bool utf8_target = PL_reg_match_utf8;
d513472c 6066 UV utf8_flags;
4f55667c
SP
6067#ifndef DEBUGGING
6068 PERL_UNUSED_ARG(depth);
6069#endif
a0d0e21e 6070
7918f24d
NC
6071 PERL_ARGS_ASSERT_REGREPEAT;
6072
3280af22 6073 scan = PL_reginput;
faf11cac
HS
6074 if (max == REG_INFTY)
6075 max = I32_MAX;
6076 else if (max < loceol - scan)
7f596f4c 6077 loceol = scan + max;
a0d0e21e 6078 switch (OP(p)) {
22c35a8c 6079 case REG_ANY:
f2ed9b32 6080 if (utf8_target) {
ffc61ed2 6081 loceol = PL_regeol;
1aa99e6b 6082 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6083 scan += UTF8SKIP(scan);
6084 hardcount++;
6085 }
6086 } else {
6087 while (scan < loceol && *scan != '\n')
6088 scan++;
a0ed51b3
LW
6089 }
6090 break;
ffc61ed2 6091 case SANY:
f2ed9b32 6092 if (utf8_target) {
def8e4ea 6093 loceol = PL_regeol;
a0804c9e 6094 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6095 scan += UTF8SKIP(scan);
6096 hardcount++;
6097 }
6098 }
6099 else
6100 scan = loceol;
a0ed51b3 6101 break;
f33976b4
DB
6102 case CANY:
6103 scan = loceol;
6104 break;
59d32103
KW
6105 case EXACT:
6106 /* To get here, EXACTish nodes must have *byte* length == 1. That
6107 * means they match only characters in the string that can be expressed
6108 * as a single byte. For non-utf8 strings, that means a simple match.
6109 * For utf8 strings, the character matched must be an invariant, or
6110 * downgradable to a single byte. The pattern's utf8ness is
6111 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6112 * it is, it's an invariant */
6113
6114 c = (U8)*STRING(p);
6115 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6116
6117 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6118 while (scan < loceol && UCHARAT(scan) == c) {
6119 scan++;
6120 }
6121 }
6122 else {
6123
6124 /* Here, the string is utf8, and the pattern char is different
6125 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6126 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6127 * look for those in sequence in the utf8 string */
6128 U8 high = UTF8_TWO_BYTE_HI(c);
6129 U8 low = UTF8_TWO_BYTE_LO(c);
6130 loceol = PL_regeol;
6131
6132 while (hardcount < max
6133 && scan + 1 < loceol
6134 && UCHARAT(scan) == high
6135 && UCHARAT(scan + 1) == low)
6136 {
6137 scan += 2;
6138 hardcount++;
6139 }
6140 }
6141 break;
2f7f8cb1
KW
6142 case EXACTFA:
6143 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6144 goto do_exactf;
6145
d4e0b827
KW
6146 case EXACTFL:
6147 PL_reg_flags |= RF_tainted;
17580e7a
KW
6148 utf8_flags = FOLDEQ_UTF8_LOCALE;
6149 goto do_exactf;
6150
d4e0b827 6151 case EXACTF:
62bf7766
KW
6152 utf8_flags = 0;
6153 goto do_exactf;
6154
3c760661 6155 case EXACTFU_SS:
fab2782b 6156 case EXACTFU_TRICKYFOLD:
9a5a5549 6157 case EXACTFU:
05f861a2 6158 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6159
2be3e190
KW
6160 /* The comments for the EXACT case above apply as well to these fold
6161 * ones */
634c83a2 6162
2f7f8cb1 6163 do_exactf:
090f7165 6164 c = (U8)*STRING(p);
634c83a2 6165 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6166
3c760661 6167 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6168 char *tmpeol = loceol;
6169 while (hardcount < max
d513472c
KW
6170 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6171 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6172 {
6173 scan = tmpeol;
6174 tmpeol = loceol;
6175 hardcount++;
6176 }
634c83a2 6177
59d32103
KW
6178 /* XXX Note that the above handles properly the German sharp s in
6179 * the pattern matching ss in the string. But it doesn't handle
6180 * properly cases where the string contains say 'LIGATURE ff' and
6181 * the pattern is 'f+'. This would require, say, a new function or
6182 * revised interface to foldEQ_utf8(), in which the maximum number
6183 * of characters to match could be passed and it would return how
6184 * many actually did. This is just one of many cases where
6185 * multi-char folds don't work properly, and so the fix is being
6186 * deferred */
6187 }
6188 else {
87381386 6189 U8 folded;
59d32103 6190
2be3e190
KW
6191 /* Here, the string isn't utf8 and c is a single byte; and either
6192 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6193 * doesn't affect c. Can just do simple comparisons for exact or
6194 * fold matching. */
d4e0b827 6195 switch (OP(p)) {
87381386 6196 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6197 case EXACTFA:
fab2782b 6198 case EXACTFU_TRICKYFOLD:
9a5a5549 6199 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6200 case EXACTFL: folded = PL_fold_locale[c]; break;
6201 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6202 }
6203 while (scan < loceol &&
6204 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6205 {
6206 scan++;
634c83a2
KW
6207 }
6208 }
bbce6d69 6209 break;
f56b6394 6210 case ANYOFV:
a0d0e21e 6211 case ANYOF:
4e8910e0
KW
6212 if (utf8_target || OP(p) == ANYOFV) {
6213 STRLEN inclasslen;
ffc61ed2 6214 loceol = PL_regeol;
4e8910e0
KW
6215 inclasslen = loceol - scan;
6216 while (hardcount < max
6217 && ((inclasslen = loceol - scan) > 0)
6218 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6219 {
6220 scan += inclasslen;
ffc61ed2
JH
6221 hardcount++;
6222 }
6223 } else {
32fc9b6a 6224 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6225 scan++;
6226 }
a0d0e21e 6227 break;
980866de 6228 case ALNUMU:
f2ed9b32 6229 if (utf8_target) {
980866de 6230 utf8_wordchar:
ffc61ed2 6231 loceol = PL_regeol;
1a4fad37 6232 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6233 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6234 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6235 {
ffc61ed2
JH
6236 scan += UTF8SKIP(scan);
6237 hardcount++;
6238 }
980866de 6239 } else {
a12cf05f
KW
6240 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6241 scan++;
6242 }
980866de
KW
6243 }
6244 break;
6245 case ALNUM:
6246 if (utf8_target)
6247 goto utf8_wordchar;
6248 while (scan < loceol && isALNUM((U8) *scan)) {
6249 scan++;
a0ed51b3
LW
6250 }
6251 break;
cfaf538b
KW
6252 case ALNUMA:
6253 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6254 scan++;
6255 }
6256 break;
bbce6d69 6257 case ALNUML:
3280af22 6258 PL_reg_flags |= RF_tainted;
f2ed9b32 6259 if (utf8_target) {
ffc61ed2 6260 loceol = PL_regeol;
1aa99e6b
IH
6261 while (hardcount < max && scan < loceol &&
6262 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6263 scan += UTF8SKIP(scan);
6264 hardcount++;
6265 }
6266 } else {
6267 while (scan < loceol && isALNUM_LC(*scan))
6268 scan++;
a0ed51b3
LW
6269 }
6270 break;
980866de 6271 case NALNUMU:
f2ed9b32 6272 if (utf8_target) {
980866de
KW
6273
6274 utf8_Nwordchar:
6275
ffc61ed2 6276 loceol = PL_regeol;
1a4fad37 6277 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6278 while (hardcount < max && scan < loceol &&
980866de 6279 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6280 {
ffc61ed2
JH
6281 scan += UTF8SKIP(scan);
6282 hardcount++;
6283 }
980866de 6284 } else {
a12cf05f
KW
6285 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6286 scan++;
6287 }
980866de
KW
6288 }
6289 break;
6290 case NALNUM:
6291 if (utf8_target)
6292 goto utf8_Nwordchar;
6293 while (scan < loceol && ! isALNUM((U8) *scan)) {
6294 scan++;
a0ed51b3
LW
6295 }
6296 break;
cfaf538b
KW
6297 case NALNUMA:
6298 if (utf8_target) {
6299 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6300 scan += UTF8SKIP(scan);
6301 }
6302 }
6303 else {
6304 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6305 scan++;
6306 }
6307 }
6308 break;
bbce6d69 6309 case NALNUML:
3280af22 6310 PL_reg_flags |= RF_tainted;
f2ed9b32 6311 if (utf8_target) {
ffc61ed2 6312 loceol = PL_regeol;
1aa99e6b
IH
6313 while (hardcount < max && scan < loceol &&
6314 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6315 scan += UTF8SKIP(scan);
6316 hardcount++;
6317 }
6318 } else {
6319 while (scan < loceol && !isALNUM_LC(*scan))
6320 scan++;
a0ed51b3
LW
6321 }
6322 break;
980866de 6323 case SPACEU:
f2ed9b32 6324 if (utf8_target) {
980866de
KW
6325
6326 utf8_space:
6327
ffc61ed2 6328 loceol = PL_regeol;
1a4fad37 6329 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6330 while (hardcount < max && scan < loceol &&
3568d838 6331 (*scan == ' ' ||
a12cf05f
KW
6332 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6333 {
ffc61ed2
JH
6334 scan += UTF8SKIP(scan);
6335 hardcount++;
6336 }
980866de
KW
6337 break;
6338 }
6339 else {
a12cf05f
KW
6340 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6341 scan++;
6342 }
980866de
KW
6343 break;
6344 }
6345 case SPACE:
6346 if (utf8_target)
6347 goto utf8_space;
6348
6349 while (scan < loceol && isSPACE((U8) *scan)) {
6350 scan++;
a0ed51b3
LW
6351 }
6352 break;
cfaf538b
KW
6353 case SPACEA:
6354 while (scan < loceol && isSPACE_A((U8) *scan)) {
6355 scan++;
6356 }
6357 break;
bbce6d69 6358 case SPACEL:
3280af22 6359 PL_reg_flags |= RF_tainted;
f2ed9b32 6360 if (utf8_target) {
ffc61ed2 6361 loceol = PL_regeol;
1aa99e6b 6362 while (hardcount < max && scan < loceol &&
6bbba904 6363 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6364 scan += UTF8SKIP(scan);
6365 hardcount++;
6366 }
6367 } else {
6368 while (scan < loceol && isSPACE_LC(*scan))
6369 scan++;
a0ed51b3
LW
6370 }
6371 break;
980866de 6372 case NSPACEU:
f2ed9b32 6373 if (utf8_target) {
980866de
KW
6374
6375 utf8_Nspace:
6376
ffc61ed2 6377 loceol = PL_regeol;
1a4fad37 6378 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6379 while (hardcount < max && scan < loceol &&
980866de
KW
6380 ! (*scan == ' ' ||
6381 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6382 {
ffc61ed2
JH
6383 scan += UTF8SKIP(scan);
6384 hardcount++;
6385 }
980866de
KW
6386 break;
6387 }
6388 else {
a12cf05f
KW
6389 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6390 scan++;
6391 }
980866de
KW
6392 }
6393 break;
6394 case NSPACE:
6395 if (utf8_target)
6396 goto utf8_Nspace;
6397
6398 while (scan < loceol && ! isSPACE((U8) *scan)) {
6399 scan++;
a0ed51b3 6400 }
0008a298 6401 break;
cfaf538b
KW
6402 case NSPACEA:
6403 if (utf8_target) {
6404 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6405 scan += UTF8SKIP(scan);
6406 }
6407 }
6408 else {
6409 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6410 scan++;
6411 }
6412 }
6413 break;
bbce6d69 6414 case NSPACEL:
3280af22 6415 PL_reg_flags |= RF_tainted;
f2ed9b32 6416 if (utf8_target) {
ffc61ed2 6417 loceol = PL_regeol;
1aa99e6b 6418 while (hardcount < max && scan < loceol &&
6bbba904 6419 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6420 scan += UTF8SKIP(scan);
6421 hardcount++;
6422 }
6423 } else {
6424 while (scan < loceol && !isSPACE_LC(*scan))
6425 scan++;
a0ed51b3
LW
6426 }
6427 break;
a0d0e21e 6428 case DIGIT:
f2ed9b32 6429 if (utf8_target) {
ffc61ed2 6430 loceol = PL_regeol;
1a4fad37 6431 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6432 while (hardcount < max && scan < loceol &&
f2ed9b32 6433 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6434 scan += UTF8SKIP(scan);
6435 hardcount++;
6436 }
6437 } else {
6438 while (scan < loceol && isDIGIT(*scan))
6439 scan++;
a0ed51b3
LW
6440 }
6441 break;
cfaf538b
KW
6442 case DIGITA:
6443 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6444 scan++;
6445 }
6446 break;
b77393f6
KW
6447 case DIGITL:
6448 PL_reg_flags |= RF_tainted;
6449 if (utf8_target) {
6450 loceol = PL_regeol;
6451 while (hardcount < max && scan < loceol &&
6452 isDIGIT_LC_utf8((U8*)scan)) {
6453 scan += UTF8SKIP(scan);
6454 hardcount++;
6455 }
6456 } else {
6457 while (scan < loceol && isDIGIT_LC(*scan))
6458 scan++;
6459 }
6460 break;
a0d0e21e 6461 case NDIGIT:
f2ed9b32 6462 if (utf8_target) {
ffc61ed2 6463 loceol = PL_regeol;
1a4fad37 6464 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6465 while (hardcount < max && scan < loceol &&
f2ed9b32 6466 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6467 scan += UTF8SKIP(scan);
6468 hardcount++;
6469 }
6470 } else {
6471 while (scan < loceol && !isDIGIT(*scan))
6472 scan++;
a0ed51b3 6473 }
cfaf538b
KW
6474 break;
6475 case NDIGITA:
6476 if (utf8_target) {
6477 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6478 scan += UTF8SKIP(scan);
6479 }
6480 }
6481 else {
6482 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6483 scan++;
6484 }
6485 }
6486 break;
b77393f6
KW
6487 case NDIGITL:
6488 PL_reg_flags |= RF_tainted;
6489 if (utf8_target) {
6490 loceol = PL_regeol;
6491 while (hardcount < max && scan < loceol &&
6492 !isDIGIT_LC_utf8((U8*)scan)) {
6493 scan += UTF8SKIP(scan);
6494 hardcount++;
6495 }
6496 } else {
6497 while (scan < loceol && !isDIGIT_LC(*scan))
6498 scan++;
6499 }
6500 break;
e1d1eefb 6501 case LNBREAK:
f2ed9b32 6502 if (utf8_target) {
e1d1eefb
YO
6503 loceol = PL_regeol;
6504 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6505 scan += c;
6506 hardcount++;
6507 }
6508 } else {
6509 /*
6510 LNBREAK can match two latin chars, which is ok,
6511 because we have a null terminated string, but we
6512 have to use hardcount in this situation
6513 */
6514 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6515 scan+=c;
6516 hardcount++;
6517 }
6518 }
6519 break;
6520 case HORIZWS:
f2ed9b32 6521 if (utf8_target) {
e1d1eefb
YO
6522 loceol = PL_regeol;
6523 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6524 scan += c;
6525 hardcount++;
6526 }
6527 } else {
6528 while (scan < loceol && is_HORIZWS_latin1(scan))
6529 scan++;
6530 }
a0ed51b3 6531 break;
e1d1eefb 6532 case NHORIZWS:
f2ed9b32 6533 if (utf8_target) {
e1d1eefb
YO
6534 loceol = PL_regeol;
6535 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6536 scan += UTF8SKIP(scan);
6537 hardcount++;
6538 }
6539 } else {
6540 while (scan < loceol && !is_HORIZWS_latin1(scan))
6541 scan++;
6542
6543 }
6544 break;
6545 case VERTWS:
f2ed9b32 6546 if (utf8_target) {
e1d1eefb
YO
6547 loceol = PL_regeol;
6548 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6549 scan += c;
6550 hardcount++;
6551 }
6552 } else {
6553 while (scan < loceol && is_VERTWS_latin1(scan))
6554 scan++;
6555
6556 }
6557 break;
6558 case NVERTWS:
f2ed9b32 6559 if (utf8_target) {
e1d1eefb
YO
6560 loceol = PL_regeol;
6561 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6562 scan += UTF8SKIP(scan);
6563 hardcount++;
6564 }
6565 } else {
6566 while (scan < loceol && !is_VERTWS_latin1(scan))
6567 scan++;
6568
6569 }
6570 break;
6571
a0d0e21e
LW
6572 default: /* Called on something of 0 width. */
6573 break; /* So match right here or not at all. */
6574 }
a687059c 6575
a0ed51b3
LW
6576 if (hardcount)
6577 c = hardcount;
6578 else
6579 c = scan - PL_reginput;
3280af22 6580 PL_reginput = scan;
a687059c 6581
a3621e74 6582 DEBUG_r({
e68ec53f 6583 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6584 DEBUG_EXECUTE_r({
e68ec53f
YO
6585 SV * const prop = sv_newmortal();
6586 regprop(prog, prop, p);
6587 PerlIO_printf(Perl_debug_log,
be8e71aa 6588 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6589 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6590 });
be8e71aa 6591 });
9041c2e3 6592
a0d0e21e 6593 return(c);
a687059c
LW
6594}
6595
c277df42 6596
be8e71aa 6597#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6598/*
6c6525b8
KW
6599- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6600create a copy so that changes the caller makes won't change the shared one
6601 */
ffc61ed2 6602SV *
32fc9b6a 6603Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6604{
6c6525b8
KW
6605 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6606 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6607}
6608#endif
6609
6610STATIC SV *
6611S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6612{
8c9eb58f
KW
6613 /* Returns the swash for the input 'node' in the regex 'prog'.
6614 * If <doinit> is true, will attempt to create the swash if not already
6615 * done.
6616 * If <listsvp> is non-null, will return the swash initialization string in
6617 * it.
6618 * If <altsvp> is non-null, will return the alternates to the regular swash
6619 * in it
6620 * Tied intimately to how regcomp.c sets up the data structure */
6621
97aff369 6622 dVAR;
9e55ce06
JH
6623 SV *sw = NULL;
6624 SV *si = NULL;
6625 SV *alt = NULL;
7a6c6baa
KW
6626 SV* invlist = NULL;
6627
f8fc2ecf
YO
6628 RXi_GET_DECL(prog,progi);
6629 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6630
6c6525b8 6631 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6632
ccb2541c
KW
6633 assert(ANYOF_NONBITMAP(node));
6634
4f639d21 6635 if (data && data->count) {
a3b680e6 6636 const U32 n = ARG(node);
ffc61ed2 6637
4f639d21 6638 if (data->what[n] == 's') {
ad64d0ec
NC
6639 SV * const rv = MUTABLE_SV(data->data[n]);
6640 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6641 SV **const ary = AvARRAY(av);
7a6c6baa 6642 bool invlist_has_user_defined_property;
9041c2e3 6643
8c9eb58f 6644 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6645
7a6c6baa
KW
6646 /* Elements 3 and 4 are either both present or both absent. [3] is
6647 * any inversion list generated at compile time; [4] indicates if
6648 * that inversion list has any user-defined properties in it. */
6649 if (av_len(av) >= 3) {
6650 invlist = ary[3];
6651 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6652 }
6653 else {
6654 invlist = NULL;
6655 invlist_has_user_defined_property = FALSE;
6656 }
6657
8c9eb58f
KW
6658 /* Element [1] is reserved for the set-up swash. If already there,
6659 * return it; if not, create it and store it there */
f192cf32
KW
6660 if (SvROK(ary[1])) {
6661 sw = ary[1];
6662 }
ffc61ed2 6663 else if (si && doinit) {
7a6c6baa
KW
6664
6665 sw = _core_swash_init("utf8", /* the utf8 package */
6666 "", /* nameless */
6667 si,
6668 1, /* binary */
6669 0, /* not from tr/// */
6670 FALSE, /* is error if can't find
6671 property */
6672 invlist,
6673 invlist_has_user_defined_property);
ffc61ed2
JH
6674 (void)av_store(av, 1, sw);
6675 }
8c9eb58f
KW
6676
6677 /* Element [2] is for any multi-char folds. Note that is a
6678 * fundamentally flawed design, because can't backtrack and try
6679 * again. See [perl #89774] */
f192cf32
KW
6680 if (SvTYPE(ary[2]) == SVt_PVAV) {
6681 alt = ary[2];
6682 }
ffc61ed2
JH
6683 }
6684 }
6685
7a6c6baa
KW
6686 if (listsvp) {
6687 SV* matches_string = newSVpvn("", 0);
6688 SV** invlistsvp;
6689
6690 /* Use the swash, if any, which has to have incorporated into it all
6691 * possibilities */
6692 if ( sw
6693 && SvROK(sw)
6694 && SvTYPE(SvRV(sw)) == SVt_PVHV
6695 && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6696 {
6697 invlist = *invlistsvp;
6698 }
6699 else if (si && si != &PL_sv_undef) {
6700
6701 /* If no swash, use the input nitialization string, if available */
6702 sv_catsv(matches_string, si);
6703 }
6704
6705 /* Add the inversion list to whatever we have. This may have come from
6706 * the swash, or from an input parameter */
6707 if (invlist) {
6708 sv_catsv(matches_string, _invlist_contents(invlist));
6709 }
6710 *listsvp = matches_string;
6711 }
6712
9e55ce06
JH
6713 if (altsvp)
6714 *altsvp = alt;
ffc61ed2
JH
6715
6716 return sw;
6717}
6718
6719/*
ba7b4546 6720 - reginclass - determine if a character falls into a character class
832705d4 6721
6698fab5
KW
6722 n is the ANYOF regnode
6723 p is the target string
6724 lenp is pointer to the maximum number of bytes of how far to go in p
6725 (This is assumed wthout checking to always be at least the current
6726 character's size)
6727 utf8_target tells whether p is in UTF-8.
832705d4 6728
4b3cda86
KW
6729 Returns true if matched; false otherwise. If lenp is not NULL, on return
6730 from a successful match, the value it points to will be updated to how many
6731 bytes in p were matched. If there was no match, the value is undefined,
6732 possibly changed from the input.
eba1359e 6733
d5788240
KW
6734 Note that this can be a synthetic start class, a combination of various
6735 nodes, so things you think might be mutually exclusive, such as locale,
6736 aren't. It can match both locale and non-locale
6737
bbce6d69 6738 */
6739
76e3520e 6740STATIC bool
f6ad78d8 6741S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6742{
27da23d5 6743 dVAR;
a3b680e6 6744 const char flags = ANYOF_FLAGS(n);
bbce6d69 6745 bool match = FALSE;
cc07378b 6746 UV c = *p;
f7ab54c6 6747 STRLEN c_len = 0;
6698fab5 6748 STRLEN maxlen;
1aa99e6b 6749
7918f24d
NC
6750 PERL_ARGS_ASSERT_REGINCLASS;
6751
4b3cda86 6752 /* If c is not already the code point, get it */
f2ed9b32 6753 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6754 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6755 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6756 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6757 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6758 * UTF8_ALLOW_FFFF */
f7ab54c6 6759 if (c_len == (STRLEN)-1)
e8a70c6f 6760 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6761 }
a5a291f5
KW
6762 else {
6763 c_len = 1;
6764 }
bbce6d69 6765
a5a291f5
KW
6766 /* Use passed in max length, or one character if none passed in or less
6767 * than one character. And assume will match just one character. This is
6768 * overwritten later if matched more. */
4b3cda86 6769 if (lenp) {
a5a291f5
KW
6770 maxlen = (*lenp > c_len) ? *lenp : c_len;
6771 *lenp = c_len;
4b3cda86
KW
6772
6773 }
6774 else {
a5a291f5 6775 maxlen = c_len;
4b3cda86
KW
6776 }
6777
7cdde544
KW
6778 /* If this character is potentially in the bitmap, check it */
6779 if (c < 256) {
ffc61ed2
JH
6780 if (ANYOF_BITMAP_TEST(n, c))
6781 match = TRUE;
11454c59
KW
6782 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6783 && ! utf8_target
6784 && ! isASCII(c))
6785 {
6786 match = TRUE;
6787 }
a0ed51b3 6788
78969a98
KW
6789 else if (flags & ANYOF_LOCALE) {
6790 PL_reg_flags |= RF_tainted;
6791
6792 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6793 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6794 {
ffc61ed2 6795 match = TRUE;
78969a98 6796 }
040aea3a
KW
6797 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6798 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6799 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6800 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6801 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6802 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6803 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6804 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6805 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6806 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6807 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
6808 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6809 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
6810 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6811 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6812 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6813 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6814 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6815 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6816 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6817 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6818 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6819 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6820 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6821 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6822 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6823 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6824 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6825 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
6826 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6827 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 6828 ) /* How's that for a conditional? */
78969a98 6829 ) {
ffc61ed2
JH
6830 match = TRUE;
6831 }
a0ed51b3 6832 }
a0ed51b3
LW
6833 }
6834
7cdde544 6835 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6836 * bitmap could match, try that. Locale nodes specifiy completely the
6837 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6838 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6839 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6840 * could be a combination of locale and non-locale nodes. So allow locale
6841 * to match for the synthetic start class, which will give a false
6842 * positive that will be resolved when the match is done again as not part
6843 * of the synthetic start class */
ef87b810 6844 if (!match) {
10ee90d2
KW
6845 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6846 match = TRUE; /* Everything above 255 matches */
e051a21d 6847 }
6f8d7d0d
KW
6848 else if (ANYOF_NONBITMAP(n)
6849 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6850 || (utf8_target
6851 && (c >=256
6852 || (! (flags & ANYOF_LOCALE))
6853 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6854 {
7cdde544 6855 AV *av;
210e6c47 6856 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6857
6858 if (sw) {
6859 U8 * utf8_p;
6860 if (utf8_target) {
6861 utf8_p = (U8 *) p;
6862 } else {
f56b6394
KW
6863
6864 /* Not utf8. Convert as much of the string as available up
6865 * to the limit of how far the (single) character in the
6866 * pattern can possibly match (no need to go further). If
6867 * the node is a straight ANYOF or not folding, it can't
6868 * match more than one. Otherwise, It can match up to how
6869 * far a single char can fold to. Since not utf8, each
6870 * character is a single byte, so the max it can be in
6871 * bytes is the same as the max it can be in characters */
6872 STRLEN len = (OP(n) == ANYOF
6873 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6874 ? 1
6875 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6876 ? maxlen
6877 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
6878 utf8_p = bytes_to_utf8(p, &len);
6879 }
f56b6394 6880
f7eb7452 6881 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 6882 match = TRUE;
39065660 6883 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
6884
6885 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
6886 * matches. The non-multi char folds have all been moved to
6887 * the compilation phase, and the multi-char folds have
6888 * been stored by regcomp into 'av'; we linearly check to
6889 * see if any match the target string (folded). We know
6890 * that the originals were each one character, but we don't
6891 * currently know how many characters/bytes each folded to,
6892 * except we do know that there are small limits imposed by
6893 * Unicode. XXX A performance enhancement would be to have
6894 * regcomp.c store the max number of chars/bytes that are
6895 * in an av entry, as, say the 0th element. Even better
6896 * would be to have a hash of the few characters that can
6897 * start a multi-char fold to the max number of chars of
6898 * those folds.
f56b6394
KW
6899 *
6900 * If there is a match, we will need to advance (if lenp is
6901 * specified) the match pointer in the target string. But
6902 * what we are comparing here isn't that string directly,
6903 * but its fold, whose length may differ from the original.
6904 * As we go along in constructing the fold, therefore, we
6905 * create a map so that we know how many bytes in the
6906 * source to advance given that we have matched a certain
6907 * number of bytes in the fold. This map is stored in
6438af90
KW
6908 * 'map_fold_len_back'. Let n mean the number of bytes in
6909 * the fold of the first character that we are folding.
6910 * Then map_fold_len_back[n] is set to the number of bytes
6911 * in that first character. Similarly let m be the
6912 * corresponding number for the second character to be
6913 * folded. Then map_fold_len_back[n+m] is set to the
6914 * number of bytes occupied by the first two source
6915 * characters. ... */
6916 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
6917 U8 folded[UTF8_MAXBYTES_CASE+1];
6918 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 6919 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 6920 chars */
f56b6394
KW
6921
6922 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6923
6924 /* Here, only need to fold the first char of the target
6438af90 6925 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 6926 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 6927 total_foldlen = foldlen;
6438af90
KW
6928 map_fold_len_back[foldlen] = (utf8_target)
6929 ? UTF8SKIP(utf8_p)
6930 : 1;
f56b6394
KW
6931 }
6932 else {
6933
6934 /* Here, need to fold more than the first char. Do so
6935 * up to the limits */
f56b6394
KW
6936 U8* source_ptr = utf8_p; /* The source for the fold
6937 is the regex target
6938 string */
6939 U8* folded_ptr = folded;
6940 U8* e = utf8_p + maxlen; /* Can't go beyond last
6941 available byte in the
6942 target string */
6438af90
KW
6943 U8 i;
6944 for (i = 0;
6945 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6946 i++)
f56b6394
KW
6947 {
6948
6949 /* Fold the next character */
6950 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6951 STRLEN this_char_foldlen;
6952 to_utf8_fold(source_ptr,
6953 this_char_folded,
6954 &this_char_foldlen);
6955
6956 /* Bail if it would exceed the byte limit for
6957 * folding a single char. */
6958 if (this_char_foldlen + folded_ptr - folded >
6959 UTF8_MAXBYTES_CASE)
6960 {
6961 break;
6962 }
6963
6438af90 6964 /* Add the fold of this character */
f56b6394
KW
6965 Copy(this_char_folded,
6966 folded_ptr,
6967 this_char_foldlen,
6968 U8);
f56b6394 6969 source_ptr += UTF8SKIP(source_ptr);
6438af90 6970 folded_ptr += this_char_foldlen;
deba3d96 6971 total_foldlen = folded_ptr - folded;
6438af90
KW
6972
6973 /* Create map from the number of bytes in the fold
6974 * back to the number of bytes in the source. If
6975 * the source isn't utf8, the byte count is just
6976 * the number of characters so far */
deba3d96 6977 map_fold_len_back[total_foldlen]
6438af90
KW
6978 = (utf8_target)
6979 ? source_ptr - utf8_p
6980 : i + 1;
f56b6394
KW
6981 }
6982 *folded_ptr = '\0';
f56b6394
KW
6983 }
6984
6985
6986 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
6987 * of multi-char folds. */
6988 if (av) {
7cdde544
KW
6989 I32 i;
6990 for (i = 0; i <= av_len(av); i++) {
6991 SV* const sv = *av_fetch(av, i, FALSE);
6992 STRLEN len;
6993 const char * const s = SvPV_const(sv, len);
6438af90 6994
f9126265
KW
6995 if (len <= total_foldlen
6996 && memEQ(s, (char*)folded, len)
6997
6998 /* If 0, means matched a partial char. See
6999 * [perl #90536] */
7000 && map_fold_len_back[len])
f56b6394
KW
7001 {
7002
7003 /* Advance the target string ptr to account for
7004 * this fold, but have to translate from the
7005 * folded length to the corresponding source
6438af90 7006 * length. */
8f2655f7
KW
7007 if (lenp) {
7008 *lenp = map_fold_len_back[len];
8f2655f7 7009 }
7cdde544
KW
7010 match = TRUE;
7011 break;
7012 }
7013 }
7014 }
7cdde544
KW
7015 }
7016
7017 /* If we allocated a string above, free it */
7018 if (! utf8_target) Safefree(utf8_p);
7019 }
7020 }
7021 }
7022
a0ed51b3
LW
7023 return (flags & ANYOF_INVERT) ? !match : match;
7024}
161b471a 7025
dfe13c55 7026STATIC U8 *
0ce71af7 7027S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7028{
6af86488
KW
7029 /* return the position 'off' UTF-8 characters away from 's', forward if
7030 * 'off' >= 0, backwards if negative. But don't go outside of position
7031 * 'lim', which better be < s if off < 0 */
7032
97aff369 7033 dVAR;
7918f24d
NC
7034
7035 PERL_ARGS_ASSERT_REGHOP3;
7036
a0ed51b3 7037 if (off >= 0) {
1aa99e6b 7038 while (off-- && s < lim) {
ffc61ed2 7039 /* XXX could check well-formedness here */
a0ed51b3 7040 s += UTF8SKIP(s);
ffc61ed2 7041 }
a0ed51b3
LW
7042 }
7043 else {
1de06328
YO
7044 while (off++ && s > lim) {
7045 s--;
7046 if (UTF8_IS_CONTINUED(*s)) {
7047 while (s > lim && UTF8_IS_CONTINUATION(*s))
7048 s--;
a0ed51b3 7049 }
1de06328 7050 /* XXX could check well-formedness here */
a0ed51b3
LW
7051 }
7052 }
7053 return s;
7054}
161b471a 7055
f9f4320a
YO
7056#ifdef XXX_dmq
7057/* there are a bunch of places where we use two reghop3's that should
7058 be replaced with this routine. but since thats not done yet
7059 we ifdef it out - dmq
7060*/
dfe13c55 7061STATIC U8 *
1de06328
YO
7062S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7063{
7064 dVAR;
7918f24d
NC
7065
7066 PERL_ARGS_ASSERT_REGHOP4;
7067
1de06328
YO
7068 if (off >= 0) {
7069 while (off-- && s < rlim) {
7070 /* XXX could check well-formedness here */
7071 s += UTF8SKIP(s);
7072 }
7073 }
7074 else {
7075 while (off++ && s > llim) {
7076 s--;
7077 if (UTF8_IS_CONTINUED(*s)) {
7078 while (s > llim && UTF8_IS_CONTINUATION(*s))
7079 s--;
7080 }
7081 /* XXX could check well-formedness here */
7082 }
7083 }
7084 return s;
7085}
f9f4320a 7086#endif
1de06328
YO
7087
7088STATIC U8 *
0ce71af7 7089S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7090{
97aff369 7091 dVAR;
7918f24d
NC
7092
7093 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7094
a0ed51b3 7095 if (off >= 0) {
1aa99e6b 7096 while (off-- && s < lim) {
ffc61ed2 7097 /* XXX could check well-formedness here */
a0ed51b3 7098 s += UTF8SKIP(s);
ffc61ed2 7099 }
a0ed51b3 7100 if (off >= 0)
3dab1dad 7101 return NULL;
a0ed51b3
LW
7102 }
7103 else {
1de06328
YO
7104 while (off++ && s > lim) {
7105 s--;
7106 if (UTF8_IS_CONTINUED(*s)) {
7107 while (s > lim && UTF8_IS_CONTINUATION(*s))
7108 s--;
a0ed51b3 7109 }
1de06328 7110 /* XXX could check well-formedness here */
a0ed51b3
LW
7111 }
7112 if (off <= 0)
3dab1dad 7113 return NULL;
a0ed51b3
LW
7114 }
7115 return s;
7116}
51371543 7117
51371543 7118static void
acfe0abc 7119restore_pos(pTHX_ void *arg)
51371543 7120{
97aff369 7121 dVAR;
097eb12c 7122 regexp * const rex = (regexp *)arg;
ed301438 7123 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7124 if (PL_reg_oldsaved) {
4f639d21
DM
7125 rex->subbeg = PL_reg_oldsaved;
7126 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 7127#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7128 rex->saved_copy = PL_nrs;
ed252734 7129#endif
07bc277f 7130 RXp_MATCH_COPIED_on(rex);
51371543
GS
7131 }
7132 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7133 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7134 PL_curpm = PL_reg_oldcurpm;
7135 }
7136}
33b8afdf
JH
7137
7138STATIC void
7139S_to_utf8_substr(pTHX_ register regexp *prog)
7140{
a1cac82e 7141 int i = 1;
7918f24d
NC
7142
7143 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7144
a1cac82e
NC
7145 do {
7146 if (prog->substrs->data[i].substr
7147 && !prog->substrs->data[i].utf8_substr) {
7148 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7149 prog->substrs->data[i].utf8_substr = sv;
7150 sv_utf8_upgrade(sv);
610460f9 7151 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7152 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7153 /* Trim the trailing \n that fbm_compile added last
7154 time. */
7155 SvCUR_set(sv, SvCUR(sv) - 1);
7156 /* Whilst this makes the SV technically "invalid" (as its
7157 buffer is no longer followed by "\0") when fbm_compile()
7158 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7159 fbm_compile(sv, FBMcf_TAIL);
7160 } else
7161 fbm_compile(sv, 0);
610460f9 7162 }
a1cac82e
NC
7163 if (prog->substrs->data[i].substr == prog->check_substr)
7164 prog->check_utf8 = sv;
7165 }
7166 } while (i--);
33b8afdf
JH
7167}
7168
7169STATIC void
7170S_to_byte_substr(pTHX_ register regexp *prog)
7171{
97aff369 7172 dVAR;
a1cac82e 7173 int i = 1;
7918f24d
NC
7174
7175 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7176
a1cac82e
NC
7177 do {
7178 if (prog->substrs->data[i].utf8_substr
7179 && !prog->substrs->data[i].substr) {
7180 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7181 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7182 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7183 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7184 /* Trim the trailing \n that fbm_compile added last
7185 time. */
7186 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7187 fbm_compile(sv, FBMcf_TAIL);
7188 } else
7189 fbm_compile(sv, 0);
7190 }
a1cac82e
NC
7191 } else {
7192 SvREFCNT_dec(sv);
7193 sv = &PL_sv_undef;
7194 }
7195 prog->substrs->data[i].substr = sv;
7196 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7197 prog->check_substr = sv;
33b8afdf 7198 }
a1cac82e 7199 } while (i--);
33b8afdf 7200}
66610fdd
RGS
7201
7202/*
7203 * Local variables:
7204 * c-indentation-style: bsd
7205 * c-basic-offset: 4
14d04a33 7206 * indent-tabs-mode: nil
66610fdd
RGS
7207 * End:
7208 *
14d04a33 7209 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7210 */