This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix typo in documentation
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
ef07e810 83#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 84#define RF_warned 2 /* warned about big count? */
faec1544 85
ab3bbdeb 86#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 87
f2ed9b32 88#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
89
90#define RS_init 1 /* eval environment created */
91#define RS_set 2 /* replsv value is set */
c277df42 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
a9a337f8
KW
125#ifdef EBCDIC
126 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
127 * skip the check on EBCDIC platforms */
128# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
129#else
130# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1
RB
131 if (!CAT2(PL_utf8_,class)) { \
132 bool ok; \
133 ENTER; save_re_context(); \
134 ok=CAT2(is_utf8_,class)((const U8*)str); \
9ac792f4 135 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
a9a337f8 136#endif
37e2e78e
KW
137
138/* Doesn't do an assert to verify that is correct */
139#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 140 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 141 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
142 ENTER; save_re_context(); \
143 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
144 LEAVE; } } STMT_END
37e2e78e 145
1a4fad37
AL
146#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
147#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
148#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 149
37e2e78e 150#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
d275fa5e
CB
151 LOAD_UTF8_CHARCLASS(X_begin, " "); \
152 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
37e2e78e
KW
153 /* These are utf8 constants, and not utf-ebcdic constants, so the \
154 * assert should likely and hopefully fail on an EBCDIC machine */ \
d275fa5e 155 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
37e2e78e
KW
156 \
157 /* No asserts are done for these, in case called on an early \
158 * Unicode version in which they map to nothing */ \
d275fa5e
CB
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
163 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
164 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
37e2e78e 165 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
20d0b1e9 166
1dcf4a1b 167#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 168
ee9a90b8
KW
169/* The actual code for CCC_TRY, which uses several variables from the routine
170 * it's callable from. It is designed to be the bulk of a case statement.
171 * FUNC is the macro or function to call on non-utf8 targets that indicate if
172 * nextchr matches the class.
173 * UTF8_TEST is the whole test string to use for utf8 targets
174 * LOAD is what to use to test, and if not present to load in the swash for the
175 * class
176 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
177 * UTF8_TEST test.
178 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
179 * utf8 and a variant, load the swash if necessary and test using the utf8
180 * test. Advance to the next character if test is ok, otherwise fail; If not
181 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
182 * fails, or advance to the next character */
183
184#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
185 if (locinput >= PL_regeol) { \
186 sayNO; \
187 } \
188 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
189 LOAD_UTF8_CHARCLASS(CLASS, STR); \
190 if (POS_OR_NEG (UTF8_TEST)) { \
191 sayNO; \
192 } \
193 locinput += PL_utf8skip[nextchr]; \
194 nextchr = UCHARAT(locinput); \
195 break; \
196 } \
197 if (POS_OR_NEG (FUNC(nextchr))) { \
198 sayNO; \
199 } \
200 nextchr = UCHARAT(++locinput); \
980866de
KW
201 break;
202
ee9a90b8
KW
203/* Handle the non-locale cases for a character class and its complement. It
204 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
205 * This is because that code fails when the test succeeds, so we want to have
206 * the test fail so that the code succeeds. The swash is stored in a
207 * predictable PL_ place */
cfaf538b
KW
208#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
209 CLASS, STR) \
ee9a90b8
KW
210 case NAME: \
211 _CCC_TRY_CODE( !, FUNC, \
212 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
213 (U8*)locinput, TRUE)), \
214 CLASS, STR) \
215 case NNAME: \
1dcf4a1b 216 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
217 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
218 (U8*)locinput, TRUE)), \
219 CLASS, STR) \
220
221/* Generate the case statements for both locale and non-locale character
222 * classes in regmatch for classes that don't have special unicode semantics.
223 * Locales don't use an immediate swash, but an intermediary special locale
224 * function that is called on the pointer to the current place in the input
225 * string. That function will resolve to needing the same swash. One might
226 * think that because we don't know what the locale will match, we shouldn't
227 * check with the swash loading function that it loaded properly; ie, that we
228 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
229 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
230 * irrelevant here */
231#define CCC_TRY(NAME, NNAME, FUNC, \
232 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 233 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
234 CLASS, STR) \
235 case NAMEL: \
236 PL_reg_flags |= RF_tainted; \
237 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
238 case NNAMEL: \
239 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
240 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
241 CLASS, STR) \
cfaf538b
KW
242 case NAMEA: \
243 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
244 sayNO; \
245 } \
246 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
247 nextchr = UCHARAT(++locinput); \
248 break; \
249 case NNAMEA: \
250 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
251 sayNO; \
252 } \
253 if (utf8_target) { \
254 locinput += PL_utf8skip[nextchr]; \
255 nextchr = UCHARAT(locinput); \
256 } \
257 else { \
258 nextchr = UCHARAT(++locinput); \
259 } \
260 break; \
ee9a90b8
KW
261 /* Generate the non-locale cases */ \
262 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
263
264/* This is like CCC_TRY, but has an extra set of parameters for generating case
265 * statements to handle separate Unicode semantics nodes */
266#define CCC_TRY_U(NAME, NNAME, FUNC, \
267 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
268 NAMEU, NNAMEU, FUNCU, \
cfaf538b 269 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 270 CLASS, STR) \
cfaf538b
KW
271 CCC_TRY(NAME, NNAME, FUNC, \
272 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
273 NAMEA, NNAMEA, FUNCA, \
274 CLASS, STR) \
ee9a90b8 275 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 276
3dab1dad
YO
277/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
278
5f80c4cf 279/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
280/* it would be nice to rework regcomp.sym to generate this stuff. sigh
281 *
282 * NOTE that *nothing* that affects backtracking should be in here, specifically
283 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
284 * node that is in between two EXACT like nodes when ascertaining what the required
285 * "follow" character is. This should probably be moved to regex compile time
286 * although it may be done at run time beause of the REF possibility - more
287 * investigation required. -- demerphq
288*/
3e901dc0
YO
289#define JUMPABLE(rn) ( \
290 OP(rn) == OPEN || \
291 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
292 OP(rn) == EVAL || \
cca55fe3
JP
293 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
294 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 295 OP(rn) == KEEPS || \
3dab1dad 296 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 297)
ee9b8eae 298#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 299
ee9b8eae
YO
300#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
301
302#if 0
303/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
304 we don't need this definition. */
305#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
ca600955 306#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
307#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
308
309#else
310/* ... so we use this as its faster. */
311#define IS_TEXT(rn) ( OP(rn)==EXACT )
ca600955 312#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
ee9b8eae
YO
313#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
314#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
315
316#endif
e2d8ce26 317
a84d97b6
HS
318/*
319 Search for mandatory following text node; for lookahead, the text must
320 follow but for lookbehind (rn->flags != 0) we skip to the next step.
321*/
cca55fe3 322#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
323 while (JUMPABLE(rn)) { \
324 const OPCODE type = OP(rn); \
325 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 326 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 327 else if (type == PLUS) \
cca55fe3 328 rn = NEXTOPER(rn); \
3dab1dad 329 else if (type == IFMATCH) \
a84d97b6 330 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 331 else rn += NEXT_OFF(rn); \
3dab1dad 332 } \
5f80c4cf 333} STMT_END
74750237 334
c476f425 335
acfe0abc 336static void restore_pos(pTHX_ void *arg);
51371543 337
620d5b66
NC
338#define REGCP_PAREN_ELEMS 4
339#define REGCP_OTHER_ELEMS 5
e0fa7e2b 340#define REGCP_FRAME_ELEMS 1
620d5b66
NC
341/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
342 * are needed for the regexp context stack bookkeeping. */
343
76e3520e 344STATIC CHECKPOINT
cea2e8a9 345S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 346{
97aff369 347 dVAR;
a3b680e6 348 const int retval = PL_savestack_ix;
a3b680e6 349 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
350 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
351 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
a0d0e21e 352 int p;
40a82448 353 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 354
e49a9654 355 if (paren_elems_to_push < 0)
5637ef5b
NC
356 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
357 paren_elems_to_push);
e49a9654 358
e0fa7e2b
NC
359 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
360 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
361 " out of range (%lu-%ld)",
362 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 363
620d5b66 364 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 365
3280af22 366 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
f0ab9afb
NC
368 SSPUSHINT(PL_regoffs[p].end);
369 SSPUSHINT(PL_regoffs[p].start);
3280af22 370 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 371 SSPUSHINT(p);
e7707071 372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
40a82448 373 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
f0ab9afb 374 (UV)p, (IV)PL_regoffs[p].start,
40a82448 375 (IV)(PL_reg_start_tmp[p] - PL_bostr),
f0ab9afb 376 (IV)PL_regoffs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
f0ab9afb 380 SSPUSHPTR(PL_regoffs);
3280af22
NIS
381 SSPUSHINT(PL_regsize);
382 SSPUSHINT(*PL_reglastparen);
a01268b5 383 SSPUSHINT(*PL_reglastcloseparen);
3280af22 384 SSPUSHPTR(PL_reginput);
e0fa7e2b 385 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 386
a0d0e21e
LW
387 return retval;
388}
389
c277df42 390/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
391#define REGCP_SET(cp) \
392 DEBUG_STATE_r( \
ab3bbdeb 393 PerlIO_printf(Perl_debug_log, \
e4f74956 394 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
395 (IV)PL_savestack_ix)); \
396 cp = PL_savestack_ix
c3464db5 397
ab3bbdeb 398#define REGCP_UNWIND(cp) \
e4f74956 399 DEBUG_STATE_r( \
ab3bbdeb 400 if (cp != PL_savestack_ix) \
e4f74956
YO
401 PerlIO_printf(Perl_debug_log, \
402 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
403 (IV)(cp), (IV)PL_savestack_ix)); \
404 regcpblow(cp)
c277df42 405
76e3520e 406STATIC char *
097eb12c 407S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 408{
97aff369 409 dVAR;
e0fa7e2b 410 UV i;
a0d0e21e 411 char *input;
a3621e74
YO
412 GET_RE_DEBUG_FLAGS_DECL;
413
7918f24d
NC
414 PERL_ARGS_ASSERT_REGCPPOP;
415
b1ce53c5 416 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 417 i = SSPOPUV;
e0fa7e2b
NC
418 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
419 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
a0d0e21e 420 input = (char *) SSPOPPTR;
a01268b5 421 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
422 *PL_reglastparen = SSPOPINT;
423 PL_regsize = SSPOPINT;
f0ab9afb 424 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
b1ce53c5 425
620d5b66 426 i -= REGCP_OTHER_ELEMS;
b1ce53c5 427 /* Now restore the parentheses context. */
620d5b66 428 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 429 I32 tmps;
097eb12c 430 U32 paren = (U32)SSPOPINT;
3280af22 431 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
f0ab9afb 432 PL_regoffs[paren].start = SSPOPINT;
cf93c79d 433 tmps = SSPOPINT;
3280af22 434 if (paren <= *PL_reglastparen)
f0ab9afb 435 PL_regoffs[paren].end = tmps;
e7707071 436 DEBUG_BUFFERS_r(
c3464db5 437 PerlIO_printf(Perl_debug_log,
b900a521 438 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
f0ab9afb 439 (UV)paren, (IV)PL_regoffs[paren].start,
b900a521 440 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
f0ab9afb 441 (IV)PL_regoffs[paren].end,
3280af22 442 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 443 );
a0d0e21e 444 }
e7707071 445 DEBUG_BUFFERS_r(
bb7a0f54 446 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 447 PerlIO_printf(Perl_debug_log,
faccc32b 448 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 449 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
450 }
451 );
daf18116 452#if 1
dafc8851
JH
453 /* It would seem that the similar code in regtry()
454 * already takes care of this, and in fact it is in
455 * a better location to since this code can #if 0-ed out
456 * but the code in regtry() is needed or otherwise tests
457 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
458 * (as of patchlevel 7877) will fail. Then again,
459 * this code seems to be necessary or otherwise
225593e1
DM
460 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
461 * --jhi updated by dapm */
3b6647e0 462 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
097eb12c 463 if (i > PL_regsize)
f0ab9afb
NC
464 PL_regoffs[i].start = -1;
465 PL_regoffs[i].end = -1;
a0d0e21e 466 }
dafc8851 467#endif
a0d0e21e
LW
468 return input;
469}
470
02db2b7b 471#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 472
a687059c 473/*
e50aee73 474 * pregexec and friends
a687059c
LW
475 */
476
76234dfb 477#ifndef PERL_IN_XSUB_RE
a687059c 478/*
c277df42 479 - pregexec - match a regexp against a string
a687059c 480 */
c277df42 481I32
49d7dfbc 482Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 483 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
484/* strend: pointer to null at end of string */
485/* strbeg: real beginning of string */
486/* minend: end of match must be >=minend after stringarg. */
487/* nosave: For optimizations. */
488{
7918f24d
NC
489 PERL_ARGS_ASSERT_PREGEXEC;
490
c277df42 491 return
9041c2e3 492 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
493 nosave ? 0 : REXEC_COPY_STR);
494}
76234dfb 495#endif
22e551b9 496
9041c2e3 497/*
cad2e5aa
JH
498 * Need to implement the following flags for reg_anch:
499 *
500 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
501 * USE_INTUIT_ML
502 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
503 * INTUIT_AUTORITATIVE_ML
504 * INTUIT_ONCE_NOML - Intuit can match in one location only.
505 * INTUIT_ONCE_ML
506 *
507 * Another flag for this function: SECOND_TIME (so that float substrs
508 * with giant delta may be not rechecked).
509 */
510
511/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
512
3f7c398e 513/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
514 Otherwise, only SvCUR(sv) is used to get strbeg. */
515
516/* XXXX We assume that strpos is strbeg unless sv. */
517
6eb5f6b9
JH
518/* XXXX Some places assume that there is a fixed substring.
519 An update may be needed if optimizer marks as "INTUITable"
520 RExen without fixed substrings. Similarly, it is assumed that
521 lengths of all the strings are no more than minlen, thus they
522 cannot come from lookahead.
40d049e4
YO
523 (Or minlen should take into account lookahead.)
524 NOTE: Some of this comment is not correct. minlen does now take account
525 of lookahead/behind. Further research is required. -- demerphq
526
527*/
6eb5f6b9 528
2c2d71f5
JH
529/* A failure to find a constant substring means that there is no need to make
530 an expensive call to REx engine, thus we celebrate a failure. Similarly,
531 finding a substring too deep into the string means that less calls to
30944b6d
IZ
532 regtry() should be needed.
533
534 REx compiler's optimizer found 4 possible hints:
535 a) Anchored substring;
536 b) Fixed substring;
537 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 538 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 539 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
540 string which does not contradict any of them.
541 */
2c2d71f5 542
6eb5f6b9
JH
543/* Most of decisions we do here should have been done at compile time.
544 The nodes of the REx which we used for the search should have been
545 deleted from the finite automaton. */
546
cad2e5aa 547char *
288b8c02 548Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 549 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 550{
97aff369 551 dVAR;
288b8c02 552 struct regexp *const prog = (struct regexp *)SvANY(rx);
b7953727 553 register I32 start_shift = 0;
cad2e5aa 554 /* Should be nonnegative! */
b7953727 555 register I32 end_shift = 0;
2c2d71f5
JH
556 register char *s;
557 register SV *check;
a1933d95 558 char *strbeg;
cad2e5aa 559 char *t;
f2ed9b32 560 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 561 I32 ml_anch;
bd61b366
SS
562 register char *other_last = NULL; /* other substr checked before this */
563 char *check_at = NULL; /* check substr found at this pos */
bbe252da 564 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 565 RXi_GET_DECL(prog,progi);
30944b6d 566#ifdef DEBUGGING
890ce7af 567 const char * const i_strpos = strpos;
30944b6d 568#endif
a3621e74
YO
569 GET_RE_DEBUG_FLAGS_DECL;
570
7918f24d
NC
571 PERL_ARGS_ASSERT_RE_INTUIT_START;
572
f2ed9b32 573 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 574
3c8556c3 575 if (RX_UTF8(rx)) {
b8d68ded
JH
576 PL_reg_flags |= RF_utf8;
577 }
ab3bbdeb 578 DEBUG_EXECUTE_r(
f2ed9b32 579 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
580 sv ? "Guessing start of match in sv for"
581 : "Guessing start of match in string for");
2a782b5b 582 );
cad2e5aa 583
c344f387
JH
584 /* CHR_DIST() would be more correct here but it makes things slow. */
585 if (prog->minlen > strend - strpos) {
a3621e74 586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 587 "String too short... [re_intuit_start]\n"));
cad2e5aa 588 goto fail;
2c2d71f5 589 }
1de06328 590
a1933d95 591 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 592 PL_regeol = strend;
f2ed9b32 593 if (utf8_target) {
33b8afdf
JH
594 if (!prog->check_utf8 && prog->check_substr)
595 to_utf8_substr(prog);
596 check = prog->check_utf8;
597 } else {
598 if (!prog->check_substr && prog->check_utf8)
599 to_byte_substr(prog);
600 check = prog->check_substr;
601 }
1de06328 602 if (check == &PL_sv_undef) {
a3621e74 603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 604 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
605 goto fail;
606 }
bbe252da
YO
607 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
608 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
609 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 610 && !multiline ) ); /* Check after \n? */
cad2e5aa 611
7e25d62c 612 if (!ml_anch) {
bbe252da
YO
613 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
614 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 615 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
616 && sv && !SvROK(sv)
617 && (strpos != strbeg)) {
a3621e74 618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
619 goto fail;
620 }
621 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 622 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 623 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
624 I32 slen;
625
1aa99e6b 626 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 627
653099ff
GS
628 if (SvTAIL(check)) {
629 slen = SvCUR(check); /* >= 1 */
cad2e5aa 630
9041c2e3 631 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 632 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 634 goto fail_finish;
cad2e5aa
JH
635 }
636 /* Now should match s[0..slen-2] */
637 slen--;
3f7c398e 638 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 639 || (slen > 1
3f7c398e 640 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 641 report_neq:
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
643 goto fail_finish;
644 }
cad2e5aa 645 }
3f7c398e 646 else if (*SvPVX_const(check) != *s
653099ff 647 || ((slen = SvCUR(check)) > 1
3f7c398e 648 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 649 goto report_neq;
c315bfe8 650 check_at = s;
2c2d71f5 651 goto success_at_start;
7e25d62c 652 }
cad2e5aa 653 }
2c2d71f5 654 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 655 s = strpos;
2c2d71f5 656 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
657 end_shift = prog->check_end_shift;
658
2c2d71f5 659 if (!ml_anch) {
a3b680e6 660 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 661 - (SvTAIL(check) != 0);
a3b680e6 662 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
663
664 if (end_shift < eshift)
665 end_shift = eshift;
666 }
cad2e5aa 667 }
2c2d71f5 668 else { /* Can match at random position */
cad2e5aa
JH
669 ml_anch = 0;
670 s = strpos;
1de06328
YO
671 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
672 end_shift = prog->check_end_shift;
673
674 /* end shift should be non negative here */
cad2e5aa
JH
675 }
676
bcdf7404 677#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 678 if (end_shift < 0)
1de06328 679 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 680 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
681#endif
682
2c2d71f5
JH
683 restart:
684 /* Find a possible match in the region s..strend by looking for
685 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
686
687 {
688 I32 srch_start_shift = start_shift;
689 I32 srch_end_shift = end_shift;
690 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
691 srch_end_shift -= ((strbeg - s) - srch_start_shift);
692 srch_start_shift = strbeg - s;
693 }
6bda09f9 694 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
695 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
696 (IV)prog->check_offset_min,
697 (IV)srch_start_shift,
698 (IV)srch_end_shift,
699 (IV)prog->check_end_shift);
700 });
701
f1905e1b 702 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
cad2e5aa 703 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 704 I32 * const pp = data ? data->scream_pos : &p;
4185c919 705 const MAGIC *mg;
72de20cd 706 bool found = FALSE;
cad2e5aa 707
4185c919
NC
708 assert(SvMAGICAL(sv));
709 mg = mg_find(sv, PERL_MAGIC_study);
710 assert(mg);
711
72de20cd
NC
712 if (mg->mg_private == 1) {
713 found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
714 } else if (mg->mg_private == 2) {
715 found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
716 } else {
717 assert (mg->mg_private == 4);
718 found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
719 }
720
721 if (found
2c2d71f5 722 || ( BmRARE(check) == '\n'
85c508c3 723 && (BmPREVIOUS(check) == SvCUR(check) - 1)
2c2d71f5 724 && SvTAIL(check) ))
9041c2e3 725 s = screaminstr(sv, check,
1de06328 726 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
cad2e5aa 727 else
2c2d71f5 728 goto fail_finish;
4addbd3b 729 /* we may be pointing at the wrong string */
07bc277f 730 if (s && RXp_MATCH_COPIED(prog))
3f7c398e 731 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
732 if (data)
733 *data->scream_olds = s;
734 }
1de06328
YO
735 else {
736 U8* start_point;
737 U8* end_point;
bbe252da 738 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
739 start_point= (U8*)(s + srch_start_shift);
740 end_point= (U8*)(strend - srch_end_shift);
741 } else {
742 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
743 end_point= HOP3(strend, -srch_end_shift, strbeg);
744 }
6bda09f9 745 DEBUG_OPTIMISE_MORE_r({
56570a2c 746 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 747 (int)(end_point - start_point),
fc8cd66c 748 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
749 start_point);
750 });
751
752 s = fbm_instr( start_point, end_point,
7fba1cd6 753 check, multiline ? FBMrf_MULTILINE : 0);
1de06328
YO
754 }
755 }
cad2e5aa
JH
756 /* Update the count-of-usability, remove useless subpatterns,
757 unshift s. */
2c2d71f5 758
ab3bbdeb 759 DEBUG_EXECUTE_r({
f2ed9b32 760 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
761 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
762 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 763 (s ? "Found" : "Did not find"),
f2ed9b32 764 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
765 ? "anchored" : "floating"),
766 quoted,
767 RE_SV_TAIL(check),
768 (s ? " at offset " : "...\n") );
769 });
2c2d71f5
JH
770
771 if (!s)
772 goto fail_finish;
2c2d71f5 773 /* Finish the diagnostic message */
a3621e74 774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 775
1de06328
YO
776 /* XXX dmq: first branch is for positive lookbehind...
777 Our check string is offset from the beginning of the pattern.
778 So we need to do any stclass tests offset forward from that
779 point. I think. :-(
780 */
781
782
783
784 check_at=s;
785
786
2c2d71f5
JH
787 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
788 Start with the other substr.
789 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 790 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
791 *always* match. Probably should be marked during compile...
792 Probably it is right to do no SCREAM here...
793 */
794
f2ed9b32 795 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
796 : (prog->float_substr && prog->anchored_substr))
797 {
30944b6d 798 /* Take into account the "other" substring. */
2c2d71f5
JH
799 /* XXXX May be hopelessly wrong for UTF... */
800 if (!other_last)
6eb5f6b9 801 other_last = strpos;
f2ed9b32 802 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
803 do_other_anchored:
804 {
890ce7af
AL
805 char * const last = HOP3c(s, -start_shift, strbeg);
806 char *last1, *last2;
be8e71aa 807 char * const saved_s = s;
33b8afdf 808 SV* must;
2c2d71f5 809
2c2d71f5
JH
810 t = s - prog->check_offset_max;
811 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 812 && (!utf8_target
0ce71af7 813 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 814 && t > strpos)))
6f207bd3 815 NOOP;
2c2d71f5
JH
816 else
817 t = strpos;
1aa99e6b 818 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
819 if (t < other_last) /* These positions already checked */
820 t = other_last;
1aa99e6b 821 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
822 if (last < last1)
823 last1 = last;
1de06328
YO
824 /* XXXX It is not documented what units *_offsets are in.
825 We assume bytes, but this is clearly wrong.
826 Meaning this code needs to be carefully reviewed for errors.
827 dmq.
828 */
829
2c2d71f5 830 /* On end-of-str: see comment below. */
f2ed9b32 831 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
832 if (must == &PL_sv_undef) {
833 s = (char*)NULL;
1de06328 834 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
835 }
836 else
837 s = fbm_instr(
838 (unsigned char*)t,
839 HOP3(HOP3(last1, prog->anchored_offset, strend)
840 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
841 must,
7fba1cd6 842 multiline ? FBMrf_MULTILINE : 0
33b8afdf 843 );
ab3bbdeb 844 DEBUG_EXECUTE_r({
f2ed9b32 845 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
846 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
847 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 848 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
849 quoted, RE_SV_TAIL(must));
850 });
851
852
2c2d71f5
JH
853 if (!s) {
854 if (last1 >= last2) {
a3621e74 855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
856 ", giving up...\n"));
857 goto fail_finish;
858 }
a3621e74 859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 860 ", trying floating at offset %ld...\n",
be8e71aa 861 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
862 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
863 s = HOP3c(last, 1, strend);
2c2d71f5
JH
864 goto restart;
865 }
866 else {
a3621e74 867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 868 (long)(s - i_strpos)));
1aa99e6b
IH
869 t = HOP3c(s, -prog->anchored_offset, strbeg);
870 other_last = HOP3c(s, 1, strend);
be8e71aa 871 s = saved_s;
2c2d71f5
JH
872 if (t == strpos)
873 goto try_at_start;
2c2d71f5
JH
874 goto try_at_offset;
875 }
30944b6d 876 }
2c2d71f5
JH
877 }
878 else { /* Take into account the floating substring. */
33b8afdf 879 char *last, *last1;
be8e71aa 880 char * const saved_s = s;
33b8afdf
JH
881 SV* must;
882
883 t = HOP3c(s, -start_shift, strbeg);
884 last1 = last =
885 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
886 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
887 last = HOP3c(t, prog->float_max_offset, strend);
888 s = HOP3c(t, prog->float_min_offset, strend);
889 if (s < other_last)
890 s = other_last;
2c2d71f5 891 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 892 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
893 /* fbm_instr() takes into account exact value of end-of-str
894 if the check is SvTAIL(ed). Since false positives are OK,
895 and end-of-str is not later than strend we are OK. */
896 if (must == &PL_sv_undef) {
897 s = (char*)NULL;
1de06328 898 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
899 }
900 else
2c2d71f5 901 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
902 (unsigned char*)last + SvCUR(must)
903 - (SvTAIL(must)!=0),
7fba1cd6 904 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 905 DEBUG_EXECUTE_r({
f2ed9b32 906 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
907 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
908 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 909 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
910 quoted, RE_SV_TAIL(must));
911 });
33b8afdf
JH
912 if (!s) {
913 if (last1 == last) {
a3621e74 914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
915 ", giving up...\n"));
916 goto fail_finish;
2c2d71f5 917 }
a3621e74 918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 919 ", trying anchored starting at offset %ld...\n",
be8e71aa 920 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
921 other_last = last;
922 s = HOP3c(t, 1, strend);
923 goto restart;
924 }
925 else {
a3621e74 926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
927 (long)(s - i_strpos)));
928 other_last = s; /* Fix this later. --Hugo */
be8e71aa 929 s = saved_s;
33b8afdf
JH
930 if (t == strpos)
931 goto try_at_start;
932 goto try_at_offset;
933 }
2c2d71f5 934 }
cad2e5aa 935 }
2c2d71f5 936
1de06328 937
9ef43ace 938 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 939
6bda09f9 940 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
941 PerlIO_printf(Perl_debug_log,
942 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
943 (IV)prog->check_offset_min,
944 (IV)prog->check_offset_max,
945 (IV)(s-strpos),
946 (IV)(t-strpos),
947 (IV)(t-s),
948 (IV)(strend-strpos)
949 )
950 );
951
2c2d71f5 952 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 953 && (!utf8_target
9ef43ace 954 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
955 && t > strpos)))
956 {
2c2d71f5
JH
957 /* Fixed substring is found far enough so that the match
958 cannot start at strpos. */
959 try_at_offset:
cad2e5aa 960 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
961 /* Eventually fbm_*() should handle this, but often
962 anchored_offset is not 0, so this check will not be wasted. */
963 /* XXXX In the code below we prefer to look for "^" even in
964 presence of anchored substrings. And we search even
965 beyond the found float position. These pessimizations
966 are historical artefacts only. */
967 find_anchor:
2c2d71f5 968 while (t < strend - prog->minlen) {
cad2e5aa 969 if (*t == '\n') {
4ee3650e 970 if (t < check_at - prog->check_offset_min) {
f2ed9b32 971 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
972 /* Since we moved from the found position,
973 we definitely contradict the found anchored
30944b6d
IZ
974 substr. Due to the above check we do not
975 contradict "check" substr.
976 Thus we can arrive here only if check substr
977 is float. Redo checking for "other"=="fixed".
978 */
9041c2e3 979 strpos = t + 1;
a3621e74 980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 981 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
982 goto do_other_anchored;
983 }
4ee3650e
GS
984 /* We don't contradict the found floating substring. */
985 /* XXXX Why not check for STCLASS? */
cad2e5aa 986 s = t + 1;
a3621e74 987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 988 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
989 goto set_useful;
990 }
4ee3650e
GS
991 /* Position contradicts check-string */
992 /* XXXX probably better to look for check-string
993 than for "\n", so one should lower the limit for t? */
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 996 other_last = strpos = s = t + 1;
cad2e5aa
JH
997 goto restart;
998 }
999 t++;
1000 }
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1002 PL_colors[0], PL_colors[1]));
2c2d71f5 1003 goto fail_finish;
cad2e5aa 1004 }
f5952150 1005 else {
a3621e74 1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1007 PL_colors[0], PL_colors[1]));
f5952150 1008 }
cad2e5aa
JH
1009 s = t;
1010 set_useful:
f2ed9b32 1011 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1012 }
1013 else {
f5952150 1014 /* The found string does not prohibit matching at strpos,
2c2d71f5 1015 - no optimization of calling REx engine can be performed,
f5952150
GS
1016 unless it was an MBOL and we are not after MBOL,
1017 or a future STCLASS check will fail this. */
2c2d71f5
JH
1018 try_at_start:
1019 /* Even in this situation we may use MBOL flag if strpos is offset
1020 wrt the start of the string. */
05b4157f 1021 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1022 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1023 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1024 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1025 {
cad2e5aa
JH
1026 t = strpos;
1027 goto find_anchor;
1028 }
a3621e74 1029 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1030 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1031 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1032 );
2c2d71f5 1033 success_at_start:
bbe252da 1034 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1035 && (utf8_target ? (
33b8afdf
JH
1036 prog->check_utf8 /* Could be deleted already */
1037 && --BmUSEFUL(prog->check_utf8) < 0
1038 && (prog->check_utf8 == prog->float_utf8)
1039 ) : (
1040 prog->check_substr /* Could be deleted already */
1041 && --BmUSEFUL(prog->check_substr) < 0
1042 && (prog->check_substr == prog->float_substr)
1043 )))
66e933ab 1044 {
cad2e5aa 1045 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1046 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1047 /* XXX Does the destruction order has to change with utf8_target? */
1048 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1050 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1051 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1052 check = NULL; /* abort */
cad2e5aa 1053 s = strpos;
486ec47a 1054 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1055 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056 if (prog->intflags & PREGf_IMPLICIT)
1057 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1058 /* XXXX This is a remnant of the old implementation. It
1059 looks wasteful, since now INTUIT can use many
6eb5f6b9 1060 other heuristics. */
bbe252da 1061 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1062 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1063 }
1064 else
1065 s = strpos;
1066 }
1067
6eb5f6b9
JH
1068 /* Last resort... */
1069 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1070 /* trie stclasses are too expensive to use here, we are better off to
1071 leave it to regmatch itself */
f8fc2ecf 1072 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1073 /* minlen == 0 is possible if regstclass is \b or \B,
1074 and the fixed substr is ''$.
1075 Since minlen is already taken into account, s+1 is before strend;
1076 accidentally, minlen >= 1 guaranties no false positives at s + 1
1077 even for \b or \B. But (minlen? 1 : 0) below assumes that
1078 regstclass does not come from lookahead... */
1079 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1080 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1081 const U8* const str = (U8*)STRING(progi->regstclass);
1082 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1084 : 1);
1de06328
YO
1085 char * endpos;
1086 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088 else if (prog->float_substr || prog->float_utf8)
1089 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1090 else
1091 endpos= strend;
1092
70685ca0
JH
1093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1de06328 1095
6eb5f6b9 1096 t = s;
f8fc2ecf 1097 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
6eb5f6b9
JH
1098 if (!s) {
1099#ifdef DEBUGGING
cbbf8932 1100 const char *what = NULL;
6eb5f6b9
JH
1101#endif
1102 if (endpos == strend) {
a3621e74 1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1104 "Could not match STCLASS...\n") );
1105 goto fail;
1106 }
a3621e74 1107 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1108 "This position contradicts STCLASS...\n") );
bbe252da 1109 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1110 goto fail;
6eb5f6b9 1111 /* Contradict one of substrings */
33b8afdf 1112 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1113 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1114 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1115 hop_and_restart:
1aa99e6b 1116 s = HOP3c(t, 1, strend);
66e933ab
GS
1117 if (s + start_shift + end_shift > strend) {
1118 /* XXXX Should be taken into account earlier? */
a3621e74 1119 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1120 "Could not match STCLASS...\n") );
1121 goto fail;
1122 }
5e39e1e5
HS
1123 if (!check)
1124 goto giveup;
a3621e74 1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1126 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1127 what, (long)(s + start_shift - i_strpos)) );
1128 goto restart;
1129 }
66e933ab 1130 /* Have both, check_string is floating */
6eb5f6b9
JH
1131 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132 goto retry_floating_check;
1133 /* Recheck anchored substring, but not floating... */
9041c2e3 1134 s = check_at;
5e39e1e5
HS
1135 if (!check)
1136 goto giveup;
a3621e74 1137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1138 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1139 (long)(other_last - i_strpos)) );
1140 goto do_other_anchored;
1141 }
60e71179
GS
1142 /* Another way we could have checked stclass at the
1143 current position only: */
1144 if (ml_anch) {
1145 s = t = t + 1;
5e39e1e5
HS
1146 if (!check)
1147 goto giveup;
a3621e74 1148 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1149 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1150 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1151 goto try_at_offset;
66e933ab 1152 }
f2ed9b32 1153 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1154 goto fail;
486ec47a 1155 /* Check is floating substring. */
6eb5f6b9
JH
1156 retry_floating_check:
1157 t = check_at - start_shift;
a3621e74 1158 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1159 goto hop_and_restart;
1160 }
b7953727 1161 if (t != s) {
a3621e74 1162 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1163 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1164 (long)(t - i_strpos), (long)(s - i_strpos))
1165 );
1166 }
1167 else {
a3621e74 1168 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1169 "Does not contradict STCLASS...\n");
1170 );
1171 }
6eb5f6b9 1172 }
5e39e1e5 1173 giveup:
a3621e74 1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1175 PL_colors[4], (check ? "Guessed" : "Giving up"),
1176 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1177 return s;
2c2d71f5
JH
1178
1179 fail_finish: /* Substring not found */
33b8afdf 1180 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1181 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1182 fail:
a3621e74 1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1184 PL_colors[4], PL_colors[5]));
bd61b366 1185 return NULL;
cad2e5aa 1186}
9661b544 1187
a0a388a1
YO
1188#define DECL_TRIE_TYPE(scan) \
1189 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1190 trie_type = (scan->flags != EXACT) \
f2ed9b32
KW
1191 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1192 : (utf8_target ? trie_utf8 : trie_plain)
3b0527fe 1193
55eed653
NC
1194#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1195uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
4cadc6a9
YO
1196 switch (trie_type) { \
1197 case trie_utf8_fold: \
1198 if ( foldlen>0 ) { \
0abd0d78 1199 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
4cadc6a9
YO
1200 foldlen -= len; \
1201 uscan += len; \
1202 len=0; \
1203 } else { \
ad0bff0e
KW
1204 uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen ); \
1205 len = UTF8SKIP(uc); \
4cadc6a9
YO
1206 foldlen -= UNISKIP( uvc ); \
1207 uscan = foldbuf + UNISKIP( uvc ); \
1208 } \
1209 break; \
a0a388a1
YO
1210 case trie_latin_utf8_fold: \
1211 if ( foldlen>0 ) { \
1212 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1213 foldlen -= len; \
1214 uscan += len; \
1215 len=0; \
1216 } else { \
1217 len = 1; \
1218 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1219 foldlen -= UNISKIP( uvc ); \
1220 uscan = foldbuf + UNISKIP( uvc ); \
1221 } \
1222 break; \
4cadc6a9
YO
1223 case trie_utf8: \
1224 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1225 break; \
1226 case trie_plain: \
1227 uvc = (UV)*uc; \
1228 len = 1; \
1229 } \
4cadc6a9
YO
1230 if (uvc < 256) { \
1231 charid = trie->charmap[ uvc ]; \
1232 } \
1233 else { \
1234 charid = 0; \
55eed653
NC
1235 if (widecharmap) { \
1236 SV** const svpp = hv_fetch(widecharmap, \
4cadc6a9
YO
1237 (char*)&uvc, sizeof(UV), 0); \
1238 if (svpp) \
1239 charid = (U16)SvIV(*svpp); \
1240 } \
1241 } \
1242} STMT_END
1243
4cadc6a9
YO
1244#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1245STMT_START { \
1246 while (s <= e) { \
1247 if ( (CoNd) \
fac1af77 1248 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1249 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1250 goto got_it; \
1251 s++; \
1252 } \
1253} STMT_END
1254
1255#define REXEC_FBC_UTF8_SCAN(CoDe) \
1256STMT_START { \
1257 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1258 CoDe \
1259 s += uskip; \
1260 } \
1261} STMT_END
1262
1263#define REXEC_FBC_SCAN(CoDe) \
1264STMT_START { \
1265 while (s < strend) { \
1266 CoDe \
1267 s++; \
1268 } \
1269} STMT_END
1270
1271#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1272REXEC_FBC_UTF8_SCAN( \
1273 if (CoNd) { \
24b23f37 1274 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1275 goto got_it; \
1276 else \
1277 tmp = doevery; \
1278 } \
1279 else \
1280 tmp = 1; \
1281)
1282
1283#define REXEC_FBC_CLASS_SCAN(CoNd) \
1284REXEC_FBC_SCAN( \
1285 if (CoNd) { \
24b23f37 1286 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1287 goto got_it; \
1288 else \
1289 tmp = doevery; \
1290 } \
1291 else \
1292 tmp = 1; \
1293)
1294
1295#define REXEC_FBC_TRYIT \
24b23f37 1296if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1297 goto got_it
1298
e1d1eefb 1299#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1300 if (utf8_target) { \
e1d1eefb
YO
1301 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1302 } \
1303 else { \
1304 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1305 }
e1d1eefb 1306
4cadc6a9 1307#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1308 if (utf8_target) { \
4cadc6a9
YO
1309 UtFpReLoAd; \
1310 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1311 } \
1312 else { \
1313 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1314 }
4cadc6a9
YO
1315
1316#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1317 PL_reg_flags |= RF_tainted; \
f2ed9b32 1318 if (utf8_target) { \
4cadc6a9
YO
1319 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1320 } \
1321 else { \
1322 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1323 }
4cadc6a9 1324
786e8c11
YO
1325#define DUMP_EXEC_POS(li,s,doutf8) \
1326 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1327
cfaf538b
KW
1328
1329#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1330 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1331 tmp = TEST_NON_UTF8(tmp); \
1332 REXEC_FBC_UTF8_SCAN( \
1333 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1334 tmp = !tmp; \
1335 IF_SUCCESS; \
1336 } \
1337 else { \
1338 IF_FAIL; \
1339 } \
1340 ); \
1341
1342#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1343 if (s == PL_bostr) { \
1344 tmp = '\n'; \
1345 } \
1346 else { \
1347 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1348 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1349 } \
1350 tmp = TeSt1_UtF8; \
1351 LOAD_UTF8_CHARCLASS_ALNUM(); \
1352 REXEC_FBC_UTF8_SCAN( \
1353 if (tmp == ! (TeSt2_UtF8)) { \
1354 tmp = !tmp; \
1355 IF_SUCCESS; \
1356 } \
1357 else { \
1358 IF_FAIL; \
1359 } \
1360 ); \
1361
63ac0dad
KW
1362/* The only difference between the BOUND and NBOUND cases is that
1363 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1364 * NBOUND. This is accomplished by passing it in either the if or else clause,
1365 * with the other one being empty */
1366#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1367 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1368
1369#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1370 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1371
1372#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1373 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1374
1375#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1376 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1377
63ac0dad
KW
1378
1379/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1380 * be passed in completely with the variable name being tested, which isn't
1381 * such a clean interface, but this is easier to read than it was before. We
1382 * are looking for the boundary (or non-boundary between a word and non-word
1383 * character. The utf8 and non-utf8 cases have the same logic, but the details
1384 * must be different. Find the "wordness" of the character just prior to this
1385 * one, and compare it with the wordness of this one. If they differ, we have
1386 * a boundary. At the beginning of the string, pretend that the previous
1387 * character was a new-line */
cfaf538b 1388#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1389 if (utf8_target) { \
cfaf538b 1390 UTF8_CODE \
63ac0dad
KW
1391 } \
1392 else { /* Not utf8 */ \
1393 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1394 tmp = TEST_NON_UTF8(tmp); \
1395 REXEC_FBC_SCAN( \
1396 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1397 tmp = !tmp; \
1398 IF_SUCCESS; \
1399 } \
1400 else { \
1401 IF_FAIL; \
1402 } \
1403 ); \
1404 } \
1405 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1406 goto got_it;
1407
786e8c11
YO
1408/* We know what class REx starts with. Try to find this position... */
1409/* if reginfo is NULL, its a dryrun */
1410/* annoyingly all the vars in this routine have different names from their counterparts
1411 in regmatch. /grrr */
1412
3c3eec57 1413STATIC char *
07be1b83 1414S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1415 const char *strend, regmatch_info *reginfo)
a687059c 1416{
27da23d5 1417 dVAR;
bbe252da 1418 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1419 char *pat_string; /* The pattern's exactish string */
1420 char *pat_end; /* ptr to end char of pat_string */
1421 re_fold_t folder; /* Function for computing non-utf8 folds */
1422 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1423 STRLEN ln;
5dab1207 1424 STRLEN lnc;
078c425b 1425 register STRLEN uskip;
fac1af77
KW
1426 U8 c1;
1427 U8 c2;
6eb5f6b9
JH
1428 char *e;
1429 register I32 tmp = 1; /* Scratch variable? */
f2ed9b32 1430 register const bool utf8_target = PL_reg_match_utf8;
453bfd44 1431 UV utf8_fold_flags = 0;
f8fc2ecf 1432 RXi_GET_DECL(prog,progi);
7918f24d
NC
1433
1434 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1435
6eb5f6b9
JH
1436 /* We know what class it must start with. */
1437 switch (OP(c)) {
f56b6394 1438 case ANYOFV:
6eb5f6b9 1439 case ANYOF:
f56b6394 1440 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1441 STRLEN inclasslen = strend - s;
1442 REXEC_FBC_UTF8_CLASS_SCAN(
1443 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1444 }
1445 else {
6ef69d56 1446 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1447 }
6eb5f6b9 1448 break;
f33976b4 1449 case CANY:
4cadc6a9 1450 REXEC_FBC_SCAN(
24b23f37 1451 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1452 goto got_it;
1453 else
1454 tmp = doevery;
4cadc6a9 1455 );
f33976b4 1456 break;
2f7f8cb1
KW
1457
1458 case EXACTFA:
1459 if (UTF_PATTERN || utf8_target) {
1460 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1461 goto do_exactf_utf8;
1462 }
1463 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1464 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1465 goto do_exactf_non_utf8; /* isn't dealt with by these */
1466
6eb5f6b9 1467 case EXACTF:
62bf7766 1468 if (utf8_target) {
77a6d856
KW
1469
1470 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1471 utf8_fold_flags = 0;
fac1af77
KW
1472 goto do_exactf_utf8;
1473 }
1474 fold_array = PL_fold;
1475 folder = foldEQ;
1476 goto do_exactf_non_utf8;
1477
1478 case EXACTFL:
1479 if (UTF_PATTERN || utf8_target) {
17580e7a 1480 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1481 goto do_exactf_utf8;
1482 }
1483 fold_array = PL_fold_locale;
1484 folder = foldEQ_locale;
16d951b7
KW
1485 goto do_exactf_non_utf8;
1486
3c760661
KW
1487 case EXACTFU_SS:
1488 if (UTF_PATTERN) {
1489 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1490 }
1491 goto do_exactf_utf8;
1492
ca600955 1493 case EXACTFU_NO_TRIE:
16d951b7
KW
1494 case EXACTFU:
1495 if (UTF_PATTERN || utf8_target) {
77a6d856 1496 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1497 goto do_exactf_utf8;
1498 }
1499
1500 /* Any 'ss' in the pattern should have been replaced by regcomp,
1501 * so we don't have to worry here about this single special case
1502 * in the Latin1 range */
1503 fold_array = PL_fold_latin1;
1504 folder = foldEQ_latin1;
fac1af77
KW
1505
1506 /* FALL THROUGH */
1507
62bf7766
KW
1508 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1509 are no glitches with fold-length differences
1510 between the target string and pattern */
fac1af77
KW
1511
1512 /* The idea in the non-utf8 EXACTF* cases is to first find the
1513 * first character of the EXACTF* node and then, if necessary,
1514 * case-insensitively compare the full text of the node. c1 is the
1515 * first character. c2 is its fold. This logic will not work for
1516 * Unicode semantics and the german sharp ss, which hence should
1517 * not be compiled into a node that gets here. */
1518 pat_string = STRING(c);
1519 ln = STR_LEN(c); /* length to match in octets/bytes */
1520
8a90a8fe
KW
1521 /* We know that we have to match at least 'ln' bytes (which is the
1522 * same as characters, since not utf8). If we have to match 3
1523 * characters, and there are only 2 availabe, we know without
1524 * trying that it will fail; so don't start a match past the
1525 * required minimum number from the far end */
fac1af77
KW
1526 e = HOP3c(strend, -((I32)ln), s);
1527
1528 if (!reginfo && e < s) {
1529 e = s; /* Due to minlen logic of intuit() */
1530 }
1531
1532 c1 = *pat_string;
1533 c2 = fold_array[c1];
1534 if (c1 == c2) { /* If char and fold are the same */
1535 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1536 }
1537 else {
1538 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1539 }
1540 break;
1541
1542 do_exactf_utf8:
e067297c
KW
1543 {
1544 unsigned expansion;
1545
fac1af77
KW
1546
1547 /* If one of the operands is in utf8, we can't use the simpler
1548 * folding above, due to the fact that many different characters
1549 * can have the same fold, or portion of a fold, or different-
1550 * length fold */
1551 pat_string = STRING(c);
1552 ln = STR_LEN(c); /* length to match in octets/bytes */
1553 pat_end = pat_string + ln;
1554 lnc = (UTF_PATTERN) /* length to match in characters */
1555 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1556 : ln;
1557
e067297c
KW
1558 /* We have 'lnc' characters to match in the pattern, but because of
1559 * multi-character folding, each character in the target can match
1560 * up to 3 characters (Unicode guarantees it will never exceed
1561 * this) if it is utf8-encoded; and up to 2 if not (based on the
1562 * fact that the Latin 1 folds are already determined, and the
1563 * only multi-char fold in that range is the sharp-s folding to
1564 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1565 * string character. Adjust lnc accordingly, rounding up, so that
1566 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1567 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1568 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1569
1570 /* As in the non-UTF8 case, if we have to match 3 characters, and
1571 * only 2 are left, it's guaranteed to fail, so don't start a
1572 * match that would require us to go beyond the end of the string
1573 */
1574 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1575
1576 if (!reginfo && e < s) {
1577 e = s; /* Due to minlen logic of intuit() */
1578 }
1579
b33105db
KW
1580 /* XXX Note that we could recalculate e to stop the loop earlier,
1581 * as the worst case expansion above will rarely be met, and as we
1582 * go along we would usually find that e moves further to the left.
1583 * This would happen only after we reached the point in the loop
1584 * where if there were no expansion we should fail. Unclear if
1585 * worth the expense */
e067297c 1586
fac1af77
KW
1587 while (s <= e) {
1588 char *my_strend= (char *)strend;
1589 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1590 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1591 && (!reginfo || regtry(reginfo, &s)) )
1592 {
1593 goto got_it;
1594 }
bbdd8bad 1595 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1596 }
1597 break;
e067297c 1598 }
bbce6d69 1599 case BOUNDL:
3280af22 1600 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1601 FBC_BOUND(isALNUM_LC,
1602 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1603 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1604 break;
bbce6d69 1605 case NBOUNDL:
3280af22 1606 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1607 FBC_NBOUND(isALNUM_LC,
1608 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1609 isALNUM_LC_utf8((U8*)s));
1610 break;
1611 case BOUND:
1612 FBC_BOUND(isWORDCHAR,
1613 isALNUM_uni(tmp),
1614 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1615 break;
cfaf538b
KW
1616 case BOUNDA:
1617 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1618 isWORDCHAR_A(tmp),
1619 isWORDCHAR_A((U8*)s));
1620 break;
a0d0e21e 1621 case NBOUND:
63ac0dad
KW
1622 FBC_NBOUND(isWORDCHAR,
1623 isALNUM_uni(tmp),
1624 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1625 break;
cfaf538b
KW
1626 case NBOUNDA:
1627 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1628 isWORDCHAR_A(tmp),
1629 isWORDCHAR_A((U8*)s));
1630 break;
63ac0dad
KW
1631 case BOUNDU:
1632 FBC_BOUND(isWORDCHAR_L1,
1633 isALNUM_uni(tmp),
1634 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1635 break;
1636 case NBOUNDU:
1637 FBC_NBOUND(isWORDCHAR_L1,
1638 isALNUM_uni(tmp),
1639 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1640 break;
bbce6d69 1641 case ALNUML:
4cadc6a9
YO
1642 REXEC_FBC_CSCAN_TAINT(
1643 isALNUM_LC_utf8((U8*)s),
1644 isALNUM_LC(*s)
1645 );
6895a8aa 1646 break;
980866de
KW
1647 case ALNUMU:
1648 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1649 LOAD_UTF8_CHARCLASS_ALNUM(),
1650 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1651 isWORDCHAR_L1((U8) *s)
1652 );
6895a8aa 1653 break;
980866de
KW
1654 case ALNUM:
1655 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1656 LOAD_UTF8_CHARCLASS_ALNUM(),
1657 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1658 isWORDCHAR((U8) *s)
1659 );
6895a8aa 1660 break;
cfaf538b 1661 case ALNUMA:
8e9da4d4
KW
1662 /* Don't need to worry about utf8, as it can match only a single
1663 * byte invariant character */
cfaf538b 1664 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1665 break;
980866de
KW
1666 case NALNUMU:
1667 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1668 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1669 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1670 ! isWORDCHAR_L1((U8) *s)
1671 );
6895a8aa 1672 break;
a0d0e21e 1673 case NALNUM:
4cadc6a9 1674 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1675 LOAD_UTF8_CHARCLASS_ALNUM(),
1676 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1677 ! isALNUM(*s)
4cadc6a9 1678 );
6895a8aa 1679 break;
cfaf538b 1680 case NALNUMA:
8e9da4d4
KW
1681 REXEC_FBC_CSCAN(
1682 !isWORDCHAR_A(*s),
1683 !isWORDCHAR_A(*s)
1684 );
1685 break;
bbce6d69 1686 case NALNUML:
4cadc6a9
YO
1687 REXEC_FBC_CSCAN_TAINT(
1688 !isALNUM_LC_utf8((U8*)s),
1689 !isALNUM_LC(*s)
1690 );
6895a8aa 1691 break;
980866de
KW
1692 case SPACEU:
1693 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1694 LOAD_UTF8_CHARCLASS_SPACE(),
1695 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1696 isSPACE_L1((U8) *s)
1697 );
6895a8aa 1698 break;
a0d0e21e 1699 case SPACE:
4cadc6a9 1700 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1701 LOAD_UTF8_CHARCLASS_SPACE(),
1702 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1703 isSPACE((U8) *s)
4cadc6a9 1704 );
6895a8aa 1705 break;
cfaf538b 1706 case SPACEA:
8e9da4d4
KW
1707 /* Don't need to worry about utf8, as it can match only a single
1708 * byte invariant character */
cfaf538b 1709 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1710 break;
bbce6d69 1711 case SPACEL:
4cadc6a9 1712 REXEC_FBC_CSCAN_TAINT(
6bbba904 1713 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1714 isSPACE_LC(*s)
1715 );
6895a8aa 1716 break;
980866de
KW
1717 case NSPACEU:
1718 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1719 LOAD_UTF8_CHARCLASS_SPACE(),
1720 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1721 ! isSPACE_L1((U8) *s)
1722 );
6895a8aa 1723 break;
a0d0e21e 1724 case NSPACE:
4cadc6a9 1725 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1726 LOAD_UTF8_CHARCLASS_SPACE(),
1727 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1728 ! isSPACE((U8) *s)
4cadc6a9 1729 );
6895a8aa 1730 break;
cfaf538b 1731 case NSPACEA:
8e9da4d4
KW
1732 REXEC_FBC_CSCAN(
1733 !isSPACE_A(*s),
1734 !isSPACE_A(*s)
1735 );
1736 break;
bbce6d69 1737 case NSPACEL:
4cadc6a9 1738 REXEC_FBC_CSCAN_TAINT(
6bbba904 1739 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1740 !isSPACE_LC(*s)
1741 );
6895a8aa 1742 break;
a0d0e21e 1743 case DIGIT:
4cadc6a9 1744 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1745 LOAD_UTF8_CHARCLASS_DIGIT(),
1746 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1747 isDIGIT(*s)
1748 );
6895a8aa 1749 break;
cfaf538b 1750 case DIGITA:
8e9da4d4
KW
1751 /* Don't need to worry about utf8, as it can match only a single
1752 * byte invariant character */
cfaf538b 1753 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1754 break;
b8c5462f 1755 case DIGITL:
4cadc6a9
YO
1756 REXEC_FBC_CSCAN_TAINT(
1757 isDIGIT_LC_utf8((U8*)s),
1758 isDIGIT_LC(*s)
1759 );
6895a8aa 1760 break;
a0d0e21e 1761 case NDIGIT:
4cadc6a9 1762 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1763 LOAD_UTF8_CHARCLASS_DIGIT(),
1764 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1765 !isDIGIT(*s)
1766 );
6895a8aa 1767 break;
cfaf538b 1768 case NDIGITA:
8e9da4d4
KW
1769 REXEC_FBC_CSCAN(
1770 !isDIGIT_A(*s),
1771 !isDIGIT_A(*s)
1772 );
1773 break;
b8c5462f 1774 case NDIGITL:
4cadc6a9
YO
1775 REXEC_FBC_CSCAN_TAINT(
1776 !isDIGIT_LC_utf8((U8*)s),
1777 !isDIGIT_LC(*s)
1778 );
6895a8aa 1779 break;
e1d1eefb
YO
1780 case LNBREAK:
1781 REXEC_FBC_CSCAN(
1782 is_LNBREAK_utf8(s),
1783 is_LNBREAK_latin1(s)
1784 );
6895a8aa 1785 break;
e1d1eefb
YO
1786 case VERTWS:
1787 REXEC_FBC_CSCAN(
1788 is_VERTWS_utf8(s),
1789 is_VERTWS_latin1(s)
1790 );
6895a8aa 1791 break;
e1d1eefb
YO
1792 case NVERTWS:
1793 REXEC_FBC_CSCAN(
1794 !is_VERTWS_utf8(s),
1795 !is_VERTWS_latin1(s)
1796 );
6895a8aa 1797 break;
e1d1eefb
YO
1798 case HORIZWS:
1799 REXEC_FBC_CSCAN(
1800 is_HORIZWS_utf8(s),
1801 is_HORIZWS_latin1(s)
1802 );
6895a8aa 1803 break;
e1d1eefb
YO
1804 case NHORIZWS:
1805 REXEC_FBC_CSCAN(
1806 !is_HORIZWS_utf8(s),
1807 !is_HORIZWS_latin1(s)
1808 );
6895a8aa 1809 break;
1de06328
YO
1810 case AHOCORASICKC:
1811 case AHOCORASICK:
07be1b83 1812 {
a0a388a1 1813 DECL_TRIE_TYPE(c);
07be1b83
YO
1814 /* what trie are we using right now */
1815 reg_ac_data *aho
f8fc2ecf 1816 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1817 reg_trie_data *trie
1818 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1819 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1820
1821 const char *last_start = strend - trie->minlen;
6148ee25 1822#ifdef DEBUGGING
07be1b83 1823 const char *real_start = s;
6148ee25 1824#endif
07be1b83 1825 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1826 SV *sv_points;
1827 U8 **points; /* map of where we were in the input string
786e8c11 1828 when reading a given char. For ASCII this
be8e71aa 1829 is unnecessary overhead as the relationship
38a44b82
NC
1830 is always 1:1, but for Unicode, especially
1831 case folded Unicode this is not true. */
f9e705e8 1832 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1833 U8 *bitmap=NULL;
1834
07be1b83
YO
1835
1836 GET_RE_DEBUG_FLAGS_DECL;
1837
be8e71aa
YO
1838 /* We can't just allocate points here. We need to wrap it in
1839 * an SV so it gets freed properly if there is a croak while
1840 * running the match */
1841 ENTER;
1842 SAVETMPS;
1843 sv_points=newSV(maxlen * sizeof(U8 *));
1844 SvCUR_set(sv_points,
1845 maxlen * sizeof(U8 *));
1846 SvPOK_on(sv_points);
1847 sv_2mortal(sv_points);
1848 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1849 if ( trie_type != trie_utf8_fold
1850 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1851 {
786e8c11
YO
1852 if (trie->bitmap)
1853 bitmap=(U8*)trie->bitmap;
1854 else
1855 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1856 }
786e8c11
YO
1857 /* this is the Aho-Corasick algorithm modified a touch
1858 to include special handling for long "unknown char"
1859 sequences. The basic idea being that we use AC as long
1860 as we are dealing with a possible matching char, when
1861 we encounter an unknown char (and we have not encountered
1862 an accepting state) we scan forward until we find a legal
1863 starting char.
1864 AC matching is basically that of trie matching, except
1865 that when we encounter a failing transition, we fall back
1866 to the current states "fail state", and try the current char
1867 again, a process we repeat until we reach the root state,
1868 state 1, or a legal transition. If we fail on the root state
1869 then we can either terminate if we have reached an accepting
1870 state previously, or restart the entire process from the beginning
1871 if we have not.
1872
1873 */
07be1b83
YO
1874 while (s <= last_start) {
1875 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1876 U8 *uc = (U8*)s;
1877 U16 charid = 0;
1878 U32 base = 1;
1879 U32 state = 1;
1880 UV uvc = 0;
1881 STRLEN len = 0;
1882 STRLEN foldlen = 0;
1883 U8 *uscan = (U8*)NULL;
1884 U8 *leftmost = NULL;
786e8c11
YO
1885#ifdef DEBUGGING
1886 U32 accepted_word= 0;
1887#endif
07be1b83
YO
1888 U32 pointpos = 0;
1889
1890 while ( state && uc <= (U8*)strend ) {
1891 int failed=0;
786e8c11
YO
1892 U32 word = aho->states[ state ].wordnum;
1893
1de06328
YO
1894 if( state==1 ) {
1895 if ( bitmap ) {
1896 DEBUG_TRIE_EXECUTE_r(
1897 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1898 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1899 (char *)uc, utf8_target );
1de06328
YO
1900 PerlIO_printf( Perl_debug_log,
1901 " Scanning for legal start char...\n");
1902 }
d085b490
YO
1903 );
1904 if (utf8_target) {
1905 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 uc += UTF8SKIP(uc);
1907 }
1908 } else {
1909 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910 uc++;
1911 }
1912 }
1de06328 1913 s= (char *)uc;
786e8c11 1914 }
786e8c11
YO
1915 if (uc >(U8*)last_start) break;
1916 }
1917
1918 if ( word ) {
2e64971a 1919 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1920 if (!leftmost || lpos < leftmost) {
1921 DEBUG_r(accepted_word=word);
07be1b83 1922 leftmost= lpos;
786e8c11 1923 }
07be1b83 1924 if (base==0) break;
786e8c11 1925
07be1b83
YO
1926 }
1927 points[pointpos++ % maxlen]= uc;
55eed653
NC
1928 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1929 uscan, len, uvc, charid, foldlen,
1930 foldbuf, uniflags);
786e8c11
YO
1931 DEBUG_TRIE_EXECUTE_r({
1932 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1933 s, utf8_target );
07be1b83 1934 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1935 " Charid:%3u CP:%4"UVxf" ",
1936 charid, uvc);
1937 });
07be1b83
YO
1938
1939 do {
6148ee25 1940#ifdef DEBUGGING
786e8c11 1941 word = aho->states[ state ].wordnum;
6148ee25 1942#endif
07be1b83
YO
1943 base = aho->states[ state ].trans.base;
1944
786e8c11
YO
1945 DEBUG_TRIE_EXECUTE_r({
1946 if (failed)
1947 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1948 s, utf8_target );
07be1b83 1949 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1950 "%sState: %4"UVxf", word=%"UVxf,
1951 failed ? " Fail transition to " : "",
1952 (UV)state, (UV)word);
1953 });
07be1b83
YO
1954 if ( base ) {
1955 U32 tmp;
6dd2be57 1956 I32 offset;
07be1b83 1957 if (charid &&
6dd2be57
DM
1958 ( ((offset = base + charid
1959 - 1 - trie->uniquecharcount)) >= 0)
1960 && ((U32)offset < trie->lasttrans)
1961 && trie->trans[offset].check == state
1962 && (tmp=trie->trans[offset].next))
07be1b83 1963 {
786e8c11
YO
1964 DEBUG_TRIE_EXECUTE_r(
1965 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1966 state = tmp;
1967 break;
1968 }
1969 else {
786e8c11
YO
1970 DEBUG_TRIE_EXECUTE_r(
1971 PerlIO_printf( Perl_debug_log," - fail\n"));
1972 failed = 1;
1973 state = aho->fail[state];
07be1b83
YO
1974 }
1975 }
1976 else {
1977 /* we must be accepting here */
786e8c11
YO
1978 DEBUG_TRIE_EXECUTE_r(
1979 PerlIO_printf( Perl_debug_log," - accepting\n"));
1980 failed = 1;
07be1b83
YO
1981 break;
1982 }
1983 } while(state);
786e8c11 1984 uc += len;
07be1b83
YO
1985 if (failed) {
1986 if (leftmost)
1987 break;
786e8c11 1988 if (!state) state = 1;
07be1b83
YO
1989 }
1990 }
1991 if ( aho->states[ state ].wordnum ) {
2e64971a 1992 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
1993 if (!leftmost || lpos < leftmost) {
1994 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 1995 leftmost = lpos;
786e8c11 1996 }
07be1b83 1997 }
07be1b83
YO
1998 if (leftmost) {
1999 s = (char*)leftmost;
786e8c11
YO
2000 DEBUG_TRIE_EXECUTE_r({
2001 PerlIO_printf(
70685ca0
JH
2002 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2003 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2004 );
2005 });
24b23f37 2006 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2007 FREETMPS;
2008 LEAVE;
07be1b83 2009 goto got_it;
be8e71aa 2010 }
07be1b83 2011 s = HOPc(s,1);
786e8c11
YO
2012 DEBUG_TRIE_EXECUTE_r({
2013 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2014 });
07be1b83 2015 } else {
786e8c11
YO
2016 DEBUG_TRIE_EXECUTE_r(
2017 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2018 break;
2019 }
2020 }
be8e71aa
YO
2021 FREETMPS;
2022 LEAVE;
07be1b83
YO
2023 }
2024 break;
b3c9acc1 2025 default:
3c3eec57
GS
2026 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2027 break;
d6a28714 2028 }
6eb5f6b9
JH
2029 return 0;
2030 got_it:
2031 return s;
2032}
2033
fae667d5 2034
6eb5f6b9
JH
2035/*
2036 - regexec_flags - match a regexp against a string
2037 */
2038I32
288b8c02 2039Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2040 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2041/* strend: pointer to null at end of string */
2042/* strbeg: real beginning of string */
2043/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2044/* data: May be used for some additional optimizations.
2045 Currently its only used, with a U32 cast, for transmitting
2046 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2047/* nosave: For optimizations. */
2048{
97aff369 2049 dVAR;
288b8c02 2050 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2051 /*register*/ char *s;
6eb5f6b9 2052 register regnode *c;
24b23f37 2053 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2054 I32 minlen; /* must match at least this many chars */
2055 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2056 I32 end_shift = 0; /* Same for the end. */ /* CC */
2057 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2058 char *scream_olds = NULL;
f2ed9b32 2059 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2060 I32 multiline;
f8fc2ecf 2061 RXi_GET_DECL(prog,progi);
3b0527fe 2062 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2063 regexp_paren_pair *swap = NULL;
a3621e74
YO
2064 GET_RE_DEBUG_FLAGS_DECL;
2065
7918f24d 2066 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2067 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2068
2069 /* Be paranoid... */
2070 if (prog == NULL || startpos == NULL) {
2071 Perl_croak(aTHX_ "NULL regexp parameter");
2072 return 0;
2073 }
2074
bbe252da 2075 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2076 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2077
f2ed9b32 2078 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2079 DEBUG_EXECUTE_r(
f2ed9b32 2080 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2081 "Matching");
2082 );
bac06658 2083
6eb5f6b9 2084 minlen = prog->minlen;
1de06328
YO
2085
2086 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2088 "String too short [regexec_flags]...\n"));
2089 goto phooey;
1aa99e6b 2090 }
6eb5f6b9 2091
1de06328 2092
6eb5f6b9 2093 /* Check validity of program. */
f8fc2ecf 2094 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2095 Perl_croak(aTHX_ "corrupted regexp program");
2096 }
2097
2098 PL_reg_flags = 0;
2099 PL_reg_eval_set = 0;
2100 PL_reg_maxiter = 0;
2101
3c8556c3 2102 if (RX_UTF8(rx))
6eb5f6b9
JH
2103 PL_reg_flags |= RF_utf8;
2104
2105 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2106 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2107 PL_bostr = strbeg;
3b0527fe 2108 reginfo.sv = sv;
6eb5f6b9
JH
2109
2110 /* Mark end of line for $ (and such) */
2111 PL_regeol = strend;
2112
2113 /* see how far we have to get to not match where we matched before */
3b0527fe 2114 reginfo.till = startpos+minend;
6eb5f6b9 2115
6eb5f6b9
JH
2116 /* If there is a "must appear" string, look for it. */
2117 s = startpos;
2118
bbe252da 2119 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2120 MAGIC *mg;
2c296965 2121 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2122 reginfo.ganch = startpos + prog->gofs;
2c296965 2123 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2124 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2125 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2126 && SvMAGIC(sv)
14befaf4
DM
2127 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2128 && mg->mg_len >= 0) {
3b0527fe 2129 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2130 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2131 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2132
bbe252da 2133 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2134 if (s > reginfo.ganch)
6eb5f6b9 2135 goto phooey;
58e23c8d 2136 s = reginfo.ganch - prog->gofs;
2c296965 2137 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2138 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2139 if (s < strbeg)
2140 goto phooey;
6eb5f6b9
JH
2141 }
2142 }
58e23c8d 2143 else if (data) {
70685ca0 2144 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2145 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2146 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2147
2148 } else { /* pos() not defined */
3b0527fe 2149 reginfo.ganch = strbeg;
2c296965
YO
2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2151 "GPOS: reginfo.ganch = strbeg\n"));
2152 }
6eb5f6b9 2153 }
288b8c02 2154 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2155 /* We have to be careful. If the previous successful match
2156 was from this regex we don't want a subsequent partially
2157 successful match to clobber the old results.
2158 So when we detect this possibility we add a swap buffer
2159 to the re, and switch the buffer each match. If we fail
2160 we switch it back, otherwise we leave it swapped.
2161 */
2162 swap = prog->offs;
2163 /* do we need a save destructor here for eval dies? */
2164 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
c74340f9 2165 }
a0714e2c 2166 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2167 re_scream_pos_data d;
2168
2169 d.scream_olds = &scream_olds;
2170 d.scream_pos = &scream_pos;
288b8c02 2171 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2172 if (!s) {
a3621e74 2173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2174 goto phooey; /* not present */
3fa9c3d7 2175 }
6eb5f6b9
JH
2176 }
2177
1de06328 2178
6eb5f6b9
JH
2179
2180 /* Simplest case: anchored match need be tried only once. */
2181 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2182 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2183 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2184 goto got_it;
bbe252da
YO
2185 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2186 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2187 {
2188 char *end;
2189
2190 if (minlen)
2191 dontbother = minlen - 1;
1aa99e6b 2192 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2193 /* for multiline we only have to try after newlines */
33b8afdf 2194 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2195 /* because of the goto we can not easily reuse the macros for bifurcating the
2196 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2197 if (utf8_target) {
2198 if (s == startpos)
2199 goto after_try_utf8;
2200 while (1) {
2201 if (regtry(&reginfo, &s)) {
2202 goto got_it;
2203 }
2204 after_try_utf8:
2205 if (s > end) {
2206 goto phooey;
2207 }
2208 if (prog->extflags & RXf_USE_INTUIT) {
2209 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2210 if (!s) {
2211 goto phooey;
2212 }
2213 }
2214 else {
2215 s += UTF8SKIP(s);
2216 }
2217 }
2218 } /* end search for check string in unicode */
2219 else {
2220 if (s == startpos) {
2221 goto after_try_latin;
2222 }
2223 while (1) {
2224 if (regtry(&reginfo, &s)) {
2225 goto got_it;
2226 }
2227 after_try_latin:
2228 if (s > end) {
2229 goto phooey;
2230 }
2231 if (prog->extflags & RXf_USE_INTUIT) {
2232 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2233 if (!s) {
2234 goto phooey;
2235 }
2236 }
2237 else {
2238 s++;
2239 }
2240 }
2241 } /* end search for check string in latin*/
2242 } /* end search for check string */
2243 else { /* search for newline */
2244 if (s > startpos) {
2245 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2246 s--;
92f3d482 2247 }
21eede78
YO
2248 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2249 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2250 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2251 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2252 goto got_it;
2253 }
92f3d482
YO
2254 }
2255 } /* end search for newline */
2256 } /* end anchored/multiline check string search */
6eb5f6b9 2257 goto phooey;
bbe252da 2258 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2259 {
486ec47a 2260 /* the warning about reginfo.ganch being used without initialization
bbe252da 2261 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2262 and we only enter this block when the same bit is set. */
58e23c8d 2263 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2264
2265 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2266 goto got_it;
2267 goto phooey;
2268 }
2269
2270 /* Messy cases: unanchored match. */
bbe252da 2271 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2272 /* we have /x+whatever/ */
f2ed9b32 2273 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2274 char ch;
bf93d4cc
GS
2275#ifdef DEBUGGING
2276 int did_match = 0;
2277#endif
f2ed9b32
KW
2278 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2279 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2280 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2281
f2ed9b32 2282 if (utf8_target) {
4cadc6a9 2283 REXEC_FBC_SCAN(
6eb5f6b9 2284 if (*s == ch) {
a3621e74 2285 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2286 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2287 s += UTF8SKIP(s);
2288 while (s < strend && *s == ch)
2289 s += UTF8SKIP(s);
2290 }
4cadc6a9 2291 );
6eb5f6b9
JH
2292 }
2293 else {
4cadc6a9 2294 REXEC_FBC_SCAN(
6eb5f6b9 2295 if (*s == ch) {
a3621e74 2296 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2297 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2298 s++;
2299 while (s < strend && *s == ch)
2300 s++;
2301 }
4cadc6a9 2302 );
6eb5f6b9 2303 }
a3621e74 2304 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2305 PerlIO_printf(Perl_debug_log,
b7953727
JH
2306 "Did not find anchored character...\n")
2307 );
6eb5f6b9 2308 }
a0714e2c
SS
2309 else if (prog->anchored_substr != NULL
2310 || prog->anchored_utf8 != NULL
2311 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2312 && prog->float_max_offset < strend - s)) {
2313 SV *must;
2314 I32 back_max;
2315 I32 back_min;
2316 char *last;
6eb5f6b9 2317 char *last1; /* Last position checked before */
bf93d4cc
GS
2318#ifdef DEBUGGING
2319 int did_match = 0;
2320#endif
33b8afdf 2321 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2322 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2323 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2324 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2325 back_max = back_min = prog->anchored_offset;
2326 } else {
f2ed9b32
KW
2327 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2328 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2329 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2330 back_max = prog->float_max_offset;
2331 back_min = prog->float_min_offset;
2332 }
1de06328
YO
2333
2334
33b8afdf
JH
2335 if (must == &PL_sv_undef)
2336 /* could not downgrade utf8 check substring, so must fail */
2337 goto phooey;
2338
1de06328
YO
2339 if (back_min<0) {
2340 last = strend;
2341 } else {
2342 last = HOP3c(strend, /* Cannot start after this */
2343 -(I32)(CHR_SVLEN(must)
2344 - (SvTAIL(must) != 0) + back_min), strbeg);
2345 }
6eb5f6b9
JH
2346 if (s > PL_bostr)
2347 last1 = HOPc(s, -1);
2348 else
2349 last1 = s - 1; /* bogus */
2350
a0288114 2351 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2352 check_substr==must. */
2353 scream_pos = -1;
2354 dontbother = end_shift;
2355 strend = HOPc(strend, -dontbother);
2356 while ( (s <= last) &&
f1905e1b 2357 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
1de06328 2358 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
6eb5f6b9 2359 end_shift, &scream_pos, 0))
1de06328 2360 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2361 (unsigned char*)strend, must,
7fba1cd6 2362 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b 2363 /* we may be pointing at the wrong string */
07bc277f 2364 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
3f7c398e 2365 s = strbeg + (s - SvPVX_const(sv));
a3621e74 2366 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2367 if (HOPc(s, -back_max) > last1) {
2368 last1 = HOPc(s, -back_min);
2369 s = HOPc(s, -back_max);
2370 }
2371 else {
52657f30 2372 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2373
2374 last1 = HOPc(s, -back_min);
52657f30 2375 s = t;
6eb5f6b9 2376 }
f2ed9b32 2377 if (utf8_target) {
6eb5f6b9 2378 while (s <= last1) {
24b23f37 2379 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2380 goto got_it;
2381 s += UTF8SKIP(s);
2382 }
2383 }
2384 else {
2385 while (s <= last1) {
24b23f37 2386 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2387 goto got_it;
2388 s++;
2389 }
2390 }
2391 }
ab3bbdeb 2392 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2393 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2394 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2395 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2396 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2397 ? "anchored" : "floating"),
ab3bbdeb
YO
2398 quoted, RE_SV_TAIL(must));
2399 });
6eb5f6b9
JH
2400 goto phooey;
2401 }
f8fc2ecf 2402 else if ( (c = progi->regstclass) ) {
f14c76ed 2403 if (minlen) {
f8fc2ecf 2404 const OPCODE op = OP(progi->regstclass);
66e933ab 2405 /* don't bother with what can't match */
786e8c11 2406 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2407 strend = HOPc(strend, -(minlen - 1));
2408 }
a3621e74 2409 DEBUG_EXECUTE_r({
be8e71aa 2410 SV * const prop = sv_newmortal();
32fc9b6a 2411 regprop(prog, prop, c);
0df25f3d 2412 {
f2ed9b32 2413 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2414 s,strend-s,60);
0df25f3d 2415 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2416 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2417 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2418 quoted, (int)(strend - s));
0df25f3d 2419 }
ffc61ed2 2420 });
3b0527fe 2421 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2422 goto got_it;
07be1b83 2423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2424 }
2425 else {
2426 dontbother = 0;
a0714e2c 2427 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2428 /* Trim the end. */
d6a28714 2429 char *last;
33b8afdf
JH
2430 SV* float_real;
2431
f2ed9b32
KW
2432 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2433 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2434 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2435
f1905e1b 2436 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
33b8afdf 2437 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
2438 end_shift, &scream_pos, 1); /* last one */
2439 if (!last)
ffc61ed2 2440 last = scream_olds; /* Only one occurrence. */
4addbd3b 2441 /* we may be pointing at the wrong string */
07bc277f 2442 else if (RXp_MATCH_COPIED(prog))
3f7c398e 2443 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 2444 }
d6a28714
JH
2445 else {
2446 STRLEN len;
cfd0369c 2447 const char * const little = SvPV_const(float_real, len);
d6a28714 2448
33b8afdf 2449 if (SvTAIL(float_real)) {
d6a28714
JH
2450 if (memEQ(strend - len + 1, little, len - 1))
2451 last = strend - len + 1;
7fba1cd6 2452 else if (!multiline)
9041c2e3 2453 last = memEQ(strend - len, little, len)
bd61b366 2454 ? strend - len : NULL;
b8c5462f 2455 else
d6a28714
JH
2456 goto find_last;
2457 } else {
2458 find_last:
9041c2e3 2459 if (len)
d6a28714 2460 last = rninstr(s, strend, little, little + len);
b8c5462f 2461 else
a0288114 2462 last = strend; /* matching "$" */
b8c5462f 2463 }
b8c5462f 2464 }
bf93d4cc 2465 if (last == NULL) {
6bda09f9
YO
2466 DEBUG_EXECUTE_r(
2467 PerlIO_printf(Perl_debug_log,
2468 "%sCan't trim the tail, match fails (should not happen)%s\n",
2469 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
2470 goto phooey; /* Should not happen! */
2471 }
d6a28714
JH
2472 dontbother = strend - last + prog->float_min_offset;
2473 }
2474 if (minlen && (dontbother < minlen))
2475 dontbother = minlen - 1;
2476 strend -= dontbother; /* this one's always in bytes! */
2477 /* We don't know much -- general case. */
f2ed9b32 2478 if (utf8_target) {
d6a28714 2479 for (;;) {
24b23f37 2480 if (regtry(&reginfo, &s))
d6a28714
JH
2481 goto got_it;
2482 if (s >= strend)
2483 break;
b8c5462f 2484 s += UTF8SKIP(s);
d6a28714
JH
2485 };
2486 }
2487 else {
2488 do {
24b23f37 2489 if (regtry(&reginfo, &s))
d6a28714
JH
2490 goto got_it;
2491 } while (s++ < strend);
2492 }
2493 }
2494
2495 /* Failure. */
2496 goto phooey;
2497
2498got_it:
e9105d30 2499 Safefree(swap);
288b8c02 2500 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2501
19b95bf0 2502 if (PL_reg_eval_set)
4f639d21 2503 restore_pos(aTHX_ prog);
5daac39c
NC
2504 if (RXp_PAREN_NAMES(prog))
2505 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2506
2507 /* make sure $`, $&, $', and $digit will work later */
2508 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2509 RX_MATCH_COPY_FREE(rx);
d6a28714 2510 if (flags & REXEC_COPY_STR) {
be8e71aa 2511 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2512#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2513 if ((SvIsCOW(sv)
2514 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2515 if (DEBUG_C_TEST) {
2516 PerlIO_printf(Perl_debug_log,
2517 "Copy on write: regexp capture, type %d\n",
2518 (int) SvTYPE(sv));
2519 }
2520 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2521 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2522 assert (SvPOKp(prog->saved_copy));
2523 } else
2524#endif
2525 {
288b8c02 2526 RX_MATCH_COPIED_on(rx);
ed252734
NC
2527 s = savepvn(strbeg, i);
2528 prog->subbeg = s;
2529 }
d6a28714 2530 prog->sublen = i;
d6a28714
JH
2531 }
2532 else {
2533 prog->subbeg = strbeg;
2534 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2535 }
2536 }
9041c2e3 2537
d6a28714
JH
2538 return 1;
2539
2540phooey:
a3621e74 2541 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2542 PL_colors[4], PL_colors[5]));
d6a28714 2543 if (PL_reg_eval_set)
4f639d21 2544 restore_pos(aTHX_ prog);
e9105d30 2545 if (swap) {
c74340f9 2546 /* we failed :-( roll it back */
e9105d30
GG
2547 Safefree(prog->offs);
2548 prog->offs = swap;
2549 }
2550
d6a28714
JH
2551 return 0;
2552}
2553
6bda09f9 2554
d6a28714
JH
2555/*
2556 - regtry - try match at specific point
2557 */
2558STATIC I32 /* 0 failure, 1 success */
24b23f37 2559S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2560{
97aff369 2561 dVAR;
d6a28714 2562 CHECKPOINT lastcp;
288b8c02
NC
2563 REGEXP *const rx = reginfo->prog;
2564 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2565 RXi_GET_DECL(prog,progi);
a3621e74 2566 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2567
2568 PERL_ARGS_ASSERT_REGTRY;
2569
24b23f37 2570 reginfo->cutpoint=NULL;
d6a28714 2571
bbe252da 2572 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
d6a28714
JH
2573 MAGIC *mg;
2574
2575 PL_reg_eval_set = RS_init;
a3621e74 2576 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2577 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2578 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2579 ));
ea8d6ae1 2580 SAVESTACK_CXPOS();
d6a28714
JH
2581 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2582 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2583 SAVETMPS;
2584 /* Apparently this is not needed, judging by wantarray. */
e8347627 2585 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2586 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2587
3b0527fe 2588 if (reginfo->sv) {
d6a28714 2589 /* Make $_ available to executed code. */
3b0527fe 2590 if (reginfo->sv != DEFSV) {
59f00321 2591 SAVE_DEFSV;
414bf5ae 2592 DEFSV_set(reginfo->sv);
b8c5462f 2593 }
d6a28714 2594
3b0527fe
DM
2595 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2596 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2597 /* prepare for quick setting of pos */
d300d9fa 2598#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2599 if (SvIsCOW(reginfo->sv))
2600 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2601#endif
3dab1dad 2602 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2603 &PL_vtbl_mglob, NULL, 0);
d6a28714 2604 mg->mg_len = -1;
b8c5462f 2605 }
d6a28714
JH
2606 PL_reg_magic = mg;
2607 PL_reg_oldpos = mg->mg_len;
4f639d21 2608 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2609 }
09687e5a 2610 if (!PL_reg_curpm) {
a02a5408 2611 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2612#ifdef USE_ITHREADS
2613 {
14a49a24 2614 SV* const repointer = &PL_sv_undef;
92313705
NC
2615 /* this regexp is also owned by the new PL_reg_curpm, which
2616 will try to free it. */
d2ece331 2617 av_push(PL_regex_padav, repointer);
09687e5a
AB
2618 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2619 PL_regex_pad = AvARRAY(PL_regex_padav);
2620 }
2621#endif
2622 }
86c29d75
NC
2623#ifdef USE_ITHREADS
2624 /* It seems that non-ithreads works both with and without this code.
2625 So for efficiency reasons it seems best not to have the code
2626 compiled when it is not needed. */
92313705
NC
2627 /* This is safe against NULLs: */
2628 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2629 /* PM_reg_curpm owns a reference to this regexp. */
cd733337 2630 (void)ReREFCNT_inc(rx);
86c29d75 2631#endif
288b8c02 2632 PM_SETRE(PL_reg_curpm, rx);
d6a28714
JH
2633 PL_reg_oldcurpm = PL_curpm;
2634 PL_curpm = PL_reg_curpm;
07bc277f 2635 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2636 /* Here is a serious problem: we cannot rewrite subbeg,
2637 since it may be needed if this match fails. Thus
2638 $` inside (?{}) could fail... */
2639 PL_reg_oldsaved = prog->subbeg;
2640 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2641#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2642 PL_nrs = prog->saved_copy;
2643#endif
07bc277f 2644 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2645 }
2646 else
bd61b366 2647 PL_reg_oldsaved = NULL;
d6a28714
JH
2648 prog->subbeg = PL_bostr;
2649 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2650 }
24b23f37 2651 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
f0ab9afb 2652 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2653 PL_reginput = *startpos;
d6a28714 2654 PL_reglastparen = &prog->lastparen;
a01268b5 2655 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2656 prog->lastparen = 0;
03994de8 2657 prog->lastcloseparen = 0;
d6a28714 2658 PL_regsize = 0;
f0ab9afb 2659 PL_regoffs = prog->offs;
d6a28714
JH
2660 if (PL_reg_start_tmpl <= prog->nparens) {
2661 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2662 if(PL_reg_start_tmp)
2663 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2664 else
a02a5408 2665 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2666 }
2667
2668 /* XXXX What this code is doing here?!!! There should be no need
2669 to do this again and again, PL_reglastparen should take care of
3dd2943c 2670 this! --ilya*/
dafc8851
JH
2671
2672 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2673 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116 2674 * PL_reglastparen), is not needed at all by the test suite
225593e1
DM
2675 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2676 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2677 * Meanwhile, this code *is* needed for the
daf18116
JH
2678 * above-mentioned test suite tests to succeed. The common theme
2679 * on those tests seems to be returning null fields from matches.
225593e1 2680 * --jhi updated by dapm */
dafc8851 2681#if 1
d6a28714 2682 if (prog->nparens) {
f0ab9afb 2683 regexp_paren_pair *pp = PL_regoffs;
097eb12c 2684 register I32 i;
eb160463 2685 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
f0ab9afb
NC
2686 ++pp;
2687 pp->start = -1;
2688 pp->end = -1;
d6a28714
JH
2689 }
2690 }
dafc8851 2691#endif
02db2b7b 2692 REGCP_SET(lastcp);
f8fc2ecf 2693 if (regmatch(reginfo, progi->program + 1)) {
f0ab9afb 2694 PL_regoffs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2695 return 1;
2696 }
24b23f37
YO
2697 if (reginfo->cutpoint)
2698 *startpos= reginfo->cutpoint;
02db2b7b 2699 REGCP_UNWIND(lastcp);
d6a28714
JH
2700 return 0;
2701}
2702
02db2b7b 2703
8ba1375e
MJD
2704#define sayYES goto yes
2705#define sayNO goto no
262b90c4 2706#define sayNO_SILENT goto no_silent
8ba1375e 2707
f9f4320a
YO
2708/* we dont use STMT_START/END here because it leads to
2709 "unreachable code" warnings, which are bogus, but distracting. */
2710#define CACHEsayNO \
c476f425
DM
2711 if (ST.cache_mask) \
2712 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2713 sayNO
3298f257 2714
a3621e74 2715/* this is used to determine how far from the left messages like
265c4333
YO
2716 'failed...' are printed. It should be set such that messages
2717 are inline with the regop output that created them.
a3621e74 2718*/
265c4333 2719#define REPORT_CODE_OFF 32
a3621e74
YO
2720
2721
40a82448
DM
2722#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2723#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2724
86545054
DM
2725#define SLAB_FIRST(s) (&(s)->states[0])
2726#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2727
5d9a96ca
DM
2728/* grab a new slab and return the first slot in it */
2729
2730STATIC regmatch_state *
2731S_push_slab(pTHX)
2732{
a35a87e7 2733#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2734 dMY_CXT;
2735#endif
5d9a96ca
DM
2736 regmatch_slab *s = PL_regmatch_slab->next;
2737 if (!s) {
2738 Newx(s, 1, regmatch_slab);
2739 s->prev = PL_regmatch_slab;
2740 s->next = NULL;
2741 PL_regmatch_slab->next = s;
2742 }
2743 PL_regmatch_slab = s;
86545054 2744 return SLAB_FIRST(s);
5d9a96ca 2745}
5b47454d 2746
95b24440 2747
40a82448
DM
2748/* push a new state then goto it */
2749
2750#define PUSH_STATE_GOTO(state, node) \
2751 scan = node; \
2752 st->resume_state = state; \
2753 goto push_state;
2754
2755/* push a new state with success backtracking, then goto it */
2756
2757#define PUSH_YES_STATE_GOTO(state, node) \
2758 scan = node; \
2759 st->resume_state = state; \
2760 goto push_yes_state;
2761
aa283a38 2762
aa283a38 2763
d6a28714 2764/*
95b24440 2765
bf1f174e
DM
2766regmatch() - main matching routine
2767
2768This is basically one big switch statement in a loop. We execute an op,
2769set 'next' to point the next op, and continue. If we come to a point which
2770we may need to backtrack to on failure such as (A|B|C), we push a
2771backtrack state onto the backtrack stack. On failure, we pop the top
2772state, and re-enter the loop at the state indicated. If there are no more
2773states to pop, we return failure.
2774
2775Sometimes we also need to backtrack on success; for example /A+/, where
2776after successfully matching one A, we need to go back and try to
2777match another one; similarly for lookahead assertions: if the assertion
2778completes successfully, we backtrack to the state just before the assertion
2779and then carry on. In these cases, the pushed state is marked as
2780'backtrack on success too'. This marking is in fact done by a chain of
2781pointers, each pointing to the previous 'yes' state. On success, we pop to
2782the nearest yes state, discarding any intermediate failure-only states.
2783Sometimes a yes state is pushed just to force some cleanup code to be
2784called at the end of a successful match or submatch; e.g. (??{$re}) uses
2785it to free the inner regex.
2786
2787Note that failure backtracking rewinds the cursor position, while
2788success backtracking leaves it alone.
2789
2790A pattern is complete when the END op is executed, while a subpattern
2791such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2792ops trigger the "pop to last yes state if any, otherwise return true"
2793behaviour.
2794
2795A common convention in this function is to use A and B to refer to the two
2796subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2797the subpattern to be matched possibly multiple times, while B is the entire
2798rest of the pattern. Variable and state names reflect this convention.
2799
2800The states in the main switch are the union of ops and failure/success of
2801substates associated with with that op. For example, IFMATCH is the op
2802that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2803'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2804successfully matched A and IFMATCH_A_fail is a state saying that we have
2805just failed to match A. Resume states always come in pairs. The backtrack
2806state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2807at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2808on success or failure.
2809
2810The struct that holds a backtracking state is actually a big union, with
2811one variant for each major type of op. The variable st points to the
2812top-most backtrack struct. To make the code clearer, within each
2813block of code we #define ST to alias the relevant union.
2814
2815Here's a concrete example of a (vastly oversimplified) IFMATCH
2816implementation:
2817
2818 switch (state) {
2819 ....
2820
2821#define ST st->u.ifmatch
2822
2823 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2824 ST.foo = ...; // some state we wish to save
95b24440 2825 ...
bf1f174e
DM
2826 // push a yes backtrack state with a resume value of
2827 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2828 // first node of A:
2829 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2830 // NOTREACHED
2831
2832 case IFMATCH_A: // we have successfully executed A; now continue with B
2833 next = B;
2834 bar = ST.foo; // do something with the preserved value
2835 break;
2836
2837 case IFMATCH_A_fail: // A failed, so the assertion failed
2838 ...; // do some housekeeping, then ...
2839 sayNO; // propagate the failure
2840
2841#undef ST
95b24440 2842
bf1f174e
DM
2843 ...
2844 }
95b24440 2845
bf1f174e
DM
2846For any old-timers reading this who are familiar with the old recursive
2847approach, the code above is equivalent to:
95b24440 2848
bf1f174e
DM
2849 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2850 {
2851 int foo = ...
95b24440 2852 ...
bf1f174e
DM
2853 if (regmatch(A)) {
2854 next = B;
2855 bar = foo;
2856 break;
95b24440 2857 }
bf1f174e
DM
2858 ...; // do some housekeeping, then ...
2859 sayNO; // propagate the failure
95b24440 2860 }
bf1f174e
DM
2861
2862The topmost backtrack state, pointed to by st, is usually free. If you
2863want to claim it, populate any ST.foo fields in it with values you wish to
2864save, then do one of
2865
2866 PUSH_STATE_GOTO(resume_state, node);
2867 PUSH_YES_STATE_GOTO(resume_state, node);
2868
2869which sets that backtrack state's resume value to 'resume_state', pushes a
2870new free entry to the top of the backtrack stack, then goes to 'node'.
2871On backtracking, the free slot is popped, and the saved state becomes the
2872new free state. An ST.foo field in this new top state can be temporarily
2873accessed to retrieve values, but once the main loop is re-entered, it
2874becomes available for reuse.
2875
2876Note that the depth of the backtrack stack constantly increases during the
2877left-to-right execution of the pattern, rather than going up and down with
2878the pattern nesting. For example the stack is at its maximum at Z at the
2879end of the pattern, rather than at X in the following:
2880
2881 /(((X)+)+)+....(Y)+....Z/
2882
2883The only exceptions to this are lookahead/behind assertions and the cut,
2884(?>A), which pop all the backtrack states associated with A before
2885continuing.
2886
486ec47a 2887Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2888PL_regmatch_state and st always point to the currently active state,
2889and PL_regmatch_slab points to the slab currently containing
2890PL_regmatch_state. The first time regmatch() is called, the first slab is
2891allocated, and is never freed until interpreter destruction. When the slab
2892is full, a new one is allocated and chained to the end. At exit from
2893regmatch(), slabs allocated since entry are freed.
2894
2895*/
95b24440 2896
40a82448 2897
5bc10b2c 2898#define DEBUG_STATE_pp(pp) \
265c4333 2899 DEBUG_STATE_r({ \
f2ed9b32 2900 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2901 PerlIO_printf(Perl_debug_log, \
5d458dd8 2902 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2903 depth*2, "", \
13d6edb4 2904 PL_reg_name[st->resume_state], \
5d458dd8
YO
2905 ((st==yes_state||st==mark_state) ? "[" : ""), \
2906 ((st==yes_state) ? "Y" : ""), \
2907 ((st==mark_state) ? "M" : ""), \
2908 ((st==yes_state||st==mark_state) ? "]" : "") \
2909 ); \
265c4333 2910 });
5bc10b2c 2911
40a82448 2912
3dab1dad 2913#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2914
3df15adc 2915#ifdef DEBUGGING
5bc10b2c 2916
ab3bbdeb 2917STATIC void
f2ed9b32 2918S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2919 const char *start, const char *end, const char *blurb)
2920{
efd26800 2921 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2922
2923 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2924
ab3bbdeb
YO
2925 if (!PL_colorset)
2926 reginitcolors();
2927 {
2928 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2929 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2930
f2ed9b32 2931 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2932 start, end - start, 60);
2933
2934 PerlIO_printf(Perl_debug_log,
2935 "%s%s REx%s %s against %s\n",
2936 PL_colors[4], blurb, PL_colors[5], s0, s1);
2937
f2ed9b32 2938 if (utf8_target||utf8_pat)
1de06328
YO
2939 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2940 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2941 utf8_pat && utf8_target ? " and " : "",
2942 utf8_target ? "string" : ""
ab3bbdeb
YO
2943 );
2944 }
2945}
3df15adc
YO
2946
2947STATIC void
786e8c11
YO
2948S_dump_exec_pos(pTHX_ const char *locinput,
2949 const regnode *scan,
2950 const char *loc_regeol,
2951 const char *loc_bostr,
2952 const char *loc_reg_starttry,
f2ed9b32 2953 const bool utf8_target)
07be1b83 2954{
786e8c11 2955 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2956 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2957 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
2958 /* The part of the string before starttry has one color
2959 (pref0_len chars), between starttry and current
2960 position another one (pref_len - pref0_len chars),
2961 after the current position the third one.
2962 We assume that pref0_len <= pref_len, otherwise we
2963 decrease pref0_len. */
786e8c11
YO
2964 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2965 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
2966 int pref0_len;
2967
7918f24d
NC
2968 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2969
f2ed9b32 2970 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 2971 pref_len++;
786e8c11
YO
2972 pref0_len = pref_len - (locinput - loc_reg_starttry);
2973 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2974 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2975 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 2976 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
2977 l--;
2978 if (pref0_len < 0)
2979 pref0_len = 0;
2980 if (pref0_len > pref_len)
2981 pref0_len = pref_len;
2982 {
f2ed9b32 2983 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2984
ab3bbdeb 2985 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 2986 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2987
ab3bbdeb 2988 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2989 (locinput - pref_len + pref0_len),
1de06328 2990 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2991
ab3bbdeb 2992 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 2993 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 2994
1de06328 2995 const STRLEN tlen=len0+len1+len2;
3df15adc 2996 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2997 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 2998 (IV)(locinput - loc_bostr),
07be1b83 2999 len0, s0,
07be1b83 3000 len1, s1,
07be1b83 3001 (docolor ? "" : "> <"),
07be1b83 3002 len2, s2,
f9f4320a 3003 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3004 "");
3005 }
3006}
3df15adc 3007
07be1b83
YO
3008#endif
3009
0a4db386
YO
3010/* reg_check_named_buff_matched()
3011 * Checks to see if a named buffer has matched. The data array of
3012 * buffer numbers corresponding to the buffer is expected to reside
3013 * in the regexp->data->data array in the slot stored in the ARG() of
3014 * node involved. Note that this routine doesn't actually care about the
3015 * name, that information is not preserved from compilation to execution.
3016 * Returns the index of the leftmost defined buffer with the given name
3017 * or 0 if non of the buffers matched.
3018 */
3019STATIC I32
7918f24d
NC
3020S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3021{
0a4db386 3022 I32 n;
f8fc2ecf 3023 RXi_GET_DECL(rex,rexi);
ad64d0ec 3024 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3025 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3026
3027 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3028
0a4db386
YO
3029 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3030 if ((I32)*PL_reglastparen >= nums[n] &&
f0ab9afb 3031 PL_regoffs[nums[n]].end != -1)
0a4db386
YO
3032 {
3033 return nums[n];
3034 }
3035 }
3036 return 0;
3037}
3038
2f554ef7
DM
3039
3040/* free all slabs above current one - called during LEAVE_SCOPE */
3041
3042STATIC void
3043S_clear_backtrack_stack(pTHX_ void *p)
3044{
3045 regmatch_slab *s = PL_regmatch_slab->next;
3046 PERL_UNUSED_ARG(p);
3047
3048 if (!s)
3049 return;
3050 PL_regmatch_slab->next = NULL;
3051 while (s) {
3052 regmatch_slab * const osl = s;
3053 s = s->next;
3054 Safefree(osl);
3055 }
3056}
3057
3058
28d8d7f4
YO
3059#define SETREX(Re1,Re2) \
3060 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3061 Re1 = (Re2)
3062
d6a28714 3063STATIC I32 /* 0 failure, 1 success */
24b23f37 3064S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3065{
a35a87e7 3066#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3067 dMY_CXT;
3068#endif
27da23d5 3069 dVAR;
f2ed9b32 3070 register const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3071 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3072 REGEXP *rex_sv = reginfo->prog;
3073 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3074 RXi_GET_DECL(rex,rexi);
2f554ef7 3075 I32 oldsave;
5d9a96ca
DM
3076 /* the current state. This is a cached copy of PL_regmatch_state */
3077 register regmatch_state *st;
5d9a96ca
DM
3078 /* cache heavy used fields of st in registers */
3079 register regnode *scan;
3080 register regnode *next;
438e9bae 3081 register U32 n = 0; /* general value; init to avoid compiler warning */
24d3c4a9 3082 register I32 ln = 0; /* len or last; init to avoid compiler warning */
5d9a96ca 3083 register char *locinput = PL_reginput;
5d9a96ca 3084 register I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3085
b69b0499 3086 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3087 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3088 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3089 const U32 max_nochange_depth =
3090 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3091 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3092 regmatch_state *yes_state = NULL; /* state to pop to on success of
3093 subpattern */
e2e6a0f1
YO
3094 /* mark_state piggy backs on the yes_state logic so that when we unwind
3095 the stack on success we can update the mark_state as we go */
3096 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3097 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3098 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3099 U32 state_num;
5d458dd8
YO
3100 bool no_final = 0; /* prevent failure from backtracking? */
3101 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3102 char *startpoint = PL_reginput;
5d458dd8
YO
3103 SV *popmark = NULL; /* are we looking for a mark? */
3104 SV *sv_commit = NULL; /* last mark name seen in failure */
3105 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3106 during a successful match */
5d458dd8
YO
3107 U32 lastopen = 0; /* last open we saw */
3108 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3109 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3110 /* these three flags are set by various ops to signal information to
3111 * the very next op. They have a useful lifetime of exactly one loop
3112 * iteration, and are not preserved or restored by state pushes/pops
3113 */
3114 bool sw = 0; /* the condition value in (?(cond)a|b) */
3115 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3116 int logical = 0; /* the following EVAL is:
3117 0: (?{...})
3118 1: (?(?{...})X|Y)
3119 2: (??{...})
3120 or the following IFMATCH/UNLESSM is:
3121 false: plain (?=foo)
3122 true: used as a condition: (?(?=foo))
3123 */
95b24440 3124#ifdef DEBUGGING
e68ec53f 3125 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3126#endif
3127
7918f24d
NC
3128 PERL_ARGS_ASSERT_REGMATCH;
3129
3b57cd43 3130 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3131 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3132 }));
5d9a96ca
DM
3133 /* on first ever call to regmatch, allocate first slab */
3134 if (!PL_regmatch_slab) {
3135 Newx(PL_regmatch_slab, 1, regmatch_slab);
3136 PL_regmatch_slab->prev = NULL;
3137 PL_regmatch_slab->next = NULL;
86545054 3138 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3139 }
3140
2f554ef7
DM
3141 oldsave = PL_savestack_ix;
3142 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3143 SAVEVPTR(PL_regmatch_slab);
3144 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3145
3146 /* grab next free state slot */
3147 st = ++PL_regmatch_state;
86545054 3148 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3149 st = PL_regmatch_state = S_push_slab(aTHX);
3150
d6a28714
JH
3151 /* Note that nextchr is a byte even in UTF */
3152 nextchr = UCHARAT(locinput);
3153 scan = prog;
3154 while (scan != NULL) {
8ba1375e 3155
a3621e74 3156 DEBUG_EXECUTE_r( {
6136c704 3157 SV * const prop = sv_newmortal();
1de06328 3158 regnode *rnext=regnext(scan);
f2ed9b32 3159 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3160 regprop(rex, prop, scan);
07be1b83
YO
3161
3162 PerlIO_printf(Perl_debug_log,
3163 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3164 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3165 SvPVX_const(prop),
1de06328 3166 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3167 0 : (IV)(rnext - rexi->program));
2a782b5b 3168 });
d6a28714
JH
3169
3170 next = scan + NEXT_OFF(scan);
3171 if (next == scan)
3172 next = NULL;
40a82448 3173 state_num = OP(scan);
d6a28714 3174
40a82448 3175 reenter_switch:
34a81e2b
B
3176
3177 assert(PL_reglastparen == &rex->lastparen);
3178 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3179 assert(PL_regoffs == rex->offs);
3180
40a82448 3181 switch (state_num) {
d6a28714 3182 case BOL:
7fba1cd6 3183 if (locinput == PL_bostr)
d6a28714 3184 {
3b0527fe 3185 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3186 break;
3187 }
d6a28714
JH
3188 sayNO;
3189 case MBOL:
12d33761
HS
3190 if (locinput == PL_bostr ||
3191 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3192 {
b8c5462f
JH
3193 break;
3194 }
d6a28714
JH
3195 sayNO;
3196 case SBOL:
c2a73568 3197 if (locinput == PL_bostr)
b8c5462f 3198 break;
d6a28714
JH
3199 sayNO;
3200 case GPOS:
3b0527fe 3201 if (locinput == reginfo->ganch)
d6a28714
JH
3202 break;
3203 sayNO;
ee9b8eae
YO
3204
3205 case KEEPS:
3206 /* update the startpoint */
f0ab9afb 3207 st->u.keeper.val = PL_regoffs[0].start;
ee9b8eae 3208 PL_reginput = locinput;
f0ab9afb 3209 PL_regoffs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3210 PUSH_STATE_GOTO(KEEPS_next, next);
3211 /*NOT-REACHED*/
3212 case KEEPS_next_fail:
3213 /* rollback the start point change */
f0ab9afb 3214 PL_regoffs[0].start = st->u.keeper.val;
ee9b8eae
YO
3215 sayNO_SILENT;
3216 /*NOT-REACHED*/
d6a28714 3217 case EOL:
d6a28714
JH
3218 goto seol;
3219 case MEOL:
d6a28714 3220 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3221 sayNO;
b8c5462f 3222 break;
d6a28714
JH
3223 case SEOL:
3224 seol:
3225 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3226 sayNO;
d6a28714 3227 if (PL_regeol - locinput > 1)
b8c5462f 3228 sayNO;
b8c5462f 3229 break;
d6a28714
JH
3230 case EOS:
3231 if (PL_regeol != locinput)
b8c5462f 3232 sayNO;
d6a28714 3233 break;
ffc61ed2 3234 case SANY:
d6a28714 3235 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3236 sayNO;
f2ed9b32 3237 if (utf8_target) {
f33976b4
DB
3238 locinput += PL_utf8skip[nextchr];
3239 if (locinput > PL_regeol)
3240 sayNO;
3241 nextchr = UCHARAT(locinput);
3242 }
3243 else
3244 nextchr = UCHARAT(++locinput);
3245 break;
3246 case CANY:
3247 if (!nextchr && locinput >= PL_regeol)
3248 sayNO;
b8c5462f 3249 nextchr = UCHARAT(++locinput);
a0d0e21e 3250 break;
ffc61ed2 3251 case REG_ANY:
1aa99e6b
IH
3252 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3253 sayNO;
f2ed9b32 3254 if (utf8_target) {
b8c5462f 3255 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3256 if (locinput > PL_regeol)
3257 sayNO;
a0ed51b3 3258 nextchr = UCHARAT(locinput);
a0ed51b3 3259 }
1aa99e6b
IH
3260 else
3261 nextchr = UCHARAT(++locinput);
a0ed51b3 3262 break;
166ba7cd
DM
3263
3264#undef ST
3265#define ST st->u.trie
786e8c11
YO
3266 case TRIEC:
3267 /* In this case the charclass data is available inline so
3268 we can fail fast without a lot of extra overhead.
3269 */
f2ed9b32 3270 if (scan->flags == EXACT || !utf8_target) {
786e8c11
YO
3271 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3272 DEBUG_EXECUTE_r(
3273 PerlIO_printf(Perl_debug_log,
3274 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3275 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
786e8c11
YO
3276 );
3277 sayNO_SILENT;
3278 /* NOTREACHED */
3279 }
3280 }
3281 /* FALL THROUGH */
5b47454d 3282 case TRIE:
2e64971a
DM
3283 /* the basic plan of execution of the trie is:
3284 * At the beginning, run though all the states, and
3285 * find the longest-matching word. Also remember the position
3286 * of the shortest matching word. For example, this pattern:
3287 * 1 2 3 4 5
3288 * ab|a|x|abcd|abc
3289 * when matched against the string "abcde", will generate
3290 * accept states for all words except 3, with the longest
3291 * matching word being 4, and the shortest being 1 (with
3292 * the position being after char 1 of the string).
3293 *
3294 * Then for each matching word, in word order (i.e. 1,2,4,5),
3295 * we run the remainder of the pattern; on each try setting
3296 * the current position to the character following the word,
3297 * returning to try the next word on failure.
3298 *
3299 * We avoid having to build a list of words at runtime by
3300 * using a compile-time structure, wordinfo[].prev, which
3301 * gives, for each word, the previous accepting word (if any).
3302 * In the case above it would contain the mappings 1->2, 2->0,
3303 * 3->0, 4->5, 5->1. We can use this table to generate, from
3304 * the longest word (4 above), a list of all words, by
3305 * following the list of prev pointers; this gives us the
3306 * unordered list 4,5,1,2. Then given the current word we have
3307 * just tried, we can go through the list and find the
3308 * next-biggest word to try (so if we just failed on word 2,
3309 * the next in the list is 4).
3310 *
3311 * Since at runtime we don't record the matching position in
3312 * the string for each word, we have to work that out for
3313 * each word we're about to process. The wordinfo table holds
3314 * the character length of each word; given that we recorded
3315 * at the start: the position of the shortest word and its
3316 * length in chars, we just need to move the pointer the
3317 * difference between the two char lengths. Depending on
3318 * Unicode status and folding, that's cheap or expensive.
3319 *
3320 * This algorithm is optimised for the case where are only a
3321 * small number of accept states, i.e. 0,1, or maybe 2.
3322 * With lots of accepts states, and having to try all of them,
3323 * it becomes quadratic on number of accept states to find all
3324 * the next words.
3325 */
3326
3dab1dad 3327 {
07be1b83 3328 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3329 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3330
3331 /* what trie are we using right now */
be8e71aa 3332 reg_trie_data * const trie
f8fc2ecf 3333 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3334 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3335 U32 state = trie->startstate;
166ba7cd 3336
3dab1dad
YO
3337 if (trie->bitmap && trie_type != trie_utf8_fold &&
3338 !TRIE_BITMAP_TEST(trie,*locinput)
3339 ) {
3340 if (trie->states[ state ].wordnum) {
3341 DEBUG_EXECUTE_r(
3342 PerlIO_printf(Perl_debug_log,
3343 "%*s %smatched empty string...%s\n",
5bc10b2c 3344 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3345 );
20dbff7c
YO
3346 if (!trie->jump)
3347 break;
3dab1dad
YO
3348 } else {
3349 DEBUG_EXECUTE_r(
3350 PerlIO_printf(Perl_debug_log,
786e8c11 3351 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3352 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3353 );
3354 sayNO_SILENT;
3355 }
3356 }
166ba7cd 3357
786e8c11
YO
3358 {
3359 U8 *uc = ( U8* )locinput;
3360
3361 STRLEN len = 0;
3362 STRLEN foldlen = 0;
3363 U8 *uscan = (U8*)NULL;
786e8c11 3364 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3365 U32 charcount = 0; /* how many input chars we have matched */
3366 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3367
786e8c11
YO
3368 ST.B = next;
3369 ST.jump = trie->jump;
786e8c11 3370 ST.me = scan;
2e64971a
DM
3371 ST.firstpos = NULL;
3372 ST.longfold = FALSE; /* char longer if folded => it's harder */
3373 ST.nextword = 0;
3374
3375 /* fully traverse the TRIE; note the position of the
3376 shortest accept state and the wordnum of the longest
3377 accept state */
07be1b83 3378
a3621e74 3379 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3380 U32 base = trie->states[ state ].trans.base;
f9f4320a 3381 UV uvc = 0;
acb909b4 3382 U16 charid = 0;
2e64971a
DM
3383 U16 wordnum;
3384 wordnum = trie->states[ state ].wordnum;
3385
3386 if (wordnum) { /* it's an accept state */
3387 if (!accepted) {
3388 accepted = 1;
3389 /* record first match position */
3390 if (ST.longfold) {
3391 ST.firstpos = (U8*)locinput;
3392 ST.firstchars = 0;
5b47454d 3393 }
2e64971a
DM
3394 else {
3395 ST.firstpos = uc;
3396 ST.firstchars = charcount;
3397 }
3398 }
3399 if (!ST.nextword || wordnum < ST.nextword)
3400 ST.nextword = wordnum;
3401 ST.topword = wordnum;
786e8c11 3402 }
a3621e74 3403
07be1b83 3404 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3405 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3406 PerlIO_printf( Perl_debug_log,
2e64971a 3407 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3408 2+depth * 2, "", PL_colors[4],
2e64971a 3409 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3410 });
a3621e74 3411
2e64971a 3412 /* read a char and goto next state */
a3621e74 3413 if ( base ) {
6dd2be57 3414 I32 offset;
55eed653
NC
3415 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3416 uscan, len, uvc, charid, foldlen,
3417 foldbuf, uniflags);
2e64971a
DM
3418 charcount++;
3419 if (foldlen>0)
3420 ST.longfold = TRUE;
5b47454d 3421 if (charid &&
6dd2be57
DM
3422 ( ((offset =
3423 base + charid - 1 - trie->uniquecharcount)) >= 0)
3424
3425 && ((U32)offset < trie->lasttrans)
3426 && trie->trans[offset].check == state)
5b47454d 3427 {
6dd2be57 3428 state = trie->trans[offset].next;
5b47454d
DM
3429 }
3430 else {
3431 state = 0;
3432 }
3433 uc += len;
3434
3435 }
3436 else {
a3621e74
YO
3437 state = 0;
3438 }
3439 DEBUG_TRIE_EXECUTE_r(
e4584336 3440 PerlIO_printf( Perl_debug_log,
786e8c11 3441 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3442 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3443 );
3444 }
2e64971a 3445 if (!accepted)
a3621e74 3446 sayNO;
a3621e74 3447
2e64971a
DM
3448 /* calculate total number of accept states */
3449 {
3450 U16 w = ST.topword;
3451 accepted = 0;
3452 while (w) {
3453 w = trie->wordinfo[w].prev;
3454 accepted++;
3455 }
3456 ST.accepted = accepted;
3457 }
3458
166ba7cd
DM
3459 DEBUG_EXECUTE_r(
3460 PerlIO_printf( Perl_debug_log,
3461 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3462 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3463 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3464 );
2e64971a 3465 goto trie_first_try; /* jump into the fail handler */
786e8c11 3466 }}
fae667d5 3467 /* NOTREACHED */
2e64971a
DM
3468
3469 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3470 if ( ST.jump) {
3471 REGCP_UNWIND(ST.cp);
3472 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 3473 PL_regoffs[n].end = -1;
fae667d5
YO
3474 *PL_reglastparen = n;
3475 }
2e64971a
DM
3476 if (!--ST.accepted) {
3477 DEBUG_EXECUTE_r({
3478 PerlIO_printf( Perl_debug_log,
3479 "%*s %sTRIE failed...%s\n",
3480 REPORT_CODE_OFF+depth*2, "",
3481 PL_colors[4],
3482 PL_colors[5] );
3483 });
3484 sayNO_SILENT;
3485 }
3486 {
3487 /* Find next-highest word to process. Note that this code
3488 * is O(N^2) per trie run (O(N) per branch), so keep tight */
d9a396a3
DM
3489 register U16 min = 0;
3490 register U16 word;
2e64971a
DM
3491 register U16 const nextword = ST.nextword;
3492 register reg_trie_wordinfo * const wordinfo
3493 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3494 for (word=ST.topword; word; word=wordinfo[word].prev) {
3495 if (word > nextword && (!min || word < min))
3496 min = word;
3497 }
3498 ST.nextword = min;
3499 }
3500
fae667d5 3501 trie_first_try:
5d458dd8
YO
3502 if (do_cutgroup) {
3503 do_cutgroup = 0;
3504 no_final = 0;
3505 }
fae667d5
YO
3506
3507 if ( ST.jump) {
3508 ST.lastparen = *PL_reglastparen;
3509 REGCP_SET(ST.cp);
2e64971a 3510 }
a3621e74 3511
2e64971a 3512 /* find start char of end of current word */
166ba7cd 3513 {
2e64971a
DM
3514 U32 chars; /* how many chars to skip */
3515 U8 *uc = ST.firstpos;
3516 reg_trie_data * const trie
3517 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3518
3519 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3520 >= ST.firstchars);
3521 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3522 - ST.firstchars;
3523
3524 if (ST.longfold) {
3525 /* the hard option - fold each char in turn and find
3526 * its folded length (which may be different */
3527 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3528 STRLEN foldlen;
3529 STRLEN len;
d9a396a3 3530 UV uvc;
2e64971a
DM
3531 U8 *uscan;
3532
3533 while (chars) {
f2ed9b32 3534 if (utf8_target) {
2e64971a
DM
3535 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3536 uniflags);
3537 uc += len;
3538 }
3539 else {
3540 uvc = *uc;
3541 uc++;
3542 }
3543 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3544 uscan = foldbuf;
3545 while (foldlen) {
3546 if (!--chars)
3547 break;
3548 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3549 uniflags);
3550 uscan += len;
3551 foldlen -= len;
3552 }
3553 }
a3621e74 3554 }
2e64971a 3555 else {
f2ed9b32 3556 if (utf8_target)
2e64971a
DM
3557 while (chars--)
3558 uc += UTF8SKIP(uc);
3559 else
3560 uc += chars;
3561 }
3562 PL_reginput = (char *)uc;
3563 }
166ba7cd 3564
2e64971a
DM
3565 scan = (ST.jump && ST.jump[ST.nextword])
3566 ? ST.me + ST.jump[ST.nextword]
3567 : ST.B;
166ba7cd 3568
2e64971a
DM
3569 DEBUG_EXECUTE_r({
3570 PerlIO_printf( Perl_debug_log,
3571 "%*s %sTRIE matched word #%d, continuing%s\n",
3572 REPORT_CODE_OFF+depth*2, "",
3573 PL_colors[4],
3574 ST.nextword,
3575 PL_colors[5]
3576 );
3577 });
3578
3579 if (ST.accepted > 1 || has_cutgroup) {
3580 PUSH_STATE_GOTO(TRIE_next, scan);
3581 /* NOTREACHED */
166ba7cd 3582 }
2e64971a
DM
3583 /* only one choice left - just continue */
3584 DEBUG_EXECUTE_r({
3585 AV *const trie_words
3586 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3587 SV ** const tmp = av_fetch( trie_words,
3588 ST.nextword-1, 0 );
3589 SV *sv= tmp ? sv_newmortal() : NULL;
3590
3591 PerlIO_printf( Perl_debug_log,
3592 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3593 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3594 ST.nextword,
3595 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3596 PL_colors[0], PL_colors[1],
c89df6cf 3597 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3598 )
3599 : "not compiled under -Dr",
3600 PL_colors[5] );
3601 });
3602
3603 locinput = PL_reginput;
3604 nextchr = UCHARAT(locinput);
3605 continue; /* execute rest of RE */
166ba7cd 3606 /* NOTREACHED */
166ba7cd
DM
3607#undef ST
3608
95b24440
DM
3609 case EXACT: {
3610 char *s = STRING(scan);
24d3c4a9 3611 ln = STR_LEN(scan);
f2ed9b32 3612 if (utf8_target != UTF_PATTERN) {
bc517b45 3613 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3614 char *l = locinput;
24d3c4a9 3615 const char * const e = s + ln;
a72c7584 3616
f2ed9b32 3617 if (utf8_target) {
5ff6fc6d 3618 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3619 while (s < e) {
a3b680e6 3620 STRLEN ulen;
1aa99e6b 3621 if (l >= PL_regeol)
5ff6fc6d
JH
3622 sayNO;
3623 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3624 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3625 uniflags))
5ff6fc6d 3626 sayNO;
bc517b45 3627 l += ulen;
5ff6fc6d 3628 s ++;
1aa99e6b 3629 }
5ff6fc6d
JH
3630 }
3631 else {
3632 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3633 while (s < e) {
a3b680e6 3634 STRLEN ulen;
1aa99e6b
IH
3635 if (l >= PL_regeol)
3636 sayNO;
5ff6fc6d 3637 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3638 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3639 uniflags))
1aa99e6b 3640 sayNO;
bc517b45 3641 s += ulen;
a72c7584 3642 l ++;
1aa99e6b 3643 }
5ff6fc6d 3644 }
1aa99e6b
IH
3645 locinput = l;
3646 nextchr = UCHARAT(locinput);
3647 break;
3648 }
bc517b45 3649 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3650 /* Inline the first character, for speed. */
3651 if (UCHARAT(s) != nextchr)
3652 sayNO;
24d3c4a9 3653 if (PL_regeol - locinput < ln)
d6a28714 3654 sayNO;
24d3c4a9 3655 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3656 sayNO;
24d3c4a9 3657 locinput += ln;
d6a28714
JH
3658 nextchr = UCHARAT(locinput);
3659 break;
95b24440 3660 }
9a5a5549 3661 case EXACTFL: {
a932d541 3662 re_fold_t folder;
9a5a5549
KW
3663 const U8 * fold_array;
3664 const char * s;
d513472c 3665 U32 fold_utf8_flags;
9a5a5549 3666
b8c5462f 3667 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3668 folder = foldEQ_locale;
3669 fold_array = PL_fold_locale;
17580e7a 3670 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3671 goto do_exactf;
3672
3c760661 3673 case EXACTFU_SS:
ca600955 3674 case EXACTFU_NO_TRIE:
9a5a5549
KW
3675 case EXACTFU:
3676 folder = foldEQ_latin1;
3677 fold_array = PL_fold_latin1;
2daa8fee 3678 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3679 goto do_exactf;
3680
2f7f8cb1
KW
3681 case EXACTFA:
3682 folder = foldEQ_latin1;
3683 fold_array = PL_fold_latin1;
57014d77 3684 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3685 goto do_exactf;
3686
9a5a5549
KW
3687 case EXACTF:
3688 folder = foldEQ;
3689 fold_array = PL_fold;
62bf7766 3690 fold_utf8_flags = 0;
9a5a5549
KW
3691
3692 do_exactf:
3693 s = STRING(scan);
24d3c4a9 3694 ln = STR_LEN(scan);
d6a28714 3695
3c760661
KW
3696 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3697 /* Either target or the pattern are utf8, or has the issue where
3698 * the fold lengths may differ. */
be8e71aa 3699 const char * const l = locinput;
d07ddd77 3700 char *e = PL_regeol;
bc517b45 3701
d513472c 3702 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3703 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3704 {
3705 sayNO;
5486206c 3706 }
d07ddd77
JH
3707 locinput = e;
3708 nextchr = UCHARAT(locinput);
3709 break;
a0ed51b3 3710 }
d6a28714 3711
0a138b74 3712 /* Neither the target nor the pattern are utf8 */
d6a28714 3713 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3714 UCHARAT(s) != fold_array[nextchr])
3715 {
a0ed51b3 3716 sayNO;
9a5a5549 3717 }
24d3c4a9 3718 if (PL_regeol - locinput < ln)
b8c5462f 3719 sayNO;
9a5a5549 3720 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3721 sayNO;
24d3c4a9 3722 locinput += ln;
d6a28714 3723 nextchr = UCHARAT(locinput);
a0d0e21e 3724 break;
9a5a5549 3725 }
63ac0dad
KW
3726
3727 /* XXX Could improve efficiency by separating these all out using a
3728 * macro or in-line function. At that point regcomp.c would no longer
3729 * have to set the FLAGS fields of these */
b2680017
YO
3730 case BOUNDL:
3731 case NBOUNDL:
3732 PL_reg_flags |= RF_tainted;
3733 /* FALL THROUGH */
3734 case BOUND:
63ac0dad 3735 case BOUNDU:
cfaf538b 3736 case BOUNDA:
b2680017 3737 case NBOUND:
63ac0dad 3738 case NBOUNDU:
cfaf538b 3739 case NBOUNDA:
b2680017 3740 /* was last char in word? */
f2e96b5d
KW
3741 if (utf8_target
3742 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3743 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3744 {
b2680017
YO
3745 if (locinput == PL_bostr)
3746 ln = '\n';
3747 else {
3748 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3749
3750 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3751 }
63ac0dad 3752 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3753 ln = isALNUM_uni(ln);
3754 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3755 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3756 }
3757 else {
3758 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3759 n = isALNUM_LC_utf8((U8*)locinput);
3760 }
3761 }
3762 else {
cfaf538b
KW
3763
3764 /* Here the string isn't utf8, or is utf8 and only ascii
3765 * characters are to match \w. In the latter case looking at
3766 * the byte just prior to the current one may be just the final
3767 * byte of a multi-byte character. This is ok. There are two
3768 * cases:
3769 * 1) it is a single byte character, and then the test is doing
3770 * just what it's supposed to.
3771 * 2) it is a multi-byte character, in which case the final
3772 * byte is never mistakable for ASCII, and so the test
3773 * will say it is not a word character, which is the
3774 * correct answer. */
b2680017
YO
3775 ln = (locinput != PL_bostr) ?
3776 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3777 switch (FLAGS(scan)) {
3778 case REGEX_UNICODE_CHARSET:
3779 ln = isWORDCHAR_L1(ln);
3780 n = isWORDCHAR_L1(nextchr);
3781 break;
3782 case REGEX_LOCALE_CHARSET:
3783 ln = isALNUM_LC(ln);
3784 n = isALNUM_LC(nextchr);
3785 break;
3786 case REGEX_DEPENDS_CHARSET:
3787 ln = isALNUM(ln);
3788 n = isALNUM(nextchr);
3789 break;
cfaf538b 3790 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3791 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3792 ln = isWORDCHAR_A(ln);
3793 n = isWORDCHAR_A(nextchr);
3794 break;
63ac0dad
KW
3795 default:
3796 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3797 break;
b2680017
YO
3798 }
3799 }
63ac0dad
KW
3800 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3801 * regcomp.sym */
3802 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3803 sayNO;
3804 break;
f56b6394 3805 case ANYOFV:
d6a28714 3806 case ANYOF:
f56b6394 3807 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3808 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3809 if (locinput >= PL_regeol)
3810 sayNO;
9e55ce06 3811
f2ed9b32 3812 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3813 sayNO;
b32d7d3e 3814 locinput += inclasslen;
b8c5462f 3815 nextchr = UCHARAT(locinput);
e0f9d4a8 3816 break;
ffc61ed2
JH
3817 }
3818 else {
3819 if (nextchr < 0)
3820 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3821 if (!nextchr && locinput >= PL_regeol)
3822 sayNO;
20ed0b26 3823 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3824 sayNO;
ffc61ed2 3825 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3826 break;
3827 }
b8c5462f 3828 break;
20d0b1e9 3829 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3830 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3831 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3832 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3833 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 3834 alnum, "a");
ee9a90b8
KW
3835
3836 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3837 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3838 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 3839 SPACEA, NSPACEA, isSPACE_A,
779d7b58 3840 space, " ");
ee9a90b8
KW
3841
3842 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3843 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 3844 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 3845 digit, "0");
20d0b1e9 3846
37e2e78e
KW
3847 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3848 a Unicode extended Grapheme Cluster */
3849 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3850 extended Grapheme Cluster is:
3851
3852 CR LF
3853 | Prepend* Begin Extend*
3854 | .
3855
3856 Begin is (Hangul-syllable | ! Control)
3857 Extend is (Grapheme_Extend | Spacing_Mark)
3858 Control is [ GCB_Control CR LF ]
3859
3860 The discussion below shows how the code for CLUMP is derived
3861 from this regex. Note that most of these concepts are from
3862 property values of the Grapheme Cluster Boundary (GCB) property.
3863 No code point can have multiple property values for a given
3864 property. Thus a code point in Prepend can't be in Control, but
3865 it must be in !Control. This is why Control above includes
3866 GCB_Control plus CR plus LF. The latter two are used in the GCB
3867 property separately, and so can't be in GCB_Control, even though
3868 they logically are controls. Control is not the same as gc=cc,
3869 but includes format and other characters as well.
3870
3871 The Unicode definition of Hangul-syllable is:
3872 L+
3873 | (L* ( ( V | LV ) V* | LVT ) T*)
3874 | T+
3875 )
3876 Each of these is a value for the GCB property, and hence must be
3877 disjoint, so the order they are tested is immaterial, so the
3878 above can safely be changed to
3879 T+
3880 | L+
3881 | (L* ( LVT | ( V | LV ) V*) T*)
3882
3883 The last two terms can be combined like this:
3884 L* ( L
3885 | (( LVT | ( V | LV ) V*) T*))
3886
3887 And refactored into this:
3888 L* (L | LVT T* | V V* T* | LV V* T*)
3889
3890 That means that if we have seen any L's at all we can quit
cc3b396d 3891 there, but if the next character is an LVT, a V, or an LV we
37e2e78e
KW
3892 should keep going.
3893
3894 There is a subtlety with Prepend* which showed up in testing.
3895 Note that the Begin, and only the Begin is required in:
3896 | Prepend* Begin Extend*
cc3b396d
KW
3897 Also, Begin contains '! Control'. A Prepend must be a
3898 '! Control', which means it must also be a Begin. What it
3899 comes down to is that if we match Prepend* and then find no
3900 suitable Begin afterwards, that if we backtrack the last
3901 Prepend, that one will be a suitable Begin.
37e2e78e
KW
3902 */
3903
b7c83a7e 3904 if (locinput >= PL_regeol)
a0ed51b3 3905 sayNO;
f2ed9b32 3906 if (! utf8_target) {
37e2e78e
KW
3907
3908 /* Match either CR LF or '.', as all the other possibilities
3909 * require utf8 */
3910 locinput++; /* Match the . or CR */
cc3b396d
KW
3911 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3912 match the LF */
37e2e78e
KW
3913 && locinput < PL_regeol
3914 && UCHARAT(locinput) == '\n') locinput++;
3915 }
3916 else {
3917
3918 /* Utf8: See if is ( CR LF ); already know that locinput <
3919 * PL_regeol, so locinput+1 is in bounds */
3920 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3921 locinput += 2;
3922 }
3923 else {
3924 /* In case have to backtrack to beginning, then match '.' */
3925 char *starting = locinput;
3926
3927 /* In case have to backtrack the last prepend */
3928 char *previous_prepend = 0;
3929
3930 LOAD_UTF8_CHARCLASS_GCB();
3931
3932 /* Match (prepend)* */
3933 while (locinput < PL_regeol
3934 && swash_fetch(PL_utf8_X_prepend,
f2ed9b32 3935 (U8*)locinput, utf8_target))
37e2e78e
KW
3936 {
3937 previous_prepend = locinput;
3938 locinput += UTF8SKIP(locinput);
3939 }
3940
3941 /* As noted above, if we matched a prepend character, but
3942 * the next thing won't match, back off the last prepend we
3943 * matched, as it is guaranteed to match the begin */
3944 if (previous_prepend
3945 && (locinput >= PL_regeol
3946 || ! swash_fetch(PL_utf8_X_begin,
f2ed9b32 3947 (U8*)locinput, utf8_target)))
37e2e78e
KW
3948 {
3949 locinput = previous_prepend;
3950 }
3951
3952 /* Note that here we know PL_regeol > locinput, as we
3953 * tested that upon input to this switch case, and if we
3954 * moved locinput forward, we tested the result just above
3955 * and it either passed, or we backed off so that it will
3956 * now pass */
f2ed9b32 3957 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
37e2e78e
KW
3958
3959 /* Here did not match the required 'Begin' in the
3960 * second term. So just match the very first
3961 * character, the '.' of the final term of the regex */
3962 locinput = starting + UTF8SKIP(starting);
3963 } else {
3964
3965 /* Here is the beginning of a character that can have
3966 * an extender. It is either a hangul syllable, or a
3967 * non-control */
3968 if (swash_fetch(PL_utf8_X_non_hangul,
f2ed9b32 3969 (U8*)locinput, utf8_target))
37e2e78e
KW
3970 {
3971
3972 /* Here not a Hangul syllable, must be a
3973 * ('! * Control') */
3974 locinput += UTF8SKIP(locinput);
3975 } else {
3976
3977 /* Here is a Hangul syllable. It can be composed
3978 * of several individual characters. One
3979 * possibility is T+ */
3980 if (swash_fetch(PL_utf8_X_T,
f2ed9b32 3981 (U8*)locinput, utf8_target))
37e2e78e
KW
3982 {
3983 while (locinput < PL_regeol
3984 && swash_fetch(PL_utf8_X_T,
f2ed9b32 3985 (U8*)locinput, utf8_target))
37e2e78e
KW
3986 {
3987 locinput += UTF8SKIP(locinput);
3988 }
3989 } else {
3990
3991 /* Here, not T+, but is a Hangul. That means
3992 * it is one of the others: L, LV, LVT or V,
3993 * and matches:
3994 * L* (L | LVT T* | V V* T* | LV V* T*) */
3995
3996 /* Match L* */
3997 while (locinput < PL_regeol
3998 && swash_fetch(PL_utf8_X_L,
f2ed9b32 3999 (U8*)locinput, utf8_target))
37e2e78e
KW
4000 {
4001 locinput += UTF8SKIP(locinput);
4002 }
4003
4004 /* Here, have exhausted L*. If the next
4005 * character is not an LV, LVT nor V, it means
4006 * we had to have at least one L, so matches L+
4007 * in the original equation, we have a complete
4008 * hangul syllable. Are done. */
4009
4010 if (locinput < PL_regeol
4011 && swash_fetch(PL_utf8_X_LV_LVT_V,
f2ed9b32 4012 (U8*)locinput, utf8_target))
37e2e78e
KW
4013 {
4014
4015 /* Otherwise keep going. Must be LV, LVT
4016 * or V. See if LVT */
4017 if (swash_fetch(PL_utf8_X_LVT,
f2ed9b32 4018 (U8*)locinput, utf8_target))
37e2e78e
KW
4019 {
4020 locinput += UTF8SKIP(locinput);
4021 } else {
4022
4023 /* Must be V or LV. Take it, then
4024 * match V* */
4025 locinput += UTF8SKIP(locinput);
4026 while (locinput < PL_regeol
4027 && swash_fetch(PL_utf8_X_V,
f2ed9b32 4028 (U8*)locinput, utf8_target))
37e2e78e
KW
4029 {
4030 locinput += UTF8SKIP(locinput);
4031 }
4032 }
4033
4034 /* And any of LV, LVT, or V can be followed
4035 * by T* */
4036 while (locinput < PL_regeol
4037 && swash_fetch(PL_utf8_X_T,
4038 (U8*)locinput,
f2ed9b32 4039 utf8_target))
37e2e78e
KW
4040 {
4041 locinput += UTF8SKIP(locinput);
4042 }
4043 }
4044 }
4045 }
4046
4047 /* Match any extender */
4048 while (locinput < PL_regeol
4049 && swash_fetch(PL_utf8_X_extend,
f2ed9b32 4050 (U8*)locinput, utf8_target))
37e2e78e
KW
4051 {
4052 locinput += UTF8SKIP(locinput);
4053 }
4054 }
4055 }
4056 if (locinput > PL_regeol) sayNO;
4057 }
a0ed51b3
LW
4058 nextchr = UCHARAT(locinput);
4059 break;
81714fb9
YO
4060
4061 case NREFFL:
d7ef4b73
KW
4062 { /* The capture buffer cases. The ones beginning with N for the
4063 named buffers just convert to the equivalent numbered and
4064 pretend they were called as the corresponding numbered buffer
4065 op. */
26ecd678
TC
4066 /* don't initialize these in the declaration, it makes C++
4067 unhappy */
81714fb9 4068 char *s;
ff1157ca 4069 char type;
8368298a
TC
4070 re_fold_t folder;
4071 const U8 *fold_array;
26ecd678 4072 UV utf8_fold_flags;
8368298a 4073
81714fb9 4074 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4075 folder = foldEQ_locale;
4076 fold_array = PL_fold_locale;
4077 type = REFFL;
17580e7a 4078 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4079 goto do_nref;
4080
2f7f8cb1
KW
4081 case NREFFA:
4082 folder = foldEQ_latin1;
4083 fold_array = PL_fold_latin1;
4084 type = REFFA;
4085 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4086 goto do_nref;
4087
d7ef4b73
KW
4088 case NREFFU:
4089 folder = foldEQ_latin1;
4090 fold_array = PL_fold_latin1;
4091 type = REFFU;
d513472c 4092 utf8_fold_flags = 0;
d7ef4b73
KW
4093 goto do_nref;
4094
81714fb9 4095 case NREFF:
d7ef4b73
KW
4096 folder = foldEQ;
4097 fold_array = PL_fold;
4098 type = REFF;
d513472c 4099 utf8_fold_flags = 0;
d7ef4b73
KW
4100 goto do_nref;
4101
4102 case NREF:
4103 type = REF;
83d7b90b
KW
4104 folder = NULL;
4105 fold_array = NULL;
d513472c 4106 utf8_fold_flags = 0;
d7ef4b73
KW
4107 do_nref:
4108
4109 /* For the named back references, find the corresponding buffer
4110 * number */
0a4db386
YO
4111 n = reg_check_named_buff_matched(rex,scan);
4112
d7ef4b73 4113 if ( ! n ) {
81714fb9 4114 sayNO;
d7ef4b73
KW
4115 }
4116 goto do_nref_ref_common;
4117
c8756f30 4118 case REFFL:
3280af22 4119 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4120 folder = foldEQ_locale;
4121 fold_array = PL_fold_locale;
17580e7a 4122 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4123 goto do_ref;
4124
2f7f8cb1
KW
4125 case REFFA:
4126 folder = foldEQ_latin1;
4127 fold_array = PL_fold_latin1;
4128 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4129 goto do_ref;
4130
d7ef4b73
KW
4131 case REFFU:
4132 folder = foldEQ_latin1;
4133 fold_array = PL_fold_latin1;
d513472c 4134 utf8_fold_flags = 0;
d7ef4b73
KW
4135 goto do_ref;
4136
4137 case REFF:
4138 folder = foldEQ;
4139 fold_array = PL_fold;
d513472c 4140 utf8_fold_flags = 0;
83d7b90b 4141 goto do_ref;
d7ef4b73 4142
c277df42 4143 case REF:
83d7b90b
KW
4144 folder = NULL;
4145 fold_array = NULL;
d513472c 4146 utf8_fold_flags = 0;
83d7b90b 4147
d7ef4b73 4148 do_ref:
81714fb9 4149 type = OP(scan);
d7ef4b73
KW
4150 n = ARG(scan); /* which paren pair */
4151
4152 do_nref_ref_common:
f0ab9afb 4153 ln = PL_regoffs[n].start;
2c2d71f5 4154 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3b6647e0 4155 if (*PL_reglastparen < n || ln == -1)
af3f8c16 4156 sayNO; /* Do not match unless seen CLOSEn. */
f0ab9afb 4157 if (ln == PL_regoffs[n].end)
a0d0e21e 4158 break;
a0ed51b3 4159
24d3c4a9 4160 s = PL_bostr + ln;
d7ef4b73 4161 if (type != REF /* REF can do byte comparison */
2f65c56d 4162 && (utf8_target || type == REFFU))
d7ef4b73
KW
4163 { /* XXX handle REFFL better */
4164 char * limit = PL_regeol;
4165
4166 /* This call case insensitively compares the entire buffer
4167 * at s, with the current input starting at locinput, but
4168 * not going off the end given by PL_regeol, and returns in
4169 * limit upon success, how much of the current input was
4170 * matched */
d513472c
KW
4171 if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4172 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4173 {
4174 sayNO;
a0ed51b3 4175 }
d7ef4b73 4176 locinput = limit;
a0ed51b3
LW
4177 nextchr = UCHARAT(locinput);
4178 break;
4179 }
4180
d7ef4b73 4181 /* Not utf8: Inline the first character, for speed. */
76e3520e 4182 if (UCHARAT(s) != nextchr &&
81714fb9 4183 (type == REF ||
d7ef4b73 4184 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4185 sayNO;
f0ab9afb 4186 ln = PL_regoffs[n].end - ln;
24d3c4a9 4187 if (locinput + ln > PL_regeol)
4633a7c4 4188 sayNO;
81714fb9 4189 if (ln > 1 && (type == REF
24d3c4a9 4190 ? memNE(s, locinput, ln)
d7ef4b73 4191 : ! folder(s, locinput, ln)))
4633a7c4 4192 sayNO;
24d3c4a9 4193 locinput += ln;
76e3520e 4194 nextchr = UCHARAT(locinput);
a0d0e21e 4195 break;
81714fb9 4196 }
a0d0e21e 4197 case NOTHING:
c277df42 4198 case TAIL:
a0d0e21e
LW
4199 break;
4200 case BACK:
4201 break;
40a82448
DM
4202
4203#undef ST
4204#define ST st->u.eval
c277df42 4205 {
c277df42 4206 SV *ret;
d2f13c59 4207 REGEXP *re_sv;
6bda09f9 4208 regexp *re;
f8fc2ecf 4209 regexp_internal *rei;
1a147d38
YO
4210 regnode *startpoint;
4211
4212 case GOSTART:
e7707071
YO
4213 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4214 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4215 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4216 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4217 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4218 Perl_croak(aTHX_
4219 "Pattern subroutine nesting without pos change"
4220 " exceeded limit in regex");
6bda09f9
YO
4221 } else {
4222 nochange_depth = 0;
1a147d38 4223 }
288b8c02 4224 re_sv = rex_sv;
6bda09f9 4225 re = rex;
f8fc2ecf 4226 rei = rexi;
288b8c02 4227 (void)ReREFCNT_inc(rex_sv);
1a147d38 4228 if (OP(scan)==GOSUB) {
6bda09f9
YO
4229 startpoint = scan + ARG2L(scan);
4230 ST.close_paren = ARG(scan);
4231 } else {
f8fc2ecf 4232 startpoint = rei->program+1;
6bda09f9
YO
4233 ST.close_paren = 0;
4234 }
4235 goto eval_recurse_doit;
4236 /* NOTREACHED */
4237 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4238 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4239 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4240 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4241 } else {
4242 nochange_depth = 0;
4243 }
8e5e9ebe 4244 {
4aabdb9b
DM
4245 /* execute the code in the {...} */
4246 dSP;
6136c704 4247 SV ** const before = SP;
4aabdb9b
DM
4248 OP_4tree * const oop = PL_op;
4249 COP * const ocurcop = PL_curcop;
4250 PAD *old_comppad;
d80618d2 4251 char *saved_regeol = PL_regeol;
91332126
FR
4252 struct re_save_state saved_state;
4253
6562f1c4 4254 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4255 * eval we would normally put it on the save stack, like with
4256 * save_re_context. However, re-evals have a weird scoping so we
4257 * can't just add ENTER/LEAVE here. With that, things like
4258 *
4259 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4260 *
4261 * would break, as they expect the localisation to be unwound
4262 * only when the re-engine backtracks through the bit that
4263 * localised it.
4264 *
4265 * What we do instead is just saving the state in a local c
4266 * variable.
4267 */
91332126
FR
4268 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4269
4aabdb9b 4270 n = ARG(scan);
f8fc2ecf 4271 PL_op = (OP_4tree*)rexi->data->data[n];
24b23f37
YO
4272 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4273 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
9e103e26
DM
4274 /* wrap the call in two SAVECOMPPADs. This ensures that
4275 * when the save stack is eventually unwound, all the
4276 * accumulated SAVEt_CLEARSV's will be processed with
4277 * interspersed SAVEt_COMPPAD's to ensure that lexicals
4278 * are cleared in the right pad */
4279 SAVECOMPPAD();
f8fc2ecf 4280 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
f0ab9afb 4281 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4282
2bf803e2
YO
4283 if (sv_yes_mark) {
4284 SV *sv_mrk = get_sv("REGMARK", 1);
4285 sv_setsv(sv_mrk, sv_yes_mark);
4286 }
4287
8e5e9ebe
RGS
4288 CALLRUNOPS(aTHX); /* Scalar context. */
4289 SPAGAIN;
4290 if (SP == before)
075aa684 4291 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4292 else {
4293 ret = POPs;
4294 PUTBACK;
4295 }
4aabdb9b 4296
91332126
FR
4297 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4298
4aabdb9b 4299 PL_op = oop;
9e103e26 4300 SAVECOMPPAD();
4aabdb9b
DM
4301 PAD_RESTORE_LOCAL(old_comppad);
4302 PL_curcop = ocurcop;
d80618d2 4303 PL_regeol = saved_regeol;
24d3c4a9 4304 if (!logical) {
4aabdb9b
DM
4305 /* /(?{...})/ */
4306 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
4307 break;
4308 }
8e5e9ebe 4309 }
24d3c4a9
DM
4310 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4311 logical = 0;
4aabdb9b 4312 {
4f639d21
DM
4313 /* extract RE object from returned value; compiling if
4314 * necessary */
6136c704 4315 MAGIC *mg = NULL;
288b8c02 4316 REGEXP *rx = NULL;
5c35adbb
NC
4317
4318 if (SvROK(ret)) {
288b8c02 4319 SV *const sv = SvRV(ret);
5c35adbb
NC
4320
4321 if (SvTYPE(sv) == SVt_REGEXP) {
d2f13c59 4322 rx = (REGEXP*) sv;
5c35adbb
NC
4323 } else if (SvSMAGICAL(sv)) {
4324 mg = mg_find(sv, PERL_MAGIC_qr);
4325 assert(mg);
4326 }
4327 } else if (SvTYPE(ret) == SVt_REGEXP) {
d2f13c59 4328 rx = (REGEXP*) ret;
5c35adbb 4329 } else if (SvSMAGICAL(ret)) {
124ee91a
NC
4330 if (SvGMAGICAL(ret)) {
4331 /* I don't believe that there is ever qr magic
4332 here. */
4333 assert(!mg_find(ret, PERL_MAGIC_qr));
faf82a0b 4334 sv_unmagic(ret, PERL_MAGIC_qr);
124ee91a
NC
4335 }
4336 else {
faf82a0b 4337 mg = mg_find(ret, PERL_MAGIC_qr);
124ee91a
NC
4338 /* testing suggests mg only ends up non-NULL for
4339 scalars who were upgraded and compiled in the
4340 else block below. In turn, this is only
4341 triggered in the "postponed utf8 string" tests
4342 in t/op/pat.t */
4343 }
0f5d15d6 4344 }
faf82a0b 4345
0f5d15d6 4346 if (mg) {
d2f13c59 4347 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
e2560c33 4348 assert(rx);
0f5d15d6 4349 }
288b8c02 4350 if (rx) {
f0826785 4351 rx = reg_temp_copy(NULL, rx);
288b8c02 4352 }
0f5d15d6 4353 else {
c737faaf 4354 U32 pm_flags = 0;
a3b680e6 4355 const I32 osize = PL_regsize;
0f5d15d6 4356
b9ad30b4
NC
4357 if (DO_UTF8(ret)) {
4358 assert (SvUTF8(ret));
4359 } else if (SvUTF8(ret)) {
4360 /* Not doing UTF-8, despite what the SV says. Is
4361 this only if we're trapped in use 'bytes'? */
4362 /* Make a copy of the octet sequence, but without
4363 the flag on, as the compiler now honours the
4364 SvUTF8 flag on ret. */
4365 STRLEN len;
4366 const char *const p = SvPV(ret, len);
4367 ret = newSVpvn_flags(p, len, SVs_TEMP);
4368 }
288b8c02 4369 rx = CALLREGCOMP(ret, pm_flags);
9041c2e3 4370 if (!(SvFLAGS(ret)
faf82a0b 4371 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4372 | SVs_GMG))) {
a2794585
NC
4373 /* This isn't a first class regexp. Instead, it's
4374 caching a regexp onto an existing, Perl visible
4375 scalar. */
ad64d0ec 4376 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4377 }
0f5d15d6 4378 PL_regsize = osize;
0f5d15d6 4379 }
288b8c02
NC
4380 re_sv = rx;
4381 re = (struct regexp *)SvANY(rx);
4aabdb9b 4382 }
07bc277f 4383 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4384 re->subbeg = rex->subbeg;
4385 re->sublen = rex->sublen;
f8fc2ecf 4386 rei = RXi_GET(re);
6bda09f9 4387 DEBUG_EXECUTE_r(
f2ed9b32 4388 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4389 "Matching embedded");
4390 );
f8fc2ecf 4391 startpoint = rei->program + 1;
1a147d38 4392 ST.close_paren = 0; /* only used for GOSUB */
6bda09f9
YO
4393 /* borrowed from regtry */
4394 if (PL_reg_start_tmpl <= re->nparens) {
4395 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4396 if(PL_reg_start_tmp)
4397 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4398 else
4399 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
dd5def09 4400 }
aa283a38 4401
1a147d38 4402 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4403 /* run the pattern returned from (??{...}) */
40a82448
DM
4404 ST.cp = regcppush(0); /* Save *all* the positions. */
4405 REGCP_SET(ST.lastcp);
6bda09f9 4406
f0ab9afb 4407 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
6bda09f9 4408
0357f1fd
ML
4409 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4410 PL_reglastparen = &re->lastparen;
4411 PL_reglastcloseparen = &re->lastcloseparen;
4412 re->lastparen = 0;
4413 re->lastcloseparen = 0;
4414
4aabdb9b 4415 PL_reginput = locinput;
ae0beba1 4416 PL_regsize = 0;
4aabdb9b
DM
4417
4418 /* XXXX This is too dramatic a measure... */
4419 PL_reg_maxiter = 0;
4420
faec1544 4421 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4422 if (RX_UTF8(re_sv))
faec1544
DM
4423 PL_reg_flags |= RF_utf8;
4424 else
4425 PL_reg_flags &= ~RF_utf8;
4426 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4427
288b8c02 4428 ST.prev_rex = rex_sv;
faec1544 4429 ST.prev_curlyx = cur_curlyx;
288b8c02
NC
4430 SETREX(rex_sv,re_sv);
4431 rex = re;
f8fc2ecf 4432 rexi = rei;
faec1544 4433 cur_curlyx = NULL;
40a82448 4434 ST.B = next;
faec1544
DM
4435 ST.prev_eval = cur_eval;
4436 cur_eval = st;
faec1544 4437 /* now continue from first node in postoned RE */
6bda09f9 4438 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4aabdb9b 4439 /* NOTREACHED */
a0ed51b3 4440 }
24d3c4a9 4441 /* logical is 1, /(?(?{...})X|Y)/ */
f2338a2e 4442 sw = cBOOL(SvTRUE(ret));
24d3c4a9 4443 logical = 0;
c277df42
IZ
4444 break;
4445 }
40a82448 4446
faec1544
DM
4447 case EVAL_AB: /* cleanup after a successful (??{A})B */
4448 /* note: this is called twice; first after popping B, then A */
4449 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4450 ReREFCNT_dec(rex_sv);
4451 SETREX(rex_sv,ST.prev_rex);
4452 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4453 rexi = RXi_GET(rex);
faec1544
DM
4454 regcpblow(ST.cp);
4455 cur_eval = ST.prev_eval;
4456 cur_curlyx = ST.prev_curlyx;
34a81e2b
B
4457
4458 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4459 PL_reglastparen = &rex->lastparen;
4460 PL_reglastcloseparen = &rex->lastcloseparen;
34a81e2b
B
4461 /* also update PL_regoffs */
4462 PL_regoffs = rex->offs;
0357f1fd 4463
40a82448
DM
4464 /* XXXX This is too dramatic a measure... */
4465 PL_reg_maxiter = 0;
e7707071 4466 if ( nochange_depth )
4b196cd4 4467 nochange_depth--;
262b90c4 4468 sayYES;
40a82448 4469
40a82448 4470
faec1544
DM
4471 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4472 /* note: this is called twice; first after popping B, then A */
4473 PL_reg_flags ^= ST.toggle_reg_flags;
288b8c02
NC
4474 ReREFCNT_dec(rex_sv);
4475 SETREX(rex_sv,ST.prev_rex);
4476 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4477 rexi = RXi_GET(rex);
34a81e2b 4478 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
0357f1fd
ML
4479 PL_reglastparen = &rex->lastparen;
4480 PL_reglastcloseparen = &rex->lastcloseparen;
4481
40a82448
DM
4482 PL_reginput = locinput;
4483 REGCP_UNWIND(ST.lastcp);
4484 regcppop(rex);
faec1544
DM
4485 cur_eval = ST.prev_eval;
4486 cur_curlyx = ST.prev_curlyx;
4487 /* XXXX This is too dramatic a measure... */
4488 PL_reg_maxiter = 0;
e7707071 4489 if ( nochange_depth )
4b196cd4 4490 nochange_depth--;
40a82448 4491 sayNO_SILENT;
40a82448
DM
4492#undef ST
4493
a0d0e21e 4494 case OPEN:
c277df42 4495 n = ARG(scan); /* which paren pair */
3280af22
NIS
4496 PL_reg_start_tmp[n] = locinput;
4497 if (n > PL_regsize)
4498 PL_regsize = n;
e2e6a0f1 4499 lastopen = n;
a0d0e21e
LW
4500 break;
4501 case CLOSE:
c277df42 4502 n = ARG(scan); /* which paren pair */
f0ab9afb
NC
4503 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4504 PL_regoffs[n].end = locinput - PL_bostr;
7f69552c
YO
4505 /*if (n > PL_regsize)
4506 PL_regsize = n;*/
3b6647e0 4507 if (n > *PL_reglastparen)
3280af22 4508 *PL_reglastparen = n;
a01268b5 4509 *PL_reglastcloseparen = n;
3b6647e0 4510 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4511 goto fake_end;
4512 }
a0d0e21e 4513 break;
e2e6a0f1
YO
4514 case ACCEPT:
4515 if (ARG(scan)){
4516 regnode *cursor;
4517 for (cursor=scan;
4518 cursor && OP(cursor)!=END;
4519 cursor=regnext(cursor))
4520 {
4521 if ( OP(cursor)==CLOSE ){
4522 n = ARG(cursor);
4523 if ( n <= lastopen ) {
f0ab9afb
NC
4524 PL_regoffs[n].start
4525 = PL_reg_start_tmp[n] - PL_bostr;
4526 PL_regoffs[n].end = locinput - PL_bostr;
e2e6a0f1
YO
4527 /*if (n > PL_regsize)
4528 PL_regsize = n;*/
3b6647e0 4529 if (n > *PL_reglastparen)
e2e6a0f1
YO
4530 *PL_reglastparen = n;
4531 *PL_reglastcloseparen = n;
3b6647e0
RB
4532 if ( n == ARG(scan) || (cur_eval &&
4533 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4534 break;
4535 }
4536 }
4537 }
4538 }
4539 goto fake_end;
4540 /*NOTREACHED*/
c277df42
IZ
4541 case GROUPP:
4542 n = ARG(scan); /* which paren pair */
f2338a2e 4543 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
c277df42 4544 break;
0a4db386
YO
4545 case NGROUPP:
4546 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4547 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4548 break;
1a147d38 4549 case INSUBP:
0a4db386 4550 n = ARG(scan);
3b6647e0 4551 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4552 break;
4553 case DEFINEP:
4554 sw = 0;
4555 break;
c277df42 4556 case IFTHEN:
2c2d71f5 4557 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4558 if (sw)
c277df42
IZ
4559 next = NEXTOPER(NEXTOPER(scan));
4560 else {
4561 next = scan + ARG(scan);
4562 if (OP(next) == IFTHEN) /* Fake one. */
4563 next = NEXTOPER(NEXTOPER(next));
4564 }
4565 break;
4566 case LOGICAL:
24d3c4a9 4567 logical = scan->flags;
c277df42 4568 break;
c476f425 4569
2ab05381 4570/*******************************************************************
2ab05381 4571
c476f425
DM
4572The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4573pattern, where A and B are subpatterns. (For simple A, CURLYM or
4574STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4575
c476f425 4576A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4577
c476f425
DM
4578On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4579state, which contains the current count, initialised to -1. It also sets
4580cur_curlyx to point to this state, with any previous value saved in the
4581state block.
2ab05381 4582
c476f425
DM
4583CURLYX then jumps straight to the WHILEM op, rather than executing A,
4584since the pattern may possibly match zero times (i.e. it's a while {} loop
4585rather than a do {} while loop).
2ab05381 4586
c476f425
DM
4587Each entry to WHILEM represents a successful match of A. The count in the
4588CURLYX block is incremented, another WHILEM state is pushed, and execution
4589passes to A or B depending on greediness and the current count.
2ab05381 4590
c476f425
DM
4591For example, if matching against the string a1a2a3b (where the aN are
4592substrings that match /A/), then the match progresses as follows: (the
4593pushed states are interspersed with the bits of strings matched so far):
2ab05381 4594
c476f425
DM
4595 <CURLYX cnt=-1>
4596 <CURLYX cnt=0><WHILEM>
4597 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4598 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4599 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4600 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4601
c476f425
DM
4602(Contrast this with something like CURLYM, which maintains only a single
4603backtrack state:
2ab05381 4604
c476f425
DM
4605 <CURLYM cnt=0> a1
4606 a1 <CURLYM cnt=1> a2
4607 a1 a2 <CURLYM cnt=2> a3
4608 a1 a2 a3 <CURLYM cnt=3> b
4609)
2ab05381 4610
c476f425
DM
4611Each WHILEM state block marks a point to backtrack to upon partial failure
4612of A or B, and also contains some minor state data related to that
4613iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4614overall state, such as the count, and pointers to the A and B ops.
2ab05381 4615
c476f425
DM
4616This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4617must always point to the *current* CURLYX block, the rules are:
2ab05381 4618
c476f425
DM
4619When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4620and set cur_curlyx to point the new block.
2ab05381 4621
c476f425
DM
4622When popping the CURLYX block after a successful or unsuccessful match,
4623restore the previous cur_curlyx.
2ab05381 4624
c476f425
DM
4625When WHILEM is about to execute B, save the current cur_curlyx, and set it
4626to the outer one saved in the CURLYX block.
2ab05381 4627
c476f425
DM
4628When popping the WHILEM block after a successful or unsuccessful B match,
4629restore the previous cur_curlyx.
2ab05381 4630
c476f425
DM
4631Here's an example for the pattern (AI* BI)*BO
4632I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4633
c476f425
DM
4634cur_
4635curlyx backtrack stack
4636------ ---------------
4637NULL
4638CO <CO prev=NULL> <WO>
4639CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4640CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4641NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4642
c476f425
DM
4643At this point the pattern succeeds, and we work back down the stack to
4644clean up, restoring as we go:
95b24440 4645
c476f425
DM
4646CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4647CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4648CO <CO prev=NULL> <WO>
4649NULL
a0374537 4650
c476f425
DM
4651*******************************************************************/
4652
4653#define ST st->u.curlyx
4654
4655 case CURLYX: /* start of /A*B/ (for complex A) */
4656 {
4657 /* No need to save/restore up to this paren */
4658 I32 parenfloor = scan->flags;
4659
4660 assert(next); /* keep Coverity happy */
4661 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4662 next += ARG(next);
4663
4664 /* XXXX Probably it is better to teach regpush to support
4665 parenfloor > PL_regsize... */
4666 if (parenfloor > (I32)*PL_reglastparen)
4667 parenfloor = *PL_reglastparen; /* Pessimization... */
4668
4669 ST.prev_curlyx= cur_curlyx;
4670 cur_curlyx = st;
4671 ST.cp = PL_savestack_ix;
4672
4673 /* these fields contain the state of the current curly.
4674 * they are accessed by subsequent WHILEMs */
4675 ST.parenfloor = parenfloor;
d02d6d97 4676 ST.me = scan;
c476f425 4677 ST.B = next;
24d3c4a9
DM
4678 ST.minmod = minmod;
4679 minmod = 0;
c476f425
DM
4680 ST.count = -1; /* this will be updated by WHILEM */
4681 ST.lastloc = NULL; /* this will be updated by WHILEM */
4682
4683 PL_reginput = locinput;
4684 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
5f66b61c 4685 /* NOTREACHED */
c476f425 4686 }
a0d0e21e 4687
c476f425 4688 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4689 cur_curlyx = ST.prev_curlyx;
4690 sayYES;
4691 /* NOTREACHED */
a0d0e21e 4692
c476f425
DM
4693 case CURLYX_end_fail: /* just failed to match all of A*B */
4694 regcpblow(ST.cp);
4695 cur_curlyx = ST.prev_curlyx;
4696 sayNO;
4697 /* NOTREACHED */
4633a7c4 4698
a0d0e21e 4699
c476f425
DM
4700#undef ST
4701#define ST st->u.whilem
4702
4703 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4704 {
4705 /* see the discussion above about CURLYX/WHILEM */
c476f425 4706 I32 n;
d02d6d97
DM
4707 int min = ARG1(cur_curlyx->u.curlyx.me);
4708 int max = ARG2(cur_curlyx->u.curlyx.me);
4709 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4710
c476f425
DM
4711 assert(cur_curlyx); /* keep Coverity happy */
4712 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4713 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4714 ST.cache_offset = 0;
4715 ST.cache_mask = 0;
4716
4717 PL_reginput = locinput;
4718
4719 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4720 "%*s whilem: matched %ld out of %d..%d\n",
4721 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4722 );
a0d0e21e 4723
c476f425 4724 /* First just match a string of min A's. */
a0d0e21e 4725
d02d6d97 4726 if (n < min) {
92e82afa 4727 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
c476f425 4728 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4729 REGCP_SET(ST.lastcp);
4730
d02d6d97 4731 PUSH_STATE_GOTO(WHILEM_A_pre, A);
c476f425
DM
4732 /* NOTREACHED */
4733 }
4734
4735 /* If degenerate A matches "", assume A done. */
4736
4737 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4738 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4739 "%*s whilem: empty match detected, trying continuation...\n",
4740 REPORT_CODE_OFF+depth*2, "")
4741 );
4742 goto do_whilem_B_max;
4743 }
4744
4745 /* super-linear cache processing */
4746
4747 if (scan->flags) {
a0d0e21e 4748
2c2d71f5 4749 if (!PL_reg_maxiter) {
c476f425
DM
4750 /* start the countdown: Postpone detection until we
4751 * know the match is not *that* much linear. */
2c2d71f5 4752 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4753 /* possible overflow for long strings and many CURLYX's */
4754 if (PL_reg_maxiter < 0)
4755 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4756 PL_reg_leftiter = PL_reg_maxiter;
4757 }
c476f425 4758
2c2d71f5 4759 if (PL_reg_leftiter-- == 0) {
c476f425 4760 /* initialise cache */
3298f257 4761 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4762 if (PL_reg_poscache) {
eb160463 4763 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4764 Renew(PL_reg_poscache, size, char);
4765 PL_reg_poscache_size = size;
4766 }
4767 Zero(PL_reg_poscache, size, char);
4768 }
4769 else {
4770 PL_reg_poscache_size = size;
a02a5408 4771 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4772 }
c476f425
DM
4773 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4774 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4775 PL_colors[4], PL_colors[5])
4776 );
2c2d71f5 4777 }
c476f425 4778
2c2d71f5 4779 if (PL_reg_leftiter < 0) {
c476f425
DM
4780 /* have we already failed at this position? */
4781 I32 offset, mask;
4782 offset = (scan->flags & 0xf) - 1
4783 + (locinput - PL_bostr) * (scan->flags>>4);
4784 mask = 1 << (offset % 8);
4785 offset /= 8;
4786 if (PL_reg_poscache[offset] & mask) {
4787 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4788 "%*s whilem: (cache) already tried at this position...\n",
4789 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4790 );
3298f257 4791 sayNO; /* cache records failure */
2c2d71f5 4792 }
c476f425
DM
4793 ST.cache_offset = offset;
4794 ST.cache_mask = mask;
2c2d71f5 4795 }
c476f425 4796 }
2c2d71f5 4797
c476f425 4798 /* Prefer B over A for minimal matching. */
a687059c 4799
c476f425
DM
4800 if (cur_curlyx->u.curlyx.minmod) {
4801 ST.save_curlyx = cur_curlyx;
4802 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4803 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4804 REGCP_SET(ST.lastcp);
4805 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4806 /* NOTREACHED */
4807 }
a0d0e21e 4808
c476f425
DM
4809 /* Prefer A over B for maximal matching. */
4810
d02d6d97 4811 if (n < max) { /* More greed allowed? */
c476f425
DM
4812 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4813 cur_curlyx->u.curlyx.lastloc = locinput;
4814 REGCP_SET(ST.lastcp);
d02d6d97 4815 PUSH_STATE_GOTO(WHILEM_A_max, A);
c476f425
DM
4816 /* NOTREACHED */
4817 }
4818 goto do_whilem_B_max;
4819 }
4820 /* NOTREACHED */
4821
4822 case WHILEM_B_min: /* just matched B in a minimal match */
4823 case WHILEM_B_max: /* just matched B in a maximal match */
4824 cur_curlyx = ST.save_curlyx;
4825 sayYES;
4826 /* NOTREACHED */
4827
4828 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4829 cur_curlyx = ST.save_curlyx;
4830 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4831 cur_curlyx->u.curlyx.count--;
4832 CACHEsayNO;
4833 /* NOTREACHED */
4834
4835 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
4836 /* FALL THROUGH */
4837 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
4838 REGCP_UNWIND(ST.lastcp);
4839 regcppop(rex);
c476f425
DM
4840 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4841 cur_curlyx->u.curlyx.count--;
4842 CACHEsayNO;
4843 /* NOTREACHED */
4844
4845 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4846 REGCP_UNWIND(ST.lastcp);
4847 regcppop(rex); /* Restore some previous $<digit>s? */
4848 PL_reginput = locinput;
4849 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4850 "%*s whilem: failed, trying continuation...\n",
4851 REPORT_CODE_OFF+depth*2, "")
4852 );
4853 do_whilem_B_max:
4854 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4855 && ckWARN(WARN_REGEXP)
4856 && !(PL_reg_flags & RF_warned))
4857 {
4858 PL_reg_flags |= RF_warned;
dcbac5bb
FC
4859 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4860 "Complex regular subexpression recursion limit (%d) "
4861 "exceeded",
c476f425
DM
4862 REG_INFTY - 1);
4863 }
4864
4865 /* now try B */
4866 ST.save_curlyx = cur_curlyx;
4867 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4868 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4869 /* NOTREACHED */
4870
4871 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4872 cur_curlyx = ST.save_curlyx;
4873 REGCP_UNWIND(ST.lastcp);
4874 regcppop(rex);
4875
d02d6d97 4876 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
4877 /* Maximum greed exceeded */
4878 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4879 && ckWARN(WARN_REGEXP)
4880 && !(PL_reg_flags & RF_warned))
4881 {
3280af22 4882 PL_reg_flags |= RF_warned;
c476f425 4883 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
4884 "Complex regular subexpression recursion "
4885 "limit (%d) exceeded",
c476f425 4886 REG_INFTY - 1);
a0d0e21e 4887 }
c476f425 4888 cur_curlyx->u.curlyx.count--;
3ab3c9b4 4889 CACHEsayNO;
a0d0e21e 4890 }
c476f425
DM
4891
4892 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4893 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4894 );
4895 /* Try grabbing another A and see if it helps. */
4896 PL_reginput = locinput;
4897 cur_curlyx->u.curlyx.lastloc = locinput;
4898 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4899 REGCP_SET(ST.lastcp);
d02d6d97
DM
4900 PUSH_STATE_GOTO(WHILEM_A_min,
4901 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
5f66b61c 4902 /* NOTREACHED */
40a82448
DM
4903
4904#undef ST
4905#define ST st->u.branch
4906
4907 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
4908 next = scan + ARG(scan);
4909 if (next == scan)
4910 next = NULL;
40a82448
DM
4911 scan = NEXTOPER(scan);
4912 /* FALL THROUGH */
c277df42 4913
40a82448
DM
4914 case BRANCH: /* /(...|A|...)/ */
4915 scan = NEXTOPER(scan); /* scan now points to inner node */
40a82448
DM
4916 ST.lastparen = *PL_reglastparen;
4917 ST.next_branch = next;
4918 REGCP_SET(ST.cp);
4919 PL_reginput = locinput;
02db2b7b 4920
40a82448 4921 /* Now go into the branch */
5d458dd8
YO
4922 if (has_cutgroup) {
4923 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4924 } else {
4925 PUSH_STATE_GOTO(BRANCH_next, scan);
4926 }
40a82448 4927 /* NOTREACHED */
5d458dd8
YO
4928 case CUTGROUP:
4929 PL_reginput = locinput;
4930 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 4931 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
4932 PUSH_STATE_GOTO(CUTGROUP_next,next);
4933 /* NOTREACHED */
4934 case CUTGROUP_next_fail:
4935 do_cutgroup = 1;
4936 no_final = 1;
4937 if (st->u.mark.mark_name)
4938 sv_commit = st->u.mark.mark_name;
4939 sayNO;
4940 /* NOTREACHED */
4941 case BRANCH_next:
4942 sayYES;
4943 /* NOTREACHED */
40a82448 4944 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
4945 if (do_cutgroup) {
4946 do_cutgroup = 0;
4947 no_final = 0;
4948 }
40a82448
DM
4949 REGCP_UNWIND(ST.cp);
4950 for (n = *PL_reglastparen; n > ST.lastparen; n--)
f0ab9afb 4951 PL_regoffs[n].end = -1;
40a82448 4952 *PL_reglastparen = n;
0a4db386 4953 /*dmq: *PL_reglastcloseparen = n; */
40a82448
DM
4954 scan = ST.next_branch;
4955 /* no more branches? */
5d458dd8
YO
4956 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4957 DEBUG_EXECUTE_r({
4958 PerlIO_printf( Perl_debug_log,
4959 "%*s %sBRANCH failed...%s\n",
4960 REPORT_CODE_OFF+depth*2, "",
4961 PL_colors[4],
4962 PL_colors[5] );
4963 });
4964 sayNO_SILENT;
4965 }
40a82448
DM
4966 continue; /* execute next BRANCH[J] op */
4967 /* NOTREACHED */
4968
a0d0e21e 4969 case MINMOD:
24d3c4a9 4970 minmod = 1;
a0d0e21e 4971 break;
40a82448
DM
4972
4973#undef ST
4974#define ST st->u.curlym
4975
4976 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4977
4978 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 4979 * only a single backtracking state, no matter how many matches
40a82448
DM
4980 * there are in {m,n}. It relies on the pattern being constant
4981 * length, with no parens to influence future backrefs
4982 */
4983
4984 ST.me = scan;
dc45a647 4985 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448
DM
4986
4987 /* if paren positive, emulate an OPEN/CLOSE around A */
4988 if (ST.me->flags) {
3b6647e0 4989 U32 paren = ST.me->flags;
40a82448
DM
4990 if (paren > PL_regsize)
4991 PL_regsize = paren;
3b6647e0 4992 if (paren > *PL_reglastparen)
40a82448 4993 *PL_reglastparen = paren;
c277df42 4994 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 4995 }
40a82448
DM
4996 ST.A = scan;
4997 ST.B = next;
4998 ST.alen = 0;
4999 ST.count = 0;
24d3c4a9
DM
5000 ST.minmod = minmod;
5001 minmod = 0;
40a82448
DM
5002 ST.c1 = CHRTEST_UNINIT;
5003 REGCP_SET(ST.cp);
6407bf3b 5004
40a82448
DM
5005 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5006 goto curlym_do_B;
5007
5008 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 5009 PL_reginput = locinput;
40a82448
DM
5010 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
5011 /* NOTREACHED */
5f80c4cf 5012
40a82448
DM
5013 case CURLYM_A: /* we've just matched an A */
5014 locinput = st->locinput;
5015 nextchr = UCHARAT(locinput);
5016
5017 ST.count++;
5018 /* after first match, determine A's length: u.curlym.alen */
5019 if (ST.count == 1) {
5020 if (PL_reg_match_utf8) {
5021 char *s = locinput;
5022 while (s < PL_reginput) {
5023 ST.alen++;
5024 s += UTF8SKIP(s);
5025 }
5026 }
5027 else {
5028 ST.alen = PL_reginput - locinput;
5029 }
5030 if (ST.alen == 0)
5031 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5032 }
0cadcf80
DM
5033 DEBUG_EXECUTE_r(
5034 PerlIO_printf(Perl_debug_log,
40a82448 5035 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5036 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5037 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5038 );
5039
40a82448 5040 locinput = PL_reginput;
0a4db386
YO
5041
5042 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5043 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5044 goto fake_end;
5045
c966426a
DM
5046 {
5047 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5048 if ( max == REG_INFTY || ST.count < max )
5049 goto curlym_do_A; /* try to match another A */
5050 }
40a82448 5051 goto curlym_do_B; /* try to match B */
5f80c4cf 5052
40a82448
DM
5053 case CURLYM_A_fail: /* just failed to match an A */
5054 REGCP_UNWIND(ST.cp);
0a4db386
YO
5055
5056 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5057 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5058 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5059 sayNO;
0cadcf80 5060
40a82448
DM
5061 curlym_do_B: /* execute the B in /A{m,n}B/ */
5062 PL_reginput = locinput;
5063 if (ST.c1 == CHRTEST_UNINIT) {
5064 /* calculate c1 and c2 for possible match of 1st char
5065 * following curly */
5066 ST.c1 = ST.c2 = CHRTEST_VOID;
5067 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5068 regnode *text_node = ST.B;
5069 if (! HAS_TEXT(text_node))
5070 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5071 /* this used to be
5072
5073 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5074
5075 But the former is redundant in light of the latter.
5076
5077 if this changes back then the macro for
5078 IS_TEXT and friends need to change.
5079 */
5080 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5081 {
ee9b8eae 5082
40a82448 5083 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5084 switch (OP(text_node)) {
5085 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5086 case EXACTFA:
3c760661 5087 case EXACTFU_SS:
ca600955 5088 case EXACTFU_NO_TRIE:
9a5a5549
KW
5089 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5090 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5091 default: ST.c2 = ST.c1;
5092 }
c277df42 5093 }
c277df42 5094 }
40a82448
DM
5095 }
5096
5097 DEBUG_EXECUTE_r(
5098 PerlIO_printf(Perl_debug_log,
5099 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5100 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5101 "", (IV)ST.count)
5102 );
5103 if (ST.c1 != CHRTEST_VOID
5104 && UCHARAT(PL_reginput) != ST.c1
5105 && UCHARAT(PL_reginput) != ST.c2)
5106 {
5107 /* simulate B failing */
3e901dc0
YO
5108 DEBUG_OPTIMISE_r(
5109 PerlIO_printf(Perl_debug_log,
5110 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5111 (int)(REPORT_CODE_OFF+(depth*2)),"",
5112 (IV)ST.c1,(IV)ST.c2
5113 ));
40a82448
DM
5114 state_num = CURLYM_B_fail;
5115 goto reenter_switch;
5116 }
5117
5118 if (ST.me->flags) {
5119 /* mark current A as captured */
5120 I32 paren = ST.me->flags;
5121 if (ST.count) {
f0ab9afb 5122 PL_regoffs[paren].start
40a82448 5123 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
f0ab9afb 5124 PL_regoffs[paren].end = PL_reginput - PL_bostr;
0a4db386 5125 /*dmq: *PL_reglastcloseparen = paren; */
c277df42 5126 }
40a82448 5127 else
f0ab9afb 5128 PL_regoffs[paren].end = -1;
0a4db386 5129 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5130 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5131 {
5132 if (ST.count)
5133 goto fake_end;
5134 else
5135 sayNO;
5136 }
c277df42 5137 }
0a4db386 5138
40a82448 5139 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5f66b61c 5140 /* NOTREACHED */
40a82448
DM
5141
5142 case CURLYM_B_fail: /* just failed to match a B */
5143 REGCP_UNWIND(ST.cp);
5144 if (ST.minmod) {
84d2fa14
HS
5145 I32 max = ARG2(ST.me);
5146 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5147 sayNO;
5148 goto curlym_do_A; /* try to match a further A */
5149 }
5150 /* backtrack one A */
5151 if (ST.count == ARG1(ST.me) /* min */)
5152 sayNO;
5153 ST.count--;
5154 locinput = HOPc(locinput, -ST.alen);
5155 goto curlym_do_B; /* try to match B */
5156
c255a977
DM
5157#undef ST
5158#define ST st->u.curly
40a82448 5159
c255a977
DM
5160#define CURLY_SETPAREN(paren, success) \
5161 if (paren) { \
5162 if (success) { \
f0ab9afb
NC
5163 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5164 PL_regoffs[paren].end = locinput - PL_bostr; \
0a4db386 5165 *PL_reglastcloseparen = paren; \
c255a977
DM
5166 } \
5167 else \
f0ab9afb 5168 PL_regoffs[paren].end = -1; \
c255a977
DM
5169 }
5170
5171 case STAR: /* /A*B/ where A is width 1 */
5172 ST.paren = 0;
5173 ST.min = 0;
5174 ST.max = REG_INFTY;
a0d0e21e
LW
5175 scan = NEXTOPER(scan);
5176 goto repeat;
c255a977
DM
5177 case PLUS: /* /A+B/ where A is width 1 */
5178 ST.paren = 0;
5179 ST.min = 1;
5180 ST.max = REG_INFTY;
c277df42 5181 scan = NEXTOPER(scan);
c255a977
DM
5182 goto repeat;
5183 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5184 ST.paren = scan->flags; /* Which paren to set */
5185 if (ST.paren > PL_regsize)
5186 PL_regsize = ST.paren;
3b6647e0 5187 if (ST.paren > *PL_reglastparen)
c255a977
DM
5188 *PL_reglastparen = ST.paren;
5189 ST.min = ARG1(scan); /* min to match */
5190 ST.max = ARG2(scan); /* max to match */
0a4db386 5191 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5192 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5193 ST.min=1;
5194 ST.max=1;
5195 }
c255a977
DM
5196 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5197 goto repeat;
5198 case CURLY: /* /A{m,n}B/ where A is width 1 */
5199 ST.paren = 0;
5200 ST.min = ARG1(scan); /* min to match */
5201 ST.max = ARG2(scan); /* max to match */
5202 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5203 repeat:
a0d0e21e
LW
5204 /*
5205 * Lookahead to avoid useless match attempts
5206 * when we know what character comes next.
c255a977 5207 *
5f80c4cf
JP
5208 * Used to only do .*x and .*?x, but now it allows
5209 * for )'s, ('s and (?{ ... })'s to be in the way
5210 * of the quantifier and the EXACT-like node. -- japhy
5211 */
5212
c255a977
DM
5213 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5214 sayNO;
cca55fe3 5215 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5216 U8 *s;
5217 regnode *text_node = next;
5218
3dab1dad
YO
5219 if (! HAS_TEXT(text_node))
5220 FIND_NEXT_IMPT(text_node);
5f80c4cf 5221
9e137952 5222 if (! HAS_TEXT(text_node))
c255a977 5223 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5224 else {
ee9b8eae 5225 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5226 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5227 goto assume_ok_easy;
cca55fe3 5228 }
be8e71aa
YO
5229 else
5230 s = (U8*)STRING(text_node);
ee9b8eae
YO
5231
5232 /* Currently we only get here when
5233
5234 PL_rekind[OP(text_node)] == EXACT
5235
5236 if this changes back then the macro for IS_TEXT and
5237 friends need to change. */
f2ed9b32 5238 if (!UTF_PATTERN) {
9a5a5549
KW
5239 ST.c1 = *s;
5240 switch (OP(text_node)) {
5241 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5242 case EXACTFA:
3c760661 5243 case EXACTFU_SS:
ca600955 5244 case EXACTFU_NO_TRIE:
9a5a5549
KW
5245 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5246 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5247 default: ST.c2 = ST.c1; break;
5248 }
1aa99e6b 5249 }
f2ed9b32 5250 else { /* UTF_PATTERN */
9a5a5549 5251 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
a2a2844f 5252 STRLEN ulen1, ulen2;
89ebb4a3
JH
5253 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5254 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
5255
5256 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5257 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
e294cc5d
JH
5258#ifdef EBCDIC
5259 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5260 ckWARN(WARN_UTF8) ?
5261 0 : UTF8_ALLOW_ANY);
5262 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5263 ckWARN(WARN_UTF8) ?
5264 0 : UTF8_ALLOW_ANY);
5265#else
c255a977 5266 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
e294cc5d 5267 uniflags);
c255a977 5268 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
e294cc5d
JH
5269 uniflags);
5270#endif
5f80c4cf
JP
5271 }
5272 else {
c255a977 5273 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5274 uniflags);
5f80c4cf 5275 }
1aa99e6b
IH
5276 }
5277 }
bbce6d69 5278 }
a0d0e21e 5279 else
c255a977 5280 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5281 assume_ok_easy:
c255a977
DM
5282
5283 ST.A = scan;
5284 ST.B = next;
3280af22 5285 PL_reginput = locinput;
24d3c4a9
DM
5286 if (minmod) {
5287 minmod = 0;
e2e6a0f1 5288 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 5289 sayNO;
c255a977 5290 ST.count = ST.min;
a0ed51b3 5291 locinput = PL_reginput;
c255a977
DM
5292 REGCP_SET(ST.cp);
5293 if (ST.c1 == CHRTEST_VOID)
5294 goto curly_try_B_min;
5295
5296 ST.oldloc = locinput;
5297
5298 /* set ST.maxpos to the furthest point along the
5299 * string that could possibly match */
5300 if (ST.max == REG_INFTY) {
5301 ST.maxpos = PL_regeol - 1;
f2ed9b32 5302 if (utf8_target)
c255a977
DM
5303 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5304 ST.maxpos--;
5305 }
f2ed9b32 5306 else if (utf8_target) {
c255a977
DM
5307 int m = ST.max - ST.min;
5308 for (ST.maxpos = locinput;
5309 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5310 ST.maxpos += UTF8SKIP(ST.maxpos);
5311 }
5312 else {
5313 ST.maxpos = locinput + ST.max - ST.min;
5314 if (ST.maxpos >= PL_regeol)
5315 ST.maxpos = PL_regeol - 1;
5316 }
5317 goto curly_try_B_min_known;
5318
5319 }
5320 else {
e2e6a0f1 5321 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
5322 locinput = PL_reginput;
5323 if (ST.count < ST.min)
5324 sayNO;
5325 if ((ST.count > ST.min)
5326 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5327 {
5328 /* A{m,n} must come at the end of the string, there's
5329 * no point in backing off ... */
5330 ST.min = ST.count;
5331 /* ...except that $ and \Z can match before *and* after
5332 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5333 We may back off by one in this case. */
5334 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5335 ST.min--;
5336 }
5337 REGCP_SET(ST.cp);
5338 goto curly_try_B_max;
5339 }
5340 /* NOTREACHED */
5341
5342
5343 case CURLY_B_min_known_fail:
5344 /* failed to find B in a non-greedy match where c1,c2 valid */
5345 if (ST.paren && ST.count)
f0ab9afb 5346 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5347
5348 PL_reginput = locinput; /* Could be reset... */
5349 REGCP_UNWIND(ST.cp);
5350 /* Couldn't or didn't -- move forward. */
5351 ST.oldloc = locinput;
f2ed9b32 5352 if (utf8_target)
c255a977
DM
5353 locinput += UTF8SKIP(locinput);
5354 else
5355 locinput++;
5356 ST.count++;
5357 curly_try_B_min_known:
5358 /* find the next place where 'B' could work, then call B */
5359 {
5360 int n;
f2ed9b32 5361 if (utf8_target) {
c255a977
DM
5362 n = (ST.oldloc == locinput) ? 0 : 1;
5363 if (ST.c1 == ST.c2) {
5364 STRLEN len;
5365 /* set n to utf8_distance(oldloc, locinput) */
5366 while (locinput <= ST.maxpos &&
5367 utf8n_to_uvchr((U8*)locinput,
5368 UTF8_MAXBYTES, &len,
5369 uniflags) != (UV)ST.c1) {
5370 locinput += len;
5371 n++;
5372 }
1aa99e6b
IH
5373 }
5374 else {
c255a977
DM
5375 /* set n to utf8_distance(oldloc, locinput) */
5376 while (locinput <= ST.maxpos) {
5377 STRLEN len;
5378 const UV c = utf8n_to_uvchr((U8*)locinput,
5379 UTF8_MAXBYTES, &len,
5380 uniflags);
5381 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5382 break;
5383 locinput += len;
5384 n++;
1aa99e6b 5385 }
0fe9bf95
IZ
5386 }
5387 }
c255a977
DM
5388 else {
5389 if (ST.c1 == ST.c2) {
5390 while (locinput <= ST.maxpos &&
5391 UCHARAT(locinput) != ST.c1)
5392 locinput++;
bbce6d69 5393 }
c255a977
DM
5394 else {
5395 while (locinput <= ST.maxpos
5396 && UCHARAT(locinput) != ST.c1
5397 && UCHARAT(locinput) != ST.c2)
5398 locinput++;
a0ed51b3 5399 }
c255a977
DM
5400 n = locinput - ST.oldloc;
5401 }
5402 if (locinput > ST.maxpos)
5403 sayNO;
5404 /* PL_reginput == oldloc now */
5405 if (n) {
5406 ST.count += n;
e2e6a0f1 5407 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5408 sayNO;
a0d0e21e 5409 }
c255a977
DM
5410 PL_reginput = locinput;
5411 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5412 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5413 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5414 goto fake_end;
5415 }
c255a977 5416 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5417 }
c255a977
DM
5418 /* NOTREACHED */
5419
5420
5421 case CURLY_B_min_fail:
5422 /* failed to find B in a non-greedy match where c1,c2 invalid */
5423 if (ST.paren && ST.count)
f0ab9afb 5424 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5425
5426 REGCP_UNWIND(ST.cp);
5427 /* failed -- move forward one */
5428 PL_reginput = locinput;
e2e6a0f1 5429 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5430 ST.count++;
a0ed51b3 5431 locinput = PL_reginput;
c255a977
DM
5432 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5433 ST.count > 0)) /* count overflow ? */
15272685 5434 {
c255a977
DM
5435 curly_try_B_min:
5436 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5437 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5438 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5439 goto fake_end;
5440 }
c255a977 5441 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5442 }
5443 }
4633a7c4 5444 sayNO;
c255a977
DM
5445 /* NOTREACHED */
5446
5447
5448 curly_try_B_max:
5449 /* a successful greedy match: now try to match B */
40d049e4 5450 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5451 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5452 goto fake_end;
5453 }
c255a977
DM
5454 {
5455 UV c = 0;
5456 if (ST.c1 != CHRTEST_VOID)
f2ed9b32 5457 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
c255a977 5458 UTF8_MAXBYTES, 0, uniflags)
466787eb 5459 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5460 /* If it could work, try it. */
5461 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5462 CURLY_SETPAREN(ST.paren, ST.count);
5463 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5464 /* NOTREACHED */
5465 }
5466 }
5467 /* FALL THROUGH */
5468 case CURLY_B_max_fail:
5469 /* failed to find B in a greedy match */
5470 if (ST.paren && ST.count)
f0ab9afb 5471 PL_regoffs[ST.paren].end = -1;
c255a977
DM
5472
5473 REGCP_UNWIND(ST.cp);
5474 /* back up. */
5475 if (--ST.count < ST.min)
5476 sayNO;
5477 PL_reginput = locinput = HOPc(locinput, -1);
5478 goto curly_try_B_max;
5479
5480#undef ST
5481
a0d0e21e 5482 case END:
6bda09f9 5483 fake_end:
faec1544
DM
5484 if (cur_eval) {
5485 /* we've just finished A in /(??{A})B/; now continue with B */
5486 I32 tmpix;
faec1544
DM
5487 st->u.eval.toggle_reg_flags
5488 = cur_eval->u.eval.toggle_reg_flags;
5489 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5490
288b8c02
NC
5491 st->u.eval.prev_rex = rex_sv; /* inner */
5492 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5493 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5494 rexi = RXi_GET(rex);
faec1544 5495 cur_curlyx = cur_eval->u.eval.prev_curlyx;
cd733337 5496 (void)ReREFCNT_inc(rex_sv);
faec1544 5497 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
34a81e2b
B
5498
5499 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5500 PL_reglastparen = &rex->lastparen;
5501 PL_reglastcloseparen = &rex->lastcloseparen;
5502
faec1544
DM
5503 REGCP_SET(st->u.eval.lastcp);
5504 PL_reginput = locinput;
5505
5506 /* Restore parens of the outer rex without popping the
5507 * savestack */
5508 tmpix = PL_savestack_ix;
5509 PL_savestack_ix = cur_eval->u.eval.lastcp;
5510 regcppop(rex);
5511 PL_savestack_ix = tmpix;
5512
5513 st->u.eval.prev_eval = cur_eval;
5514 cur_eval = cur_eval->u.eval.prev_eval;
5515 DEBUG_EXECUTE_r(
2a49f0f5
JH
5516 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5517 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5518 if ( nochange_depth )
5519 nochange_depth--;
5520
5521 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5522 st->u.eval.prev_eval->u.eval.B); /* match B */
5523 }
5524
3b0527fe 5525 if (locinput < reginfo->till) {
a3621e74 5526 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5527 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5528 PL_colors[4],
5529 (long)(locinput - PL_reg_starttry),
3b0527fe 5530 (long)(reginfo->till - PL_reg_starttry),
7821416a 5531 PL_colors[5]));
58e23c8d 5532
262b90c4 5533 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5534 }
5535 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5536 sayYES; /* Success! */
dad79028
DM
5537
5538 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5539 DEBUG_EXECUTE_r(
5540 PerlIO_printf(Perl_debug_log,
5541 "%*s %ssubpattern success...%s\n",
5bc10b2c 5542 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5543 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5544 sayYES; /* Success! */
dad79028 5545
40a82448
DM
5546#undef ST
5547#define ST st->u.ifmatch
5548
5549 case SUSPEND: /* (?>A) */
5550 ST.wanted = 1;
9fe1d20c 5551 PL_reginput = locinput;
9041c2e3 5552 goto do_ifmatch;
dad79028 5553
40a82448
DM
5554 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5555 ST.wanted = 0;
dad79028
DM
5556 goto ifmatch_trivial_fail_test;
5557
40a82448
DM
5558 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5559 ST.wanted = 1;
dad79028 5560 ifmatch_trivial_fail_test:
a0ed51b3 5561 if (scan->flags) {
52657f30 5562 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5563 if (!s) {
5564 /* trivial fail */
24d3c4a9
DM
5565 if (logical) {
5566 logical = 0;
f2338a2e 5567 sw = 1 - cBOOL(ST.wanted);
dad79028 5568 }
40a82448 5569 else if (ST.wanted)
dad79028
DM
5570 sayNO;
5571 next = scan + ARG(scan);
5572 if (next == scan)
5573 next = NULL;
5574 break;
5575 }
efb30f32 5576 PL_reginput = s;
a0ed51b3
LW
5577 }
5578 else
5579 PL_reginput = locinput;
5580
c277df42 5581 do_ifmatch:
40a82448 5582 ST.me = scan;
24d3c4a9 5583 ST.logical = logical;
24d786f4
YO
5584 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5585
40a82448
DM
5586 /* execute body of (?...A) */
5587 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5588 /* NOTREACHED */
5589
5590 case IFMATCH_A_fail: /* body of (?...A) failed */
5591 ST.wanted = !ST.wanted;
5592 /* FALL THROUGH */
5593
5594 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5595 if (ST.logical) {
f2338a2e 5596 sw = cBOOL(ST.wanted);
40a82448
DM
5597 }
5598 else if (!ST.wanted)
5599 sayNO;
5600
5601 if (OP(ST.me) == SUSPEND)
5602 locinput = PL_reginput;
5603 else {
5604 locinput = PL_reginput = st->locinput;
5605 nextchr = UCHARAT(locinput);
5606 }
5607 scan = ST.me + ARG(ST.me);
5608 if (scan == ST.me)
5609 scan = NULL;
5610 continue; /* execute B */
5611
5612#undef ST
dad79028 5613
c277df42 5614 case LONGJMP:
c277df42
IZ
5615 next = scan + ARG(scan);
5616 if (next == scan)
5617 next = NULL;
a0d0e21e 5618 break;
54612592 5619 case COMMIT:
e2e6a0f1
YO
5620 reginfo->cutpoint = PL_regeol;
5621 /* FALLTHROUGH */
5d458dd8 5622 case PRUNE:
24b23f37 5623 PL_reginput = locinput;
e2e6a0f1 5624 if (!scan->flags)
ad64d0ec 5625 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592
YO
5626 PUSH_STATE_GOTO(COMMIT_next,next);
5627 /* NOTREACHED */
5628 case COMMIT_next_fail:
5629 no_final = 1;
5630 /* FALLTHROUGH */
7f69552c
YO
5631 case OPFAIL:
5632 sayNO;
e2e6a0f1
YO
5633 /* NOTREACHED */
5634
5635#define ST st->u.mark
5636 case MARKPOINT:
5637 ST.prev_mark = mark_state;
5d458dd8 5638 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5639 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5640 mark_state = st;
5641 ST.mark_loc = PL_reginput = locinput;
5642 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5643 /* NOTREACHED */
5644 case MARKPOINT_next:
5645 mark_state = ST.prev_mark;
5646 sayYES;
5647 /* NOTREACHED */
5648 case MARKPOINT_next_fail:
5d458dd8 5649 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5650 {
5651 if (ST.mark_loc > startpoint)
5652 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5653 popmark = NULL; /* we found our mark */
5654 sv_commit = ST.mark_name;
5655
5656 DEBUG_EXECUTE_r({
5d458dd8 5657 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5658 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5659 REPORT_CODE_OFF+depth*2, "",
be2597df 5660 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5661 });
5662 }
5663 mark_state = ST.prev_mark;
5d458dd8
YO
5664 sv_yes_mark = mark_state ?
5665 mark_state->u.mark.mark_name : NULL;
e2e6a0f1
YO
5666 sayNO;
5667 /* NOTREACHED */
5d458dd8
YO
5668 case SKIP:
5669 PL_reginput = locinput;
5670 if (scan->flags) {
2bf803e2 5671 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5672 ST.mark_name = NULL;
e2e6a0f1 5673 ST.mark_loc = locinput;
5d458dd8
YO
5674 PUSH_STATE_GOTO(SKIP_next,next);
5675 } else {
2bf803e2 5676 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5677 otherwise do nothing. Meaning we need to scan
5678 */
5679 regmatch_state *cur = mark_state;
ad64d0ec 5680 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5681
5682 while (cur) {
5683 if ( sv_eq( cur->u.mark.mark_name,
5684 find ) )
5685 {
5686 ST.mark_name = find;
5687 PUSH_STATE_GOTO( SKIP_next, next );
5688 }
5689 cur = cur->u.mark.prev_mark;
5690 }
e2e6a0f1 5691 }
2bf803e2 5692 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5693 break;
5694 case SKIP_next_fail:
5695 if (ST.mark_name) {
5696 /* (*CUT:NAME) - Set up to search for the name as we
5697 collapse the stack*/
5698 popmark = ST.mark_name;
5699 } else {
5700 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5701 if (ST.mark_loc > startpoint)
5702 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5703 /* but we set sv_commit to latest mark_name if there
5704 is one so they can test to see how things lead to this
5705 cut */
5706 if (mark_state)
5707 sv_commit=mark_state->u.mark.mark_name;
5708 }
e2e6a0f1
YO
5709 no_final = 1;
5710 sayNO;
5711 /* NOTREACHED */
5712#undef ST
e1d1eefb 5713 case LNBREAK:
f2ed9b32 5714 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5715 locinput += n;
5716 nextchr = UCHARAT(locinput);
5717 } else
5718 sayNO;
5719 break;
5720
5721#define CASE_CLASS(nAmE) \
5722 case nAmE: \
1380b51d
DM
5723 if (locinput >= PL_regeol) \
5724 sayNO; \
f2ed9b32 5725 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5726 locinput += n; \
5727 nextchr = UCHARAT(locinput); \
5728 } else \
5729 sayNO; \
5730 break; \
5731 case N##nAmE: \
1380b51d
DM
5732 if (locinput >= PL_regeol) \
5733 sayNO; \
f2ed9b32 5734 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5735 sayNO; \
5736 } else { \
5737 locinput += UTF8SKIP(locinput); \
5738 nextchr = UCHARAT(locinput); \
5739 } \
5740 break
5741
5742 CASE_CLASS(VERTWS);
5743 CASE_CLASS(HORIZWS);
5744#undef CASE_CLASS
5745
a0d0e21e 5746 default:
b900a521 5747 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5748 PTR2UV(scan), OP(scan));
cea2e8a9 5749 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5750
5751 } /* end switch */
95b24440 5752
5d458dd8
YO
5753 /* switch break jumps here */
5754 scan = next; /* prepare to execute the next op and ... */
5755 continue; /* ... jump back to the top, reusing st */
95b24440
DM
5756 /* NOTREACHED */
5757
40a82448
DM
5758 push_yes_state:
5759 /* push a state that backtracks on success */
5760 st->u.yes.prev_yes_state = yes_state;
5761 yes_state = st;
5762 /* FALL THROUGH */
5763 push_state:
5764 /* push a new regex state, then continue at scan */
5765 {
5766 regmatch_state *newst;
5767
24b23f37
YO
5768 DEBUG_STACK_r({
5769 regmatch_state *cur = st;
5770 regmatch_state *curyes = yes_state;
5771 int curd = depth;
5772 regmatch_slab *slab = PL_regmatch_slab;
5773 for (;curd > -1;cur--,curd--) {
5774 if (cur < SLAB_FIRST(slab)) {
5775 slab = slab->prev;
5776 cur = SLAB_LAST(slab);
5777 }
5778 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5779 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5780 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5781 (curyes == cur) ? "yes" : ""
5782 );
5783 if (curyes == cur)
5784 curyes = cur->u.yes.prev_yes_state;
5785 }
5786 } else
5787 DEBUG_STATE_pp("push")
5788 );
40a82448 5789 depth++;
40a82448
DM
5790 st->locinput = locinput;
5791 newst = st+1;
5792 if (newst > SLAB_LAST(PL_regmatch_slab))
5793 newst = S_push_slab(aTHX);
5794 PL_regmatch_state = newst;
786e8c11 5795
40a82448
DM
5796 locinput = PL_reginput;
5797 nextchr = UCHARAT(locinput);
5798 st = newst;
5799 continue;
5800 /* NOTREACHED */
5801 }
a0d0e21e 5802 }
a687059c 5803
a0d0e21e
LW
5804 /*
5805 * We get here only if there's trouble -- normally "case END" is
5806 * the terminating point.
5807 */
cea2e8a9 5808 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5809 /*NOTREACHED*/
4633a7c4
LW
5810 sayNO;
5811
262b90c4 5812yes:
77cb431f
DM
5813 if (yes_state) {
5814 /* we have successfully completed a subexpression, but we must now
5815 * pop to the state marked by yes_state and continue from there */
77cb431f 5816 assert(st != yes_state);
5bc10b2c
DM
5817#ifdef DEBUGGING
5818 while (st != yes_state) {
5819 st--;
5820 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5821 PL_regmatch_slab = PL_regmatch_slab->prev;
5822 st = SLAB_LAST(PL_regmatch_slab);
5823 }
e2e6a0f1 5824 DEBUG_STATE_r({
54612592
YO
5825 if (no_final) {
5826 DEBUG_STATE_pp("pop (no final)");
5827 } else {
5828 DEBUG_STATE_pp("pop (yes)");
5829 }
e2e6a0f1 5830 });
5bc10b2c
DM
5831 depth--;
5832 }
5833#else
77cb431f
DM
5834 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5835 || yes_state > SLAB_LAST(PL_regmatch_slab))
5836 {
5837 /* not in this slab, pop slab */
5838 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5839 PL_regmatch_slab = PL_regmatch_slab->prev;
5840 st = SLAB_LAST(PL_regmatch_slab);
5841 }
5842 depth -= (st - yes_state);
5bc10b2c 5843#endif
77cb431f
DM
5844 st = yes_state;
5845 yes_state = st->u.yes.prev_yes_state;
5846 PL_regmatch_state = st;
24b23f37 5847
5d458dd8
YO
5848 if (no_final) {
5849 locinput= st->locinput;
5850 nextchr = UCHARAT(locinput);
5851 }
54612592 5852 state_num = st->resume_state + no_final;
24d3c4a9 5853 goto reenter_switch;
77cb431f
DM
5854 }
5855
a3621e74 5856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5857 PL_colors[4], PL_colors[5]));
02db2b7b 5858
19b95bf0
DM
5859 if (PL_reg_eval_set) {
5860 /* each successfully executed (?{...}) block does the equivalent of
5861 * local $^R = do {...}
5862 * When popping the save stack, all these locals would be undone;
5863 * bypass this by setting the outermost saved $^R to the latest
5864 * value */
5865 if (oreplsv != GvSV(PL_replgv))
5866 sv_setsv(oreplsv, GvSV(PL_replgv));
5867 }
95b24440 5868 result = 1;
aa283a38 5869 goto final_exit;
4633a7c4
LW
5870
5871no:
a3621e74 5872 DEBUG_EXECUTE_r(
7821416a 5873 PerlIO_printf(Perl_debug_log,
786e8c11 5874 "%*s %sfailed...%s\n",
5bc10b2c 5875 REPORT_CODE_OFF+depth*2, "",
786e8c11 5876 PL_colors[4], PL_colors[5])
7821416a 5877 );
aa283a38 5878
262b90c4 5879no_silent:
54612592
YO
5880 if (no_final) {
5881 if (yes_state) {
5882 goto yes;
5883 } else {
5884 goto final_exit;
5885 }
5886 }
aa283a38
DM
5887 if (depth) {
5888 /* there's a previous state to backtrack to */
40a82448
DM
5889 st--;
5890 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5891 PL_regmatch_slab = PL_regmatch_slab->prev;
5892 st = SLAB_LAST(PL_regmatch_slab);
5893 }
5894 PL_regmatch_state = st;
40a82448
DM
5895 locinput= st->locinput;
5896 nextchr = UCHARAT(locinput);
5897
5bc10b2c
DM
5898 DEBUG_STATE_pp("pop");
5899 depth--;
262b90c4
DM
5900 if (yes_state == st)
5901 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 5902
24d3c4a9
DM
5903 state_num = st->resume_state + 1; /* failure = success + 1 */
5904 goto reenter_switch;
95b24440 5905 }
24d3c4a9 5906 result = 0;
aa283a38 5907
262b90c4 5908 final_exit:
bbe252da 5909 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
5910 SV *sv_err = get_sv("REGERROR", 1);
5911 SV *sv_mrk = get_sv("REGMARK", 1);
5912 if (result) {
e2e6a0f1 5913 sv_commit = &PL_sv_no;
5d458dd8
YO
5914 if (!sv_yes_mark)
5915 sv_yes_mark = &PL_sv_yes;
5916 } else {
5917 if (!sv_commit)
5918 sv_commit = &PL_sv_yes;
5919 sv_yes_mark = &PL_sv_no;
5920 }
5921 sv_setsv(sv_err, sv_commit);
5922 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 5923 }
19b95bf0 5924
2f554ef7
DM
5925 /* clean up; in particular, free all slabs above current one */
5926 LEAVE_SCOPE(oldsave);
5d9a96ca 5927
95b24440 5928 return result;
a687059c
LW
5929}
5930
5931/*
5932 - regrepeat - repeatedly match something simple, report how many
5933 */
5934/*
5935 * [This routine now assumes that it will only match on things of length 1.
5936 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 5937 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 5938 */
76e3520e 5939STATIC I32
e2e6a0f1 5940S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 5941{
27da23d5 5942 dVAR;
a0d0e21e 5943 register char *scan;
a0d0e21e 5944 register I32 c;
3280af22 5945 register char *loceol = PL_regeol;
a0ed51b3 5946 register I32 hardcount = 0;
f2ed9b32 5947 register bool utf8_target = PL_reg_match_utf8;
d513472c 5948 UV utf8_flags;
4f55667c
SP
5949#ifndef DEBUGGING
5950 PERL_UNUSED_ARG(depth);
5951#endif
a0d0e21e 5952
7918f24d
NC
5953 PERL_ARGS_ASSERT_REGREPEAT;
5954
3280af22 5955 scan = PL_reginput;
faf11cac
HS
5956 if (max == REG_INFTY)
5957 max = I32_MAX;
5958 else if (max < loceol - scan)
7f596f4c 5959 loceol = scan + max;
a0d0e21e 5960 switch (OP(p)) {
22c35a8c 5961 case REG_ANY:
f2ed9b32 5962 if (utf8_target) {
ffc61ed2 5963 loceol = PL_regeol;
1aa99e6b 5964 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
5965 scan += UTF8SKIP(scan);
5966 hardcount++;
5967 }
5968 } else {
5969 while (scan < loceol && *scan != '\n')
5970 scan++;
a0ed51b3
LW
5971 }
5972 break;
ffc61ed2 5973 case SANY:
f2ed9b32 5974 if (utf8_target) {
def8e4ea 5975 loceol = PL_regeol;
a0804c9e 5976 while (scan < loceol && hardcount < max) {
def8e4ea
JH
5977 scan += UTF8SKIP(scan);
5978 hardcount++;
5979 }
5980 }
5981 else
5982 scan = loceol;
a0ed51b3 5983 break;
f33976b4
DB
5984 case CANY:
5985 scan = loceol;
5986 break;
59d32103
KW
5987 case EXACT:
5988 /* To get here, EXACTish nodes must have *byte* length == 1. That
5989 * means they match only characters in the string that can be expressed
5990 * as a single byte. For non-utf8 strings, that means a simple match.
5991 * For utf8 strings, the character matched must be an invariant, or
5992 * downgradable to a single byte. The pattern's utf8ness is
5993 * irrelevant, as since it's a single byte, it either isn't utf8, or if
5994 * it is, it's an invariant */
5995
5996 c = (U8)*STRING(p);
5997 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5998
5999 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6000 while (scan < loceol && UCHARAT(scan) == c) {
6001 scan++;
6002 }
6003 }
6004 else {
6005
6006 /* Here, the string is utf8, and the pattern char is different
6007 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6008 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6009 * look for those in sequence in the utf8 string */
6010 U8 high = UTF8_TWO_BYTE_HI(c);
6011 U8 low = UTF8_TWO_BYTE_LO(c);
6012 loceol = PL_regeol;
6013
6014 while (hardcount < max
6015 && scan + 1 < loceol
6016 && UCHARAT(scan) == high
6017 && UCHARAT(scan + 1) == low)
6018 {
6019 scan += 2;
6020 hardcount++;
6021 }
6022 }
6023 break;
2f7f8cb1
KW
6024 case EXACTFA:
6025 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6026 goto do_exactf;
6027
d4e0b827
KW
6028 case EXACTFL:
6029 PL_reg_flags |= RF_tainted;
17580e7a
KW
6030 utf8_flags = FOLDEQ_UTF8_LOCALE;
6031 goto do_exactf;
6032
d4e0b827 6033 case EXACTF:
62bf7766
KW
6034 utf8_flags = 0;
6035 goto do_exactf;
6036
3c760661 6037 case EXACTFU_SS:
ca600955 6038 case EXACTFU_NO_TRIE:
9a5a5549 6039 case EXACTFU:
05f861a2 6040 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6041
2be3e190
KW
6042 /* The comments for the EXACT case above apply as well to these fold
6043 * ones */
634c83a2 6044
2f7f8cb1 6045 do_exactf:
090f7165 6046 c = (U8)*STRING(p);
634c83a2 6047 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6048
3c760661 6049 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6050 char *tmpeol = loceol;
6051 while (hardcount < max
d513472c
KW
6052 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6053 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6054 {
6055 scan = tmpeol;
6056 tmpeol = loceol;
6057 hardcount++;
6058 }
634c83a2 6059
59d32103
KW
6060 /* XXX Note that the above handles properly the German sharp s in
6061 * the pattern matching ss in the string. But it doesn't handle
6062 * properly cases where the string contains say 'LIGATURE ff' and
6063 * the pattern is 'f+'. This would require, say, a new function or
6064 * revised interface to foldEQ_utf8(), in which the maximum number
6065 * of characters to match could be passed and it would return how
6066 * many actually did. This is just one of many cases where
6067 * multi-char folds don't work properly, and so the fix is being
6068 * deferred */
6069 }
6070 else {
87381386 6071 U8 folded;
59d32103 6072
2be3e190
KW
6073 /* Here, the string isn't utf8 and c is a single byte; and either
6074 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6075 * doesn't affect c. Can just do simple comparisons for exact or
6076 * fold matching. */
d4e0b827 6077 switch (OP(p)) {
87381386 6078 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6079 case EXACTFA:
ca600955 6080 case EXACTFU_NO_TRIE:
9a5a5549 6081 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6082 case EXACTFL: folded = PL_fold_locale[c]; break;
6083 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6084 }
6085 while (scan < loceol &&
6086 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6087 {
6088 scan++;
634c83a2
KW
6089 }
6090 }
bbce6d69 6091 break;
f56b6394 6092 case ANYOFV:
a0d0e21e 6093 case ANYOF:
4e8910e0
KW
6094 if (utf8_target || OP(p) == ANYOFV) {
6095 STRLEN inclasslen;
ffc61ed2 6096 loceol = PL_regeol;
4e8910e0
KW
6097 inclasslen = loceol - scan;
6098 while (hardcount < max
6099 && ((inclasslen = loceol - scan) > 0)
6100 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6101 {
6102 scan += inclasslen;
ffc61ed2
JH
6103 hardcount++;
6104 }
6105 } else {
32fc9b6a 6106 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6107 scan++;
6108 }
a0d0e21e 6109 break;
980866de 6110 case ALNUMU:
f2ed9b32 6111 if (utf8_target) {
980866de 6112 utf8_wordchar:
ffc61ed2 6113 loceol = PL_regeol;
1a4fad37 6114 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6115 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6116 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6117 {
ffc61ed2
JH
6118 scan += UTF8SKIP(scan);
6119 hardcount++;
6120 }
980866de 6121 } else {
a12cf05f
KW
6122 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6123 scan++;
6124 }
980866de
KW
6125 }
6126 break;
6127 case ALNUM:
6128 if (utf8_target)
6129 goto utf8_wordchar;
6130 while (scan < loceol && isALNUM((U8) *scan)) {
6131 scan++;
a0ed51b3
LW
6132 }
6133 break;
cfaf538b
KW
6134 case ALNUMA:
6135 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6136 scan++;
6137 }
6138 break;
bbce6d69 6139 case ALNUML:
3280af22 6140 PL_reg_flags |= RF_tainted;
f2ed9b32 6141 if (utf8_target) {
ffc61ed2 6142 loceol = PL_regeol;
1aa99e6b
IH
6143 while (hardcount < max && scan < loceol &&
6144 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6145 scan += UTF8SKIP(scan);
6146 hardcount++;
6147 }
6148 } else {
6149 while (scan < loceol && isALNUM_LC(*scan))
6150 scan++;
a0ed51b3
LW
6151 }
6152 break;
980866de 6153 case NALNUMU:
f2ed9b32 6154 if (utf8_target) {
980866de
KW
6155
6156 utf8_Nwordchar:
6157
ffc61ed2 6158 loceol = PL_regeol;
1a4fad37 6159 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6160 while (hardcount < max && scan < loceol &&
980866de 6161 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6162 {
ffc61ed2
JH
6163 scan += UTF8SKIP(scan);
6164 hardcount++;
6165 }
980866de 6166 } else {
a12cf05f
KW
6167 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6168 scan++;
6169 }
980866de
KW
6170 }
6171 break;
6172 case NALNUM:
6173 if (utf8_target)
6174 goto utf8_Nwordchar;
6175 while (scan < loceol && ! isALNUM((U8) *scan)) {
6176 scan++;
a0ed51b3
LW
6177 }
6178 break;
cfaf538b
KW
6179 case NALNUMA:
6180 if (utf8_target) {
6181 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6182 scan += UTF8SKIP(scan);
6183 }
6184 }
6185 else {
6186 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6187 scan++;
6188 }
6189 }
6190 break;
bbce6d69 6191 case NALNUML:
3280af22 6192 PL_reg_flags |= RF_tainted;
f2ed9b32 6193 if (utf8_target) {
ffc61ed2 6194 loceol = PL_regeol;
1aa99e6b
IH
6195 while (hardcount < max && scan < loceol &&
6196 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6197 scan += UTF8SKIP(scan);
6198 hardcount++;
6199 }
6200 } else {
6201 while (scan < loceol && !isALNUM_LC(*scan))
6202 scan++;
a0ed51b3
LW
6203 }
6204 break;
980866de 6205 case SPACEU:
f2ed9b32 6206 if (utf8_target) {
980866de
KW
6207
6208 utf8_space:
6209
ffc61ed2 6210 loceol = PL_regeol;
1a4fad37 6211 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6212 while (hardcount < max && scan < loceol &&
3568d838 6213 (*scan == ' ' ||
a12cf05f
KW
6214 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6215 {
ffc61ed2
JH
6216 scan += UTF8SKIP(scan);
6217 hardcount++;
6218 }
980866de
KW
6219 break;
6220 }
6221 else {
a12cf05f
KW
6222 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6223 scan++;
6224 }
980866de
KW
6225 break;
6226 }
6227 case SPACE:
6228 if (utf8_target)
6229 goto utf8_space;
6230
6231 while (scan < loceol && isSPACE((U8) *scan)) {
6232 scan++;
a0ed51b3
LW
6233 }
6234 break;
cfaf538b
KW
6235 case SPACEA:
6236 while (scan < loceol && isSPACE_A((U8) *scan)) {
6237 scan++;
6238 }
6239 break;
bbce6d69 6240 case SPACEL:
3280af22 6241 PL_reg_flags |= RF_tainted;
f2ed9b32 6242 if (utf8_target) {
ffc61ed2 6243 loceol = PL_regeol;
1aa99e6b 6244 while (hardcount < max && scan < loceol &&
6bbba904 6245 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6246 scan += UTF8SKIP(scan);
6247 hardcount++;
6248 }
6249 } else {
6250 while (scan < loceol && isSPACE_LC(*scan))
6251 scan++;
a0ed51b3
LW
6252 }
6253 break;
980866de 6254 case NSPACEU:
f2ed9b32 6255 if (utf8_target) {
980866de
KW
6256
6257 utf8_Nspace:
6258
ffc61ed2 6259 loceol = PL_regeol;
1a4fad37 6260 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6261 while (hardcount < max && scan < loceol &&
980866de
KW
6262 ! (*scan == ' ' ||
6263 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6264 {
ffc61ed2
JH
6265 scan += UTF8SKIP(scan);
6266 hardcount++;
6267 }
980866de
KW
6268 break;
6269 }
6270 else {
a12cf05f
KW
6271 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6272 scan++;
6273 }
980866de
KW
6274 }
6275 break;
6276 case NSPACE:
6277 if (utf8_target)
6278 goto utf8_Nspace;
6279
6280 while (scan < loceol && ! isSPACE((U8) *scan)) {
6281 scan++;
a0ed51b3 6282 }
0008a298 6283 break;
cfaf538b
KW
6284 case NSPACEA:
6285 if (utf8_target) {
6286 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6287 scan += UTF8SKIP(scan);
6288 }
6289 }
6290 else {
6291 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6292 scan++;
6293 }
6294 }
6295 break;
bbce6d69 6296 case NSPACEL:
3280af22 6297 PL_reg_flags |= RF_tainted;
f2ed9b32 6298 if (utf8_target) {
ffc61ed2 6299 loceol = PL_regeol;
1aa99e6b 6300 while (hardcount < max && scan < loceol &&
6bbba904 6301 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6302 scan += UTF8SKIP(scan);
6303 hardcount++;
6304 }
6305 } else {
6306 while (scan < loceol && !isSPACE_LC(*scan))
6307 scan++;
a0ed51b3
LW
6308 }
6309 break;
a0d0e21e 6310 case DIGIT:
f2ed9b32 6311 if (utf8_target) {
ffc61ed2 6312 loceol = PL_regeol;
1a4fad37 6313 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6314 while (hardcount < max && scan < loceol &&
f2ed9b32 6315 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6316 scan += UTF8SKIP(scan);
6317 hardcount++;
6318 }
6319 } else {
6320 while (scan < loceol && isDIGIT(*scan))
6321 scan++;
a0ed51b3
LW
6322 }
6323 break;
cfaf538b
KW
6324 case DIGITA:
6325 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6326 scan++;
6327 }
6328 break;
b77393f6
KW
6329 case DIGITL:
6330 PL_reg_flags |= RF_tainted;
6331 if (utf8_target) {
6332 loceol = PL_regeol;
6333 while (hardcount < max && scan < loceol &&
6334 isDIGIT_LC_utf8((U8*)scan)) {
6335 scan += UTF8SKIP(scan);
6336 hardcount++;
6337 }
6338 } else {
6339 while (scan < loceol && isDIGIT_LC(*scan))
6340 scan++;
6341 }
6342 break;
a0d0e21e 6343 case NDIGIT:
f2ed9b32 6344 if (utf8_target) {
ffc61ed2 6345 loceol = PL_regeol;
1a4fad37 6346 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6347 while (hardcount < max && scan < loceol &&
f2ed9b32 6348 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6349 scan += UTF8SKIP(scan);
6350 hardcount++;
6351 }
6352 } else {
6353 while (scan < loceol && !isDIGIT(*scan))
6354 scan++;
a0ed51b3 6355 }
cfaf538b
KW
6356 break;
6357 case NDIGITA:
6358 if (utf8_target) {
6359 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6360 scan += UTF8SKIP(scan);
6361 }
6362 }
6363 else {
6364 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6365 scan++;
6366 }
6367 }
6368 break;
b77393f6
KW
6369 case NDIGITL:
6370 PL_reg_flags |= RF_tainted;
6371 if (utf8_target) {
6372 loceol = PL_regeol;
6373 while (hardcount < max && scan < loceol &&
6374 !isDIGIT_LC_utf8((U8*)scan)) {
6375 scan += UTF8SKIP(scan);
6376 hardcount++;
6377 }
6378 } else {
6379 while (scan < loceol && !isDIGIT_LC(*scan))
6380 scan++;
6381 }
6382 break;
e1d1eefb 6383 case LNBREAK:
f2ed9b32 6384 if (utf8_target) {
e1d1eefb
YO
6385 loceol = PL_regeol;
6386 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6387 scan += c;
6388 hardcount++;
6389 }
6390 } else {
6391 /*
6392 LNBREAK can match two latin chars, which is ok,
6393 because we have a null terminated string, but we
6394 have to use hardcount in this situation
6395 */
6396 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6397 scan+=c;
6398 hardcount++;
6399 }
6400 }
6401 break;
6402 case HORIZWS:
f2ed9b32 6403 if (utf8_target) {
e1d1eefb
YO
6404 loceol = PL_regeol;
6405 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6406 scan += c;
6407 hardcount++;
6408 }
6409 } else {
6410 while (scan < loceol && is_HORIZWS_latin1(scan))
6411 scan++;
6412 }
a0ed51b3 6413 break;
e1d1eefb 6414 case NHORIZWS:
f2ed9b32 6415 if (utf8_target) {
e1d1eefb
YO
6416 loceol = PL_regeol;
6417 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6418 scan += UTF8SKIP(scan);
6419 hardcount++;
6420 }
6421 } else {
6422 while (scan < loceol && !is_HORIZWS_latin1(scan))
6423 scan++;
6424
6425 }
6426 break;
6427 case VERTWS:
f2ed9b32 6428 if (utf8_target) {
e1d1eefb
YO
6429 loceol = PL_regeol;
6430 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6431 scan += c;
6432 hardcount++;
6433 }
6434 } else {
6435 while (scan < loceol && is_VERTWS_latin1(scan))
6436 scan++;
6437
6438 }
6439 break;
6440 case NVERTWS:
f2ed9b32 6441 if (utf8_target) {
e1d1eefb
YO
6442 loceol = PL_regeol;
6443 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6444 scan += UTF8SKIP(scan);
6445 hardcount++;
6446 }
6447 } else {
6448 while (scan < loceol && !is_VERTWS_latin1(scan))
6449 scan++;
6450
6451 }
6452 break;
6453
a0d0e21e
LW
6454 default: /* Called on something of 0 width. */
6455 break; /* So match right here or not at all. */
6456 }
a687059c 6457
a0ed51b3
LW
6458 if (hardcount)
6459 c = hardcount;
6460 else
6461 c = scan - PL_reginput;
3280af22 6462 PL_reginput = scan;
a687059c 6463
a3621e74 6464 DEBUG_r({
e68ec53f 6465 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6466 DEBUG_EXECUTE_r({
e68ec53f
YO
6467 SV * const prop = sv_newmortal();
6468 regprop(prog, prop, p);
6469 PerlIO_printf(Perl_debug_log,
be8e71aa 6470 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6471 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6472 });
be8e71aa 6473 });
9041c2e3 6474
a0d0e21e 6475 return(c);
a687059c
LW
6476}
6477
c277df42 6478
be8e71aa 6479#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6480/*
6c6525b8
KW
6481- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6482create a copy so that changes the caller makes won't change the shared one
6483 */
ffc61ed2 6484SV *
32fc9b6a 6485Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6486{
6c6525b8
KW
6487 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6488 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6489}
6490#endif
6491
6492STATIC SV *
6493S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6494{
8c9eb58f
KW
6495 /* Returns the swash for the input 'node' in the regex 'prog'.
6496 * If <doinit> is true, will attempt to create the swash if not already
6497 * done.
6498 * If <listsvp> is non-null, will return the swash initialization string in
6499 * it.
6500 * If <altsvp> is non-null, will return the alternates to the regular swash
6501 * in it
6502 * Tied intimately to how regcomp.c sets up the data structure */
6503
97aff369 6504 dVAR;
9e55ce06
JH
6505 SV *sw = NULL;
6506 SV *si = NULL;
6507 SV *alt = NULL;
7a6c6baa
KW
6508 SV* invlist = NULL;
6509
f8fc2ecf
YO
6510 RXi_GET_DECL(prog,progi);
6511 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6512
6c6525b8 6513 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6514
ccb2541c
KW
6515 assert(ANYOF_NONBITMAP(node));
6516
4f639d21 6517 if (data && data->count) {
a3b680e6 6518 const U32 n = ARG(node);
ffc61ed2 6519
4f639d21 6520 if (data->what[n] == 's') {
ad64d0ec
NC
6521 SV * const rv = MUTABLE_SV(data->data[n]);
6522 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6523 SV **const ary = AvARRAY(av);
7a6c6baa 6524 bool invlist_has_user_defined_property;
9041c2e3 6525
8c9eb58f 6526 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6527
7a6c6baa
KW
6528 /* Elements 3 and 4 are either both present or both absent. [3] is
6529 * any inversion list generated at compile time; [4] indicates if
6530 * that inversion list has any user-defined properties in it. */
6531 if (av_len(av) >= 3) {
6532 invlist = ary[3];
6533 invlist_has_user_defined_property = cBOOL(SvUV(ary[4]));
6534 }
6535 else {
6536 invlist = NULL;
6537 invlist_has_user_defined_property = FALSE;
6538 }
6539
8c9eb58f
KW
6540 /* Element [1] is reserved for the set-up swash. If already there,
6541 * return it; if not, create it and store it there */
f192cf32
KW
6542 if (SvROK(ary[1])) {
6543 sw = ary[1];
6544 }
ffc61ed2 6545 else if (si && doinit) {
7a6c6baa
KW
6546
6547 sw = _core_swash_init("utf8", /* the utf8 package */
6548 "", /* nameless */
6549 si,
6550 1, /* binary */
6551 0, /* not from tr/// */
6552 FALSE, /* is error if can't find
6553 property */
6554 invlist,
6555 invlist_has_user_defined_property);
ffc61ed2
JH
6556 (void)av_store(av, 1, sw);
6557 }
8c9eb58f
KW
6558
6559 /* Element [2] is for any multi-char folds. Note that is a
6560 * fundamentally flawed design, because can't backtrack and try
6561 * again. See [perl #89774] */
f192cf32
KW
6562 if (SvTYPE(ary[2]) == SVt_PVAV) {
6563 alt = ary[2];
6564 }
ffc61ed2
JH
6565 }
6566 }
6567
7a6c6baa
KW
6568 if (listsvp) {
6569 SV* matches_string = newSVpvn("", 0);
6570 SV** invlistsvp;
6571
6572 /* Use the swash, if any, which has to have incorporated into it all
6573 * possibilities */
6574 if ( sw
6575 && SvROK(sw)
6576 && SvTYPE(SvRV(sw)) == SVt_PVHV
6577 && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE)))
6578 {
6579 invlist = *invlistsvp;
6580 }
6581 else if (si && si != &PL_sv_undef) {
6582
6583 /* If no swash, use the input nitialization string, if available */
6584 sv_catsv(matches_string, si);
6585 }
6586
6587 /* Add the inversion list to whatever we have. This may have come from
6588 * the swash, or from an input parameter */
6589 if (invlist) {
6590 sv_catsv(matches_string, _invlist_contents(invlist));
6591 }
6592 *listsvp = matches_string;
6593 }
6594
9e55ce06
JH
6595 if (altsvp)
6596 *altsvp = alt;
ffc61ed2
JH
6597
6598 return sw;
6599}
6600
6601/*
ba7b4546 6602 - reginclass - determine if a character falls into a character class
832705d4 6603
6698fab5
KW
6604 n is the ANYOF regnode
6605 p is the target string
6606 lenp is pointer to the maximum number of bytes of how far to go in p
6607 (This is assumed wthout checking to always be at least the current
6608 character's size)
6609 utf8_target tells whether p is in UTF-8.
832705d4 6610
4b3cda86
KW
6611 Returns true if matched; false otherwise. If lenp is not NULL, on return
6612 from a successful match, the value it points to will be updated to how many
6613 bytes in p were matched. If there was no match, the value is undefined,
6614 possibly changed from the input.
eba1359e 6615
d5788240
KW
6616 Note that this can be a synthetic start class, a combination of various
6617 nodes, so things you think might be mutually exclusive, such as locale,
6618 aren't. It can match both locale and non-locale
6619
bbce6d69 6620 */
6621
76e3520e 6622STATIC bool
f6ad78d8 6623S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6624{
27da23d5 6625 dVAR;
a3b680e6 6626 const char flags = ANYOF_FLAGS(n);
bbce6d69 6627 bool match = FALSE;
cc07378b 6628 UV c = *p;
f7ab54c6 6629 STRLEN c_len = 0;
6698fab5 6630 STRLEN maxlen;
1aa99e6b 6631
7918f24d
NC
6632 PERL_ARGS_ASSERT_REGINCLASS;
6633
4b3cda86 6634 /* If c is not already the code point, get it */
f2ed9b32 6635 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6636 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6637 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6638 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6639 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6640 * UTF8_ALLOW_FFFF */
f7ab54c6 6641 if (c_len == (STRLEN)-1)
e8a70c6f 6642 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6643 }
a5a291f5
KW
6644 else {
6645 c_len = 1;
6646 }
bbce6d69 6647
a5a291f5
KW
6648 /* Use passed in max length, or one character if none passed in or less
6649 * than one character. And assume will match just one character. This is
6650 * overwritten later if matched more. */
4b3cda86 6651 if (lenp) {
a5a291f5
KW
6652 maxlen = (*lenp > c_len) ? *lenp : c_len;
6653 *lenp = c_len;
4b3cda86
KW
6654
6655 }
6656 else {
a5a291f5 6657 maxlen = c_len;
4b3cda86
KW
6658 }
6659
7cdde544
KW
6660 /* If this character is potentially in the bitmap, check it */
6661 if (c < 256) {
ffc61ed2
JH
6662 if (ANYOF_BITMAP_TEST(n, c))
6663 match = TRUE;
11454c59
KW
6664 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6665 && ! utf8_target
6666 && ! isASCII(c))
6667 {
6668 match = TRUE;
6669 }
a0ed51b3 6670
78969a98
KW
6671 else if (flags & ANYOF_LOCALE) {
6672 PL_reg_flags |= RF_tainted;
6673
6674 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6675 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6676 {
ffc61ed2 6677 match = TRUE;
78969a98 6678 }
040aea3a
KW
6679 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6680 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6681 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6682 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6683 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6684 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6685 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6686 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6687 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6688 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6689 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6690 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6691 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6692 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6693 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6694 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6695 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6696 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6697 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6698 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6699 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6700 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6701 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6702 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6703 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6704 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6705 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6706 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6707 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6708 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6709 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6710 ) /* How's that for a conditional? */
78969a98 6711 ) {
ffc61ed2
JH
6712 match = TRUE;
6713 }
a0ed51b3 6714 }
a0ed51b3
LW
6715 }
6716
7cdde544 6717 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6718 * bitmap could match, try that. Locale nodes specifiy completely the
6719 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6720 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6721 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6722 * could be a combination of locale and non-locale nodes. So allow locale
6723 * to match for the synthetic start class, which will give a false
6724 * positive that will be resolved when the match is done again as not part
6725 * of the synthetic start class */
ef87b810 6726 if (!match) {
10ee90d2
KW
6727 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6728 match = TRUE; /* Everything above 255 matches */
e051a21d 6729 }
6f8d7d0d
KW
6730 else if (ANYOF_NONBITMAP(n)
6731 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6732 || (utf8_target
6733 && (c >=256
6734 || (! (flags & ANYOF_LOCALE))
6735 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6736 {
7cdde544 6737 AV *av;
210e6c47 6738 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6739
6740 if (sw) {
6741 U8 * utf8_p;
6742 if (utf8_target) {
6743 utf8_p = (U8 *) p;
6744 } else {
f56b6394
KW
6745
6746 /* Not utf8. Convert as much of the string as available up
6747 * to the limit of how far the (single) character in the
6748 * pattern can possibly match (no need to go further). If
6749 * the node is a straight ANYOF or not folding, it can't
6750 * match more than one. Otherwise, It can match up to how
6751 * far a single char can fold to. Since not utf8, each
6752 * character is a single byte, so the max it can be in
6753 * bytes is the same as the max it can be in characters */
6754 STRLEN len = (OP(n) == ANYOF
6755 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6756 ? 1
6757 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6758 ? maxlen
6759 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
6760 utf8_p = bytes_to_utf8(p, &len);
6761 }
f56b6394 6762
f7eb7452 6763 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 6764 match = TRUE;
39065660 6765 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
6766
6767 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
6768 * matches. The non-multi char folds have all been moved to
6769 * the compilation phase, and the multi-char folds have
6770 * been stored by regcomp into 'av'; we linearly check to
6771 * see if any match the target string (folded). We know
6772 * that the originals were each one character, but we don't
6773 * currently know how many characters/bytes each folded to,
6774 * except we do know that there are small limits imposed by
6775 * Unicode. XXX A performance enhancement would be to have
6776 * regcomp.c store the max number of chars/bytes that are
6777 * in an av entry, as, say the 0th element. Even better
6778 * would be to have a hash of the few characters that can
6779 * start a multi-char fold to the max number of chars of
6780 * those folds.
f56b6394
KW
6781 *
6782 * If there is a match, we will need to advance (if lenp is
6783 * specified) the match pointer in the target string. But
6784 * what we are comparing here isn't that string directly,
6785 * but its fold, whose length may differ from the original.
6786 * As we go along in constructing the fold, therefore, we
6787 * create a map so that we know how many bytes in the
6788 * source to advance given that we have matched a certain
6789 * number of bytes in the fold. This map is stored in
6438af90
KW
6790 * 'map_fold_len_back'. Let n mean the number of bytes in
6791 * the fold of the first character that we are folding.
6792 * Then map_fold_len_back[n] is set to the number of bytes
6793 * in that first character. Similarly let m be the
6794 * corresponding number for the second character to be
6795 * folded. Then map_fold_len_back[n+m] is set to the
6796 * number of bytes occupied by the first two source
6797 * characters. ... */
6798 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
6799 U8 folded[UTF8_MAXBYTES_CASE+1];
6800 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 6801 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 6802 chars */
f56b6394
KW
6803
6804 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6805
6806 /* Here, only need to fold the first char of the target
6438af90 6807 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 6808 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 6809 total_foldlen = foldlen;
6438af90
KW
6810 map_fold_len_back[foldlen] = (utf8_target)
6811 ? UTF8SKIP(utf8_p)
6812 : 1;
f56b6394
KW
6813 }
6814 else {
6815
6816 /* Here, need to fold more than the first char. Do so
6817 * up to the limits */
f56b6394
KW
6818 U8* source_ptr = utf8_p; /* The source for the fold
6819 is the regex target
6820 string */
6821 U8* folded_ptr = folded;
6822 U8* e = utf8_p + maxlen; /* Can't go beyond last
6823 available byte in the
6824 target string */
6438af90
KW
6825 U8 i;
6826 for (i = 0;
6827 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6828 i++)
f56b6394
KW
6829 {
6830
6831 /* Fold the next character */
6832 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6833 STRLEN this_char_foldlen;
6834 to_utf8_fold(source_ptr,
6835 this_char_folded,
6836 &this_char_foldlen);
6837
6838 /* Bail if it would exceed the byte limit for
6839 * folding a single char. */
6840 if (this_char_foldlen + folded_ptr - folded >
6841 UTF8_MAXBYTES_CASE)
6842 {
6843 break;
6844 }
6845
6438af90 6846 /* Add the fold of this character */
f56b6394
KW
6847 Copy(this_char_folded,
6848 folded_ptr,
6849 this_char_foldlen,
6850 U8);
f56b6394 6851 source_ptr += UTF8SKIP(source_ptr);
6438af90 6852 folded_ptr += this_char_foldlen;
deba3d96 6853 total_foldlen = folded_ptr - folded;
6438af90
KW
6854
6855 /* Create map from the number of bytes in the fold
6856 * back to the number of bytes in the source. If
6857 * the source isn't utf8, the byte count is just
6858 * the number of characters so far */
deba3d96 6859 map_fold_len_back[total_foldlen]
6438af90
KW
6860 = (utf8_target)
6861 ? source_ptr - utf8_p
6862 : i + 1;
f56b6394
KW
6863 }
6864 *folded_ptr = '\0';
f56b6394
KW
6865 }
6866
6867
6868 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
6869 * of multi-char folds. */
6870 if (av) {
7cdde544
KW
6871 I32 i;
6872 for (i = 0; i <= av_len(av); i++) {
6873 SV* const sv = *av_fetch(av, i, FALSE);
6874 STRLEN len;
6875 const char * const s = SvPV_const(sv, len);
6438af90 6876
f9126265
KW
6877 if (len <= total_foldlen
6878 && memEQ(s, (char*)folded, len)
6879
6880 /* If 0, means matched a partial char. See
6881 * [perl #90536] */
6882 && map_fold_len_back[len])
f56b6394
KW
6883 {
6884
6885 /* Advance the target string ptr to account for
6886 * this fold, but have to translate from the
6887 * folded length to the corresponding source
6438af90 6888 * length. */
8f2655f7
KW
6889 if (lenp) {
6890 *lenp = map_fold_len_back[len];
8f2655f7 6891 }
7cdde544
KW
6892 match = TRUE;
6893 break;
6894 }
6895 }
6896 }
7cdde544
KW
6897 }
6898
6899 /* If we allocated a string above, free it */
6900 if (! utf8_target) Safefree(utf8_p);
6901 }
6902 }
6903 }
6904
a0ed51b3
LW
6905 return (flags & ANYOF_INVERT) ? !match : match;
6906}
161b471a 6907
dfe13c55 6908STATIC U8 *
0ce71af7 6909S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 6910{
6af86488
KW
6911 /* return the position 'off' UTF-8 characters away from 's', forward if
6912 * 'off' >= 0, backwards if negative. But don't go outside of position
6913 * 'lim', which better be < s if off < 0 */
6914
97aff369 6915 dVAR;
7918f24d
NC
6916
6917 PERL_ARGS_ASSERT_REGHOP3;
6918
a0ed51b3 6919 if (off >= 0) {
1aa99e6b 6920 while (off-- && s < lim) {
ffc61ed2 6921 /* XXX could check well-formedness here */
a0ed51b3 6922 s += UTF8SKIP(s);
ffc61ed2 6923 }
a0ed51b3
LW
6924 }
6925 else {
1de06328
YO
6926 while (off++ && s > lim) {
6927 s--;
6928 if (UTF8_IS_CONTINUED(*s)) {
6929 while (s > lim && UTF8_IS_CONTINUATION(*s))
6930 s--;
a0ed51b3 6931 }
1de06328 6932 /* XXX could check well-formedness here */
a0ed51b3
LW
6933 }
6934 }
6935 return s;
6936}
161b471a 6937
f9f4320a
YO
6938#ifdef XXX_dmq
6939/* there are a bunch of places where we use two reghop3's that should
6940 be replaced with this routine. but since thats not done yet
6941 we ifdef it out - dmq
6942*/
dfe13c55 6943STATIC U8 *
1de06328
YO
6944S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6945{
6946 dVAR;
7918f24d
NC
6947
6948 PERL_ARGS_ASSERT_REGHOP4;
6949
1de06328
YO
6950 if (off >= 0) {
6951 while (off-- && s < rlim) {
6952 /* XXX could check well-formedness here */
6953 s += UTF8SKIP(s);
6954 }
6955 }
6956 else {
6957 while (off++ && s > llim) {
6958 s--;
6959 if (UTF8_IS_CONTINUED(*s)) {
6960 while (s > llim && UTF8_IS_CONTINUATION(*s))
6961 s--;
6962 }
6963 /* XXX could check well-formedness here */
6964 }
6965 }
6966 return s;
6967}
f9f4320a 6968#endif
1de06328
YO
6969
6970STATIC U8 *
0ce71af7 6971S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 6972{
97aff369 6973 dVAR;
7918f24d
NC
6974
6975 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6976
a0ed51b3 6977 if (off >= 0) {
1aa99e6b 6978 while (off-- && s < lim) {
ffc61ed2 6979 /* XXX could check well-formedness here */
a0ed51b3 6980 s += UTF8SKIP(s);
ffc61ed2 6981 }
a0ed51b3 6982 if (off >= 0)
3dab1dad 6983 return NULL;
a0ed51b3
LW
6984 }
6985 else {
1de06328
YO
6986 while (off++ && s > lim) {
6987 s--;
6988 if (UTF8_IS_CONTINUED(*s)) {
6989 while (s > lim && UTF8_IS_CONTINUATION(*s))
6990 s--;
a0ed51b3 6991 }
1de06328 6992 /* XXX could check well-formedness here */
a0ed51b3
LW
6993 }
6994 if (off <= 0)
3dab1dad 6995 return NULL;
a0ed51b3
LW
6996 }
6997 return s;
6998}
51371543 6999
51371543 7000static void
acfe0abc 7001restore_pos(pTHX_ void *arg)
51371543 7002{
97aff369 7003 dVAR;
097eb12c 7004 regexp * const rex = (regexp *)arg;
51371543
GS
7005 if (PL_reg_eval_set) {
7006 if (PL_reg_oldsaved) {
4f639d21
DM
7007 rex->subbeg = PL_reg_oldsaved;
7008 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 7009#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7010 rex->saved_copy = PL_nrs;
ed252734 7011#endif
07bc277f 7012 RXp_MATCH_COPIED_on(rex);
51371543
GS
7013 }
7014 PL_reg_magic->mg_len = PL_reg_oldpos;
7015 PL_reg_eval_set = 0;
7016 PL_curpm = PL_reg_oldcurpm;
7017 }
7018}
33b8afdf
JH
7019
7020STATIC void
7021S_to_utf8_substr(pTHX_ register regexp *prog)
7022{
a1cac82e 7023 int i = 1;
7918f24d
NC
7024
7025 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7026
a1cac82e
NC
7027 do {
7028 if (prog->substrs->data[i].substr
7029 && !prog->substrs->data[i].utf8_substr) {
7030 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7031 prog->substrs->data[i].utf8_substr = sv;
7032 sv_utf8_upgrade(sv);
610460f9 7033 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7034 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7035 /* Trim the trailing \n that fbm_compile added last
7036 time. */
7037 SvCUR_set(sv, SvCUR(sv) - 1);
7038 /* Whilst this makes the SV technically "invalid" (as its
7039 buffer is no longer followed by "\0") when fbm_compile()
7040 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7041 fbm_compile(sv, FBMcf_TAIL);
7042 } else
7043 fbm_compile(sv, 0);
610460f9 7044 }
a1cac82e
NC
7045 if (prog->substrs->data[i].substr == prog->check_substr)
7046 prog->check_utf8 = sv;
7047 }
7048 } while (i--);
33b8afdf
JH
7049}
7050
7051STATIC void
7052S_to_byte_substr(pTHX_ register regexp *prog)
7053{
97aff369 7054 dVAR;
a1cac82e 7055 int i = 1;
7918f24d
NC
7056
7057 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7058
a1cac82e
NC
7059 do {
7060 if (prog->substrs->data[i].utf8_substr
7061 && !prog->substrs->data[i].substr) {
7062 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7063 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7064 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7065 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7066 /* Trim the trailing \n that fbm_compile added last
7067 time. */
7068 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7069 fbm_compile(sv, FBMcf_TAIL);
7070 } else
7071 fbm_compile(sv, 0);
7072 }
a1cac82e
NC
7073 } else {
7074 SvREFCNT_dec(sv);
7075 sv = &PL_sv_undef;
7076 }
7077 prog->substrs->data[i].substr = sv;
7078 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7079 prog->check_substr = sv;
33b8afdf 7080 }
a1cac82e 7081 } while (i--);
33b8afdf 7082}
66610fdd
RGS
7083
7084/*
7085 * Local variables:
7086 * c-indentation-style: bsd
7087 * c-basic-offset: 4
7088 * indent-tabs-mode: t
7089 * End:
7090 *
37442d52
RGS
7091 * ex: set ts=8 sts=4 sw=4 noet:
7092 */