This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Make new File::Copy test case insensitive.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
61dad979 84#include "utf8_strings.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
27d4fc33 147 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979
KW
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
149 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
151 LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \
152 LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \
153 LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \
154 LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \
155 LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
20d0b1e9 156
1dcf4a1b 157#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 158
ee9a90b8
KW
159/* The actual code for CCC_TRY, which uses several variables from the routine
160 * it's callable from. It is designed to be the bulk of a case statement.
161 * FUNC is the macro or function to call on non-utf8 targets that indicate if
162 * nextchr matches the class.
163 * UTF8_TEST is the whole test string to use for utf8 targets
164 * LOAD is what to use to test, and if not present to load in the swash for the
165 * class
166 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
167 * UTF8_TEST test.
168 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
169 * utf8 and a variant, load the swash if necessary and test using the utf8
170 * test. Advance to the next character if test is ok, otherwise fail; If not
171 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
172 * fails, or advance to the next character */
173
174#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
175 if (locinput >= PL_regeol) { \
176 sayNO; \
177 } \
178 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
179 LOAD_UTF8_CHARCLASS(CLASS, STR); \
180 if (POS_OR_NEG (UTF8_TEST)) { \
181 sayNO; \
182 } \
183 locinput += PL_utf8skip[nextchr]; \
184 nextchr = UCHARAT(locinput); \
185 break; \
186 } \
187 if (POS_OR_NEG (FUNC(nextchr))) { \
188 sayNO; \
189 } \
190 nextchr = UCHARAT(++locinput); \
980866de
KW
191 break;
192
ee9a90b8
KW
193/* Handle the non-locale cases for a character class and its complement. It
194 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
195 * This is because that code fails when the test succeeds, so we want to have
196 * the test fail so that the code succeeds. The swash is stored in a
197 * predictable PL_ place */
cfaf538b
KW
198#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
199 CLASS, STR) \
ee9a90b8
KW
200 case NAME: \
201 _CCC_TRY_CODE( !, FUNC, \
202 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
203 (U8*)locinput, TRUE)), \
204 CLASS, STR) \
205 case NNAME: \
1dcf4a1b 206 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
207 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
208 (U8*)locinput, TRUE)), \
209 CLASS, STR) \
210
211/* Generate the case statements for both locale and non-locale character
212 * classes in regmatch for classes that don't have special unicode semantics.
213 * Locales don't use an immediate swash, but an intermediary special locale
214 * function that is called on the pointer to the current place in the input
215 * string. That function will resolve to needing the same swash. One might
216 * think that because we don't know what the locale will match, we shouldn't
217 * check with the swash loading function that it loaded properly; ie, that we
218 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
219 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
220 * irrelevant here */
221#define CCC_TRY(NAME, NNAME, FUNC, \
222 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 223 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
224 CLASS, STR) \
225 case NAMEL: \
226 PL_reg_flags |= RF_tainted; \
227 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
228 case NNAMEL: \
229 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
230 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
231 CLASS, STR) \
cfaf538b
KW
232 case NAMEA: \
233 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
237 nextchr = UCHARAT(++locinput); \
238 break; \
239 case NNAMEA: \
240 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 if (utf8_target) { \
244 locinput += PL_utf8skip[nextchr]; \
245 nextchr = UCHARAT(locinput); \
246 } \
247 else { \
248 nextchr = UCHARAT(++locinput); \
249 } \
250 break; \
ee9a90b8
KW
251 /* Generate the non-locale cases */ \
252 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
253
254/* This is like CCC_TRY, but has an extra set of parameters for generating case
255 * statements to handle separate Unicode semantics nodes */
256#define CCC_TRY_U(NAME, NNAME, FUNC, \
257 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
258 NAMEU, NNAMEU, FUNCU, \
cfaf538b 259 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 260 CLASS, STR) \
cfaf538b
KW
261 CCC_TRY(NAME, NNAME, FUNC, \
262 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
263 NAMEA, NNAMEA, FUNCA, \
264 CLASS, STR) \
ee9a90b8 265 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 296#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 302#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
303#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
305
306#endif
e2d8ce26 307
a84d97b6
HS
308/*
309 Search for mandatory following text node; for lookahead, the text must
310 follow but for lookbehind (rn->flags != 0) we skip to the next step.
311*/
cca55fe3 312#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
313 while (JUMPABLE(rn)) { \
314 const OPCODE type = OP(rn); \
315 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 316 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 317 else if (type == PLUS) \
cca55fe3 318 rn = NEXTOPER(rn); \
3dab1dad 319 else if (type == IFMATCH) \
a84d97b6 320 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 321 else rn += NEXT_OFF(rn); \
3dab1dad 322 } \
5f80c4cf 323} STMT_END
74750237 324
c476f425 325
acfe0abc 326static void restore_pos(pTHX_ void *arg);
51371543 327
87c0511b 328#define REGCP_PAREN_ELEMS 3
f067efbf 329#define REGCP_OTHER_ELEMS 3
e0fa7e2b 330#define REGCP_FRAME_ELEMS 1
620d5b66
NC
331/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
332 * are needed for the regexp context stack bookkeeping. */
333
76e3520e 334STATIC CHECKPOINT
b93070ed 335S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 336{
97aff369 337 dVAR;
a3b680e6 338 const int retval = PL_savestack_ix;
a3b680e6 339 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
340 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
341 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 342 I32 p;
40a82448 343 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 344
b93070ed
DM
345 PERL_ARGS_ASSERT_REGCPPUSH;
346
e49a9654 347 if (paren_elems_to_push < 0)
5637ef5b
NC
348 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
349 paren_elems_to_push);
e49a9654 350
e0fa7e2b
NC
351 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
352 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
353 " out of range (%lu-%ld)",
354 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 355
620d5b66 356 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 357
495f47a5
DM
358 DEBUG_BUFFERS_r(
359 if ((int)PL_regsize > (int)parenfloor)
360 PerlIO_printf(Perl_debug_log,
361 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
362 PTR2UV(rex),
363 PTR2UV(rex->offs)
364 );
365 );
87c0511b 366 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
368 SSPUSHINT(rex->offs[p].end);
369 SSPUSHINT(rex->offs[p].start);
1ca2007e 370 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 371 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
372 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
373 (UV)p,
374 (IV)rex->offs[p].start,
375 (IV)rex->offs[p].start_tmp,
376 (IV)rex->offs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 380 SSPUSHINT(PL_regsize);
b93070ed
DM
381 SSPUSHINT(rex->lastparen);
382 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 383 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 384
a0d0e21e
LW
385 return retval;
386}
387
c277df42 388/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
389#define REGCP_SET(cp) \
390 DEBUG_STATE_r( \
ab3bbdeb 391 PerlIO_printf(Perl_debug_log, \
e4f74956 392 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
393 (IV)PL_savestack_ix)); \
394 cp = PL_savestack_ix
c3464db5 395
ab3bbdeb 396#define REGCP_UNWIND(cp) \
e4f74956 397 DEBUG_STATE_r( \
ab3bbdeb 398 if (cp != PL_savestack_ix) \
e4f74956
YO
399 PerlIO_printf(Perl_debug_log, \
400 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
401 (IV)(cp), (IV)PL_savestack_ix)); \
402 regcpblow(cp)
c277df42 403
a8d1f4b4
DM
404#define UNWIND_PAREN(lp, lcp) \
405 for (n = rex->lastparen; n > lp; n--) \
406 rex->offs[n].end = -1; \
407 rex->lastparen = n; \
408 rex->lastcloseparen = lcp;
409
410
f067efbf 411STATIC void
b93070ed 412S_regcppop(pTHX_ regexp *rex)
a0d0e21e 413{
97aff369 414 dVAR;
e0fa7e2b 415 UV i;
87c0511b 416 U32 paren;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
87c0511b 439 paren = PL_regsize;
620d5b66 440 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 441 I32 tmps;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
87c0511b 455 paren--;
a0d0e21e 456 }
daf18116 457#if 1
dafc8851
JH
458 /* It would seem that the similar code in regtry()
459 * already takes care of this, and in fact it is in
460 * a better location to since this code can #if 0-ed out
461 * but the code in regtry() is needed or otherwise tests
462 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
463 * (as of patchlevel 7877) will fail. Then again,
464 * this code seems to be necessary or otherwise
225593e1
DM
465 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
466 * --jhi updated by dapm */
b93070ed 467 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 468 if (i > PL_regsize)
b93070ed
DM
469 rex->offs[i].start = -1;
470 rex->offs[i].end = -1;
495f47a5
DM
471 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
472 " \\%"UVuf": %s ..-1 undeffing\n",
473 (UV)i,
474 (i > PL_regsize) ? "-1" : " "
475 ));
a0d0e21e 476 }
dafc8851 477#endif
a0d0e21e
LW
478}
479
74088413
DM
480/* restore the parens and associated vars at savestack position ix,
481 * but without popping the stack */
482
483STATIC void
484S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485{
486 I32 tmpix = PL_savestack_ix;
487 PL_savestack_ix = ix;
488 regcppop(rex);
489 PL_savestack_ix = tmpix;
490}
491
02db2b7b 492#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 493
a687059c 494/*
e50aee73 495 * pregexec and friends
a687059c
LW
496 */
497
76234dfb 498#ifndef PERL_IN_XSUB_RE
a687059c 499/*
c277df42 500 - pregexec - match a regexp against a string
a687059c 501 */
c277df42 502I32
49d7dfbc 503Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 504 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
505/* strend: pointer to null at end of string */
506/* strbeg: real beginning of string */
507/* minend: end of match must be >=minend after stringarg. */
508/* nosave: For optimizations. */
509{
7918f24d
NC
510 PERL_ARGS_ASSERT_PREGEXEC;
511
c277df42 512 return
9041c2e3 513 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
514 nosave ? 0 : REXEC_COPY_STR);
515}
76234dfb 516#endif
22e551b9 517
9041c2e3 518/*
cad2e5aa
JH
519 * Need to implement the following flags for reg_anch:
520 *
521 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
522 * USE_INTUIT_ML
523 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
524 * INTUIT_AUTORITATIVE_ML
525 * INTUIT_ONCE_NOML - Intuit can match in one location only.
526 * INTUIT_ONCE_ML
527 *
528 * Another flag for this function: SECOND_TIME (so that float substrs
529 * with giant delta may be not rechecked).
530 */
531
532/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
533
3f7c398e 534/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
535 Otherwise, only SvCUR(sv) is used to get strbeg. */
536
537/* XXXX We assume that strpos is strbeg unless sv. */
538
6eb5f6b9
JH
539/* XXXX Some places assume that there is a fixed substring.
540 An update may be needed if optimizer marks as "INTUITable"
541 RExen without fixed substrings. Similarly, it is assumed that
542 lengths of all the strings are no more than minlen, thus they
543 cannot come from lookahead.
40d049e4
YO
544 (Or minlen should take into account lookahead.)
545 NOTE: Some of this comment is not correct. minlen does now take account
546 of lookahead/behind. Further research is required. -- demerphq
547
548*/
6eb5f6b9 549
2c2d71f5
JH
550/* A failure to find a constant substring means that there is no need to make
551 an expensive call to REx engine, thus we celebrate a failure. Similarly,
552 finding a substring too deep into the string means that less calls to
30944b6d
IZ
553 regtry() should be needed.
554
555 REx compiler's optimizer found 4 possible hints:
556 a) Anchored substring;
557 b) Fixed substring;
558 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 559 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 560 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
561 string which does not contradict any of them.
562 */
2c2d71f5 563
6eb5f6b9
JH
564/* Most of decisions we do here should have been done at compile time.
565 The nodes of the REx which we used for the search should have been
566 deleted from the finite automaton. */
567
cad2e5aa 568char *
288b8c02 569Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 570 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 571{
97aff369 572 dVAR;
288b8c02 573 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 574 I32 start_shift = 0;
cad2e5aa 575 /* Should be nonnegative! */
eb578fdb
KW
576 I32 end_shift = 0;
577 char *s;
578 SV *check;
a1933d95 579 char *strbeg;
cad2e5aa 580 char *t;
f2ed9b32 581 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 582 I32 ml_anch;
eb578fdb 583 char *other_last = NULL; /* other substr checked before this */
bd61b366 584 char *check_at = NULL; /* check substr found at this pos */
d8080198 585 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 586 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 587 RXi_GET_DECL(prog,progi);
30944b6d 588#ifdef DEBUGGING
890ce7af 589 const char * const i_strpos = strpos;
30944b6d 590#endif
a3621e74
YO
591 GET_RE_DEBUG_FLAGS_DECL;
592
7918f24d 593 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
594 PERL_UNUSED_ARG(flags);
595 PERL_UNUSED_ARG(data);
7918f24d 596
f2ed9b32 597 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 598
3c8556c3 599 if (RX_UTF8(rx)) {
b8d68ded
JH
600 PL_reg_flags |= RF_utf8;
601 }
ab3bbdeb 602 DEBUG_EXECUTE_r(
f2ed9b32 603 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
604 sv ? "Guessing start of match in sv for"
605 : "Guessing start of match in string for");
2a782b5b 606 );
cad2e5aa 607
c344f387
JH
608 /* CHR_DIST() would be more correct here but it makes things slow. */
609 if (prog->minlen > strend - strpos) {
a3621e74 610 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 611 "String too short... [re_intuit_start]\n"));
cad2e5aa 612 goto fail;
2c2d71f5 613 }
1de06328 614
a1933d95 615 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 616 PL_regeol = strend;
f2ed9b32 617 if (utf8_target) {
33b8afdf
JH
618 if (!prog->check_utf8 && prog->check_substr)
619 to_utf8_substr(prog);
620 check = prog->check_utf8;
621 } else {
622 if (!prog->check_substr && prog->check_utf8)
623 to_byte_substr(prog);
624 check = prog->check_substr;
625 }
1de06328 626 if (check == &PL_sv_undef) {
a3621e74 627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 628 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
629 goto fail;
630 }
bbe252da
YO
631 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
632 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
633 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 634 && !multiline ) ); /* Check after \n? */
cad2e5aa 635
7e25d62c 636 if (!ml_anch) {
bbe252da
YO
637 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
638 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 639 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
640 && sv && !SvROK(sv)
641 && (strpos != strbeg)) {
a3621e74 642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
643 goto fail;
644 }
645 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 646 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 647 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
648 I32 slen;
649
1aa99e6b 650 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 651
653099ff
GS
652 if (SvTAIL(check)) {
653 slen = SvCUR(check); /* >= 1 */
cad2e5aa 654
9041c2e3 655 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 656 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 658 goto fail_finish;
cad2e5aa
JH
659 }
660 /* Now should match s[0..slen-2] */
661 slen--;
3f7c398e 662 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 663 || (slen > 1
3f7c398e 664 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 665 report_neq:
a3621e74 666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
667 goto fail_finish;
668 }
cad2e5aa 669 }
3f7c398e 670 else if (*SvPVX_const(check) != *s
653099ff 671 || ((slen = SvCUR(check)) > 1
3f7c398e 672 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 673 goto report_neq;
c315bfe8 674 check_at = s;
2c2d71f5 675 goto success_at_start;
7e25d62c 676 }
cad2e5aa 677 }
2c2d71f5 678 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 679 s = strpos;
2c2d71f5 680 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
681 end_shift = prog->check_end_shift;
682
2c2d71f5 683 if (!ml_anch) {
a3b680e6 684 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 685 - (SvTAIL(check) != 0);
a3b680e6 686 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
687
688 if (end_shift < eshift)
689 end_shift = eshift;
690 }
cad2e5aa 691 }
2c2d71f5 692 else { /* Can match at random position */
cad2e5aa
JH
693 ml_anch = 0;
694 s = strpos;
1de06328
YO
695 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
696 end_shift = prog->check_end_shift;
697
698 /* end shift should be non negative here */
cad2e5aa
JH
699 }
700
bcdf7404 701#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 702 if (end_shift < 0)
1de06328 703 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 704 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
705#endif
706
2c2d71f5
JH
707 restart:
708 /* Find a possible match in the region s..strend by looking for
709 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
710
711 {
712 I32 srch_start_shift = start_shift;
713 I32 srch_end_shift = end_shift;
c33e64f0
FC
714 U8* start_point;
715 U8* end_point;
1de06328
YO
716 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
717 srch_end_shift -= ((strbeg - s) - srch_start_shift);
718 srch_start_shift = strbeg - s;
719 }
6bda09f9 720 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
721 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
722 (IV)prog->check_offset_min,
723 (IV)srch_start_shift,
724 (IV)srch_end_shift,
725 (IV)prog->check_end_shift);
726 });
727
bbe252da 728 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
729 start_point= (U8*)(s + srch_start_shift);
730 end_point= (U8*)(strend - srch_end_shift);
731 } else {
732 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
733 end_point= HOP3(strend, -srch_end_shift, strbeg);
734 }
6bda09f9 735 DEBUG_OPTIMISE_MORE_r({
56570a2c 736 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 737 (int)(end_point - start_point),
fc8cd66c 738 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
739 start_point);
740 });
741
742 s = fbm_instr( start_point, end_point,
7fba1cd6 743 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 744 }
cad2e5aa
JH
745 /* Update the count-of-usability, remove useless subpatterns,
746 unshift s. */
2c2d71f5 747
ab3bbdeb 748 DEBUG_EXECUTE_r({
f2ed9b32 749 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
750 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
751 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 752 (s ? "Found" : "Did not find"),
f2ed9b32 753 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
754 ? "anchored" : "floating"),
755 quoted,
756 RE_SV_TAIL(check),
757 (s ? " at offset " : "...\n") );
758 });
2c2d71f5
JH
759
760 if (!s)
761 goto fail_finish;
2c2d71f5 762 /* Finish the diagnostic message */
a3621e74 763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 764
1de06328
YO
765 /* XXX dmq: first branch is for positive lookbehind...
766 Our check string is offset from the beginning of the pattern.
767 So we need to do any stclass tests offset forward from that
768 point. I think. :-(
769 */
770
771
772
773 check_at=s;
774
775
2c2d71f5
JH
776 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
777 Start with the other substr.
778 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 779 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
780 *always* match. Probably should be marked during compile...
781 Probably it is right to do no SCREAM here...
782 */
783
f2ed9b32 784 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
785 : (prog->float_substr && prog->anchored_substr))
786 {
30944b6d 787 /* Take into account the "other" substring. */
2c2d71f5
JH
788 /* XXXX May be hopelessly wrong for UTF... */
789 if (!other_last)
6eb5f6b9 790 other_last = strpos;
f2ed9b32 791 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
792 do_other_anchored:
793 {
890ce7af
AL
794 char * const last = HOP3c(s, -start_shift, strbeg);
795 char *last1, *last2;
be8e71aa 796 char * const saved_s = s;
33b8afdf 797 SV* must;
2c2d71f5 798
2c2d71f5
JH
799 t = s - prog->check_offset_max;
800 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 801 && (!utf8_target
0ce71af7 802 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 803 && t > strpos)))
6f207bd3 804 NOOP;
2c2d71f5
JH
805 else
806 t = strpos;
1aa99e6b 807 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
808 if (t < other_last) /* These positions already checked */
809 t = other_last;
1aa99e6b 810 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
811 if (last < last1)
812 last1 = last;
1de06328
YO
813 /* XXXX It is not documented what units *_offsets are in.
814 We assume bytes, but this is clearly wrong.
815 Meaning this code needs to be carefully reviewed for errors.
816 dmq.
817 */
818
2c2d71f5 819 /* On end-of-str: see comment below. */
f2ed9b32 820 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
821 if (must == &PL_sv_undef) {
822 s = (char*)NULL;
1de06328 823 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
824 }
825 else
826 s = fbm_instr(
827 (unsigned char*)t,
828 HOP3(HOP3(last1, prog->anchored_offset, strend)
829 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
830 must,
7fba1cd6 831 multiline ? FBMrf_MULTILINE : 0
33b8afdf 832 );
ab3bbdeb 833 DEBUG_EXECUTE_r({
f2ed9b32 834 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
835 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
836 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 837 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
838 quoted, RE_SV_TAIL(must));
839 });
840
841
2c2d71f5
JH
842 if (!s) {
843 if (last1 >= last2) {
a3621e74 844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
845 ", giving up...\n"));
846 goto fail_finish;
847 }
a3621e74 848 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 849 ", trying floating at offset %ld...\n",
be8e71aa 850 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
851 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
852 s = HOP3c(last, 1, strend);
2c2d71f5
JH
853 goto restart;
854 }
855 else {
a3621e74 856 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 857 (long)(s - i_strpos)));
1aa99e6b
IH
858 t = HOP3c(s, -prog->anchored_offset, strbeg);
859 other_last = HOP3c(s, 1, strend);
be8e71aa 860 s = saved_s;
2c2d71f5
JH
861 if (t == strpos)
862 goto try_at_start;
2c2d71f5
JH
863 goto try_at_offset;
864 }
30944b6d 865 }
2c2d71f5
JH
866 }
867 else { /* Take into account the floating substring. */
33b8afdf 868 char *last, *last1;
be8e71aa 869 char * const saved_s = s;
33b8afdf
JH
870 SV* must;
871
872 t = HOP3c(s, -start_shift, strbeg);
873 last1 = last =
874 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
875 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
876 last = HOP3c(t, prog->float_max_offset, strend);
877 s = HOP3c(t, prog->float_min_offset, strend);
878 if (s < other_last)
879 s = other_last;
2c2d71f5 880 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 881 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
882 /* fbm_instr() takes into account exact value of end-of-str
883 if the check is SvTAIL(ed). Since false positives are OK,
884 and end-of-str is not later than strend we are OK. */
885 if (must == &PL_sv_undef) {
886 s = (char*)NULL;
1de06328 887 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
888 }
889 else
2c2d71f5 890 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
891 (unsigned char*)last + SvCUR(must)
892 - (SvTAIL(must)!=0),
7fba1cd6 893 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 894 DEBUG_EXECUTE_r({
f2ed9b32 895 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
896 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
897 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 898 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
899 quoted, RE_SV_TAIL(must));
900 });
33b8afdf
JH
901 if (!s) {
902 if (last1 == last) {
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
904 ", giving up...\n"));
905 goto fail_finish;
2c2d71f5 906 }
a3621e74 907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 908 ", trying anchored starting at offset %ld...\n",
be8e71aa 909 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
910 other_last = last;
911 s = HOP3c(t, 1, strend);
912 goto restart;
913 }
914 else {
a3621e74 915 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
916 (long)(s - i_strpos)));
917 other_last = s; /* Fix this later. --Hugo */
be8e71aa 918 s = saved_s;
33b8afdf
JH
919 if (t == strpos)
920 goto try_at_start;
921 goto try_at_offset;
922 }
2c2d71f5 923 }
cad2e5aa 924 }
2c2d71f5 925
1de06328 926
9ef43ace 927 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 928
6bda09f9 929 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
930 PerlIO_printf(Perl_debug_log,
931 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
932 (IV)prog->check_offset_min,
933 (IV)prog->check_offset_max,
934 (IV)(s-strpos),
935 (IV)(t-strpos),
936 (IV)(t-s),
937 (IV)(strend-strpos)
938 )
939 );
940
2c2d71f5 941 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 942 && (!utf8_target
9ef43ace 943 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
944 && t > strpos)))
945 {
2c2d71f5
JH
946 /* Fixed substring is found far enough so that the match
947 cannot start at strpos. */
948 try_at_offset:
cad2e5aa 949 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
950 /* Eventually fbm_*() should handle this, but often
951 anchored_offset is not 0, so this check will not be wasted. */
952 /* XXXX In the code below we prefer to look for "^" even in
953 presence of anchored substrings. And we search even
954 beyond the found float position. These pessimizations
955 are historical artefacts only. */
956 find_anchor:
2c2d71f5 957 while (t < strend - prog->minlen) {
cad2e5aa 958 if (*t == '\n') {
4ee3650e 959 if (t < check_at - prog->check_offset_min) {
f2ed9b32 960 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
961 /* Since we moved from the found position,
962 we definitely contradict the found anchored
30944b6d
IZ
963 substr. Due to the above check we do not
964 contradict "check" substr.
965 Thus we can arrive here only if check substr
966 is float. Redo checking for "other"=="fixed".
967 */
9041c2e3 968 strpos = t + 1;
a3621e74 969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 970 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
971 goto do_other_anchored;
972 }
4ee3650e
GS
973 /* We don't contradict the found floating substring. */
974 /* XXXX Why not check for STCLASS? */
cad2e5aa 975 s = t + 1;
a3621e74 976 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 977 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
978 goto set_useful;
979 }
4ee3650e
GS
980 /* Position contradicts check-string */
981 /* XXXX probably better to look for check-string
982 than for "\n", so one should lower the limit for t? */
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 984 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 985 other_last = strpos = s = t + 1;
cad2e5aa
JH
986 goto restart;
987 }
988 t++;
989 }
a3621e74 990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 991 PL_colors[0], PL_colors[1]));
2c2d71f5 992 goto fail_finish;
cad2e5aa 993 }
f5952150 994 else {
a3621e74 995 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 996 PL_colors[0], PL_colors[1]));
f5952150 997 }
cad2e5aa
JH
998 s = t;
999 set_useful:
f2ed9b32 1000 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1001 }
1002 else {
f5952150 1003 /* The found string does not prohibit matching at strpos,
2c2d71f5 1004 - no optimization of calling REx engine can be performed,
f5952150
GS
1005 unless it was an MBOL and we are not after MBOL,
1006 or a future STCLASS check will fail this. */
2c2d71f5
JH
1007 try_at_start:
1008 /* Even in this situation we may use MBOL flag if strpos is offset
1009 wrt the start of the string. */
05b4157f 1010 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1011 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1012 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1013 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1014 {
cad2e5aa
JH
1015 t = strpos;
1016 goto find_anchor;
1017 }
a3621e74 1018 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1019 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1020 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1021 );
2c2d71f5 1022 success_at_start:
bbe252da 1023 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1024 && (utf8_target ? (
33b8afdf
JH
1025 prog->check_utf8 /* Could be deleted already */
1026 && --BmUSEFUL(prog->check_utf8) < 0
1027 && (prog->check_utf8 == prog->float_utf8)
1028 ) : (
1029 prog->check_substr /* Could be deleted already */
1030 && --BmUSEFUL(prog->check_substr) < 0
1031 && (prog->check_substr == prog->float_substr)
1032 )))
66e933ab 1033 {
cad2e5aa 1034 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1035 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1036 /* XXX Does the destruction order has to change with utf8_target? */
1037 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1038 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1039 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1040 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1041 check = NULL; /* abort */
cad2e5aa 1042 s = strpos;
486ec47a 1043 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1044 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1045 if (prog->intflags & PREGf_IMPLICIT)
1046 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1047 /* XXXX This is a remnant of the old implementation. It
1048 looks wasteful, since now INTUIT can use many
6eb5f6b9 1049 other heuristics. */
bbe252da 1050 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1051 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1052 }
1053 else
1054 s = strpos;
1055 }
1056
6eb5f6b9
JH
1057 /* Last resort... */
1058 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1059 /* trie stclasses are too expensive to use here, we are better off to
1060 leave it to regmatch itself */
f8fc2ecf 1061 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1062 /* minlen == 0 is possible if regstclass is \b or \B,
1063 and the fixed substr is ''$.
1064 Since minlen is already taken into account, s+1 is before strend;
1065 accidentally, minlen >= 1 guaranties no false positives at s + 1
1066 even for \b or \B. But (minlen? 1 : 0) below assumes that
1067 regstclass does not come from lookahead... */
1068 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1069 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1070 const U8* const str = (U8*)STRING(progi->regstclass);
1071 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1072 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1073 : 1);
1de06328
YO
1074 char * endpos;
1075 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1076 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1077 else if (prog->float_substr || prog->float_utf8)
1078 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1079 else
1080 endpos= strend;
1081
d8080198
YO
1082 if (checked_upto < s)
1083 checked_upto = s;
1084 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1085 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1086
6eb5f6b9 1087 t = s;
d8080198
YO
1088 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1089 if (s) {
1090 checked_upto = s;
1091 } else {
6eb5f6b9 1092#ifdef DEBUGGING
cbbf8932 1093 const char *what = NULL;
6eb5f6b9
JH
1094#endif
1095 if (endpos == strend) {
a3621e74 1096 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1097 "Could not match STCLASS...\n") );
1098 goto fail;
1099 }
a3621e74 1100 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1101 "This position contradicts STCLASS...\n") );
bbe252da 1102 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1103 goto fail;
d8080198
YO
1104 checked_upto = HOPBACKc(endpos, start_shift);
1105 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1106 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1107 /* Contradict one of substrings */
33b8afdf 1108 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1109 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1110 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1111 hop_and_restart:
1aa99e6b 1112 s = HOP3c(t, 1, strend);
66e933ab
GS
1113 if (s + start_shift + end_shift > strend) {
1114 /* XXXX Should be taken into account earlier? */
a3621e74 1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1116 "Could not match STCLASS...\n") );
1117 goto fail;
1118 }
5e39e1e5
HS
1119 if (!check)
1120 goto giveup;
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1122 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1123 what, (long)(s + start_shift - i_strpos)) );
1124 goto restart;
1125 }
66e933ab 1126 /* Have both, check_string is floating */
6eb5f6b9
JH
1127 if (t + start_shift >= check_at) /* Contradicts floating=check */
1128 goto retry_floating_check;
1129 /* Recheck anchored substring, but not floating... */
9041c2e3 1130 s = check_at;
5e39e1e5
HS
1131 if (!check)
1132 goto giveup;
a3621e74 1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1134 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1135 (long)(other_last - i_strpos)) );
1136 goto do_other_anchored;
1137 }
60e71179
GS
1138 /* Another way we could have checked stclass at the
1139 current position only: */
1140 if (ml_anch) {
1141 s = t = t + 1;
5e39e1e5
HS
1142 if (!check)
1143 goto giveup;
a3621e74 1144 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1145 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1146 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1147 goto try_at_offset;
66e933ab 1148 }
f2ed9b32 1149 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1150 goto fail;
486ec47a 1151 /* Check is floating substring. */
6eb5f6b9
JH
1152 retry_floating_check:
1153 t = check_at - start_shift;
a3621e74 1154 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1155 goto hop_and_restart;
1156 }
b7953727 1157 if (t != s) {
a3621e74 1158 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1159 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1160 (long)(t - i_strpos), (long)(s - i_strpos))
1161 );
1162 }
1163 else {
a3621e74 1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1165 "Does not contradict STCLASS...\n");
1166 );
1167 }
6eb5f6b9 1168 }
5e39e1e5 1169 giveup:
a3621e74 1170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1171 PL_colors[4], (check ? "Guessed" : "Giving up"),
1172 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1173 return s;
2c2d71f5
JH
1174
1175 fail_finish: /* Substring not found */
33b8afdf 1176 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1177 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1178 fail:
a3621e74 1179 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1180 PL_colors[4], PL_colors[5]));
bd61b366 1181 return NULL;
cad2e5aa 1182}
9661b544 1183
a0a388a1
YO
1184#define DECL_TRIE_TYPE(scan) \
1185 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1186 trie_type = ((scan->flags == EXACT) \
1187 ? (utf8_target ? trie_utf8 : trie_plain) \
1188 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1189
1190#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1191uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1192 STRLEN skiplen; \
1193 switch (trie_type) { \
1194 case trie_utf8_fold: \
1195 if ( foldlen>0 ) { \
1196 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1197 foldlen -= len; \
1198 uscan += len; \
1199 len=0; \
1200 } else { \
1201 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1202 len = UTF8SKIP(uc); \
1203 skiplen = UNISKIP( uvc ); \
1204 foldlen -= skiplen; \
1205 uscan = foldbuf + skiplen; \
1206 } \
1207 break; \
1208 case trie_latin_utf8_fold: \
1209 if ( foldlen>0 ) { \
1210 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1211 foldlen -= len; \
1212 uscan += len; \
1213 len=0; \
1214 } else { \
1215 len = 1; \
1216 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1217 skiplen = UNISKIP( uvc ); \
1218 foldlen -= skiplen; \
1219 uscan = foldbuf + skiplen; \
1220 } \
1221 break; \
1222 case trie_utf8: \
1223 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1224 break; \
1225 case trie_plain: \
1226 uvc = (UV)*uc; \
1227 len = 1; \
1228 } \
1229 if (uvc < 256) { \
1230 charid = trie->charmap[ uvc ]; \
1231 } \
1232 else { \
1233 charid = 0; \
1234 if (widecharmap) { \
1235 SV** const svpp = hv_fetch(widecharmap, \
1236 (char*)&uvc, sizeof(UV), 0); \
1237 if (svpp) \
1238 charid = (U16)SvIV(*svpp); \
1239 } \
1240 } \
4cadc6a9
YO
1241} STMT_END
1242
4cadc6a9
YO
1243#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1244STMT_START { \
1245 while (s <= e) { \
1246 if ( (CoNd) \
fac1af77 1247 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1248 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1249 goto got_it; \
1250 s++; \
1251 } \
1252} STMT_END
1253
1254#define REXEC_FBC_UTF8_SCAN(CoDe) \
1255STMT_START { \
1256 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1257 CoDe \
1258 s += uskip; \
1259 } \
1260} STMT_END
1261
1262#define REXEC_FBC_SCAN(CoDe) \
1263STMT_START { \
1264 while (s < strend) { \
1265 CoDe \
1266 s++; \
1267 } \
1268} STMT_END
1269
1270#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1271REXEC_FBC_UTF8_SCAN( \
1272 if (CoNd) { \
24b23f37 1273 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1274 goto got_it; \
1275 else \
1276 tmp = doevery; \
1277 } \
1278 else \
1279 tmp = 1; \
1280)
1281
1282#define REXEC_FBC_CLASS_SCAN(CoNd) \
1283REXEC_FBC_SCAN( \
1284 if (CoNd) { \
24b23f37 1285 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1286 goto got_it; \
1287 else \
1288 tmp = doevery; \
1289 } \
1290 else \
1291 tmp = 1; \
1292)
1293
1294#define REXEC_FBC_TRYIT \
24b23f37 1295if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1296 goto got_it
1297
e1d1eefb 1298#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1299 if (utf8_target) { \
e1d1eefb
YO
1300 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1301 } \
1302 else { \
1303 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1304 }
e1d1eefb 1305
4cadc6a9 1306#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1307 if (utf8_target) { \
4cadc6a9
YO
1308 UtFpReLoAd; \
1309 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1310 } \
1311 else { \
1312 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1313 }
4cadc6a9
YO
1314
1315#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1316 PL_reg_flags |= RF_tainted; \
f2ed9b32 1317 if (utf8_target) { \
4cadc6a9
YO
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 } \
1320 else { \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1322 }
4cadc6a9 1323
786e8c11
YO
1324#define DUMP_EXEC_POS(li,s,doutf8) \
1325 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1326
cfaf538b
KW
1327
1328#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1329 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1330 tmp = TEST_NON_UTF8(tmp); \
1331 REXEC_FBC_UTF8_SCAN( \
1332 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1333 tmp = !tmp; \
1334 IF_SUCCESS; \
1335 } \
1336 else { \
1337 IF_FAIL; \
1338 } \
1339 ); \
1340
1341#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1342 if (s == PL_bostr) { \
1343 tmp = '\n'; \
1344 } \
1345 else { \
1346 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1347 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1348 } \
1349 tmp = TeSt1_UtF8; \
1350 LOAD_UTF8_CHARCLASS_ALNUM(); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! (TeSt2_UtF8)) { \
1353 tmp = !tmp; \
1354 IF_SUCCESS; \
1355 } \
1356 else { \
1357 IF_FAIL; \
1358 } \
1359 ); \
1360
63ac0dad
KW
1361/* The only difference between the BOUND and NBOUND cases is that
1362 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1363 * NBOUND. This is accomplished by passing it in either the if or else clause,
1364 * with the other one being empty */
1365#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1366 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1367
1368#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1369 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1370
1371#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1372 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1373
1374#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1375 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1376
63ac0dad
KW
1377
1378/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1379 * be passed in completely with the variable name being tested, which isn't
1380 * such a clean interface, but this is easier to read than it was before. We
1381 * are looking for the boundary (or non-boundary between a word and non-word
1382 * character. The utf8 and non-utf8 cases have the same logic, but the details
1383 * must be different. Find the "wordness" of the character just prior to this
1384 * one, and compare it with the wordness of this one. If they differ, we have
1385 * a boundary. At the beginning of the string, pretend that the previous
1386 * character was a new-line */
cfaf538b 1387#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1388 if (utf8_target) { \
cfaf538b 1389 UTF8_CODE \
63ac0dad
KW
1390 } \
1391 else { /* Not utf8 */ \
1392 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1393 tmp = TEST_NON_UTF8(tmp); \
1394 REXEC_FBC_SCAN( \
1395 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1396 tmp = !tmp; \
1397 IF_SUCCESS; \
1398 } \
1399 else { \
1400 IF_FAIL; \
1401 } \
1402 ); \
1403 } \
1404 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1405 goto got_it;
1406
786e8c11
YO
1407/* We know what class REx starts with. Try to find this position... */
1408/* if reginfo is NULL, its a dryrun */
1409/* annoyingly all the vars in this routine have different names from their counterparts
1410 in regmatch. /grrr */
1411
3c3eec57 1412STATIC char *
07be1b83 1413S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1414 const char *strend, regmatch_info *reginfo)
a687059c 1415{
27da23d5 1416 dVAR;
bbe252da 1417 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1418 char *pat_string; /* The pattern's exactish string */
1419 char *pat_end; /* ptr to end char of pat_string */
1420 re_fold_t folder; /* Function for computing non-utf8 folds */
1421 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1422 STRLEN ln;
5dab1207 1423 STRLEN lnc;
eb578fdb 1424 STRLEN uskip;
fac1af77
KW
1425 U8 c1;
1426 U8 c2;
6eb5f6b9 1427 char *e;
eb578fdb
KW
1428 I32 tmp = 1; /* Scratch variable? */
1429 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1430 UV utf8_fold_flags = 0;
f8fc2ecf 1431 RXi_GET_DECL(prog,progi);
7918f24d
NC
1432
1433 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1434
6eb5f6b9
JH
1435 /* We know what class it must start with. */
1436 switch (OP(c)) {
f56b6394 1437 case ANYOFV:
6eb5f6b9 1438 case ANYOF:
f56b6394 1439 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1440 STRLEN inclasslen = strend - s;
1441 REXEC_FBC_UTF8_CLASS_SCAN(
1442 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1443 }
1444 else {
6ef69d56 1445 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1446 }
6eb5f6b9 1447 break;
f33976b4 1448 case CANY:
4cadc6a9 1449 REXEC_FBC_SCAN(
24b23f37 1450 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1451 goto got_it;
1452 else
1453 tmp = doevery;
4cadc6a9 1454 );
f33976b4 1455 break;
2f7f8cb1
KW
1456
1457 case EXACTFA:
1458 if (UTF_PATTERN || utf8_target) {
1459 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1460 goto do_exactf_utf8;
1461 }
1462 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1463 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1464 goto do_exactf_non_utf8; /* isn't dealt with by these */
1465
6eb5f6b9 1466 case EXACTF:
62bf7766 1467 if (utf8_target) {
77a6d856
KW
1468
1469 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1470 utf8_fold_flags = 0;
fac1af77
KW
1471 goto do_exactf_utf8;
1472 }
1473 fold_array = PL_fold;
1474 folder = foldEQ;
1475 goto do_exactf_non_utf8;
1476
1477 case EXACTFL:
1478 if (UTF_PATTERN || utf8_target) {
17580e7a 1479 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1480 goto do_exactf_utf8;
1481 }
1482 fold_array = PL_fold_locale;
1483 folder = foldEQ_locale;
16d951b7
KW
1484 goto do_exactf_non_utf8;
1485
3c760661
KW
1486 case EXACTFU_SS:
1487 if (UTF_PATTERN) {
1488 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1489 }
1490 goto do_exactf_utf8;
1491
fab2782b 1492 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1493 case EXACTFU:
1494 if (UTF_PATTERN || utf8_target) {
77a6d856 1495 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1496 goto do_exactf_utf8;
1497 }
1498
1499 /* Any 'ss' in the pattern should have been replaced by regcomp,
1500 * so we don't have to worry here about this single special case
1501 * in the Latin1 range */
1502 fold_array = PL_fold_latin1;
1503 folder = foldEQ_latin1;
fac1af77
KW
1504
1505 /* FALL THROUGH */
1506
62bf7766
KW
1507 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1508 are no glitches with fold-length differences
1509 between the target string and pattern */
fac1af77
KW
1510
1511 /* The idea in the non-utf8 EXACTF* cases is to first find the
1512 * first character of the EXACTF* node and then, if necessary,
1513 * case-insensitively compare the full text of the node. c1 is the
1514 * first character. c2 is its fold. This logic will not work for
1515 * Unicode semantics and the german sharp ss, which hence should
1516 * not be compiled into a node that gets here. */
1517 pat_string = STRING(c);
1518 ln = STR_LEN(c); /* length to match in octets/bytes */
1519
8a90a8fe
KW
1520 /* We know that we have to match at least 'ln' bytes (which is the
1521 * same as characters, since not utf8). If we have to match 3
1522 * characters, and there are only 2 availabe, we know without
1523 * trying that it will fail; so don't start a match past the
1524 * required minimum number from the far end */
fac1af77
KW
1525 e = HOP3c(strend, -((I32)ln), s);
1526
1527 if (!reginfo && e < s) {
1528 e = s; /* Due to minlen logic of intuit() */
1529 }
1530
1531 c1 = *pat_string;
1532 c2 = fold_array[c1];
1533 if (c1 == c2) { /* If char and fold are the same */
1534 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1535 }
1536 else {
1537 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1538 }
1539 break;
1540
1541 do_exactf_utf8:
e067297c
KW
1542 {
1543 unsigned expansion;
1544
fac1af77
KW
1545
1546 /* If one of the operands is in utf8, we can't use the simpler
1547 * folding above, due to the fact that many different characters
1548 * can have the same fold, or portion of a fold, or different-
1549 * length fold */
1550 pat_string = STRING(c);
1551 ln = STR_LEN(c); /* length to match in octets/bytes */
1552 pat_end = pat_string + ln;
1553 lnc = (UTF_PATTERN) /* length to match in characters */
1554 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1555 : ln;
1556
e067297c
KW
1557 /* We have 'lnc' characters to match in the pattern, but because of
1558 * multi-character folding, each character in the target can match
1559 * up to 3 characters (Unicode guarantees it will never exceed
1560 * this) if it is utf8-encoded; and up to 2 if not (based on the
1561 * fact that the Latin 1 folds are already determined, and the
1562 * only multi-char fold in that range is the sharp-s folding to
1563 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1564 * string character. Adjust lnc accordingly, rounding up, so that
1565 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1566 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1567 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1568
1569 /* As in the non-UTF8 case, if we have to match 3 characters, and
1570 * only 2 are left, it's guaranteed to fail, so don't start a
1571 * match that would require us to go beyond the end of the string
1572 */
1573 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1574
1575 if (!reginfo && e < s) {
1576 e = s; /* Due to minlen logic of intuit() */
1577 }
1578
b33105db
KW
1579 /* XXX Note that we could recalculate e to stop the loop earlier,
1580 * as the worst case expansion above will rarely be met, and as we
1581 * go along we would usually find that e moves further to the left.
1582 * This would happen only after we reached the point in the loop
1583 * where if there were no expansion we should fail. Unclear if
1584 * worth the expense */
e067297c 1585
fac1af77
KW
1586 while (s <= e) {
1587 char *my_strend= (char *)strend;
1588 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1589 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1590 && (!reginfo || regtry(reginfo, &s)) )
1591 {
1592 goto got_it;
1593 }
bbdd8bad 1594 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1595 }
1596 break;
e067297c 1597 }
bbce6d69 1598 case BOUNDL:
3280af22 1599 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1600 FBC_BOUND(isALNUM_LC,
1601 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1602 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1603 break;
bbce6d69 1604 case NBOUNDL:
3280af22 1605 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1606 FBC_NBOUND(isALNUM_LC,
1607 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1608 isALNUM_LC_utf8((U8*)s));
1609 break;
1610 case BOUND:
1611 FBC_BOUND(isWORDCHAR,
1612 isALNUM_uni(tmp),
1613 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1614 break;
cfaf538b
KW
1615 case BOUNDA:
1616 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1617 isWORDCHAR_A(tmp),
1618 isWORDCHAR_A((U8*)s));
1619 break;
a0d0e21e 1620 case NBOUND:
63ac0dad
KW
1621 FBC_NBOUND(isWORDCHAR,
1622 isALNUM_uni(tmp),
1623 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1624 break;
cfaf538b
KW
1625 case NBOUNDA:
1626 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1627 isWORDCHAR_A(tmp),
1628 isWORDCHAR_A((U8*)s));
1629 break;
63ac0dad
KW
1630 case BOUNDU:
1631 FBC_BOUND(isWORDCHAR_L1,
1632 isALNUM_uni(tmp),
1633 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1634 break;
1635 case NBOUNDU:
1636 FBC_NBOUND(isWORDCHAR_L1,
1637 isALNUM_uni(tmp),
1638 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1639 break;
bbce6d69 1640 case ALNUML:
4cadc6a9
YO
1641 REXEC_FBC_CSCAN_TAINT(
1642 isALNUM_LC_utf8((U8*)s),
1643 isALNUM_LC(*s)
1644 );
6895a8aa 1645 break;
980866de
KW
1646 case ALNUMU:
1647 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1648 LOAD_UTF8_CHARCLASS_ALNUM(),
1649 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1650 isWORDCHAR_L1((U8) *s)
1651 );
6895a8aa 1652 break;
980866de
KW
1653 case ALNUM:
1654 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1655 LOAD_UTF8_CHARCLASS_ALNUM(),
1656 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1657 isWORDCHAR((U8) *s)
1658 );
6895a8aa 1659 break;
cfaf538b 1660 case ALNUMA:
8e9da4d4
KW
1661 /* Don't need to worry about utf8, as it can match only a single
1662 * byte invariant character */
cfaf538b 1663 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1664 break;
980866de
KW
1665 case NALNUMU:
1666 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1667 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1668 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1669 ! isWORDCHAR_L1((U8) *s)
1670 );
6895a8aa 1671 break;
a0d0e21e 1672 case NALNUM:
4cadc6a9 1673 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1674 LOAD_UTF8_CHARCLASS_ALNUM(),
1675 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1676 ! isALNUM(*s)
4cadc6a9 1677 );
6895a8aa 1678 break;
cfaf538b 1679 case NALNUMA:
8e9da4d4
KW
1680 REXEC_FBC_CSCAN(
1681 !isWORDCHAR_A(*s),
1682 !isWORDCHAR_A(*s)
1683 );
1684 break;
bbce6d69 1685 case NALNUML:
4cadc6a9
YO
1686 REXEC_FBC_CSCAN_TAINT(
1687 !isALNUM_LC_utf8((U8*)s),
1688 !isALNUM_LC(*s)
1689 );
6895a8aa 1690 break;
980866de
KW
1691 case SPACEU:
1692 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1693 LOAD_UTF8_CHARCLASS_SPACE(),
1694 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1695 isSPACE_L1((U8) *s)
1696 );
6895a8aa 1697 break;
a0d0e21e 1698 case SPACE:
4cadc6a9 1699 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1700 LOAD_UTF8_CHARCLASS_SPACE(),
1701 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1702 isSPACE((U8) *s)
4cadc6a9 1703 );
6895a8aa 1704 break;
cfaf538b 1705 case SPACEA:
8e9da4d4
KW
1706 /* Don't need to worry about utf8, as it can match only a single
1707 * byte invariant character */
cfaf538b 1708 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1709 break;
bbce6d69 1710 case SPACEL:
4cadc6a9 1711 REXEC_FBC_CSCAN_TAINT(
6bbba904 1712 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1713 isSPACE_LC(*s)
1714 );
6895a8aa 1715 break;
980866de
KW
1716 case NSPACEU:
1717 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1718 LOAD_UTF8_CHARCLASS_SPACE(),
1719 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1720 ! isSPACE_L1((U8) *s)
1721 );
6895a8aa 1722 break;
a0d0e21e 1723 case NSPACE:
4cadc6a9 1724 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1725 LOAD_UTF8_CHARCLASS_SPACE(),
1726 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1727 ! isSPACE((U8) *s)
4cadc6a9 1728 );
6895a8aa 1729 break;
cfaf538b 1730 case NSPACEA:
8e9da4d4
KW
1731 REXEC_FBC_CSCAN(
1732 !isSPACE_A(*s),
1733 !isSPACE_A(*s)
1734 );
1735 break;
bbce6d69 1736 case NSPACEL:
4cadc6a9 1737 REXEC_FBC_CSCAN_TAINT(
6bbba904 1738 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1739 !isSPACE_LC(*s)
1740 );
6895a8aa 1741 break;
a0d0e21e 1742 case DIGIT:
4cadc6a9 1743 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1744 LOAD_UTF8_CHARCLASS_DIGIT(),
1745 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1746 isDIGIT(*s)
1747 );
6895a8aa 1748 break;
cfaf538b 1749 case DIGITA:
8e9da4d4
KW
1750 /* Don't need to worry about utf8, as it can match only a single
1751 * byte invariant character */
cfaf538b 1752 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1753 break;
b8c5462f 1754 case DIGITL:
4cadc6a9
YO
1755 REXEC_FBC_CSCAN_TAINT(
1756 isDIGIT_LC_utf8((U8*)s),
1757 isDIGIT_LC(*s)
1758 );
6895a8aa 1759 break;
a0d0e21e 1760 case NDIGIT:
4cadc6a9 1761 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1762 LOAD_UTF8_CHARCLASS_DIGIT(),
1763 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1764 !isDIGIT(*s)
1765 );
6895a8aa 1766 break;
cfaf538b 1767 case NDIGITA:
8e9da4d4
KW
1768 REXEC_FBC_CSCAN(
1769 !isDIGIT_A(*s),
1770 !isDIGIT_A(*s)
1771 );
1772 break;
b8c5462f 1773 case NDIGITL:
4cadc6a9
YO
1774 REXEC_FBC_CSCAN_TAINT(
1775 !isDIGIT_LC_utf8((U8*)s),
1776 !isDIGIT_LC(*s)
1777 );
6895a8aa 1778 break;
e1d1eefb
YO
1779 case LNBREAK:
1780 REXEC_FBC_CSCAN(
1781 is_LNBREAK_utf8(s),
1782 is_LNBREAK_latin1(s)
1783 );
6895a8aa 1784 break;
e1d1eefb
YO
1785 case VERTWS:
1786 REXEC_FBC_CSCAN(
1787 is_VERTWS_utf8(s),
1788 is_VERTWS_latin1(s)
1789 );
6895a8aa 1790 break;
e1d1eefb
YO
1791 case NVERTWS:
1792 REXEC_FBC_CSCAN(
1793 !is_VERTWS_utf8(s),
1794 !is_VERTWS_latin1(s)
1795 );
6895a8aa 1796 break;
e1d1eefb
YO
1797 case HORIZWS:
1798 REXEC_FBC_CSCAN(
1799 is_HORIZWS_utf8(s),
1800 is_HORIZWS_latin1(s)
1801 );
6895a8aa 1802 break;
e1d1eefb
YO
1803 case NHORIZWS:
1804 REXEC_FBC_CSCAN(
1805 !is_HORIZWS_utf8(s),
1806 !is_HORIZWS_latin1(s)
1807 );
6895a8aa 1808 break;
0658cdde
KW
1809 case POSIXA:
1810 /* Don't need to worry about utf8, as it can match only a single
1811 * byte invariant character. The flag in this node type is the
1812 * class number to pass to _generic_isCC() to build a mask for
1813 * searching in PL_charclass[] */
1814 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1815 break;
1816 case NPOSIXA:
1817 REXEC_FBC_CSCAN(
1818 !_generic_isCC_A(*s, FLAGS(c)),
1819 !_generic_isCC_A(*s, FLAGS(c))
1820 );
1821 break;
1822
1de06328
YO
1823 case AHOCORASICKC:
1824 case AHOCORASICK:
07be1b83 1825 {
a0a388a1 1826 DECL_TRIE_TYPE(c);
07be1b83
YO
1827 /* what trie are we using right now */
1828 reg_ac_data *aho
f8fc2ecf 1829 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1830 reg_trie_data *trie
1831 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1832 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1833
1834 const char *last_start = strend - trie->minlen;
6148ee25 1835#ifdef DEBUGGING
07be1b83 1836 const char *real_start = s;
6148ee25 1837#endif
07be1b83 1838 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1839 SV *sv_points;
1840 U8 **points; /* map of where we were in the input string
786e8c11 1841 when reading a given char. For ASCII this
be8e71aa 1842 is unnecessary overhead as the relationship
38a44b82
NC
1843 is always 1:1, but for Unicode, especially
1844 case folded Unicode this is not true. */
f9e705e8 1845 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1846 U8 *bitmap=NULL;
1847
07be1b83
YO
1848
1849 GET_RE_DEBUG_FLAGS_DECL;
1850
be8e71aa
YO
1851 /* We can't just allocate points here. We need to wrap it in
1852 * an SV so it gets freed properly if there is a croak while
1853 * running the match */
1854 ENTER;
1855 SAVETMPS;
1856 sv_points=newSV(maxlen * sizeof(U8 *));
1857 SvCUR_set(sv_points,
1858 maxlen * sizeof(U8 *));
1859 SvPOK_on(sv_points);
1860 sv_2mortal(sv_points);
1861 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1862 if ( trie_type != trie_utf8_fold
1863 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1864 {
786e8c11
YO
1865 if (trie->bitmap)
1866 bitmap=(U8*)trie->bitmap;
1867 else
1868 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1869 }
786e8c11
YO
1870 /* this is the Aho-Corasick algorithm modified a touch
1871 to include special handling for long "unknown char"
1872 sequences. The basic idea being that we use AC as long
1873 as we are dealing with a possible matching char, when
1874 we encounter an unknown char (and we have not encountered
1875 an accepting state) we scan forward until we find a legal
1876 starting char.
1877 AC matching is basically that of trie matching, except
1878 that when we encounter a failing transition, we fall back
1879 to the current states "fail state", and try the current char
1880 again, a process we repeat until we reach the root state,
1881 state 1, or a legal transition. If we fail on the root state
1882 then we can either terminate if we have reached an accepting
1883 state previously, or restart the entire process from the beginning
1884 if we have not.
1885
1886 */
07be1b83
YO
1887 while (s <= last_start) {
1888 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1889 U8 *uc = (U8*)s;
1890 U16 charid = 0;
1891 U32 base = 1;
1892 U32 state = 1;
1893 UV uvc = 0;
1894 STRLEN len = 0;
1895 STRLEN foldlen = 0;
1896 U8 *uscan = (U8*)NULL;
1897 U8 *leftmost = NULL;
786e8c11
YO
1898#ifdef DEBUGGING
1899 U32 accepted_word= 0;
1900#endif
07be1b83
YO
1901 U32 pointpos = 0;
1902
1903 while ( state && uc <= (U8*)strend ) {
1904 int failed=0;
786e8c11
YO
1905 U32 word = aho->states[ state ].wordnum;
1906
1de06328
YO
1907 if( state==1 ) {
1908 if ( bitmap ) {
1909 DEBUG_TRIE_EXECUTE_r(
1910 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1911 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1912 (char *)uc, utf8_target );
1de06328
YO
1913 PerlIO_printf( Perl_debug_log,
1914 " Scanning for legal start char...\n");
1915 }
d085b490
YO
1916 );
1917 if (utf8_target) {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 uc += UTF8SKIP(uc);
1920 }
1921 } else {
1922 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1923 uc++;
1924 }
1925 }
1de06328 1926 s= (char *)uc;
786e8c11 1927 }
786e8c11
YO
1928 if (uc >(U8*)last_start) break;
1929 }
1930
1931 if ( word ) {
2e64971a 1932 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1933 if (!leftmost || lpos < leftmost) {
1934 DEBUG_r(accepted_word=word);
07be1b83 1935 leftmost= lpos;
786e8c11 1936 }
07be1b83 1937 if (base==0) break;
786e8c11 1938
07be1b83
YO
1939 }
1940 points[pointpos++ % maxlen]= uc;
55eed653
NC
1941 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1942 uscan, len, uvc, charid, foldlen,
1943 foldbuf, uniflags);
786e8c11
YO
1944 DEBUG_TRIE_EXECUTE_r({
1945 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1946 s, utf8_target );
07be1b83 1947 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1948 " Charid:%3u CP:%4"UVxf" ",
1949 charid, uvc);
1950 });
07be1b83
YO
1951
1952 do {
6148ee25 1953#ifdef DEBUGGING
786e8c11 1954 word = aho->states[ state ].wordnum;
6148ee25 1955#endif
07be1b83
YO
1956 base = aho->states[ state ].trans.base;
1957
786e8c11
YO
1958 DEBUG_TRIE_EXECUTE_r({
1959 if (failed)
1960 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1961 s, utf8_target );
07be1b83 1962 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1963 "%sState: %4"UVxf", word=%"UVxf,
1964 failed ? " Fail transition to " : "",
1965 (UV)state, (UV)word);
1966 });
07be1b83
YO
1967 if ( base ) {
1968 U32 tmp;
6dd2be57 1969 I32 offset;
07be1b83 1970 if (charid &&
6dd2be57
DM
1971 ( ((offset = base + charid
1972 - 1 - trie->uniquecharcount)) >= 0)
1973 && ((U32)offset < trie->lasttrans)
1974 && trie->trans[offset].check == state
1975 && (tmp=trie->trans[offset].next))
07be1b83 1976 {
786e8c11
YO
1977 DEBUG_TRIE_EXECUTE_r(
1978 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1979 state = tmp;
1980 break;
1981 }
1982 else {
786e8c11
YO
1983 DEBUG_TRIE_EXECUTE_r(
1984 PerlIO_printf( Perl_debug_log," - fail\n"));
1985 failed = 1;
1986 state = aho->fail[state];
07be1b83
YO
1987 }
1988 }
1989 else {
1990 /* we must be accepting here */
786e8c11
YO
1991 DEBUG_TRIE_EXECUTE_r(
1992 PerlIO_printf( Perl_debug_log," - accepting\n"));
1993 failed = 1;
07be1b83
YO
1994 break;
1995 }
1996 } while(state);
786e8c11 1997 uc += len;
07be1b83
YO
1998 if (failed) {
1999 if (leftmost)
2000 break;
786e8c11 2001 if (!state) state = 1;
07be1b83
YO
2002 }
2003 }
2004 if ( aho->states[ state ].wordnum ) {
2e64971a 2005 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2006 if (!leftmost || lpos < leftmost) {
2007 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2008 leftmost = lpos;
786e8c11 2009 }
07be1b83 2010 }
07be1b83
YO
2011 if (leftmost) {
2012 s = (char*)leftmost;
786e8c11
YO
2013 DEBUG_TRIE_EXECUTE_r({
2014 PerlIO_printf(
70685ca0
JH
2015 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2016 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2017 );
2018 });
24b23f37 2019 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2020 FREETMPS;
2021 LEAVE;
07be1b83 2022 goto got_it;
be8e71aa 2023 }
07be1b83 2024 s = HOPc(s,1);
786e8c11
YO
2025 DEBUG_TRIE_EXECUTE_r({
2026 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2027 });
07be1b83 2028 } else {
786e8c11
YO
2029 DEBUG_TRIE_EXECUTE_r(
2030 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2031 break;
2032 }
2033 }
be8e71aa
YO
2034 FREETMPS;
2035 LEAVE;
07be1b83
YO
2036 }
2037 break;
b3c9acc1 2038 default:
3c3eec57
GS
2039 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2040 break;
d6a28714 2041 }
6eb5f6b9
JH
2042 return 0;
2043 got_it:
2044 return s;
2045}
2046
fae667d5 2047
6eb5f6b9
JH
2048/*
2049 - regexec_flags - match a regexp against a string
2050 */
2051I32
288b8c02 2052Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9
JH
2053 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2054/* strend: pointer to null at end of string */
2055/* strbeg: real beginning of string */
2056/* minend: end of match must be >=minend after stringarg. */
58e23c8d
YO
2057/* data: May be used for some additional optimizations.
2058 Currently its only used, with a U32 cast, for transmitting
2059 the ganch offset when doing a /g match. This will change */
6eb5f6b9
JH
2060/* nosave: For optimizations. */
2061{
97aff369 2062 dVAR;
288b8c02 2063 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2064 /*register*/ char *s;
eb578fdb 2065 regnode *c;
24b23f37 2066 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2067 I32 minlen; /* must match at least this many chars */
2068 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2069 I32 end_shift = 0; /* Same for the end. */ /* CC */
2070 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2071 char *scream_olds = NULL;
f2ed9b32 2072 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2073 I32 multiline;
f8fc2ecf 2074 RXi_GET_DECL(prog,progi);
3b0527fe 2075 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2076 regexp_paren_pair *swap = NULL;
a3621e74
YO
2077 GET_RE_DEBUG_FLAGS_DECL;
2078
7918f24d 2079 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2080 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2081
2082 /* Be paranoid... */
2083 if (prog == NULL || startpos == NULL) {
2084 Perl_croak(aTHX_ "NULL regexp parameter");
2085 return 0;
2086 }
2087
bbe252da 2088 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2089 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2090
f2ed9b32 2091 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2092 DEBUG_EXECUTE_r(
f2ed9b32 2093 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2094 "Matching");
2095 );
bac06658 2096
6eb5f6b9 2097 minlen = prog->minlen;
1de06328
YO
2098
2099 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2101 "String too short [regexec_flags]...\n"));
2102 goto phooey;
1aa99e6b 2103 }
6eb5f6b9 2104
1de06328 2105
6eb5f6b9 2106 /* Check validity of program. */
f8fc2ecf 2107 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2108 Perl_croak(aTHX_ "corrupted regexp program");
2109 }
2110
2111 PL_reg_flags = 0;
ed301438 2112 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2113 PL_reg_maxiter = 0;
2114
3c8556c3 2115 if (RX_UTF8(rx))
6eb5f6b9
JH
2116 PL_reg_flags |= RF_utf8;
2117
2118 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2119 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2120 PL_bostr = strbeg;
3b0527fe 2121 reginfo.sv = sv;
6eb5f6b9
JH
2122
2123 /* Mark end of line for $ (and such) */
2124 PL_regeol = strend;
2125
2126 /* see how far we have to get to not match where we matched before */
3b0527fe 2127 reginfo.till = startpos+minend;
6eb5f6b9 2128
6eb5f6b9
JH
2129 /* If there is a "must appear" string, look for it. */
2130 s = startpos;
2131
bbe252da 2132 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2133 MAGIC *mg;
2c296965 2134 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2135 reginfo.ganch = startpos + prog->gofs;
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2138 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2139 && SvMAGIC(sv)
14befaf4
DM
2140 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141 && mg->mg_len >= 0) {
3b0527fe 2142 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2145
bbe252da 2146 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2147 if (s > reginfo.ganch)
6eb5f6b9 2148 goto phooey;
58e23c8d 2149 s = reginfo.ganch - prog->gofs;
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2152 if (s < strbeg)
2153 goto phooey;
6eb5f6b9
JH
2154 }
2155 }
58e23c8d 2156 else if (data) {
70685ca0 2157 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161 } else { /* pos() not defined */
3b0527fe 2162 reginfo.ganch = strbeg;
2c296965
YO
2163 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164 "GPOS: reginfo.ganch = strbeg\n"));
2165 }
6eb5f6b9 2166 }
288b8c02 2167 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2168 /* We have to be careful. If the previous successful match
2169 was from this regex we don't want a subsequent partially
2170 successful match to clobber the old results.
2171 So when we detect this possibility we add a swap buffer
2172 to the re, and switch the buffer each match. If we fail
2173 we switch it back, otherwise we leave it swapped.
2174 */
2175 swap = prog->offs;
2176 /* do we need a save destructor here for eval dies? */
2177 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2178 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180 PTR2UV(prog),
2181 PTR2UV(swap),
2182 PTR2UV(prog->offs)
2183 ));
c74340f9 2184 }
a0714e2c 2185 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2186 re_scream_pos_data d;
2187
2188 d.scream_olds = &scream_olds;
2189 d.scream_pos = &scream_pos;
288b8c02 2190 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2191 if (!s) {
a3621e74 2192 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2193 goto phooey; /* not present */
3fa9c3d7 2194 }
6eb5f6b9
JH
2195 }
2196
1de06328 2197
6eb5f6b9
JH
2198
2199 /* Simplest case: anchored match need be tried only once. */
2200 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2201 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2202 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2203 goto got_it;
bbe252da
YO
2204 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2206 {
2207 char *end;
2208
2209 if (minlen)
2210 dontbother = minlen - 1;
1aa99e6b 2211 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2212 /* for multiline we only have to try after newlines */
33b8afdf 2213 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2214 /* because of the goto we can not easily reuse the macros for bifurcating the
2215 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216 if (utf8_target) {
2217 if (s == startpos)
2218 goto after_try_utf8;
2219 while (1) {
2220 if (regtry(&reginfo, &s)) {
2221 goto got_it;
2222 }
2223 after_try_utf8:
2224 if (s > end) {
2225 goto phooey;
2226 }
2227 if (prog->extflags & RXf_USE_INTUIT) {
2228 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229 if (!s) {
2230 goto phooey;
2231 }
2232 }
2233 else {
2234 s += UTF8SKIP(s);
2235 }
2236 }
2237 } /* end search for check string in unicode */
2238 else {
2239 if (s == startpos) {
2240 goto after_try_latin;
2241 }
2242 while (1) {
2243 if (regtry(&reginfo, &s)) {
2244 goto got_it;
2245 }
2246 after_try_latin:
2247 if (s > end) {
2248 goto phooey;
2249 }
2250 if (prog->extflags & RXf_USE_INTUIT) {
2251 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252 if (!s) {
2253 goto phooey;
2254 }
2255 }
2256 else {
2257 s++;
2258 }
2259 }
2260 } /* end search for check string in latin*/
2261 } /* end search for check string */
2262 else { /* search for newline */
2263 if (s > startpos) {
2264 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2265 s--;
92f3d482 2266 }
21eede78
YO
2267 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2269 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2270 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2271 goto got_it;
2272 }
92f3d482
YO
2273 }
2274 } /* end search for newline */
2275 } /* end anchored/multiline check string search */
6eb5f6b9 2276 goto phooey;
bbe252da 2277 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2278 {
486ec47a 2279 /* the warning about reginfo.ganch being used without initialization
bbe252da 2280 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2281 and we only enter this block when the same bit is set. */
58e23c8d 2282 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2283
2284 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2285 goto got_it;
2286 goto phooey;
2287 }
2288
2289 /* Messy cases: unanchored match. */
bbe252da 2290 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2291 /* we have /x+whatever/ */
f2ed9b32 2292 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2293 char ch;
bf93d4cc
GS
2294#ifdef DEBUGGING
2295 int did_match = 0;
2296#endif
f2ed9b32
KW
2297 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2300
f2ed9b32 2301 if (utf8_target) {
4cadc6a9 2302 REXEC_FBC_SCAN(
6eb5f6b9 2303 if (*s == ch) {
a3621e74 2304 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2305 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2306 s += UTF8SKIP(s);
2307 while (s < strend && *s == ch)
2308 s += UTF8SKIP(s);
2309 }
4cadc6a9 2310 );
6eb5f6b9
JH
2311 }
2312 else {
4cadc6a9 2313 REXEC_FBC_SCAN(
6eb5f6b9 2314 if (*s == ch) {
a3621e74 2315 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2316 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2317 s++;
2318 while (s < strend && *s == ch)
2319 s++;
2320 }
4cadc6a9 2321 );
6eb5f6b9 2322 }
a3621e74 2323 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2324 PerlIO_printf(Perl_debug_log,
b7953727
JH
2325 "Did not find anchored character...\n")
2326 );
6eb5f6b9 2327 }
a0714e2c
SS
2328 else if (prog->anchored_substr != NULL
2329 || prog->anchored_utf8 != NULL
2330 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2331 && prog->float_max_offset < strend - s)) {
2332 SV *must;
2333 I32 back_max;
2334 I32 back_min;
2335 char *last;
6eb5f6b9 2336 char *last1; /* Last position checked before */
bf93d4cc
GS
2337#ifdef DEBUGGING
2338 int did_match = 0;
2339#endif
33b8afdf 2340 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2341 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2344 back_max = back_min = prog->anchored_offset;
2345 } else {
f2ed9b32
KW
2346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2349 back_max = prog->float_max_offset;
2350 back_min = prog->float_min_offset;
2351 }
1de06328
YO
2352
2353
33b8afdf
JH
2354 if (must == &PL_sv_undef)
2355 /* could not downgrade utf8 check substring, so must fail */
2356 goto phooey;
2357
1de06328
YO
2358 if (back_min<0) {
2359 last = strend;
2360 } else {
2361 last = HOP3c(strend, /* Cannot start after this */
2362 -(I32)(CHR_SVLEN(must)
2363 - (SvTAIL(must) != 0) + back_min), strbeg);
2364 }
6eb5f6b9
JH
2365 if (s > PL_bostr)
2366 last1 = HOPc(s, -1);
2367 else
2368 last1 = s - 1; /* bogus */
2369
a0288114 2370 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2371 check_substr==must. */
2372 scream_pos = -1;
2373 dontbother = end_shift;
2374 strend = HOPc(strend, -dontbother);
2375 while ( (s <= last) &&
c33e64f0 2376 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2377 (unsigned char*)strend, must,
c33e64f0 2378 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2379 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2380 if (HOPc(s, -back_max) > last1) {
2381 last1 = HOPc(s, -back_min);
2382 s = HOPc(s, -back_max);
2383 }
2384 else {
52657f30 2385 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2386
2387 last1 = HOPc(s, -back_min);
52657f30 2388 s = t;
6eb5f6b9 2389 }
f2ed9b32 2390 if (utf8_target) {
6eb5f6b9 2391 while (s <= last1) {
24b23f37 2392 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2393 goto got_it;
2394 s += UTF8SKIP(s);
2395 }
2396 }
2397 else {
2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s++;
2402 }
2403 }
2404 }
ab3bbdeb 2405 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2406 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2407 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2409 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2410 ? "anchored" : "floating"),
ab3bbdeb
YO
2411 quoted, RE_SV_TAIL(must));
2412 });
6eb5f6b9
JH
2413 goto phooey;
2414 }
f8fc2ecf 2415 else if ( (c = progi->regstclass) ) {
f14c76ed 2416 if (minlen) {
f8fc2ecf 2417 const OPCODE op = OP(progi->regstclass);
66e933ab 2418 /* don't bother with what can't match */
786e8c11 2419 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2420 strend = HOPc(strend, -(minlen - 1));
2421 }
a3621e74 2422 DEBUG_EXECUTE_r({
be8e71aa 2423 SV * const prop = sv_newmortal();
32fc9b6a 2424 regprop(prog, prop, c);
0df25f3d 2425 {
f2ed9b32 2426 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2427 s,strend-s,60);
0df25f3d 2428 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2429 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2430 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2431 quoted, (int)(strend - s));
0df25f3d 2432 }
ffc61ed2 2433 });
3b0527fe 2434 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2435 goto got_it;
07be1b83 2436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2437 }
2438 else {
2439 dontbother = 0;
a0714e2c 2440 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2441 /* Trim the end. */
6af40bd7 2442 char *last= NULL;
33b8afdf 2443 SV* float_real;
c33e64f0
FC
2444 STRLEN len;
2445 const char *little;
33b8afdf 2446
f2ed9b32
KW
2447 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2450
c33e64f0
FC
2451 little = SvPV_const(float_real, len);
2452 if (SvTAIL(float_real)) {
1a13b075
YO
2453 /* This means that float_real contains an artificial \n on the end
2454 * due to the presence of something like this: /foo$/
2455 * where we can match both "foo" and "foo\n" at the end of the string.
2456 * So we have to compare the end of the string first against the float_real
2457 * without the \n and then against the full float_real with the string.
2458 * We have to watch out for cases where the string might be smaller
2459 * than the float_real or the float_real without the \n.
2460 */
2461 char *checkpos= strend - len;
2462 DEBUG_OPTIMISE_r(
2463 PerlIO_printf(Perl_debug_log,
2464 "%sChecking for float_real.%s\n",
2465 PL_colors[4], PL_colors[5]));
2466 if (checkpos + 1 < strbeg) {
2467 /* can't match, even if we remove the trailing \n string is too short to match */
2468 DEBUG_EXECUTE_r(
2469 PerlIO_printf(Perl_debug_log,
2470 "%sString shorter than required trailing substring, cannot match.%s\n",
2471 PL_colors[4], PL_colors[5]));
2472 goto phooey;
2473 } else if (memEQ(checkpos + 1, little, len - 1)) {
2474 /* can match, the end of the string matches without the "\n" */
2475 last = checkpos + 1;
2476 } else if (checkpos < strbeg) {
2477 /* cant match, string is too short when the "\n" is included */
2478 DEBUG_EXECUTE_r(
2479 PerlIO_printf(Perl_debug_log,
2480 "%sString does not contain required trailing substring, cannot match.%s\n",
2481 PL_colors[4], PL_colors[5]));
2482 goto phooey;
2483 } else if (!multiline) {
2484 /* non multiline match, so compare with the "\n" at the end of the string */
2485 if (memEQ(checkpos, little, len)) {
2486 last= checkpos;
2487 } else {
2488 DEBUG_EXECUTE_r(
2489 PerlIO_printf(Perl_debug_log,
2490 "%sString does not contain required trailing substring, cannot match.%s\n",
2491 PL_colors[4], PL_colors[5]));
2492 goto phooey;
2493 }
2494 } else {
2495 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2496 goto find_last;
1a13b075 2497 }
c33e64f0 2498 } else {
d6a28714 2499 find_last:
9041c2e3 2500 if (len)
d6a28714 2501 last = rninstr(s, strend, little, little + len);
b8c5462f 2502 else
a0288114 2503 last = strend; /* matching "$" */
b8c5462f 2504 }
6af40bd7
YO
2505 if (!last) {
2506 /* at one point this block contained a comment which was probably
2507 * incorrect, which said that this was a "should not happen" case.
2508 * Even if it was true when it was written I am pretty sure it is
2509 * not anymore, so I have removed the comment and replaced it with
2510 * this one. Yves */
6bda09f9
YO
2511 DEBUG_EXECUTE_r(
2512 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2513 "String does not contain required substring, cannot match.\n"
2514 ));
2515 goto phooey;
bf93d4cc 2516 }
d6a28714
JH
2517 dontbother = strend - last + prog->float_min_offset;
2518 }
2519 if (minlen && (dontbother < minlen))
2520 dontbother = minlen - 1;
2521 strend -= dontbother; /* this one's always in bytes! */
2522 /* We don't know much -- general case. */
f2ed9b32 2523 if (utf8_target) {
d6a28714 2524 for (;;) {
24b23f37 2525 if (regtry(&reginfo, &s))
d6a28714
JH
2526 goto got_it;
2527 if (s >= strend)
2528 break;
b8c5462f 2529 s += UTF8SKIP(s);
d6a28714
JH
2530 };
2531 }
2532 else {
2533 do {
24b23f37 2534 if (regtry(&reginfo, &s))
d6a28714
JH
2535 goto got_it;
2536 } while (s++ < strend);
2537 }
2538 }
2539
2540 /* Failure. */
2541 goto phooey;
2542
2543got_it:
495f47a5
DM
2544 DEBUG_BUFFERS_r(
2545 if (swap)
2546 PerlIO_printf(Perl_debug_log,
2547 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548 PTR2UV(prog),
2549 PTR2UV(swap)
2550 );
2551 );
e9105d30 2552 Safefree(swap);
288b8c02 2553 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2554
ed301438 2555 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2556 restore_pos(aTHX_ prog);
5daac39c
NC
2557 if (RXp_PAREN_NAMES(prog))
2558 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2559
2560 /* make sure $`, $&, $', and $digit will work later */
2561 if ( !(flags & REXEC_NOT_FIRST) ) {
288b8c02 2562 RX_MATCH_COPY_FREE(rx);
d6a28714 2563 if (flags & REXEC_COPY_STR) {
be8e71aa 2564 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 2565#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2566 if ((SvIsCOW(sv)
2567 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2568 if (DEBUG_C_TEST) {
2569 PerlIO_printf(Perl_debug_log,
2570 "Copy on write: regexp capture, type %d\n",
2571 (int) SvTYPE(sv));
2572 }
2573 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2574 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
2575 assert (SvPOKp(prog->saved_copy));
2576 } else
2577#endif
2578 {
288b8c02 2579 RX_MATCH_COPIED_on(rx);
ed252734
NC
2580 s = savepvn(strbeg, i);
2581 prog->subbeg = s;
2582 }
d6a28714 2583 prog->sublen = i;
d6a28714
JH
2584 }
2585 else {
2586 prog->subbeg = strbeg;
2587 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2588 }
2589 }
9041c2e3 2590
d6a28714
JH
2591 return 1;
2592
2593phooey:
a3621e74 2594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2595 PL_colors[4], PL_colors[5]));
ed301438 2596 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2597 restore_pos(aTHX_ prog);
e9105d30 2598 if (swap) {
c74340f9 2599 /* we failed :-( roll it back */
495f47a5
DM
2600 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2601 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2602 PTR2UV(prog),
2603 PTR2UV(prog->offs),
2604 PTR2UV(swap)
2605 ));
e9105d30
GG
2606 Safefree(prog->offs);
2607 prog->offs = swap;
2608 }
2609
d6a28714
JH
2610 return 0;
2611}
2612
6bda09f9 2613
ec43f78b
DM
2614/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2615 * Do inc before dec, in case old and new rex are the same */
2616#define SET_reg_curpm(Re2) \
2617 if (PL_reg_state.re_state_eval_setup_done) { \
2618 (void)ReREFCNT_inc(Re2); \
2619 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2620 PM_SETRE((PL_reg_curpm), (Re2)); \
2621 }
2622
2623
d6a28714
JH
2624/*
2625 - regtry - try match at specific point
2626 */
2627STATIC I32 /* 0 failure, 1 success */
24b23f37 2628S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2629{
97aff369 2630 dVAR;
d6a28714 2631 CHECKPOINT lastcp;
288b8c02
NC
2632 REGEXP *const rx = reginfo->prog;
2633 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2634 RXi_GET_DECL(prog,progi);
a3621e74 2635 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2636
2637 PERL_ARGS_ASSERT_REGTRY;
2638
24b23f37 2639 reginfo->cutpoint=NULL;
d6a28714 2640
ed301438
DM
2641 if ((prog->extflags & RXf_EVAL_SEEN)
2642 && !PL_reg_state.re_state_eval_setup_done)
2643 {
d6a28714
JH
2644 MAGIC *mg;
2645
ed301438 2646 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2647 if (reginfo->sv) {
d6a28714 2648 /* Make $_ available to executed code. */
3b0527fe 2649 if (reginfo->sv != DEFSV) {
59f00321 2650 SAVE_DEFSV;
414bf5ae 2651 DEFSV_set(reginfo->sv);
b8c5462f 2652 }
d6a28714 2653
3b0527fe
DM
2654 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2655 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2656 /* prepare for quick setting of pos */
d300d9fa 2657#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2658 if (SvIsCOW(reginfo->sv))
2659 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2660#endif
3dab1dad 2661 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2662 &PL_vtbl_mglob, NULL, 0);
d6a28714 2663 mg->mg_len = -1;
b8c5462f 2664 }
d6a28714
JH
2665 PL_reg_magic = mg;
2666 PL_reg_oldpos = mg->mg_len;
4f639d21 2667 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2668 }
09687e5a 2669 if (!PL_reg_curpm) {
a02a5408 2670 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2671#ifdef USE_ITHREADS
2672 {
14a49a24 2673 SV* const repointer = &PL_sv_undef;
92313705
NC
2674 /* this regexp is also owned by the new PL_reg_curpm, which
2675 will try to free it. */
d2ece331 2676 av_push(PL_regex_padav, repointer);
09687e5a
AB
2677 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2678 PL_regex_pad = AvARRAY(PL_regex_padav);
2679 }
2680#endif
2681 }
ec43f78b 2682 SET_reg_curpm(rx);
d6a28714
JH
2683 PL_reg_oldcurpm = PL_curpm;
2684 PL_curpm = PL_reg_curpm;
07bc277f 2685 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2686 /* Here is a serious problem: we cannot rewrite subbeg,
2687 since it may be needed if this match fails. Thus
2688 $` inside (?{}) could fail... */
2689 PL_reg_oldsaved = prog->subbeg;
2690 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2691#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2692 PL_nrs = prog->saved_copy;
2693#endif
07bc277f 2694 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2695 }
2696 else
bd61b366 2697 PL_reg_oldsaved = NULL;
d6a28714
JH
2698 prog->subbeg = PL_bostr;
2699 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2700 }
97ca13b7
DM
2701#ifdef DEBUGGING
2702 PL_reg_starttry = *startpos;
2703#endif
f0ab9afb 2704 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2705 PL_reginput = *startpos;
d6a28714 2706 prog->lastparen = 0;
03994de8 2707 prog->lastcloseparen = 0;
d6a28714 2708 PL_regsize = 0;
d6a28714
JH
2709
2710 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2711 to do this again and again, prog->lastparen should take care of
3dd2943c 2712 this! --ilya*/
dafc8851
JH
2713
2714 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2715 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2716 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2717 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2718 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2719 * Meanwhile, this code *is* needed for the
daf18116
JH
2720 * above-mentioned test suite tests to succeed. The common theme
2721 * on those tests seems to be returning null fields from matches.
225593e1 2722 * --jhi updated by dapm */
dafc8851 2723#if 1
d6a28714 2724 if (prog->nparens) {
b93070ed 2725 regexp_paren_pair *pp = prog->offs;
eb578fdb 2726 I32 i;
b93070ed 2727 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2728 ++pp;
2729 pp->start = -1;
2730 pp->end = -1;
d6a28714
JH
2731 }
2732 }
dafc8851 2733#endif
02db2b7b 2734 REGCP_SET(lastcp);
f8fc2ecf 2735 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2736 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2737 return 1;
2738 }
24b23f37
YO
2739 if (reginfo->cutpoint)
2740 *startpos= reginfo->cutpoint;
02db2b7b 2741 REGCP_UNWIND(lastcp);
d6a28714
JH
2742 return 0;
2743}
2744
02db2b7b 2745
8ba1375e
MJD
2746#define sayYES goto yes
2747#define sayNO goto no
262b90c4 2748#define sayNO_SILENT goto no_silent
8ba1375e 2749
f9f4320a
YO
2750/* we dont use STMT_START/END here because it leads to
2751 "unreachable code" warnings, which are bogus, but distracting. */
2752#define CACHEsayNO \
c476f425
DM
2753 if (ST.cache_mask) \
2754 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2755 sayNO
3298f257 2756
a3621e74 2757/* this is used to determine how far from the left messages like
265c4333
YO
2758 'failed...' are printed. It should be set such that messages
2759 are inline with the regop output that created them.
a3621e74 2760*/
265c4333 2761#define REPORT_CODE_OFF 32
a3621e74
YO
2762
2763
40a82448
DM
2764#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2765#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2766
86545054
DM
2767#define SLAB_FIRST(s) (&(s)->states[0])
2768#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2769
5d9a96ca
DM
2770/* grab a new slab and return the first slot in it */
2771
2772STATIC regmatch_state *
2773S_push_slab(pTHX)
2774{
a35a87e7 2775#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2776 dMY_CXT;
2777#endif
5d9a96ca
DM
2778 regmatch_slab *s = PL_regmatch_slab->next;
2779 if (!s) {
2780 Newx(s, 1, regmatch_slab);
2781 s->prev = PL_regmatch_slab;
2782 s->next = NULL;
2783 PL_regmatch_slab->next = s;
2784 }
2785 PL_regmatch_slab = s;
86545054 2786 return SLAB_FIRST(s);
5d9a96ca 2787}
5b47454d 2788
95b24440 2789
40a82448
DM
2790/* push a new state then goto it */
2791
2792#define PUSH_STATE_GOTO(state, node) \
2793 scan = node; \
2794 st->resume_state = state; \
2795 goto push_state;
2796
2797/* push a new state with success backtracking, then goto it */
2798
2799#define PUSH_YES_STATE_GOTO(state, node) \
2800 scan = node; \
2801 st->resume_state = state; \
2802 goto push_yes_state;
2803
aa283a38 2804
aa283a38 2805
d6a28714 2806/*
95b24440 2807
bf1f174e
DM
2808regmatch() - main matching routine
2809
2810This is basically one big switch statement in a loop. We execute an op,
2811set 'next' to point the next op, and continue. If we come to a point which
2812we may need to backtrack to on failure such as (A|B|C), we push a
2813backtrack state onto the backtrack stack. On failure, we pop the top
2814state, and re-enter the loop at the state indicated. If there are no more
2815states to pop, we return failure.
2816
2817Sometimes we also need to backtrack on success; for example /A+/, where
2818after successfully matching one A, we need to go back and try to
2819match another one; similarly for lookahead assertions: if the assertion
2820completes successfully, we backtrack to the state just before the assertion
2821and then carry on. In these cases, the pushed state is marked as
2822'backtrack on success too'. This marking is in fact done by a chain of
2823pointers, each pointing to the previous 'yes' state. On success, we pop to
2824the nearest yes state, discarding any intermediate failure-only states.
2825Sometimes a yes state is pushed just to force some cleanup code to be
2826called at the end of a successful match or submatch; e.g. (??{$re}) uses
2827it to free the inner regex.
2828
2829Note that failure backtracking rewinds the cursor position, while
2830success backtracking leaves it alone.
2831
2832A pattern is complete when the END op is executed, while a subpattern
2833such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2834ops trigger the "pop to last yes state if any, otherwise return true"
2835behaviour.
2836
2837A common convention in this function is to use A and B to refer to the two
2838subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2839the subpattern to be matched possibly multiple times, while B is the entire
2840rest of the pattern. Variable and state names reflect this convention.
2841
2842The states in the main switch are the union of ops and failure/success of
2843substates associated with with that op. For example, IFMATCH is the op
2844that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2845'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2846successfully matched A and IFMATCH_A_fail is a state saying that we have
2847just failed to match A. Resume states always come in pairs. The backtrack
2848state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2849at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2850on success or failure.
2851
2852The struct that holds a backtracking state is actually a big union, with
2853one variant for each major type of op. The variable st points to the
2854top-most backtrack struct. To make the code clearer, within each
2855block of code we #define ST to alias the relevant union.
2856
2857Here's a concrete example of a (vastly oversimplified) IFMATCH
2858implementation:
2859
2860 switch (state) {
2861 ....
2862
2863#define ST st->u.ifmatch
2864
2865 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2866 ST.foo = ...; // some state we wish to save
95b24440 2867 ...
bf1f174e
DM
2868 // push a yes backtrack state with a resume value of
2869 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2870 // first node of A:
2871 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2872 // NOTREACHED
2873
2874 case IFMATCH_A: // we have successfully executed A; now continue with B
2875 next = B;
2876 bar = ST.foo; // do something with the preserved value
2877 break;
2878
2879 case IFMATCH_A_fail: // A failed, so the assertion failed
2880 ...; // do some housekeeping, then ...
2881 sayNO; // propagate the failure
2882
2883#undef ST
95b24440 2884
bf1f174e
DM
2885 ...
2886 }
95b24440 2887
bf1f174e
DM
2888For any old-timers reading this who are familiar with the old recursive
2889approach, the code above is equivalent to:
95b24440 2890
bf1f174e
DM
2891 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2892 {
2893 int foo = ...
95b24440 2894 ...
bf1f174e
DM
2895 if (regmatch(A)) {
2896 next = B;
2897 bar = foo;
2898 break;
95b24440 2899 }
bf1f174e
DM
2900 ...; // do some housekeeping, then ...
2901 sayNO; // propagate the failure
95b24440 2902 }
bf1f174e
DM
2903
2904The topmost backtrack state, pointed to by st, is usually free. If you
2905want to claim it, populate any ST.foo fields in it with values you wish to
2906save, then do one of
2907
2908 PUSH_STATE_GOTO(resume_state, node);
2909 PUSH_YES_STATE_GOTO(resume_state, node);
2910
2911which sets that backtrack state's resume value to 'resume_state', pushes a
2912new free entry to the top of the backtrack stack, then goes to 'node'.
2913On backtracking, the free slot is popped, and the saved state becomes the
2914new free state. An ST.foo field in this new top state can be temporarily
2915accessed to retrieve values, but once the main loop is re-entered, it
2916becomes available for reuse.
2917
2918Note that the depth of the backtrack stack constantly increases during the
2919left-to-right execution of the pattern, rather than going up and down with
2920the pattern nesting. For example the stack is at its maximum at Z at the
2921end of the pattern, rather than at X in the following:
2922
2923 /(((X)+)+)+....(Y)+....Z/
2924
2925The only exceptions to this are lookahead/behind assertions and the cut,
2926(?>A), which pop all the backtrack states associated with A before
2927continuing.
2928
486ec47a 2929Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
2930PL_regmatch_state and st always point to the currently active state,
2931and PL_regmatch_slab points to the slab currently containing
2932PL_regmatch_state. The first time regmatch() is called, the first slab is
2933allocated, and is never freed until interpreter destruction. When the slab
2934is full, a new one is allocated and chained to the end. At exit from
2935regmatch(), slabs allocated since entry are freed.
2936
2937*/
95b24440 2938
40a82448 2939
5bc10b2c 2940#define DEBUG_STATE_pp(pp) \
265c4333 2941 DEBUG_STATE_r({ \
f2ed9b32 2942 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 2943 PerlIO_printf(Perl_debug_log, \
5d458dd8 2944 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 2945 depth*2, "", \
13d6edb4 2946 PL_reg_name[st->resume_state], \
5d458dd8
YO
2947 ((st==yes_state||st==mark_state) ? "[" : ""), \
2948 ((st==yes_state) ? "Y" : ""), \
2949 ((st==mark_state) ? "M" : ""), \
2950 ((st==yes_state||st==mark_state) ? "]" : "") \
2951 ); \
265c4333 2952 });
5bc10b2c 2953
40a82448 2954
3dab1dad 2955#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2956
3df15adc 2957#ifdef DEBUGGING
5bc10b2c 2958
ab3bbdeb 2959STATIC void
f2ed9b32 2960S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
2961 const char *start, const char *end, const char *blurb)
2962{
efd26800 2963 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
2964
2965 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2966
ab3bbdeb
YO
2967 if (!PL_colorset)
2968 reginitcolors();
2969 {
2970 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 2971 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 2972
f2ed9b32 2973 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
2974 start, end - start, 60);
2975
2976 PerlIO_printf(Perl_debug_log,
2977 "%s%s REx%s %s against %s\n",
2978 PL_colors[4], blurb, PL_colors[5], s0, s1);
2979
f2ed9b32 2980 if (utf8_target||utf8_pat)
1de06328
YO
2981 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2982 utf8_pat ? "pattern" : "",
f2ed9b32
KW
2983 utf8_pat && utf8_target ? " and " : "",
2984 utf8_target ? "string" : ""
ab3bbdeb
YO
2985 );
2986 }
2987}
3df15adc
YO
2988
2989STATIC void
786e8c11
YO
2990S_dump_exec_pos(pTHX_ const char *locinput,
2991 const regnode *scan,
2992 const char *loc_regeol,
2993 const char *loc_bostr,
2994 const char *loc_reg_starttry,
f2ed9b32 2995 const bool utf8_target)
07be1b83 2996{
786e8c11 2997 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 2998 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 2999 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3000 /* The part of the string before starttry has one color
3001 (pref0_len chars), between starttry and current
3002 position another one (pref_len - pref0_len chars),
3003 after the current position the third one.
3004 We assume that pref0_len <= pref_len, otherwise we
3005 decrease pref0_len. */
786e8c11
YO
3006 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3007 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3008 int pref0_len;
3009
7918f24d
NC
3010 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3011
f2ed9b32 3012 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3013 pref_len++;
786e8c11
YO
3014 pref0_len = pref_len - (locinput - loc_reg_starttry);
3015 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3016 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3017 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3018 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3019 l--;
3020 if (pref0_len < 0)
3021 pref0_len = 0;
3022 if (pref0_len > pref_len)
3023 pref0_len = pref_len;
3024 {
f2ed9b32 3025 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3026
ab3bbdeb 3027 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3028 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3029
ab3bbdeb 3030 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3031 (locinput - pref_len + pref0_len),
1de06328 3032 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3033
ab3bbdeb 3034 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3035 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3036
1de06328 3037 const STRLEN tlen=len0+len1+len2;
3df15adc 3038 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3039 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3040 (IV)(locinput - loc_bostr),
07be1b83 3041 len0, s0,
07be1b83 3042 len1, s1,
07be1b83 3043 (docolor ? "" : "> <"),
07be1b83 3044 len2, s2,
f9f4320a 3045 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3046 "");
3047 }
3048}
3df15adc 3049
07be1b83
YO
3050#endif
3051
0a4db386
YO
3052/* reg_check_named_buff_matched()
3053 * Checks to see if a named buffer has matched. The data array of
3054 * buffer numbers corresponding to the buffer is expected to reside
3055 * in the regexp->data->data array in the slot stored in the ARG() of
3056 * node involved. Note that this routine doesn't actually care about the
3057 * name, that information is not preserved from compilation to execution.
3058 * Returns the index of the leftmost defined buffer with the given name
3059 * or 0 if non of the buffers matched.
3060 */
3061STATIC I32
7918f24d
NC
3062S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3063{
0a4db386 3064 I32 n;
f8fc2ecf 3065 RXi_GET_DECL(rex,rexi);
ad64d0ec 3066 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3067 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3068
3069 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3070
0a4db386 3071 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3072 if ((I32)rex->lastparen >= nums[n] &&
3073 rex->offs[nums[n]].end != -1)
0a4db386
YO
3074 {
3075 return nums[n];
3076 }
3077 }
3078 return 0;
3079}
3080
2f554ef7
DM
3081
3082/* free all slabs above current one - called during LEAVE_SCOPE */
3083
3084STATIC void
3085S_clear_backtrack_stack(pTHX_ void *p)
3086{
3087 regmatch_slab *s = PL_regmatch_slab->next;
3088 PERL_UNUSED_ARG(p);
3089
3090 if (!s)
3091 return;
3092 PL_regmatch_slab->next = NULL;
3093 while (s) {
3094 regmatch_slab * const osl = s;
3095 s = s->next;
3096 Safefree(osl);
3097 }
3098}
3099
3100
d6a28714 3101STATIC I32 /* 0 failure, 1 success */
24b23f37 3102S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3103{
a35a87e7 3104#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3105 dMY_CXT;
3106#endif
27da23d5 3107 dVAR;
eb578fdb 3108 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3109 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3110 REGEXP *rex_sv = reginfo->prog;
3111 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3112 RXi_GET_DECL(rex,rexi);
2f554ef7 3113 I32 oldsave;
5d9a96ca 3114 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3115 regmatch_state *st;
5d9a96ca 3116 /* cache heavy used fields of st in registers */
eb578fdb
KW
3117 regnode *scan;
3118 regnode *next;
3119 U32 n = 0; /* general value; init to avoid compiler warning */
3120 I32 ln = 0; /* len or last; init to avoid compiler warning */
3121 char *locinput = PL_reginput;
3122 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3123
b69b0499 3124 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3125 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3126 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3127 const U32 max_nochange_depth =
3128 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3129 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3130 regmatch_state *yes_state = NULL; /* state to pop to on success of
3131 subpattern */
e2e6a0f1
YO
3132 /* mark_state piggy backs on the yes_state logic so that when we unwind
3133 the stack on success we can update the mark_state as we go */
3134 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3135 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3136 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3137 U32 state_num;
5d458dd8
YO
3138 bool no_final = 0; /* prevent failure from backtracking? */
3139 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3140 char *startpoint = PL_reginput;
5d458dd8
YO
3141 SV *popmark = NULL; /* are we looking for a mark? */
3142 SV *sv_commit = NULL; /* last mark name seen in failure */
3143 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3144 during a successful match */
5d458dd8
YO
3145 U32 lastopen = 0; /* last open we saw */
3146 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3147 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3148 /* these three flags are set by various ops to signal information to
3149 * the very next op. They have a useful lifetime of exactly one loop
3150 * iteration, and are not preserved or restored by state pushes/pops
3151 */
3152 bool sw = 0; /* the condition value in (?(cond)a|b) */
3153 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3154 int logical = 0; /* the following EVAL is:
3155 0: (?{...})
3156 1: (?(?{...})X|Y)
3157 2: (??{...})
3158 or the following IFMATCH/UNLESSM is:
3159 false: plain (?=foo)
3160 true: used as a condition: (?(?=foo))
3161 */
81ed78b2
DM
3162 PAD* last_pad = NULL;
3163 dMULTICALL;
3164 I32 gimme = G_SCALAR;
3165 CV *caller_cv = NULL; /* who called us */
3166 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3167 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3168
95b24440 3169#ifdef DEBUGGING
e68ec53f 3170 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3171#endif
3172
81ed78b2
DM
3173 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3174 multicall_oldcatch = 0;
3175 multicall_cv = NULL;
3176 cx = NULL;
4f8dbb2d
JL
3177 PERL_UNUSED_VAR(multicall_cop);
3178 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3179
3180
7918f24d
NC
3181 PERL_ARGS_ASSERT_REGMATCH;
3182
3b57cd43 3183 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3184 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3185 }));
5d9a96ca
DM
3186 /* on first ever call to regmatch, allocate first slab */
3187 if (!PL_regmatch_slab) {
3188 Newx(PL_regmatch_slab, 1, regmatch_slab);
3189 PL_regmatch_slab->prev = NULL;
3190 PL_regmatch_slab->next = NULL;
86545054 3191 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3192 }
3193
2f554ef7
DM
3194 oldsave = PL_savestack_ix;
3195 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3196 SAVEVPTR(PL_regmatch_slab);
3197 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3198
3199 /* grab next free state slot */
3200 st = ++PL_regmatch_state;
86545054 3201 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3202 st = PL_regmatch_state = S_push_slab(aTHX);
3203
d6a28714
JH
3204 /* Note that nextchr is a byte even in UTF */
3205 nextchr = UCHARAT(locinput);
3206 scan = prog;
3207 while (scan != NULL) {
8ba1375e 3208
a3621e74 3209 DEBUG_EXECUTE_r( {
6136c704 3210 SV * const prop = sv_newmortal();
1de06328 3211 regnode *rnext=regnext(scan);
f2ed9b32 3212 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3213 regprop(rex, prop, scan);
07be1b83
YO
3214
3215 PerlIO_printf(Perl_debug_log,
3216 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3217 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3218 SvPVX_const(prop),
1de06328 3219 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3220 0 : (IV)(rnext - rexi->program));
2a782b5b 3221 });
d6a28714
JH
3222
3223 next = scan + NEXT_OFF(scan);
3224 if (next == scan)
3225 next = NULL;
40a82448 3226 state_num = OP(scan);
d6a28714 3227
40a82448 3228 reenter_switch:
34a81e2b 3229
40a82448 3230 switch (state_num) {
d6a28714 3231 case BOL:
7fba1cd6 3232 if (locinput == PL_bostr)
d6a28714 3233 {
3b0527fe 3234 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3235 break;
3236 }
d6a28714
JH
3237 sayNO;
3238 case MBOL:
12d33761
HS
3239 if (locinput == PL_bostr ||
3240 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3241 {
b8c5462f
JH
3242 break;
3243 }
d6a28714
JH
3244 sayNO;
3245 case SBOL:
c2a73568 3246 if (locinput == PL_bostr)
b8c5462f 3247 break;
d6a28714
JH
3248 sayNO;
3249 case GPOS:
3b0527fe 3250 if (locinput == reginfo->ganch)
d6a28714
JH
3251 break;
3252 sayNO;
ee9b8eae
YO
3253
3254 case KEEPS:
3255 /* update the startpoint */
b93070ed 3256 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3257 PL_reginput = locinput;
b93070ed 3258 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3259 PUSH_STATE_GOTO(KEEPS_next, next);
3260 /*NOT-REACHED*/
3261 case KEEPS_next_fail:
3262 /* rollback the start point change */
b93070ed 3263 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3264 sayNO_SILENT;
3265 /*NOT-REACHED*/
d6a28714 3266 case EOL:
d6a28714
JH
3267 goto seol;
3268 case MEOL:
d6a28714 3269 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3270 sayNO;
b8c5462f 3271 break;
d6a28714
JH
3272 case SEOL:
3273 seol:
3274 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3275 sayNO;
d6a28714 3276 if (PL_regeol - locinput > 1)
b8c5462f 3277 sayNO;
b8c5462f 3278 break;
d6a28714
JH
3279 case EOS:
3280 if (PL_regeol != locinput)
b8c5462f 3281 sayNO;
d6a28714 3282 break;
ffc61ed2 3283 case SANY:
d6a28714 3284 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3285 sayNO;
f2ed9b32 3286 if (utf8_target) {
f33976b4
DB
3287 locinput += PL_utf8skip[nextchr];
3288 if (locinput > PL_regeol)
3289 sayNO;
3290 nextchr = UCHARAT(locinput);
3291 }
3292 else
3293 nextchr = UCHARAT(++locinput);
3294 break;
3295 case CANY:
3296 if (!nextchr && locinput >= PL_regeol)
3297 sayNO;
b8c5462f 3298 nextchr = UCHARAT(++locinput);
a0d0e21e 3299 break;
ffc61ed2 3300 case REG_ANY:
1aa99e6b
IH
3301 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3302 sayNO;
f2ed9b32 3303 if (utf8_target) {
b8c5462f 3304 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3305 if (locinput > PL_regeol)
3306 sayNO;
a0ed51b3 3307 nextchr = UCHARAT(locinput);
a0ed51b3 3308 }
1aa99e6b
IH
3309 else
3310 nextchr = UCHARAT(++locinput);
a0ed51b3 3311 break;
166ba7cd
DM
3312
3313#undef ST
3314#define ST st->u.trie
786e8c11
YO
3315 case TRIEC:
3316 /* In this case the charclass data is available inline so
3317 we can fail fast without a lot of extra overhead.
3318 */
fab2782b
YO
3319 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3320 DEBUG_EXECUTE_r(
3321 PerlIO_printf(Perl_debug_log,
3322 "%*s %sfailed to match trie start class...%s\n",
3323 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3324 );
3325 sayNO_SILENT;
118e2215 3326 assert(0); /* NOTREACHED */
786e8c11
YO
3327 }
3328 /* FALL THROUGH */
5b47454d 3329 case TRIE:
2e64971a
DM
3330 /* the basic plan of execution of the trie is:
3331 * At the beginning, run though all the states, and
3332 * find the longest-matching word. Also remember the position
3333 * of the shortest matching word. For example, this pattern:
3334 * 1 2 3 4 5
3335 * ab|a|x|abcd|abc
3336 * when matched against the string "abcde", will generate
3337 * accept states for all words except 3, with the longest
3338 * matching word being 4, and the shortest being 1 (with
3339 * the position being after char 1 of the string).
3340 *
3341 * Then for each matching word, in word order (i.e. 1,2,4,5),
3342 * we run the remainder of the pattern; on each try setting
3343 * the current position to the character following the word,
3344 * returning to try the next word on failure.
3345 *
3346 * We avoid having to build a list of words at runtime by
3347 * using a compile-time structure, wordinfo[].prev, which
3348 * gives, for each word, the previous accepting word (if any).
3349 * In the case above it would contain the mappings 1->2, 2->0,
3350 * 3->0, 4->5, 5->1. We can use this table to generate, from
3351 * the longest word (4 above), a list of all words, by
3352 * following the list of prev pointers; this gives us the
3353 * unordered list 4,5,1,2. Then given the current word we have
3354 * just tried, we can go through the list and find the
3355 * next-biggest word to try (so if we just failed on word 2,
3356 * the next in the list is 4).
3357 *
3358 * Since at runtime we don't record the matching position in
3359 * the string for each word, we have to work that out for
3360 * each word we're about to process. The wordinfo table holds
3361 * the character length of each word; given that we recorded
3362 * at the start: the position of the shortest word and its
3363 * length in chars, we just need to move the pointer the
3364 * difference between the two char lengths. Depending on
3365 * Unicode status and folding, that's cheap or expensive.
3366 *
3367 * This algorithm is optimised for the case where are only a
3368 * small number of accept states, i.e. 0,1, or maybe 2.
3369 * With lots of accepts states, and having to try all of them,
3370 * it becomes quadratic on number of accept states to find all
3371 * the next words.
3372 */
3373
3dab1dad 3374 {
07be1b83 3375 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3376 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3377
3378 /* what trie are we using right now */
be8e71aa 3379 reg_trie_data * const trie
f8fc2ecf 3380 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3381 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3382 U32 state = trie->startstate;
166ba7cd 3383
fab2782b 3384 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3385 if (trie->states[ state ].wordnum) {
3386 DEBUG_EXECUTE_r(
3387 PerlIO_printf(Perl_debug_log,
3388 "%*s %smatched empty string...%s\n",
5bc10b2c 3389 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3390 );
20dbff7c
YO
3391 if (!trie->jump)
3392 break;
3dab1dad
YO
3393 } else {
3394 DEBUG_EXECUTE_r(
3395 PerlIO_printf(Perl_debug_log,
786e8c11 3396 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3397 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3398 );
3399 sayNO_SILENT;
3400 }
3401 }
166ba7cd 3402
786e8c11
YO
3403 {
3404 U8 *uc = ( U8* )locinput;
3405
3406 STRLEN len = 0;
3407 STRLEN foldlen = 0;
3408 U8 *uscan = (U8*)NULL;
786e8c11 3409 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3410 U32 charcount = 0; /* how many input chars we have matched */
3411 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3412
786e8c11 3413 ST.jump = trie->jump;
786e8c11 3414 ST.me = scan;
2e64971a
DM
3415 ST.firstpos = NULL;
3416 ST.longfold = FALSE; /* char longer if folded => it's harder */
3417 ST.nextword = 0;
3418
3419 /* fully traverse the TRIE; note the position of the
3420 shortest accept state and the wordnum of the longest
3421 accept state */
07be1b83 3422
a3621e74 3423 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3424 U32 base = trie->states[ state ].trans.base;
f9f4320a 3425 UV uvc = 0;
acb909b4 3426 U16 charid = 0;
2e64971a
DM
3427 U16 wordnum;
3428 wordnum = trie->states[ state ].wordnum;
3429
3430 if (wordnum) { /* it's an accept state */
3431 if (!accepted) {
3432 accepted = 1;
3433 /* record first match position */
3434 if (ST.longfold) {
3435 ST.firstpos = (U8*)locinput;
3436 ST.firstchars = 0;
5b47454d 3437 }
2e64971a
DM
3438 else {
3439 ST.firstpos = uc;
3440 ST.firstchars = charcount;
3441 }
3442 }
3443 if (!ST.nextword || wordnum < ST.nextword)
3444 ST.nextword = wordnum;
3445 ST.topword = wordnum;
786e8c11 3446 }
a3621e74 3447
07be1b83 3448 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3449 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3450 PerlIO_printf( Perl_debug_log,
2e64971a 3451 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3452 2+depth * 2, "", PL_colors[4],
2e64971a 3453 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3454 });
a3621e74 3455
2e64971a 3456 /* read a char and goto next state */
a3621e74 3457 if ( base ) {
6dd2be57 3458 I32 offset;
55eed653
NC
3459 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3460 uscan, len, uvc, charid, foldlen,
3461 foldbuf, uniflags);
2e64971a
DM
3462 charcount++;
3463 if (foldlen>0)
3464 ST.longfold = TRUE;
5b47454d 3465 if (charid &&
6dd2be57
DM
3466 ( ((offset =
3467 base + charid - 1 - trie->uniquecharcount)) >= 0)
3468
3469 && ((U32)offset < trie->lasttrans)
3470 && trie->trans[offset].check == state)
5b47454d 3471 {
6dd2be57 3472 state = trie->trans[offset].next;
5b47454d
DM
3473 }
3474 else {
3475 state = 0;
3476 }
3477 uc += len;
3478
3479 }
3480 else {
a3621e74
YO
3481 state = 0;
3482 }
3483 DEBUG_TRIE_EXECUTE_r(
e4584336 3484 PerlIO_printf( Perl_debug_log,
786e8c11 3485 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3486 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3487 );
3488 }
2e64971a 3489 if (!accepted)
a3621e74 3490 sayNO;
a3621e74 3491
2e64971a
DM
3492 /* calculate total number of accept states */
3493 {
3494 U16 w = ST.topword;
3495 accepted = 0;
3496 while (w) {
3497 w = trie->wordinfo[w].prev;
3498 accepted++;
3499 }
3500 ST.accepted = accepted;
3501 }
3502
166ba7cd
DM
3503 DEBUG_EXECUTE_r(
3504 PerlIO_printf( Perl_debug_log,
3505 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3506 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3507 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3508 );
2e64971a 3509 goto trie_first_try; /* jump into the fail handler */
786e8c11 3510 }}
118e2215 3511 assert(0); /* NOTREACHED */
2e64971a
DM
3512
3513 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3514 if ( ST.jump) {
3515 REGCP_UNWIND(ST.cp);
a8d1f4b4 3516 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3517 }
2e64971a
DM
3518 if (!--ST.accepted) {
3519 DEBUG_EXECUTE_r({
3520 PerlIO_printf( Perl_debug_log,
3521 "%*s %sTRIE failed...%s\n",
3522 REPORT_CODE_OFF+depth*2, "",
3523 PL_colors[4],
3524 PL_colors[5] );
3525 });
3526 sayNO_SILENT;
3527 }
3528 {
3529 /* Find next-highest word to process. Note that this code
3530 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3531 U16 min = 0;
3532 U16 word;
3533 U16 const nextword = ST.nextword;
3534 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3535 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3536 for (word=ST.topword; word; word=wordinfo[word].prev) {
3537 if (word > nextword && (!min || word < min))
3538 min = word;
3539 }
3540 ST.nextword = min;
3541 }
3542
fae667d5 3543 trie_first_try:
5d458dd8
YO
3544 if (do_cutgroup) {
3545 do_cutgroup = 0;
3546 no_final = 0;
3547 }
fae667d5
YO
3548
3549 if ( ST.jump) {
b93070ed 3550 ST.lastparen = rex->lastparen;
f6033a9d 3551 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3552 REGCP_SET(ST.cp);
2e64971a 3553 }
a3621e74 3554
2e64971a 3555 /* find start char of end of current word */
166ba7cd 3556 {
2e64971a
DM
3557 U32 chars; /* how many chars to skip */
3558 U8 *uc = ST.firstpos;
3559 reg_trie_data * const trie
3560 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3561
3562 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3563 >= ST.firstchars);
3564 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3565 - ST.firstchars;
3566
3567 if (ST.longfold) {
3568 /* the hard option - fold each char in turn and find
3569 * its folded length (which may be different */
3570 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3571 STRLEN foldlen;
3572 STRLEN len;
d9a396a3 3573 UV uvc;
2e64971a
DM
3574 U8 *uscan;
3575
3576 while (chars) {
f2ed9b32 3577 if (utf8_target) {
2e64971a
DM
3578 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3579 uniflags);
3580 uc += len;
3581 }
3582 else {
3583 uvc = *uc;
3584 uc++;
3585 }
3586 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3587 uscan = foldbuf;
3588 while (foldlen) {
3589 if (!--chars)
3590 break;
3591 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3592 uniflags);
3593 uscan += len;
3594 foldlen -= len;
3595 }
3596 }
a3621e74 3597 }
2e64971a 3598 else {
f2ed9b32 3599 if (utf8_target)
2e64971a
DM
3600 while (chars--)
3601 uc += UTF8SKIP(uc);
3602 else
3603 uc += chars;
3604 }
3605 PL_reginput = (char *)uc;
3606 }
166ba7cd 3607
6603fe3e
DM
3608 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3609 ? ST.jump[ST.nextword]
3610 : NEXT_OFF(ST.me));
166ba7cd 3611
2e64971a
DM
3612 DEBUG_EXECUTE_r({
3613 PerlIO_printf( Perl_debug_log,
3614 "%*s %sTRIE matched word #%d, continuing%s\n",
3615 REPORT_CODE_OFF+depth*2, "",
3616 PL_colors[4],
3617 ST.nextword,
3618 PL_colors[5]
3619 );
3620 });
3621
3622 if (ST.accepted > 1 || has_cutgroup) {
3623 PUSH_STATE_GOTO(TRIE_next, scan);
118e2215 3624 assert(0); /* NOTREACHED */
166ba7cd 3625 }
2e64971a
DM
3626 /* only one choice left - just continue */
3627 DEBUG_EXECUTE_r({
3628 AV *const trie_words
3629 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3630 SV ** const tmp = av_fetch( trie_words,
3631 ST.nextword-1, 0 );
3632 SV *sv= tmp ? sv_newmortal() : NULL;
3633
3634 PerlIO_printf( Perl_debug_log,
3635 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3636 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3637 ST.nextword,
3638 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3639 PL_colors[0], PL_colors[1],
c89df6cf 3640 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3641 )
3642 : "not compiled under -Dr",
3643 PL_colors[5] );
3644 });
3645
3646 locinput = PL_reginput;
3647 nextchr = UCHARAT(locinput);
3648 continue; /* execute rest of RE */
118e2215 3649 assert(0); /* NOTREACHED */
166ba7cd
DM
3650#undef ST
3651
95b24440
DM
3652 case EXACT: {
3653 char *s = STRING(scan);
24d3c4a9 3654 ln = STR_LEN(scan);
f2ed9b32 3655 if (utf8_target != UTF_PATTERN) {
bc517b45 3656 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3657 char *l = locinput;
24d3c4a9 3658 const char * const e = s + ln;
a72c7584 3659
f2ed9b32 3660 if (utf8_target) {
5ff6fc6d 3661 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3662 while (s < e) {
a3b680e6 3663 STRLEN ulen;
1aa99e6b 3664 if (l >= PL_regeol)
5ff6fc6d
JH
3665 sayNO;
3666 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3667 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3668 uniflags))
5ff6fc6d 3669 sayNO;
bc517b45 3670 l += ulen;
5ff6fc6d 3671 s ++;
1aa99e6b 3672 }
5ff6fc6d
JH
3673 }
3674 else {
3675 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3676 while (s < e) {
a3b680e6 3677 STRLEN ulen;
1aa99e6b
IH
3678 if (l >= PL_regeol)
3679 sayNO;
5ff6fc6d 3680 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3681 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3682 uniflags))
1aa99e6b 3683 sayNO;
bc517b45 3684 s += ulen;
a72c7584 3685 l ++;
1aa99e6b 3686 }
5ff6fc6d 3687 }
1aa99e6b
IH
3688 locinput = l;
3689 nextchr = UCHARAT(locinput);
3690 break;
3691 }
bc517b45 3692 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3693 /* Inline the first character, for speed. */
3694 if (UCHARAT(s) != nextchr)
3695 sayNO;
24d3c4a9 3696 if (PL_regeol - locinput < ln)
d6a28714 3697 sayNO;
24d3c4a9 3698 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3699 sayNO;
24d3c4a9 3700 locinput += ln;
d6a28714
JH
3701 nextchr = UCHARAT(locinput);
3702 break;
95b24440 3703 }
9a5a5549 3704 case EXACTFL: {
a932d541 3705 re_fold_t folder;
9a5a5549
KW
3706 const U8 * fold_array;
3707 const char * s;
d513472c 3708 U32 fold_utf8_flags;
9a5a5549 3709
b8c5462f 3710 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3711 folder = foldEQ_locale;
3712 fold_array = PL_fold_locale;
17580e7a 3713 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3714 goto do_exactf;
3715
3c760661 3716 case EXACTFU_SS:
fab2782b 3717 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
3718 case EXACTFU:
3719 folder = foldEQ_latin1;
3720 fold_array = PL_fold_latin1;
2daa8fee 3721 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3722 goto do_exactf;
3723
2f7f8cb1
KW
3724 case EXACTFA:
3725 folder = foldEQ_latin1;
3726 fold_array = PL_fold_latin1;
57014d77 3727 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3728 goto do_exactf;
3729
9a5a5549
KW
3730 case EXACTF:
3731 folder = foldEQ;
3732 fold_array = PL_fold;
62bf7766 3733 fold_utf8_flags = 0;
9a5a5549
KW
3734
3735 do_exactf:
3736 s = STRING(scan);
24d3c4a9 3737 ln = STR_LEN(scan);
d6a28714 3738
3c760661
KW
3739 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3740 /* Either target or the pattern are utf8, or has the issue where
3741 * the fold lengths may differ. */
be8e71aa 3742 const char * const l = locinput;
d07ddd77 3743 char *e = PL_regeol;
bc517b45 3744
d513472c 3745 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3746 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3747 {
3748 sayNO;
5486206c 3749 }
d07ddd77
JH
3750 locinput = e;
3751 nextchr = UCHARAT(locinput);
3752 break;
a0ed51b3 3753 }
d6a28714 3754
0a138b74 3755 /* Neither the target nor the pattern are utf8 */
d6a28714 3756 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3757 UCHARAT(s) != fold_array[nextchr])
3758 {
a0ed51b3 3759 sayNO;
9a5a5549 3760 }
24d3c4a9 3761 if (PL_regeol - locinput < ln)
b8c5462f 3762 sayNO;
9a5a5549 3763 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3764 sayNO;
24d3c4a9 3765 locinput += ln;
d6a28714 3766 nextchr = UCHARAT(locinput);
a0d0e21e 3767 break;
9a5a5549 3768 }
63ac0dad
KW
3769
3770 /* XXX Could improve efficiency by separating these all out using a
3771 * macro or in-line function. At that point regcomp.c would no longer
3772 * have to set the FLAGS fields of these */
b2680017
YO
3773 case BOUNDL:
3774 case NBOUNDL:
3775 PL_reg_flags |= RF_tainted;
3776 /* FALL THROUGH */
3777 case BOUND:
63ac0dad 3778 case BOUNDU:
cfaf538b 3779 case BOUNDA:
b2680017 3780 case NBOUND:
63ac0dad 3781 case NBOUNDU:
cfaf538b 3782 case NBOUNDA:
b2680017 3783 /* was last char in word? */
f2e96b5d
KW
3784 if (utf8_target
3785 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3786 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3787 {
b2680017
YO
3788 if (locinput == PL_bostr)
3789 ln = '\n';
3790 else {
3791 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3792
3793 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3794 }
63ac0dad 3795 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3796 ln = isALNUM_uni(ln);
3797 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3798 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3799 }
3800 else {
3801 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3802 n = isALNUM_LC_utf8((U8*)locinput);
3803 }
3804 }
3805 else {
cfaf538b
KW
3806
3807 /* Here the string isn't utf8, or is utf8 and only ascii
3808 * characters are to match \w. In the latter case looking at
3809 * the byte just prior to the current one may be just the final
3810 * byte of a multi-byte character. This is ok. There are two
3811 * cases:
3812 * 1) it is a single byte character, and then the test is doing
3813 * just what it's supposed to.
3814 * 2) it is a multi-byte character, in which case the final
3815 * byte is never mistakable for ASCII, and so the test
3816 * will say it is not a word character, which is the
3817 * correct answer. */
b2680017
YO
3818 ln = (locinput != PL_bostr) ?
3819 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3820 switch (FLAGS(scan)) {
3821 case REGEX_UNICODE_CHARSET:
3822 ln = isWORDCHAR_L1(ln);
3823 n = isWORDCHAR_L1(nextchr);
3824 break;
3825 case REGEX_LOCALE_CHARSET:
3826 ln = isALNUM_LC(ln);
3827 n = isALNUM_LC(nextchr);
3828 break;
3829 case REGEX_DEPENDS_CHARSET:
3830 ln = isALNUM(ln);
3831 n = isALNUM(nextchr);
3832 break;
cfaf538b 3833 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3834 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3835 ln = isWORDCHAR_A(ln);
3836 n = isWORDCHAR_A(nextchr);
3837 break;
63ac0dad
KW
3838 default:
3839 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3840 break;
b2680017
YO
3841 }
3842 }
63ac0dad
KW
3843 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3844 * regcomp.sym */
3845 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3846 sayNO;
3847 break;
f56b6394 3848 case ANYOFV:
d6a28714 3849 case ANYOF:
f56b6394 3850 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3851 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3852 if (locinput >= PL_regeol)
3853 sayNO;
9e55ce06 3854
f2ed9b32 3855 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3856 sayNO;
b32d7d3e 3857 locinput += inclasslen;
b8c5462f 3858 nextchr = UCHARAT(locinput);
e0f9d4a8 3859 break;
ffc61ed2
JH
3860 }
3861 else {
3862 if (nextchr < 0)
3863 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3864 if (!nextchr && locinput >= PL_regeol)
3865 sayNO;
20ed0b26 3866 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3867 sayNO;
ffc61ed2 3868 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3869 break;
3870 }
b8c5462f 3871 break;
20d0b1e9 3872 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3873 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3874 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3875 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3876 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 3877 alnum, "a");
ee9a90b8
KW
3878
3879 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3880 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3881 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 3882 SPACEA, NSPACEA, isSPACE_A,
779d7b58 3883 space, " ");
ee9a90b8
KW
3884
3885 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3886 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 3887 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 3888 digit, "0");
20d0b1e9 3889
0658cdde
KW
3890 case POSIXA:
3891 if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
3892 sayNO;
3893 }
3894 /* Matched a utf8-invariant, so don't have to worry about utf8 */
3895 nextchr = UCHARAT(++locinput);
3896 break;
3897 case NPOSIXA:
3898 if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
3899 sayNO;
3900 }
3901 if (utf8_target) {
3902 locinput += PL_utf8skip[nextchr];
3903 nextchr = UCHARAT(locinput);
3904 }
3905 else {
3906 nextchr = UCHARAT(++locinput);
3907 }
3908 break;
3909
37e2e78e
KW
3910 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3911 a Unicode extended Grapheme Cluster */
3912 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3913 extended Grapheme Cluster is:
3914
3915 CR LF
3916 | Prepend* Begin Extend*
3917 | .
3918
1e958ea9
KW
3919 Begin is: ( Special_Begin | ! Control )
3920 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
3921 Extend is: ( Grapheme_Extend | Spacing_Mark )
3922 Control is: [ GCB_Control CR LF ]
3923 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 3924
27d4fc33
KW
3925 If we create a 'Regular_Begin' = Begin - Special_Begin, then
3926 we can rewrite
3927
3928 Begin is ( Regular_Begin + Special Begin )
3929
3930 It turns out that 98.4% of all Unicode code points match
3931 Regular_Begin. Doing it this way eliminates a table match in
3932 the previouls implementation for almost all Unicode code points.
3933
37e2e78e
KW
3934 There is a subtlety with Prepend* which showed up in testing.
3935 Note that the Begin, and only the Begin is required in:
3936 | Prepend* Begin Extend*
cc3b396d
KW
3937 Also, Begin contains '! Control'. A Prepend must be a
3938 '! Control', which means it must also be a Begin. What it
3939 comes down to is that if we match Prepend* and then find no
3940 suitable Begin afterwards, that if we backtrack the last
3941 Prepend, that one will be a suitable Begin.
37e2e78e
KW
3942 */
3943
b7c83a7e 3944 if (locinput >= PL_regeol)
a0ed51b3 3945 sayNO;
f2ed9b32 3946 if (! utf8_target) {
37e2e78e
KW
3947
3948 /* Match either CR LF or '.', as all the other possibilities
3949 * require utf8 */
3950 locinput++; /* Match the . or CR */
cc3b396d
KW
3951 if (nextchr == '\r' /* And if it was CR, and the next is LF,
3952 match the LF */
37e2e78e
KW
3953 && locinput < PL_regeol
3954 && UCHARAT(locinput) == '\n') locinput++;
3955 }
3956 else {
3957
3958 /* Utf8: See if is ( CR LF ); already know that locinput <
3959 * PL_regeol, so locinput+1 is in bounds */
3960 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3961 locinput += 2;
3962 }
3963 else {
3964 /* In case have to backtrack to beginning, then match '.' */
3965 char *starting = locinput;
3966
3967 /* In case have to backtrack the last prepend */
3968 char *previous_prepend = 0;
3969
3970 LOAD_UTF8_CHARCLASS_GCB();
3971
a1853d78
KW
3972 /* Match (prepend)*, but don't bother trying if empty (as
3973 * being set to _undef indicates) */
3974 if (PL_utf8_X_prepend != &PL_sv_undef) {
72167004
KW
3975 while (locinput < PL_regeol
3976 && swash_fetch(PL_utf8_X_prepend,
3977 (U8*)locinput, utf8_target))
3978 {
3979 previous_prepend = locinput;
3980 locinput += UTF8SKIP(locinput);
3981 }
a1853d78 3982 }
37e2e78e
KW
3983
3984 /* As noted above, if we matched a prepend character, but
3985 * the next thing won't match, back off the last prepend we
3986 * matched, as it is guaranteed to match the begin */
3987 if (previous_prepend
3988 && (locinput >= PL_regeol
27d4fc33 3989 || ! swash_fetch(PL_utf8_X_regular_begin,
f2ed9b32 3990 (U8*)locinput, utf8_target)))
37e2e78e
KW
3991 {
3992 locinput = previous_prepend;
3993 }
3994
3995 /* Note that here we know PL_regeol > locinput, as we
3996 * tested that upon input to this switch case, and if we
3997 * moved locinput forward, we tested the result just above
3998 * and it either passed, or we backed off so that it will
3999 * now pass */
11dfcd49
KW
4000 if (swash_fetch(PL_utf8_X_regular_begin,
4001 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4002 locinput += UTF8SKIP(locinput);
4003 }
4004 else if (! swash_fetch(PL_utf8_X_special_begin,
4005 (U8*)locinput, utf8_target))
4006 {
37e2e78e
KW
4007
4008 /* Here did not match the required 'Begin' in the
4009 * second term. So just match the very first
4010 * character, the '.' of the final term of the regex */
4011 locinput = starting + UTF8SKIP(starting);
27d4fc33 4012 goto exit_utf8;
37e2e78e
KW
4013 } else {
4014
11dfcd49
KW
4015 /* Here is a special begin. It can be composed of
4016 * several individual characters. One possibility is
4017 * RI+ */
4018 if (swash_fetch(PL_utf8_X_RI,
4019 (U8*)locinput, utf8_target))
4020 {
4021 locinput += UTF8SKIP(locinput);
4022 while (locinput < PL_regeol
4023 && swash_fetch(PL_utf8_X_RI,
4024 (U8*)locinput, utf8_target))
4025 {
cd94d768 4026 locinput += UTF8SKIP(locinput);
11dfcd49
KW
4027 }
4028 } else /* Another possibility is T+ */
4029 if (swash_fetch(PL_utf8_X_T,
4030 (U8*)locinput, utf8_target))
4031 {
4032 locinput += UTF8SKIP(locinput);
4033 while (locinput < PL_regeol
4034 && swash_fetch(PL_utf8_X_T,
4035 (U8*)locinput, utf8_target))
4036 {
cd94d768 4037 locinput += UTF8SKIP(locinput);
11dfcd49
KW
4038 }
4039 } else {
4040
4041 /* Here, neither RI+ nor T+; must be some other
4042 * Hangul. That means it is one of the others: L,
4043 * LV, LVT or V, and matches:
4044 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4045
4046 /* Match L* */
4047 while (locinput < PL_regeol
4048 && swash_fetch(PL_utf8_X_L,
4049 (U8*)locinput, utf8_target))
4050 {
4051 locinput += UTF8SKIP(locinput);
4052 }
37e2e78e 4053
11dfcd49
KW
4054 /* Here, have exhausted L*. If the next character
4055 * is not an LV, LVT nor V, it means we had to have
4056 * at least one L, so matches L+ in the original
4057 * equation, we have a complete hangul syllable.
4058 * Are done. */
4059
4060 if (locinput < PL_regeol
4061 && swash_fetch(PL_utf8_X_LV_LVT_V,
4062 (U8*)locinput, utf8_target))
4063 {
4064
4065 /* Otherwise keep going. Must be LV, LVT or V.
4066 * See if LVT */
4067 if (is_utf8_X_LVT((U8*)locinput)) {
4068 locinput += UTF8SKIP(locinput);
4069 } else {
4070
4071 /* Must be V or LV. Take it, then match
4072 * V* */
4073 locinput += UTF8SKIP(locinput);
4074 while (locinput < PL_regeol
4075 && swash_fetch(PL_utf8_X_V,
4076 (U8*)locinput,
4077 utf8_target))
4078 {
4079 locinput += UTF8SKIP(locinput);
4080 }
4081 }
37e2e78e 4082
11dfcd49
KW
4083 /* And any of LV, LVT, or V can be followed
4084 * by T* */
4085 while (locinput < PL_regeol
4086 && swash_fetch(PL_utf8_X_T,
4087 (U8*)locinput,
4088 utf8_target))
4089 {
4090 locinput += UTF8SKIP(locinput);
4091 }
4092 }
cd94d768 4093 }
11dfcd49 4094 }
37e2e78e 4095
11dfcd49
KW
4096 /* Match any extender */
4097 while (locinput < PL_regeol
4098 && swash_fetch(PL_utf8_X_extend,
4099 (U8*)locinput, utf8_target))
4100 {
4101 locinput += UTF8SKIP(locinput);
4102 }
37e2e78e 4103 }
27d4fc33 4104 exit_utf8:
37e2e78e
KW
4105 if (locinput > PL_regeol) sayNO;
4106 }
a0ed51b3
LW
4107 nextchr = UCHARAT(locinput);
4108 break;
81714fb9
YO
4109
4110 case NREFFL:
d7ef4b73
KW
4111 { /* The capture buffer cases. The ones beginning with N for the
4112 named buffers just convert to the equivalent numbered and
4113 pretend they were called as the corresponding numbered buffer
4114 op. */
26ecd678
TC
4115 /* don't initialize these in the declaration, it makes C++
4116 unhappy */
81714fb9 4117 char *s;
ff1157ca 4118 char type;
8368298a
TC
4119 re_fold_t folder;
4120 const U8 *fold_array;
26ecd678 4121 UV utf8_fold_flags;
8368298a 4122
81714fb9 4123 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4124 folder = foldEQ_locale;
4125 fold_array = PL_fold_locale;
4126 type = REFFL;
17580e7a 4127 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4128 goto do_nref;
4129
2f7f8cb1
KW
4130 case NREFFA:
4131 folder = foldEQ_latin1;
4132 fold_array = PL_fold_latin1;
4133 type = REFFA;
4134 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4135 goto do_nref;
4136
d7ef4b73
KW
4137 case NREFFU:
4138 folder = foldEQ_latin1;
4139 fold_array = PL_fold_latin1;
4140 type = REFFU;
d513472c 4141 utf8_fold_flags = 0;
d7ef4b73
KW
4142 goto do_nref;
4143
81714fb9 4144 case NREFF:
d7ef4b73
KW
4145 folder = foldEQ;
4146 fold_array = PL_fold;
4147 type = REFF;
d513472c 4148 utf8_fold_flags = 0;
d7ef4b73
KW
4149 goto do_nref;
4150
4151 case NREF:
4152 type = REF;
83d7b90b
KW
4153 folder = NULL;
4154 fold_array = NULL;
d513472c 4155 utf8_fold_flags = 0;
d7ef4b73
KW
4156 do_nref:
4157
4158 /* For the named back references, find the corresponding buffer
4159 * number */
0a4db386
YO
4160 n = reg_check_named_buff_matched(rex,scan);
4161
d7ef4b73 4162 if ( ! n ) {
81714fb9 4163 sayNO;
d7ef4b73
KW
4164 }
4165 goto do_nref_ref_common;
4166
c8756f30 4167 case REFFL:
3280af22 4168 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4169 folder = foldEQ_locale;
4170 fold_array = PL_fold_locale;
17580e7a 4171 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4172 goto do_ref;
4173
2f7f8cb1
KW
4174 case REFFA:
4175 folder = foldEQ_latin1;
4176 fold_array = PL_fold_latin1;
4177 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4178 goto do_ref;
4179
d7ef4b73
KW
4180 case REFFU:
4181 folder = foldEQ_latin1;
4182 fold_array = PL_fold_latin1;
d513472c 4183 utf8_fold_flags = 0;
d7ef4b73
KW
4184 goto do_ref;
4185
4186 case REFF:
4187 folder = foldEQ;
4188 fold_array = PL_fold;
d513472c 4189 utf8_fold_flags = 0;
83d7b90b 4190 goto do_ref;
d7ef4b73 4191
c277df42 4192 case REF:
83d7b90b
KW
4193 folder = NULL;
4194 fold_array = NULL;
d513472c 4195 utf8_fold_flags = 0;
83d7b90b 4196
d7ef4b73 4197 do_ref:
81714fb9 4198 type = OP(scan);
d7ef4b73
KW
4199 n = ARG(scan); /* which paren pair */
4200
4201 do_nref_ref_common:
b93070ed 4202 ln = rex->offs[n].start;
2c2d71f5 4203 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4204 if (rex->lastparen < n || ln == -1)
af3f8c16 4205 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4206 if (ln == rex->offs[n].end)
a0d0e21e 4207 break;
a0ed51b3 4208
24d3c4a9 4209 s = PL_bostr + ln;
d7ef4b73 4210 if (type != REF /* REF can do byte comparison */
2f65c56d 4211 && (utf8_target || type == REFFU))
d7ef4b73
KW
4212 { /* XXX handle REFFL better */
4213 char * limit = PL_regeol;
4214
4215 /* This call case insensitively compares the entire buffer
4216 * at s, with the current input starting at locinput, but
4217 * not going off the end given by PL_regeol, and returns in
4218 * limit upon success, how much of the current input was
4219 * matched */
b93070ed 4220 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4221 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4222 {
4223 sayNO;
a0ed51b3 4224 }
d7ef4b73 4225 locinput = limit;
a0ed51b3
LW
4226 nextchr = UCHARAT(locinput);
4227 break;
4228 }
4229
d7ef4b73 4230 /* Not utf8: Inline the first character, for speed. */
76e3520e 4231 if (UCHARAT(s) != nextchr &&
81714fb9 4232 (type == REF ||
d7ef4b73 4233 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4234 sayNO;
b93070ed 4235 ln = rex->offs[n].end - ln;
24d3c4a9 4236 if (locinput + ln > PL_regeol)
4633a7c4 4237 sayNO;
81714fb9 4238 if (ln > 1 && (type == REF
24d3c4a9 4239 ? memNE(s, locinput, ln)
d7ef4b73 4240 : ! folder(s, locinput, ln)))
4633a7c4 4241 sayNO;
24d3c4a9 4242 locinput += ln;
76e3520e 4243 nextchr = UCHARAT(locinput);
a0d0e21e 4244 break;
81714fb9 4245 }
a0d0e21e 4246 case NOTHING:
c277df42 4247 case TAIL:
a0d0e21e
LW
4248 break;
4249 case BACK:
4250 break;
40a82448
DM
4251
4252#undef ST
4253#define ST st->u.eval
c277df42 4254 {
c277df42 4255 SV *ret;
d2f13c59 4256 REGEXP *re_sv;
6bda09f9 4257 regexp *re;
f8fc2ecf 4258 regexp_internal *rei;
1a147d38
YO
4259 regnode *startpoint;
4260
4261 case GOSTART:
e7707071
YO
4262 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4263 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4264 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4265 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4266 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4267 Perl_croak(aTHX_
4268 "Pattern subroutine nesting without pos change"
4269 " exceeded limit in regex");
6bda09f9
YO
4270 } else {
4271 nochange_depth = 0;
1a147d38 4272 }
288b8c02 4273 re_sv = rex_sv;
6bda09f9 4274 re = rex;
f8fc2ecf 4275 rei = rexi;
1a147d38 4276 if (OP(scan)==GOSUB) {
6bda09f9
YO
4277 startpoint = scan + ARG2L(scan);
4278 ST.close_paren = ARG(scan);
4279 } else {
f8fc2ecf 4280 startpoint = rei->program+1;
6bda09f9
YO
4281 ST.close_paren = 0;
4282 }
4283 goto eval_recurse_doit;
118e2215 4284 assert(0); /* NOTREACHED */
6bda09f9
YO
4285 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4286 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4287 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4288 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4289 } else {
4290 nochange_depth = 0;
4291 }
8e5e9ebe 4292 {
4aabdb9b 4293 /* execute the code in the {...} */
81ed78b2 4294
4aabdb9b 4295 dSP;
81ed78b2 4296 SV ** before;
1f4d1a1e 4297 OP * const oop = PL_op;
4aabdb9b 4298 COP * const ocurcop = PL_curcop;
81ed78b2 4299 OP *nop;
d80618d2 4300 char *saved_regeol = PL_regeol;
91332126 4301 struct re_save_state saved_state;
81ed78b2 4302 CV *newcv;
91332126 4303
74088413
DM
4304 /* save *all* paren positions */
4305 regcppush(rex, 0);
4306 REGCP_SET(runops_cp);
4307
6562f1c4 4308 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4309 * eval we would normally put it on the save stack, like with
4310 * save_re_context. However, re-evals have a weird scoping so we
4311 * can't just add ENTER/LEAVE here. With that, things like
4312 *
4313 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4314 *
4315 * would break, as they expect the localisation to be unwound
4316 * only when the re-engine backtracks through the bit that
4317 * localised it.
4318 *
4319 * What we do instead is just saving the state in a local c
4320 * variable.
4321 */
91332126 4322 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4323
d24ca0c5 4324 PL_reg_state.re_reparsing = FALSE;
91332126 4325
81ed78b2
DM
4326 if (!caller_cv)
4327 caller_cv = find_runcv(NULL);
4328
4aabdb9b 4329 n = ARG(scan);
81ed78b2 4330
b30fcab9 4331 if (rexi->data->what[n] == 'r') { /* code from an external qr */
81ed78b2 4332 newcv = ((struct regexp *)SvANY(
b30fcab9
DM
4333 (REGEXP*)(rexi->data->data[n])
4334 ))->qr_anoncv
81ed78b2
DM
4335 ;
4336 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4337 }
4338 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4339 newcv = caller_cv;
4340 nop = (OP*)rexi->data->data[n];
4341 assert(CvDEPTH(newcv));
68e2671b
DM
4342 }
4343 else {
d24ca0c5
DM
4344 /* literal with own CV */
4345 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4346 newcv = rex->qr_anoncv;
4347 nop = (OP*)rexi->data->data[n];
68e2671b 4348 }
81ed78b2 4349
0e458318
DM
4350 /* normally if we're about to execute code from the same
4351 * CV that we used previously, we just use the existing
4352 * CX stack entry. However, its possible that in the
4353 * meantime we may have backtracked, popped from the save
4354 * stack, and undone the SAVECOMPPAD(s) associated with
4355 * PUSH_MULTICALL; in which case PL_comppad no longer
4356 * points to newcv's pad. */
4357 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4358 {
4359 I32 depth = (newcv == caller_cv) ? 0 : 1;
4360 if (last_pushed_cv) {
4361 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4362 }
4363 else {
4364 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4365 }
4366 last_pushed_cv = newcv;
4367 }
4368 last_pad = PL_comppad;
4369
2e2e3f36
DM
4370 /* the initial nextstate you would normally execute
4371 * at the start of an eval (which would cause error
4372 * messages to come from the eval), may be optimised
4373 * away from the execution path in the regex code blocks;
4374 * so manually set PL_curcop to it initially */
4375 {
81ed78b2 4376 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4377 assert(o->op_type == OP_NULL);
4378 if (o->op_targ == OP_SCOPE) {
4379 o = cUNOPo->op_first;
4380 }
4381 else {
4382 assert(o->op_targ == OP_LEAVE);
4383 o = cUNOPo->op_first;
4384 assert(o->op_type == OP_ENTER);
4385 o = o->op_sibling;
4386 }
4387
4388 if (o->op_type != OP_STUB) {
4389 assert( o->op_type == OP_NEXTSTATE
4390 || o->op_type == OP_DBSTATE
4391 || (o->op_type == OP_NULL
4392 && ( o->op_targ == OP_NEXTSTATE
4393 || o->op_targ == OP_DBSTATE
4394 )
4395 )
4396 );
4397 PL_curcop = (COP*)o;
4398 }
4399 }
81ed78b2 4400 nop = nop->op_next;
2e2e3f36 4401
24b23f37 4402 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4403 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4404
b93070ed 4405 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4406
2bf803e2
YO
4407 if (sv_yes_mark) {
4408 SV *sv_mrk = get_sv("REGMARK", 1);
4409 sv_setsv(sv_mrk, sv_yes_mark);
4410 }
4411
81ed78b2
DM
4412 /* we don't use MULTICALL here as we want to call the
4413 * first op of the block of interest, rather than the
4414 * first op of the sub */
4415 before = SP;
4416 PL_op = nop;
8e5e9ebe
RGS
4417 CALLRUNOPS(aTHX); /* Scalar context. */
4418 SPAGAIN;
4419 if (SP == before)
075aa684 4420 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4421 else {
4422 ret = POPs;
4423 PUTBACK;
4424 }
4aabdb9b 4425
e4bfbed3
DM
4426 /* before restoring everything, evaluate the returned
4427 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4428 * PL_op or pad. Also need to process any magic vars
4429 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4430
4431 PL_op = NULL;
4432
5e98dac2 4433 re_sv = NULL;
e4bfbed3
DM
4434 if (logical == 0) /* (?{})/ */
4435 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4436 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4437 sw = cBOOL(SvTRUE(ret));
4438 logical = 0;
4439 }
4440 else { /* /(??{}) */
497d0a96
DM
4441 /* if its overloaded, let the regex compiler handle
4442 * it; otherwise extract regex, or stringify */
4443 if (!SvAMAGIC(ret)) {
4444 SV *sv = ret;
4445 if (SvROK(sv))
4446 sv = SvRV(sv);
4447 if (SvTYPE(sv) == SVt_REGEXP)
4448 re_sv = (REGEXP*) sv;
4449 else if (SvSMAGICAL(sv)) {
4450 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4451 if (mg)
4452 re_sv = (REGEXP *) mg->mg_obj;
4453 }
e4bfbed3 4454
497d0a96
DM
4455 /* force any magic, undef warnings here */
4456 if (!re_sv) {
4457 ret = sv_mortalcopy(ret);
4458 (void) SvPV_force_nolen(ret);
4459 }
e4bfbed3
DM
4460 }
4461
4462 }
4463
91332126
FR
4464 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4465
81ed78b2
DM
4466 /* *** Note that at this point we don't restore
4467 * PL_comppad, (or pop the CxSUB) on the assumption it may
4468 * be used again soon. This is safe as long as nothing
4469 * in the regexp code uses the pad ! */
4aabdb9b 4470 PL_op = oop;
4aabdb9b 4471 PL_curcop = ocurcop;
d80618d2 4472 PL_regeol = saved_regeol;
e4bfbed3
DM
4473 S_regcp_restore(aTHX_ rex, runops_cp);
4474
4475 if (logical != 2)
4aabdb9b 4476 break;
8e5e9ebe 4477 }
e4bfbed3
DM
4478
4479 /* only /(??{})/ from now on */
24d3c4a9 4480 logical = 0;
4aabdb9b 4481 {
4f639d21
DM
4482 /* extract RE object from returned value; compiling if
4483 * necessary */
5c35adbb 4484
575c37f6
DM
4485 if (re_sv) {
4486 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4487 }
0f5d15d6 4488 else {
c737faaf 4489 U32 pm_flags = 0;
a3b680e6 4490 const I32 osize = PL_regsize;
0f5d15d6 4491
9753d940
DM
4492 if (SvUTF8(ret) && IN_BYTES) {
4493 /* In use 'bytes': make a copy of the octet
4494 * sequence, but without the flag on */
b9ad30b4
NC
4495 STRLEN len;
4496 const char *const p = SvPV(ret, len);
4497 ret = newSVpvn_flags(p, len, SVs_TEMP);
4498 }
732caac7
DM
4499 if (rex->intflags & PREGf_USE_RE_EVAL)
4500 pm_flags |= PMf_USE_RE_EVAL;
4501
4502 /* if we got here, it should be an engine which
4503 * supports compiling code blocks and stuff */
4504 assert(rex->engine && rex->engine->op_comp);
ec841a27 4505 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4506 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4507 rex->engine, NULL, NULL,
4508 /* copy /msix etc to inner pattern */
4509 scan->flags,
4510 pm_flags);
732caac7 4511
9041c2e3 4512 if (!(SvFLAGS(ret)
faf82a0b 4513 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4514 | SVs_GMG))) {
a2794585
NC
4515 /* This isn't a first class regexp. Instead, it's
4516 caching a regexp onto an existing, Perl visible
4517 scalar. */
575c37f6 4518 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4519 }
0f5d15d6 4520 PL_regsize = osize;
74088413
DM
4521 /* safe to do now that any $1 etc has been
4522 * interpolated into the new pattern string and
4523 * compiled */
4524 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4525 }
575c37f6 4526 re = (struct regexp *)SvANY(re_sv);
4aabdb9b 4527 }
07bc277f 4528 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4529 re->subbeg = rex->subbeg;
4530 re->sublen = rex->sublen;
f8fc2ecf 4531 rei = RXi_GET(re);
6bda09f9 4532 DEBUG_EXECUTE_r(
f2ed9b32 4533 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4534 "Matching embedded");
4535 );
f8fc2ecf 4536 startpoint = rei->program + 1;
1a147d38 4537 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4538
1a147d38 4539 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4540 /* run the pattern returned from (??{...}) */
b93070ed 4541 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4542 REGCP_SET(ST.lastcp);
6bda09f9 4543
0357f1fd
ML
4544 re->lastparen = 0;
4545 re->lastcloseparen = 0;
4546
4aabdb9b 4547 PL_reginput = locinput;
ae0beba1 4548 PL_regsize = 0;
4aabdb9b
DM
4549
4550 /* XXXX This is too dramatic a measure... */
4551 PL_reg_maxiter = 0;
4552
faec1544 4553 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4554 if (RX_UTF8(re_sv))
faec1544
DM
4555 PL_reg_flags |= RF_utf8;
4556 else
4557 PL_reg_flags &= ~RF_utf8;
4558 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4559
288b8c02 4560 ST.prev_rex = rex_sv;
faec1544 4561 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
4562 rex_sv = re_sv;
4563 SET_reg_curpm(rex_sv);
288b8c02 4564 rex = re;
f8fc2ecf 4565 rexi = rei;
faec1544 4566 cur_curlyx = NULL;
40a82448 4567 ST.B = next;
faec1544
DM
4568 ST.prev_eval = cur_eval;
4569 cur_eval = st;
faec1544 4570 /* now continue from first node in postoned RE */
6bda09f9 4571 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
118e2215 4572 assert(0); /* NOTREACHED */
c277df42 4573 }
40a82448 4574
faec1544
DM
4575 case EVAL_AB: /* cleanup after a successful (??{A})B */
4576 /* note: this is called twice; first after popping B, then A */
4577 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4578 rex_sv = ST.prev_rex;
4579 SET_reg_curpm(rex_sv);
288b8c02 4580 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4581 rexi = RXi_GET(rex);
faec1544
DM
4582 regcpblow(ST.cp);
4583 cur_eval = ST.prev_eval;
4584 cur_curlyx = ST.prev_curlyx;
34a81e2b 4585
40a82448
DM
4586 /* XXXX This is too dramatic a measure... */
4587 PL_reg_maxiter = 0;
e7707071 4588 if ( nochange_depth )
4b196cd4 4589 nochange_depth--;
262b90c4 4590 sayYES;
40a82448 4591
40a82448 4592
faec1544
DM
4593 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4594 /* note: this is called twice; first after popping B, then A */
4595 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4596 rex_sv = ST.prev_rex;
4597 SET_reg_curpm(rex_sv);
288b8c02 4598 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4599 rexi = RXi_GET(rex);
0357f1fd 4600
40a82448
DM
4601 PL_reginput = locinput;
4602 REGCP_UNWIND(ST.lastcp);
4603 regcppop(rex);
faec1544
DM
4604 cur_eval = ST.prev_eval;
4605 cur_curlyx = ST.prev_curlyx;
4606 /* XXXX This is too dramatic a measure... */
4607 PL_reg_maxiter = 0;
e7707071 4608 if ( nochange_depth )
4b196cd4 4609 nochange_depth--;
40a82448 4610 sayNO_SILENT;
40a82448
DM
4611#undef ST
4612
a0d0e21e 4613 case OPEN:
c277df42 4614 n = ARG(scan); /* which paren pair */
1ca2007e 4615 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
4616 if (n > PL_regsize)
4617 PL_regsize = n;
495f47a5
DM
4618 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4619 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4620 PTR2UV(rex),
4621 PTR2UV(rex->offs),
4622 (UV)n,
4623 (IV)rex->offs[n].start_tmp,
4624 (UV)PL_regsize
4625 ));
e2e6a0f1 4626 lastopen = n;
a0d0e21e 4627 break;
495f47a5
DM
4628
4629/* XXX really need to log other places start/end are set too */
4630#define CLOSE_CAPTURE \
4631 rex->offs[n].start = rex->offs[n].start_tmp; \
4632 rex->offs[n].end = locinput - PL_bostr; \
4633 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4634 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4635 PTR2UV(rex), \
4636 PTR2UV(rex->offs), \
4637 (UV)n, \
4638 (IV)rex->offs[n].start, \
4639 (IV)rex->offs[n].end \
4640 ))
4641
a0d0e21e 4642 case CLOSE:
c277df42 4643 n = ARG(scan); /* which paren pair */
495f47a5 4644 CLOSE_CAPTURE;
7f69552c
YO
4645 /*if (n > PL_regsize)
4646 PL_regsize = n;*/
b93070ed
DM
4647 if (n > rex->lastparen)
4648 rex->lastparen = n;
4649 rex->lastcloseparen = n;
3b6647e0 4650 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4651 goto fake_end;
4652 }
a0d0e21e 4653 break;
e2e6a0f1
YO
4654 case ACCEPT:
4655 if (ARG(scan)){
4656 regnode *cursor;
4657 for (cursor=scan;
4658 cursor && OP(cursor)!=END;
4659 cursor=regnext(cursor))
4660 {
4661 if ( OP(cursor)==CLOSE ){
4662 n = ARG(cursor);
4663 if ( n <= lastopen ) {
495f47a5 4664 CLOSE_CAPTURE;
e2e6a0f1
YO
4665 /*if (n > PL_regsize)
4666 PL_regsize = n;*/
b93070ed
DM
4667 if (n > rex->lastparen)
4668 rex->lastparen = n;
4669 rex->lastcloseparen = n;
3b6647e0
RB
4670 if ( n == ARG(scan) || (cur_eval &&
4671 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4672 break;
4673 }
4674 }
4675 }
4676 }
4677 goto fake_end;
4678 /*NOTREACHED*/
c277df42
IZ
4679 case GROUPP:
4680 n = ARG(scan); /* which paren pair */
b93070ed 4681 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 4682 break;
0a4db386
YO
4683 case NGROUPP:
4684 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4685 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4686 break;
1a147d38 4687 case INSUBP:
0a4db386 4688 n = ARG(scan);
3b6647e0 4689 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4690 break;
4691 case DEFINEP:
4692 sw = 0;
4693 break;
c277df42 4694 case IFTHEN:
2c2d71f5 4695 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4696 if (sw)
c277df42
IZ
4697 next = NEXTOPER(NEXTOPER(scan));
4698 else {
4699 next = scan + ARG(scan);
4700 if (OP(next) == IFTHEN) /* Fake one. */
4701 next = NEXTOPER(NEXTOPER(next));
4702 }
4703 break;
4704 case LOGICAL:
24d3c4a9 4705 logical = scan->flags;
c277df42 4706 break;
c476f425 4707
2ab05381 4708/*******************************************************************
2ab05381 4709
c476f425
DM
4710The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4711pattern, where A and B are subpatterns. (For simple A, CURLYM or
4712STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4713
c476f425 4714A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4715
c476f425
DM
4716On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4717state, which contains the current count, initialised to -1. It also sets
4718cur_curlyx to point to this state, with any previous value saved in the
4719state block.
2ab05381 4720
c476f425
DM
4721CURLYX then jumps straight to the WHILEM op, rather than executing A,
4722since the pattern may possibly match zero times (i.e. it's a while {} loop
4723rather than a do {} while loop).
2ab05381 4724
c476f425
DM
4725Each entry to WHILEM represents a successful match of A. The count in the
4726CURLYX block is incremented, another WHILEM state is pushed, and execution
4727passes to A or B depending on greediness and the current count.
2ab05381 4728
c476f425
DM
4729For example, if matching against the string a1a2a3b (where the aN are
4730substrings that match /A/), then the match progresses as follows: (the
4731pushed states are interspersed with the bits of strings matched so far):
2ab05381 4732
c476f425
DM
4733 <CURLYX cnt=-1>
4734 <CURLYX cnt=0><WHILEM>
4735 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4736 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4737 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4738 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4739
c476f425
DM
4740(Contrast this with something like CURLYM, which maintains only a single
4741backtrack state:
2ab05381 4742
c476f425
DM
4743 <CURLYM cnt=0> a1
4744 a1 <CURLYM cnt=1> a2
4745 a1 a2 <CURLYM cnt=2> a3
4746 a1 a2 a3 <CURLYM cnt=3> b
4747)
2ab05381 4748
c476f425
DM
4749Each WHILEM state block marks a point to backtrack to upon partial failure
4750of A or B, and also contains some minor state data related to that
4751iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4752overall state, such as the count, and pointers to the A and B ops.
2ab05381 4753
c476f425
DM
4754This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4755must always point to the *current* CURLYX block, the rules are:
2ab05381 4756
c476f425
DM
4757When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4758and set cur_curlyx to point the new block.
2ab05381 4759
c476f425
DM
4760When popping the CURLYX block after a successful or unsuccessful match,
4761restore the previous cur_curlyx.
2ab05381 4762
c476f425
DM
4763When WHILEM is about to execute B, save the current cur_curlyx, and set it
4764to the outer one saved in the CURLYX block.
2ab05381 4765
c476f425
DM
4766When popping the WHILEM block after a successful or unsuccessful B match,
4767restore the previous cur_curlyx.
2ab05381 4768
c476f425
DM
4769Here's an example for the pattern (AI* BI)*BO
4770I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4771
c476f425
DM
4772cur_
4773curlyx backtrack stack
4774------ ---------------
4775NULL
4776CO <CO prev=NULL> <WO>
4777CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4778CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4779NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4780
c476f425
DM
4781At this point the pattern succeeds, and we work back down the stack to
4782clean up, restoring as we go:
95b24440 4783
c476f425
DM
4784CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4785CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4786CO <CO prev=NULL> <WO>
4787NULL
a0374537 4788
c476f425
DM
4789*******************************************************************/
4790
4791#define ST st->u.curlyx
4792
4793 case CURLYX: /* start of /A*B/ (for complex A) */
4794 {
4795 /* No need to save/restore up to this paren */
4796 I32 parenfloor = scan->flags;
4797
4798 assert(next); /* keep Coverity happy */
4799 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4800 next += ARG(next);
4801
4802 /* XXXX Probably it is better to teach regpush to support
4803 parenfloor > PL_regsize... */
b93070ed
DM
4804 if (parenfloor > (I32)rex->lastparen)
4805 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
4806
4807 ST.prev_curlyx= cur_curlyx;
4808 cur_curlyx = st;
4809 ST.cp = PL_savestack_ix;
4810
4811 /* these fields contain the state of the current curly.
4812 * they are accessed by subsequent WHILEMs */
4813 ST.parenfloor = parenfloor;
d02d6d97 4814 ST.me = scan;
c476f425 4815 ST.B = next;
24d3c4a9
DM
4816 ST.minmod = minmod;
4817 minmod = 0;
c476f425
DM
4818 ST.count = -1; /* this will be updated by WHILEM */
4819 ST.lastloc = NULL; /* this will be updated by WHILEM */
4820
4821 PL_reginput = locinput;
4822 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
118e2215 4823 assert(0); /* NOTREACHED */
c476f425 4824 }
a0d0e21e 4825
c476f425 4826 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4827 cur_curlyx = ST.prev_curlyx;
4828 sayYES;
118e2215 4829 assert(0); /* NOTREACHED */
a0d0e21e 4830
c476f425
DM
4831 case CURLYX_end_fail: /* just failed to match all of A*B */
4832 regcpblow(ST.cp);
4833 cur_curlyx = ST.prev_curlyx;
4834 sayNO;
118e2215 4835 assert(0); /* NOTREACHED */
4633a7c4 4836
a0d0e21e 4837
c476f425
DM
4838#undef ST
4839#define ST st->u.whilem
4840
4841 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4842 {
4843 /* see the discussion above about CURLYX/WHILEM */
c476f425 4844 I32 n;
d02d6d97
DM
4845 int min = ARG1(cur_curlyx->u.curlyx.me);
4846 int max = ARG2(cur_curlyx->u.curlyx.me);
4847 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4848
c476f425
DM
4849 assert(cur_curlyx); /* keep Coverity happy */
4850 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4851 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4852 ST.cache_offset = 0;
4853 ST.cache_mask = 0;
4854
4855 PL_reginput = locinput;
4856
4857 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4858 "%*s whilem: matched %ld out of %d..%d\n",
4859 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4860 );
a0d0e21e 4861
c476f425 4862 /* First just match a string of min A's. */
a0d0e21e 4863
d02d6d97 4864 if (n < min) {
b93070ed 4865 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 4866 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4867 REGCP_SET(ST.lastcp);
4868
d02d6d97 4869 PUSH_STATE_GOTO(WHILEM_A_pre, A);
118e2215 4870 assert(0); /* NOTREACHED */
c476f425
DM
4871 }
4872
4873 /* If degenerate A matches "", assume A done. */
4874
4875 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4876 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4877 "%*s whilem: empty match detected, trying continuation...\n",
4878 REPORT_CODE_OFF+depth*2, "")
4879 );
4880 goto do_whilem_B_max;
4881 }
4882
4883 /* super-linear cache processing */
4884
4885 if (scan->flags) {
a0d0e21e 4886
2c2d71f5 4887 if (!PL_reg_maxiter) {
c476f425
DM
4888 /* start the countdown: Postpone detection until we
4889 * know the match is not *that* much linear. */
2c2d71f5 4890 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4891 /* possible overflow for long strings and many CURLYX's */
4892 if (PL_reg_maxiter < 0)
4893 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4894 PL_reg_leftiter = PL_reg_maxiter;
4895 }
c476f425 4896
2c2d71f5 4897 if (PL_reg_leftiter-- == 0) {
c476f425 4898 /* initialise cache */
3298f257 4899 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 4900 if (PL_reg_poscache) {
eb160463 4901 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
4902 Renew(PL_reg_poscache, size, char);
4903 PL_reg_poscache_size = size;
4904 }
4905 Zero(PL_reg_poscache, size, char);
4906 }
4907 else {
4908 PL_reg_poscache_size = size;
a02a5408 4909 Newxz(PL_reg_poscache, size, char);
2c2d71f5 4910 }
c476f425
DM
4911 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4912 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4913 PL_colors[4], PL_colors[5])
4914 );
2c2d71f5 4915 }
c476f425 4916
2c2d71f5 4917 if (PL_reg_leftiter < 0) {
c476f425
DM
4918 /* have we already failed at this position? */
4919 I32 offset, mask;
4920 offset = (scan->flags & 0xf) - 1
4921 + (locinput - PL_bostr) * (scan->flags>>4);
4922 mask = 1 << (offset % 8);
4923 offset /= 8;
4924 if (PL_reg_poscache[offset] & mask) {
4925 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4926 "%*s whilem: (cache) already tried at this position...\n",
4927 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 4928 );
3298f257 4929 sayNO; /* cache records failure */
2c2d71f5 4930 }
c476f425
DM
4931 ST.cache_offset = offset;
4932 ST.cache_mask = mask;
2c2d71f5 4933 }
c476f425 4934 }
2c2d71f5 4935
c476f425 4936 /* Prefer B over A for minimal matching. */
a687059c 4937
c476f425
DM
4938 if (cur_curlyx->u.curlyx.minmod) {
4939 ST.save_curlyx = cur_curlyx;
4940 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 4941 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425
DM
4942 REGCP_SET(ST.lastcp);
4943 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
118e2215 4944 assert(0); /* NOTREACHED */
c476f425 4945 }
a0d0e21e 4946
c476f425
DM
4947 /* Prefer A over B for maximal matching. */
4948
d02d6d97 4949 if (n < max) { /* More greed allowed? */
b93070ed 4950 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
4951 cur_curlyx->u.curlyx.lastloc = locinput;
4952 REGCP_SET(ST.lastcp);
d02d6d97 4953 PUSH_STATE_GOTO(WHILEM_A_max, A);
118e2215 4954 assert(0); /* NOTREACHED */
c476f425
DM
4955 }
4956 goto do_whilem_B_max;
4957 }
118e2215 4958 assert(0); /* NOTREACHED */
c476f425
DM
4959
4960 case WHILEM_B_min: /* just matched B in a minimal match */
4961 case WHILEM_B_max: /* just matched B in a maximal match */
4962 cur_curlyx = ST.save_curlyx;
4963 sayYES;
118e2215 4964 assert(0); /* NOTREACHED */
c476f425
DM
4965
4966 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4967 cur_curlyx = ST.save_curlyx;
4968 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4969 cur_curlyx->u.curlyx.count--;
4970 CACHEsayNO;
118e2215 4971 assert(0); /* NOTREACHED */
c476f425
DM
4972
4973 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
4974 /* FALL THROUGH */
4975 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
4976 REGCP_UNWIND(ST.lastcp);
4977 regcppop(rex);
c476f425
DM
4978 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4979 cur_curlyx->u.curlyx.count--;
4980 CACHEsayNO;
118e2215 4981 assert(0); /* NOTREACHED */
c476f425
DM
4982
4983 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4984 REGCP_UNWIND(ST.lastcp);
4985 regcppop(rex); /* Restore some previous $<digit>s? */
4986 PL_reginput = locinput;
4987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4988 "%*s whilem: failed, trying continuation...\n",
4989 REPORT_CODE_OFF+depth*2, "")
4990 );
4991 do_whilem_B_max:
4992 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4993 && ckWARN(WARN_REGEXP)
4994 && !(PL_reg_flags & RF_warned))
4995 {
4996 PL_reg_flags |= RF_warned;
dcbac5bb
FC
4997 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4998 "Complex regular subexpression recursion limit (%d) "
4999 "exceeded",
c476f425
DM
5000 REG_INFTY - 1);
5001 }
5002
5003 /* now try B */
5004 ST.save_curlyx = cur_curlyx;
5005 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5006 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
118e2215 5007 assert(0); /* NOTREACHED */
c476f425
DM
5008
5009 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5010 cur_curlyx = ST.save_curlyx;
5011 REGCP_UNWIND(ST.lastcp);
5012 regcppop(rex);
5013
d02d6d97 5014 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5015 /* Maximum greed exceeded */
5016 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5017 && ckWARN(WARN_REGEXP)
5018 && !(PL_reg_flags & RF_warned))
5019 {
3280af22 5020 PL_reg_flags |= RF_warned;
c476f425 5021 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5022 "Complex regular subexpression recursion "
5023 "limit (%d) exceeded",
c476f425 5024 REG_INFTY - 1);
a0d0e21e 5025 }
c476f425 5026 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5027 CACHEsayNO;
a0d0e21e 5028 }
c476f425
DM
5029
5030 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5031 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5032 );
5033 /* Try grabbing another A and see if it helps. */
5034 PL_reginput = locinput;
5035 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5036 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5037 REGCP_SET(ST.lastcp);
d02d6d97
DM
5038 PUSH_STATE_GOTO(WHILEM_A_min,
5039 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
118e2215 5040 assert(0); /* NOTREACHED */
40a82448
DM
5041
5042#undef ST
5043#define ST st->u.branch
5044
5045 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5046 next = scan + ARG(scan);
5047 if (next == scan)
5048 next = NULL;
40a82448
DM
5049 scan = NEXTOPER(scan);
5050 /* FALL THROUGH */
c277df42 5051
40a82448
DM
5052 case BRANCH: /* /(...|A|...)/ */
5053 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5054 ST.lastparen = rex->lastparen;
f6033a9d 5055 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5056 ST.next_branch = next;
5057 REGCP_SET(ST.cp);
5058 PL_reginput = locinput;
02db2b7b 5059
40a82448 5060 /* Now go into the branch */
5d458dd8
YO
5061 if (has_cutgroup) {
5062 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5063 } else {
5064 PUSH_STATE_GOTO(BRANCH_next, scan);
5065 }
118e2215 5066 assert(0); /* NOTREACHED */
5d458dd8
YO
5067 case CUTGROUP:
5068 PL_reginput = locinput;
5069 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5070 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8 5071 PUSH_STATE_GOTO(CUTGROUP_next,next);
118e2215 5072 assert(0); /* NOTREACHED */
5d458dd8
YO
5073 case CUTGROUP_next_fail:
5074 do_cutgroup = 1;
5075 no_final = 1;
5076 if (st->u.mark.mark_name)
5077 sv_commit = st->u.mark.mark_name;
5078 sayNO;
118e2215 5079 assert(0); /* NOTREACHED */
5d458dd8
YO
5080 case BRANCH_next:
5081 sayYES;
118e2215 5082 assert(0); /* NOTREACHED */
40a82448 5083 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5084 if (do_cutgroup) {
5085 do_cutgroup = 0;
5086 no_final = 0;
5087 }
40a82448 5088 REGCP_UNWIND(ST.cp);
a8d1f4b4 5089 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5090 scan = ST.next_branch;
5091 /* no more branches? */
5d458dd8
YO
5092 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5093 DEBUG_EXECUTE_r({
5094 PerlIO_printf( Perl_debug_log,
5095 "%*s %sBRANCH failed...%s\n",
5096 REPORT_CODE_OFF+depth*2, "",
5097 PL_colors[4],
5098 PL_colors[5] );
5099 });
5100 sayNO_SILENT;
5101 }
40a82448 5102 continue; /* execute next BRANCH[J] op */
118e2215 5103 assert(0); /* NOTREACHED */
40a82448 5104
a0d0e21e 5105 case MINMOD:
24d3c4a9 5106 minmod = 1;
a0d0e21e 5107 break;
40a82448
DM
5108
5109#undef ST
5110#define ST st->u.curlym
5111
5112 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5113
5114 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5115 * only a single backtracking state, no matter how many matches
40a82448
DM
5116 * there are in {m,n}. It relies on the pattern being constant
5117 * length, with no parens to influence future backrefs
5118 */
5119
5120 ST.me = scan;
dc45a647 5121 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5122
f6033a9d
DM
5123 ST.lastparen = rex->lastparen;
5124 ST.lastcloseparen = rex->lastcloseparen;
5125
40a82448
DM
5126 /* if paren positive, emulate an OPEN/CLOSE around A */
5127 if (ST.me->flags) {
3b6647e0 5128 U32 paren = ST.me->flags;
40a82448
DM
5129 if (paren > PL_regsize)
5130 PL_regsize = paren;
c277df42 5131 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5132 }
40a82448
DM
5133 ST.A = scan;
5134 ST.B = next;
5135 ST.alen = 0;
5136 ST.count = 0;
24d3c4a9
DM
5137 ST.minmod = minmod;
5138 minmod = 0;
40a82448
DM
5139 ST.c1 = CHRTEST_UNINIT;
5140 REGCP_SET(ST.cp);
6407bf3b 5141
40a82448
DM
5142 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5143 goto curlym_do_B;
5144
5145 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 5146 PL_reginput = locinput;
40a82448 5147 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
118e2215 5148 assert(0); /* NOTREACHED */
5f80c4cf 5149
40a82448
DM
5150 case CURLYM_A: /* we've just matched an A */
5151 locinput = st->locinput;
5152 nextchr = UCHARAT(locinput);
5153
5154 ST.count++;
5155 /* after first match, determine A's length: u.curlym.alen */
5156 if (ST.count == 1) {
5157 if (PL_reg_match_utf8) {
5158 char *s = locinput;
5159 while (s < PL_reginput) {
5160 ST.alen++;
5161 s += UTF8SKIP(s);
5162 }
5163 }
5164 else {
5165 ST.alen = PL_reginput - locinput;
5166 }
5167 if (ST.alen == 0)
5168 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5169 }
0cadcf80
DM
5170 DEBUG_EXECUTE_r(
5171 PerlIO_printf(Perl_debug_log,
40a82448 5172 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5173 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5174 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5175 );
5176
40a82448 5177 locinput = PL_reginput;
0a4db386
YO
5178
5179 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5180 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5181 goto fake_end;
5182
c966426a
DM
5183 {
5184 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5185 if ( max == REG_INFTY || ST.count < max )
5186 goto curlym_do_A; /* try to match another A */
5187 }
40a82448 5188 goto curlym_do_B; /* try to match B */
5f80c4cf 5189
40a82448
DM
5190 case CURLYM_A_fail: /* just failed to match an A */
5191 REGCP_UNWIND(ST.cp);
0a4db386
YO
5192
5193 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5194 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5195 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5196 sayNO;
0cadcf80 5197
40a82448
DM
5198 curlym_do_B: /* execute the B in /A{m,n}B/ */
5199 PL_reginput = locinput;
5200 if (ST.c1 == CHRTEST_UNINIT) {
5201 /* calculate c1 and c2 for possible match of 1st char
5202 * following curly */
5203 ST.c1 = ST.c2 = CHRTEST_VOID;
5204 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5205 regnode *text_node = ST.B;
5206 if (! HAS_TEXT(text_node))
5207 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5208 /* this used to be
5209
5210 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5211
5212 But the former is redundant in light of the latter.
5213
5214 if this changes back then the macro for
5215 IS_TEXT and friends need to change.
5216 */
5217 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5218 {
ee9b8eae 5219
40a82448 5220 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5221 switch (OP(text_node)) {
5222 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5223 case EXACTFA:
3c760661 5224 case EXACTFU_SS:
fab2782b 5225 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5226 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5227 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5228 default: ST.c2 = ST.c1;
5229 }
c277df42 5230 }
c277df42 5231 }
40a82448
DM
5232 }
5233
5234 DEBUG_EXECUTE_r(
5235 PerlIO_printf(Perl_debug_log,
5236 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5237 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5238 "", (IV)ST.count)
5239 );
5240 if (ST.c1 != CHRTEST_VOID
5241 && UCHARAT(PL_reginput) != ST.c1
5242 && UCHARAT(PL_reginput) != ST.c2)
5243 {
5244 /* simulate B failing */
3e901dc0
YO
5245 DEBUG_OPTIMISE_r(
5246 PerlIO_printf(Perl_debug_log,
5247 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5248 (int)(REPORT_CODE_OFF+(depth*2)),"",
5249 (IV)ST.c1,(IV)ST.c2
5250 ));
40a82448
DM
5251 state_num = CURLYM_B_fail;
5252 goto reenter_switch;
5253 }
5254
5255 if (ST.me->flags) {
f6033a9d 5256 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5257 I32 paren = ST.me->flags;
5258 if (ST.count) {
b93070ed 5259 rex->offs[paren].start
40a82448 5260 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
b93070ed 5261 rex->offs[paren].end = PL_reginput - PL_bostr;
f6033a9d
DM
5262 if ((U32)paren > rex->lastparen)
5263 rex->lastparen = paren;
5264 rex->lastcloseparen = paren;
c277df42 5265 }
40a82448 5266 else
b93070ed 5267 rex->offs[paren].end = -1;
0a4db386 5268 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5269 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5270 {
5271 if (ST.count)
5272 goto fake_end;
5273 else
5274 sayNO;
5275 }
c277df42 5276 }
0a4db386 5277
40a82448 5278 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
118e2215 5279 assert(0); /* NOTREACHED */
40a82448
DM
5280
5281 case CURLYM_B_fail: /* just failed to match a B */
5282 REGCP_UNWIND(ST.cp);
a8d1f4b4 5283 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5284 if (ST.minmod) {
84d2fa14
HS
5285 I32 max = ARG2(ST.me);
5286 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5287 sayNO;
5288 goto curlym_do_A; /* try to match a further A */
5289 }
5290 /* backtrack one A */
5291 if (ST.count == ARG1(ST.me) /* min */)
5292 sayNO;
5293 ST.count--;
5294 locinput = HOPc(locinput, -ST.alen);
5295 goto curlym_do_B; /* try to match B */
5296
c255a977
DM
5297#undef ST
5298#define ST st->u.curly
40a82448 5299
c255a977
DM
5300#define CURLY_SETPAREN(paren, success) \
5301 if (paren) { \
5302 if (success) { \
b93070ed
DM
5303 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5304 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5305 if (paren > rex->lastparen) \
5306 rex->lastparen = paren; \
b93070ed 5307 rex->lastcloseparen = paren; \
c255a977 5308 } \
f6033a9d 5309 else { \
b93070ed 5310 rex->offs[paren].end = -1; \
f6033a9d
DM
5311 rex->lastparen = ST.lastparen; \
5312 rex->lastcloseparen = ST.lastcloseparen; \
5313 } \
c255a977
DM
5314 }
5315
5316 case STAR: /* /A*B/ where A is width 1 */
5317 ST.paren = 0;
5318 ST.min = 0;
5319 ST.max = REG_INFTY;
a0d0e21e
LW
5320 scan = NEXTOPER(scan);
5321 goto repeat;
c255a977
DM
5322 case PLUS: /* /A+B/ where A is width 1 */
5323 ST.paren = 0;
5324 ST.min = 1;
5325 ST.max = REG_INFTY;
c277df42 5326 scan = NEXTOPER(scan);
c255a977
DM
5327 goto repeat;
5328 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5329 ST.paren = scan->flags; /* Which paren to set */
f6033a9d
DM
5330 ST.lastparen = rex->lastparen;
5331 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5332 if (ST.paren > PL_regsize)
5333 PL_regsize = ST.paren;
c255a977
DM
5334 ST.min = ARG1(scan); /* min to match */
5335 ST.max = ARG2(scan); /* max to match */
0a4db386 5336 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5337 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5338 ST.min=1;
5339 ST.max=1;
5340 }
c255a977
DM
5341 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5342 goto repeat;
5343 case CURLY: /* /A{m,n}B/ where A is width 1 */
5344 ST.paren = 0;
5345 ST.min = ARG1(scan); /* min to match */
5346 ST.max = ARG2(scan); /* max to match */
5347 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5348 repeat:
a0d0e21e
LW
5349 /*
5350 * Lookahead to avoid useless match attempts
5351 * when we know what character comes next.
c255a977 5352 *
5f80c4cf
JP
5353 * Used to only do .*x and .*?x, but now it allows
5354 * for )'s, ('s and (?{ ... })'s to be in the way
5355 * of the quantifier and the EXACT-like node. -- japhy
5356 */
5357
c255a977
DM
5358 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5359 sayNO;
cca55fe3 5360 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5361 U8 *s;
5362 regnode *text_node = next;
5363
3dab1dad
YO
5364 if (! HAS_TEXT(text_node))
5365 FIND_NEXT_IMPT(text_node);
5f80c4cf 5366
9e137952 5367 if (! HAS_TEXT(text_node))
c255a977 5368 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5369 else {
ee9b8eae 5370 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5371 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5372 goto assume_ok_easy;
cca55fe3 5373 }
be8e71aa
YO
5374 else
5375 s = (U8*)STRING(text_node);
ee9b8eae
YO
5376
5377 /* Currently we only get here when
5378
5379 PL_rekind[OP(text_node)] == EXACT
5380
5381 if this changes back then the macro for IS_TEXT and
5382 friends need to change. */
f2ed9b32 5383 if (!UTF_PATTERN) {
9a5a5549
KW
5384 ST.c1 = *s;
5385 switch (OP(text_node)) {
5386 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5387 case EXACTFA:
3c760661 5388 case EXACTFU_SS:
fab2782b 5389 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5390 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5391 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5392 default: ST.c2 = ST.c1; break;
5393 }
1aa99e6b 5394 }
f2ed9b32 5395 else { /* UTF_PATTERN */
9a5a5549 5396 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
b0573d8b
KW
5397 STRLEN ulen;
5398 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5399
5400 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5401 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
e294cc5d 5402 uniflags);
5f80c4cf
JP
5403 }
5404 else {
c255a977 5405 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5406 uniflags);
5f80c4cf 5407 }
1aa99e6b
IH
5408 }
5409 }
bbce6d69 5410 }
a0d0e21e 5411 else
c255a977 5412 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5413 assume_ok_easy:
c255a977
DM
5414
5415 ST.A = scan;
5416 ST.B = next;
3280af22 5417 PL_reginput = locinput;
24d3c4a9
DM
5418 if (minmod) {
5419 minmod = 0;
e2e6a0f1 5420 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 5421 sayNO;
c255a977 5422 ST.count = ST.min;
a0ed51b3 5423 locinput = PL_reginput;
c255a977
DM
5424 REGCP_SET(ST.cp);
5425 if (ST.c1 == CHRTEST_VOID)
5426 goto curly_try_B_min;
5427
5428 ST.oldloc = locinput;
5429
5430 /* set ST.maxpos to the furthest point along the
5431 * string that could possibly match */
5432 if (ST.max == REG_INFTY) {
5433 ST.maxpos = PL_regeol - 1;
f2ed9b32 5434 if (utf8_target)
c255a977
DM
5435 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5436 ST.maxpos--;
5437 }
f2ed9b32 5438 else if (utf8_target) {
c255a977
DM
5439 int m = ST.max - ST.min;
5440 for (ST.maxpos = locinput;
5441 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5442 ST.maxpos += UTF8SKIP(ST.maxpos);
5443 }
5444 else {
5445 ST.maxpos = locinput + ST.max - ST.min;
5446 if (ST.maxpos >= PL_regeol)
5447 ST.maxpos = PL_regeol - 1;
5448 }
5449 goto curly_try_B_min_known;
5450
5451 }
5452 else {
e2e6a0f1 5453 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
5454 locinput = PL_reginput;
5455 if (ST.count < ST.min)
5456 sayNO;
5457 if ((ST.count > ST.min)
5458 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5459 {
5460 /* A{m,n} must come at the end of the string, there's
5461 * no point in backing off ... */
5462 ST.min = ST.count;
5463 /* ...except that $ and \Z can match before *and* after
5464 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5465 We may back off by one in this case. */
5466 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5467 ST.min--;
5468 }
5469 REGCP_SET(ST.cp);
5470 goto curly_try_B_max;
5471 }
118e2215 5472 assert(0); /* NOTREACHED */
c255a977
DM
5473
5474
5475 case CURLY_B_min_known_fail:
5476 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977
DM
5477
5478 PL_reginput = locinput; /* Could be reset... */
5479 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5480 if (ST.paren) {
5481 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5482 }
c255a977
DM
5483 /* Couldn't or didn't -- move forward. */
5484 ST.oldloc = locinput;
f2ed9b32 5485 if (utf8_target)
c255a977
DM
5486 locinput += UTF8SKIP(locinput);
5487 else
5488 locinput++;
5489 ST.count++;
5490 curly_try_B_min_known:
5491 /* find the next place where 'B' could work, then call B */
5492 {
5493 int n;
f2ed9b32 5494 if (utf8_target) {
c255a977
DM
5495 n = (ST.oldloc == locinput) ? 0 : 1;
5496 if (ST.c1 == ST.c2) {
5497 STRLEN len;
5498 /* set n to utf8_distance(oldloc, locinput) */
5499 while (locinput <= ST.maxpos &&
5500 utf8n_to_uvchr((U8*)locinput,
5501 UTF8_MAXBYTES, &len,
5502 uniflags) != (UV)ST.c1) {
5503 locinput += len;
5504 n++;
5505 }
1aa99e6b
IH
5506 }
5507 else {
c255a977
DM
5508 /* set n to utf8_distance(oldloc, locinput) */
5509 while (locinput <= ST.maxpos) {
5510 STRLEN len;
5511 const UV c = utf8n_to_uvchr((U8*)locinput,
5512 UTF8_MAXBYTES, &len,
5513 uniflags);
5514 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5515 break;
5516 locinput += len;
5517 n++;
1aa99e6b 5518 }
0fe9bf95
IZ
5519 }
5520 }
c255a977
DM
5521 else {
5522 if (ST.c1 == ST.c2) {
5523 while (locinput <= ST.maxpos &&
5524 UCHARAT(locinput) != ST.c1)
5525 locinput++;
bbce6d69 5526 }
c255a977
DM
5527 else {
5528 while (locinput <= ST.maxpos
5529 && UCHARAT(locinput) != ST.c1
5530 && UCHARAT(locinput) != ST.c2)
5531 locinput++;
a0ed51b3 5532 }
c255a977
DM
5533 n = locinput - ST.oldloc;
5534 }
5535 if (locinput > ST.maxpos)
5536 sayNO;
5537 /* PL_reginput == oldloc now */
5538 if (n) {
5539 ST.count += n;
e2e6a0f1 5540 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5541 sayNO;
a0d0e21e 5542 }
c255a977
DM
5543 PL_reginput = locinput;
5544 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5545 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5546 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5547 goto fake_end;
5548 }
c255a977 5549 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5550 }
118e2215 5551 assert(0); /* NOTREACHED */
c255a977
DM
5552
5553
5554 case CURLY_B_min_fail:
5555 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5556
5557 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5558 if (ST.paren) {
5559 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5560 }
c255a977
DM
5561 /* failed -- move forward one */
5562 PL_reginput = locinput;
e2e6a0f1 5563 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5564 ST.count++;
a0ed51b3 5565 locinput = PL_reginput;
c255a977
DM
5566 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5567 ST.count > 0)) /* count overflow ? */
15272685 5568 {
c255a977
DM
5569 curly_try_B_min:
5570 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5571 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5572 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5573 goto fake_end;
5574 }
c255a977 5575 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5576 }
5577 }
4633a7c4 5578 sayNO;
118e2215 5579 assert(0); /* NOTREACHED */
c255a977
DM
5580
5581
5582 curly_try_B_max:
5583 /* a successful greedy match: now try to match B */
40d049e4 5584 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5585 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5586 goto fake_end;
5587 }
c255a977
DM
5588 {
5589 UV c = 0;
5590 if (ST.c1 != CHRTEST_VOID)
f2ed9b32 5591 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
c255a977 5592 UTF8_MAXBYTES, 0, uniflags)
466787eb 5593 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5594 /* If it could work, try it. */
5595 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5596 CURLY_SETPAREN(ST.paren, ST.count);
5597 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
118e2215 5598 assert(0); /* NOTREACHED */
c255a977
DM
5599 }
5600 }
5601 /* FALL THROUGH */
5602 case CURLY_B_max_fail:
5603 /* failed to find B in a greedy match */
c255a977
DM
5604
5605 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5606 if (ST.paren) {
5607 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5608 }
c255a977
DM
5609 /* back up. */
5610 if (--ST.count < ST.min)
5611 sayNO;
5612 PL_reginput = locinput = HOPc(locinput, -1);
5613 goto curly_try_B_max;
5614
5615#undef ST
5616
a0d0e21e 5617 case END:
6bda09f9 5618 fake_end:
faec1544
DM
5619 if (cur_eval) {
5620 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
5621 st->u.eval.toggle_reg_flags
5622 = cur_eval->u.eval.toggle_reg_flags;
5623 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5624
288b8c02 5625 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 5626 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
5627 rex_sv = cur_eval->u.eval.prev_rex;
5628 SET_reg_curpm(rex_sv);
288b8c02 5629 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5630 rexi = RXi_GET(rex);
faec1544 5631 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 5632
faec1544
DM
5633 REGCP_SET(st->u.eval.lastcp);
5634 PL_reginput = locinput;
5635
5636 /* Restore parens of the outer rex without popping the
5637 * savestack */
74088413 5638 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
5639
5640 st->u.eval.prev_eval = cur_eval;
5641 cur_eval = cur_eval->u.eval.prev_eval;
5642 DEBUG_EXECUTE_r(
2a49f0f5
JH
5643 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5644 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5645 if ( nochange_depth )
5646 nochange_depth--;
5647
5648 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5649 st->u.eval.prev_eval->u.eval.B); /* match B */
5650 }
5651
3b0527fe 5652 if (locinput < reginfo->till) {
a3621e74 5653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5654 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5655 PL_colors[4],
5656 (long)(locinput - PL_reg_starttry),
3b0527fe 5657 (long)(reginfo->till - PL_reg_starttry),
7821416a 5658 PL_colors[5]));
58e23c8d 5659
262b90c4 5660 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5661 }
5662 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5663 sayYES; /* Success! */
dad79028
DM
5664
5665 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5666 DEBUG_EXECUTE_r(
5667 PerlIO_printf(Perl_debug_log,
5668 "%*s %ssubpattern success...%s\n",
5bc10b2c 5669 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5670 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5671 sayYES; /* Success! */
dad79028 5672
40a82448
DM
5673#undef ST
5674#define ST st->u.ifmatch
5675
5676 case SUSPEND: /* (?>A) */
5677 ST.wanted = 1;
9fe1d20c 5678 PL_reginput = locinput;
9041c2e3 5679 goto do_ifmatch;
dad79028 5680
40a82448
DM
5681 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5682 ST.wanted = 0;
dad79028
DM
5683 goto ifmatch_trivial_fail_test;
5684
40a82448
DM
5685 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5686 ST.wanted = 1;
dad79028 5687 ifmatch_trivial_fail_test:
a0ed51b3 5688 if (scan->flags) {
52657f30 5689 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5690 if (!s) {
5691 /* trivial fail */
24d3c4a9
DM
5692 if (logical) {
5693 logical = 0;
f2338a2e 5694 sw = 1 - cBOOL(ST.wanted);
dad79028 5695 }
40a82448 5696 else if (ST.wanted)
dad79028
DM
5697 sayNO;
5698 next = scan + ARG(scan);
5699 if (next == scan)
5700 next = NULL;
5701 break;
5702 }
efb30f32 5703 PL_reginput = s;
a0ed51b3
LW
5704 }
5705 else
5706 PL_reginput = locinput;
5707
c277df42 5708 do_ifmatch:
40a82448 5709 ST.me = scan;
24d3c4a9 5710 ST.logical = logical;
24d786f4
YO
5711 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5712
40a82448
DM
5713 /* execute body of (?...A) */
5714 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
118e2215 5715 assert(0); /* NOTREACHED */
40a82448
DM
5716
5717 case IFMATCH_A_fail: /* body of (?...A) failed */
5718 ST.wanted = !ST.wanted;
5719 /* FALL THROUGH */
5720
5721 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5722 if (ST.logical) {
f2338a2e 5723 sw = cBOOL(ST.wanted);
40a82448
DM
5724 }
5725 else if (!ST.wanted)
5726 sayNO;
5727
5728 if (OP(ST.me) == SUSPEND)
5729 locinput = PL_reginput;
5730 else {
5731 locinput = PL_reginput = st->locinput;
5732 nextchr = UCHARAT(locinput);
5733 }
5734 scan = ST.me + ARG(ST.me);
5735 if (scan == ST.me)
5736 scan = NULL;
5737 continue; /* execute B */
5738
5739#undef ST
dad79028 5740
c277df42 5741 case LONGJMP:
c277df42
IZ
5742 next = scan + ARG(scan);
5743 if (next == scan)
5744 next = NULL;
a0d0e21e 5745 break;
54612592 5746 case COMMIT:
e2e6a0f1
YO
5747 reginfo->cutpoint = PL_regeol;
5748 /* FALLTHROUGH */
5d458dd8 5749 case PRUNE:
24b23f37 5750 PL_reginput = locinput;
e2e6a0f1 5751 if (!scan->flags)
ad64d0ec 5752 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592 5753 PUSH_STATE_GOTO(COMMIT_next,next);
118e2215 5754 assert(0); /* NOTREACHED */
54612592
YO
5755 case COMMIT_next_fail:
5756 no_final = 1;
5757 /* FALLTHROUGH */
7f69552c
YO
5758 case OPFAIL:
5759 sayNO;
118e2215 5760 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5761
5762#define ST st->u.mark
5763 case MARKPOINT:
5764 ST.prev_mark = mark_state;
5d458dd8 5765 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5766 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5767 mark_state = st;
5768 ST.mark_loc = PL_reginput = locinput;
5769 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
118e2215 5770 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5771 case MARKPOINT_next:
5772 mark_state = ST.prev_mark;
5773 sayYES;
118e2215 5774 assert(0); /* NOTREACHED */
e2e6a0f1 5775 case MARKPOINT_next_fail:
5d458dd8 5776 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5777 {
5778 if (ST.mark_loc > startpoint)
5779 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5780 popmark = NULL; /* we found our mark */
5781 sv_commit = ST.mark_name;
5782
5783 DEBUG_EXECUTE_r({
5d458dd8 5784 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5785 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5786 REPORT_CODE_OFF+depth*2, "",
be2597df 5787 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5788 });
5789 }
5790 mark_state = ST.prev_mark;
5d458dd8
YO
5791 sv_yes_mark = mark_state ?
5792 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 5793 sayNO;
118e2215 5794 assert(0); /* NOTREACHED */
5d458dd8
YO
5795 case SKIP:
5796 PL_reginput = locinput;
5797 if (scan->flags) {
2bf803e2 5798 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5799 ST.mark_name = NULL;
e2e6a0f1 5800 ST.mark_loc = locinput;
5d458dd8
YO
5801 PUSH_STATE_GOTO(SKIP_next,next);
5802 } else {
2bf803e2 5803 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5804 otherwise do nothing. Meaning we need to scan
5805 */
5806 regmatch_state *cur = mark_state;
ad64d0ec 5807 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5808
5809 while (cur) {
5810 if ( sv_eq( cur->u.mark.mark_name,
5811 find ) )
5812 {
5813 ST.mark_name = find;
5814 PUSH_STATE_GOTO( SKIP_next, next );
5815 }
5816 cur = cur->u.mark.prev_mark;
5817 }
e2e6a0f1 5818 }
2bf803e2 5819 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5820 break;
5821 case SKIP_next_fail:
5822 if (ST.mark_name) {
5823 /* (*CUT:NAME) - Set up to search for the name as we
5824 collapse the stack*/
5825 popmark = ST.mark_name;
5826 } else {
5827 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5828 if (ST.mark_loc > startpoint)
5829 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5830 /* but we set sv_commit to latest mark_name if there
5831 is one so they can test to see how things lead to this
5832 cut */
5833 if (mark_state)
5834 sv_commit=mark_state->u.mark.mark_name;
5835 }
e2e6a0f1
YO
5836 no_final = 1;
5837 sayNO;
118e2215 5838 assert(0); /* NOTREACHED */
e2e6a0f1 5839#undef ST
e1d1eefb 5840 case LNBREAK:
f2ed9b32 5841 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5842 locinput += n;
5843 nextchr = UCHARAT(locinput);
5844 } else
5845 sayNO;
5846 break;
5847
5848#define CASE_CLASS(nAmE) \
5849 case nAmE: \
1380b51d
DM
5850 if (locinput >= PL_regeol) \
5851 sayNO; \
f2ed9b32 5852 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5853 locinput += n; \
5854 nextchr = UCHARAT(locinput); \
5855 } else \
5856 sayNO; \
5857 break; \
5858 case N##nAmE: \
1380b51d
DM
5859 if (locinput >= PL_regeol) \
5860 sayNO; \
f2ed9b32 5861 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5862 sayNO; \
5863 } else { \
5864 locinput += UTF8SKIP(locinput); \
5865 nextchr = UCHARAT(locinput); \
5866 } \
5867 break
5868
5869 CASE_CLASS(VERTWS);
5870 CASE_CLASS(HORIZWS);
5871#undef CASE_CLASS
5872
a0d0e21e 5873 default:
b900a521 5874 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5875 PTR2UV(scan), OP(scan));
cea2e8a9 5876 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5877
5878 } /* end switch */
95b24440 5879
5d458dd8
YO
5880 /* switch break jumps here */
5881 scan = next; /* prepare to execute the next op and ... */
5882 continue; /* ... jump back to the top, reusing st */
118e2215 5883 assert(0); /* NOTREACHED */
95b24440 5884
40a82448
DM
5885 push_yes_state:
5886 /* push a state that backtracks on success */
5887 st->u.yes.prev_yes_state = yes_state;
5888 yes_state = st;
5889 /* FALL THROUGH */
5890 push_state:
5891 /* push a new regex state, then continue at scan */
5892 {
5893 regmatch_state *newst;
5894
24b23f37
YO
5895 DEBUG_STACK_r({
5896 regmatch_state *cur = st;
5897 regmatch_state *curyes = yes_state;
5898 int curd = depth;
5899 regmatch_slab *slab = PL_regmatch_slab;
5900 for (;curd > -1;cur--,curd--) {
5901 if (cur < SLAB_FIRST(slab)) {
5902 slab = slab->prev;
5903 cur = SLAB_LAST(slab);
5904 }
5905 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5906 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 5907 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
5908 (curyes == cur) ? "yes" : ""
5909 );
5910 if (curyes == cur)
5911 curyes = cur->u.yes.prev_yes_state;
5912 }
5913 } else
5914 DEBUG_STATE_pp("push")
5915 );
40a82448 5916 depth++;
40a82448
DM
5917 st->locinput = locinput;
5918 newst = st+1;
5919 if (newst > SLAB_LAST(PL_regmatch_slab))
5920 newst = S_push_slab(aTHX);
5921 PL_regmatch_state = newst;
786e8c11 5922
40a82448
DM
5923 locinput = PL_reginput;
5924 nextchr = UCHARAT(locinput);
5925 st = newst;
5926 continue;
118e2215 5927 assert(0); /* NOTREACHED */
40a82448 5928 }
a0d0e21e 5929 }
a687059c 5930
a0d0e21e
LW
5931 /*
5932 * We get here only if there's trouble -- normally "case END" is
5933 * the terminating point.
5934 */
cea2e8a9 5935 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 5936 /*NOTREACHED*/
4633a7c4
LW
5937 sayNO;
5938
262b90c4 5939yes:
77cb431f
DM
5940 if (yes_state) {
5941 /* we have successfully completed a subexpression, but we must now
5942 * pop to the state marked by yes_state and continue from there */
77cb431f 5943 assert(st != yes_state);
5bc10b2c
DM
5944#ifdef DEBUGGING
5945 while (st != yes_state) {
5946 st--;
5947 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5948 PL_regmatch_slab = PL_regmatch_slab->prev;
5949 st = SLAB_LAST(PL_regmatch_slab);
5950 }
e2e6a0f1 5951 DEBUG_STATE_r({
54612592
YO
5952 if (no_final) {
5953 DEBUG_STATE_pp("pop (no final)");
5954 } else {
5955 DEBUG_STATE_pp("pop (yes)");
5956 }
e2e6a0f1 5957 });
5bc10b2c
DM
5958 depth--;
5959 }
5960#else
77cb431f
DM
5961 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5962 || yes_state > SLAB_LAST(PL_regmatch_slab))
5963 {
5964 /* not in this slab, pop slab */
5965 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5966 PL_regmatch_slab = PL_regmatch_slab->prev;
5967 st = SLAB_LAST(PL_regmatch_slab);
5968 }
5969 depth -= (st - yes_state);
5bc10b2c 5970#endif
77cb431f
DM
5971 st = yes_state;
5972 yes_state = st->u.yes.prev_yes_state;
5973 PL_regmatch_state = st;
24b23f37 5974
5d458dd8
YO
5975 if (no_final) {
5976 locinput= st->locinput;
5977 nextchr = UCHARAT(locinput);
5978 }
54612592 5979 state_num = st->resume_state + no_final;
24d3c4a9 5980 goto reenter_switch;
77cb431f
DM
5981 }
5982
a3621e74 5983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 5984 PL_colors[4], PL_colors[5]));
02db2b7b 5985
ed301438 5986 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
5987 /* each successfully executed (?{...}) block does the equivalent of
5988 * local $^R = do {...}
5989 * When popping the save stack, all these locals would be undone;
5990 * bypass this by setting the outermost saved $^R to the latest
5991 * value */
5992 if (oreplsv != GvSV(PL_replgv))
5993 sv_setsv(oreplsv, GvSV(PL_replgv));
5994 }
95b24440 5995 result = 1;
aa283a38 5996 goto final_exit;
4633a7c4
LW
5997
5998no:
a3621e74 5999 DEBUG_EXECUTE_r(
7821416a 6000 PerlIO_printf(Perl_debug_log,
786e8c11 6001 "%*s %sfailed...%s\n",
5bc10b2c 6002 REPORT_CODE_OFF+depth*2, "",
786e8c11 6003 PL_colors[4], PL_colors[5])
7821416a 6004 );
aa283a38 6005
262b90c4 6006no_silent:
54612592
YO
6007 if (no_final) {
6008 if (yes_state) {
6009 goto yes;
6010 } else {
6011 goto final_exit;
6012 }
6013 }
aa283a38
DM
6014 if (depth) {
6015 /* there's a previous state to backtrack to */
40a82448
DM
6016 st--;
6017 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6018 PL_regmatch_slab = PL_regmatch_slab->prev;
6019 st = SLAB_LAST(PL_regmatch_slab);
6020 }
6021 PL_regmatch_state = st;
40a82448
DM
6022 locinput= st->locinput;
6023 nextchr = UCHARAT(locinput);
6024
5bc10b2c
DM
6025 DEBUG_STATE_pp("pop");
6026 depth--;
262b90c4
DM
6027 if (yes_state == st)
6028 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6029
24d3c4a9
DM
6030 state_num = st->resume_state + 1; /* failure = success + 1 */
6031 goto reenter_switch;
95b24440 6032 }
24d3c4a9 6033 result = 0;
aa283a38 6034
262b90c4 6035 final_exit:
bbe252da 6036 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6037 SV *sv_err = get_sv("REGERROR", 1);
6038 SV *sv_mrk = get_sv("REGMARK", 1);
6039 if (result) {
e2e6a0f1 6040 sv_commit = &PL_sv_no;
5d458dd8
YO
6041 if (!sv_yes_mark)
6042 sv_yes_mark = &PL_sv_yes;
6043 } else {
6044 if (!sv_commit)
6045 sv_commit = &PL_sv_yes;
6046 sv_yes_mark = &PL_sv_no;
6047 }
6048 sv_setsv(sv_err, sv_commit);
6049 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6050 }
19b95bf0 6051
81ed78b2
DM
6052
6053 if (last_pushed_cv) {
6054 dSP;
6055 POP_MULTICALL;
4f8dbb2d 6056 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6057 }
6058
2f554ef7
DM
6059 /* clean up; in particular, free all slabs above current one */
6060 LEAVE_SCOPE(oldsave);
5d9a96ca 6061
95b24440 6062 return result;
a687059c
LW
6063}
6064
6065/*
6066 - regrepeat - repeatedly match something simple, report how many
6067 */
6068/*
6069 * [This routine now assumes that it will only match on things of length 1.
6070 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 6071 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 6072 */
76e3520e 6073STATIC I32
e2e6a0f1 6074S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 6075{
27da23d5 6076 dVAR;
eb578fdb
KW
6077 char *scan;
6078 I32 c;
6079 char *loceol = PL_regeol;
6080 I32 hardcount = 0;
6081 bool utf8_target = PL_reg_match_utf8;
d513472c 6082 UV utf8_flags;
4f55667c
SP
6083#ifndef DEBUGGING
6084 PERL_UNUSED_ARG(depth);
6085#endif
a0d0e21e 6086
7918f24d
NC
6087 PERL_ARGS_ASSERT_REGREPEAT;
6088
3280af22 6089 scan = PL_reginput;
faf11cac
HS
6090 if (max == REG_INFTY)
6091 max = I32_MAX;
6092 else if (max < loceol - scan)
7f596f4c 6093 loceol = scan + max;
a0d0e21e 6094 switch (OP(p)) {
22c35a8c 6095 case REG_ANY:
f2ed9b32 6096 if (utf8_target) {
ffc61ed2 6097 loceol = PL_regeol;
1aa99e6b 6098 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6099 scan += UTF8SKIP(scan);
6100 hardcount++;
6101 }
6102 } else {
6103 while (scan < loceol && *scan != '\n')
6104 scan++;
a0ed51b3
LW
6105 }
6106 break;
ffc61ed2 6107 case SANY:
f2ed9b32 6108 if (utf8_target) {
def8e4ea 6109 loceol = PL_regeol;
a0804c9e 6110 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6111 scan += UTF8SKIP(scan);
6112 hardcount++;
6113 }
6114 }
6115 else
6116 scan = loceol;
a0ed51b3 6117 break;
f33976b4
DB
6118 case CANY:
6119 scan = loceol;
6120 break;
59d32103
KW
6121 case EXACT:
6122 /* To get here, EXACTish nodes must have *byte* length == 1. That
6123 * means they match only characters in the string that can be expressed
6124 * as a single byte. For non-utf8 strings, that means a simple match.
6125 * For utf8 strings, the character matched must be an invariant, or
6126 * downgradable to a single byte. The pattern's utf8ness is
6127 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6128 * it is, it's an invariant */
6129
6130 c = (U8)*STRING(p);
6131 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6132
6133 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6134 while (scan < loceol && UCHARAT(scan) == c) {
6135 scan++;
6136 }
6137 }
6138 else {
6139
6140 /* Here, the string is utf8, and the pattern char is different
6141 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6142 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6143 * look for those in sequence in the utf8 string */
6144 U8 high = UTF8_TWO_BYTE_HI(c);
6145 U8 low = UTF8_TWO_BYTE_LO(c);
6146 loceol = PL_regeol;
6147
6148 while (hardcount < max
6149 && scan + 1 < loceol
6150 && UCHARAT(scan) == high
6151 && UCHARAT(scan + 1) == low)
6152 {
6153 scan += 2;
6154 hardcount++;
6155 }
6156 }
6157 break;
2f7f8cb1
KW
6158 case EXACTFA:
6159 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6160 goto do_exactf;
6161
d4e0b827
KW
6162 case EXACTFL:
6163 PL_reg_flags |= RF_tainted;
17580e7a
KW
6164 utf8_flags = FOLDEQ_UTF8_LOCALE;
6165 goto do_exactf;
6166
d4e0b827 6167 case EXACTF:
62bf7766
KW
6168 utf8_flags = 0;
6169 goto do_exactf;
6170
3c760661 6171 case EXACTFU_SS:
fab2782b 6172 case EXACTFU_TRICKYFOLD:
9a5a5549 6173 case EXACTFU:
05f861a2 6174 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6175
2be3e190
KW
6176 /* The comments for the EXACT case above apply as well to these fold
6177 * ones */
634c83a2 6178
2f7f8cb1 6179 do_exactf:
090f7165 6180 c = (U8)*STRING(p);
634c83a2 6181 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6182
3c760661 6183 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6184 char *tmpeol = loceol;
6185 while (hardcount < max
d513472c
KW
6186 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6187 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6188 {
6189 scan = tmpeol;
6190 tmpeol = loceol;
6191 hardcount++;
6192 }
634c83a2 6193
59d32103
KW
6194 /* XXX Note that the above handles properly the German sharp s in
6195 * the pattern matching ss in the string. But it doesn't handle
6196 * properly cases where the string contains say 'LIGATURE ff' and
6197 * the pattern is 'f+'. This would require, say, a new function or
6198 * revised interface to foldEQ_utf8(), in which the maximum number
6199 * of characters to match could be passed and it would return how
6200 * many actually did. This is just one of many cases where
6201 * multi-char folds don't work properly, and so the fix is being
6202 * deferred */
6203 }
6204 else {
87381386 6205 U8 folded;
59d32103 6206
2be3e190
KW
6207 /* Here, the string isn't utf8 and c is a single byte; and either
6208 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6209 * doesn't affect c. Can just do simple comparisons for exact or
6210 * fold matching. */
d4e0b827 6211 switch (OP(p)) {
87381386 6212 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6213 case EXACTFA:
fab2782b 6214 case EXACTFU_TRICKYFOLD:
9a5a5549 6215 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6216 case EXACTFL: folded = PL_fold_locale[c]; break;
6217 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6218 }
6219 while (scan < loceol &&
6220 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6221 {
6222 scan++;
634c83a2
KW
6223 }
6224 }
bbce6d69 6225 break;
f56b6394 6226 case ANYOFV:
a0d0e21e 6227 case ANYOF:
4e8910e0
KW
6228 if (utf8_target || OP(p) == ANYOFV) {
6229 STRLEN inclasslen;
ffc61ed2 6230 loceol = PL_regeol;
4e8910e0
KW
6231 inclasslen = loceol - scan;
6232 while (hardcount < max
6233 && ((inclasslen = loceol - scan) > 0)
6234 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6235 {
6236 scan += inclasslen;
ffc61ed2
JH
6237 hardcount++;
6238 }
6239 } else {
32fc9b6a 6240 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6241 scan++;
6242 }
a0d0e21e 6243 break;
980866de 6244 case ALNUMU:
f2ed9b32 6245 if (utf8_target) {
980866de 6246 utf8_wordchar:
ffc61ed2 6247 loceol = PL_regeol;
1a4fad37 6248 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6249 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6250 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6251 {
ffc61ed2
JH
6252 scan += UTF8SKIP(scan);
6253 hardcount++;
6254 }
980866de 6255 } else {
a12cf05f
KW
6256 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6257 scan++;
6258 }
980866de
KW
6259 }
6260 break;
6261 case ALNUM:
6262 if (utf8_target)
6263 goto utf8_wordchar;
6264 while (scan < loceol && isALNUM((U8) *scan)) {
6265 scan++;
a0ed51b3
LW
6266 }
6267 break;
cfaf538b
KW
6268 case ALNUMA:
6269 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6270 scan++;
6271 }
6272 break;
bbce6d69 6273 case ALNUML:
3280af22 6274 PL_reg_flags |= RF_tainted;
f2ed9b32 6275 if (utf8_target) {
ffc61ed2 6276 loceol = PL_regeol;
1aa99e6b
IH
6277 while (hardcount < max && scan < loceol &&
6278 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6279 scan += UTF8SKIP(scan);
6280 hardcount++;
6281 }
6282 } else {
6283 while (scan < loceol && isALNUM_LC(*scan))
6284 scan++;
a0ed51b3
LW
6285 }
6286 break;
980866de 6287 case NALNUMU:
f2ed9b32 6288 if (utf8_target) {
980866de
KW
6289
6290 utf8_Nwordchar:
6291
ffc61ed2 6292 loceol = PL_regeol;
1a4fad37 6293 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6294 while (hardcount < max && scan < loceol &&
980866de 6295 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6296 {
ffc61ed2
JH
6297 scan += UTF8SKIP(scan);
6298 hardcount++;
6299 }
980866de 6300 } else {
a12cf05f
KW
6301 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6302 scan++;
6303 }
980866de
KW
6304 }
6305 break;
6306 case NALNUM:
6307 if (utf8_target)
6308 goto utf8_Nwordchar;
6309 while (scan < loceol && ! isALNUM((U8) *scan)) {
6310 scan++;
a0ed51b3
LW
6311 }
6312 break;
0658cdde
KW
6313
6314 case POSIXA:
6315 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6316 scan++;
6317 }
6318 break;
6319 case NPOSIXA:
6320 if (utf8_target) {
6321 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6322 scan += UTF8SKIP(scan);
6323 }
6324 }
6325 else {
6326 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6327 scan++;
6328 }
6329 }
6330 break;
cfaf538b
KW
6331 case NALNUMA:
6332 if (utf8_target) {
6333 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6334 scan += UTF8SKIP(scan);
6335 }
6336 }
6337 else {
6338 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6339 scan++;
6340 }
6341 }
6342 break;
bbce6d69 6343 case NALNUML:
3280af22 6344 PL_reg_flags |= RF_tainted;
f2ed9b32 6345 if (utf8_target) {
ffc61ed2 6346 loceol = PL_regeol;
1aa99e6b
IH
6347 while (hardcount < max && scan < loceol &&
6348 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6349 scan += UTF8SKIP(scan);
6350 hardcount++;
6351 }
6352 } else {
6353 while (scan < loceol && !isALNUM_LC(*scan))
6354 scan++;
a0ed51b3
LW
6355 }
6356 break;
980866de 6357 case SPACEU:
f2ed9b32 6358 if (utf8_target) {
980866de
KW
6359
6360 utf8_space:
6361
ffc61ed2 6362 loceol = PL_regeol;
1a4fad37 6363 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6364 while (hardcount < max && scan < loceol &&
3568d838 6365 (*scan == ' ' ||
a12cf05f
KW
6366 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6367 {
ffc61ed2
JH
6368 scan += UTF8SKIP(scan);
6369 hardcount++;
6370 }
980866de
KW
6371 break;
6372 }
6373 else {
a12cf05f
KW
6374 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6375 scan++;
6376 }
980866de
KW
6377 break;
6378 }
6379 case SPACE:
6380 if (utf8_target)
6381 goto utf8_space;
6382
6383 while (scan < loceol && isSPACE((U8) *scan)) {
6384 scan++;
a0ed51b3
LW
6385 }
6386 break;
cfaf538b
KW
6387 case SPACEA:
6388 while (scan < loceol && isSPACE_A((U8) *scan)) {
6389 scan++;
6390 }
6391 break;
bbce6d69 6392 case SPACEL:
3280af22 6393 PL_reg_flags |= RF_tainted;
f2ed9b32 6394 if (utf8_target) {
ffc61ed2 6395 loceol = PL_regeol;
1aa99e6b 6396 while (hardcount < max && scan < loceol &&
6bbba904 6397 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6398 scan += UTF8SKIP(scan);
6399 hardcount++;
6400 }
6401 } else {
6402 while (scan < loceol && isSPACE_LC(*scan))
6403 scan++;
a0ed51b3
LW
6404 }
6405 break;
980866de 6406 case NSPACEU:
f2ed9b32 6407 if (utf8_target) {
980866de
KW
6408
6409 utf8_Nspace:
6410
ffc61ed2 6411 loceol = PL_regeol;
1a4fad37 6412 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6413 while (hardcount < max && scan < loceol &&
980866de
KW
6414 ! (*scan == ' ' ||
6415 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6416 {
ffc61ed2
JH
6417 scan += UTF8SKIP(scan);
6418 hardcount++;
6419 }
980866de
KW
6420 break;
6421 }
6422 else {
a12cf05f
KW
6423 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6424 scan++;
6425 }
980866de
KW
6426 }
6427 break;
6428 case NSPACE:
6429 if (utf8_target)
6430 goto utf8_Nspace;
6431
6432 while (scan < loceol && ! isSPACE((U8) *scan)) {
6433 scan++;
a0ed51b3 6434 }
0008a298 6435 break;
cfaf538b
KW
6436 case NSPACEA:
6437 if (utf8_target) {
6438 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6439 scan += UTF8SKIP(scan);
6440 }
6441 }
6442 else {
6443 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6444 scan++;
6445 }
6446 }
6447 break;
bbce6d69 6448 case NSPACEL:
3280af22 6449 PL_reg_flags |= RF_tainted;
f2ed9b32 6450 if (utf8_target) {
ffc61ed2 6451 loceol = PL_regeol;
1aa99e6b 6452 while (hardcount < max && scan < loceol &&
6bbba904 6453 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6454 scan += UTF8SKIP(scan);
6455 hardcount++;
6456 }
6457 } else {
6458 while (scan < loceol && !isSPACE_LC(*scan))
6459 scan++;
a0ed51b3
LW
6460 }
6461 break;
a0d0e21e 6462 case DIGIT:
f2ed9b32 6463 if (utf8_target) {
ffc61ed2 6464 loceol = PL_regeol;
1a4fad37 6465 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6466 while (hardcount < max && scan < loceol &&
f2ed9b32 6467 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6468 scan += UTF8SKIP(scan);
6469 hardcount++;
6470 }
6471 } else {
6472 while (scan < loceol && isDIGIT(*scan))
6473 scan++;
a0ed51b3
LW
6474 }
6475 break;
cfaf538b
KW
6476 case DIGITA:
6477 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6478 scan++;
6479 }
6480 break;
b77393f6
KW
6481 case DIGITL:
6482 PL_reg_flags |= RF_tainted;
6483 if (utf8_target) {
6484 loceol = PL_regeol;
6485 while (hardcount < max && scan < loceol &&
6486 isDIGIT_LC_utf8((U8*)scan)) {
6487 scan += UTF8SKIP(scan);
6488 hardcount++;
6489 }
6490 } else {
6491 while (scan < loceol && isDIGIT_LC(*scan))
6492 scan++;
6493 }
6494 break;
a0d0e21e 6495 case NDIGIT:
f2ed9b32 6496 if (utf8_target) {
ffc61ed2 6497 loceol = PL_regeol;
1a4fad37 6498 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6499 while (hardcount < max && scan < loceol &&
f2ed9b32 6500 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6501 scan += UTF8SKIP(scan);
6502 hardcount++;
6503 }
6504 } else {
6505 while (scan < loceol && !isDIGIT(*scan))
6506 scan++;
a0ed51b3 6507 }
cfaf538b
KW
6508 break;
6509 case NDIGITA:
6510 if (utf8_target) {
6511 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6512 scan += UTF8SKIP(scan);
6513 }
6514 }
6515 else {
6516 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6517 scan++;
6518 }
6519 }
6520 break;
b77393f6
KW
6521 case NDIGITL:
6522 PL_reg_flags |= RF_tainted;
6523 if (utf8_target) {
6524 loceol = PL_regeol;
6525 while (hardcount < max && scan < loceol &&
6526 !isDIGIT_LC_utf8((U8*)scan)) {
6527 scan += UTF8SKIP(scan);
6528 hardcount++;
6529 }
6530 } else {
6531 while (scan < loceol && !isDIGIT_LC(*scan))
6532 scan++;
6533 }
6534 break;
e1d1eefb 6535 case LNBREAK:
f2ed9b32 6536 if (utf8_target) {
e1d1eefb
YO
6537 loceol = PL_regeol;
6538 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6539 scan += c;
6540 hardcount++;
6541 }
6542 } else {
6543 /*
6544 LNBREAK can match two latin chars, which is ok,
6545 because we have a null terminated string, but we
6546 have to use hardcount in this situation
6547 */
6548 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6549 scan+=c;
6550 hardcount++;
6551 }
6552 }
6553 break;
6554 case HORIZWS:
f2ed9b32 6555 if (utf8_target) {
e1d1eefb
YO
6556 loceol = PL_regeol;
6557 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6558 scan += c;
6559 hardcount++;
6560 }
6561 } else {
6562 while (scan < loceol && is_HORIZWS_latin1(scan))
6563 scan++;
6564 }
a0ed51b3 6565 break;
e1d1eefb 6566 case NHORIZWS:
f2ed9b32 6567 if (utf8_target) {
e1d1eefb
YO
6568 loceol = PL_regeol;
6569 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6570 scan += UTF8SKIP(scan);
6571 hardcount++;
6572 }
6573 } else {
6574 while (scan < loceol && !is_HORIZWS_latin1(scan))
6575 scan++;
6576
6577 }
6578 break;
6579 case VERTWS:
f2ed9b32 6580 if (utf8_target) {
e1d1eefb
YO
6581 loceol = PL_regeol;
6582 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6583 scan += c;
6584 hardcount++;
6585 }
6586 } else {
6587 while (scan < loceol && is_VERTWS_latin1(scan))
6588 scan++;
6589
6590 }
6591 break;
6592 case NVERTWS:
f2ed9b32 6593 if (utf8_target) {
e1d1eefb
YO
6594 loceol = PL_regeol;
6595 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6596 scan += UTF8SKIP(scan);
6597 hardcount++;
6598 }
6599 } else {
6600 while (scan < loceol && !is_VERTWS_latin1(scan))
6601 scan++;
6602
6603 }
6604 break;
6605
a0d0e21e
LW
6606 default: /* Called on something of 0 width. */
6607 break; /* So match right here or not at all. */
6608 }
a687059c 6609
a0ed51b3
LW
6610 if (hardcount)
6611 c = hardcount;
6612 else
6613 c = scan - PL_reginput;
3280af22 6614 PL_reginput = scan;
a687059c 6615
a3621e74 6616 DEBUG_r({
e68ec53f 6617 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6618 DEBUG_EXECUTE_r({
e68ec53f
YO
6619 SV * const prop = sv_newmortal();
6620 regprop(prog, prop, p);
6621 PerlIO_printf(Perl_debug_log,
be8e71aa 6622 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6623 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6624 });
be8e71aa 6625 });
9041c2e3 6626
a0d0e21e 6627 return(c);
a687059c
LW
6628}
6629
c277df42 6630
be8e71aa 6631#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6632/*
6c6525b8
KW
6633- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6634create a copy so that changes the caller makes won't change the shared one
6635 */
ffc61ed2 6636SV *
32fc9b6a 6637Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6638{
6c6525b8
KW
6639 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6640 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6641}
6642#endif
6643
6644STATIC SV *
6645S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6646{
8c9eb58f
KW
6647 /* Returns the swash for the input 'node' in the regex 'prog'.
6648 * If <doinit> is true, will attempt to create the swash if not already
6649 * done.
6650 * If <listsvp> is non-null, will return the swash initialization string in
6651 * it.
6652 * If <altsvp> is non-null, will return the alternates to the regular swash
6653 * in it
6654 * Tied intimately to how regcomp.c sets up the data structure */
6655
97aff369 6656 dVAR;
9e55ce06
JH
6657 SV *sw = NULL;
6658 SV *si = NULL;
6659 SV *alt = NULL;
7a6c6baa
KW
6660 SV* invlist = NULL;
6661
f8fc2ecf
YO
6662 RXi_GET_DECL(prog,progi);
6663 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6664
6c6525b8 6665 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6666
ccb2541c
KW
6667 assert(ANYOF_NONBITMAP(node));
6668
4f639d21 6669 if (data && data->count) {
a3b680e6 6670 const U32 n = ARG(node);
ffc61ed2 6671
4f639d21 6672 if (data->what[n] == 's') {
ad64d0ec
NC
6673 SV * const rv = MUTABLE_SV(data->data[n]);
6674 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6675 SV **const ary = AvARRAY(av);
87367d5f 6676 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 6677
8c9eb58f 6678 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6679
7a6c6baa
KW
6680 /* Elements 3 and 4 are either both present or both absent. [3] is
6681 * any inversion list generated at compile time; [4] indicates if
6682 * that inversion list has any user-defined properties in it. */
6683 if (av_len(av) >= 3) {
6684 invlist = ary[3];
83199d38
KW
6685 if (SvUV(ary[4])) {
6686 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6687 }
7a6c6baa
KW
6688 }
6689 else {
6690 invlist = NULL;
7a6c6baa
KW
6691 }
6692
8c9eb58f
KW
6693 /* Element [1] is reserved for the set-up swash. If already there,
6694 * return it; if not, create it and store it there */
f192cf32
KW
6695 if (SvROK(ary[1])) {
6696 sw = ary[1];
6697 }
ffc61ed2 6698 else if (si && doinit) {
7a6c6baa
KW
6699
6700 sw = _core_swash_init("utf8", /* the utf8 package */
6701 "", /* nameless */
6702 si,
6703 1, /* binary */
6704 0, /* not from tr/// */
7a6c6baa 6705 invlist,
83199d38 6706 &swash_init_flags);
ffc61ed2
JH
6707 (void)av_store(av, 1, sw);
6708 }
8c9eb58f
KW
6709
6710 /* Element [2] is for any multi-char folds. Note that is a
6711 * fundamentally flawed design, because can't backtrack and try
6712 * again. See [perl #89774] */
f192cf32
KW
6713 if (SvTYPE(ary[2]) == SVt_PVAV) {
6714 alt = ary[2];
6715 }
ffc61ed2
JH
6716 }
6717 }
6718
7a6c6baa
KW
6719 if (listsvp) {
6720 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
6721
6722 /* Use the swash, if any, which has to have incorporated into it all
6723 * possibilities */
872dd7e0
KW
6724 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6725 && (si && si != &PL_sv_undef))
6726 {
7a6c6baa 6727
872dd7e0 6728 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
6729 sv_catsv(matches_string, si);
6730 }
6731
6732 /* Add the inversion list to whatever we have. This may have come from
6733 * the swash, or from an input parameter */
6734 if (invlist) {
6735 sv_catsv(matches_string, _invlist_contents(invlist));
6736 }
6737 *listsvp = matches_string;
6738 }
6739
9e55ce06
JH
6740 if (altsvp)
6741 *altsvp = alt;
ffc61ed2
JH
6742
6743 return sw;
6744}
6745
6746/*
ba7b4546 6747 - reginclass - determine if a character falls into a character class
832705d4 6748
6698fab5
KW
6749 n is the ANYOF regnode
6750 p is the target string
6751 lenp is pointer to the maximum number of bytes of how far to go in p
6752 (This is assumed wthout checking to always be at least the current
6753 character's size)
6754 utf8_target tells whether p is in UTF-8.
832705d4 6755
4b3cda86
KW
6756 Returns true if matched; false otherwise. If lenp is not NULL, on return
6757 from a successful match, the value it points to will be updated to how many
6758 bytes in p were matched. If there was no match, the value is undefined,
6759 possibly changed from the input.
eba1359e 6760
d5788240
KW
6761 Note that this can be a synthetic start class, a combination of various
6762 nodes, so things you think might be mutually exclusive, such as locale,
6763 aren't. It can match both locale and non-locale
6764
bbce6d69 6765 */
6766
76e3520e 6767STATIC bool
f6ad78d8 6768S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6769{
27da23d5 6770 dVAR;
a3b680e6 6771 const char flags = ANYOF_FLAGS(n);
bbce6d69 6772 bool match = FALSE;
cc07378b 6773 UV c = *p;
f7ab54c6 6774 STRLEN c_len = 0;
6698fab5 6775 STRLEN maxlen;
1aa99e6b 6776
7918f24d
NC
6777 PERL_ARGS_ASSERT_REGINCLASS;
6778
4b3cda86 6779 /* If c is not already the code point, get it */
f2ed9b32 6780 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6781 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6782 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6783 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6784 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6785 * UTF8_ALLOW_FFFF */
f7ab54c6 6786 if (c_len == (STRLEN)-1)
e8a70c6f 6787 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6788 }
a5a291f5
KW
6789 else {
6790 c_len = 1;
6791 }
bbce6d69 6792
a5a291f5
KW
6793 /* Use passed in max length, or one character if none passed in or less
6794 * than one character. And assume will match just one character. This is
6795 * overwritten later if matched more. */
4b3cda86 6796 if (lenp) {
a5a291f5
KW
6797 maxlen = (*lenp > c_len) ? *lenp : c_len;
6798 *lenp = c_len;
4b3cda86
KW
6799
6800 }
6801 else {
a5a291f5 6802 maxlen = c_len;
4b3cda86
KW
6803 }
6804
7cdde544
KW
6805 /* If this character is potentially in the bitmap, check it */
6806 if (c < 256) {
ffc61ed2
JH
6807 if (ANYOF_BITMAP_TEST(n, c))
6808 match = TRUE;
11454c59
KW
6809 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6810 && ! utf8_target
6811 && ! isASCII(c))
6812 {
6813 match = TRUE;
6814 }
a0ed51b3 6815
78969a98
KW
6816 else if (flags & ANYOF_LOCALE) {
6817 PL_reg_flags |= RF_tainted;
6818
6819 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6820 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6821 {
ffc61ed2 6822 match = TRUE;
78969a98 6823 }
040aea3a
KW
6824 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6825 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6826 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6827 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6828 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6829 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6830 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6831 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6832 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6833 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6834 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
6835 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6836 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
6837 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6838 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6839 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6840 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6841 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6842 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6843 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6844 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6845 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6846 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6847 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6848 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6849 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6850 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6851 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6852 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
6853 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6854 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 6855 ) /* How's that for a conditional? */
78969a98 6856 ) {
ffc61ed2
JH
6857 match = TRUE;
6858 }
a0ed51b3 6859 }
a0ed51b3
LW
6860 }
6861
7cdde544 6862 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6863 * bitmap could match, try that. Locale nodes specifiy completely the
6864 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6865 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6866 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6867 * could be a combination of locale and non-locale nodes. So allow locale
6868 * to match for the synthetic start class, which will give a false
6869 * positive that will be resolved when the match is done again as not part
6870 * of the synthetic start class */
ef87b810 6871 if (!match) {
10ee90d2
KW
6872 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6873 match = TRUE; /* Everything above 255 matches */
e051a21d 6874 }
6f8d7d0d
KW
6875 else if (ANYOF_NONBITMAP(n)
6876 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6877 || (utf8_target
6878 && (c >=256
6879 || (! (flags & ANYOF_LOCALE))
6880 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6881 {
7cdde544 6882 AV *av;
210e6c47 6883 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6884
6885 if (sw) {
6886 U8 * utf8_p;
6887 if (utf8_target) {
6888 utf8_p = (U8 *) p;
6889 } else {
f56b6394
KW
6890
6891 /* Not utf8. Convert as much of the string as available up
6892 * to the limit of how far the (single) character in the
6893 * pattern can possibly match (no need to go further). If
6894 * the node is a straight ANYOF or not folding, it can't
6895 * match more than one. Otherwise, It can match up to how
6896 * far a single char can fold to. Since not utf8, each
6897 * character is a single byte, so the max it can be in
6898 * bytes is the same as the max it can be in characters */
6899 STRLEN len = (OP(n) == ANYOF
6900 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6901 ? 1
6902 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6903 ? maxlen
6904 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
6905 utf8_p = bytes_to_utf8(p, &len);
6906 }
f56b6394 6907
f7eb7452 6908 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 6909 match = TRUE;
39065660 6910 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
6911
6912 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
6913 * matches. The non-multi char folds have all been moved to
6914 * the compilation phase, and the multi-char folds have
6915 * been stored by regcomp into 'av'; we linearly check to
6916 * see if any match the target string (folded). We know
6917 * that the originals were each one character, but we don't
6918 * currently know how many characters/bytes each folded to,
6919 * except we do know that there are small limits imposed by
6920 * Unicode. XXX A performance enhancement would be to have
6921 * regcomp.c store the max number of chars/bytes that are
6922 * in an av entry, as, say the 0th element. Even better
6923 * would be to have a hash of the few characters that can
6924 * start a multi-char fold to the max number of chars of
6925 * those folds.
f56b6394
KW
6926 *
6927 * If there is a match, we will need to advance (if lenp is
6928 * specified) the match pointer in the target string. But
6929 * what we are comparing here isn't that string directly,
6930 * but its fold, whose length may differ from the original.
6931 * As we go along in constructing the fold, therefore, we
6932 * create a map so that we know how many bytes in the
6933 * source to advance given that we have matched a certain
6934 * number of bytes in the fold. This map is stored in
6438af90
KW
6935 * 'map_fold_len_back'. Let n mean the number of bytes in
6936 * the fold of the first character that we are folding.
6937 * Then map_fold_len_back[n] is set to the number of bytes
6938 * in that first character. Similarly let m be the
6939 * corresponding number for the second character to be
6940 * folded. Then map_fold_len_back[n+m] is set to the
6941 * number of bytes occupied by the first two source
6942 * characters. ... */
6943 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
6944 U8 folded[UTF8_MAXBYTES_CASE+1];
6945 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 6946 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 6947 chars */
f56b6394
KW
6948
6949 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6950
6951 /* Here, only need to fold the first char of the target
6438af90 6952 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 6953 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 6954 total_foldlen = foldlen;
6438af90
KW
6955 map_fold_len_back[foldlen] = (utf8_target)
6956 ? UTF8SKIP(utf8_p)
6957 : 1;
f56b6394
KW
6958 }
6959 else {
6960
6961 /* Here, need to fold more than the first char. Do so
6962 * up to the limits */
f56b6394
KW
6963 U8* source_ptr = utf8_p; /* The source for the fold
6964 is the regex target
6965 string */
6966 U8* folded_ptr = folded;
6967 U8* e = utf8_p + maxlen; /* Can't go beyond last
6968 available byte in the
6969 target string */
6438af90
KW
6970 U8 i;
6971 for (i = 0;
6972 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6973 i++)
f56b6394
KW
6974 {
6975
6976 /* Fold the next character */
6977 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6978 STRLEN this_char_foldlen;
6979 to_utf8_fold(source_ptr,
6980 this_char_folded,
6981 &this_char_foldlen);
6982
6983 /* Bail if it would exceed the byte limit for
6984 * folding a single char. */
6985 if (this_char_foldlen + folded_ptr - folded >
6986 UTF8_MAXBYTES_CASE)
6987 {
6988 break;
6989 }
6990
6438af90 6991 /* Add the fold of this character */
f56b6394
KW
6992 Copy(this_char_folded,
6993 folded_ptr,
6994 this_char_foldlen,
6995 U8);
f56b6394 6996 source_ptr += UTF8SKIP(source_ptr);
6438af90 6997 folded_ptr += this_char_foldlen;
deba3d96 6998 total_foldlen = folded_ptr - folded;
6438af90
KW
6999
7000 /* Create map from the number of bytes in the fold
7001 * back to the number of bytes in the source. If
7002 * the source isn't utf8, the byte count is just
7003 * the number of characters so far */
deba3d96 7004 map_fold_len_back[total_foldlen]
6438af90
KW
7005 = (utf8_target)
7006 ? source_ptr - utf8_p
7007 : i + 1;
f56b6394
KW
7008 }
7009 *folded_ptr = '\0';
f56b6394
KW
7010 }
7011
7012
7013 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
7014 * of multi-char folds. */
7015 if (av) {
7cdde544
KW
7016 I32 i;
7017 for (i = 0; i <= av_len(av); i++) {
7018 SV* const sv = *av_fetch(av, i, FALSE);
7019 STRLEN len;
7020 const char * const s = SvPV_const(sv, len);
6438af90 7021
f9126265
KW
7022 if (len <= total_foldlen
7023 && memEQ(s, (char*)folded, len)
7024
7025 /* If 0, means matched a partial char. See
7026 * [perl #90536] */
7027 && map_fold_len_back[len])
f56b6394
KW
7028 {
7029
7030 /* Advance the target string ptr to account for
7031 * this fold, but have to translate from the
7032 * folded length to the corresponding source
6438af90 7033 * length. */
8f2655f7
KW
7034 if (lenp) {
7035 *lenp = map_fold_len_back[len];
8f2655f7 7036 }
7cdde544
KW
7037 match = TRUE;
7038 break;
7039 }
7040 }
7041 }
7cdde544
KW
7042 }
7043
7044 /* If we allocated a string above, free it */
7045 if (! utf8_target) Safefree(utf8_p);
7046 }
7047 }
5073ffbd
KW
7048
7049 if (UNICODE_IS_SUPER(c)
7050 && (flags & ANYOF_WARN_SUPER)
7051 && ckWARN_d(WARN_NON_UNICODE))
7052 {
7053 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7054 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7055 }
7cdde544
KW
7056 }
7057
f0fdc1c9
KW
7058 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7059 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7060}
161b471a 7061
dfe13c55 7062STATIC U8 *
0ce71af7 7063S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7064{
6af86488
KW
7065 /* return the position 'off' UTF-8 characters away from 's', forward if
7066 * 'off' >= 0, backwards if negative. But don't go outside of position
7067 * 'lim', which better be < s if off < 0 */
7068
97aff369 7069 dVAR;
7918f24d
NC
7070
7071 PERL_ARGS_ASSERT_REGHOP3;
7072
a0ed51b3 7073 if (off >= 0) {
1aa99e6b 7074 while (off-- && s < lim) {
ffc61ed2 7075 /* XXX could check well-formedness here */
a0ed51b3 7076 s += UTF8SKIP(s);
ffc61ed2 7077 }
a0ed51b3
LW
7078 }
7079 else {
1de06328
YO
7080 while (off++ && s > lim) {
7081 s--;
7082 if (UTF8_IS_CONTINUED(*s)) {
7083 while (s > lim && UTF8_IS_CONTINUATION(*s))
7084 s--;
a0ed51b3 7085 }
1de06328 7086 /* XXX could check well-formedness here */
a0ed51b3
LW
7087 }
7088 }
7089 return s;
7090}
161b471a 7091
f9f4320a
YO
7092#ifdef XXX_dmq
7093/* there are a bunch of places where we use two reghop3's that should
7094 be replaced with this routine. but since thats not done yet
7095 we ifdef it out - dmq
7096*/
dfe13c55 7097STATIC U8 *
1de06328
YO
7098S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7099{
7100 dVAR;
7918f24d
NC
7101
7102 PERL_ARGS_ASSERT_REGHOP4;
7103
1de06328
YO
7104 if (off >= 0) {
7105 while (off-- && s < rlim) {
7106 /* XXX could check well-formedness here */
7107 s += UTF8SKIP(s);
7108 }
7109 }
7110 else {
7111 while (off++ && s > llim) {
7112 s--;
7113 if (UTF8_IS_CONTINUED(*s)) {
7114 while (s > llim && UTF8_IS_CONTINUATION(*s))
7115 s--;
7116 }
7117 /* XXX could check well-formedness here */
7118 }
7119 }
7120 return s;
7121}
f9f4320a 7122#endif
1de06328
YO
7123
7124STATIC U8 *
0ce71af7 7125S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7126{
97aff369 7127 dVAR;
7918f24d
NC
7128
7129 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7130
a0ed51b3 7131 if (off >= 0) {
1aa99e6b 7132 while (off-- && s < lim) {
ffc61ed2 7133 /* XXX could check well-formedness here */
a0ed51b3 7134 s += UTF8SKIP(s);
ffc61ed2 7135 }
a0ed51b3 7136 if (off >= 0)
3dab1dad 7137 return NULL;
a0ed51b3
LW
7138 }
7139 else {
1de06328
YO
7140 while (off++ && s > lim) {
7141 s--;
7142 if (UTF8_IS_CONTINUED(*s)) {
7143 while (s > lim && UTF8_IS_CONTINUATION(*s))
7144 s--;
a0ed51b3 7145 }
1de06328 7146 /* XXX could check well-formedness here */
a0ed51b3
LW
7147 }
7148 if (off <= 0)
3dab1dad 7149 return NULL;
a0ed51b3
LW
7150 }
7151 return s;
7152}
51371543 7153
51371543 7154static void
acfe0abc 7155restore_pos(pTHX_ void *arg)
51371543 7156{
97aff369 7157 dVAR;
097eb12c 7158 regexp * const rex = (regexp *)arg;
ed301438 7159 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7160 if (PL_reg_oldsaved) {
4f639d21
DM
7161 rex->subbeg = PL_reg_oldsaved;
7162 rex->sublen = PL_reg_oldsavedlen;
f8c7b90f 7163#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7164 rex->saved_copy = PL_nrs;
ed252734 7165#endif
07bc277f 7166 RXp_MATCH_COPIED_on(rex);
51371543
GS
7167 }
7168 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7169 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7170 PL_curpm = PL_reg_oldcurpm;
7171 }
7172}
33b8afdf
JH
7173
7174STATIC void
7175S_to_utf8_substr(pTHX_ register regexp *prog)
7176{
a1cac82e 7177 int i = 1;
7918f24d
NC
7178
7179 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7180
a1cac82e
NC
7181 do {
7182 if (prog->substrs->data[i].substr
7183 && !prog->substrs->data[i].utf8_substr) {
7184 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7185 prog->substrs->data[i].utf8_substr = sv;
7186 sv_utf8_upgrade(sv);
610460f9 7187 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7188 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7189 /* Trim the trailing \n that fbm_compile added last
7190 time. */
7191 SvCUR_set(sv, SvCUR(sv) - 1);
7192 /* Whilst this makes the SV technically "invalid" (as its
7193 buffer is no longer followed by "\0") when fbm_compile()
7194 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7195 fbm_compile(sv, FBMcf_TAIL);
7196 } else
7197 fbm_compile(sv, 0);
610460f9 7198 }
a1cac82e
NC
7199 if (prog->substrs->data[i].substr == prog->check_substr)
7200 prog->check_utf8 = sv;
7201 }
7202 } while (i--);
33b8afdf
JH
7203}
7204
7205STATIC void
7206S_to_byte_substr(pTHX_ register regexp *prog)
7207{
97aff369 7208 dVAR;
a1cac82e 7209 int i = 1;
7918f24d
NC
7210
7211 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7212
a1cac82e
NC
7213 do {
7214 if (prog->substrs->data[i].utf8_substr
7215 && !prog->substrs->data[i].substr) {
7216 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7217 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7218 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7219 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7220 /* Trim the trailing \n that fbm_compile added last
7221 time. */
7222 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7223 fbm_compile(sv, FBMcf_TAIL);
7224 } else
7225 fbm_compile(sv, 0);
7226 }
a1cac82e
NC
7227 } else {
7228 SvREFCNT_dec(sv);
7229 sv = &PL_sv_undef;
7230 }
7231 prog->substrs->data[i].substr = sv;
7232 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7233 prog->check_substr = sv;
33b8afdf 7234 }
a1cac82e 7235 } while (i--);
33b8afdf 7236}
66610fdd
RGS
7237
7238/*
7239 * Local variables:
7240 * c-indentation-style: bsd
7241 * c-basic-offset: 4
14d04a33 7242 * indent-tabs-mode: nil
66610fdd
RGS
7243 * End:
7244 *
14d04a33 7245 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7246 */