This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/regcharclass.pl: Generate macros for \X processing
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
27d4fc33 147 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979
KW
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \
149 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \
151 LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \
152 LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \
153 LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \
154 LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \
155 LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8)
20d0b1e9 156
1dcf4a1b 157#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 158
ee9a90b8
KW
159/* The actual code for CCC_TRY, which uses several variables from the routine
160 * it's callable from. It is designed to be the bulk of a case statement.
161 * FUNC is the macro or function to call on non-utf8 targets that indicate if
162 * nextchr matches the class.
163 * UTF8_TEST is the whole test string to use for utf8 targets
164 * LOAD is what to use to test, and if not present to load in the swash for the
165 * class
166 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
167 * UTF8_TEST test.
168 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
169 * utf8 and a variant, load the swash if necessary and test using the utf8
170 * test. Advance to the next character if test is ok, otherwise fail; If not
171 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
172 * fails, or advance to the next character */
173
174#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
175 if (locinput >= PL_regeol) { \
176 sayNO; \
177 } \
178 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
179 LOAD_UTF8_CHARCLASS(CLASS, STR); \
180 if (POS_OR_NEG (UTF8_TEST)) { \
181 sayNO; \
182 } \
183 locinput += PL_utf8skip[nextchr]; \
184 nextchr = UCHARAT(locinput); \
185 break; \
186 } \
187 if (POS_OR_NEG (FUNC(nextchr))) { \
188 sayNO; \
189 } \
190 nextchr = UCHARAT(++locinput); \
980866de
KW
191 break;
192
ee9a90b8
KW
193/* Handle the non-locale cases for a character class and its complement. It
194 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
195 * This is because that code fails when the test succeeds, so we want to have
196 * the test fail so that the code succeeds. The swash is stored in a
197 * predictable PL_ place */
cfaf538b
KW
198#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
199 CLASS, STR) \
ee9a90b8
KW
200 case NAME: \
201 _CCC_TRY_CODE( !, FUNC, \
202 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
203 (U8*)locinput, TRUE)), \
204 CLASS, STR) \
205 case NNAME: \
1dcf4a1b 206 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
207 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
208 (U8*)locinput, TRUE)), \
209 CLASS, STR) \
210
211/* Generate the case statements for both locale and non-locale character
212 * classes in regmatch for classes that don't have special unicode semantics.
213 * Locales don't use an immediate swash, but an intermediary special locale
214 * function that is called on the pointer to the current place in the input
215 * string. That function will resolve to needing the same swash. One might
216 * think that because we don't know what the locale will match, we shouldn't
217 * check with the swash loading function that it loaded properly; ie, that we
218 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
219 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
220 * irrelevant here */
221#define CCC_TRY(NAME, NNAME, FUNC, \
222 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 223 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
224 CLASS, STR) \
225 case NAMEL: \
226 PL_reg_flags |= RF_tainted; \
227 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
228 case NNAMEL: \
229 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
230 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
231 CLASS, STR) \
cfaf538b
KW
232 case NAMEA: \
233 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
237 nextchr = UCHARAT(++locinput); \
238 break; \
239 case NNAMEA: \
240 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
241 sayNO; \
242 } \
243 if (utf8_target) { \
244 locinput += PL_utf8skip[nextchr]; \
245 nextchr = UCHARAT(locinput); \
246 } \
247 else { \
248 nextchr = UCHARAT(++locinput); \
249 } \
250 break; \
ee9a90b8
KW
251 /* Generate the non-locale cases */ \
252 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
253
254/* This is like CCC_TRY, but has an extra set of parameters for generating case
255 * statements to handle separate Unicode semantics nodes */
256#define CCC_TRY_U(NAME, NNAME, FUNC, \
257 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
258 NAMEU, NNAMEU, FUNCU, \
cfaf538b 259 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 260 CLASS, STR) \
cfaf538b
KW
261 CCC_TRY(NAME, NNAME, FUNC, \
262 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
263 NAMEA, NNAMEA, FUNCA, \
264 CLASS, STR) \
ee9a90b8 265 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 266
3dab1dad
YO
267/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
268
5f80c4cf 269/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
270/* it would be nice to rework regcomp.sym to generate this stuff. sigh
271 *
272 * NOTE that *nothing* that affects backtracking should be in here, specifically
273 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
274 * node that is in between two EXACT like nodes when ascertaining what the required
275 * "follow" character is. This should probably be moved to regex compile time
276 * although it may be done at run time beause of the REF possibility - more
277 * investigation required. -- demerphq
278*/
3e901dc0
YO
279#define JUMPABLE(rn) ( \
280 OP(rn) == OPEN || \
281 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
282 OP(rn) == EVAL || \
cca55fe3
JP
283 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
284 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 285 OP(rn) == KEEPS || \
3dab1dad 286 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 287)
ee9b8eae 288#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 289
ee9b8eae
YO
290#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
291
292#if 0
293/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
294 we don't need this definition. */
295#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 296#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
298
299#else
300/* ... so we use this as its faster. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 302#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
303#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
304#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
305
306#endif
e2d8ce26 307
a84d97b6
HS
308/*
309 Search for mandatory following text node; for lookahead, the text must
310 follow but for lookbehind (rn->flags != 0) we skip to the next step.
311*/
cca55fe3 312#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
313 while (JUMPABLE(rn)) { \
314 const OPCODE type = OP(rn); \
315 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 316 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 317 else if (type == PLUS) \
cca55fe3 318 rn = NEXTOPER(rn); \
3dab1dad 319 else if (type == IFMATCH) \
a84d97b6 320 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 321 else rn += NEXT_OFF(rn); \
3dab1dad 322 } \
5f80c4cf 323} STMT_END
74750237 324
c476f425 325
acfe0abc 326static void restore_pos(pTHX_ void *arg);
51371543 327
87c0511b 328#define REGCP_PAREN_ELEMS 3
f067efbf 329#define REGCP_OTHER_ELEMS 3
e0fa7e2b 330#define REGCP_FRAME_ELEMS 1
620d5b66
NC
331/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
332 * are needed for the regexp context stack bookkeeping. */
333
76e3520e 334STATIC CHECKPOINT
b93070ed 335S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 336{
97aff369 337 dVAR;
a3b680e6 338 const int retval = PL_savestack_ix;
a3b680e6 339 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
340 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
341 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 342 I32 p;
40a82448 343 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 344
b93070ed
DM
345 PERL_ARGS_ASSERT_REGCPPUSH;
346
e49a9654 347 if (paren_elems_to_push < 0)
5637ef5b
NC
348 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
349 paren_elems_to_push);
e49a9654 350
e0fa7e2b
NC
351 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
352 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
353 " out of range (%lu-%ld)",
354 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 355
620d5b66 356 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 357
495f47a5
DM
358 DEBUG_BUFFERS_r(
359 if ((int)PL_regsize > (int)parenfloor)
360 PerlIO_printf(Perl_debug_log,
361 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
362 PTR2UV(rex),
363 PTR2UV(rex->offs)
364 );
365 );
87c0511b 366 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 367/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
368 SSPUSHINT(rex->offs[p].end);
369 SSPUSHINT(rex->offs[p].start);
1ca2007e 370 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 371 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
372 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
373 (UV)p,
374 (IV)rex->offs[p].start,
375 (IV)rex->offs[p].start_tmp,
376 (IV)rex->offs[p].end
40a82448 377 ));
a0d0e21e 378 }
b1ce53c5 379/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 380 SSPUSHINT(PL_regsize);
b93070ed
DM
381 SSPUSHINT(rex->lastparen);
382 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 383 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 384
a0d0e21e
LW
385 return retval;
386}
387
c277df42 388/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
389#define REGCP_SET(cp) \
390 DEBUG_STATE_r( \
ab3bbdeb 391 PerlIO_printf(Perl_debug_log, \
e4f74956 392 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
393 (IV)PL_savestack_ix)); \
394 cp = PL_savestack_ix
c3464db5 395
ab3bbdeb 396#define REGCP_UNWIND(cp) \
e4f74956 397 DEBUG_STATE_r( \
ab3bbdeb 398 if (cp != PL_savestack_ix) \
e4f74956
YO
399 PerlIO_printf(Perl_debug_log, \
400 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
401 (IV)(cp), (IV)PL_savestack_ix)); \
402 regcpblow(cp)
c277df42 403
a8d1f4b4
DM
404#define UNWIND_PAREN(lp, lcp) \
405 for (n = rex->lastparen; n > lp; n--) \
406 rex->offs[n].end = -1; \
407 rex->lastparen = n; \
408 rex->lastcloseparen = lcp;
409
410
f067efbf 411STATIC void
b93070ed 412S_regcppop(pTHX_ regexp *rex)
a0d0e21e 413{
97aff369 414 dVAR;
e0fa7e2b 415 UV i;
87c0511b 416 U32 paren;
a3621e74
YO
417 GET_RE_DEBUG_FLAGS_DECL;
418
7918f24d
NC
419 PERL_ARGS_ASSERT_REGCPPOP;
420
b1ce53c5 421 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 422 i = SSPOPUV;
e0fa7e2b
NC
423 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
424 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
425 rex->lastcloseparen = SSPOPINT;
426 rex->lastparen = SSPOPINT;
3280af22 427 PL_regsize = SSPOPINT;
b1ce53c5 428
620d5b66 429 i -= REGCP_OTHER_ELEMS;
b1ce53c5 430 /* Now restore the parentheses context. */
495f47a5
DM
431 DEBUG_BUFFERS_r(
432 if (i || rex->lastparen + 1 <= rex->nparens)
433 PerlIO_printf(Perl_debug_log,
434 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
435 PTR2UV(rex),
436 PTR2UV(rex->offs)
437 );
438 );
87c0511b 439 paren = PL_regsize;
620d5b66 440 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 441 I32 tmps;
1ca2007e 442 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 443 rex->offs[paren].start = SSPOPINT;
cf93c79d 444 tmps = SSPOPINT;
b93070ed
DM
445 if (paren <= rex->lastparen)
446 rex->offs[paren].end = tmps;
495f47a5
DM
447 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
448 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
449 (UV)paren,
450 (IV)rex->offs[paren].start,
451 (IV)rex->offs[paren].start_tmp,
452 (IV)rex->offs[paren].end,
453 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 454 );
87c0511b 455 paren--;
a0d0e21e 456 }
daf18116 457#if 1
dafc8851
JH
458 /* It would seem that the similar code in regtry()
459 * already takes care of this, and in fact it is in
460 * a better location to since this code can #if 0-ed out
461 * but the code in regtry() is needed or otherwise tests
462 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
463 * (as of patchlevel 7877) will fail. Then again,
464 * this code seems to be necessary or otherwise
225593e1
DM
465 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
466 * --jhi updated by dapm */
b93070ed 467 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 468 if (i > PL_regsize)
b93070ed
DM
469 rex->offs[i].start = -1;
470 rex->offs[i].end = -1;
495f47a5
DM
471 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
472 " \\%"UVuf": %s ..-1 undeffing\n",
473 (UV)i,
474 (i > PL_regsize) ? "-1" : " "
475 ));
a0d0e21e 476 }
dafc8851 477#endif
a0d0e21e
LW
478}
479
74088413
DM
480/* restore the parens and associated vars at savestack position ix,
481 * but without popping the stack */
482
483STATIC void
484S_regcp_restore(pTHX_ regexp *rex, I32 ix)
485{
486 I32 tmpix = PL_savestack_ix;
487 PL_savestack_ix = ix;
488 regcppop(rex);
489 PL_savestack_ix = tmpix;
490}
491
02db2b7b 492#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 493
a687059c 494/*
e50aee73 495 * pregexec and friends
a687059c
LW
496 */
497
76234dfb 498#ifndef PERL_IN_XSUB_RE
a687059c 499/*
c277df42 500 - pregexec - match a regexp against a string
a687059c 501 */
c277df42 502I32
49d7dfbc 503Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 504 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
505/* stringarg: the point in the string at which to begin matching */
506/* strend: pointer to null at end of string */
507/* strbeg: real beginning of string */
508/* minend: end of match must be >= minend bytes after stringarg. */
509/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
510 * itself is accessed via the pointers above */
511/* nosave: For optimizations. */
c277df42 512{
7918f24d
NC
513 PERL_ARGS_ASSERT_PREGEXEC;
514
c277df42 515 return
9041c2e3 516 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
517 nosave ? 0 : REXEC_COPY_STR);
518}
76234dfb 519#endif
22e551b9 520
9041c2e3 521/*
cad2e5aa
JH
522 * Need to implement the following flags for reg_anch:
523 *
524 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
525 * USE_INTUIT_ML
526 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
527 * INTUIT_AUTORITATIVE_ML
528 * INTUIT_ONCE_NOML - Intuit can match in one location only.
529 * INTUIT_ONCE_ML
530 *
531 * Another flag for this function: SECOND_TIME (so that float substrs
532 * with giant delta may be not rechecked).
533 */
534
535/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
536
3f7c398e 537/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
538 Otherwise, only SvCUR(sv) is used to get strbeg. */
539
540/* XXXX We assume that strpos is strbeg unless sv. */
541
6eb5f6b9
JH
542/* XXXX Some places assume that there is a fixed substring.
543 An update may be needed if optimizer marks as "INTUITable"
544 RExen without fixed substrings. Similarly, it is assumed that
545 lengths of all the strings are no more than minlen, thus they
546 cannot come from lookahead.
40d049e4
YO
547 (Or minlen should take into account lookahead.)
548 NOTE: Some of this comment is not correct. minlen does now take account
549 of lookahead/behind. Further research is required. -- demerphq
550
551*/
6eb5f6b9 552
2c2d71f5
JH
553/* A failure to find a constant substring means that there is no need to make
554 an expensive call to REx engine, thus we celebrate a failure. Similarly,
555 finding a substring too deep into the string means that less calls to
30944b6d
IZ
556 regtry() should be needed.
557
558 REx compiler's optimizer found 4 possible hints:
559 a) Anchored substring;
560 b) Fixed substring;
561 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 562 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 563 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
564 string which does not contradict any of them.
565 */
2c2d71f5 566
6eb5f6b9
JH
567/* Most of decisions we do here should have been done at compile time.
568 The nodes of the REx which we used for the search should have been
569 deleted from the finite automaton. */
570
cad2e5aa 571char *
288b8c02 572Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 573 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 574{
97aff369 575 dVAR;
288b8c02 576 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 577 I32 start_shift = 0;
cad2e5aa 578 /* Should be nonnegative! */
eb578fdb
KW
579 I32 end_shift = 0;
580 char *s;
581 SV *check;
a1933d95 582 char *strbeg;
cad2e5aa 583 char *t;
f2ed9b32 584 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 585 I32 ml_anch;
eb578fdb 586 char *other_last = NULL; /* other substr checked before this */
bd61b366 587 char *check_at = NULL; /* check substr found at this pos */
d8080198 588 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 589 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 590 RXi_GET_DECL(prog,progi);
30944b6d 591#ifdef DEBUGGING
890ce7af 592 const char * const i_strpos = strpos;
30944b6d 593#endif
a3621e74
YO
594 GET_RE_DEBUG_FLAGS_DECL;
595
7918f24d 596 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
597 PERL_UNUSED_ARG(flags);
598 PERL_UNUSED_ARG(data);
7918f24d 599
f2ed9b32 600 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 601
3c8556c3 602 if (RX_UTF8(rx)) {
b8d68ded
JH
603 PL_reg_flags |= RF_utf8;
604 }
ab3bbdeb 605 DEBUG_EXECUTE_r(
f2ed9b32 606 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
607 sv ? "Guessing start of match in sv for"
608 : "Guessing start of match in string for");
2a782b5b 609 );
cad2e5aa 610
c344f387
JH
611 /* CHR_DIST() would be more correct here but it makes things slow. */
612 if (prog->minlen > strend - strpos) {
a3621e74 613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 614 "String too short... [re_intuit_start]\n"));
cad2e5aa 615 goto fail;
2c2d71f5 616 }
1de06328 617
a1933d95 618 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 619 PL_regeol = strend;
f2ed9b32 620 if (utf8_target) {
33b8afdf
JH
621 if (!prog->check_utf8 && prog->check_substr)
622 to_utf8_substr(prog);
623 check = prog->check_utf8;
624 } else {
625 if (!prog->check_substr && prog->check_utf8)
626 to_byte_substr(prog);
627 check = prog->check_substr;
628 }
1de06328 629 if (check == &PL_sv_undef) {
a3621e74 630 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 631 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
632 goto fail;
633 }
bbe252da
YO
634 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
635 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
636 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 637 && !multiline ) ); /* Check after \n? */
cad2e5aa 638
7e25d62c 639 if (!ml_anch) {
bbe252da
YO
640 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
641 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 642 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
643 && sv && !SvROK(sv)
644 && (strpos != strbeg)) {
a3621e74 645 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
646 goto fail;
647 }
648 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 649 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 650 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
651 I32 slen;
652
1aa99e6b 653 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 654
653099ff
GS
655 if (SvTAIL(check)) {
656 slen = SvCUR(check); /* >= 1 */
cad2e5aa 657
9041c2e3 658 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 659 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 661 goto fail_finish;
cad2e5aa
JH
662 }
663 /* Now should match s[0..slen-2] */
664 slen--;
3f7c398e 665 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 666 || (slen > 1
3f7c398e 667 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 668 report_neq:
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
670 goto fail_finish;
671 }
cad2e5aa 672 }
3f7c398e 673 else if (*SvPVX_const(check) != *s
653099ff 674 || ((slen = SvCUR(check)) > 1
3f7c398e 675 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 676 goto report_neq;
c315bfe8 677 check_at = s;
2c2d71f5 678 goto success_at_start;
7e25d62c 679 }
cad2e5aa 680 }
2c2d71f5 681 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 682 s = strpos;
2c2d71f5 683 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
684 end_shift = prog->check_end_shift;
685
2c2d71f5 686 if (!ml_anch) {
a3b680e6 687 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 688 - (SvTAIL(check) != 0);
a3b680e6 689 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
690
691 if (end_shift < eshift)
692 end_shift = eshift;
693 }
cad2e5aa 694 }
2c2d71f5 695 else { /* Can match at random position */
cad2e5aa
JH
696 ml_anch = 0;
697 s = strpos;
1de06328
YO
698 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
699 end_shift = prog->check_end_shift;
700
701 /* end shift should be non negative here */
cad2e5aa
JH
702 }
703
bcdf7404 704#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 705 if (end_shift < 0)
1de06328 706 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 707 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
708#endif
709
2c2d71f5
JH
710 restart:
711 /* Find a possible match in the region s..strend by looking for
712 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
713
714 {
715 I32 srch_start_shift = start_shift;
716 I32 srch_end_shift = end_shift;
c33e64f0
FC
717 U8* start_point;
718 U8* end_point;
1de06328
YO
719 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
720 srch_end_shift -= ((strbeg - s) - srch_start_shift);
721 srch_start_shift = strbeg - s;
722 }
6bda09f9 723 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
724 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
725 (IV)prog->check_offset_min,
726 (IV)srch_start_shift,
727 (IV)srch_end_shift,
728 (IV)prog->check_end_shift);
729 });
730
bbe252da 731 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
732 start_point= (U8*)(s + srch_start_shift);
733 end_point= (U8*)(strend - srch_end_shift);
734 } else {
735 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
736 end_point= HOP3(strend, -srch_end_shift, strbeg);
737 }
6bda09f9 738 DEBUG_OPTIMISE_MORE_r({
56570a2c 739 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 740 (int)(end_point - start_point),
fc8cd66c 741 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
742 start_point);
743 });
744
745 s = fbm_instr( start_point, end_point,
7fba1cd6 746 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 747 }
cad2e5aa
JH
748 /* Update the count-of-usability, remove useless subpatterns,
749 unshift s. */
2c2d71f5 750
ab3bbdeb 751 DEBUG_EXECUTE_r({
f2ed9b32 752 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
753 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
754 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 755 (s ? "Found" : "Did not find"),
f2ed9b32 756 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
757 ? "anchored" : "floating"),
758 quoted,
759 RE_SV_TAIL(check),
760 (s ? " at offset " : "...\n") );
761 });
2c2d71f5
JH
762
763 if (!s)
764 goto fail_finish;
2c2d71f5 765 /* Finish the diagnostic message */
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 767
1de06328
YO
768 /* XXX dmq: first branch is for positive lookbehind...
769 Our check string is offset from the beginning of the pattern.
770 So we need to do any stclass tests offset forward from that
771 point. I think. :-(
772 */
773
774
775
776 check_at=s;
777
778
2c2d71f5
JH
779 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
780 Start with the other substr.
781 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 782 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
783 *always* match. Probably should be marked during compile...
784 Probably it is right to do no SCREAM here...
785 */
786
f2ed9b32 787 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
788 : (prog->float_substr && prog->anchored_substr))
789 {
30944b6d 790 /* Take into account the "other" substring. */
2c2d71f5
JH
791 /* XXXX May be hopelessly wrong for UTF... */
792 if (!other_last)
6eb5f6b9 793 other_last = strpos;
f2ed9b32 794 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
795 do_other_anchored:
796 {
890ce7af
AL
797 char * const last = HOP3c(s, -start_shift, strbeg);
798 char *last1, *last2;
be8e71aa 799 char * const saved_s = s;
33b8afdf 800 SV* must;
2c2d71f5 801
2c2d71f5
JH
802 t = s - prog->check_offset_max;
803 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 804 && (!utf8_target
0ce71af7 805 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 806 && t > strpos)))
6f207bd3 807 NOOP;
2c2d71f5
JH
808 else
809 t = strpos;
1aa99e6b 810 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
811 if (t < other_last) /* These positions already checked */
812 t = other_last;
1aa99e6b 813 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
814 if (last < last1)
815 last1 = last;
1de06328
YO
816 /* XXXX It is not documented what units *_offsets are in.
817 We assume bytes, but this is clearly wrong.
818 Meaning this code needs to be carefully reviewed for errors.
819 dmq.
820 */
821
2c2d71f5 822 /* On end-of-str: see comment below. */
f2ed9b32 823 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
824 if (must == &PL_sv_undef) {
825 s = (char*)NULL;
1de06328 826 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
827 }
828 else
829 s = fbm_instr(
830 (unsigned char*)t,
831 HOP3(HOP3(last1, prog->anchored_offset, strend)
832 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
833 must,
7fba1cd6 834 multiline ? FBMrf_MULTILINE : 0
33b8afdf 835 );
ab3bbdeb 836 DEBUG_EXECUTE_r({
f2ed9b32 837 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
838 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
839 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 840 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
841 quoted, RE_SV_TAIL(must));
842 });
843
844
2c2d71f5
JH
845 if (!s) {
846 if (last1 >= last2) {
a3621e74 847 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
848 ", giving up...\n"));
849 goto fail_finish;
850 }
a3621e74 851 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 852 ", trying floating at offset %ld...\n",
be8e71aa 853 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
854 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
855 s = HOP3c(last, 1, strend);
2c2d71f5
JH
856 goto restart;
857 }
858 else {
a3621e74 859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 860 (long)(s - i_strpos)));
1aa99e6b
IH
861 t = HOP3c(s, -prog->anchored_offset, strbeg);
862 other_last = HOP3c(s, 1, strend);
be8e71aa 863 s = saved_s;
2c2d71f5
JH
864 if (t == strpos)
865 goto try_at_start;
2c2d71f5
JH
866 goto try_at_offset;
867 }
30944b6d 868 }
2c2d71f5
JH
869 }
870 else { /* Take into account the floating substring. */
33b8afdf 871 char *last, *last1;
be8e71aa 872 char * const saved_s = s;
33b8afdf
JH
873 SV* must;
874
875 t = HOP3c(s, -start_shift, strbeg);
876 last1 = last =
877 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
878 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
879 last = HOP3c(t, prog->float_max_offset, strend);
880 s = HOP3c(t, prog->float_min_offset, strend);
881 if (s < other_last)
882 s = other_last;
2c2d71f5 883 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 884 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
885 /* fbm_instr() takes into account exact value of end-of-str
886 if the check is SvTAIL(ed). Since false positives are OK,
887 and end-of-str is not later than strend we are OK. */
888 if (must == &PL_sv_undef) {
889 s = (char*)NULL;
1de06328 890 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
891 }
892 else
2c2d71f5 893 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
894 (unsigned char*)last + SvCUR(must)
895 - (SvTAIL(must)!=0),
7fba1cd6 896 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 897 DEBUG_EXECUTE_r({
f2ed9b32 898 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
899 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
900 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 901 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
902 quoted, RE_SV_TAIL(must));
903 });
33b8afdf
JH
904 if (!s) {
905 if (last1 == last) {
a3621e74 906 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
907 ", giving up...\n"));
908 goto fail_finish;
2c2d71f5 909 }
a3621e74 910 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 911 ", trying anchored starting at offset %ld...\n",
be8e71aa 912 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
913 other_last = last;
914 s = HOP3c(t, 1, strend);
915 goto restart;
916 }
917 else {
a3621e74 918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
919 (long)(s - i_strpos)));
920 other_last = s; /* Fix this later. --Hugo */
be8e71aa 921 s = saved_s;
33b8afdf
JH
922 if (t == strpos)
923 goto try_at_start;
924 goto try_at_offset;
925 }
2c2d71f5 926 }
cad2e5aa 927 }
2c2d71f5 928
1de06328 929
9ef43ace 930 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 931
6bda09f9 932 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
933 PerlIO_printf(Perl_debug_log,
934 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
935 (IV)prog->check_offset_min,
936 (IV)prog->check_offset_max,
937 (IV)(s-strpos),
938 (IV)(t-strpos),
939 (IV)(t-s),
940 (IV)(strend-strpos)
941 )
942 );
943
2c2d71f5 944 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 945 && (!utf8_target
9ef43ace 946 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
947 && t > strpos)))
948 {
2c2d71f5
JH
949 /* Fixed substring is found far enough so that the match
950 cannot start at strpos. */
951 try_at_offset:
cad2e5aa 952 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
953 /* Eventually fbm_*() should handle this, but often
954 anchored_offset is not 0, so this check will not be wasted. */
955 /* XXXX In the code below we prefer to look for "^" even in
956 presence of anchored substrings. And we search even
957 beyond the found float position. These pessimizations
958 are historical artefacts only. */
959 find_anchor:
2c2d71f5 960 while (t < strend - prog->minlen) {
cad2e5aa 961 if (*t == '\n') {
4ee3650e 962 if (t < check_at - prog->check_offset_min) {
f2ed9b32 963 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
964 /* Since we moved from the found position,
965 we definitely contradict the found anchored
30944b6d
IZ
966 substr. Due to the above check we do not
967 contradict "check" substr.
968 Thus we can arrive here only if check substr
969 is float. Redo checking for "other"=="fixed".
970 */
9041c2e3 971 strpos = t + 1;
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 973 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
974 goto do_other_anchored;
975 }
4ee3650e
GS
976 /* We don't contradict the found floating substring. */
977 /* XXXX Why not check for STCLASS? */
cad2e5aa 978 s = t + 1;
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 980 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
981 goto set_useful;
982 }
4ee3650e
GS
983 /* Position contradicts check-string */
984 /* XXXX probably better to look for check-string
985 than for "\n", so one should lower the limit for t? */
a3621e74 986 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 987 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 988 other_last = strpos = s = t + 1;
cad2e5aa
JH
989 goto restart;
990 }
991 t++;
992 }
a3621e74 993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 994 PL_colors[0], PL_colors[1]));
2c2d71f5 995 goto fail_finish;
cad2e5aa 996 }
f5952150 997 else {
a3621e74 998 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 999 PL_colors[0], PL_colors[1]));
f5952150 1000 }
cad2e5aa
JH
1001 s = t;
1002 set_useful:
f2ed9b32 1003 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1004 }
1005 else {
f5952150 1006 /* The found string does not prohibit matching at strpos,
2c2d71f5 1007 - no optimization of calling REx engine can be performed,
f5952150
GS
1008 unless it was an MBOL and we are not after MBOL,
1009 or a future STCLASS check will fail this. */
2c2d71f5
JH
1010 try_at_start:
1011 /* Even in this situation we may use MBOL flag if strpos is offset
1012 wrt the start of the string. */
05b4157f 1013 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1014 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1015 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1016 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1017 {
cad2e5aa
JH
1018 t = strpos;
1019 goto find_anchor;
1020 }
a3621e74 1021 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1022 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1023 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1024 );
2c2d71f5 1025 success_at_start:
bbe252da 1026 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1027 && (utf8_target ? (
33b8afdf
JH
1028 prog->check_utf8 /* Could be deleted already */
1029 && --BmUSEFUL(prog->check_utf8) < 0
1030 && (prog->check_utf8 == prog->float_utf8)
1031 ) : (
1032 prog->check_substr /* Could be deleted already */
1033 && --BmUSEFUL(prog->check_substr) < 0
1034 && (prog->check_substr == prog->float_substr)
1035 )))
66e933ab 1036 {
cad2e5aa 1037 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1038 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1039 /* XXX Does the destruction order has to change with utf8_target? */
1040 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1041 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1042 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1043 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1044 check = NULL; /* abort */
cad2e5aa 1045 s = strpos;
486ec47a 1046 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1047 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1048 if (prog->intflags & PREGf_IMPLICIT)
1049 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1050 /* XXXX This is a remnant of the old implementation. It
1051 looks wasteful, since now INTUIT can use many
6eb5f6b9 1052 other heuristics. */
bbe252da 1053 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1054 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1055 }
1056 else
1057 s = strpos;
1058 }
1059
6eb5f6b9
JH
1060 /* Last resort... */
1061 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1062 /* trie stclasses are too expensive to use here, we are better off to
1063 leave it to regmatch itself */
f8fc2ecf 1064 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1065 /* minlen == 0 is possible if regstclass is \b or \B,
1066 and the fixed substr is ''$.
1067 Since minlen is already taken into account, s+1 is before strend;
1068 accidentally, minlen >= 1 guaranties no false positives at s + 1
1069 even for \b or \B. But (minlen? 1 : 0) below assumes that
1070 regstclass does not come from lookahead... */
1071 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1072 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1073 const U8* const str = (U8*)STRING(progi->regstclass);
1074 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1075 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1076 : 1);
1de06328
YO
1077 char * endpos;
1078 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1079 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1080 else if (prog->float_substr || prog->float_utf8)
1081 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1082 else
1083 endpos= strend;
1084
d8080198
YO
1085 if (checked_upto < s)
1086 checked_upto = s;
1087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1088 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1089
6eb5f6b9 1090 t = s;
d8080198
YO
1091 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1092 if (s) {
1093 checked_upto = s;
1094 } else {
6eb5f6b9 1095#ifdef DEBUGGING
cbbf8932 1096 const char *what = NULL;
6eb5f6b9
JH
1097#endif
1098 if (endpos == strend) {
a3621e74 1099 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1100 "Could not match STCLASS...\n") );
1101 goto fail;
1102 }
a3621e74 1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1104 "This position contradicts STCLASS...\n") );
bbe252da 1105 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1106 goto fail;
d8080198
YO
1107 checked_upto = HOPBACKc(endpos, start_shift);
1108 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1109 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1110 /* Contradict one of substrings */
33b8afdf 1111 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1112 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1113 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1114 hop_and_restart:
1aa99e6b 1115 s = HOP3c(t, 1, strend);
66e933ab
GS
1116 if (s + start_shift + end_shift > strend) {
1117 /* XXXX Should be taken into account earlier? */
a3621e74 1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1119 "Could not match STCLASS...\n") );
1120 goto fail;
1121 }
5e39e1e5
HS
1122 if (!check)
1123 goto giveup;
a3621e74 1124 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1125 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1126 what, (long)(s + start_shift - i_strpos)) );
1127 goto restart;
1128 }
66e933ab 1129 /* Have both, check_string is floating */
6eb5f6b9
JH
1130 if (t + start_shift >= check_at) /* Contradicts floating=check */
1131 goto retry_floating_check;
1132 /* Recheck anchored substring, but not floating... */
9041c2e3 1133 s = check_at;
5e39e1e5
HS
1134 if (!check)
1135 goto giveup;
a3621e74 1136 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1137 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1138 (long)(other_last - i_strpos)) );
1139 goto do_other_anchored;
1140 }
60e71179
GS
1141 /* Another way we could have checked stclass at the
1142 current position only: */
1143 if (ml_anch) {
1144 s = t = t + 1;
5e39e1e5
HS
1145 if (!check)
1146 goto giveup;
a3621e74 1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1148 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1149 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1150 goto try_at_offset;
66e933ab 1151 }
f2ed9b32 1152 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1153 goto fail;
486ec47a 1154 /* Check is floating substring. */
6eb5f6b9
JH
1155 retry_floating_check:
1156 t = check_at - start_shift;
a3621e74 1157 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1158 goto hop_and_restart;
1159 }
b7953727 1160 if (t != s) {
a3621e74 1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1162 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1163 (long)(t - i_strpos), (long)(s - i_strpos))
1164 );
1165 }
1166 else {
a3621e74 1167 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1168 "Does not contradict STCLASS...\n");
1169 );
1170 }
6eb5f6b9 1171 }
5e39e1e5 1172 giveup:
a3621e74 1173 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1174 PL_colors[4], (check ? "Guessed" : "Giving up"),
1175 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1176 return s;
2c2d71f5
JH
1177
1178 fail_finish: /* Substring not found */
33b8afdf 1179 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1180 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1181 fail:
a3621e74 1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1183 PL_colors[4], PL_colors[5]));
bd61b366 1184 return NULL;
cad2e5aa 1185}
9661b544 1186
a0a388a1
YO
1187#define DECL_TRIE_TYPE(scan) \
1188 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1189 trie_type = ((scan->flags == EXACT) \
1190 ? (utf8_target ? trie_utf8 : trie_plain) \
1191 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1192
1193#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1194uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1195 STRLEN skiplen; \
1196 switch (trie_type) { \
1197 case trie_utf8_fold: \
1198 if ( foldlen>0 ) { \
1199 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1200 foldlen -= len; \
1201 uscan += len; \
1202 len=0; \
1203 } else { \
1204 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1205 len = UTF8SKIP(uc); \
1206 skiplen = UNISKIP( uvc ); \
1207 foldlen -= skiplen; \
1208 uscan = foldbuf + skiplen; \
1209 } \
1210 break; \
1211 case trie_latin_utf8_fold: \
1212 if ( foldlen>0 ) { \
1213 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1214 foldlen -= len; \
1215 uscan += len; \
1216 len=0; \
1217 } else { \
1218 len = 1; \
1219 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1220 skiplen = UNISKIP( uvc ); \
1221 foldlen -= skiplen; \
1222 uscan = foldbuf + skiplen; \
1223 } \
1224 break; \
1225 case trie_utf8: \
1226 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1227 break; \
1228 case trie_plain: \
1229 uvc = (UV)*uc; \
1230 len = 1; \
1231 } \
1232 if (uvc < 256) { \
1233 charid = trie->charmap[ uvc ]; \
1234 } \
1235 else { \
1236 charid = 0; \
1237 if (widecharmap) { \
1238 SV** const svpp = hv_fetch(widecharmap, \
1239 (char*)&uvc, sizeof(UV), 0); \
1240 if (svpp) \
1241 charid = (U16)SvIV(*svpp); \
1242 } \
1243 } \
4cadc6a9
YO
1244} STMT_END
1245
4cadc6a9
YO
1246#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1247STMT_START { \
1248 while (s <= e) { \
1249 if ( (CoNd) \
fac1af77 1250 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1251 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1252 goto got_it; \
1253 s++; \
1254 } \
1255} STMT_END
1256
1257#define REXEC_FBC_UTF8_SCAN(CoDe) \
1258STMT_START { \
1259 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1260 CoDe \
1261 s += uskip; \
1262 } \
1263} STMT_END
1264
1265#define REXEC_FBC_SCAN(CoDe) \
1266STMT_START { \
1267 while (s < strend) { \
1268 CoDe \
1269 s++; \
1270 } \
1271} STMT_END
1272
1273#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1274REXEC_FBC_UTF8_SCAN( \
1275 if (CoNd) { \
24b23f37 1276 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1277 goto got_it; \
1278 else \
1279 tmp = doevery; \
1280 } \
1281 else \
1282 tmp = 1; \
1283)
1284
1285#define REXEC_FBC_CLASS_SCAN(CoNd) \
1286REXEC_FBC_SCAN( \
1287 if (CoNd) { \
24b23f37 1288 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1289 goto got_it; \
1290 else \
1291 tmp = doevery; \
1292 } \
1293 else \
1294 tmp = 1; \
1295)
1296
1297#define REXEC_FBC_TRYIT \
24b23f37 1298if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1299 goto got_it
1300
e1d1eefb 1301#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1302 if (utf8_target) { \
e1d1eefb
YO
1303 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1304 } \
1305 else { \
1306 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1307 }
e1d1eefb 1308
4cadc6a9 1309#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1310 if (utf8_target) { \
4cadc6a9
YO
1311 UtFpReLoAd; \
1312 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1313 } \
1314 else { \
1315 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1316 }
4cadc6a9
YO
1317
1318#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1319 PL_reg_flags |= RF_tainted; \
f2ed9b32 1320 if (utf8_target) { \
4cadc6a9
YO
1321 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1322 } \
1323 else { \
1324 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1325 }
4cadc6a9 1326
786e8c11
YO
1327#define DUMP_EXEC_POS(li,s,doutf8) \
1328 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1329
cfaf538b
KW
1330
1331#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1332 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1333 tmp = TEST_NON_UTF8(tmp); \
1334 REXEC_FBC_UTF8_SCAN( \
1335 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1336 tmp = !tmp; \
1337 IF_SUCCESS; \
1338 } \
1339 else { \
1340 IF_FAIL; \
1341 } \
1342 ); \
1343
1344#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1345 if (s == PL_bostr) { \
1346 tmp = '\n'; \
1347 } \
1348 else { \
1349 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1350 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1351 } \
1352 tmp = TeSt1_UtF8; \
1353 LOAD_UTF8_CHARCLASS_ALNUM(); \
1354 REXEC_FBC_UTF8_SCAN( \
1355 if (tmp == ! (TeSt2_UtF8)) { \
1356 tmp = !tmp; \
1357 IF_SUCCESS; \
1358 } \
1359 else { \
1360 IF_FAIL; \
1361 } \
1362 ); \
1363
63ac0dad
KW
1364/* The only difference between the BOUND and NBOUND cases is that
1365 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1366 * NBOUND. This is accomplished by passing it in either the if or else clause,
1367 * with the other one being empty */
1368#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1369 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1370
1371#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1372 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1373
1374#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1375 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1376
1377#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1378 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1379
63ac0dad
KW
1380
1381/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1382 * be passed in completely with the variable name being tested, which isn't
1383 * such a clean interface, but this is easier to read than it was before. We
1384 * are looking for the boundary (or non-boundary between a word and non-word
1385 * character. The utf8 and non-utf8 cases have the same logic, but the details
1386 * must be different. Find the "wordness" of the character just prior to this
1387 * one, and compare it with the wordness of this one. If they differ, we have
1388 * a boundary. At the beginning of the string, pretend that the previous
1389 * character was a new-line */
cfaf538b 1390#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1391 if (utf8_target) { \
cfaf538b 1392 UTF8_CODE \
63ac0dad
KW
1393 } \
1394 else { /* Not utf8 */ \
1395 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1396 tmp = TEST_NON_UTF8(tmp); \
1397 REXEC_FBC_SCAN( \
1398 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1399 tmp = !tmp; \
1400 IF_SUCCESS; \
1401 } \
1402 else { \
1403 IF_FAIL; \
1404 } \
1405 ); \
1406 } \
1407 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1408 goto got_it;
1409
786e8c11
YO
1410/* We know what class REx starts with. Try to find this position... */
1411/* if reginfo is NULL, its a dryrun */
1412/* annoyingly all the vars in this routine have different names from their counterparts
1413 in regmatch. /grrr */
1414
3c3eec57 1415STATIC char *
07be1b83 1416S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1417 const char *strend, regmatch_info *reginfo)
a687059c 1418{
27da23d5 1419 dVAR;
bbe252da 1420 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1421 char *pat_string; /* The pattern's exactish string */
1422 char *pat_end; /* ptr to end char of pat_string */
1423 re_fold_t folder; /* Function for computing non-utf8 folds */
1424 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1425 STRLEN ln;
5dab1207 1426 STRLEN lnc;
eb578fdb 1427 STRLEN uskip;
fac1af77
KW
1428 U8 c1;
1429 U8 c2;
6eb5f6b9 1430 char *e;
eb578fdb
KW
1431 I32 tmp = 1; /* Scratch variable? */
1432 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1433 UV utf8_fold_flags = 0;
f8fc2ecf 1434 RXi_GET_DECL(prog,progi);
7918f24d
NC
1435
1436 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1437
6eb5f6b9
JH
1438 /* We know what class it must start with. */
1439 switch (OP(c)) {
f56b6394 1440 case ANYOFV:
6eb5f6b9 1441 case ANYOF:
f56b6394 1442 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1443 STRLEN inclasslen = strend - s;
1444 REXEC_FBC_UTF8_CLASS_SCAN(
1445 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1446 }
1447 else {
6ef69d56 1448 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1449 }
6eb5f6b9 1450 break;
f33976b4 1451 case CANY:
4cadc6a9 1452 REXEC_FBC_SCAN(
24b23f37 1453 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1454 goto got_it;
1455 else
1456 tmp = doevery;
4cadc6a9 1457 );
f33976b4 1458 break;
2f7f8cb1
KW
1459
1460 case EXACTFA:
1461 if (UTF_PATTERN || utf8_target) {
1462 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1463 goto do_exactf_utf8;
1464 }
1465 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1466 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1467 goto do_exactf_non_utf8; /* isn't dealt with by these */
1468
6eb5f6b9 1469 case EXACTF:
62bf7766 1470 if (utf8_target) {
77a6d856
KW
1471
1472 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1473 utf8_fold_flags = 0;
fac1af77
KW
1474 goto do_exactf_utf8;
1475 }
1476 fold_array = PL_fold;
1477 folder = foldEQ;
1478 goto do_exactf_non_utf8;
1479
1480 case EXACTFL:
1481 if (UTF_PATTERN || utf8_target) {
17580e7a 1482 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1483 goto do_exactf_utf8;
1484 }
1485 fold_array = PL_fold_locale;
1486 folder = foldEQ_locale;
16d951b7
KW
1487 goto do_exactf_non_utf8;
1488
3c760661
KW
1489 case EXACTFU_SS:
1490 if (UTF_PATTERN) {
1491 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1492 }
1493 goto do_exactf_utf8;
1494
fab2782b 1495 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1496 case EXACTFU:
1497 if (UTF_PATTERN || utf8_target) {
77a6d856 1498 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1499 goto do_exactf_utf8;
1500 }
1501
1502 /* Any 'ss' in the pattern should have been replaced by regcomp,
1503 * so we don't have to worry here about this single special case
1504 * in the Latin1 range */
1505 fold_array = PL_fold_latin1;
1506 folder = foldEQ_latin1;
fac1af77
KW
1507
1508 /* FALL THROUGH */
1509
62bf7766
KW
1510 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1511 are no glitches with fold-length differences
1512 between the target string and pattern */
fac1af77
KW
1513
1514 /* The idea in the non-utf8 EXACTF* cases is to first find the
1515 * first character of the EXACTF* node and then, if necessary,
1516 * case-insensitively compare the full text of the node. c1 is the
1517 * first character. c2 is its fold. This logic will not work for
1518 * Unicode semantics and the german sharp ss, which hence should
1519 * not be compiled into a node that gets here. */
1520 pat_string = STRING(c);
1521 ln = STR_LEN(c); /* length to match in octets/bytes */
1522
8a90a8fe
KW
1523 /* We know that we have to match at least 'ln' bytes (which is the
1524 * same as characters, since not utf8). If we have to match 3
1525 * characters, and there are only 2 availabe, we know without
1526 * trying that it will fail; so don't start a match past the
1527 * required minimum number from the far end */
fac1af77
KW
1528 e = HOP3c(strend, -((I32)ln), s);
1529
1530 if (!reginfo && e < s) {
1531 e = s; /* Due to minlen logic of intuit() */
1532 }
1533
1534 c1 = *pat_string;
1535 c2 = fold_array[c1];
1536 if (c1 == c2) { /* If char and fold are the same */
1537 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1538 }
1539 else {
1540 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1541 }
1542 break;
1543
1544 do_exactf_utf8:
e067297c
KW
1545 {
1546 unsigned expansion;
1547
fac1af77
KW
1548
1549 /* If one of the operands is in utf8, we can't use the simpler
1550 * folding above, due to the fact that many different characters
1551 * can have the same fold, or portion of a fold, or different-
1552 * length fold */
1553 pat_string = STRING(c);
1554 ln = STR_LEN(c); /* length to match in octets/bytes */
1555 pat_end = pat_string + ln;
1556 lnc = (UTF_PATTERN) /* length to match in characters */
1557 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1558 : ln;
1559
e067297c
KW
1560 /* We have 'lnc' characters to match in the pattern, but because of
1561 * multi-character folding, each character in the target can match
1562 * up to 3 characters (Unicode guarantees it will never exceed
1563 * this) if it is utf8-encoded; and up to 2 if not (based on the
1564 * fact that the Latin 1 folds are already determined, and the
1565 * only multi-char fold in that range is the sharp-s folding to
1566 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1567 * string character. Adjust lnc accordingly, rounding up, so that
1568 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1569 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1570 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1571
1572 /* As in the non-UTF8 case, if we have to match 3 characters, and
1573 * only 2 are left, it's guaranteed to fail, so don't start a
1574 * match that would require us to go beyond the end of the string
1575 */
1576 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1577
1578 if (!reginfo && e < s) {
1579 e = s; /* Due to minlen logic of intuit() */
1580 }
1581
b33105db
KW
1582 /* XXX Note that we could recalculate e to stop the loop earlier,
1583 * as the worst case expansion above will rarely be met, and as we
1584 * go along we would usually find that e moves further to the left.
1585 * This would happen only after we reached the point in the loop
1586 * where if there were no expansion we should fail. Unclear if
1587 * worth the expense */
e067297c 1588
fac1af77
KW
1589 while (s <= e) {
1590 char *my_strend= (char *)strend;
1591 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1592 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1593 && (!reginfo || regtry(reginfo, &s)) )
1594 {
1595 goto got_it;
1596 }
bbdd8bad 1597 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1598 }
1599 break;
e067297c 1600 }
bbce6d69 1601 case BOUNDL:
3280af22 1602 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1603 FBC_BOUND(isALNUM_LC,
1604 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1605 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1606 break;
bbce6d69 1607 case NBOUNDL:
3280af22 1608 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1609 FBC_NBOUND(isALNUM_LC,
1610 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1611 isALNUM_LC_utf8((U8*)s));
1612 break;
1613 case BOUND:
1614 FBC_BOUND(isWORDCHAR,
1615 isALNUM_uni(tmp),
1616 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1617 break;
cfaf538b
KW
1618 case BOUNDA:
1619 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1620 isWORDCHAR_A(tmp),
1621 isWORDCHAR_A((U8*)s));
1622 break;
a0d0e21e 1623 case NBOUND:
63ac0dad
KW
1624 FBC_NBOUND(isWORDCHAR,
1625 isALNUM_uni(tmp),
1626 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1627 break;
cfaf538b
KW
1628 case NBOUNDA:
1629 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1630 isWORDCHAR_A(tmp),
1631 isWORDCHAR_A((U8*)s));
1632 break;
63ac0dad
KW
1633 case BOUNDU:
1634 FBC_BOUND(isWORDCHAR_L1,
1635 isALNUM_uni(tmp),
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637 break;
1638 case NBOUNDU:
1639 FBC_NBOUND(isWORDCHAR_L1,
1640 isALNUM_uni(tmp),
1641 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1642 break;
bbce6d69 1643 case ALNUML:
4cadc6a9
YO
1644 REXEC_FBC_CSCAN_TAINT(
1645 isALNUM_LC_utf8((U8*)s),
1646 isALNUM_LC(*s)
1647 );
6895a8aa 1648 break;
980866de
KW
1649 case ALNUMU:
1650 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1651 LOAD_UTF8_CHARCLASS_ALNUM(),
1652 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1653 isWORDCHAR_L1((U8) *s)
1654 );
6895a8aa 1655 break;
980866de
KW
1656 case ALNUM:
1657 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1658 LOAD_UTF8_CHARCLASS_ALNUM(),
1659 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1660 isWORDCHAR((U8) *s)
1661 );
6895a8aa 1662 break;
cfaf538b 1663 case ALNUMA:
8e9da4d4
KW
1664 /* Don't need to worry about utf8, as it can match only a single
1665 * byte invariant character */
cfaf538b 1666 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1667 break;
980866de
KW
1668 case NALNUMU:
1669 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1670 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1671 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1672 ! isWORDCHAR_L1((U8) *s)
1673 );
6895a8aa 1674 break;
a0d0e21e 1675 case NALNUM:
4cadc6a9 1676 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1677 LOAD_UTF8_CHARCLASS_ALNUM(),
1678 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1679 ! isALNUM(*s)
4cadc6a9 1680 );
6895a8aa 1681 break;
cfaf538b 1682 case NALNUMA:
8e9da4d4
KW
1683 REXEC_FBC_CSCAN(
1684 !isWORDCHAR_A(*s),
1685 !isWORDCHAR_A(*s)
1686 );
1687 break;
bbce6d69 1688 case NALNUML:
4cadc6a9
YO
1689 REXEC_FBC_CSCAN_TAINT(
1690 !isALNUM_LC_utf8((U8*)s),
1691 !isALNUM_LC(*s)
1692 );
6895a8aa 1693 break;
980866de
KW
1694 case SPACEU:
1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1698 isSPACE_L1((U8) *s)
1699 );
6895a8aa 1700 break;
a0d0e21e 1701 case SPACE:
4cadc6a9 1702 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1703 LOAD_UTF8_CHARCLASS_SPACE(),
1704 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1705 isSPACE((U8) *s)
4cadc6a9 1706 );
6895a8aa 1707 break;
cfaf538b 1708 case SPACEA:
8e9da4d4
KW
1709 /* Don't need to worry about utf8, as it can match only a single
1710 * byte invariant character */
cfaf538b 1711 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1712 break;
bbce6d69 1713 case SPACEL:
4cadc6a9 1714 REXEC_FBC_CSCAN_TAINT(
6bbba904 1715 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1716 isSPACE_LC(*s)
1717 );
6895a8aa 1718 break;
980866de
KW
1719 case NSPACEU:
1720 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1721 LOAD_UTF8_CHARCLASS_SPACE(),
1722 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1723 ! isSPACE_L1((U8) *s)
1724 );
6895a8aa 1725 break;
a0d0e21e 1726 case NSPACE:
4cadc6a9 1727 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1728 LOAD_UTF8_CHARCLASS_SPACE(),
1729 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1730 ! isSPACE((U8) *s)
4cadc6a9 1731 );
6895a8aa 1732 break;
cfaf538b 1733 case NSPACEA:
8e9da4d4
KW
1734 REXEC_FBC_CSCAN(
1735 !isSPACE_A(*s),
1736 !isSPACE_A(*s)
1737 );
1738 break;
bbce6d69 1739 case NSPACEL:
4cadc6a9 1740 REXEC_FBC_CSCAN_TAINT(
6bbba904 1741 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1742 !isSPACE_LC(*s)
1743 );
6895a8aa 1744 break;
a0d0e21e 1745 case DIGIT:
4cadc6a9 1746 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1747 LOAD_UTF8_CHARCLASS_DIGIT(),
1748 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1749 isDIGIT(*s)
1750 );
6895a8aa 1751 break;
cfaf538b 1752 case DIGITA:
8e9da4d4
KW
1753 /* Don't need to worry about utf8, as it can match only a single
1754 * byte invariant character */
cfaf538b 1755 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1756 break;
b8c5462f 1757 case DIGITL:
4cadc6a9
YO
1758 REXEC_FBC_CSCAN_TAINT(
1759 isDIGIT_LC_utf8((U8*)s),
1760 isDIGIT_LC(*s)
1761 );
6895a8aa 1762 break;
a0d0e21e 1763 case NDIGIT:
4cadc6a9 1764 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1765 LOAD_UTF8_CHARCLASS_DIGIT(),
1766 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1767 !isDIGIT(*s)
1768 );
6895a8aa 1769 break;
cfaf538b 1770 case NDIGITA:
8e9da4d4
KW
1771 REXEC_FBC_CSCAN(
1772 !isDIGIT_A(*s),
1773 !isDIGIT_A(*s)
1774 );
1775 break;
b8c5462f 1776 case NDIGITL:
4cadc6a9
YO
1777 REXEC_FBC_CSCAN_TAINT(
1778 !isDIGIT_LC_utf8((U8*)s),
1779 !isDIGIT_LC(*s)
1780 );
6895a8aa 1781 break;
e1d1eefb
YO
1782 case LNBREAK:
1783 REXEC_FBC_CSCAN(
1784 is_LNBREAK_utf8(s),
1785 is_LNBREAK_latin1(s)
1786 );
6895a8aa 1787 break;
e1d1eefb
YO
1788 case VERTWS:
1789 REXEC_FBC_CSCAN(
1790 is_VERTWS_utf8(s),
1791 is_VERTWS_latin1(s)
1792 );
6895a8aa 1793 break;
e1d1eefb
YO
1794 case NVERTWS:
1795 REXEC_FBC_CSCAN(
1796 !is_VERTWS_utf8(s),
1797 !is_VERTWS_latin1(s)
1798 );
6895a8aa 1799 break;
e1d1eefb
YO
1800 case HORIZWS:
1801 REXEC_FBC_CSCAN(
1802 is_HORIZWS_utf8(s),
1803 is_HORIZWS_latin1(s)
1804 );
6895a8aa 1805 break;
e1d1eefb
YO
1806 case NHORIZWS:
1807 REXEC_FBC_CSCAN(
1808 !is_HORIZWS_utf8(s),
1809 !is_HORIZWS_latin1(s)
1810 );
6895a8aa 1811 break;
0658cdde
KW
1812 case POSIXA:
1813 /* Don't need to worry about utf8, as it can match only a single
1814 * byte invariant character. The flag in this node type is the
1815 * class number to pass to _generic_isCC() to build a mask for
1816 * searching in PL_charclass[] */
1817 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1818 break;
1819 case NPOSIXA:
1820 REXEC_FBC_CSCAN(
1821 !_generic_isCC_A(*s, FLAGS(c)),
1822 !_generic_isCC_A(*s, FLAGS(c))
1823 );
1824 break;
1825
1de06328
YO
1826 case AHOCORASICKC:
1827 case AHOCORASICK:
07be1b83 1828 {
a0a388a1 1829 DECL_TRIE_TYPE(c);
07be1b83
YO
1830 /* what trie are we using right now */
1831 reg_ac_data *aho
f8fc2ecf 1832 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1833 reg_trie_data *trie
1834 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1835 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1836
1837 const char *last_start = strend - trie->minlen;
6148ee25 1838#ifdef DEBUGGING
07be1b83 1839 const char *real_start = s;
6148ee25 1840#endif
07be1b83 1841 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1842 SV *sv_points;
1843 U8 **points; /* map of where we were in the input string
786e8c11 1844 when reading a given char. For ASCII this
be8e71aa 1845 is unnecessary overhead as the relationship
38a44b82
NC
1846 is always 1:1, but for Unicode, especially
1847 case folded Unicode this is not true. */
f9e705e8 1848 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1849 U8 *bitmap=NULL;
1850
07be1b83
YO
1851
1852 GET_RE_DEBUG_FLAGS_DECL;
1853
be8e71aa
YO
1854 /* We can't just allocate points here. We need to wrap it in
1855 * an SV so it gets freed properly if there is a croak while
1856 * running the match */
1857 ENTER;
1858 SAVETMPS;
1859 sv_points=newSV(maxlen * sizeof(U8 *));
1860 SvCUR_set(sv_points,
1861 maxlen * sizeof(U8 *));
1862 SvPOK_on(sv_points);
1863 sv_2mortal(sv_points);
1864 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1865 if ( trie_type != trie_utf8_fold
1866 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1867 {
786e8c11
YO
1868 if (trie->bitmap)
1869 bitmap=(U8*)trie->bitmap;
1870 else
1871 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1872 }
786e8c11
YO
1873 /* this is the Aho-Corasick algorithm modified a touch
1874 to include special handling for long "unknown char"
1875 sequences. The basic idea being that we use AC as long
1876 as we are dealing with a possible matching char, when
1877 we encounter an unknown char (and we have not encountered
1878 an accepting state) we scan forward until we find a legal
1879 starting char.
1880 AC matching is basically that of trie matching, except
1881 that when we encounter a failing transition, we fall back
1882 to the current states "fail state", and try the current char
1883 again, a process we repeat until we reach the root state,
1884 state 1, or a legal transition. If we fail on the root state
1885 then we can either terminate if we have reached an accepting
1886 state previously, or restart the entire process from the beginning
1887 if we have not.
1888
1889 */
07be1b83
YO
1890 while (s <= last_start) {
1891 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1892 U8 *uc = (U8*)s;
1893 U16 charid = 0;
1894 U32 base = 1;
1895 U32 state = 1;
1896 UV uvc = 0;
1897 STRLEN len = 0;
1898 STRLEN foldlen = 0;
1899 U8 *uscan = (U8*)NULL;
1900 U8 *leftmost = NULL;
786e8c11
YO
1901#ifdef DEBUGGING
1902 U32 accepted_word= 0;
1903#endif
07be1b83
YO
1904 U32 pointpos = 0;
1905
1906 while ( state && uc <= (U8*)strend ) {
1907 int failed=0;
786e8c11
YO
1908 U32 word = aho->states[ state ].wordnum;
1909
1de06328
YO
1910 if( state==1 ) {
1911 if ( bitmap ) {
1912 DEBUG_TRIE_EXECUTE_r(
1913 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1914 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1915 (char *)uc, utf8_target );
1de06328
YO
1916 PerlIO_printf( Perl_debug_log,
1917 " Scanning for legal start char...\n");
1918 }
d085b490
YO
1919 );
1920 if (utf8_target) {
1921 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1922 uc += UTF8SKIP(uc);
1923 }
1924 } else {
1925 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1926 uc++;
1927 }
1928 }
1de06328 1929 s= (char *)uc;
786e8c11 1930 }
786e8c11
YO
1931 if (uc >(U8*)last_start) break;
1932 }
1933
1934 if ( word ) {
2e64971a 1935 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1936 if (!leftmost || lpos < leftmost) {
1937 DEBUG_r(accepted_word=word);
07be1b83 1938 leftmost= lpos;
786e8c11 1939 }
07be1b83 1940 if (base==0) break;
786e8c11 1941
07be1b83
YO
1942 }
1943 points[pointpos++ % maxlen]= uc;
55eed653
NC
1944 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1945 uscan, len, uvc, charid, foldlen,
1946 foldbuf, uniflags);
786e8c11
YO
1947 DEBUG_TRIE_EXECUTE_r({
1948 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1949 s, utf8_target );
07be1b83 1950 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1951 " Charid:%3u CP:%4"UVxf" ",
1952 charid, uvc);
1953 });
07be1b83
YO
1954
1955 do {
6148ee25 1956#ifdef DEBUGGING
786e8c11 1957 word = aho->states[ state ].wordnum;
6148ee25 1958#endif
07be1b83
YO
1959 base = aho->states[ state ].trans.base;
1960
786e8c11
YO
1961 DEBUG_TRIE_EXECUTE_r({
1962 if (failed)
1963 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1964 s, utf8_target );
07be1b83 1965 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1966 "%sState: %4"UVxf", word=%"UVxf,
1967 failed ? " Fail transition to " : "",
1968 (UV)state, (UV)word);
1969 });
07be1b83
YO
1970 if ( base ) {
1971 U32 tmp;
6dd2be57 1972 I32 offset;
07be1b83 1973 if (charid &&
6dd2be57
DM
1974 ( ((offset = base + charid
1975 - 1 - trie->uniquecharcount)) >= 0)
1976 && ((U32)offset < trie->lasttrans)
1977 && trie->trans[offset].check == state
1978 && (tmp=trie->trans[offset].next))
07be1b83 1979 {
786e8c11
YO
1980 DEBUG_TRIE_EXECUTE_r(
1981 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1982 state = tmp;
1983 break;
1984 }
1985 else {
786e8c11
YO
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log," - fail\n"));
1988 failed = 1;
1989 state = aho->fail[state];
07be1b83
YO
1990 }
1991 }
1992 else {
1993 /* we must be accepting here */
786e8c11
YO
1994 DEBUG_TRIE_EXECUTE_r(
1995 PerlIO_printf( Perl_debug_log," - accepting\n"));
1996 failed = 1;
07be1b83
YO
1997 break;
1998 }
1999 } while(state);
786e8c11 2000 uc += len;
07be1b83
YO
2001 if (failed) {
2002 if (leftmost)
2003 break;
786e8c11 2004 if (!state) state = 1;
07be1b83
YO
2005 }
2006 }
2007 if ( aho->states[ state ].wordnum ) {
2e64971a 2008 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2009 if (!leftmost || lpos < leftmost) {
2010 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2011 leftmost = lpos;
786e8c11 2012 }
07be1b83 2013 }
07be1b83
YO
2014 if (leftmost) {
2015 s = (char*)leftmost;
786e8c11
YO
2016 DEBUG_TRIE_EXECUTE_r({
2017 PerlIO_printf(
70685ca0
JH
2018 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2019 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2020 );
2021 });
24b23f37 2022 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2023 FREETMPS;
2024 LEAVE;
07be1b83 2025 goto got_it;
be8e71aa 2026 }
07be1b83 2027 s = HOPc(s,1);
786e8c11
YO
2028 DEBUG_TRIE_EXECUTE_r({
2029 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2030 });
07be1b83 2031 } else {
786e8c11
YO
2032 DEBUG_TRIE_EXECUTE_r(
2033 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2034 break;
2035 }
2036 }
be8e71aa
YO
2037 FREETMPS;
2038 LEAVE;
07be1b83
YO
2039 }
2040 break;
b3c9acc1 2041 default:
3c3eec57
GS
2042 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2043 break;
d6a28714 2044 }
6eb5f6b9
JH
2045 return 0;
2046 got_it:
2047 return s;
2048}
2049
fae667d5 2050
6eb5f6b9
JH
2051/*
2052 - regexec_flags - match a regexp against a string
2053 */
2054I32
288b8c02 2055Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2056 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2057/* stringarg: the point in the string at which to begin matching */
2058/* strend: pointer to null at end of string */
2059/* strbeg: real beginning of string */
2060/* minend: end of match must be >= minend bytes after stringarg. */
2061/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2062 * itself is accessed via the pointers above */
2063/* data: May be used for some additional optimizations.
2064 Currently its only used, with a U32 cast, for transmitting
2065 the ganch offset when doing a /g match. This will change */
2066/* nosave: For optimizations. */
2067
6eb5f6b9 2068{
97aff369 2069 dVAR;
288b8c02 2070 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2071 /*register*/ char *s;
eb578fdb 2072 regnode *c;
24b23f37 2073 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2074 I32 minlen; /* must match at least this many chars */
2075 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2076 I32 end_shift = 0; /* Same for the end. */ /* CC */
2077 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2078 char *scream_olds = NULL;
f2ed9b32 2079 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2080 I32 multiline;
f8fc2ecf 2081 RXi_GET_DECL(prog,progi);
3b0527fe 2082 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2083 regexp_paren_pair *swap = NULL;
a3621e74
YO
2084 GET_RE_DEBUG_FLAGS_DECL;
2085
7918f24d 2086 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2087 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2088
2089 /* Be paranoid... */
2090 if (prog == NULL || startpos == NULL) {
2091 Perl_croak(aTHX_ "NULL regexp parameter");
2092 return 0;
2093 }
2094
bbe252da 2095 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2096 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2097
f2ed9b32 2098 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2099 DEBUG_EXECUTE_r(
f2ed9b32 2100 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2101 "Matching");
2102 );
bac06658 2103
6eb5f6b9 2104 minlen = prog->minlen;
1de06328
YO
2105
2106 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2107 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2108 "String too short [regexec_flags]...\n"));
2109 goto phooey;
1aa99e6b 2110 }
6eb5f6b9 2111
1de06328 2112
6eb5f6b9 2113 /* Check validity of program. */
f8fc2ecf 2114 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2115 Perl_croak(aTHX_ "corrupted regexp program");
2116 }
2117
2118 PL_reg_flags = 0;
ed301438 2119 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2120 PL_reg_maxiter = 0;
2121
3c8556c3 2122 if (RX_UTF8(rx))
6eb5f6b9
JH
2123 PL_reg_flags |= RF_utf8;
2124
2125 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2126 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2127 PL_bostr = strbeg;
3b0527fe 2128 reginfo.sv = sv;
6eb5f6b9
JH
2129
2130 /* Mark end of line for $ (and such) */
2131 PL_regeol = strend;
2132
2133 /* see how far we have to get to not match where we matched before */
3b0527fe 2134 reginfo.till = startpos+minend;
6eb5f6b9 2135
6eb5f6b9
JH
2136 /* If there is a "must appear" string, look for it. */
2137 s = startpos;
2138
bbe252da 2139 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2140 MAGIC *mg;
2c296965 2141 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2142 reginfo.ganch = startpos + prog->gofs;
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2145 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2146 && SvMAGIC(sv)
14befaf4
DM
2147 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2148 && mg->mg_len >= 0) {
3b0527fe 2149 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2152
bbe252da 2153 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2154 if (s > reginfo.ganch)
6eb5f6b9 2155 goto phooey;
58e23c8d 2156 s = reginfo.ganch - prog->gofs;
2c296965 2157 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2158 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2159 if (s < strbeg)
2160 goto phooey;
6eb5f6b9
JH
2161 }
2162 }
58e23c8d 2163 else if (data) {
70685ca0 2164 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2165 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2166 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2167
2168 } else { /* pos() not defined */
3b0527fe 2169 reginfo.ganch = strbeg;
2c296965
YO
2170 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2171 "GPOS: reginfo.ganch = strbeg\n"));
2172 }
6eb5f6b9 2173 }
288b8c02 2174 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2175 /* We have to be careful. If the previous successful match
2176 was from this regex we don't want a subsequent partially
2177 successful match to clobber the old results.
2178 So when we detect this possibility we add a swap buffer
2179 to the re, and switch the buffer each match. If we fail
2180 we switch it back, otherwise we leave it swapped.
2181 */
2182 swap = prog->offs;
2183 /* do we need a save destructor here for eval dies? */
2184 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2185 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2186 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2187 PTR2UV(prog),
2188 PTR2UV(swap),
2189 PTR2UV(prog->offs)
2190 ));
c74340f9 2191 }
a0714e2c 2192 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2193 re_scream_pos_data d;
2194
2195 d.scream_olds = &scream_olds;
2196 d.scream_pos = &scream_pos;
288b8c02 2197 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2198 if (!s) {
a3621e74 2199 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2200 goto phooey; /* not present */
3fa9c3d7 2201 }
6eb5f6b9
JH
2202 }
2203
1de06328 2204
6eb5f6b9
JH
2205
2206 /* Simplest case: anchored match need be tried only once. */
2207 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2208 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2209 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2210 goto got_it;
bbe252da
YO
2211 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2212 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2213 {
2214 char *end;
2215
2216 if (minlen)
2217 dontbother = minlen - 1;
1aa99e6b 2218 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2219 /* for multiline we only have to try after newlines */
33b8afdf 2220 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2221 /* because of the goto we can not easily reuse the macros for bifurcating the
2222 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2223 if (utf8_target) {
2224 if (s == startpos)
2225 goto after_try_utf8;
2226 while (1) {
2227 if (regtry(&reginfo, &s)) {
2228 goto got_it;
2229 }
2230 after_try_utf8:
2231 if (s > end) {
2232 goto phooey;
2233 }
2234 if (prog->extflags & RXf_USE_INTUIT) {
2235 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2236 if (!s) {
2237 goto phooey;
2238 }
2239 }
2240 else {
2241 s += UTF8SKIP(s);
2242 }
2243 }
2244 } /* end search for check string in unicode */
2245 else {
2246 if (s == startpos) {
2247 goto after_try_latin;
2248 }
2249 while (1) {
2250 if (regtry(&reginfo, &s)) {
2251 goto got_it;
2252 }
2253 after_try_latin:
2254 if (s > end) {
2255 goto phooey;
2256 }
2257 if (prog->extflags & RXf_USE_INTUIT) {
2258 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2259 if (!s) {
2260 goto phooey;
2261 }
2262 }
2263 else {
2264 s++;
2265 }
2266 }
2267 } /* end search for check string in latin*/
2268 } /* end search for check string */
2269 else { /* search for newline */
2270 if (s > startpos) {
2271 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2272 s--;
92f3d482 2273 }
21eede78
YO
2274 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2275 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2276 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2277 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2278 goto got_it;
2279 }
92f3d482
YO
2280 }
2281 } /* end search for newline */
2282 } /* end anchored/multiline check string search */
6eb5f6b9 2283 goto phooey;
bbe252da 2284 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2285 {
486ec47a 2286 /* the warning about reginfo.ganch being used without initialization
bbe252da 2287 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2288 and we only enter this block when the same bit is set. */
58e23c8d 2289 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2290
2291 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2292 goto got_it;
2293 goto phooey;
2294 }
2295
2296 /* Messy cases: unanchored match. */
bbe252da 2297 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2298 /* we have /x+whatever/ */
f2ed9b32 2299 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2300 char ch;
bf93d4cc
GS
2301#ifdef DEBUGGING
2302 int did_match = 0;
2303#endif
f2ed9b32
KW
2304 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2305 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2306 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2307
f2ed9b32 2308 if (utf8_target) {
4cadc6a9 2309 REXEC_FBC_SCAN(
6eb5f6b9 2310 if (*s == ch) {
a3621e74 2311 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2312 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2313 s += UTF8SKIP(s);
2314 while (s < strend && *s == ch)
2315 s += UTF8SKIP(s);
2316 }
4cadc6a9 2317 );
6eb5f6b9
JH
2318 }
2319 else {
4cadc6a9 2320 REXEC_FBC_SCAN(
6eb5f6b9 2321 if (*s == ch) {
a3621e74 2322 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2323 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2324 s++;
2325 while (s < strend && *s == ch)
2326 s++;
2327 }
4cadc6a9 2328 );
6eb5f6b9 2329 }
a3621e74 2330 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2331 PerlIO_printf(Perl_debug_log,
b7953727
JH
2332 "Did not find anchored character...\n")
2333 );
6eb5f6b9 2334 }
a0714e2c
SS
2335 else if (prog->anchored_substr != NULL
2336 || prog->anchored_utf8 != NULL
2337 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2338 && prog->float_max_offset < strend - s)) {
2339 SV *must;
2340 I32 back_max;
2341 I32 back_min;
2342 char *last;
6eb5f6b9 2343 char *last1; /* Last position checked before */
bf93d4cc
GS
2344#ifdef DEBUGGING
2345 int did_match = 0;
2346#endif
33b8afdf 2347 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2348 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2349 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2350 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2351 back_max = back_min = prog->anchored_offset;
2352 } else {
f2ed9b32
KW
2353 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2354 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2355 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2356 back_max = prog->float_max_offset;
2357 back_min = prog->float_min_offset;
2358 }
1de06328
YO
2359
2360
33b8afdf
JH
2361 if (must == &PL_sv_undef)
2362 /* could not downgrade utf8 check substring, so must fail */
2363 goto phooey;
2364
1de06328
YO
2365 if (back_min<0) {
2366 last = strend;
2367 } else {
2368 last = HOP3c(strend, /* Cannot start after this */
2369 -(I32)(CHR_SVLEN(must)
2370 - (SvTAIL(must) != 0) + back_min), strbeg);
2371 }
6eb5f6b9
JH
2372 if (s > PL_bostr)
2373 last1 = HOPc(s, -1);
2374 else
2375 last1 = s - 1; /* bogus */
2376
a0288114 2377 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2378 check_substr==must. */
2379 scream_pos = -1;
2380 dontbother = end_shift;
2381 strend = HOPc(strend, -dontbother);
2382 while ( (s <= last) &&
c33e64f0 2383 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2384 (unsigned char*)strend, must,
c33e64f0 2385 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2386 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2387 if (HOPc(s, -back_max) > last1) {
2388 last1 = HOPc(s, -back_min);
2389 s = HOPc(s, -back_max);
2390 }
2391 else {
52657f30 2392 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2393
2394 last1 = HOPc(s, -back_min);
52657f30 2395 s = t;
6eb5f6b9 2396 }
f2ed9b32 2397 if (utf8_target) {
6eb5f6b9 2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s += UTF8SKIP(s);
2402 }
2403 }
2404 else {
2405 while (s <= last1) {
24b23f37 2406 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2407 goto got_it;
2408 s++;
2409 }
2410 }
2411 }
ab3bbdeb 2412 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2413 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2414 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2415 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2416 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2417 ? "anchored" : "floating"),
ab3bbdeb
YO
2418 quoted, RE_SV_TAIL(must));
2419 });
6eb5f6b9
JH
2420 goto phooey;
2421 }
f8fc2ecf 2422 else if ( (c = progi->regstclass) ) {
f14c76ed 2423 if (minlen) {
f8fc2ecf 2424 const OPCODE op = OP(progi->regstclass);
66e933ab 2425 /* don't bother with what can't match */
786e8c11 2426 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2427 strend = HOPc(strend, -(minlen - 1));
2428 }
a3621e74 2429 DEBUG_EXECUTE_r({
be8e71aa 2430 SV * const prop = sv_newmortal();
32fc9b6a 2431 regprop(prog, prop, c);
0df25f3d 2432 {
f2ed9b32 2433 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2434 s,strend-s,60);
0df25f3d 2435 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2436 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2437 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2438 quoted, (int)(strend - s));
0df25f3d 2439 }
ffc61ed2 2440 });
3b0527fe 2441 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2442 goto got_it;
07be1b83 2443 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2444 }
2445 else {
2446 dontbother = 0;
a0714e2c 2447 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2448 /* Trim the end. */
6af40bd7 2449 char *last= NULL;
33b8afdf 2450 SV* float_real;
c33e64f0
FC
2451 STRLEN len;
2452 const char *little;
33b8afdf 2453
f2ed9b32
KW
2454 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2455 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2456 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2457
c33e64f0
FC
2458 little = SvPV_const(float_real, len);
2459 if (SvTAIL(float_real)) {
1a13b075
YO
2460 /* This means that float_real contains an artificial \n on the end
2461 * due to the presence of something like this: /foo$/
2462 * where we can match both "foo" and "foo\n" at the end of the string.
2463 * So we have to compare the end of the string first against the float_real
2464 * without the \n and then against the full float_real with the string.
2465 * We have to watch out for cases where the string might be smaller
2466 * than the float_real or the float_real without the \n.
2467 */
2468 char *checkpos= strend - len;
2469 DEBUG_OPTIMISE_r(
2470 PerlIO_printf(Perl_debug_log,
2471 "%sChecking for float_real.%s\n",
2472 PL_colors[4], PL_colors[5]));
2473 if (checkpos + 1 < strbeg) {
2474 /* can't match, even if we remove the trailing \n string is too short to match */
2475 DEBUG_EXECUTE_r(
2476 PerlIO_printf(Perl_debug_log,
2477 "%sString shorter than required trailing substring, cannot match.%s\n",
2478 PL_colors[4], PL_colors[5]));
2479 goto phooey;
2480 } else if (memEQ(checkpos + 1, little, len - 1)) {
2481 /* can match, the end of the string matches without the "\n" */
2482 last = checkpos + 1;
2483 } else if (checkpos < strbeg) {
2484 /* cant match, string is too short when the "\n" is included */
2485 DEBUG_EXECUTE_r(
2486 PerlIO_printf(Perl_debug_log,
2487 "%sString does not contain required trailing substring, cannot match.%s\n",
2488 PL_colors[4], PL_colors[5]));
2489 goto phooey;
2490 } else if (!multiline) {
2491 /* non multiline match, so compare with the "\n" at the end of the string */
2492 if (memEQ(checkpos, little, len)) {
2493 last= checkpos;
2494 } else {
2495 DEBUG_EXECUTE_r(
2496 PerlIO_printf(Perl_debug_log,
2497 "%sString does not contain required trailing substring, cannot match.%s\n",
2498 PL_colors[4], PL_colors[5]));
2499 goto phooey;
2500 }
2501 } else {
2502 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2503 goto find_last;
1a13b075 2504 }
c33e64f0 2505 } else {
d6a28714 2506 find_last:
9041c2e3 2507 if (len)
d6a28714 2508 last = rninstr(s, strend, little, little + len);
b8c5462f 2509 else
a0288114 2510 last = strend; /* matching "$" */
b8c5462f 2511 }
6af40bd7
YO
2512 if (!last) {
2513 /* at one point this block contained a comment which was probably
2514 * incorrect, which said that this was a "should not happen" case.
2515 * Even if it was true when it was written I am pretty sure it is
2516 * not anymore, so I have removed the comment and replaced it with
2517 * this one. Yves */
6bda09f9
YO
2518 DEBUG_EXECUTE_r(
2519 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2520 "String does not contain required substring, cannot match.\n"
2521 ));
2522 goto phooey;
bf93d4cc 2523 }
d6a28714
JH
2524 dontbother = strend - last + prog->float_min_offset;
2525 }
2526 if (minlen && (dontbother < minlen))
2527 dontbother = minlen - 1;
2528 strend -= dontbother; /* this one's always in bytes! */
2529 /* We don't know much -- general case. */
f2ed9b32 2530 if (utf8_target) {
d6a28714 2531 for (;;) {
24b23f37 2532 if (regtry(&reginfo, &s))
d6a28714
JH
2533 goto got_it;
2534 if (s >= strend)
2535 break;
b8c5462f 2536 s += UTF8SKIP(s);
d6a28714
JH
2537 };
2538 }
2539 else {
2540 do {
24b23f37 2541 if (regtry(&reginfo, &s))
d6a28714
JH
2542 goto got_it;
2543 } while (s++ < strend);
2544 }
2545 }
2546
2547 /* Failure. */
2548 goto phooey;
2549
2550got_it:
495f47a5
DM
2551 DEBUG_BUFFERS_r(
2552 if (swap)
2553 PerlIO_printf(Perl_debug_log,
2554 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2555 PTR2UV(prog),
2556 PTR2UV(swap)
2557 );
2558 );
e9105d30 2559 Safefree(swap);
288b8c02 2560 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2561
ed301438 2562 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2563 restore_pos(aTHX_ prog);
5daac39c
NC
2564 if (RXp_PAREN_NAMES(prog))
2565 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2566
2567 /* make sure $`, $&, $', and $digit will work later */
2568 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2569 if (flags & REXEC_COPY_STR) {
f8c7b90f 2570#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2571 if ((SvIsCOW(sv)
2572 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2573 if (DEBUG_C_TEST) {
2574 PerlIO_printf(Perl_debug_log,
2575 "Copy on write: regexp capture, type %d\n",
2576 (int) SvTYPE(sv));
2577 }
2578 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2579 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2580 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2581 prog->sublen = PL_regeol - strbeg;
2582 prog->suboffset = 0;
2583 prog->subcoffset = 0;
ed252734
NC
2584 } else
2585#endif
2586 {
6502e081
DM
2587 I32 min = 0;
2588 I32 max = PL_regeol - strbeg;
2589 I32 sublen;
2590
2591 if ( (flags & REXEC_COPY_SKIP_POST)
2592 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2593 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2594 ) { /* don't copy $' part of string */
3de645a8 2595 U32 n = 0;
6502e081
DM
2596 max = -1;
2597 /* calculate the right-most part of the string covered
2598 * by a capture. Due to look-ahead, this may be to
2599 * the right of $&, so we have to scan all captures */
2600 while (n <= prog->lastparen) {
2601 if (prog->offs[n].end > max)
2602 max = prog->offs[n].end;
2603 n++;
2604 }
2605 if (max == -1)
2606 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2607 ? prog->offs[0].start
2608 : 0;
2609 assert(max >= 0 && max <= PL_regeol - strbeg);
2610 }
2611
2612 if ( (flags & REXEC_COPY_SKIP_PRE)
2613 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2614 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2615 ) { /* don't copy $` part of string */
3de645a8 2616 U32 n = 0;
6502e081
DM
2617 min = max;
2618 /* calculate the left-most part of the string covered
2619 * by a capture. Due to look-behind, this may be to
2620 * the left of $&, so we have to scan all captures */
2621 while (min && n <= prog->lastparen) {
2622 if ( prog->offs[n].start != -1
2623 && prog->offs[n].start < min)
2624 {
2625 min = prog->offs[n].start;
2626 }
2627 n++;
2628 }
2629 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2630 && min > prog->offs[0].end
2631 )
2632 min = prog->offs[0].end;
2633
2634 }
2635
2636 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2637 sublen = max - min;
2638
2639 if (RX_MATCH_COPIED(rx)) {
2640 if (sublen > prog->sublen)
2641 prog->subbeg =
2642 (char*)saferealloc(prog->subbeg, sublen+1);
2643 }
2644 else
2645 prog->subbeg = (char*)safemalloc(sublen+1);
2646 Copy(strbeg + min, prog->subbeg, sublen, char);
2647 prog->subbeg[sublen] = '\0';
2648 prog->suboffset = min;
2649 prog->sublen = sublen;
2650 }
2651 RX_MATCH_COPIED_on(rx);
2652 prog->subcoffset = prog->suboffset;
2653 if (prog->suboffset && utf8_target) {
2654 /* Convert byte offset to chars.
2655 * XXX ideally should only compute this if @-/@+
2656 * has been seen, a la PL_sawampersand ??? */
2657
2658 /* If there's a direct correspondence between the
2659 * string which we're matching and the original SV,
2660 * then we can use the utf8 len cache associated with
2661 * the SV. In particular, it means that under //g,
2662 * sv_pos_b2u() will use the previously cached
2663 * position to speed up working out the new length of
2664 * subcoffset, rather than counting from the start of
2665 * the string each time. This stops
2666 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2667 * from going quadratic */
2668 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2669 sv_pos_b2u(sv, &(prog->subcoffset));
2670 else
2671 prog->subcoffset = utf8_length((U8*)strbeg,
2672 (U8*)(strbeg+prog->suboffset));
2673 }
d6a28714
JH
2674 }
2675 else {
6502e081 2676 RX_MATCH_COPY_FREE(rx);
d6a28714 2677 prog->subbeg = strbeg;
6502e081
DM
2678 prog->suboffset = 0;
2679 prog->subcoffset = 0;
d6a28714
JH
2680 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2681 }
2682 }
9041c2e3 2683
d6a28714
JH
2684 return 1;
2685
2686phooey:
a3621e74 2687 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2688 PL_colors[4], PL_colors[5]));
ed301438 2689 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2690 restore_pos(aTHX_ prog);
e9105d30 2691 if (swap) {
c74340f9 2692 /* we failed :-( roll it back */
495f47a5
DM
2693 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2694 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2695 PTR2UV(prog),
2696 PTR2UV(prog->offs),
2697 PTR2UV(swap)
2698 ));
e9105d30
GG
2699 Safefree(prog->offs);
2700 prog->offs = swap;
2701 }
2702
d6a28714
JH
2703 return 0;
2704}
2705
6bda09f9 2706
ec43f78b
DM
2707/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2708 * Do inc before dec, in case old and new rex are the same */
2709#define SET_reg_curpm(Re2) \
2710 if (PL_reg_state.re_state_eval_setup_done) { \
2711 (void)ReREFCNT_inc(Re2); \
2712 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2713 PM_SETRE((PL_reg_curpm), (Re2)); \
2714 }
2715
2716
d6a28714
JH
2717/*
2718 - regtry - try match at specific point
2719 */
2720STATIC I32 /* 0 failure, 1 success */
24b23f37 2721S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
d6a28714 2722{
97aff369 2723 dVAR;
d6a28714 2724 CHECKPOINT lastcp;
288b8c02
NC
2725 REGEXP *const rx = reginfo->prog;
2726 regexp *const prog = (struct regexp *)SvANY(rx);
f8fc2ecf 2727 RXi_GET_DECL(prog,progi);
a3621e74 2728 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2729
2730 PERL_ARGS_ASSERT_REGTRY;
2731
24b23f37 2732 reginfo->cutpoint=NULL;
d6a28714 2733
ed301438
DM
2734 if ((prog->extflags & RXf_EVAL_SEEN)
2735 && !PL_reg_state.re_state_eval_setup_done)
2736 {
d6a28714
JH
2737 MAGIC *mg;
2738
ed301438 2739 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2740 if (reginfo->sv) {
d6a28714 2741 /* Make $_ available to executed code. */
3b0527fe 2742 if (reginfo->sv != DEFSV) {
59f00321 2743 SAVE_DEFSV;
414bf5ae 2744 DEFSV_set(reginfo->sv);
b8c5462f 2745 }
d6a28714 2746
3b0527fe
DM
2747 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2748 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2749 /* prepare for quick setting of pos */
d300d9fa 2750#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2751 if (SvIsCOW(reginfo->sv))
2752 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2753#endif
3dab1dad 2754 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2755 &PL_vtbl_mglob, NULL, 0);
d6a28714 2756 mg->mg_len = -1;
b8c5462f 2757 }
d6a28714
JH
2758 PL_reg_magic = mg;
2759 PL_reg_oldpos = mg->mg_len;
4f639d21 2760 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2761 }
09687e5a 2762 if (!PL_reg_curpm) {
a02a5408 2763 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2764#ifdef USE_ITHREADS
2765 {
14a49a24 2766 SV* const repointer = &PL_sv_undef;
92313705
NC
2767 /* this regexp is also owned by the new PL_reg_curpm, which
2768 will try to free it. */
d2ece331 2769 av_push(PL_regex_padav, repointer);
09687e5a
AB
2770 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2771 PL_regex_pad = AvARRAY(PL_regex_padav);
2772 }
2773#endif
2774 }
ec43f78b 2775 SET_reg_curpm(rx);
d6a28714
JH
2776 PL_reg_oldcurpm = PL_curpm;
2777 PL_curpm = PL_reg_curpm;
07bc277f 2778 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2779 /* Here is a serious problem: we cannot rewrite subbeg,
2780 since it may be needed if this match fails. Thus
2781 $` inside (?{}) could fail... */
2782 PL_reg_oldsaved = prog->subbeg;
2783 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2784 PL_reg_oldsavedoffset = prog->suboffset;
2785 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2786#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2787 PL_nrs = prog->saved_copy;
2788#endif
07bc277f 2789 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2790 }
2791 else
bd61b366 2792 PL_reg_oldsaved = NULL;
d6a28714 2793 prog->subbeg = PL_bostr;
6502e081
DM
2794 prog->suboffset = 0;
2795 prog->subcoffset = 0;
d6a28714
JH
2796 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2797 }
97ca13b7
DM
2798#ifdef DEBUGGING
2799 PL_reg_starttry = *startpos;
2800#endif
f0ab9afb 2801 prog->offs[0].start = *startpos - PL_bostr;
24b23f37 2802 PL_reginput = *startpos;
d6a28714 2803 prog->lastparen = 0;
03994de8 2804 prog->lastcloseparen = 0;
d6a28714 2805 PL_regsize = 0;
d6a28714
JH
2806
2807 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2808 to do this again and again, prog->lastparen should take care of
3dd2943c 2809 this! --ilya*/
dafc8851
JH
2810
2811 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2812 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2813 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2814 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2815 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2816 * Meanwhile, this code *is* needed for the
daf18116
JH
2817 * above-mentioned test suite tests to succeed. The common theme
2818 * on those tests seems to be returning null fields from matches.
225593e1 2819 * --jhi updated by dapm */
dafc8851 2820#if 1
d6a28714 2821 if (prog->nparens) {
b93070ed 2822 regexp_paren_pair *pp = prog->offs;
eb578fdb 2823 I32 i;
b93070ed 2824 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2825 ++pp;
2826 pp->start = -1;
2827 pp->end = -1;
d6a28714
JH
2828 }
2829 }
dafc8851 2830#endif
02db2b7b 2831 REGCP_SET(lastcp);
f8fc2ecf 2832 if (regmatch(reginfo, progi->program + 1)) {
b93070ed 2833 prog->offs[0].end = PL_reginput - PL_bostr;
d6a28714
JH
2834 return 1;
2835 }
24b23f37
YO
2836 if (reginfo->cutpoint)
2837 *startpos= reginfo->cutpoint;
02db2b7b 2838 REGCP_UNWIND(lastcp);
d6a28714
JH
2839 return 0;
2840}
2841
02db2b7b 2842
8ba1375e
MJD
2843#define sayYES goto yes
2844#define sayNO goto no
262b90c4 2845#define sayNO_SILENT goto no_silent
8ba1375e 2846
f9f4320a
YO
2847/* we dont use STMT_START/END here because it leads to
2848 "unreachable code" warnings, which are bogus, but distracting. */
2849#define CACHEsayNO \
c476f425
DM
2850 if (ST.cache_mask) \
2851 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2852 sayNO
3298f257 2853
a3621e74 2854/* this is used to determine how far from the left messages like
265c4333
YO
2855 'failed...' are printed. It should be set such that messages
2856 are inline with the regop output that created them.
a3621e74 2857*/
265c4333 2858#define REPORT_CODE_OFF 32
a3621e74
YO
2859
2860
40a82448
DM
2861#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2862#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2863
86545054
DM
2864#define SLAB_FIRST(s) (&(s)->states[0])
2865#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2866
5d9a96ca
DM
2867/* grab a new slab and return the first slot in it */
2868
2869STATIC regmatch_state *
2870S_push_slab(pTHX)
2871{
a35a87e7 2872#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2873 dMY_CXT;
2874#endif
5d9a96ca
DM
2875 regmatch_slab *s = PL_regmatch_slab->next;
2876 if (!s) {
2877 Newx(s, 1, regmatch_slab);
2878 s->prev = PL_regmatch_slab;
2879 s->next = NULL;
2880 PL_regmatch_slab->next = s;
2881 }
2882 PL_regmatch_slab = s;
86545054 2883 return SLAB_FIRST(s);
5d9a96ca 2884}
5b47454d 2885
95b24440 2886
40a82448
DM
2887/* push a new state then goto it */
2888
2889#define PUSH_STATE_GOTO(state, node) \
2890 scan = node; \
2891 st->resume_state = state; \
2892 goto push_state;
2893
2894/* push a new state with success backtracking, then goto it */
2895
2896#define PUSH_YES_STATE_GOTO(state, node) \
2897 scan = node; \
2898 st->resume_state = state; \
2899 goto push_yes_state;
2900
aa283a38 2901
aa283a38 2902
d6a28714 2903/*
95b24440 2904
bf1f174e
DM
2905regmatch() - main matching routine
2906
2907This is basically one big switch statement in a loop. We execute an op,
2908set 'next' to point the next op, and continue. If we come to a point which
2909we may need to backtrack to on failure such as (A|B|C), we push a
2910backtrack state onto the backtrack stack. On failure, we pop the top
2911state, and re-enter the loop at the state indicated. If there are no more
2912states to pop, we return failure.
2913
2914Sometimes we also need to backtrack on success; for example /A+/, where
2915after successfully matching one A, we need to go back and try to
2916match another one; similarly for lookahead assertions: if the assertion
2917completes successfully, we backtrack to the state just before the assertion
2918and then carry on. In these cases, the pushed state is marked as
2919'backtrack on success too'. This marking is in fact done by a chain of
2920pointers, each pointing to the previous 'yes' state. On success, we pop to
2921the nearest yes state, discarding any intermediate failure-only states.
2922Sometimes a yes state is pushed just to force some cleanup code to be
2923called at the end of a successful match or submatch; e.g. (??{$re}) uses
2924it to free the inner regex.
2925
2926Note that failure backtracking rewinds the cursor position, while
2927success backtracking leaves it alone.
2928
2929A pattern is complete when the END op is executed, while a subpattern
2930such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2931ops trigger the "pop to last yes state if any, otherwise return true"
2932behaviour.
2933
2934A common convention in this function is to use A and B to refer to the two
2935subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2936the subpattern to be matched possibly multiple times, while B is the entire
2937rest of the pattern. Variable and state names reflect this convention.
2938
2939The states in the main switch are the union of ops and failure/success of
2940substates associated with with that op. For example, IFMATCH is the op
2941that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2942'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2943successfully matched A and IFMATCH_A_fail is a state saying that we have
2944just failed to match A. Resume states always come in pairs. The backtrack
2945state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2946at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2947on success or failure.
2948
2949The struct that holds a backtracking state is actually a big union, with
2950one variant for each major type of op. The variable st points to the
2951top-most backtrack struct. To make the code clearer, within each
2952block of code we #define ST to alias the relevant union.
2953
2954Here's a concrete example of a (vastly oversimplified) IFMATCH
2955implementation:
2956
2957 switch (state) {
2958 ....
2959
2960#define ST st->u.ifmatch
2961
2962 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2963 ST.foo = ...; // some state we wish to save
95b24440 2964 ...
bf1f174e
DM
2965 // push a yes backtrack state with a resume value of
2966 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2967 // first node of A:
2968 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2969 // NOTREACHED
2970
2971 case IFMATCH_A: // we have successfully executed A; now continue with B
2972 next = B;
2973 bar = ST.foo; // do something with the preserved value
2974 break;
2975
2976 case IFMATCH_A_fail: // A failed, so the assertion failed
2977 ...; // do some housekeeping, then ...
2978 sayNO; // propagate the failure
2979
2980#undef ST
95b24440 2981
bf1f174e
DM
2982 ...
2983 }
95b24440 2984
bf1f174e
DM
2985For any old-timers reading this who are familiar with the old recursive
2986approach, the code above is equivalent to:
95b24440 2987
bf1f174e
DM
2988 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2989 {
2990 int foo = ...
95b24440 2991 ...
bf1f174e
DM
2992 if (regmatch(A)) {
2993 next = B;
2994 bar = foo;
2995 break;
95b24440 2996 }
bf1f174e
DM
2997 ...; // do some housekeeping, then ...
2998 sayNO; // propagate the failure
95b24440 2999 }
bf1f174e
DM
3000
3001The topmost backtrack state, pointed to by st, is usually free. If you
3002want to claim it, populate any ST.foo fields in it with values you wish to
3003save, then do one of
3004
3005 PUSH_STATE_GOTO(resume_state, node);
3006 PUSH_YES_STATE_GOTO(resume_state, node);
3007
3008which sets that backtrack state's resume value to 'resume_state', pushes a
3009new free entry to the top of the backtrack stack, then goes to 'node'.
3010On backtracking, the free slot is popped, and the saved state becomes the
3011new free state. An ST.foo field in this new top state can be temporarily
3012accessed to retrieve values, but once the main loop is re-entered, it
3013becomes available for reuse.
3014
3015Note that the depth of the backtrack stack constantly increases during the
3016left-to-right execution of the pattern, rather than going up and down with
3017the pattern nesting. For example the stack is at its maximum at Z at the
3018end of the pattern, rather than at X in the following:
3019
3020 /(((X)+)+)+....(Y)+....Z/
3021
3022The only exceptions to this are lookahead/behind assertions and the cut,
3023(?>A), which pop all the backtrack states associated with A before
3024continuing.
3025
486ec47a 3026Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3027PL_regmatch_state and st always point to the currently active state,
3028and PL_regmatch_slab points to the slab currently containing
3029PL_regmatch_state. The first time regmatch() is called, the first slab is
3030allocated, and is never freed until interpreter destruction. When the slab
3031is full, a new one is allocated and chained to the end. At exit from
3032regmatch(), slabs allocated since entry are freed.
3033
3034*/
95b24440 3035
40a82448 3036
5bc10b2c 3037#define DEBUG_STATE_pp(pp) \
265c4333 3038 DEBUG_STATE_r({ \
f2ed9b32 3039 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3040 PerlIO_printf(Perl_debug_log, \
5d458dd8 3041 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3042 depth*2, "", \
13d6edb4 3043 PL_reg_name[st->resume_state], \
5d458dd8
YO
3044 ((st==yes_state||st==mark_state) ? "[" : ""), \
3045 ((st==yes_state) ? "Y" : ""), \
3046 ((st==mark_state) ? "M" : ""), \
3047 ((st==yes_state||st==mark_state) ? "]" : "") \
3048 ); \
265c4333 3049 });
5bc10b2c 3050
40a82448 3051
3dab1dad 3052#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3053
3df15adc 3054#ifdef DEBUGGING
5bc10b2c 3055
ab3bbdeb 3056STATIC void
f2ed9b32 3057S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3058 const char *start, const char *end, const char *blurb)
3059{
efd26800 3060 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3061
3062 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3063
ab3bbdeb
YO
3064 if (!PL_colorset)
3065 reginitcolors();
3066 {
3067 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3068 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3069
f2ed9b32 3070 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3071 start, end - start, 60);
3072
3073 PerlIO_printf(Perl_debug_log,
3074 "%s%s REx%s %s against %s\n",
3075 PL_colors[4], blurb, PL_colors[5], s0, s1);
3076
f2ed9b32 3077 if (utf8_target||utf8_pat)
1de06328
YO
3078 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3079 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3080 utf8_pat && utf8_target ? " and " : "",
3081 utf8_target ? "string" : ""
ab3bbdeb
YO
3082 );
3083 }
3084}
3df15adc
YO
3085
3086STATIC void
786e8c11
YO
3087S_dump_exec_pos(pTHX_ const char *locinput,
3088 const regnode *scan,
3089 const char *loc_regeol,
3090 const char *loc_bostr,
3091 const char *loc_reg_starttry,
f2ed9b32 3092 const bool utf8_target)
07be1b83 3093{
786e8c11 3094 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3095 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3096 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3097 /* The part of the string before starttry has one color
3098 (pref0_len chars), between starttry and current
3099 position another one (pref_len - pref0_len chars),
3100 after the current position the third one.
3101 We assume that pref0_len <= pref_len, otherwise we
3102 decrease pref0_len. */
786e8c11
YO
3103 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3104 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3105 int pref0_len;
3106
7918f24d
NC
3107 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3108
f2ed9b32 3109 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3110 pref_len++;
786e8c11
YO
3111 pref0_len = pref_len - (locinput - loc_reg_starttry);
3112 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3113 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3114 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3115 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3116 l--;
3117 if (pref0_len < 0)
3118 pref0_len = 0;
3119 if (pref0_len > pref_len)
3120 pref0_len = pref_len;
3121 {
f2ed9b32 3122 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3123
ab3bbdeb 3124 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3125 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3126
ab3bbdeb 3127 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3128 (locinput - pref_len + pref0_len),
1de06328 3129 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3130
ab3bbdeb 3131 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3132 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3133
1de06328 3134 const STRLEN tlen=len0+len1+len2;
3df15adc 3135 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3136 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3137 (IV)(locinput - loc_bostr),
07be1b83 3138 len0, s0,
07be1b83 3139 len1, s1,
07be1b83 3140 (docolor ? "" : "> <"),
07be1b83 3141 len2, s2,
f9f4320a 3142 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3143 "");
3144 }
3145}
3df15adc 3146
07be1b83
YO
3147#endif
3148
0a4db386
YO
3149/* reg_check_named_buff_matched()
3150 * Checks to see if a named buffer has matched. The data array of
3151 * buffer numbers corresponding to the buffer is expected to reside
3152 * in the regexp->data->data array in the slot stored in the ARG() of
3153 * node involved. Note that this routine doesn't actually care about the
3154 * name, that information is not preserved from compilation to execution.
3155 * Returns the index of the leftmost defined buffer with the given name
3156 * or 0 if non of the buffers matched.
3157 */
3158STATIC I32
7918f24d
NC
3159S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3160{
0a4db386 3161 I32 n;
f8fc2ecf 3162 RXi_GET_DECL(rex,rexi);
ad64d0ec 3163 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3164 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3165
3166 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3167
0a4db386 3168 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3169 if ((I32)rex->lastparen >= nums[n] &&
3170 rex->offs[nums[n]].end != -1)
0a4db386
YO
3171 {
3172 return nums[n];
3173 }
3174 }
3175 return 0;
3176}
3177
2f554ef7
DM
3178
3179/* free all slabs above current one - called during LEAVE_SCOPE */
3180
3181STATIC void
3182S_clear_backtrack_stack(pTHX_ void *p)
3183{
3184 regmatch_slab *s = PL_regmatch_slab->next;
3185 PERL_UNUSED_ARG(p);
3186
3187 if (!s)
3188 return;
3189 PL_regmatch_slab->next = NULL;
3190 while (s) {
3191 regmatch_slab * const osl = s;
3192 s = s->next;
3193 Safefree(osl);
3194 }
3195}
3196
3197
d6a28714 3198STATIC I32 /* 0 failure, 1 success */
24b23f37 3199S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
d6a28714 3200{
a35a87e7 3201#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3202 dMY_CXT;
3203#endif
27da23d5 3204 dVAR;
eb578fdb 3205 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3207 REGEXP *rex_sv = reginfo->prog;
3208 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3209 RXi_GET_DECL(rex,rexi);
2f554ef7 3210 I32 oldsave;
5d9a96ca 3211 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3212 regmatch_state *st;
5d9a96ca 3213 /* cache heavy used fields of st in registers */
eb578fdb
KW
3214 regnode *scan;
3215 regnode *next;
3216 U32 n = 0; /* general value; init to avoid compiler warning */
3217 I32 ln = 0; /* len or last; init to avoid compiler warning */
3218 char *locinput = PL_reginput;
3219 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3220
b69b0499 3221 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3222 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3223 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3224 const U32 max_nochange_depth =
3225 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3226 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3227 regmatch_state *yes_state = NULL; /* state to pop to on success of
3228 subpattern */
e2e6a0f1
YO
3229 /* mark_state piggy backs on the yes_state logic so that when we unwind
3230 the stack on success we can update the mark_state as we go */
3231 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3232 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3233 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3234 U32 state_num;
5d458dd8
YO
3235 bool no_final = 0; /* prevent failure from backtracking? */
3236 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
e2e6a0f1 3237 char *startpoint = PL_reginput;
5d458dd8
YO
3238 SV *popmark = NULL; /* are we looking for a mark? */
3239 SV *sv_commit = NULL; /* last mark name seen in failure */
3240 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3241 during a successful match */
5d458dd8
YO
3242 U32 lastopen = 0; /* last open we saw */
3243 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3244 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3245 /* these three flags are set by various ops to signal information to
3246 * the very next op. They have a useful lifetime of exactly one loop
3247 * iteration, and are not preserved or restored by state pushes/pops
3248 */
3249 bool sw = 0; /* the condition value in (?(cond)a|b) */
3250 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3251 int logical = 0; /* the following EVAL is:
3252 0: (?{...})
3253 1: (?(?{...})X|Y)
3254 2: (??{...})
3255 or the following IFMATCH/UNLESSM is:
3256 false: plain (?=foo)
3257 true: used as a condition: (?(?=foo))
3258 */
81ed78b2
DM
3259 PAD* last_pad = NULL;
3260 dMULTICALL;
3261 I32 gimme = G_SCALAR;
3262 CV *caller_cv = NULL; /* who called us */
3263 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3264 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3265
95b24440 3266#ifdef DEBUGGING
e68ec53f 3267 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3268#endif
3269
81ed78b2
DM
3270 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3271 multicall_oldcatch = 0;
3272 multicall_cv = NULL;
3273 cx = NULL;
4f8dbb2d
JL
3274 PERL_UNUSED_VAR(multicall_cop);
3275 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3276
3277
7918f24d
NC
3278 PERL_ARGS_ASSERT_REGMATCH;
3279
3b57cd43 3280 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3281 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3282 }));
5d9a96ca
DM
3283 /* on first ever call to regmatch, allocate first slab */
3284 if (!PL_regmatch_slab) {
3285 Newx(PL_regmatch_slab, 1, regmatch_slab);
3286 PL_regmatch_slab->prev = NULL;
3287 PL_regmatch_slab->next = NULL;
86545054 3288 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3289 }
3290
2f554ef7
DM
3291 oldsave = PL_savestack_ix;
3292 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3293 SAVEVPTR(PL_regmatch_slab);
3294 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3295
3296 /* grab next free state slot */
3297 st = ++PL_regmatch_state;
86545054 3298 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3299 st = PL_regmatch_state = S_push_slab(aTHX);
3300
d6a28714
JH
3301 /* Note that nextchr is a byte even in UTF */
3302 nextchr = UCHARAT(locinput);
3303 scan = prog;
3304 while (scan != NULL) {
8ba1375e 3305
a3621e74 3306 DEBUG_EXECUTE_r( {
6136c704 3307 SV * const prop = sv_newmortal();
1de06328 3308 regnode *rnext=regnext(scan);
f2ed9b32 3309 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3310 regprop(rex, prop, scan);
07be1b83
YO
3311
3312 PerlIO_printf(Perl_debug_log,
3313 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3314 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3315 SvPVX_const(prop),
1de06328 3316 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3317 0 : (IV)(rnext - rexi->program));
2a782b5b 3318 });
d6a28714
JH
3319
3320 next = scan + NEXT_OFF(scan);
3321 if (next == scan)
3322 next = NULL;
40a82448 3323 state_num = OP(scan);
d6a28714 3324
40a82448 3325 reenter_switch:
34a81e2b 3326
40a82448 3327 switch (state_num) {
d6a28714 3328 case BOL:
7fba1cd6 3329 if (locinput == PL_bostr)
d6a28714 3330 {
3b0527fe 3331 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3332 break;
3333 }
d6a28714
JH
3334 sayNO;
3335 case MBOL:
12d33761
HS
3336 if (locinput == PL_bostr ||
3337 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3338 {
b8c5462f
JH
3339 break;
3340 }
d6a28714
JH
3341 sayNO;
3342 case SBOL:
c2a73568 3343 if (locinput == PL_bostr)
b8c5462f 3344 break;
d6a28714
JH
3345 sayNO;
3346 case GPOS:
3b0527fe 3347 if (locinput == reginfo->ganch)
d6a28714
JH
3348 break;
3349 sayNO;
ee9b8eae
YO
3350
3351 case KEEPS:
3352 /* update the startpoint */
b93070ed 3353 st->u.keeper.val = rex->offs[0].start;
ee9b8eae 3354 PL_reginput = locinput;
b93070ed 3355 rex->offs[0].start = locinput - PL_bostr;
ee9b8eae
YO
3356 PUSH_STATE_GOTO(KEEPS_next, next);
3357 /*NOT-REACHED*/
3358 case KEEPS_next_fail:
3359 /* rollback the start point change */
b93070ed 3360 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3361 sayNO_SILENT;
3362 /*NOT-REACHED*/
d6a28714 3363 case EOL:
d6a28714
JH
3364 goto seol;
3365 case MEOL:
d6a28714 3366 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3367 sayNO;
b8c5462f 3368 break;
d6a28714
JH
3369 case SEOL:
3370 seol:
3371 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3372 sayNO;
d6a28714 3373 if (PL_regeol - locinput > 1)
b8c5462f 3374 sayNO;
b8c5462f 3375 break;
d6a28714
JH
3376 case EOS:
3377 if (PL_regeol != locinput)
b8c5462f 3378 sayNO;
d6a28714 3379 break;
ffc61ed2 3380 case SANY:
d6a28714 3381 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3382 sayNO;
f2ed9b32 3383 if (utf8_target) {
f33976b4
DB
3384 locinput += PL_utf8skip[nextchr];
3385 if (locinput > PL_regeol)
3386 sayNO;
3387 nextchr = UCHARAT(locinput);
3388 }
3389 else
3390 nextchr = UCHARAT(++locinput);
3391 break;
3392 case CANY:
3393 if (!nextchr && locinput >= PL_regeol)
3394 sayNO;
b8c5462f 3395 nextchr = UCHARAT(++locinput);
a0d0e21e 3396 break;
ffc61ed2 3397 case REG_ANY:
1aa99e6b
IH
3398 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3399 sayNO;
f2ed9b32 3400 if (utf8_target) {
b8c5462f 3401 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3402 if (locinput > PL_regeol)
3403 sayNO;
a0ed51b3 3404 nextchr = UCHARAT(locinput);
a0ed51b3 3405 }
1aa99e6b
IH
3406 else
3407 nextchr = UCHARAT(++locinput);
a0ed51b3 3408 break;
166ba7cd
DM
3409
3410#undef ST
3411#define ST st->u.trie
786e8c11
YO
3412 case TRIEC:
3413 /* In this case the charclass data is available inline so
3414 we can fail fast without a lot of extra overhead.
3415 */
fab2782b
YO
3416 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3417 DEBUG_EXECUTE_r(
3418 PerlIO_printf(Perl_debug_log,
3419 "%*s %sfailed to match trie start class...%s\n",
3420 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3421 );
3422 sayNO_SILENT;
118e2215 3423 assert(0); /* NOTREACHED */
786e8c11
YO
3424 }
3425 /* FALL THROUGH */
5b47454d 3426 case TRIE:
2e64971a
DM
3427 /* the basic plan of execution of the trie is:
3428 * At the beginning, run though all the states, and
3429 * find the longest-matching word. Also remember the position
3430 * of the shortest matching word. For example, this pattern:
3431 * 1 2 3 4 5
3432 * ab|a|x|abcd|abc
3433 * when matched against the string "abcde", will generate
3434 * accept states for all words except 3, with the longest
3435 * matching word being 4, and the shortest being 1 (with
3436 * the position being after char 1 of the string).
3437 *
3438 * Then for each matching word, in word order (i.e. 1,2,4,5),
3439 * we run the remainder of the pattern; on each try setting
3440 * the current position to the character following the word,
3441 * returning to try the next word on failure.
3442 *
3443 * We avoid having to build a list of words at runtime by
3444 * using a compile-time structure, wordinfo[].prev, which
3445 * gives, for each word, the previous accepting word (if any).
3446 * In the case above it would contain the mappings 1->2, 2->0,
3447 * 3->0, 4->5, 5->1. We can use this table to generate, from
3448 * the longest word (4 above), a list of all words, by
3449 * following the list of prev pointers; this gives us the
3450 * unordered list 4,5,1,2. Then given the current word we have
3451 * just tried, we can go through the list and find the
3452 * next-biggest word to try (so if we just failed on word 2,
3453 * the next in the list is 4).
3454 *
3455 * Since at runtime we don't record the matching position in
3456 * the string for each word, we have to work that out for
3457 * each word we're about to process. The wordinfo table holds
3458 * the character length of each word; given that we recorded
3459 * at the start: the position of the shortest word and its
3460 * length in chars, we just need to move the pointer the
3461 * difference between the two char lengths. Depending on
3462 * Unicode status and folding, that's cheap or expensive.
3463 *
3464 * This algorithm is optimised for the case where are only a
3465 * small number of accept states, i.e. 0,1, or maybe 2.
3466 * With lots of accepts states, and having to try all of them,
3467 * it becomes quadratic on number of accept states to find all
3468 * the next words.
3469 */
3470
3dab1dad 3471 {
07be1b83 3472 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3473 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3474
3475 /* what trie are we using right now */
be8e71aa 3476 reg_trie_data * const trie
f8fc2ecf 3477 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3478 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3479 U32 state = trie->startstate;
166ba7cd 3480
fab2782b 3481 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3482 if (trie->states[ state ].wordnum) {
3483 DEBUG_EXECUTE_r(
3484 PerlIO_printf(Perl_debug_log,
3485 "%*s %smatched empty string...%s\n",
5bc10b2c 3486 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3487 );
20dbff7c
YO
3488 if (!trie->jump)
3489 break;
3dab1dad
YO
3490 } else {
3491 DEBUG_EXECUTE_r(
3492 PerlIO_printf(Perl_debug_log,
786e8c11 3493 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3494 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3495 );
3496 sayNO_SILENT;
3497 }
3498 }
166ba7cd 3499
786e8c11
YO
3500 {
3501 U8 *uc = ( U8* )locinput;
3502
3503 STRLEN len = 0;
3504 STRLEN foldlen = 0;
3505 U8 *uscan = (U8*)NULL;
786e8c11 3506 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3507 U32 charcount = 0; /* how many input chars we have matched */
3508 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3509
786e8c11 3510 ST.jump = trie->jump;
786e8c11 3511 ST.me = scan;
2e64971a
DM
3512 ST.firstpos = NULL;
3513 ST.longfold = FALSE; /* char longer if folded => it's harder */
3514 ST.nextword = 0;
3515
3516 /* fully traverse the TRIE; note the position of the
3517 shortest accept state and the wordnum of the longest
3518 accept state */
07be1b83 3519
a3621e74 3520 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3521 U32 base = trie->states[ state ].trans.base;
f9f4320a 3522 UV uvc = 0;
acb909b4 3523 U16 charid = 0;
2e64971a
DM
3524 U16 wordnum;
3525 wordnum = trie->states[ state ].wordnum;
3526
3527 if (wordnum) { /* it's an accept state */
3528 if (!accepted) {
3529 accepted = 1;
3530 /* record first match position */
3531 if (ST.longfold) {
3532 ST.firstpos = (U8*)locinput;
3533 ST.firstchars = 0;
5b47454d 3534 }
2e64971a
DM
3535 else {
3536 ST.firstpos = uc;
3537 ST.firstchars = charcount;
3538 }
3539 }
3540 if (!ST.nextword || wordnum < ST.nextword)
3541 ST.nextword = wordnum;
3542 ST.topword = wordnum;
786e8c11 3543 }
a3621e74 3544
07be1b83 3545 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3546 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3547 PerlIO_printf( Perl_debug_log,
2e64971a 3548 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3549 2+depth * 2, "", PL_colors[4],
2e64971a 3550 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3551 });
a3621e74 3552
2e64971a 3553 /* read a char and goto next state */
a3621e74 3554 if ( base ) {
6dd2be57 3555 I32 offset;
55eed653
NC
3556 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3557 uscan, len, uvc, charid, foldlen,
3558 foldbuf, uniflags);
2e64971a
DM
3559 charcount++;
3560 if (foldlen>0)
3561 ST.longfold = TRUE;
5b47454d 3562 if (charid &&
6dd2be57
DM
3563 ( ((offset =
3564 base + charid - 1 - trie->uniquecharcount)) >= 0)
3565
3566 && ((U32)offset < trie->lasttrans)
3567 && trie->trans[offset].check == state)
5b47454d 3568 {
6dd2be57 3569 state = trie->trans[offset].next;
5b47454d
DM
3570 }
3571 else {
3572 state = 0;
3573 }
3574 uc += len;
3575
3576 }
3577 else {
a3621e74
YO
3578 state = 0;
3579 }
3580 DEBUG_TRIE_EXECUTE_r(
e4584336 3581 PerlIO_printf( Perl_debug_log,
786e8c11 3582 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3583 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3584 );
3585 }
2e64971a 3586 if (!accepted)
a3621e74 3587 sayNO;
a3621e74 3588
2e64971a
DM
3589 /* calculate total number of accept states */
3590 {
3591 U16 w = ST.topword;
3592 accepted = 0;
3593 while (w) {
3594 w = trie->wordinfo[w].prev;
3595 accepted++;
3596 }
3597 ST.accepted = accepted;
3598 }
3599
166ba7cd
DM
3600 DEBUG_EXECUTE_r(
3601 PerlIO_printf( Perl_debug_log,
3602 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3603 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3604 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3605 );
2e64971a 3606 goto trie_first_try; /* jump into the fail handler */
786e8c11 3607 }}
118e2215 3608 assert(0); /* NOTREACHED */
2e64971a
DM
3609
3610 case TRIE_next_fail: /* we failed - try next alternative */
fae667d5
YO
3611 if ( ST.jump) {
3612 REGCP_UNWIND(ST.cp);
a8d1f4b4 3613 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3614 }
2e64971a
DM
3615 if (!--ST.accepted) {
3616 DEBUG_EXECUTE_r({
3617 PerlIO_printf( Perl_debug_log,
3618 "%*s %sTRIE failed...%s\n",
3619 REPORT_CODE_OFF+depth*2, "",
3620 PL_colors[4],
3621 PL_colors[5] );
3622 });
3623 sayNO_SILENT;
3624 }
3625 {
3626 /* Find next-highest word to process. Note that this code
3627 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3628 U16 min = 0;
3629 U16 word;
3630 U16 const nextword = ST.nextword;
3631 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3632 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3633 for (word=ST.topword; word; word=wordinfo[word].prev) {
3634 if (word > nextword && (!min || word < min))
3635 min = word;
3636 }
3637 ST.nextword = min;
3638 }
3639
fae667d5 3640 trie_first_try:
5d458dd8
YO
3641 if (do_cutgroup) {
3642 do_cutgroup = 0;
3643 no_final = 0;
3644 }
fae667d5
YO
3645
3646 if ( ST.jump) {
b93070ed 3647 ST.lastparen = rex->lastparen;
f6033a9d 3648 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3649 REGCP_SET(ST.cp);
2e64971a 3650 }
a3621e74 3651
2e64971a 3652 /* find start char of end of current word */
166ba7cd 3653 {
2e64971a
DM
3654 U32 chars; /* how many chars to skip */
3655 U8 *uc = ST.firstpos;
3656 reg_trie_data * const trie
3657 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3658
3659 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3660 >= ST.firstchars);
3661 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3662 - ST.firstchars;
3663
3664 if (ST.longfold) {
3665 /* the hard option - fold each char in turn and find
3666 * its folded length (which may be different */
3667 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3668 STRLEN foldlen;
3669 STRLEN len;
d9a396a3 3670 UV uvc;
2e64971a
DM
3671 U8 *uscan;
3672
3673 while (chars) {
f2ed9b32 3674 if (utf8_target) {
2e64971a
DM
3675 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3676 uniflags);
3677 uc += len;
3678 }
3679 else {
3680 uvc = *uc;
3681 uc++;
3682 }
3683 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3684 uscan = foldbuf;
3685 while (foldlen) {
3686 if (!--chars)
3687 break;
3688 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3689 uniflags);
3690 uscan += len;
3691 foldlen -= len;
3692 }
3693 }
a3621e74 3694 }
2e64971a 3695 else {
f2ed9b32 3696 if (utf8_target)
2e64971a
DM
3697 while (chars--)
3698 uc += UTF8SKIP(uc);
3699 else
3700 uc += chars;
3701 }
3702 PL_reginput = (char *)uc;
3703 }
166ba7cd 3704
6603fe3e
DM
3705 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3706 ? ST.jump[ST.nextword]
3707 : NEXT_OFF(ST.me));
166ba7cd 3708
2e64971a
DM
3709 DEBUG_EXECUTE_r({
3710 PerlIO_printf( Perl_debug_log,
3711 "%*s %sTRIE matched word #%d, continuing%s\n",
3712 REPORT_CODE_OFF+depth*2, "",
3713 PL_colors[4],
3714 ST.nextword,
3715 PL_colors[5]
3716 );
3717 });
3718
3719 if (ST.accepted > 1 || has_cutgroup) {
3720 PUSH_STATE_GOTO(TRIE_next, scan);
118e2215 3721 assert(0); /* NOTREACHED */
166ba7cd 3722 }
2e64971a
DM
3723 /* only one choice left - just continue */
3724 DEBUG_EXECUTE_r({
3725 AV *const trie_words
3726 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3727 SV ** const tmp = av_fetch( trie_words,
3728 ST.nextword-1, 0 );
3729 SV *sv= tmp ? sv_newmortal() : NULL;
3730
3731 PerlIO_printf( Perl_debug_log,
3732 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3733 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3734 ST.nextword,
3735 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3736 PL_colors[0], PL_colors[1],
c89df6cf 3737 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3738 )
3739 : "not compiled under -Dr",
3740 PL_colors[5] );
3741 });
3742
3743 locinput = PL_reginput;
3744 nextchr = UCHARAT(locinput);
3745 continue; /* execute rest of RE */
118e2215 3746 assert(0); /* NOTREACHED */
166ba7cd
DM
3747#undef ST
3748
95b24440
DM
3749 case EXACT: {
3750 char *s = STRING(scan);
24d3c4a9 3751 ln = STR_LEN(scan);
f2ed9b32 3752 if (utf8_target != UTF_PATTERN) {
bc517b45 3753 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3754 char *l = locinput;
24d3c4a9 3755 const char * const e = s + ln;
a72c7584 3756
f2ed9b32 3757 if (utf8_target) {
5ff6fc6d 3758 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 3759 while (s < e) {
a3b680e6 3760 STRLEN ulen;
1aa99e6b 3761 if (l >= PL_regeol)
5ff6fc6d
JH
3762 sayNO;
3763 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 3764 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 3765 uniflags))
5ff6fc6d 3766 sayNO;
bc517b45 3767 l += ulen;
5ff6fc6d 3768 s ++;
1aa99e6b 3769 }
5ff6fc6d
JH
3770 }
3771 else {
3772 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3773 while (s < e) {
a3b680e6 3774 STRLEN ulen;
1aa99e6b
IH
3775 if (l >= PL_regeol)
3776 sayNO;
5ff6fc6d 3777 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 3778 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 3779 uniflags))
1aa99e6b 3780 sayNO;
bc517b45 3781 s += ulen;
a72c7584 3782 l ++;
1aa99e6b 3783 }
5ff6fc6d 3784 }
1aa99e6b
IH
3785 locinput = l;
3786 nextchr = UCHARAT(locinput);
3787 break;
3788 }
bc517b45 3789 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3790 /* Inline the first character, for speed. */
3791 if (UCHARAT(s) != nextchr)
3792 sayNO;
24d3c4a9 3793 if (PL_regeol - locinput < ln)
d6a28714 3794 sayNO;
24d3c4a9 3795 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3796 sayNO;
24d3c4a9 3797 locinput += ln;
d6a28714
JH
3798 nextchr = UCHARAT(locinput);
3799 break;
95b24440 3800 }
9a5a5549 3801 case EXACTFL: {
a932d541 3802 re_fold_t folder;
9a5a5549
KW
3803 const U8 * fold_array;
3804 const char * s;
d513472c 3805 U32 fold_utf8_flags;
9a5a5549 3806
b8c5462f 3807 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3808 folder = foldEQ_locale;
3809 fold_array = PL_fold_locale;
17580e7a 3810 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3811 goto do_exactf;
3812
3c760661 3813 case EXACTFU_SS:
fab2782b 3814 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
3815 case EXACTFU:
3816 folder = foldEQ_latin1;
3817 fold_array = PL_fold_latin1;
2daa8fee 3818 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3819 goto do_exactf;
3820
2f7f8cb1
KW
3821 case EXACTFA:
3822 folder = foldEQ_latin1;
3823 fold_array = PL_fold_latin1;
57014d77 3824 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3825 goto do_exactf;
3826
9a5a5549
KW
3827 case EXACTF:
3828 folder = foldEQ;
3829 fold_array = PL_fold;
62bf7766 3830 fold_utf8_flags = 0;
9a5a5549
KW
3831
3832 do_exactf:
3833 s = STRING(scan);
24d3c4a9 3834 ln = STR_LEN(scan);
d6a28714 3835
3c760661
KW
3836 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3837 /* Either target or the pattern are utf8, or has the issue where
3838 * the fold lengths may differ. */
be8e71aa 3839 const char * const l = locinput;
d07ddd77 3840 char *e = PL_regeol;
bc517b45 3841
d513472c 3842 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3843 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3844 {
3845 sayNO;
5486206c 3846 }
d07ddd77
JH
3847 locinput = e;
3848 nextchr = UCHARAT(locinput);
3849 break;
a0ed51b3 3850 }
d6a28714 3851
0a138b74 3852 /* Neither the target nor the pattern are utf8 */
d6a28714 3853 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3854 UCHARAT(s) != fold_array[nextchr])
3855 {
a0ed51b3 3856 sayNO;
9a5a5549 3857 }
24d3c4a9 3858 if (PL_regeol - locinput < ln)
b8c5462f 3859 sayNO;
9a5a5549 3860 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3861 sayNO;
24d3c4a9 3862 locinput += ln;
d6a28714 3863 nextchr = UCHARAT(locinput);
a0d0e21e 3864 break;
9a5a5549 3865 }
63ac0dad
KW
3866
3867 /* XXX Could improve efficiency by separating these all out using a
3868 * macro or in-line function. At that point regcomp.c would no longer
3869 * have to set the FLAGS fields of these */
b2680017
YO
3870 case BOUNDL:
3871 case NBOUNDL:
3872 PL_reg_flags |= RF_tainted;
3873 /* FALL THROUGH */
3874 case BOUND:
63ac0dad 3875 case BOUNDU:
cfaf538b 3876 case BOUNDA:
b2680017 3877 case NBOUND:
63ac0dad 3878 case NBOUNDU:
cfaf538b 3879 case NBOUNDA:
b2680017 3880 /* was last char in word? */
f2e96b5d
KW
3881 if (utf8_target
3882 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3883 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3884 {
b2680017
YO
3885 if (locinput == PL_bostr)
3886 ln = '\n';
3887 else {
3888 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3889
3890 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3891 }
63ac0dad 3892 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3893 ln = isALNUM_uni(ln);
3894 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3895 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3896 }
3897 else {
3898 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3899 n = isALNUM_LC_utf8((U8*)locinput);
3900 }
3901 }
3902 else {
cfaf538b
KW
3903
3904 /* Here the string isn't utf8, or is utf8 and only ascii
3905 * characters are to match \w. In the latter case looking at
3906 * the byte just prior to the current one may be just the final
3907 * byte of a multi-byte character. This is ok. There are two
3908 * cases:
3909 * 1) it is a single byte character, and then the test is doing
3910 * just what it's supposed to.
3911 * 2) it is a multi-byte character, in which case the final
3912 * byte is never mistakable for ASCII, and so the test
3913 * will say it is not a word character, which is the
3914 * correct answer. */
b2680017
YO
3915 ln = (locinput != PL_bostr) ?
3916 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3917 switch (FLAGS(scan)) {
3918 case REGEX_UNICODE_CHARSET:
3919 ln = isWORDCHAR_L1(ln);
3920 n = isWORDCHAR_L1(nextchr);
3921 break;
3922 case REGEX_LOCALE_CHARSET:
3923 ln = isALNUM_LC(ln);
3924 n = isALNUM_LC(nextchr);
3925 break;
3926 case REGEX_DEPENDS_CHARSET:
3927 ln = isALNUM(ln);
3928 n = isALNUM(nextchr);
3929 break;
cfaf538b 3930 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3931 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3932 ln = isWORDCHAR_A(ln);
3933 n = isWORDCHAR_A(nextchr);
3934 break;
63ac0dad
KW
3935 default:
3936 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3937 break;
b2680017
YO
3938 }
3939 }
63ac0dad
KW
3940 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3941 * regcomp.sym */
3942 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3943 sayNO;
3944 break;
f56b6394 3945 case ANYOFV:
d6a28714 3946 case ANYOF:
f56b6394 3947 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3948 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3949 if (locinput >= PL_regeol)
3950 sayNO;
9e55ce06 3951
f2ed9b32 3952 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3953 sayNO;
b32d7d3e 3954 locinput += inclasslen;
b8c5462f 3955 nextchr = UCHARAT(locinput);
e0f9d4a8 3956 break;
ffc61ed2
JH
3957 }
3958 else {
3959 if (nextchr < 0)
3960 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3961 if (!nextchr && locinput >= PL_regeol)
3962 sayNO;
20ed0b26 3963 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3964 sayNO;
ffc61ed2 3965 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3966 break;
3967 }
b8c5462f 3968 break;
20d0b1e9 3969 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3970 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3971 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3972 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3973 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 3974 alnum, "a");
ee9a90b8
KW
3975
3976 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3977 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3978 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 3979 SPACEA, NSPACEA, isSPACE_A,
779d7b58 3980 space, " ");
ee9a90b8
KW
3981
3982 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3983 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 3984 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 3985 digit, "0");
20d0b1e9 3986
0658cdde
KW
3987 case POSIXA:
3988 if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
3989 sayNO;
3990 }
3991 /* Matched a utf8-invariant, so don't have to worry about utf8 */
3992 nextchr = UCHARAT(++locinput);
3993 break;
3994 case NPOSIXA:
3995 if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
3996 sayNO;
3997 }
3998 if (utf8_target) {
3999 locinput += PL_utf8skip[nextchr];
4000 nextchr = UCHARAT(locinput);
4001 }
4002 else {
4003 nextchr = UCHARAT(++locinput);
4004 }
4005 break;
4006
37e2e78e
KW
4007 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4008 a Unicode extended Grapheme Cluster */
4009 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4010 extended Grapheme Cluster is:
4011
4012 CR LF
4013 | Prepend* Begin Extend*
4014 | .
4015
1e958ea9
KW
4016 Begin is: ( Special_Begin | ! Control )
4017 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4018 Extend is: ( Grapheme_Extend | Spacing_Mark )
4019 Control is: [ GCB_Control CR LF ]
4020 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4021
27d4fc33
KW
4022 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4023 we can rewrite
4024
4025 Begin is ( Regular_Begin + Special Begin )
4026
4027 It turns out that 98.4% of all Unicode code points match
4028 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4029 the previous implementation for almost all Unicode code points.
27d4fc33 4030
37e2e78e
KW
4031 There is a subtlety with Prepend* which showed up in testing.
4032 Note that the Begin, and only the Begin is required in:
4033 | Prepend* Begin Extend*
cc3b396d
KW
4034 Also, Begin contains '! Control'. A Prepend must be a
4035 '! Control', which means it must also be a Begin. What it
4036 comes down to is that if we match Prepend* and then find no
4037 suitable Begin afterwards, that if we backtrack the last
4038 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4039 */
4040
b7c83a7e 4041 if (locinput >= PL_regeol)
a0ed51b3 4042 sayNO;
f2ed9b32 4043 if (! utf8_target) {
37e2e78e
KW
4044
4045 /* Match either CR LF or '.', as all the other possibilities
4046 * require utf8 */
4047 locinput++; /* Match the . or CR */
cc3b396d
KW
4048 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4049 match the LF */
37e2e78e
KW
4050 && locinput < PL_regeol
4051 && UCHARAT(locinput) == '\n') locinput++;
4052 }
4053 else {
4054
4055 /* Utf8: See if is ( CR LF ); already know that locinput <
4056 * PL_regeol, so locinput+1 is in bounds */
4057 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
4058 locinput += 2;
4059 }
4060 else {
4061 /* In case have to backtrack to beginning, then match '.' */
4062 char *starting = locinput;
4063
4064 /* In case have to backtrack the last prepend */
4065 char *previous_prepend = 0;
4066
4067 LOAD_UTF8_CHARCLASS_GCB();
4068
a1853d78
KW
4069 /* Match (prepend)*, but don't bother trying if empty (as
4070 * being set to _undef indicates) */
4071 if (PL_utf8_X_prepend != &PL_sv_undef) {
72167004
KW
4072 while (locinput < PL_regeol
4073 && swash_fetch(PL_utf8_X_prepend,
4074 (U8*)locinput, utf8_target))
4075 {
4076 previous_prepend = locinput;
4077 locinput += UTF8SKIP(locinput);
4078 }
a1853d78 4079 }
37e2e78e
KW
4080
4081 /* As noted above, if we matched a prepend character, but
4082 * the next thing won't match, back off the last prepend we
4083 * matched, as it is guaranteed to match the begin */
4084 if (previous_prepend
4085 && (locinput >= PL_regeol
c101f46d
KW
4086 || (! swash_fetch(PL_utf8_X_regular_begin,
4087 (U8*)locinput, utf8_target)
4088 && ! swash_fetch(PL_utf8_X_special_begin,
f2ed9b32 4089 (U8*)locinput, utf8_target)))
c101f46d 4090 )
37e2e78e
KW
4091 {
4092 locinput = previous_prepend;
4093 }
4094
4095 /* Note that here we know PL_regeol > locinput, as we
4096 * tested that upon input to this switch case, and if we
4097 * moved locinput forward, we tested the result just above
4098 * and it either passed, or we backed off so that it will
4099 * now pass */
11dfcd49
KW
4100 if (swash_fetch(PL_utf8_X_regular_begin,
4101 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4102 locinput += UTF8SKIP(locinput);
4103 }
4104 else if (! swash_fetch(PL_utf8_X_special_begin,
4105 (U8*)locinput, utf8_target))
4106 {
37e2e78e
KW
4107
4108 /* Here did not match the required 'Begin' in the
4109 * second term. So just match the very first
4110 * character, the '.' of the final term of the regex */
4111 locinput = starting + UTF8SKIP(starting);
27d4fc33 4112 goto exit_utf8;
37e2e78e
KW
4113 } else {
4114
11dfcd49
KW
4115 /* Here is a special begin. It can be composed of
4116 * several individual characters. One possibility is
4117 * RI+ */
4118 if (swash_fetch(PL_utf8_X_RI,
4119 (U8*)locinput, utf8_target))
4120 {
4121 locinput += UTF8SKIP(locinput);
4122 while (locinput < PL_regeol
4123 && swash_fetch(PL_utf8_X_RI,
4124 (U8*)locinput, utf8_target))
4125 {
cd94d768 4126 locinput += UTF8SKIP(locinput);
11dfcd49
KW
4127 }
4128 } else /* Another possibility is T+ */
4129 if (swash_fetch(PL_utf8_X_T,
4130 (U8*)locinput, utf8_target))
4131 {
4132 locinput += UTF8SKIP(locinput);
4133 while (locinput < PL_regeol
4134 && swash_fetch(PL_utf8_X_T,
4135 (U8*)locinput, utf8_target))
4136 {
cd94d768 4137 locinput += UTF8SKIP(locinput);
11dfcd49
KW
4138 }
4139 } else {
4140
4141 /* Here, neither RI+ nor T+; must be some other
4142 * Hangul. That means it is one of the others: L,
4143 * LV, LVT or V, and matches:
4144 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4145
4146 /* Match L* */
4147 while (locinput < PL_regeol
4148 && swash_fetch(PL_utf8_X_L,
4149 (U8*)locinput, utf8_target))
4150 {
4151 locinput += UTF8SKIP(locinput);
4152 }
37e2e78e 4153
11dfcd49
KW
4154 /* Here, have exhausted L*. If the next character
4155 * is not an LV, LVT nor V, it means we had to have
4156 * at least one L, so matches L+ in the original
4157 * equation, we have a complete hangul syllable.
4158 * Are done. */
4159
4160 if (locinput < PL_regeol
4161 && swash_fetch(PL_utf8_X_LV_LVT_V,
4162 (U8*)locinput, utf8_target))
4163 {
4164
4165 /* Otherwise keep going. Must be LV, LVT or V.
4166 * See if LVT */
4167 if (is_utf8_X_LVT((U8*)locinput)) {
4168 locinput += UTF8SKIP(locinput);
4169 } else {
4170
4171 /* Must be V or LV. Take it, then match
4172 * V* */
4173 locinput += UTF8SKIP(locinput);
4174 while (locinput < PL_regeol
4175 && swash_fetch(PL_utf8_X_V,
4176 (U8*)locinput,
4177 utf8_target))
4178 {
4179 locinput += UTF8SKIP(locinput);
4180 }
4181 }
37e2e78e 4182
11dfcd49
KW
4183 /* And any of LV, LVT, or V can be followed
4184 * by T* */
4185 while (locinput < PL_regeol
4186 && swash_fetch(PL_utf8_X_T,
4187 (U8*)locinput,
4188 utf8_target))
4189 {
4190 locinput += UTF8SKIP(locinput);
4191 }
4192 }
cd94d768 4193 }
11dfcd49 4194 }
37e2e78e 4195
11dfcd49
KW
4196 /* Match any extender */
4197 while (locinput < PL_regeol
4198 && swash_fetch(PL_utf8_X_extend,
4199 (U8*)locinput, utf8_target))
4200 {
4201 locinput += UTF8SKIP(locinput);
4202 }
37e2e78e 4203 }
27d4fc33 4204 exit_utf8:
37e2e78e
KW
4205 if (locinput > PL_regeol) sayNO;
4206 }
a0ed51b3
LW
4207 nextchr = UCHARAT(locinput);
4208 break;
81714fb9
YO
4209
4210 case NREFFL:
d7ef4b73
KW
4211 { /* The capture buffer cases. The ones beginning with N for the
4212 named buffers just convert to the equivalent numbered and
4213 pretend they were called as the corresponding numbered buffer
4214 op. */
26ecd678
TC
4215 /* don't initialize these in the declaration, it makes C++
4216 unhappy */
81714fb9 4217 char *s;
ff1157ca 4218 char type;
8368298a
TC
4219 re_fold_t folder;
4220 const U8 *fold_array;
26ecd678 4221 UV utf8_fold_flags;
8368298a 4222
81714fb9 4223 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4224 folder = foldEQ_locale;
4225 fold_array = PL_fold_locale;
4226 type = REFFL;
17580e7a 4227 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4228 goto do_nref;
4229
2f7f8cb1
KW
4230 case NREFFA:
4231 folder = foldEQ_latin1;
4232 fold_array = PL_fold_latin1;
4233 type = REFFA;
4234 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4235 goto do_nref;
4236
d7ef4b73
KW
4237 case NREFFU:
4238 folder = foldEQ_latin1;
4239 fold_array = PL_fold_latin1;
4240 type = REFFU;
d513472c 4241 utf8_fold_flags = 0;
d7ef4b73
KW
4242 goto do_nref;
4243
81714fb9 4244 case NREFF:
d7ef4b73
KW
4245 folder = foldEQ;
4246 fold_array = PL_fold;
4247 type = REFF;
d513472c 4248 utf8_fold_flags = 0;
d7ef4b73
KW
4249 goto do_nref;
4250
4251 case NREF:
4252 type = REF;
83d7b90b
KW
4253 folder = NULL;
4254 fold_array = NULL;
d513472c 4255 utf8_fold_flags = 0;
d7ef4b73
KW
4256 do_nref:
4257
4258 /* For the named back references, find the corresponding buffer
4259 * number */
0a4db386
YO
4260 n = reg_check_named_buff_matched(rex,scan);
4261
d7ef4b73 4262 if ( ! n ) {
81714fb9 4263 sayNO;
d7ef4b73
KW
4264 }
4265 goto do_nref_ref_common;
4266
c8756f30 4267 case REFFL:
3280af22 4268 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4269 folder = foldEQ_locale;
4270 fold_array = PL_fold_locale;
17580e7a 4271 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4272 goto do_ref;
4273
2f7f8cb1
KW
4274 case REFFA:
4275 folder = foldEQ_latin1;
4276 fold_array = PL_fold_latin1;
4277 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4278 goto do_ref;
4279
d7ef4b73
KW
4280 case REFFU:
4281 folder = foldEQ_latin1;
4282 fold_array = PL_fold_latin1;
d513472c 4283 utf8_fold_flags = 0;
d7ef4b73
KW
4284 goto do_ref;
4285
4286 case REFF:
4287 folder = foldEQ;
4288 fold_array = PL_fold;
d513472c 4289 utf8_fold_flags = 0;
83d7b90b 4290 goto do_ref;
d7ef4b73 4291
c277df42 4292 case REF:
83d7b90b
KW
4293 folder = NULL;
4294 fold_array = NULL;
d513472c 4295 utf8_fold_flags = 0;
83d7b90b 4296
d7ef4b73 4297 do_ref:
81714fb9 4298 type = OP(scan);
d7ef4b73
KW
4299 n = ARG(scan); /* which paren pair */
4300
4301 do_nref_ref_common:
b93070ed 4302 ln = rex->offs[n].start;
2c2d71f5 4303 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4304 if (rex->lastparen < n || ln == -1)
af3f8c16 4305 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4306 if (ln == rex->offs[n].end)
a0d0e21e 4307 break;
a0ed51b3 4308
24d3c4a9 4309 s = PL_bostr + ln;
d7ef4b73 4310 if (type != REF /* REF can do byte comparison */
2f65c56d 4311 && (utf8_target || type == REFFU))
d7ef4b73
KW
4312 { /* XXX handle REFFL better */
4313 char * limit = PL_regeol;
4314
4315 /* This call case insensitively compares the entire buffer
4316 * at s, with the current input starting at locinput, but
4317 * not going off the end given by PL_regeol, and returns in
4318 * limit upon success, how much of the current input was
4319 * matched */
b93070ed 4320 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4321 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4322 {
4323 sayNO;
a0ed51b3 4324 }
d7ef4b73 4325 locinput = limit;
a0ed51b3
LW
4326 nextchr = UCHARAT(locinput);
4327 break;
4328 }
4329
d7ef4b73 4330 /* Not utf8: Inline the first character, for speed. */
76e3520e 4331 if (UCHARAT(s) != nextchr &&
81714fb9 4332 (type == REF ||
d7ef4b73 4333 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4334 sayNO;
b93070ed 4335 ln = rex->offs[n].end - ln;
24d3c4a9 4336 if (locinput + ln > PL_regeol)
4633a7c4 4337 sayNO;
81714fb9 4338 if (ln > 1 && (type == REF
24d3c4a9 4339 ? memNE(s, locinput, ln)
d7ef4b73 4340 : ! folder(s, locinput, ln)))
4633a7c4 4341 sayNO;
24d3c4a9 4342 locinput += ln;
76e3520e 4343 nextchr = UCHARAT(locinput);
a0d0e21e 4344 break;
81714fb9 4345 }
a0d0e21e 4346 case NOTHING:
c277df42 4347 case TAIL:
a0d0e21e
LW
4348 break;
4349 case BACK:
4350 break;
40a82448
DM
4351
4352#undef ST
4353#define ST st->u.eval
c277df42 4354 {
c277df42 4355 SV *ret;
d2f13c59 4356 REGEXP *re_sv;
6bda09f9 4357 regexp *re;
f8fc2ecf 4358 regexp_internal *rei;
1a147d38
YO
4359 regnode *startpoint;
4360
4361 case GOSTART:
e7707071
YO
4362 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4363 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4364 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4365 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4366 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4367 Perl_croak(aTHX_
4368 "Pattern subroutine nesting without pos change"
4369 " exceeded limit in regex");
6bda09f9
YO
4370 } else {
4371 nochange_depth = 0;
1a147d38 4372 }
288b8c02 4373 re_sv = rex_sv;
6bda09f9 4374 re = rex;
f8fc2ecf 4375 rei = rexi;
1a147d38 4376 if (OP(scan)==GOSUB) {
6bda09f9
YO
4377 startpoint = scan + ARG2L(scan);
4378 ST.close_paren = ARG(scan);
4379 } else {
f8fc2ecf 4380 startpoint = rei->program+1;
6bda09f9
YO
4381 ST.close_paren = 0;
4382 }
4383 goto eval_recurse_doit;
118e2215 4384 assert(0); /* NOTREACHED */
6bda09f9
YO
4385 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4386 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4387 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4388 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4389 } else {
4390 nochange_depth = 0;
4391 }
8e5e9ebe 4392 {
4aabdb9b 4393 /* execute the code in the {...} */
81ed78b2 4394
4aabdb9b 4395 dSP;
81ed78b2 4396 SV ** before;
1f4d1a1e 4397 OP * const oop = PL_op;
4aabdb9b 4398 COP * const ocurcop = PL_curcop;
81ed78b2 4399 OP *nop;
d80618d2 4400 char *saved_regeol = PL_regeol;
91332126 4401 struct re_save_state saved_state;
81ed78b2 4402 CV *newcv;
91332126 4403
74088413
DM
4404 /* save *all* paren positions */
4405 regcppush(rex, 0);
4406 REGCP_SET(runops_cp);
4407
6562f1c4 4408 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4409 * eval we would normally put it on the save stack, like with
4410 * save_re_context. However, re-evals have a weird scoping so we
4411 * can't just add ENTER/LEAVE here. With that, things like
4412 *
4413 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4414 *
4415 * would break, as they expect the localisation to be unwound
4416 * only when the re-engine backtracks through the bit that
4417 * localised it.
4418 *
4419 * What we do instead is just saving the state in a local c
4420 * variable.
4421 */
91332126 4422 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4423
d24ca0c5 4424 PL_reg_state.re_reparsing = FALSE;
91332126 4425
81ed78b2
DM
4426 if (!caller_cv)
4427 caller_cv = find_runcv(NULL);
4428
4aabdb9b 4429 n = ARG(scan);
81ed78b2 4430
b30fcab9 4431 if (rexi->data->what[n] == 'r') { /* code from an external qr */
81ed78b2 4432 newcv = ((struct regexp *)SvANY(
b30fcab9
DM
4433 (REGEXP*)(rexi->data->data[n])
4434 ))->qr_anoncv
81ed78b2
DM
4435 ;
4436 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4437 }
4438 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4439 newcv = caller_cv;
4440 nop = (OP*)rexi->data->data[n];
4441 assert(CvDEPTH(newcv));
68e2671b
DM
4442 }
4443 else {
d24ca0c5
DM
4444 /* literal with own CV */
4445 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4446 newcv = rex->qr_anoncv;
4447 nop = (OP*)rexi->data->data[n];
68e2671b 4448 }
81ed78b2 4449
0e458318
DM
4450 /* normally if we're about to execute code from the same
4451 * CV that we used previously, we just use the existing
4452 * CX stack entry. However, its possible that in the
4453 * meantime we may have backtracked, popped from the save
4454 * stack, and undone the SAVECOMPPAD(s) associated with
4455 * PUSH_MULTICALL; in which case PL_comppad no longer
4456 * points to newcv's pad. */
4457 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4458 {
4459 I32 depth = (newcv == caller_cv) ? 0 : 1;
4460 if (last_pushed_cv) {
4461 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4462 }
4463 else {
4464 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4465 }
4466 last_pushed_cv = newcv;
4467 }
4468 last_pad = PL_comppad;
4469
2e2e3f36
DM
4470 /* the initial nextstate you would normally execute
4471 * at the start of an eval (which would cause error
4472 * messages to come from the eval), may be optimised
4473 * away from the execution path in the regex code blocks;
4474 * so manually set PL_curcop to it initially */
4475 {
81ed78b2 4476 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4477 assert(o->op_type == OP_NULL);
4478 if (o->op_targ == OP_SCOPE) {
4479 o = cUNOPo->op_first;
4480 }
4481 else {
4482 assert(o->op_targ == OP_LEAVE);
4483 o = cUNOPo->op_first;
4484 assert(o->op_type == OP_ENTER);
4485 o = o->op_sibling;
4486 }
4487
4488 if (o->op_type != OP_STUB) {
4489 assert( o->op_type == OP_NEXTSTATE
4490 || o->op_type == OP_DBSTATE
4491 || (o->op_type == OP_NULL
4492 && ( o->op_targ == OP_NEXTSTATE
4493 || o->op_targ == OP_DBSTATE
4494 )
4495 )
4496 );
4497 PL_curcop = (COP*)o;
4498 }
4499 }
81ed78b2 4500 nop = nop->op_next;
2e2e3f36 4501
24b23f37 4502 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4503 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4504
b93070ed 4505 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4506
2bf803e2
YO
4507 if (sv_yes_mark) {
4508 SV *sv_mrk = get_sv("REGMARK", 1);
4509 sv_setsv(sv_mrk, sv_yes_mark);
4510 }
4511
81ed78b2
DM
4512 /* we don't use MULTICALL here as we want to call the
4513 * first op of the block of interest, rather than the
4514 * first op of the sub */
4515 before = SP;
4516 PL_op = nop;
8e5e9ebe
RGS
4517 CALLRUNOPS(aTHX); /* Scalar context. */
4518 SPAGAIN;
4519 if (SP == before)
075aa684 4520 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4521 else {
4522 ret = POPs;
4523 PUTBACK;
4524 }
4aabdb9b 4525
e4bfbed3
DM
4526 /* before restoring everything, evaluate the returned
4527 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4528 * PL_op or pad. Also need to process any magic vars
4529 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4530
4531 PL_op = NULL;
4532
5e98dac2 4533 re_sv = NULL;
e4bfbed3
DM
4534 if (logical == 0) /* (?{})/ */
4535 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4536 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4537 sw = cBOOL(SvTRUE(ret));
4538 logical = 0;
4539 }
4540 else { /* /(??{}) */
497d0a96
DM
4541 /* if its overloaded, let the regex compiler handle
4542 * it; otherwise extract regex, or stringify */
4543 if (!SvAMAGIC(ret)) {
4544 SV *sv = ret;
4545 if (SvROK(sv))
4546 sv = SvRV(sv);
4547 if (SvTYPE(sv) == SVt_REGEXP)
4548 re_sv = (REGEXP*) sv;
4549 else if (SvSMAGICAL(sv)) {
4550 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4551 if (mg)
4552 re_sv = (REGEXP *) mg->mg_obj;
4553 }
e4bfbed3 4554
497d0a96
DM
4555 /* force any magic, undef warnings here */
4556 if (!re_sv) {
4557 ret = sv_mortalcopy(ret);
4558 (void) SvPV_force_nolen(ret);
4559 }
e4bfbed3
DM
4560 }
4561
4562 }
4563
91332126
FR
4564 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4565
81ed78b2
DM
4566 /* *** Note that at this point we don't restore
4567 * PL_comppad, (or pop the CxSUB) on the assumption it may
4568 * be used again soon. This is safe as long as nothing
4569 * in the regexp code uses the pad ! */
4aabdb9b 4570 PL_op = oop;
4aabdb9b 4571 PL_curcop = ocurcop;
d80618d2 4572 PL_regeol = saved_regeol;
e4bfbed3
DM
4573 S_regcp_restore(aTHX_ rex, runops_cp);
4574
4575 if (logical != 2)
4aabdb9b 4576 break;
8e5e9ebe 4577 }
e4bfbed3
DM
4578
4579 /* only /(??{})/ from now on */
24d3c4a9 4580 logical = 0;
4aabdb9b 4581 {
4f639d21
DM
4582 /* extract RE object from returned value; compiling if
4583 * necessary */
5c35adbb 4584
575c37f6
DM
4585 if (re_sv) {
4586 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4587 }
0f5d15d6 4588 else {
c737faaf 4589 U32 pm_flags = 0;
a3b680e6 4590 const I32 osize = PL_regsize;
0f5d15d6 4591
9753d940
DM
4592 if (SvUTF8(ret) && IN_BYTES) {
4593 /* In use 'bytes': make a copy of the octet
4594 * sequence, but without the flag on */
b9ad30b4
NC
4595 STRLEN len;
4596 const char *const p = SvPV(ret, len);
4597 ret = newSVpvn_flags(p, len, SVs_TEMP);
4598 }
732caac7
DM
4599 if (rex->intflags & PREGf_USE_RE_EVAL)
4600 pm_flags |= PMf_USE_RE_EVAL;
4601
4602 /* if we got here, it should be an engine which
4603 * supports compiling code blocks and stuff */
4604 assert(rex->engine && rex->engine->op_comp);
ec841a27 4605 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4606 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4607 rex->engine, NULL, NULL,
4608 /* copy /msix etc to inner pattern */
4609 scan->flags,
4610 pm_flags);
732caac7 4611
9041c2e3 4612 if (!(SvFLAGS(ret)
faf82a0b 4613 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4614 | SVs_GMG))) {
a2794585
NC
4615 /* This isn't a first class regexp. Instead, it's
4616 caching a regexp onto an existing, Perl visible
4617 scalar. */
575c37f6 4618 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4619 }
0f5d15d6 4620 PL_regsize = osize;
74088413
DM
4621 /* safe to do now that any $1 etc has been
4622 * interpolated into the new pattern string and
4623 * compiled */
4624 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4625 }
575c37f6 4626 re = (struct regexp *)SvANY(re_sv);
4aabdb9b 4627 }
07bc277f 4628 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4629 re->subbeg = rex->subbeg;
4630 re->sublen = rex->sublen;
6502e081
DM
4631 re->suboffset = rex->suboffset;
4632 re->subcoffset = rex->subcoffset;
f8fc2ecf 4633 rei = RXi_GET(re);
6bda09f9 4634 DEBUG_EXECUTE_r(
f2ed9b32 4635 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4636 "Matching embedded");
4637 );
f8fc2ecf 4638 startpoint = rei->program + 1;
1a147d38 4639 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4640
1a147d38 4641 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4642 /* run the pattern returned from (??{...}) */
b93070ed 4643 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4644 REGCP_SET(ST.lastcp);
6bda09f9 4645
0357f1fd
ML
4646 re->lastparen = 0;
4647 re->lastcloseparen = 0;
4648
4aabdb9b 4649 PL_reginput = locinput;
ae0beba1 4650 PL_regsize = 0;
4aabdb9b
DM
4651
4652 /* XXXX This is too dramatic a measure... */
4653 PL_reg_maxiter = 0;
4654
faec1544 4655 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4656 if (RX_UTF8(re_sv))
faec1544
DM
4657 PL_reg_flags |= RF_utf8;
4658 else
4659 PL_reg_flags &= ~RF_utf8;
4660 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4661
288b8c02 4662 ST.prev_rex = rex_sv;
faec1544 4663 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
4664 rex_sv = re_sv;
4665 SET_reg_curpm(rex_sv);
288b8c02 4666 rex = re;
f8fc2ecf 4667 rexi = rei;
faec1544 4668 cur_curlyx = NULL;
40a82448 4669 ST.B = next;
faec1544
DM
4670 ST.prev_eval = cur_eval;
4671 cur_eval = st;
faec1544 4672 /* now continue from first node in postoned RE */
6bda09f9 4673 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
118e2215 4674 assert(0); /* NOTREACHED */
c277df42 4675 }
40a82448 4676
faec1544
DM
4677 case EVAL_AB: /* cleanup after a successful (??{A})B */
4678 /* note: this is called twice; first after popping B, then A */
4679 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4680 rex_sv = ST.prev_rex;
4681 SET_reg_curpm(rex_sv);
288b8c02 4682 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4683 rexi = RXi_GET(rex);
faec1544
DM
4684 regcpblow(ST.cp);
4685 cur_eval = ST.prev_eval;
4686 cur_curlyx = ST.prev_curlyx;
34a81e2b 4687
40a82448
DM
4688 /* XXXX This is too dramatic a measure... */
4689 PL_reg_maxiter = 0;
e7707071 4690 if ( nochange_depth )
4b196cd4 4691 nochange_depth--;
262b90c4 4692 sayYES;
40a82448 4693
40a82448 4694
faec1544
DM
4695 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4696 /* note: this is called twice; first after popping B, then A */
4697 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4698 rex_sv = ST.prev_rex;
4699 SET_reg_curpm(rex_sv);
288b8c02 4700 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4701 rexi = RXi_GET(rex);
0357f1fd 4702
40a82448
DM
4703 PL_reginput = locinput;
4704 REGCP_UNWIND(ST.lastcp);
4705 regcppop(rex);
faec1544
DM
4706 cur_eval = ST.prev_eval;
4707 cur_curlyx = ST.prev_curlyx;
4708 /* XXXX This is too dramatic a measure... */
4709 PL_reg_maxiter = 0;
e7707071 4710 if ( nochange_depth )
4b196cd4 4711 nochange_depth--;
40a82448 4712 sayNO_SILENT;
40a82448
DM
4713#undef ST
4714
a0d0e21e 4715 case OPEN:
c277df42 4716 n = ARG(scan); /* which paren pair */
1ca2007e 4717 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
4718 if (n > PL_regsize)
4719 PL_regsize = n;
495f47a5
DM
4720 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4721 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4722 PTR2UV(rex),
4723 PTR2UV(rex->offs),
4724 (UV)n,
4725 (IV)rex->offs[n].start_tmp,
4726 (UV)PL_regsize
4727 ));
e2e6a0f1 4728 lastopen = n;
a0d0e21e 4729 break;
495f47a5
DM
4730
4731/* XXX really need to log other places start/end are set too */
4732#define CLOSE_CAPTURE \
4733 rex->offs[n].start = rex->offs[n].start_tmp; \
4734 rex->offs[n].end = locinput - PL_bostr; \
4735 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4736 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4737 PTR2UV(rex), \
4738 PTR2UV(rex->offs), \
4739 (UV)n, \
4740 (IV)rex->offs[n].start, \
4741 (IV)rex->offs[n].end \
4742 ))
4743
a0d0e21e 4744 case CLOSE:
c277df42 4745 n = ARG(scan); /* which paren pair */
495f47a5 4746 CLOSE_CAPTURE;
7f69552c
YO
4747 /*if (n > PL_regsize)
4748 PL_regsize = n;*/
b93070ed
DM
4749 if (n > rex->lastparen)
4750 rex->lastparen = n;
4751 rex->lastcloseparen = n;
3b6647e0 4752 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4753 goto fake_end;
4754 }
a0d0e21e 4755 break;
e2e6a0f1
YO
4756 case ACCEPT:
4757 if (ARG(scan)){
4758 regnode *cursor;
4759 for (cursor=scan;
4760 cursor && OP(cursor)!=END;
4761 cursor=regnext(cursor))
4762 {
4763 if ( OP(cursor)==CLOSE ){
4764 n = ARG(cursor);
4765 if ( n <= lastopen ) {
495f47a5 4766 CLOSE_CAPTURE;
e2e6a0f1
YO
4767 /*if (n > PL_regsize)
4768 PL_regsize = n;*/
b93070ed
DM
4769 if (n > rex->lastparen)
4770 rex->lastparen = n;
4771 rex->lastcloseparen = n;
3b6647e0
RB
4772 if ( n == ARG(scan) || (cur_eval &&
4773 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4774 break;
4775 }
4776 }
4777 }
4778 }
4779 goto fake_end;
4780 /*NOTREACHED*/
c277df42
IZ
4781 case GROUPP:
4782 n = ARG(scan); /* which paren pair */
b93070ed 4783 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 4784 break;
0a4db386
YO
4785 case NGROUPP:
4786 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4787 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4788 break;
1a147d38 4789 case INSUBP:
0a4db386 4790 n = ARG(scan);
3b6647e0 4791 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4792 break;
4793 case DEFINEP:
4794 sw = 0;
4795 break;
c277df42 4796 case IFTHEN:
2c2d71f5 4797 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4798 if (sw)
c277df42
IZ
4799 next = NEXTOPER(NEXTOPER(scan));
4800 else {
4801 next = scan + ARG(scan);
4802 if (OP(next) == IFTHEN) /* Fake one. */
4803 next = NEXTOPER(NEXTOPER(next));
4804 }
4805 break;
4806 case LOGICAL:
24d3c4a9 4807 logical = scan->flags;
c277df42 4808 break;
c476f425 4809
2ab05381 4810/*******************************************************************
2ab05381 4811
c476f425
DM
4812The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4813pattern, where A and B are subpatterns. (For simple A, CURLYM or
4814STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4815
c476f425 4816A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4817
c476f425
DM
4818On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4819state, which contains the current count, initialised to -1. It also sets
4820cur_curlyx to point to this state, with any previous value saved in the
4821state block.
2ab05381 4822
c476f425
DM
4823CURLYX then jumps straight to the WHILEM op, rather than executing A,
4824since the pattern may possibly match zero times (i.e. it's a while {} loop
4825rather than a do {} while loop).
2ab05381 4826
c476f425
DM
4827Each entry to WHILEM represents a successful match of A. The count in the
4828CURLYX block is incremented, another WHILEM state is pushed, and execution
4829passes to A or B depending on greediness and the current count.
2ab05381 4830
c476f425
DM
4831For example, if matching against the string a1a2a3b (where the aN are
4832substrings that match /A/), then the match progresses as follows: (the
4833pushed states are interspersed with the bits of strings matched so far):
2ab05381 4834
c476f425
DM
4835 <CURLYX cnt=-1>
4836 <CURLYX cnt=0><WHILEM>
4837 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4838 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4839 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4840 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4841
c476f425
DM
4842(Contrast this with something like CURLYM, which maintains only a single
4843backtrack state:
2ab05381 4844
c476f425
DM
4845 <CURLYM cnt=0> a1
4846 a1 <CURLYM cnt=1> a2
4847 a1 a2 <CURLYM cnt=2> a3
4848 a1 a2 a3 <CURLYM cnt=3> b
4849)
2ab05381 4850
c476f425
DM
4851Each WHILEM state block marks a point to backtrack to upon partial failure
4852of A or B, and also contains some minor state data related to that
4853iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4854overall state, such as the count, and pointers to the A and B ops.
2ab05381 4855
c476f425
DM
4856This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4857must always point to the *current* CURLYX block, the rules are:
2ab05381 4858
c476f425
DM
4859When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4860and set cur_curlyx to point the new block.
2ab05381 4861
c476f425
DM
4862When popping the CURLYX block after a successful or unsuccessful match,
4863restore the previous cur_curlyx.
2ab05381 4864
c476f425
DM
4865When WHILEM is about to execute B, save the current cur_curlyx, and set it
4866to the outer one saved in the CURLYX block.
2ab05381 4867
c476f425
DM
4868When popping the WHILEM block after a successful or unsuccessful B match,
4869restore the previous cur_curlyx.
2ab05381 4870
c476f425
DM
4871Here's an example for the pattern (AI* BI)*BO
4872I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4873
c476f425
DM
4874cur_
4875curlyx backtrack stack
4876------ ---------------
4877NULL
4878CO <CO prev=NULL> <WO>
4879CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4880CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4881NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4882
c476f425
DM
4883At this point the pattern succeeds, and we work back down the stack to
4884clean up, restoring as we go:
95b24440 4885
c476f425
DM
4886CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4887CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4888CO <CO prev=NULL> <WO>
4889NULL
a0374537 4890
c476f425
DM
4891*******************************************************************/
4892
4893#define ST st->u.curlyx
4894
4895 case CURLYX: /* start of /A*B/ (for complex A) */
4896 {
4897 /* No need to save/restore up to this paren */
4898 I32 parenfloor = scan->flags;
4899
4900 assert(next); /* keep Coverity happy */
4901 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4902 next += ARG(next);
4903
4904 /* XXXX Probably it is better to teach regpush to support
4905 parenfloor > PL_regsize... */
b93070ed
DM
4906 if (parenfloor > (I32)rex->lastparen)
4907 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
4908
4909 ST.prev_curlyx= cur_curlyx;
4910 cur_curlyx = st;
4911 ST.cp = PL_savestack_ix;
4912
4913 /* these fields contain the state of the current curly.
4914 * they are accessed by subsequent WHILEMs */
4915 ST.parenfloor = parenfloor;
d02d6d97 4916 ST.me = scan;
c476f425 4917 ST.B = next;
24d3c4a9
DM
4918 ST.minmod = minmod;
4919 minmod = 0;
c476f425
DM
4920 ST.count = -1; /* this will be updated by WHILEM */
4921 ST.lastloc = NULL; /* this will be updated by WHILEM */
4922
4923 PL_reginput = locinput;
4924 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
118e2215 4925 assert(0); /* NOTREACHED */
c476f425 4926 }
a0d0e21e 4927
c476f425 4928 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4929 cur_curlyx = ST.prev_curlyx;
4930 sayYES;
118e2215 4931 assert(0); /* NOTREACHED */
a0d0e21e 4932
c476f425
DM
4933 case CURLYX_end_fail: /* just failed to match all of A*B */
4934 regcpblow(ST.cp);
4935 cur_curlyx = ST.prev_curlyx;
4936 sayNO;
118e2215 4937 assert(0); /* NOTREACHED */
4633a7c4 4938
a0d0e21e 4939
c476f425
DM
4940#undef ST
4941#define ST st->u.whilem
4942
4943 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4944 {
4945 /* see the discussion above about CURLYX/WHILEM */
c476f425 4946 I32 n;
d02d6d97
DM
4947 int min = ARG1(cur_curlyx->u.curlyx.me);
4948 int max = ARG2(cur_curlyx->u.curlyx.me);
4949 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4950
c476f425
DM
4951 assert(cur_curlyx); /* keep Coverity happy */
4952 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4953 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4954 ST.cache_offset = 0;
4955 ST.cache_mask = 0;
4956
4957 PL_reginput = locinput;
4958
4959 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4960 "%*s whilem: matched %ld out of %d..%d\n",
4961 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4962 );
a0d0e21e 4963
c476f425 4964 /* First just match a string of min A's. */
a0d0e21e 4965
d02d6d97 4966 if (n < min) {
b93070ed 4967 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 4968 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4969 REGCP_SET(ST.lastcp);
4970
d02d6d97 4971 PUSH_STATE_GOTO(WHILEM_A_pre, A);
118e2215 4972 assert(0); /* NOTREACHED */
c476f425
DM
4973 }
4974
4975 /* If degenerate A matches "", assume A done. */
4976
4977 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4978 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4979 "%*s whilem: empty match detected, trying continuation...\n",
4980 REPORT_CODE_OFF+depth*2, "")
4981 );
4982 goto do_whilem_B_max;
4983 }
4984
4985 /* super-linear cache processing */
4986
4987 if (scan->flags) {
a0d0e21e 4988
2c2d71f5 4989 if (!PL_reg_maxiter) {
c476f425
DM
4990 /* start the countdown: Postpone detection until we
4991 * know the match is not *that* much linear. */
2c2d71f5 4992 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4993 /* possible overflow for long strings and many CURLYX's */
4994 if (PL_reg_maxiter < 0)
4995 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
4996 PL_reg_leftiter = PL_reg_maxiter;
4997 }
c476f425 4998
2c2d71f5 4999 if (PL_reg_leftiter-- == 0) {
c476f425 5000 /* initialise cache */
3298f257 5001 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5002 if (PL_reg_poscache) {
eb160463 5003 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5004 Renew(PL_reg_poscache, size, char);
5005 PL_reg_poscache_size = size;
5006 }
5007 Zero(PL_reg_poscache, size, char);
5008 }
5009 else {
5010 PL_reg_poscache_size = size;
a02a5408 5011 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5012 }
c476f425
DM
5013 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5014 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5015 PL_colors[4], PL_colors[5])
5016 );
2c2d71f5 5017 }
c476f425 5018
2c2d71f5 5019 if (PL_reg_leftiter < 0) {
c476f425
DM
5020 /* have we already failed at this position? */
5021 I32 offset, mask;
5022 offset = (scan->flags & 0xf) - 1
5023 + (locinput - PL_bostr) * (scan->flags>>4);
5024 mask = 1 << (offset % 8);
5025 offset /= 8;
5026 if (PL_reg_poscache[offset] & mask) {
5027 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5028 "%*s whilem: (cache) already tried at this position...\n",
5029 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5030 );
3298f257 5031 sayNO; /* cache records failure */
2c2d71f5 5032 }
c476f425
DM
5033 ST.cache_offset = offset;
5034 ST.cache_mask = mask;
2c2d71f5 5035 }
c476f425 5036 }
2c2d71f5 5037
c476f425 5038 /* Prefer B over A for minimal matching. */
a687059c 5039
c476f425
DM
5040 if (cur_curlyx->u.curlyx.minmod) {
5041 ST.save_curlyx = cur_curlyx;
5042 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 5043 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425
DM
5044 REGCP_SET(ST.lastcp);
5045 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
118e2215 5046 assert(0); /* NOTREACHED */
c476f425 5047 }
a0d0e21e 5048
c476f425
DM
5049 /* Prefer A over B for maximal matching. */
5050
d02d6d97 5051 if (n < max) { /* More greed allowed? */
b93070ed 5052 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
5053 cur_curlyx->u.curlyx.lastloc = locinput;
5054 REGCP_SET(ST.lastcp);
d02d6d97 5055 PUSH_STATE_GOTO(WHILEM_A_max, A);
118e2215 5056 assert(0); /* NOTREACHED */
c476f425
DM
5057 }
5058 goto do_whilem_B_max;
5059 }
118e2215 5060 assert(0); /* NOTREACHED */
c476f425
DM
5061
5062 case WHILEM_B_min: /* just matched B in a minimal match */
5063 case WHILEM_B_max: /* just matched B in a maximal match */
5064 cur_curlyx = ST.save_curlyx;
5065 sayYES;
118e2215 5066 assert(0); /* NOTREACHED */
c476f425
DM
5067
5068 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5069 cur_curlyx = ST.save_curlyx;
5070 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5071 cur_curlyx->u.curlyx.count--;
5072 CACHEsayNO;
118e2215 5073 assert(0); /* NOTREACHED */
c476f425
DM
5074
5075 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5076 /* FALL THROUGH */
5077 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
5078 REGCP_UNWIND(ST.lastcp);
5079 regcppop(rex);
c476f425
DM
5080 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5081 cur_curlyx->u.curlyx.count--;
5082 CACHEsayNO;
118e2215 5083 assert(0); /* NOTREACHED */
c476f425
DM
5084
5085 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5086 REGCP_UNWIND(ST.lastcp);
5087 regcppop(rex); /* Restore some previous $<digit>s? */
5088 PL_reginput = locinput;
5089 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5090 "%*s whilem: failed, trying continuation...\n",
5091 REPORT_CODE_OFF+depth*2, "")
5092 );
5093 do_whilem_B_max:
5094 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5095 && ckWARN(WARN_REGEXP)
5096 && !(PL_reg_flags & RF_warned))
5097 {
5098 PL_reg_flags |= RF_warned;
dcbac5bb
FC
5099 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5100 "Complex regular subexpression recursion limit (%d) "
5101 "exceeded",
c476f425
DM
5102 REG_INFTY - 1);
5103 }
5104
5105 /* now try B */
5106 ST.save_curlyx = cur_curlyx;
5107 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5108 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
118e2215 5109 assert(0); /* NOTREACHED */
c476f425
DM
5110
5111 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5112 cur_curlyx = ST.save_curlyx;
5113 REGCP_UNWIND(ST.lastcp);
5114 regcppop(rex);
5115
d02d6d97 5116 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5117 /* Maximum greed exceeded */
5118 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5119 && ckWARN(WARN_REGEXP)
5120 && !(PL_reg_flags & RF_warned))
5121 {
3280af22 5122 PL_reg_flags |= RF_warned;
c476f425 5123 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5124 "Complex regular subexpression recursion "
5125 "limit (%d) exceeded",
c476f425 5126 REG_INFTY - 1);
a0d0e21e 5127 }
c476f425 5128 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5129 CACHEsayNO;
a0d0e21e 5130 }
c476f425
DM
5131
5132 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5133 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5134 );
5135 /* Try grabbing another A and see if it helps. */
5136 PL_reginput = locinput;
5137 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5138 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5139 REGCP_SET(ST.lastcp);
d02d6d97
DM
5140 PUSH_STATE_GOTO(WHILEM_A_min,
5141 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
118e2215 5142 assert(0); /* NOTREACHED */
40a82448
DM
5143
5144#undef ST
5145#define ST st->u.branch
5146
5147 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5148 next = scan + ARG(scan);
5149 if (next == scan)
5150 next = NULL;
40a82448
DM
5151 scan = NEXTOPER(scan);
5152 /* FALL THROUGH */
c277df42 5153
40a82448
DM
5154 case BRANCH: /* /(...|A|...)/ */
5155 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5156 ST.lastparen = rex->lastparen;
f6033a9d 5157 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5158 ST.next_branch = next;
5159 REGCP_SET(ST.cp);
5160 PL_reginput = locinput;
02db2b7b 5161
40a82448 5162 /* Now go into the branch */
5d458dd8
YO
5163 if (has_cutgroup) {
5164 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
5165 } else {
5166 PUSH_STATE_GOTO(BRANCH_next, scan);
5167 }
118e2215 5168 assert(0); /* NOTREACHED */
5d458dd8
YO
5169 case CUTGROUP:
5170 PL_reginput = locinput;
5171 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5172 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8 5173 PUSH_STATE_GOTO(CUTGROUP_next,next);
118e2215 5174 assert(0); /* NOTREACHED */
5d458dd8
YO
5175 case CUTGROUP_next_fail:
5176 do_cutgroup = 1;
5177 no_final = 1;
5178 if (st->u.mark.mark_name)
5179 sv_commit = st->u.mark.mark_name;
5180 sayNO;
118e2215 5181 assert(0); /* NOTREACHED */
5d458dd8
YO
5182 case BRANCH_next:
5183 sayYES;
118e2215 5184 assert(0); /* NOTREACHED */
40a82448 5185 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5186 if (do_cutgroup) {
5187 do_cutgroup = 0;
5188 no_final = 0;
5189 }
40a82448 5190 REGCP_UNWIND(ST.cp);
a8d1f4b4 5191 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5192 scan = ST.next_branch;
5193 /* no more branches? */
5d458dd8
YO
5194 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5195 DEBUG_EXECUTE_r({
5196 PerlIO_printf( Perl_debug_log,
5197 "%*s %sBRANCH failed...%s\n",
5198 REPORT_CODE_OFF+depth*2, "",
5199 PL_colors[4],
5200 PL_colors[5] );
5201 });
5202 sayNO_SILENT;
5203 }
40a82448 5204 continue; /* execute next BRANCH[J] op */
118e2215 5205 assert(0); /* NOTREACHED */
40a82448 5206
a0d0e21e 5207 case MINMOD:
24d3c4a9 5208 minmod = 1;
a0d0e21e 5209 break;
40a82448
DM
5210
5211#undef ST
5212#define ST st->u.curlym
5213
5214 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5215
5216 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5217 * only a single backtracking state, no matter how many matches
40a82448
DM
5218 * there are in {m,n}. It relies on the pattern being constant
5219 * length, with no parens to influence future backrefs
5220 */
5221
5222 ST.me = scan;
dc45a647 5223 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5224
f6033a9d
DM
5225 ST.lastparen = rex->lastparen;
5226 ST.lastcloseparen = rex->lastcloseparen;
5227
40a82448
DM
5228 /* if paren positive, emulate an OPEN/CLOSE around A */
5229 if (ST.me->flags) {
3b6647e0 5230 U32 paren = ST.me->flags;
40a82448
DM
5231 if (paren > PL_regsize)
5232 PL_regsize = paren;
c277df42 5233 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5234 }
40a82448
DM
5235 ST.A = scan;
5236 ST.B = next;
5237 ST.alen = 0;
5238 ST.count = 0;
24d3c4a9
DM
5239 ST.minmod = minmod;
5240 minmod = 0;
40a82448
DM
5241 ST.c1 = CHRTEST_UNINIT;
5242 REGCP_SET(ST.cp);
6407bf3b 5243
40a82448
DM
5244 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5245 goto curlym_do_B;
5246
5247 curlym_do_A: /* execute the A in /A{m,n}B/ */
6407bf3b 5248 PL_reginput = locinput;
40a82448 5249 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
118e2215 5250 assert(0); /* NOTREACHED */
5f80c4cf 5251
40a82448
DM
5252 case CURLYM_A: /* we've just matched an A */
5253 locinput = st->locinput;
5254 nextchr = UCHARAT(locinput);
5255
5256 ST.count++;
5257 /* after first match, determine A's length: u.curlym.alen */
5258 if (ST.count == 1) {
5259 if (PL_reg_match_utf8) {
5260 char *s = locinput;
5261 while (s < PL_reginput) {
5262 ST.alen++;
5263 s += UTF8SKIP(s);
5264 }
5265 }
5266 else {
5267 ST.alen = PL_reginput - locinput;
5268 }
5269 if (ST.alen == 0)
5270 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5271 }
0cadcf80
DM
5272 DEBUG_EXECUTE_r(
5273 PerlIO_printf(Perl_debug_log,
40a82448 5274 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5275 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5276 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5277 );
5278
40a82448 5279 locinput = PL_reginput;
0a4db386
YO
5280
5281 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5282 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5283 goto fake_end;
5284
c966426a
DM
5285 {
5286 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5287 if ( max == REG_INFTY || ST.count < max )
5288 goto curlym_do_A; /* try to match another A */
5289 }
40a82448 5290 goto curlym_do_B; /* try to match B */
5f80c4cf 5291
40a82448
DM
5292 case CURLYM_A_fail: /* just failed to match an A */
5293 REGCP_UNWIND(ST.cp);
0a4db386
YO
5294
5295 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5296 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5297 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5298 sayNO;
0cadcf80 5299
40a82448
DM
5300 curlym_do_B: /* execute the B in /A{m,n}B/ */
5301 PL_reginput = locinput;
5302 if (ST.c1 == CHRTEST_UNINIT) {
5303 /* calculate c1 and c2 for possible match of 1st char
5304 * following curly */
5305 ST.c1 = ST.c2 = CHRTEST_VOID;
5306 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5307 regnode *text_node = ST.B;
5308 if (! HAS_TEXT(text_node))
5309 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5310 /* this used to be
5311
5312 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5313
5314 But the former is redundant in light of the latter.
5315
5316 if this changes back then the macro for
5317 IS_TEXT and friends need to change.
5318 */
5319 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5320 {
ee9b8eae 5321
40a82448 5322 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5323 switch (OP(text_node)) {
5324 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5325 case EXACTFA:
3c760661 5326 case EXACTFU_SS:
fab2782b 5327 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5328 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5329 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5330 default: ST.c2 = ST.c1;
5331 }
c277df42 5332 }
c277df42 5333 }
40a82448
DM
5334 }
5335
5336 DEBUG_EXECUTE_r(
5337 PerlIO_printf(Perl_debug_log,
5338 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5339 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5340 "", (IV)ST.count)
5341 );
5342 if (ST.c1 != CHRTEST_VOID
5343 && UCHARAT(PL_reginput) != ST.c1
5344 && UCHARAT(PL_reginput) != ST.c2)
5345 {
5346 /* simulate B failing */
3e901dc0
YO
5347 DEBUG_OPTIMISE_r(
5348 PerlIO_printf(Perl_debug_log,
5349 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5350 (int)(REPORT_CODE_OFF+(depth*2)),"",
5351 (IV)ST.c1,(IV)ST.c2
5352 ));
40a82448
DM
5353 state_num = CURLYM_B_fail;
5354 goto reenter_switch;
5355 }
5356
5357 if (ST.me->flags) {
f6033a9d 5358 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5359 I32 paren = ST.me->flags;
5360 if (ST.count) {
b93070ed 5361 rex->offs[paren].start
40a82448 5362 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
b93070ed 5363 rex->offs[paren].end = PL_reginput - PL_bostr;
f6033a9d
DM
5364 if ((U32)paren > rex->lastparen)
5365 rex->lastparen = paren;
5366 rex->lastcloseparen = paren;
c277df42 5367 }
40a82448 5368 else
b93070ed 5369 rex->offs[paren].end = -1;
0a4db386 5370 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5371 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5372 {
5373 if (ST.count)
5374 goto fake_end;
5375 else
5376 sayNO;
5377 }
c277df42 5378 }
0a4db386 5379
40a82448 5380 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
118e2215 5381 assert(0); /* NOTREACHED */
40a82448
DM
5382
5383 case CURLYM_B_fail: /* just failed to match a B */
5384 REGCP_UNWIND(ST.cp);
a8d1f4b4 5385 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5386 if (ST.minmod) {
84d2fa14
HS
5387 I32 max = ARG2(ST.me);
5388 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5389 sayNO;
5390 goto curlym_do_A; /* try to match a further A */
5391 }
5392 /* backtrack one A */
5393 if (ST.count == ARG1(ST.me) /* min */)
5394 sayNO;
5395 ST.count--;
5396 locinput = HOPc(locinput, -ST.alen);
5397 goto curlym_do_B; /* try to match B */
5398
c255a977
DM
5399#undef ST
5400#define ST st->u.curly
40a82448 5401
c255a977
DM
5402#define CURLY_SETPAREN(paren, success) \
5403 if (paren) { \
5404 if (success) { \
b93070ed
DM
5405 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5406 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5407 if (paren > rex->lastparen) \
5408 rex->lastparen = paren; \
b93070ed 5409 rex->lastcloseparen = paren; \
c255a977 5410 } \
f6033a9d 5411 else { \
b93070ed 5412 rex->offs[paren].end = -1; \
f6033a9d
DM
5413 rex->lastparen = ST.lastparen; \
5414 rex->lastcloseparen = ST.lastcloseparen; \
5415 } \
c255a977
DM
5416 }
5417
5418 case STAR: /* /A*B/ where A is width 1 */
5419 ST.paren = 0;
5420 ST.min = 0;
5421 ST.max = REG_INFTY;
a0d0e21e
LW
5422 scan = NEXTOPER(scan);
5423 goto repeat;
c255a977
DM
5424 case PLUS: /* /A+B/ where A is width 1 */
5425 ST.paren = 0;
5426 ST.min = 1;
5427 ST.max = REG_INFTY;
c277df42 5428 scan = NEXTOPER(scan);
c255a977
DM
5429 goto repeat;
5430 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5431 ST.paren = scan->flags; /* Which paren to set */
f6033a9d
DM
5432 ST.lastparen = rex->lastparen;
5433 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5434 if (ST.paren > PL_regsize)
5435 PL_regsize = ST.paren;
c255a977
DM
5436 ST.min = ARG1(scan); /* min to match */
5437 ST.max = ARG2(scan); /* max to match */
0a4db386 5438 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5439 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5440 ST.min=1;
5441 ST.max=1;
5442 }
c255a977
DM
5443 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5444 goto repeat;
5445 case CURLY: /* /A{m,n}B/ where A is width 1 */
5446 ST.paren = 0;
5447 ST.min = ARG1(scan); /* min to match */
5448 ST.max = ARG2(scan); /* max to match */
5449 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5450 repeat:
a0d0e21e
LW
5451 /*
5452 * Lookahead to avoid useless match attempts
5453 * when we know what character comes next.
c255a977 5454 *
5f80c4cf
JP
5455 * Used to only do .*x and .*?x, but now it allows
5456 * for )'s, ('s and (?{ ... })'s to be in the way
5457 * of the quantifier and the EXACT-like node. -- japhy
5458 */
5459
c255a977
DM
5460 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5461 sayNO;
cca55fe3 5462 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5463 U8 *s;
5464 regnode *text_node = next;
5465
3dab1dad
YO
5466 if (! HAS_TEXT(text_node))
5467 FIND_NEXT_IMPT(text_node);
5f80c4cf 5468
9e137952 5469 if (! HAS_TEXT(text_node))
c255a977 5470 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5471 else {
ee9b8eae 5472 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5473 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5474 goto assume_ok_easy;
cca55fe3 5475 }
be8e71aa
YO
5476 else
5477 s = (U8*)STRING(text_node);
ee9b8eae
YO
5478
5479 /* Currently we only get here when
5480
5481 PL_rekind[OP(text_node)] == EXACT
5482
5483 if this changes back then the macro for IS_TEXT and
5484 friends need to change. */
f2ed9b32 5485 if (!UTF_PATTERN) {
9a5a5549
KW
5486 ST.c1 = *s;
5487 switch (OP(text_node)) {
5488 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5489 case EXACTFA:
3c760661 5490 case EXACTFU_SS:
fab2782b 5491 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5492 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5493 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5494 default: ST.c2 = ST.c1; break;
5495 }
1aa99e6b 5496 }
f2ed9b32 5497 else { /* UTF_PATTERN */
9a5a5549 5498 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
b0573d8b
KW
5499 STRLEN ulen;
5500 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5501
5502 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5503 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
e294cc5d 5504 uniflags);
5f80c4cf
JP
5505 }
5506 else {
c255a977 5507 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5508 uniflags);
5f80c4cf 5509 }
1aa99e6b
IH
5510 }
5511 }
bbce6d69 5512 }
a0d0e21e 5513 else
c255a977 5514 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5515 assume_ok_easy:
c255a977
DM
5516
5517 ST.A = scan;
5518 ST.B = next;
3280af22 5519 PL_reginput = locinput;
24d3c4a9
DM
5520 if (minmod) {
5521 minmod = 0;
e2e6a0f1 5522 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4633a7c4 5523 sayNO;
c255a977 5524 ST.count = ST.min;
a0ed51b3 5525 locinput = PL_reginput;
c255a977
DM
5526 REGCP_SET(ST.cp);
5527 if (ST.c1 == CHRTEST_VOID)
5528 goto curly_try_B_min;
5529
5530 ST.oldloc = locinput;
5531
5532 /* set ST.maxpos to the furthest point along the
5533 * string that could possibly match */
5534 if (ST.max == REG_INFTY) {
5535 ST.maxpos = PL_regeol - 1;
f2ed9b32 5536 if (utf8_target)
c255a977
DM
5537 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5538 ST.maxpos--;
5539 }
f2ed9b32 5540 else if (utf8_target) {
c255a977
DM
5541 int m = ST.max - ST.min;
5542 for (ST.maxpos = locinput;
5543 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5544 ST.maxpos += UTF8SKIP(ST.maxpos);
5545 }
5546 else {
5547 ST.maxpos = locinput + ST.max - ST.min;
5548 if (ST.maxpos >= PL_regeol)
5549 ST.maxpos = PL_regeol - 1;
5550 }
5551 goto curly_try_B_min_known;
5552
5553 }
5554 else {
e2e6a0f1 5555 ST.count = regrepeat(rex, ST.A, ST.max, depth);
c255a977
DM
5556 locinput = PL_reginput;
5557 if (ST.count < ST.min)
5558 sayNO;
5559 if ((ST.count > ST.min)
5560 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5561 {
5562 /* A{m,n} must come at the end of the string, there's
5563 * no point in backing off ... */
5564 ST.min = ST.count;
5565 /* ...except that $ and \Z can match before *and* after
5566 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5567 We may back off by one in this case. */
5568 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5569 ST.min--;
5570 }
5571 REGCP_SET(ST.cp);
5572 goto curly_try_B_max;
5573 }
118e2215 5574 assert(0); /* NOTREACHED */
c255a977
DM
5575
5576
5577 case CURLY_B_min_known_fail:
5578 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977
DM
5579
5580 PL_reginput = locinput; /* Could be reset... */
5581 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5582 if (ST.paren) {
5583 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5584 }
c255a977
DM
5585 /* Couldn't or didn't -- move forward. */
5586 ST.oldloc = locinput;
f2ed9b32 5587 if (utf8_target)
c255a977
DM
5588 locinput += UTF8SKIP(locinput);
5589 else
5590 locinput++;
5591 ST.count++;
5592 curly_try_B_min_known:
5593 /* find the next place where 'B' could work, then call B */
5594 {
5595 int n;
f2ed9b32 5596 if (utf8_target) {
c255a977
DM
5597 n = (ST.oldloc == locinput) ? 0 : 1;
5598 if (ST.c1 == ST.c2) {
5599 STRLEN len;
5600 /* set n to utf8_distance(oldloc, locinput) */
5601 while (locinput <= ST.maxpos &&
5602 utf8n_to_uvchr((U8*)locinput,
5603 UTF8_MAXBYTES, &len,
5604 uniflags) != (UV)ST.c1) {
5605 locinput += len;
5606 n++;
5607 }
1aa99e6b
IH
5608 }
5609 else {
c255a977
DM
5610 /* set n to utf8_distance(oldloc, locinput) */
5611 while (locinput <= ST.maxpos) {
5612 STRLEN len;
5613 const UV c = utf8n_to_uvchr((U8*)locinput,
5614 UTF8_MAXBYTES, &len,
5615 uniflags);
5616 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5617 break;
5618 locinput += len;
5619 n++;
1aa99e6b 5620 }
0fe9bf95
IZ
5621 }
5622 }
c255a977
DM
5623 else {
5624 if (ST.c1 == ST.c2) {
5625 while (locinput <= ST.maxpos &&
5626 UCHARAT(locinput) != ST.c1)
5627 locinput++;
bbce6d69 5628 }
c255a977
DM
5629 else {
5630 while (locinput <= ST.maxpos
5631 && UCHARAT(locinput) != ST.c1
5632 && UCHARAT(locinput) != ST.c2)
5633 locinput++;
a0ed51b3 5634 }
c255a977
DM
5635 n = locinput - ST.oldloc;
5636 }
5637 if (locinput > ST.maxpos)
5638 sayNO;
5639 /* PL_reginput == oldloc now */
5640 if (n) {
5641 ST.count += n;
e2e6a0f1 5642 if (regrepeat(rex, ST.A, n, depth) < n)
4633a7c4 5643 sayNO;
a0d0e21e 5644 }
c255a977
DM
5645 PL_reginput = locinput;
5646 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5647 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5648 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5649 goto fake_end;
5650 }
c255a977 5651 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
a0d0e21e 5652 }
118e2215 5653 assert(0); /* NOTREACHED */
c255a977
DM
5654
5655
5656 case CURLY_B_min_fail:
5657 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5658
5659 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5660 if (ST.paren) {
5661 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5662 }
c255a977
DM
5663 /* failed -- move forward one */
5664 PL_reginput = locinput;
e2e6a0f1 5665 if (regrepeat(rex, ST.A, 1, depth)) {
c255a977 5666 ST.count++;
a0ed51b3 5667 locinput = PL_reginput;
c255a977
DM
5668 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5669 ST.count > 0)) /* count overflow ? */
15272685 5670 {
c255a977
DM
5671 curly_try_B_min:
5672 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5673 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5674 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5675 goto fake_end;
5676 }
c255a977 5677 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
a0d0e21e
LW
5678 }
5679 }
4633a7c4 5680 sayNO;
118e2215 5681 assert(0); /* NOTREACHED */
c255a977
DM
5682
5683
5684 curly_try_B_max:
5685 /* a successful greedy match: now try to match B */
40d049e4 5686 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5687 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5688 goto fake_end;
5689 }
c255a977
DM
5690 {
5691 UV c = 0;
5692 if (ST.c1 != CHRTEST_VOID)
f2ed9b32 5693 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
c255a977 5694 UTF8_MAXBYTES, 0, uniflags)
466787eb 5695 : (UV) UCHARAT(PL_reginput);
c255a977
DM
5696 /* If it could work, try it. */
5697 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5698 CURLY_SETPAREN(ST.paren, ST.count);
5699 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
118e2215 5700 assert(0); /* NOTREACHED */
c255a977
DM
5701 }
5702 }
5703 /* FALL THROUGH */
5704 case CURLY_B_max_fail:
5705 /* failed to find B in a greedy match */
c255a977
DM
5706
5707 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5708 if (ST.paren) {
5709 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5710 }
c255a977
DM
5711 /* back up. */
5712 if (--ST.count < ST.min)
5713 sayNO;
5714 PL_reginput = locinput = HOPc(locinput, -1);
5715 goto curly_try_B_max;
5716
5717#undef ST
5718
a0d0e21e 5719 case END:
6bda09f9 5720 fake_end:
faec1544
DM
5721 if (cur_eval) {
5722 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
5723 st->u.eval.toggle_reg_flags
5724 = cur_eval->u.eval.toggle_reg_flags;
5725 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5726
288b8c02 5727 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 5728 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
5729 rex_sv = cur_eval->u.eval.prev_rex;
5730 SET_reg_curpm(rex_sv);
288b8c02 5731 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5732 rexi = RXi_GET(rex);
faec1544 5733 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 5734
faec1544
DM
5735 REGCP_SET(st->u.eval.lastcp);
5736 PL_reginput = locinput;
5737
5738 /* Restore parens of the outer rex without popping the
5739 * savestack */
74088413 5740 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
5741
5742 st->u.eval.prev_eval = cur_eval;
5743 cur_eval = cur_eval->u.eval.prev_eval;
5744 DEBUG_EXECUTE_r(
2a49f0f5
JH
5745 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5746 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5747 if ( nochange_depth )
5748 nochange_depth--;
5749
5750 PUSH_YES_STATE_GOTO(EVAL_AB,
faec1544
DM
5751 st->u.eval.prev_eval->u.eval.B); /* match B */
5752 }
5753
3b0527fe 5754 if (locinput < reginfo->till) {
a3621e74 5755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5756 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5757 PL_colors[4],
5758 (long)(locinput - PL_reg_starttry),
3b0527fe 5759 (long)(reginfo->till - PL_reg_starttry),
7821416a 5760 PL_colors[5]));
58e23c8d 5761
262b90c4 5762 sayNO_SILENT; /* Cannot match: too short. */
7821416a
IZ
5763 }
5764 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5765 sayYES; /* Success! */
dad79028
DM
5766
5767 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5768 DEBUG_EXECUTE_r(
5769 PerlIO_printf(Perl_debug_log,
5770 "%*s %ssubpattern success...%s\n",
5bc10b2c 5771 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
3280af22 5772 PL_reginput = locinput; /* put where regtry can find it */
262b90c4 5773 sayYES; /* Success! */
dad79028 5774
40a82448
DM
5775#undef ST
5776#define ST st->u.ifmatch
5777
5778 case SUSPEND: /* (?>A) */
5779 ST.wanted = 1;
9fe1d20c 5780 PL_reginput = locinput;
9041c2e3 5781 goto do_ifmatch;
dad79028 5782
40a82448
DM
5783 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5784 ST.wanted = 0;
dad79028
DM
5785 goto ifmatch_trivial_fail_test;
5786
40a82448
DM
5787 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5788 ST.wanted = 1;
dad79028 5789 ifmatch_trivial_fail_test:
a0ed51b3 5790 if (scan->flags) {
52657f30 5791 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5792 if (!s) {
5793 /* trivial fail */
24d3c4a9
DM
5794 if (logical) {
5795 logical = 0;
f2338a2e 5796 sw = 1 - cBOOL(ST.wanted);
dad79028 5797 }
40a82448 5798 else if (ST.wanted)
dad79028
DM
5799 sayNO;
5800 next = scan + ARG(scan);
5801 if (next == scan)
5802 next = NULL;
5803 break;
5804 }
efb30f32 5805 PL_reginput = s;
a0ed51b3
LW
5806 }
5807 else
5808 PL_reginput = locinput;
5809
c277df42 5810 do_ifmatch:
40a82448 5811 ST.me = scan;
24d3c4a9 5812 ST.logical = logical;
24d786f4
YO
5813 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5814
40a82448
DM
5815 /* execute body of (?...A) */
5816 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
118e2215 5817 assert(0); /* NOTREACHED */
40a82448
DM
5818
5819 case IFMATCH_A_fail: /* body of (?...A) failed */
5820 ST.wanted = !ST.wanted;
5821 /* FALL THROUGH */
5822
5823 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5824 if (ST.logical) {
f2338a2e 5825 sw = cBOOL(ST.wanted);
40a82448
DM
5826 }
5827 else if (!ST.wanted)
5828 sayNO;
5829
5830 if (OP(ST.me) == SUSPEND)
5831 locinput = PL_reginput;
5832 else {
5833 locinput = PL_reginput = st->locinput;
5834 nextchr = UCHARAT(locinput);
5835 }
5836 scan = ST.me + ARG(ST.me);
5837 if (scan == ST.me)
5838 scan = NULL;
5839 continue; /* execute B */
5840
5841#undef ST
dad79028 5842
c277df42 5843 case LONGJMP:
c277df42
IZ
5844 next = scan + ARG(scan);
5845 if (next == scan)
5846 next = NULL;
a0d0e21e 5847 break;
54612592 5848 case COMMIT:
e2e6a0f1
YO
5849 reginfo->cutpoint = PL_regeol;
5850 /* FALLTHROUGH */
5d458dd8 5851 case PRUNE:
24b23f37 5852 PL_reginput = locinput;
e2e6a0f1 5853 if (!scan->flags)
ad64d0ec 5854 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
54612592 5855 PUSH_STATE_GOTO(COMMIT_next,next);
118e2215 5856 assert(0); /* NOTREACHED */
54612592
YO
5857 case COMMIT_next_fail:
5858 no_final = 1;
5859 /* FALLTHROUGH */
7f69552c
YO
5860 case OPFAIL:
5861 sayNO;
118e2215 5862 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5863
5864#define ST st->u.mark
5865 case MARKPOINT:
5866 ST.prev_mark = mark_state;
5d458dd8 5867 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5868 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1
YO
5869 mark_state = st;
5870 ST.mark_loc = PL_reginput = locinput;
5871 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
118e2215 5872 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5873 case MARKPOINT_next:
5874 mark_state = ST.prev_mark;
5875 sayYES;
118e2215 5876 assert(0); /* NOTREACHED */
e2e6a0f1 5877 case MARKPOINT_next_fail:
5d458dd8 5878 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5879 {
5880 if (ST.mark_loc > startpoint)
5881 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5882 popmark = NULL; /* we found our mark */
5883 sv_commit = ST.mark_name;
5884
5885 DEBUG_EXECUTE_r({
5d458dd8 5886 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5887 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5888 REPORT_CODE_OFF+depth*2, "",
be2597df 5889 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5890 });
5891 }
5892 mark_state = ST.prev_mark;
5d458dd8
YO
5893 sv_yes_mark = mark_state ?
5894 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 5895 sayNO;
118e2215 5896 assert(0); /* NOTREACHED */
5d458dd8
YO
5897 case SKIP:
5898 PL_reginput = locinput;
5899 if (scan->flags) {
2bf803e2 5900 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5901 ST.mark_name = NULL;
e2e6a0f1 5902 ST.mark_loc = locinput;
5d458dd8
YO
5903 PUSH_STATE_GOTO(SKIP_next,next);
5904 } else {
2bf803e2 5905 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5906 otherwise do nothing. Meaning we need to scan
5907 */
5908 regmatch_state *cur = mark_state;
ad64d0ec 5909 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5910
5911 while (cur) {
5912 if ( sv_eq( cur->u.mark.mark_name,
5913 find ) )
5914 {
5915 ST.mark_name = find;
5916 PUSH_STATE_GOTO( SKIP_next, next );
5917 }
5918 cur = cur->u.mark.prev_mark;
5919 }
e2e6a0f1 5920 }
2bf803e2 5921 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5922 break;
5923 case SKIP_next_fail:
5924 if (ST.mark_name) {
5925 /* (*CUT:NAME) - Set up to search for the name as we
5926 collapse the stack*/
5927 popmark = ST.mark_name;
5928 } else {
5929 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5930 if (ST.mark_loc > startpoint)
5931 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5932 /* but we set sv_commit to latest mark_name if there
5933 is one so they can test to see how things lead to this
5934 cut */
5935 if (mark_state)
5936 sv_commit=mark_state->u.mark.mark_name;
5937 }
e2e6a0f1
YO
5938 no_final = 1;
5939 sayNO;
118e2215 5940 assert(0); /* NOTREACHED */
e2e6a0f1 5941#undef ST
e1d1eefb 5942 case LNBREAK:
f2ed9b32 5943 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5944 locinput += n;
5945 nextchr = UCHARAT(locinput);
5946 } else
5947 sayNO;
5948 break;
5949
5950#define CASE_CLASS(nAmE) \
5951 case nAmE: \
1380b51d
DM
5952 if (locinput >= PL_regeol) \
5953 sayNO; \
f2ed9b32 5954 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5955 locinput += n; \
5956 nextchr = UCHARAT(locinput); \
5957 } else \
5958 sayNO; \
5959 break; \
5960 case N##nAmE: \
1380b51d
DM
5961 if (locinput >= PL_regeol) \
5962 sayNO; \
f2ed9b32 5963 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5964 sayNO; \
5965 } else { \
5966 locinput += UTF8SKIP(locinput); \
5967 nextchr = UCHARAT(locinput); \
5968 } \
5969 break
5970
5971 CASE_CLASS(VERTWS);
5972 CASE_CLASS(HORIZWS);
5973#undef CASE_CLASS
5974
a0d0e21e 5975 default:
b900a521 5976 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5977 PTR2UV(scan), OP(scan));
cea2e8a9 5978 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5979
5980 } /* end switch */
95b24440 5981
5d458dd8
YO
5982 /* switch break jumps here */
5983 scan = next; /* prepare to execute the next op and ... */
5984 continue; /* ... jump back to the top, reusing st */
118e2215 5985 assert(0); /* NOTREACHED */
95b24440 5986
40a82448
DM
5987 push_yes_state:
5988 /* push a state that backtracks on success */
5989 st->u.yes.prev_yes_state = yes_state;
5990 yes_state = st;
5991 /* FALL THROUGH */
5992 push_state:
5993 /* push a new regex state, then continue at scan */
5994 {
5995 regmatch_state *newst;
5996
24b23f37
YO
5997 DEBUG_STACK_r({
5998 regmatch_state *cur = st;
5999 regmatch_state *curyes = yes_state;
6000 int curd = depth;
6001 regmatch_slab *slab = PL_regmatch_slab;
6002 for (;curd > -1;cur--,curd--) {
6003 if (cur < SLAB_FIRST(slab)) {
6004 slab = slab->prev;
6005 cur = SLAB_LAST(slab);
6006 }
6007 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6008 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6009 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6010 (curyes == cur) ? "yes" : ""
6011 );
6012 if (curyes == cur)
6013 curyes = cur->u.yes.prev_yes_state;
6014 }
6015 } else
6016 DEBUG_STATE_pp("push")
6017 );
40a82448 6018 depth++;
40a82448
DM
6019 st->locinput = locinput;
6020 newst = st+1;
6021 if (newst > SLAB_LAST(PL_regmatch_slab))
6022 newst = S_push_slab(aTHX);
6023 PL_regmatch_state = newst;
786e8c11 6024
40a82448
DM
6025 locinput = PL_reginput;
6026 nextchr = UCHARAT(locinput);
6027 st = newst;
6028 continue;
118e2215 6029 assert(0); /* NOTREACHED */
40a82448 6030 }
a0d0e21e 6031 }
a687059c 6032
a0d0e21e
LW
6033 /*
6034 * We get here only if there's trouble -- normally "case END" is
6035 * the terminating point.
6036 */
cea2e8a9 6037 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6038 /*NOTREACHED*/
4633a7c4
LW
6039 sayNO;
6040
262b90c4 6041yes:
77cb431f
DM
6042 if (yes_state) {
6043 /* we have successfully completed a subexpression, but we must now
6044 * pop to the state marked by yes_state and continue from there */
77cb431f 6045 assert(st != yes_state);
5bc10b2c
DM
6046#ifdef DEBUGGING
6047 while (st != yes_state) {
6048 st--;
6049 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6050 PL_regmatch_slab = PL_regmatch_slab->prev;
6051 st = SLAB_LAST(PL_regmatch_slab);
6052 }
e2e6a0f1 6053 DEBUG_STATE_r({
54612592
YO
6054 if (no_final) {
6055 DEBUG_STATE_pp("pop (no final)");
6056 } else {
6057 DEBUG_STATE_pp("pop (yes)");
6058 }
e2e6a0f1 6059 });
5bc10b2c
DM
6060 depth--;
6061 }
6062#else
77cb431f
DM
6063 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6064 || yes_state > SLAB_LAST(PL_regmatch_slab))
6065 {
6066 /* not in this slab, pop slab */
6067 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6068 PL_regmatch_slab = PL_regmatch_slab->prev;
6069 st = SLAB_LAST(PL_regmatch_slab);
6070 }
6071 depth -= (st - yes_state);
5bc10b2c 6072#endif
77cb431f
DM
6073 st = yes_state;
6074 yes_state = st->u.yes.prev_yes_state;
6075 PL_regmatch_state = st;
24b23f37 6076
5d458dd8
YO
6077 if (no_final) {
6078 locinput= st->locinput;
6079 nextchr = UCHARAT(locinput);
6080 }
54612592 6081 state_num = st->resume_state + no_final;
24d3c4a9 6082 goto reenter_switch;
77cb431f
DM
6083 }
6084
a3621e74 6085 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6086 PL_colors[4], PL_colors[5]));
02db2b7b 6087
ed301438 6088 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6089 /* each successfully executed (?{...}) block does the equivalent of
6090 * local $^R = do {...}
6091 * When popping the save stack, all these locals would be undone;
6092 * bypass this by setting the outermost saved $^R to the latest
6093 * value */
6094 if (oreplsv != GvSV(PL_replgv))
6095 sv_setsv(oreplsv, GvSV(PL_replgv));
6096 }
95b24440 6097 result = 1;
aa283a38 6098 goto final_exit;
4633a7c4
LW
6099
6100no:
a3621e74 6101 DEBUG_EXECUTE_r(
7821416a 6102 PerlIO_printf(Perl_debug_log,
786e8c11 6103 "%*s %sfailed...%s\n",
5bc10b2c 6104 REPORT_CODE_OFF+depth*2, "",
786e8c11 6105 PL_colors[4], PL_colors[5])
7821416a 6106 );
aa283a38 6107
262b90c4 6108no_silent:
54612592
YO
6109 if (no_final) {
6110 if (yes_state) {
6111 goto yes;
6112 } else {
6113 goto final_exit;
6114 }
6115 }
aa283a38
DM
6116 if (depth) {
6117 /* there's a previous state to backtrack to */
40a82448
DM
6118 st--;
6119 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6120 PL_regmatch_slab = PL_regmatch_slab->prev;
6121 st = SLAB_LAST(PL_regmatch_slab);
6122 }
6123 PL_regmatch_state = st;
40a82448
DM
6124 locinput= st->locinput;
6125 nextchr = UCHARAT(locinput);
6126
5bc10b2c
DM
6127 DEBUG_STATE_pp("pop");
6128 depth--;
262b90c4
DM
6129 if (yes_state == st)
6130 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6131
24d3c4a9
DM
6132 state_num = st->resume_state + 1; /* failure = success + 1 */
6133 goto reenter_switch;
95b24440 6134 }
24d3c4a9 6135 result = 0;
aa283a38 6136
262b90c4 6137 final_exit:
bbe252da 6138 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6139 SV *sv_err = get_sv("REGERROR", 1);
6140 SV *sv_mrk = get_sv("REGMARK", 1);
6141 if (result) {
e2e6a0f1 6142 sv_commit = &PL_sv_no;
5d458dd8
YO
6143 if (!sv_yes_mark)
6144 sv_yes_mark = &PL_sv_yes;
6145 } else {
6146 if (!sv_commit)
6147 sv_commit = &PL_sv_yes;
6148 sv_yes_mark = &PL_sv_no;
6149 }
6150 sv_setsv(sv_err, sv_commit);
6151 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6152 }
19b95bf0 6153
81ed78b2
DM
6154
6155 if (last_pushed_cv) {
6156 dSP;
6157 POP_MULTICALL;
4f8dbb2d 6158 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6159 }
6160
2f554ef7
DM
6161 /* clean up; in particular, free all slabs above current one */
6162 LEAVE_SCOPE(oldsave);
5d9a96ca 6163
95b24440 6164 return result;
a687059c
LW
6165}
6166
6167/*
6168 - regrepeat - repeatedly match something simple, report how many
6169 */
6170/*
6171 * [This routine now assumes that it will only match on things of length 1.
6172 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 6173 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 6174 */
76e3520e 6175STATIC I32
e2e6a0f1 6176S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
a687059c 6177{
27da23d5 6178 dVAR;
eb578fdb
KW
6179 char *scan;
6180 I32 c;
6181 char *loceol = PL_regeol;
6182 I32 hardcount = 0;
6183 bool utf8_target = PL_reg_match_utf8;
d513472c 6184 UV utf8_flags;
4f55667c
SP
6185#ifndef DEBUGGING
6186 PERL_UNUSED_ARG(depth);
6187#endif
a0d0e21e 6188
7918f24d
NC
6189 PERL_ARGS_ASSERT_REGREPEAT;
6190
3280af22 6191 scan = PL_reginput;
faf11cac
HS
6192 if (max == REG_INFTY)
6193 max = I32_MAX;
6194 else if (max < loceol - scan)
7f596f4c 6195 loceol = scan + max;
a0d0e21e 6196 switch (OP(p)) {
22c35a8c 6197 case REG_ANY:
f2ed9b32 6198 if (utf8_target) {
ffc61ed2 6199 loceol = PL_regeol;
1aa99e6b 6200 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6201 scan += UTF8SKIP(scan);
6202 hardcount++;
6203 }
6204 } else {
6205 while (scan < loceol && *scan != '\n')
6206 scan++;
a0ed51b3
LW
6207 }
6208 break;
ffc61ed2 6209 case SANY:
f2ed9b32 6210 if (utf8_target) {
def8e4ea 6211 loceol = PL_regeol;
a0804c9e 6212 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6213 scan += UTF8SKIP(scan);
6214 hardcount++;
6215 }
6216 }
6217 else
6218 scan = loceol;
a0ed51b3 6219 break;
f33976b4
DB
6220 case CANY:
6221 scan = loceol;
6222 break;
59d32103
KW
6223 case EXACT:
6224 /* To get here, EXACTish nodes must have *byte* length == 1. That
6225 * means they match only characters in the string that can be expressed
6226 * as a single byte. For non-utf8 strings, that means a simple match.
6227 * For utf8 strings, the character matched must be an invariant, or
6228 * downgradable to a single byte. The pattern's utf8ness is
6229 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6230 * it is, it's an invariant */
6231
6232 c = (U8)*STRING(p);
6233 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6234
6235 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6236 while (scan < loceol && UCHARAT(scan) == c) {
6237 scan++;
6238 }
6239 }
6240 else {
6241
6242 /* Here, the string is utf8, and the pattern char is different
6243 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6244 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6245 * look for those in sequence in the utf8 string */
6246 U8 high = UTF8_TWO_BYTE_HI(c);
6247 U8 low = UTF8_TWO_BYTE_LO(c);
6248 loceol = PL_regeol;
6249
6250 while (hardcount < max
6251 && scan + 1 < loceol
6252 && UCHARAT(scan) == high
6253 && UCHARAT(scan + 1) == low)
6254 {
6255 scan += 2;
6256 hardcount++;
6257 }
6258 }
6259 break;
2f7f8cb1
KW
6260 case EXACTFA:
6261 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6262 goto do_exactf;
6263
d4e0b827
KW
6264 case EXACTFL:
6265 PL_reg_flags |= RF_tainted;
17580e7a
KW
6266 utf8_flags = FOLDEQ_UTF8_LOCALE;
6267 goto do_exactf;
6268
d4e0b827 6269 case EXACTF:
62bf7766
KW
6270 utf8_flags = 0;
6271 goto do_exactf;
6272
3c760661 6273 case EXACTFU_SS:
fab2782b 6274 case EXACTFU_TRICKYFOLD:
9a5a5549 6275 case EXACTFU:
05f861a2 6276 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6277
2be3e190
KW
6278 /* The comments for the EXACT case above apply as well to these fold
6279 * ones */
634c83a2 6280
2f7f8cb1 6281 do_exactf:
090f7165 6282 c = (U8)*STRING(p);
634c83a2 6283 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6284
3c760661 6285 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6286 char *tmpeol = loceol;
6287 while (hardcount < max
d513472c
KW
6288 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6289 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6290 {
6291 scan = tmpeol;
6292 tmpeol = loceol;
6293 hardcount++;
6294 }
634c83a2 6295
59d32103
KW
6296 /* XXX Note that the above handles properly the German sharp s in
6297 * the pattern matching ss in the string. But it doesn't handle
6298 * properly cases where the string contains say 'LIGATURE ff' and
6299 * the pattern is 'f+'. This would require, say, a new function or
6300 * revised interface to foldEQ_utf8(), in which the maximum number
6301 * of characters to match could be passed and it would return how
6302 * many actually did. This is just one of many cases where
6303 * multi-char folds don't work properly, and so the fix is being
6304 * deferred */
6305 }
6306 else {
87381386 6307 U8 folded;
59d32103 6308
2be3e190
KW
6309 /* Here, the string isn't utf8 and c is a single byte; and either
6310 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6311 * doesn't affect c. Can just do simple comparisons for exact or
6312 * fold matching. */
d4e0b827 6313 switch (OP(p)) {
87381386 6314 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6315 case EXACTFA:
fab2782b 6316 case EXACTFU_TRICKYFOLD:
9a5a5549 6317 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6318 case EXACTFL: folded = PL_fold_locale[c]; break;
6319 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6320 }
6321 while (scan < loceol &&
6322 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6323 {
6324 scan++;
634c83a2
KW
6325 }
6326 }
bbce6d69 6327 break;
f56b6394 6328 case ANYOFV:
a0d0e21e 6329 case ANYOF:
4e8910e0
KW
6330 if (utf8_target || OP(p) == ANYOFV) {
6331 STRLEN inclasslen;
ffc61ed2 6332 loceol = PL_regeol;
4e8910e0
KW
6333 inclasslen = loceol - scan;
6334 while (hardcount < max
6335 && ((inclasslen = loceol - scan) > 0)
6336 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6337 {
6338 scan += inclasslen;
ffc61ed2
JH
6339 hardcount++;
6340 }
6341 } else {
32fc9b6a 6342 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6343 scan++;
6344 }
a0d0e21e 6345 break;
980866de 6346 case ALNUMU:
f2ed9b32 6347 if (utf8_target) {
980866de 6348 utf8_wordchar:
ffc61ed2 6349 loceol = PL_regeol;
1a4fad37 6350 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6351 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6352 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6353 {
ffc61ed2
JH
6354 scan += UTF8SKIP(scan);
6355 hardcount++;
6356 }
980866de 6357 } else {
a12cf05f
KW
6358 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6359 scan++;
6360 }
980866de
KW
6361 }
6362 break;
6363 case ALNUM:
6364 if (utf8_target)
6365 goto utf8_wordchar;
6366 while (scan < loceol && isALNUM((U8) *scan)) {
6367 scan++;
a0ed51b3
LW
6368 }
6369 break;
cfaf538b
KW
6370 case ALNUMA:
6371 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6372 scan++;
6373 }
6374 break;
bbce6d69 6375 case ALNUML:
3280af22 6376 PL_reg_flags |= RF_tainted;
f2ed9b32 6377 if (utf8_target) {
ffc61ed2 6378 loceol = PL_regeol;
1aa99e6b
IH
6379 while (hardcount < max && scan < loceol &&
6380 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6381 scan += UTF8SKIP(scan);
6382 hardcount++;
6383 }
6384 } else {
6385 while (scan < loceol && isALNUM_LC(*scan))
6386 scan++;
a0ed51b3
LW
6387 }
6388 break;
980866de 6389 case NALNUMU:
f2ed9b32 6390 if (utf8_target) {
980866de
KW
6391
6392 utf8_Nwordchar:
6393
ffc61ed2 6394 loceol = PL_regeol;
1a4fad37 6395 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6396 while (hardcount < max && scan < loceol &&
980866de 6397 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6398 {
ffc61ed2
JH
6399 scan += UTF8SKIP(scan);
6400 hardcount++;
6401 }
980866de 6402 } else {
a12cf05f
KW
6403 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6404 scan++;
6405 }
980866de
KW
6406 }
6407 break;
6408 case NALNUM:
6409 if (utf8_target)
6410 goto utf8_Nwordchar;
6411 while (scan < loceol && ! isALNUM((U8) *scan)) {
6412 scan++;
a0ed51b3
LW
6413 }
6414 break;
0658cdde
KW
6415
6416 case POSIXA:
6417 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6418 scan++;
6419 }
6420 break;
6421 case NPOSIXA:
6422 if (utf8_target) {
6423 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6424 scan += UTF8SKIP(scan);
6425 }
6426 }
6427 else {
6428 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6429 scan++;
6430 }
6431 }
6432 break;
cfaf538b
KW
6433 case NALNUMA:
6434 if (utf8_target) {
6435 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6436 scan += UTF8SKIP(scan);
6437 }
6438 }
6439 else {
6440 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6441 scan++;
6442 }
6443 }
6444 break;
bbce6d69 6445 case NALNUML:
3280af22 6446 PL_reg_flags |= RF_tainted;
f2ed9b32 6447 if (utf8_target) {
ffc61ed2 6448 loceol = PL_regeol;
1aa99e6b
IH
6449 while (hardcount < max && scan < loceol &&
6450 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6451 scan += UTF8SKIP(scan);
6452 hardcount++;
6453 }
6454 } else {
6455 while (scan < loceol && !isALNUM_LC(*scan))
6456 scan++;
a0ed51b3
LW
6457 }
6458 break;
980866de 6459 case SPACEU:
f2ed9b32 6460 if (utf8_target) {
980866de
KW
6461
6462 utf8_space:
6463
ffc61ed2 6464 loceol = PL_regeol;
1a4fad37 6465 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6466 while (hardcount < max && scan < loceol &&
3568d838 6467 (*scan == ' ' ||
a12cf05f
KW
6468 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6469 {
ffc61ed2
JH
6470 scan += UTF8SKIP(scan);
6471 hardcount++;
6472 }
980866de
KW
6473 break;
6474 }
6475 else {
a12cf05f
KW
6476 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6477 scan++;
6478 }
980866de
KW
6479 break;
6480 }
6481 case SPACE:
6482 if (utf8_target)
6483 goto utf8_space;
6484
6485 while (scan < loceol && isSPACE((U8) *scan)) {
6486 scan++;
a0ed51b3
LW
6487 }
6488 break;
cfaf538b
KW
6489 case SPACEA:
6490 while (scan < loceol && isSPACE_A((U8) *scan)) {
6491 scan++;
6492 }
6493 break;
bbce6d69 6494 case SPACEL:
3280af22 6495 PL_reg_flags |= RF_tainted;
f2ed9b32 6496 if (utf8_target) {
ffc61ed2 6497 loceol = PL_regeol;
1aa99e6b 6498 while (hardcount < max && scan < loceol &&
6bbba904 6499 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6500 scan += UTF8SKIP(scan);
6501 hardcount++;
6502 }
6503 } else {
6504 while (scan < loceol && isSPACE_LC(*scan))
6505 scan++;
a0ed51b3
LW
6506 }
6507 break;
980866de 6508 case NSPACEU:
f2ed9b32 6509 if (utf8_target) {
980866de
KW
6510
6511 utf8_Nspace:
6512
ffc61ed2 6513 loceol = PL_regeol;
1a4fad37 6514 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6515 while (hardcount < max && scan < loceol &&
980866de
KW
6516 ! (*scan == ' ' ||
6517 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6518 {
ffc61ed2
JH
6519 scan += UTF8SKIP(scan);
6520 hardcount++;
6521 }
980866de
KW
6522 break;
6523 }
6524 else {
a12cf05f
KW
6525 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6526 scan++;
6527 }
980866de
KW
6528 }
6529 break;
6530 case NSPACE:
6531 if (utf8_target)
6532 goto utf8_Nspace;
6533
6534 while (scan < loceol && ! isSPACE((U8) *scan)) {
6535 scan++;
a0ed51b3 6536 }
0008a298 6537 break;
cfaf538b
KW
6538 case NSPACEA:
6539 if (utf8_target) {
6540 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6541 scan += UTF8SKIP(scan);
6542 }
6543 }
6544 else {
6545 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6546 scan++;
6547 }
6548 }
6549 break;
bbce6d69 6550 case NSPACEL:
3280af22 6551 PL_reg_flags |= RF_tainted;
f2ed9b32 6552 if (utf8_target) {
ffc61ed2 6553 loceol = PL_regeol;
1aa99e6b 6554 while (hardcount < max && scan < loceol &&
6bbba904 6555 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6556 scan += UTF8SKIP(scan);
6557 hardcount++;
6558 }
6559 } else {
6560 while (scan < loceol && !isSPACE_LC(*scan))
6561 scan++;
a0ed51b3
LW
6562 }
6563 break;
a0d0e21e 6564 case DIGIT:
f2ed9b32 6565 if (utf8_target) {
ffc61ed2 6566 loceol = PL_regeol;
1a4fad37 6567 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6568 while (hardcount < max && scan < loceol &&
f2ed9b32 6569 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6570 scan += UTF8SKIP(scan);
6571 hardcount++;
6572 }
6573 } else {
6574 while (scan < loceol && isDIGIT(*scan))
6575 scan++;
a0ed51b3
LW
6576 }
6577 break;
cfaf538b
KW
6578 case DIGITA:
6579 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6580 scan++;
6581 }
6582 break;
b77393f6
KW
6583 case DIGITL:
6584 PL_reg_flags |= RF_tainted;
6585 if (utf8_target) {
6586 loceol = PL_regeol;
6587 while (hardcount < max && scan < loceol &&
6588 isDIGIT_LC_utf8((U8*)scan)) {
6589 scan += UTF8SKIP(scan);
6590 hardcount++;
6591 }
6592 } else {
6593 while (scan < loceol && isDIGIT_LC(*scan))
6594 scan++;
6595 }
6596 break;
a0d0e21e 6597 case NDIGIT:
f2ed9b32 6598 if (utf8_target) {
ffc61ed2 6599 loceol = PL_regeol;
1a4fad37 6600 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6601 while (hardcount < max && scan < loceol &&
f2ed9b32 6602 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6603 scan += UTF8SKIP(scan);
6604 hardcount++;
6605 }
6606 } else {
6607 while (scan < loceol && !isDIGIT(*scan))
6608 scan++;
a0ed51b3 6609 }
cfaf538b
KW
6610 break;
6611 case NDIGITA:
6612 if (utf8_target) {
6613 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6614 scan += UTF8SKIP(scan);
6615 }
6616 }
6617 else {
6618 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6619 scan++;
6620 }
6621 }
6622 break;
b77393f6
KW
6623 case NDIGITL:
6624 PL_reg_flags |= RF_tainted;
6625 if (utf8_target) {
6626 loceol = PL_regeol;
6627 while (hardcount < max && scan < loceol &&
6628 !isDIGIT_LC_utf8((U8*)scan)) {
6629 scan += UTF8SKIP(scan);
6630 hardcount++;
6631 }
6632 } else {
6633 while (scan < loceol && !isDIGIT_LC(*scan))
6634 scan++;
6635 }
6636 break;
e1d1eefb 6637 case LNBREAK:
f2ed9b32 6638 if (utf8_target) {
e1d1eefb
YO
6639 loceol = PL_regeol;
6640 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6641 scan += c;
6642 hardcount++;
6643 }
6644 } else {
6645 /*
6646 LNBREAK can match two latin chars, which is ok,
6647 because we have a null terminated string, but we
6648 have to use hardcount in this situation
6649 */
6650 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6651 scan+=c;
6652 hardcount++;
6653 }
6654 }
6655 break;
6656 case HORIZWS:
f2ed9b32 6657 if (utf8_target) {
e1d1eefb
YO
6658 loceol = PL_regeol;
6659 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6660 scan += c;
6661 hardcount++;
6662 }
6663 } else {
6664 while (scan < loceol && is_HORIZWS_latin1(scan))
6665 scan++;
6666 }
a0ed51b3 6667 break;
e1d1eefb 6668 case NHORIZWS:
f2ed9b32 6669 if (utf8_target) {
e1d1eefb
YO
6670 loceol = PL_regeol;
6671 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6672 scan += UTF8SKIP(scan);
6673 hardcount++;
6674 }
6675 } else {
6676 while (scan < loceol && !is_HORIZWS_latin1(scan))
6677 scan++;
6678
6679 }
6680 break;
6681 case VERTWS:
f2ed9b32 6682 if (utf8_target) {
e1d1eefb
YO
6683 loceol = PL_regeol;
6684 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6685 scan += c;
6686 hardcount++;
6687 }
6688 } else {
6689 while (scan < loceol && is_VERTWS_latin1(scan))
6690 scan++;
6691
6692 }
6693 break;
6694 case NVERTWS:
f2ed9b32 6695 if (utf8_target) {
e1d1eefb
YO
6696 loceol = PL_regeol;
6697 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6698 scan += UTF8SKIP(scan);
6699 hardcount++;
6700 }
6701 } else {
6702 while (scan < loceol && !is_VERTWS_latin1(scan))
6703 scan++;
6704
6705 }
6706 break;
6707
a0d0e21e
LW
6708 default: /* Called on something of 0 width. */
6709 break; /* So match right here or not at all. */
6710 }
a687059c 6711
a0ed51b3
LW
6712 if (hardcount)
6713 c = hardcount;
6714 else
6715 c = scan - PL_reginput;
3280af22 6716 PL_reginput = scan;
a687059c 6717
a3621e74 6718 DEBUG_r({
e68ec53f 6719 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6720 DEBUG_EXECUTE_r({
e68ec53f
YO
6721 SV * const prop = sv_newmortal();
6722 regprop(prog, prop, p);
6723 PerlIO_printf(Perl_debug_log,
be8e71aa 6724 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6725 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6726 });
be8e71aa 6727 });
9041c2e3 6728
a0d0e21e 6729 return(c);
a687059c
LW
6730}
6731
c277df42 6732
be8e71aa 6733#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6734/*
6c6525b8
KW
6735- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6736create a copy so that changes the caller makes won't change the shared one
6737 */
ffc61ed2 6738SV *
32fc9b6a 6739Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6740{
6c6525b8
KW
6741 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6742 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6743}
6744#endif
6745
6746STATIC SV *
6747S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6748{
8c9eb58f
KW
6749 /* Returns the swash for the input 'node' in the regex 'prog'.
6750 * If <doinit> is true, will attempt to create the swash if not already
6751 * done.
6752 * If <listsvp> is non-null, will return the swash initialization string in
6753 * it.
6754 * If <altsvp> is non-null, will return the alternates to the regular swash
6755 * in it
6756 * Tied intimately to how regcomp.c sets up the data structure */
6757
97aff369 6758 dVAR;
9e55ce06
JH
6759 SV *sw = NULL;
6760 SV *si = NULL;
6761 SV *alt = NULL;
7a6c6baa
KW
6762 SV* invlist = NULL;
6763
f8fc2ecf
YO
6764 RXi_GET_DECL(prog,progi);
6765 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6766
6c6525b8 6767 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6768
ccb2541c
KW
6769 assert(ANYOF_NONBITMAP(node));
6770
4f639d21 6771 if (data && data->count) {
a3b680e6 6772 const U32 n = ARG(node);
ffc61ed2 6773
4f639d21 6774 if (data->what[n] == 's') {
ad64d0ec
NC
6775 SV * const rv = MUTABLE_SV(data->data[n]);
6776 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6777 SV **const ary = AvARRAY(av);
87367d5f 6778 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 6779
8c9eb58f 6780 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6781
7a6c6baa
KW
6782 /* Elements 3 and 4 are either both present or both absent. [3] is
6783 * any inversion list generated at compile time; [4] indicates if
6784 * that inversion list has any user-defined properties in it. */
6785 if (av_len(av) >= 3) {
6786 invlist = ary[3];
83199d38
KW
6787 if (SvUV(ary[4])) {
6788 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6789 }
7a6c6baa
KW
6790 }
6791 else {
6792 invlist = NULL;
7a6c6baa
KW
6793 }
6794
8c9eb58f
KW
6795 /* Element [1] is reserved for the set-up swash. If already there,
6796 * return it; if not, create it and store it there */
f192cf32
KW
6797 if (SvROK(ary[1])) {
6798 sw = ary[1];
6799 }
ffc61ed2 6800 else if (si && doinit) {
7a6c6baa
KW
6801
6802 sw = _core_swash_init("utf8", /* the utf8 package */
6803 "", /* nameless */
6804 si,
6805 1, /* binary */
6806 0, /* not from tr/// */
7a6c6baa 6807 invlist,
83199d38 6808 &swash_init_flags);
ffc61ed2
JH
6809 (void)av_store(av, 1, sw);
6810 }
8c9eb58f
KW
6811
6812 /* Element [2] is for any multi-char folds. Note that is a
6813 * fundamentally flawed design, because can't backtrack and try
6814 * again. See [perl #89774] */
f192cf32
KW
6815 if (SvTYPE(ary[2]) == SVt_PVAV) {
6816 alt = ary[2];
6817 }
ffc61ed2
JH
6818 }
6819 }
6820
7a6c6baa
KW
6821 if (listsvp) {
6822 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
6823
6824 /* Use the swash, if any, which has to have incorporated into it all
6825 * possibilities */
872dd7e0
KW
6826 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6827 && (si && si != &PL_sv_undef))
6828 {
7a6c6baa 6829
872dd7e0 6830 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
6831 sv_catsv(matches_string, si);
6832 }
6833
6834 /* Add the inversion list to whatever we have. This may have come from
6835 * the swash, or from an input parameter */
6836 if (invlist) {
6837 sv_catsv(matches_string, _invlist_contents(invlist));
6838 }
6839 *listsvp = matches_string;
6840 }
6841
9e55ce06
JH
6842 if (altsvp)
6843 *altsvp = alt;
ffc61ed2
JH
6844
6845 return sw;
6846}
6847
6848/*
ba7b4546 6849 - reginclass - determine if a character falls into a character class
832705d4 6850
6698fab5
KW
6851 n is the ANYOF regnode
6852 p is the target string
6853 lenp is pointer to the maximum number of bytes of how far to go in p
6854 (This is assumed wthout checking to always be at least the current
6855 character's size)
6856 utf8_target tells whether p is in UTF-8.
832705d4 6857
4b3cda86
KW
6858 Returns true if matched; false otherwise. If lenp is not NULL, on return
6859 from a successful match, the value it points to will be updated to how many
6860 bytes in p were matched. If there was no match, the value is undefined,
6861 possibly changed from the input.
eba1359e 6862
d5788240
KW
6863 Note that this can be a synthetic start class, a combination of various
6864 nodes, so things you think might be mutually exclusive, such as locale,
6865 aren't. It can match both locale and non-locale
6866
bbce6d69 6867 */
6868
76e3520e 6869STATIC bool
f6ad78d8 6870S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6871{
27da23d5 6872 dVAR;
a3b680e6 6873 const char flags = ANYOF_FLAGS(n);
bbce6d69 6874 bool match = FALSE;
cc07378b 6875 UV c = *p;
f7ab54c6 6876 STRLEN c_len = 0;
6698fab5 6877 STRLEN maxlen;
1aa99e6b 6878
7918f24d
NC
6879 PERL_ARGS_ASSERT_REGINCLASS;
6880
4b3cda86 6881 /* If c is not already the code point, get it */
f2ed9b32 6882 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6883 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6884 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6885 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6886 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6887 * UTF8_ALLOW_FFFF */
f7ab54c6 6888 if (c_len == (STRLEN)-1)
e8a70c6f 6889 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6890 }
a5a291f5
KW
6891 else {
6892 c_len = 1;
6893 }
bbce6d69 6894
a5a291f5
KW
6895 /* Use passed in max length, or one character if none passed in or less
6896 * than one character. And assume will match just one character. This is
6897 * overwritten later if matched more. */
4b3cda86 6898 if (lenp) {
a5a291f5
KW
6899 maxlen = (*lenp > c_len) ? *lenp : c_len;
6900 *lenp = c_len;
4b3cda86
KW
6901
6902 }
6903 else {
a5a291f5 6904 maxlen = c_len;
4b3cda86
KW
6905 }
6906
7cdde544
KW
6907 /* If this character is potentially in the bitmap, check it */
6908 if (c < 256) {
ffc61ed2
JH
6909 if (ANYOF_BITMAP_TEST(n, c))
6910 match = TRUE;
11454c59
KW
6911 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6912 && ! utf8_target
6913 && ! isASCII(c))
6914 {
6915 match = TRUE;
6916 }
a0ed51b3 6917
78969a98
KW
6918 else if (flags & ANYOF_LOCALE) {
6919 PL_reg_flags |= RF_tainted;
6920
6921 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6922 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6923 {
ffc61ed2 6924 match = TRUE;
78969a98 6925 }
040aea3a
KW
6926 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6927 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6928 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6929 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6930 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6931 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6932 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6933 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6934 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6935 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6936 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
6937 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6938 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
6939 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6940 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6941 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6942 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6943 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6944 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6945 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6946 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6947 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6948 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6949 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6950 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6951 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6952 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6953 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6954 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
6955 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6956 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 6957 ) /* How's that for a conditional? */
78969a98 6958 ) {
ffc61ed2
JH
6959 match = TRUE;
6960 }
a0ed51b3 6961 }
a0ed51b3
LW
6962 }
6963
7cdde544 6964 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6965 * bitmap could match, try that. Locale nodes specifiy completely the
6966 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6967 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6968 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6969 * could be a combination of locale and non-locale nodes. So allow locale
6970 * to match for the synthetic start class, which will give a false
6971 * positive that will be resolved when the match is done again as not part
6972 * of the synthetic start class */
ef87b810 6973 if (!match) {
10ee90d2
KW
6974 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6975 match = TRUE; /* Everything above 255 matches */
e051a21d 6976 }
6f8d7d0d
KW
6977 else if (ANYOF_NONBITMAP(n)
6978 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6979 || (utf8_target
6980 && (c >=256
6981 || (! (flags & ANYOF_LOCALE))
6982 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6983 {
7cdde544 6984 AV *av;
210e6c47 6985 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6986
6987 if (sw) {
6988 U8 * utf8_p;
6989 if (utf8_target) {
6990 utf8_p = (U8 *) p;
6991 } else {
f56b6394
KW
6992
6993 /* Not utf8. Convert as much of the string as available up
6994 * to the limit of how far the (single) character in the
6995 * pattern can possibly match (no need to go further). If
6996 * the node is a straight ANYOF or not folding, it can't
6997 * match more than one. Otherwise, It can match up to how
6998 * far a single char can fold to. Since not utf8, each
6999 * character is a single byte, so the max it can be in
7000 * bytes is the same as the max it can be in characters */
7001 STRLEN len = (OP(n) == ANYOF
7002 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7003 ? 1
7004 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7005 ? maxlen
7006 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
7007 utf8_p = bytes_to_utf8(p, &len);
7008 }
f56b6394 7009
f7eb7452 7010 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 7011 match = TRUE;
39065660 7012 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
7013
7014 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
7015 * matches. The non-multi char folds have all been moved to
7016 * the compilation phase, and the multi-char folds have
7017 * been stored by regcomp into 'av'; we linearly check to
7018 * see if any match the target string (folded). We know
7019 * that the originals were each one character, but we don't
7020 * currently know how many characters/bytes each folded to,
7021 * except we do know that there are small limits imposed by
7022 * Unicode. XXX A performance enhancement would be to have
7023 * regcomp.c store the max number of chars/bytes that are
7024 * in an av entry, as, say the 0th element. Even better
7025 * would be to have a hash of the few characters that can
7026 * start a multi-char fold to the max number of chars of
7027 * those folds.
f56b6394
KW
7028 *
7029 * If there is a match, we will need to advance (if lenp is
7030 * specified) the match pointer in the target string. But
7031 * what we are comparing here isn't that string directly,
7032 * but its fold, whose length may differ from the original.
7033 * As we go along in constructing the fold, therefore, we
7034 * create a map so that we know how many bytes in the
7035 * source to advance given that we have matched a certain
7036 * number of bytes in the fold. This map is stored in
6438af90
KW
7037 * 'map_fold_len_back'. Let n mean the number of bytes in
7038 * the fold of the first character that we are folding.
7039 * Then map_fold_len_back[n] is set to the number of bytes
7040 * in that first character. Similarly let m be the
7041 * corresponding number for the second character to be
7042 * folded. Then map_fold_len_back[n+m] is set to the
7043 * number of bytes occupied by the first two source
7044 * characters. ... */
7045 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
7046 U8 folded[UTF8_MAXBYTES_CASE+1];
7047 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 7048 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 7049 chars */
f56b6394
KW
7050
7051 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7052
7053 /* Here, only need to fold the first char of the target
6438af90 7054 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 7055 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 7056 total_foldlen = foldlen;
6438af90
KW
7057 map_fold_len_back[foldlen] = (utf8_target)
7058 ? UTF8SKIP(utf8_p)
7059 : 1;
f56b6394
KW
7060 }
7061 else {
7062
7063 /* Here, need to fold more than the first char. Do so
7064 * up to the limits */
f56b6394
KW
7065 U8* source_ptr = utf8_p; /* The source for the fold
7066 is the regex target
7067 string */
7068 U8* folded_ptr = folded;
7069 U8* e = utf8_p + maxlen; /* Can't go beyond last
7070 available byte in the
7071 target string */
6438af90
KW
7072 U8 i;
7073 for (i = 0;
7074 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7075 i++)
f56b6394
KW
7076 {
7077
7078 /* Fold the next character */
7079 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7080 STRLEN this_char_foldlen;
7081 to_utf8_fold(source_ptr,
7082 this_char_folded,
7083 &this_char_foldlen);
7084
7085 /* Bail if it would exceed the byte limit for
7086 * folding a single char. */
7087 if (this_char_foldlen + folded_ptr - folded >
7088 UTF8_MAXBYTES_CASE)
7089 {
7090 break;
7091 }
7092
6438af90 7093 /* Add the fold of this character */
f56b6394
KW
7094 Copy(this_char_folded,
7095 folded_ptr,
7096 this_char_foldlen,
7097 U8);
f56b6394 7098 source_ptr += UTF8SKIP(source_ptr);
6438af90 7099 folded_ptr += this_char_foldlen;
deba3d96 7100 total_foldlen = folded_ptr - folded;
6438af90
KW
7101
7102 /* Create map from the number of bytes in the fold
7103 * back to the number of bytes in the source. If
7104 * the source isn't utf8, the byte count is just
7105 * the number of characters so far */
deba3d96 7106 map_fold_len_back[total_foldlen]
6438af90
KW
7107 = (utf8_target)
7108 ? source_ptr - utf8_p
7109 : i + 1;
f56b6394
KW
7110 }
7111 *folded_ptr = '\0';
f56b6394
KW
7112 }
7113
7114
7115 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
7116 * of multi-char folds. */
7117 if (av) {
7cdde544
KW
7118 I32 i;
7119 for (i = 0; i <= av_len(av); i++) {
7120 SV* const sv = *av_fetch(av, i, FALSE);
7121 STRLEN len;
7122 const char * const s = SvPV_const(sv, len);
6438af90 7123
f9126265
KW
7124 if (len <= total_foldlen
7125 && memEQ(s, (char*)folded, len)
7126
7127 /* If 0, means matched a partial char. See
7128 * [perl #90536] */
7129 && map_fold_len_back[len])
f56b6394
KW
7130 {
7131
7132 /* Advance the target string ptr to account for
7133 * this fold, but have to translate from the
7134 * folded length to the corresponding source
6438af90 7135 * length. */
8f2655f7
KW
7136 if (lenp) {
7137 *lenp = map_fold_len_back[len];
8f2655f7 7138 }
7cdde544
KW
7139 match = TRUE;
7140 break;
7141 }
7142 }
7143 }
7cdde544
KW
7144 }
7145
7146 /* If we allocated a string above, free it */
7147 if (! utf8_target) Safefree(utf8_p);
7148 }
7149 }
5073ffbd
KW
7150
7151 if (UNICODE_IS_SUPER(c)
7152 && (flags & ANYOF_WARN_SUPER)
7153 && ckWARN_d(WARN_NON_UNICODE))
7154 {
7155 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7156 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7157 }
7cdde544
KW
7158 }
7159
f0fdc1c9
KW
7160 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7161 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7162}
161b471a 7163
dfe13c55 7164STATIC U8 *
0ce71af7 7165S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7166{
6af86488
KW
7167 /* return the position 'off' UTF-8 characters away from 's', forward if
7168 * 'off' >= 0, backwards if negative. But don't go outside of position
7169 * 'lim', which better be < s if off < 0 */
7170
97aff369 7171 dVAR;
7918f24d
NC
7172
7173 PERL_ARGS_ASSERT_REGHOP3;
7174
a0ed51b3 7175 if (off >= 0) {
1aa99e6b 7176 while (off-- && s < lim) {
ffc61ed2 7177 /* XXX could check well-formedness here */
a0ed51b3 7178 s += UTF8SKIP(s);
ffc61ed2 7179 }
a0ed51b3
LW
7180 }
7181 else {
1de06328
YO
7182 while (off++ && s > lim) {
7183 s--;
7184 if (UTF8_IS_CONTINUED(*s)) {
7185 while (s > lim && UTF8_IS_CONTINUATION(*s))
7186 s--;
a0ed51b3 7187 }
1de06328 7188 /* XXX could check well-formedness here */
a0ed51b3
LW
7189 }
7190 }
7191 return s;
7192}
161b471a 7193
f9f4320a
YO
7194#ifdef XXX_dmq
7195/* there are a bunch of places where we use two reghop3's that should
7196 be replaced with this routine. but since thats not done yet
7197 we ifdef it out - dmq
7198*/
dfe13c55 7199STATIC U8 *
1de06328
YO
7200S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7201{
7202 dVAR;
7918f24d
NC
7203
7204 PERL_ARGS_ASSERT_REGHOP4;
7205
1de06328
YO
7206 if (off >= 0) {
7207 while (off-- && s < rlim) {
7208 /* XXX could check well-formedness here */
7209 s += UTF8SKIP(s);
7210 }
7211 }
7212 else {
7213 while (off++ && s > llim) {
7214 s--;
7215 if (UTF8_IS_CONTINUED(*s)) {
7216 while (s > llim && UTF8_IS_CONTINUATION(*s))
7217 s--;
7218 }
7219 /* XXX could check well-formedness here */
7220 }
7221 }
7222 return s;
7223}
f9f4320a 7224#endif
1de06328
YO
7225
7226STATIC U8 *
0ce71af7 7227S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7228{
97aff369 7229 dVAR;
7918f24d
NC
7230
7231 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7232
a0ed51b3 7233 if (off >= 0) {
1aa99e6b 7234 while (off-- && s < lim) {
ffc61ed2 7235 /* XXX could check well-formedness here */
a0ed51b3 7236 s += UTF8SKIP(s);
ffc61ed2 7237 }
a0ed51b3 7238 if (off >= 0)
3dab1dad 7239 return NULL;
a0ed51b3
LW
7240 }
7241 else {
1de06328
YO
7242 while (off++ && s > lim) {
7243 s--;
7244 if (UTF8_IS_CONTINUED(*s)) {
7245 while (s > lim && UTF8_IS_CONTINUATION(*s))
7246 s--;
a0ed51b3 7247 }
1de06328 7248 /* XXX could check well-formedness here */
a0ed51b3
LW
7249 }
7250 if (off <= 0)
3dab1dad 7251 return NULL;
a0ed51b3
LW
7252 }
7253 return s;
7254}
51371543 7255
51371543 7256static void
acfe0abc 7257restore_pos(pTHX_ void *arg)
51371543 7258{
97aff369 7259 dVAR;
097eb12c 7260 regexp * const rex = (regexp *)arg;
ed301438 7261 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7262 if (PL_reg_oldsaved) {
4f639d21
DM
7263 rex->subbeg = PL_reg_oldsaved;
7264 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7265 rex->suboffset = PL_reg_oldsavedoffset;
7266 rex->subcoffset = PL_reg_oldsavedcoffset;
f8c7b90f 7267#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7268 rex->saved_copy = PL_nrs;
ed252734 7269#endif
07bc277f 7270 RXp_MATCH_COPIED_on(rex);
51371543
GS
7271 }
7272 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7273 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7274 PL_curpm = PL_reg_oldcurpm;
7275 }
7276}
33b8afdf
JH
7277
7278STATIC void
7279S_to_utf8_substr(pTHX_ register regexp *prog)
7280{
a1cac82e 7281 int i = 1;
7918f24d
NC
7282
7283 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7284
a1cac82e
NC
7285 do {
7286 if (prog->substrs->data[i].substr
7287 && !prog->substrs->data[i].utf8_substr) {
7288 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7289 prog->substrs->data[i].utf8_substr = sv;
7290 sv_utf8_upgrade(sv);
610460f9 7291 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7292 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7293 /* Trim the trailing \n that fbm_compile added last
7294 time. */
7295 SvCUR_set(sv, SvCUR(sv) - 1);
7296 /* Whilst this makes the SV technically "invalid" (as its
7297 buffer is no longer followed by "\0") when fbm_compile()
7298 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7299 fbm_compile(sv, FBMcf_TAIL);
7300 } else
7301 fbm_compile(sv, 0);
610460f9 7302 }
a1cac82e
NC
7303 if (prog->substrs->data[i].substr == prog->check_substr)
7304 prog->check_utf8 = sv;
7305 }
7306 } while (i--);
33b8afdf
JH
7307}
7308
7309STATIC void
7310S_to_byte_substr(pTHX_ register regexp *prog)
7311{
97aff369 7312 dVAR;
a1cac82e 7313 int i = 1;
7918f24d
NC
7314
7315 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7316
a1cac82e
NC
7317 do {
7318 if (prog->substrs->data[i].utf8_substr
7319 && !prog->substrs->data[i].substr) {
7320 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7321 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7322 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7323 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7324 /* Trim the trailing \n that fbm_compile added last
7325 time. */
7326 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7327 fbm_compile(sv, FBMcf_TAIL);
7328 } else
7329 fbm_compile(sv, 0);
7330 }
a1cac82e
NC
7331 } else {
7332 SvREFCNT_dec(sv);
7333 sv = &PL_sv_undef;
7334 }
7335 prog->substrs->data[i].substr = sv;
7336 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7337 prog->check_substr = sv;
33b8afdf 7338 }
a1cac82e 7339 } while (i--);
33b8afdf 7340}
66610fdd
RGS
7341
7342/*
7343 * Local variables:
7344 * c-indentation-style: bsd
7345 * c-basic-offset: 4
14d04a33 7346 * indent-tabs-mode: nil
66610fdd
RGS
7347 * End:
7348 *
14d04a33 7349 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7350 */