This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Use gv_fetchpvs() instead of gv_fetchpv(), and GV_ADD, not TRUE.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
ef07e810 86#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 87#define RF_warned 2 /* warned about big count? */
faec1544 88
ab3bbdeb 89#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 90
f2ed9b32 91#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 92
a687059c
LW
93#ifndef STATIC
94#define STATIC static
95#endif
96
7e2509c1
KW
97/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
af364d03
KW
100#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
f2ed9b32 107#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
3dab1dad
YO
110#define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
52657f30 112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
113 : (U8*)(pos + off))
114#define HOPBACKc(pos, off) \
07be1b83
YO
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
8e11feef 118 ? (U8*)pos - off \
3dab1dad 119 : NULL)
efb30f32 120
e7409c1b 121#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 122#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 123
20d0b1e9 124/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 125#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 126 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 127 bool ok; \
9c4fdda1 128 ENTER; save_re_context(); \
cf54f63a
JL
129 ok=CAT2(is_utf8_,class)((const U8*)str); \
130 PERL_UNUSED_VAR(ok); \
131 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
132/* Doesn't do an assert to verify that is correct */
133#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 134 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 135 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
136 ENTER; save_re_context(); \
137 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
138 LEAVE; } } STMT_END
37e2e78e 139
1a4fad37
AL
140#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
141#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
142#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 143
37e2e78e 144#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
145 /* No asserts are done for some of these, in case called on a */ \
146 /* Unicode version in which they map to nothing */ \
27d4fc33 147 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 148 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 149
1dcf4a1b 150#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 151
ee9a90b8
KW
152/* The actual code for CCC_TRY, which uses several variables from the routine
153 * it's callable from. It is designed to be the bulk of a case statement.
154 * FUNC is the macro or function to call on non-utf8 targets that indicate if
155 * nextchr matches the class.
156 * UTF8_TEST is the whole test string to use for utf8 targets
157 * LOAD is what to use to test, and if not present to load in the swash for the
158 * class
159 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
160 * UTF8_TEST test.
161 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
162 * utf8 and a variant, load the swash if necessary and test using the utf8
163 * test. Advance to the next character if test is ok, otherwise fail; If not
164 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
165 * fails, or advance to the next character */
166
167#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
168 if (locinput >= PL_regeol) { \
169 sayNO; \
170 } \
171 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
172 LOAD_UTF8_CHARCLASS(CLASS, STR); \
173 if (POS_OR_NEG (UTF8_TEST)) { \
174 sayNO; \
175 } \
176 locinput += PL_utf8skip[nextchr]; \
177 nextchr = UCHARAT(locinput); \
178 break; \
179 } \
180 if (POS_OR_NEG (FUNC(nextchr))) { \
181 sayNO; \
182 } \
183 nextchr = UCHARAT(++locinput); \
980866de
KW
184 break;
185
ee9a90b8
KW
186/* Handle the non-locale cases for a character class and its complement. It
187 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
188 * This is because that code fails when the test succeeds, so we want to have
189 * the test fail so that the code succeeds. The swash is stored in a
190 * predictable PL_ place */
cfaf538b
KW
191#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
192 CLASS, STR) \
ee9a90b8
KW
193 case NAME: \
194 _CCC_TRY_CODE( !, FUNC, \
195 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
196 (U8*)locinput, TRUE)), \
197 CLASS, STR) \
198 case NNAME: \
1dcf4a1b 199 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
200 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
201 (U8*)locinput, TRUE)), \
202 CLASS, STR) \
203
204/* Generate the case statements for both locale and non-locale character
205 * classes in regmatch for classes that don't have special unicode semantics.
206 * Locales don't use an immediate swash, but an intermediary special locale
207 * function that is called on the pointer to the current place in the input
208 * string. That function will resolve to needing the same swash. One might
209 * think that because we don't know what the locale will match, we shouldn't
210 * check with the swash loading function that it loaded properly; ie, that we
211 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
212 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
213 * irrelevant here */
214#define CCC_TRY(NAME, NNAME, FUNC, \
215 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 216 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
217 CLASS, STR) \
218 case NAMEL: \
219 PL_reg_flags |= RF_tainted; \
220 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
221 case NNAMEL: \
222 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
223 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
224 CLASS, STR) \
cfaf538b
KW
225 case NAMEA: \
226 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
227 sayNO; \
228 } \
229 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
230 nextchr = UCHARAT(++locinput); \
231 break; \
232 case NNAMEA: \
233 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
234 sayNO; \
235 } \
236 if (utf8_target) { \
237 locinput += PL_utf8skip[nextchr]; \
238 nextchr = UCHARAT(locinput); \
239 } \
240 else { \
241 nextchr = UCHARAT(++locinput); \
242 } \
243 break; \
ee9a90b8
KW
244 /* Generate the non-locale cases */ \
245 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
246
247/* This is like CCC_TRY, but has an extra set of parameters for generating case
248 * statements to handle separate Unicode semantics nodes */
249#define CCC_TRY_U(NAME, NNAME, FUNC, \
250 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
251 NAMEU, NNAMEU, FUNCU, \
cfaf538b 252 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 253 CLASS, STR) \
cfaf538b
KW
254 CCC_TRY(NAME, NNAME, FUNC, \
255 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
256 NAMEA, NNAMEA, FUNCA, \
257 CLASS, STR) \
ee9a90b8 258 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 259
3dab1dad
YO
260/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
261
5f80c4cf 262/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
263/* it would be nice to rework regcomp.sym to generate this stuff. sigh
264 *
265 * NOTE that *nothing* that affects backtracking should be in here, specifically
266 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
267 * node that is in between two EXACT like nodes when ascertaining what the required
268 * "follow" character is. This should probably be moved to regex compile time
269 * although it may be done at run time beause of the REF possibility - more
270 * investigation required. -- demerphq
271*/
3e901dc0
YO
272#define JUMPABLE(rn) ( \
273 OP(rn) == OPEN || \
274 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
275 OP(rn) == EVAL || \
cca55fe3
JP
276 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
277 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 278 OP(rn) == KEEPS || \
3dab1dad 279 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 280)
ee9b8eae 281#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 282
ee9b8eae
YO
283#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
284
285#if 0
286/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
287 we don't need this definition. */
288#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 289#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
290#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
291
292#else
293/* ... so we use this as its faster. */
294#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 295#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
296#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
297#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
298
299#endif
e2d8ce26 300
a84d97b6
HS
301/*
302 Search for mandatory following text node; for lookahead, the text must
303 follow but for lookbehind (rn->flags != 0) we skip to the next step.
304*/
cca55fe3 305#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
306 while (JUMPABLE(rn)) { \
307 const OPCODE type = OP(rn); \
308 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 309 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 310 else if (type == PLUS) \
cca55fe3 311 rn = NEXTOPER(rn); \
3dab1dad 312 else if (type == IFMATCH) \
a84d97b6 313 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 314 else rn += NEXT_OFF(rn); \
3dab1dad 315 } \
5f80c4cf 316} STMT_END
74750237 317
c476f425 318
acfe0abc 319static void restore_pos(pTHX_ void *arg);
51371543 320
87c0511b 321#define REGCP_PAREN_ELEMS 3
f067efbf 322#define REGCP_OTHER_ELEMS 3
e0fa7e2b 323#define REGCP_FRAME_ELEMS 1
620d5b66
NC
324/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
325 * are needed for the regexp context stack bookkeeping. */
326
76e3520e 327STATIC CHECKPOINT
b93070ed 328S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 329{
97aff369 330 dVAR;
a3b680e6 331 const int retval = PL_savestack_ix;
a3b680e6 332 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
333 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
334 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 335 I32 p;
40a82448 336 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 337
b93070ed
DM
338 PERL_ARGS_ASSERT_REGCPPUSH;
339
e49a9654 340 if (paren_elems_to_push < 0)
5637ef5b
NC
341 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
342 paren_elems_to_push);
e49a9654 343
e0fa7e2b
NC
344 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
345 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
346 " out of range (%lu-%ld)",
347 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 348
620d5b66 349 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 350
495f47a5
DM
351 DEBUG_BUFFERS_r(
352 if ((int)PL_regsize > (int)parenfloor)
353 PerlIO_printf(Perl_debug_log,
354 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
355 PTR2UV(rex),
356 PTR2UV(rex->offs)
357 );
358 );
87c0511b 359 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 360/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
361 SSPUSHINT(rex->offs[p].end);
362 SSPUSHINT(rex->offs[p].start);
1ca2007e 363 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 364 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
365 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
366 (UV)p,
367 (IV)rex->offs[p].start,
368 (IV)rex->offs[p].start_tmp,
369 (IV)rex->offs[p].end
40a82448 370 ));
a0d0e21e 371 }
b1ce53c5 372/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 373 SSPUSHINT(PL_regsize);
b93070ed
DM
374 SSPUSHINT(rex->lastparen);
375 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 376 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 377
a0d0e21e
LW
378 return retval;
379}
380
c277df42 381/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
382#define REGCP_SET(cp) \
383 DEBUG_STATE_r( \
ab3bbdeb 384 PerlIO_printf(Perl_debug_log, \
e4f74956 385 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
386 (IV)PL_savestack_ix)); \
387 cp = PL_savestack_ix
c3464db5 388
ab3bbdeb 389#define REGCP_UNWIND(cp) \
e4f74956 390 DEBUG_STATE_r( \
ab3bbdeb 391 if (cp != PL_savestack_ix) \
e4f74956
YO
392 PerlIO_printf(Perl_debug_log, \
393 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
394 (IV)(cp), (IV)PL_savestack_ix)); \
395 regcpblow(cp)
c277df42 396
a8d1f4b4
DM
397#define UNWIND_PAREN(lp, lcp) \
398 for (n = rex->lastparen; n > lp; n--) \
399 rex->offs[n].end = -1; \
400 rex->lastparen = n; \
401 rex->lastcloseparen = lcp;
402
403
f067efbf 404STATIC void
b93070ed 405S_regcppop(pTHX_ regexp *rex)
a0d0e21e 406{
97aff369 407 dVAR;
e0fa7e2b 408 UV i;
87c0511b 409 U32 paren;
a3621e74
YO
410 GET_RE_DEBUG_FLAGS_DECL;
411
7918f24d
NC
412 PERL_ARGS_ASSERT_REGCPPOP;
413
b1ce53c5 414 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 415 i = SSPOPUV;
e0fa7e2b
NC
416 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
417 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
418 rex->lastcloseparen = SSPOPINT;
419 rex->lastparen = SSPOPINT;
3280af22 420 PL_regsize = SSPOPINT;
b1ce53c5 421
620d5b66 422 i -= REGCP_OTHER_ELEMS;
b1ce53c5 423 /* Now restore the parentheses context. */
495f47a5
DM
424 DEBUG_BUFFERS_r(
425 if (i || rex->lastparen + 1 <= rex->nparens)
426 PerlIO_printf(Perl_debug_log,
427 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
428 PTR2UV(rex),
429 PTR2UV(rex->offs)
430 );
431 );
87c0511b 432 paren = PL_regsize;
620d5b66 433 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 434 I32 tmps;
1ca2007e 435 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 436 rex->offs[paren].start = SSPOPINT;
cf93c79d 437 tmps = SSPOPINT;
b93070ed
DM
438 if (paren <= rex->lastparen)
439 rex->offs[paren].end = tmps;
495f47a5
DM
440 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
441 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
442 (UV)paren,
443 (IV)rex->offs[paren].start,
444 (IV)rex->offs[paren].start_tmp,
445 (IV)rex->offs[paren].end,
446 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 447 );
87c0511b 448 paren--;
a0d0e21e 449 }
daf18116 450#if 1
dafc8851
JH
451 /* It would seem that the similar code in regtry()
452 * already takes care of this, and in fact it is in
453 * a better location to since this code can #if 0-ed out
454 * but the code in regtry() is needed or otherwise tests
455 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
456 * (as of patchlevel 7877) will fail. Then again,
457 * this code seems to be necessary or otherwise
225593e1
DM
458 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
459 * --jhi updated by dapm */
b93070ed 460 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 461 if (i > PL_regsize)
b93070ed
DM
462 rex->offs[i].start = -1;
463 rex->offs[i].end = -1;
495f47a5
DM
464 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
465 " \\%"UVuf": %s ..-1 undeffing\n",
466 (UV)i,
467 (i > PL_regsize) ? "-1" : " "
468 ));
a0d0e21e 469 }
dafc8851 470#endif
a0d0e21e
LW
471}
472
74088413
DM
473/* restore the parens and associated vars at savestack position ix,
474 * but without popping the stack */
475
476STATIC void
477S_regcp_restore(pTHX_ regexp *rex, I32 ix)
478{
479 I32 tmpix = PL_savestack_ix;
480 PL_savestack_ix = ix;
481 regcppop(rex);
482 PL_savestack_ix = tmpix;
483}
484
02db2b7b 485#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 486
a687059c 487/*
e50aee73 488 * pregexec and friends
a687059c
LW
489 */
490
76234dfb 491#ifndef PERL_IN_XSUB_RE
a687059c 492/*
c277df42 493 - pregexec - match a regexp against a string
a687059c 494 */
c277df42 495I32
49d7dfbc 496Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 497 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
498/* stringarg: the point in the string at which to begin matching */
499/* strend: pointer to null at end of string */
500/* strbeg: real beginning of string */
501/* minend: end of match must be >= minend bytes after stringarg. */
502/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
503 * itself is accessed via the pointers above */
504/* nosave: For optimizations. */
c277df42 505{
7918f24d
NC
506 PERL_ARGS_ASSERT_PREGEXEC;
507
c277df42 508 return
9041c2e3 509 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
510 nosave ? 0 : REXEC_COPY_STR);
511}
76234dfb 512#endif
22e551b9 513
9041c2e3 514/*
cad2e5aa
JH
515 * Need to implement the following flags for reg_anch:
516 *
517 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
518 * USE_INTUIT_ML
519 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
520 * INTUIT_AUTORITATIVE_ML
521 * INTUIT_ONCE_NOML - Intuit can match in one location only.
522 * INTUIT_ONCE_ML
523 *
524 * Another flag for this function: SECOND_TIME (so that float substrs
525 * with giant delta may be not rechecked).
526 */
527
528/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
529
3f7c398e 530/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
531 Otherwise, only SvCUR(sv) is used to get strbeg. */
532
533/* XXXX We assume that strpos is strbeg unless sv. */
534
6eb5f6b9
JH
535/* XXXX Some places assume that there is a fixed substring.
536 An update may be needed if optimizer marks as "INTUITable"
537 RExen without fixed substrings. Similarly, it is assumed that
538 lengths of all the strings are no more than minlen, thus they
539 cannot come from lookahead.
40d049e4
YO
540 (Or minlen should take into account lookahead.)
541 NOTE: Some of this comment is not correct. minlen does now take account
542 of lookahead/behind. Further research is required. -- demerphq
543
544*/
6eb5f6b9 545
2c2d71f5
JH
546/* A failure to find a constant substring means that there is no need to make
547 an expensive call to REx engine, thus we celebrate a failure. Similarly,
548 finding a substring too deep into the string means that less calls to
30944b6d
IZ
549 regtry() should be needed.
550
551 REx compiler's optimizer found 4 possible hints:
552 a) Anchored substring;
553 b) Fixed substring;
554 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 555 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 556 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
557 string which does not contradict any of them.
558 */
2c2d71f5 559
6eb5f6b9
JH
560/* Most of decisions we do here should have been done at compile time.
561 The nodes of the REx which we used for the search should have been
562 deleted from the finite automaton. */
563
cad2e5aa 564char *
288b8c02 565Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 566 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 567{
97aff369 568 dVAR;
288b8c02 569 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 570 I32 start_shift = 0;
cad2e5aa 571 /* Should be nonnegative! */
eb578fdb
KW
572 I32 end_shift = 0;
573 char *s;
574 SV *check;
a1933d95 575 char *strbeg;
cad2e5aa 576 char *t;
f2ed9b32 577 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 578 I32 ml_anch;
eb578fdb 579 char *other_last = NULL; /* other substr checked before this */
bd61b366 580 char *check_at = NULL; /* check substr found at this pos */
d8080198 581 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 582 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 583 RXi_GET_DECL(prog,progi);
30944b6d 584#ifdef DEBUGGING
890ce7af 585 const char * const i_strpos = strpos;
30944b6d 586#endif
a3621e74
YO
587 GET_RE_DEBUG_FLAGS_DECL;
588
7918f24d 589 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
590 PERL_UNUSED_ARG(flags);
591 PERL_UNUSED_ARG(data);
7918f24d 592
f2ed9b32 593 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 594
3c8556c3 595 if (RX_UTF8(rx)) {
b8d68ded
JH
596 PL_reg_flags |= RF_utf8;
597 }
ab3bbdeb 598 DEBUG_EXECUTE_r(
f2ed9b32 599 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
600 sv ? "Guessing start of match in sv for"
601 : "Guessing start of match in string for");
2a782b5b 602 );
cad2e5aa 603
c344f387
JH
604 /* CHR_DIST() would be more correct here but it makes things slow. */
605 if (prog->minlen > strend - strpos) {
a3621e74 606 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 607 "String too short... [re_intuit_start]\n"));
cad2e5aa 608 goto fail;
2c2d71f5 609 }
1de06328 610
a1933d95 611 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 612 PL_regeol = strend;
f2ed9b32 613 if (utf8_target) {
33b8afdf
JH
614 if (!prog->check_utf8 && prog->check_substr)
615 to_utf8_substr(prog);
616 check = prog->check_utf8;
617 } else {
618 if (!prog->check_substr && prog->check_utf8)
619 to_byte_substr(prog);
620 check = prog->check_substr;
621 }
1de06328 622 if (check == &PL_sv_undef) {
a3621e74 623 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1de06328 624 "Non-utf8 string cannot match utf8 check string\n"));
33b8afdf
JH
625 goto fail;
626 }
bbe252da
YO
627 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
628 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
629 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 630 && !multiline ) ); /* Check after \n? */
cad2e5aa 631
7e25d62c 632 if (!ml_anch) {
bbe252da
YO
633 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
634 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 635 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
636 && sv && !SvROK(sv)
637 && (strpos != strbeg)) {
a3621e74 638 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
639 goto fail;
640 }
641 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 642 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 643 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
644 I32 slen;
645
1aa99e6b 646 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 647
653099ff
GS
648 if (SvTAIL(check)) {
649 slen = SvCUR(check); /* >= 1 */
cad2e5aa 650
9041c2e3 651 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 652 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 654 goto fail_finish;
cad2e5aa
JH
655 }
656 /* Now should match s[0..slen-2] */
657 slen--;
3f7c398e 658 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 659 || (slen > 1
3f7c398e 660 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 661 report_neq:
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
663 goto fail_finish;
664 }
cad2e5aa 665 }
3f7c398e 666 else if (*SvPVX_const(check) != *s
653099ff 667 || ((slen = SvCUR(check)) > 1
3f7c398e 668 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 669 goto report_neq;
c315bfe8 670 check_at = s;
2c2d71f5 671 goto success_at_start;
7e25d62c 672 }
cad2e5aa 673 }
2c2d71f5 674 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 675 s = strpos;
2c2d71f5 676 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
677 end_shift = prog->check_end_shift;
678
2c2d71f5 679 if (!ml_anch) {
a3b680e6 680 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 681 - (SvTAIL(check) != 0);
a3b680e6 682 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
683
684 if (end_shift < eshift)
685 end_shift = eshift;
686 }
cad2e5aa 687 }
2c2d71f5 688 else { /* Can match at random position */
cad2e5aa
JH
689 ml_anch = 0;
690 s = strpos;
1de06328
YO
691 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
692 end_shift = prog->check_end_shift;
693
694 /* end shift should be non negative here */
cad2e5aa
JH
695 }
696
bcdf7404 697#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 698 if (end_shift < 0)
1de06328 699 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 700 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
701#endif
702
2c2d71f5
JH
703 restart:
704 /* Find a possible match in the region s..strend by looking for
705 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
706
707 {
708 I32 srch_start_shift = start_shift;
709 I32 srch_end_shift = end_shift;
c33e64f0
FC
710 U8* start_point;
711 U8* end_point;
1de06328
YO
712 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
713 srch_end_shift -= ((strbeg - s) - srch_start_shift);
714 srch_start_shift = strbeg - s;
715 }
6bda09f9 716 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
717 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
718 (IV)prog->check_offset_min,
719 (IV)srch_start_shift,
720 (IV)srch_end_shift,
721 (IV)prog->check_end_shift);
722 });
723
bbe252da 724 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
725 start_point= (U8*)(s + srch_start_shift);
726 end_point= (U8*)(strend - srch_end_shift);
727 } else {
728 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
729 end_point= HOP3(strend, -srch_end_shift, strbeg);
730 }
6bda09f9 731 DEBUG_OPTIMISE_MORE_r({
56570a2c 732 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 733 (int)(end_point - start_point),
fc8cd66c 734 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
735 start_point);
736 });
737
738 s = fbm_instr( start_point, end_point,
7fba1cd6 739 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 740 }
cad2e5aa
JH
741 /* Update the count-of-usability, remove useless subpatterns,
742 unshift s. */
2c2d71f5 743
ab3bbdeb 744 DEBUG_EXECUTE_r({
f2ed9b32 745 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
746 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
747 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 748 (s ? "Found" : "Did not find"),
f2ed9b32 749 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
750 ? "anchored" : "floating"),
751 quoted,
752 RE_SV_TAIL(check),
753 (s ? " at offset " : "...\n") );
754 });
2c2d71f5
JH
755
756 if (!s)
757 goto fail_finish;
2c2d71f5 758 /* Finish the diagnostic message */
a3621e74 759 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 760
1de06328
YO
761 /* XXX dmq: first branch is for positive lookbehind...
762 Our check string is offset from the beginning of the pattern.
763 So we need to do any stclass tests offset forward from that
764 point. I think. :-(
765 */
766
767
768
769 check_at=s;
770
771
2c2d71f5
JH
772 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
773 Start with the other substr.
774 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 775 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
776 *always* match. Probably should be marked during compile...
777 Probably it is right to do no SCREAM here...
778 */
779
f2ed9b32 780 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
781 : (prog->float_substr && prog->anchored_substr))
782 {
30944b6d 783 /* Take into account the "other" substring. */
2c2d71f5
JH
784 /* XXXX May be hopelessly wrong for UTF... */
785 if (!other_last)
6eb5f6b9 786 other_last = strpos;
f2ed9b32 787 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
788 do_other_anchored:
789 {
890ce7af
AL
790 char * const last = HOP3c(s, -start_shift, strbeg);
791 char *last1, *last2;
be8e71aa 792 char * const saved_s = s;
33b8afdf 793 SV* must;
2c2d71f5 794
2c2d71f5
JH
795 t = s - prog->check_offset_max;
796 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 797 && (!utf8_target
0ce71af7 798 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 799 && t > strpos)))
6f207bd3 800 NOOP;
2c2d71f5
JH
801 else
802 t = strpos;
1aa99e6b 803 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
804 if (t < other_last) /* These positions already checked */
805 t = other_last;
1aa99e6b 806 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
807 if (last < last1)
808 last1 = last;
1de06328
YO
809 /* XXXX It is not documented what units *_offsets are in.
810 We assume bytes, but this is clearly wrong.
811 Meaning this code needs to be carefully reviewed for errors.
812 dmq.
813 */
814
2c2d71f5 815 /* On end-of-str: see comment below. */
f2ed9b32 816 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
817 if (must == &PL_sv_undef) {
818 s = (char*)NULL;
1de06328 819 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
820 }
821 else
822 s = fbm_instr(
823 (unsigned char*)t,
824 HOP3(HOP3(last1, prog->anchored_offset, strend)
825 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
826 must,
7fba1cd6 827 multiline ? FBMrf_MULTILINE : 0
33b8afdf 828 );
ab3bbdeb 829 DEBUG_EXECUTE_r({
f2ed9b32 830 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
831 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
832 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 833 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
834 quoted, RE_SV_TAIL(must));
835 });
836
837
2c2d71f5
JH
838 if (!s) {
839 if (last1 >= last2) {
a3621e74 840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
841 ", giving up...\n"));
842 goto fail_finish;
843 }
a3621e74 844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 845 ", trying floating at offset %ld...\n",
be8e71aa 846 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
847 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
848 s = HOP3c(last, 1, strend);
2c2d71f5
JH
849 goto restart;
850 }
851 else {
a3621e74 852 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 853 (long)(s - i_strpos)));
1aa99e6b
IH
854 t = HOP3c(s, -prog->anchored_offset, strbeg);
855 other_last = HOP3c(s, 1, strend);
be8e71aa 856 s = saved_s;
2c2d71f5
JH
857 if (t == strpos)
858 goto try_at_start;
2c2d71f5
JH
859 goto try_at_offset;
860 }
30944b6d 861 }
2c2d71f5
JH
862 }
863 else { /* Take into account the floating substring. */
33b8afdf 864 char *last, *last1;
be8e71aa 865 char * const saved_s = s;
33b8afdf
JH
866 SV* must;
867
868 t = HOP3c(s, -start_shift, strbeg);
869 last1 = last =
870 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
871 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
872 last = HOP3c(t, prog->float_max_offset, strend);
873 s = HOP3c(t, prog->float_min_offset, strend);
874 if (s < other_last)
875 s = other_last;
2c2d71f5 876 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 877 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
878 /* fbm_instr() takes into account exact value of end-of-str
879 if the check is SvTAIL(ed). Since false positives are OK,
880 and end-of-str is not later than strend we are OK. */
881 if (must == &PL_sv_undef) {
882 s = (char*)NULL;
1de06328 883 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
884 }
885 else
2c2d71f5 886 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
887 (unsigned char*)last + SvCUR(must)
888 - (SvTAIL(must)!=0),
7fba1cd6 889 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 890 DEBUG_EXECUTE_r({
f2ed9b32 891 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
892 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
893 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 894 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
895 quoted, RE_SV_TAIL(must));
896 });
33b8afdf
JH
897 if (!s) {
898 if (last1 == last) {
a3621e74 899 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
900 ", giving up...\n"));
901 goto fail_finish;
2c2d71f5 902 }
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 904 ", trying anchored starting at offset %ld...\n",
be8e71aa 905 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
906 other_last = last;
907 s = HOP3c(t, 1, strend);
908 goto restart;
909 }
910 else {
a3621e74 911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
912 (long)(s - i_strpos)));
913 other_last = s; /* Fix this later. --Hugo */
be8e71aa 914 s = saved_s;
33b8afdf
JH
915 if (t == strpos)
916 goto try_at_start;
917 goto try_at_offset;
918 }
2c2d71f5 919 }
cad2e5aa 920 }
2c2d71f5 921
1de06328 922
9ef43ace 923 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 924
6bda09f9 925 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
926 PerlIO_printf(Perl_debug_log,
927 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
928 (IV)prog->check_offset_min,
929 (IV)prog->check_offset_max,
930 (IV)(s-strpos),
931 (IV)(t-strpos),
932 (IV)(t-s),
933 (IV)(strend-strpos)
934 )
935 );
936
2c2d71f5 937 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 938 && (!utf8_target
9ef43ace 939 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
940 && t > strpos)))
941 {
2c2d71f5
JH
942 /* Fixed substring is found far enough so that the match
943 cannot start at strpos. */
944 try_at_offset:
cad2e5aa 945 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
946 /* Eventually fbm_*() should handle this, but often
947 anchored_offset is not 0, so this check will not be wasted. */
948 /* XXXX In the code below we prefer to look for "^" even in
949 presence of anchored substrings. And we search even
950 beyond the found float position. These pessimizations
951 are historical artefacts only. */
952 find_anchor:
2c2d71f5 953 while (t < strend - prog->minlen) {
cad2e5aa 954 if (*t == '\n') {
4ee3650e 955 if (t < check_at - prog->check_offset_min) {
f2ed9b32 956 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
957 /* Since we moved from the found position,
958 we definitely contradict the found anchored
30944b6d
IZ
959 substr. Due to the above check we do not
960 contradict "check" substr.
961 Thus we can arrive here only if check substr
962 is float. Redo checking for "other"=="fixed".
963 */
9041c2e3 964 strpos = t + 1;
a3621e74 965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 966 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
967 goto do_other_anchored;
968 }
4ee3650e
GS
969 /* We don't contradict the found floating substring. */
970 /* XXXX Why not check for STCLASS? */
cad2e5aa 971 s = t + 1;
a3621e74 972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 973 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
974 goto set_useful;
975 }
4ee3650e
GS
976 /* Position contradicts check-string */
977 /* XXXX probably better to look for check-string
978 than for "\n", so one should lower the limit for t? */
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 980 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 981 other_last = strpos = s = t + 1;
cad2e5aa
JH
982 goto restart;
983 }
984 t++;
985 }
a3621e74 986 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 987 PL_colors[0], PL_colors[1]));
2c2d71f5 988 goto fail_finish;
cad2e5aa 989 }
f5952150 990 else {
a3621e74 991 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 992 PL_colors[0], PL_colors[1]));
f5952150 993 }
cad2e5aa
JH
994 s = t;
995 set_useful:
f2ed9b32 996 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
997 }
998 else {
f5952150 999 /* The found string does not prohibit matching at strpos,
2c2d71f5 1000 - no optimization of calling REx engine can be performed,
f5952150
GS
1001 unless it was an MBOL and we are not after MBOL,
1002 or a future STCLASS check will fail this. */
2c2d71f5
JH
1003 try_at_start:
1004 /* Even in this situation we may use MBOL flag if strpos is offset
1005 wrt the start of the string. */
05b4157f 1006 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1007 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1008 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1009 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1010 {
cad2e5aa
JH
1011 t = strpos;
1012 goto find_anchor;
1013 }
a3621e74 1014 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1015 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1016 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1017 );
2c2d71f5 1018 success_at_start:
bbe252da 1019 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1020 && (utf8_target ? (
33b8afdf
JH
1021 prog->check_utf8 /* Could be deleted already */
1022 && --BmUSEFUL(prog->check_utf8) < 0
1023 && (prog->check_utf8 == prog->float_utf8)
1024 ) : (
1025 prog->check_substr /* Could be deleted already */
1026 && --BmUSEFUL(prog->check_substr) < 0
1027 && (prog->check_substr == prog->float_substr)
1028 )))
66e933ab 1029 {
cad2e5aa 1030 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1031 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1032 /* XXX Does the destruction order has to change with utf8_target? */
1033 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1034 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1035 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1036 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1037 check = NULL; /* abort */
cad2e5aa 1038 s = strpos;
486ec47a 1039 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1040 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1041 if (prog->intflags & PREGf_IMPLICIT)
1042 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1043 /* XXXX This is a remnant of the old implementation. It
1044 looks wasteful, since now INTUIT can use many
6eb5f6b9 1045 other heuristics. */
bbe252da 1046 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1047 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1048 }
1049 else
1050 s = strpos;
1051 }
1052
6eb5f6b9
JH
1053 /* Last resort... */
1054 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1055 /* trie stclasses are too expensive to use here, we are better off to
1056 leave it to regmatch itself */
f8fc2ecf 1057 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1058 /* minlen == 0 is possible if regstclass is \b or \B,
1059 and the fixed substr is ''$.
1060 Since minlen is already taken into account, s+1 is before strend;
1061 accidentally, minlen >= 1 guaranties no false positives at s + 1
1062 even for \b or \B. But (minlen? 1 : 0) below assumes that
1063 regstclass does not come from lookahead... */
1064 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1065 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1066 const U8* const str = (U8*)STRING(progi->regstclass);
1067 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1068 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1069 : 1);
1de06328
YO
1070 char * endpos;
1071 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1072 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1073 else if (prog->float_substr || prog->float_utf8)
1074 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1075 else
1076 endpos= strend;
1077
d8080198
YO
1078 if (checked_upto < s)
1079 checked_upto = s;
1080 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1081 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1082
6eb5f6b9 1083 t = s;
d8080198
YO
1084 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1085 if (s) {
1086 checked_upto = s;
1087 } else {
6eb5f6b9 1088#ifdef DEBUGGING
cbbf8932 1089 const char *what = NULL;
6eb5f6b9
JH
1090#endif
1091 if (endpos == strend) {
a3621e74 1092 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1093 "Could not match STCLASS...\n") );
1094 goto fail;
1095 }
a3621e74 1096 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1097 "This position contradicts STCLASS...\n") );
bbe252da 1098 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1099 goto fail;
d8080198
YO
1100 checked_upto = HOPBACKc(endpos, start_shift);
1101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1102 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1103 /* Contradict one of substrings */
33b8afdf 1104 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1105 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1106 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1107 hop_and_restart:
1aa99e6b 1108 s = HOP3c(t, 1, strend);
66e933ab
GS
1109 if (s + start_shift + end_shift > strend) {
1110 /* XXXX Should be taken into account earlier? */
a3621e74 1111 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1112 "Could not match STCLASS...\n") );
1113 goto fail;
1114 }
5e39e1e5
HS
1115 if (!check)
1116 goto giveup;
a3621e74 1117 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1118 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1119 what, (long)(s + start_shift - i_strpos)) );
1120 goto restart;
1121 }
66e933ab 1122 /* Have both, check_string is floating */
6eb5f6b9
JH
1123 if (t + start_shift >= check_at) /* Contradicts floating=check */
1124 goto retry_floating_check;
1125 /* Recheck anchored substring, but not floating... */
9041c2e3 1126 s = check_at;
5e39e1e5
HS
1127 if (!check)
1128 goto giveup;
a3621e74 1129 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1130 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1131 (long)(other_last - i_strpos)) );
1132 goto do_other_anchored;
1133 }
60e71179
GS
1134 /* Another way we could have checked stclass at the
1135 current position only: */
1136 if (ml_anch) {
1137 s = t = t + 1;
5e39e1e5
HS
1138 if (!check)
1139 goto giveup;
a3621e74 1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1141 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1142 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1143 goto try_at_offset;
66e933ab 1144 }
f2ed9b32 1145 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1146 goto fail;
486ec47a 1147 /* Check is floating substring. */
6eb5f6b9
JH
1148 retry_floating_check:
1149 t = check_at - start_shift;
a3621e74 1150 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1151 goto hop_and_restart;
1152 }
b7953727 1153 if (t != s) {
a3621e74 1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1155 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1156 (long)(t - i_strpos), (long)(s - i_strpos))
1157 );
1158 }
1159 else {
a3621e74 1160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1161 "Does not contradict STCLASS...\n");
1162 );
1163 }
6eb5f6b9 1164 }
5e39e1e5 1165 giveup:
a3621e74 1166 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1167 PL_colors[4], (check ? "Guessed" : "Giving up"),
1168 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1169 return s;
2c2d71f5
JH
1170
1171 fail_finish: /* Substring not found */
33b8afdf 1172 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1173 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1174 fail:
a3621e74 1175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1176 PL_colors[4], PL_colors[5]));
bd61b366 1177 return NULL;
cad2e5aa 1178}
9661b544 1179
a0a388a1
YO
1180#define DECL_TRIE_TYPE(scan) \
1181 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1182 trie_type = ((scan->flags == EXACT) \
1183 ? (utf8_target ? trie_utf8 : trie_plain) \
1184 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1185
1186#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1187uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1188 STRLEN skiplen; \
1189 switch (trie_type) { \
1190 case trie_utf8_fold: \
1191 if ( foldlen>0 ) { \
1192 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1193 foldlen -= len; \
1194 uscan += len; \
1195 len=0; \
1196 } else { \
1197 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1198 len = UTF8SKIP(uc); \
1199 skiplen = UNISKIP( uvc ); \
1200 foldlen -= skiplen; \
1201 uscan = foldbuf + skiplen; \
1202 } \
1203 break; \
1204 case trie_latin_utf8_fold: \
1205 if ( foldlen>0 ) { \
1206 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1207 foldlen -= len; \
1208 uscan += len; \
1209 len=0; \
1210 } else { \
1211 len = 1; \
1212 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1213 skiplen = UNISKIP( uvc ); \
1214 foldlen -= skiplen; \
1215 uscan = foldbuf + skiplen; \
1216 } \
1217 break; \
1218 case trie_utf8: \
1219 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1220 break; \
1221 case trie_plain: \
1222 uvc = (UV)*uc; \
1223 len = 1; \
1224 } \
1225 if (uvc < 256) { \
1226 charid = trie->charmap[ uvc ]; \
1227 } \
1228 else { \
1229 charid = 0; \
1230 if (widecharmap) { \
1231 SV** const svpp = hv_fetch(widecharmap, \
1232 (char*)&uvc, sizeof(UV), 0); \
1233 if (svpp) \
1234 charid = (U16)SvIV(*svpp); \
1235 } \
1236 } \
4cadc6a9
YO
1237} STMT_END
1238
4cadc6a9
YO
1239#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1240STMT_START { \
1241 while (s <= e) { \
1242 if ( (CoNd) \
fac1af77 1243 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1244 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1245 goto got_it; \
1246 s++; \
1247 } \
1248} STMT_END
1249
1250#define REXEC_FBC_UTF8_SCAN(CoDe) \
1251STMT_START { \
1252 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1253 CoDe \
1254 s += uskip; \
1255 } \
1256} STMT_END
1257
1258#define REXEC_FBC_SCAN(CoDe) \
1259STMT_START { \
1260 while (s < strend) { \
1261 CoDe \
1262 s++; \
1263 } \
1264} STMT_END
1265
1266#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1267REXEC_FBC_UTF8_SCAN( \
1268 if (CoNd) { \
24b23f37 1269 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1270 goto got_it; \
1271 else \
1272 tmp = doevery; \
1273 } \
1274 else \
1275 tmp = 1; \
1276)
1277
1278#define REXEC_FBC_CLASS_SCAN(CoNd) \
1279REXEC_FBC_SCAN( \
1280 if (CoNd) { \
24b23f37 1281 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1282 goto got_it; \
1283 else \
1284 tmp = doevery; \
1285 } \
1286 else \
1287 tmp = 1; \
1288)
1289
1290#define REXEC_FBC_TRYIT \
24b23f37 1291if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1292 goto got_it
1293
e1d1eefb 1294#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1295 if (utf8_target) { \
e1d1eefb
YO
1296 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1297 } \
1298 else { \
1299 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1300 }
e1d1eefb 1301
4cadc6a9 1302#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1303 if (utf8_target) { \
4cadc6a9
YO
1304 UtFpReLoAd; \
1305 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1306 } \
1307 else { \
1308 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1309 }
4cadc6a9
YO
1310
1311#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1312 PL_reg_flags |= RF_tainted; \
f2ed9b32 1313 if (utf8_target) { \
4cadc6a9
YO
1314 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1315 } \
1316 else { \
1317 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1318 }
4cadc6a9 1319
786e8c11
YO
1320#define DUMP_EXEC_POS(li,s,doutf8) \
1321 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1322
cfaf538b
KW
1323
1324#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1325 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1326 tmp = TEST_NON_UTF8(tmp); \
1327 REXEC_FBC_UTF8_SCAN( \
1328 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1329 tmp = !tmp; \
1330 IF_SUCCESS; \
1331 } \
1332 else { \
1333 IF_FAIL; \
1334 } \
1335 ); \
1336
1337#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1338 if (s == PL_bostr) { \
1339 tmp = '\n'; \
1340 } \
1341 else { \
1342 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1343 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1344 } \
1345 tmp = TeSt1_UtF8; \
1346 LOAD_UTF8_CHARCLASS_ALNUM(); \
1347 REXEC_FBC_UTF8_SCAN( \
1348 if (tmp == ! (TeSt2_UtF8)) { \
1349 tmp = !tmp; \
1350 IF_SUCCESS; \
1351 } \
1352 else { \
1353 IF_FAIL; \
1354 } \
1355 ); \
1356
63ac0dad
KW
1357/* The only difference between the BOUND and NBOUND cases is that
1358 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1359 * NBOUND. This is accomplished by passing it in either the if or else clause,
1360 * with the other one being empty */
1361#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1362 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1363
1364#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1365 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1366
1367#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1368 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1369
1370#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1371 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1372
63ac0dad
KW
1373
1374/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1375 * be passed in completely with the variable name being tested, which isn't
1376 * such a clean interface, but this is easier to read than it was before. We
1377 * are looking for the boundary (or non-boundary between a word and non-word
1378 * character. The utf8 and non-utf8 cases have the same logic, but the details
1379 * must be different. Find the "wordness" of the character just prior to this
1380 * one, and compare it with the wordness of this one. If they differ, we have
1381 * a boundary. At the beginning of the string, pretend that the previous
1382 * character was a new-line */
cfaf538b 1383#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1384 if (utf8_target) { \
cfaf538b 1385 UTF8_CODE \
63ac0dad
KW
1386 } \
1387 else { /* Not utf8 */ \
1388 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1389 tmp = TEST_NON_UTF8(tmp); \
1390 REXEC_FBC_SCAN( \
1391 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1392 tmp = !tmp; \
1393 IF_SUCCESS; \
1394 } \
1395 else { \
1396 IF_FAIL; \
1397 } \
1398 ); \
1399 } \
1400 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1401 goto got_it;
1402
786e8c11
YO
1403/* We know what class REx starts with. Try to find this position... */
1404/* if reginfo is NULL, its a dryrun */
1405/* annoyingly all the vars in this routine have different names from their counterparts
1406 in regmatch. /grrr */
1407
3c3eec57 1408STATIC char *
07be1b83 1409S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1410 const char *strend, regmatch_info *reginfo)
a687059c 1411{
27da23d5 1412 dVAR;
bbe252da 1413 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1414 char *pat_string; /* The pattern's exactish string */
1415 char *pat_end; /* ptr to end char of pat_string */
1416 re_fold_t folder; /* Function for computing non-utf8 folds */
1417 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1418 STRLEN ln;
5dab1207 1419 STRLEN lnc;
eb578fdb 1420 STRLEN uskip;
fac1af77
KW
1421 U8 c1;
1422 U8 c2;
6eb5f6b9 1423 char *e;
eb578fdb
KW
1424 I32 tmp = 1; /* Scratch variable? */
1425 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1426 UV utf8_fold_flags = 0;
f8fc2ecf 1427 RXi_GET_DECL(prog,progi);
7918f24d
NC
1428
1429 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1430
6eb5f6b9
JH
1431 /* We know what class it must start with. */
1432 switch (OP(c)) {
f56b6394 1433 case ANYOFV:
6eb5f6b9 1434 case ANYOF:
f56b6394 1435 if (utf8_target || OP(c) == ANYOFV) {
b1e3e569
KW
1436 STRLEN inclasslen = strend - s;
1437 REXEC_FBC_UTF8_CLASS_SCAN(
1438 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1439 }
1440 else {
6ef69d56 1441 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1442 }
6eb5f6b9 1443 break;
f33976b4 1444 case CANY:
4cadc6a9 1445 REXEC_FBC_SCAN(
24b23f37 1446 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1447 goto got_it;
1448 else
1449 tmp = doevery;
4cadc6a9 1450 );
f33976b4 1451 break;
2f7f8cb1
KW
1452
1453 case EXACTFA:
1454 if (UTF_PATTERN || utf8_target) {
1455 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1456 goto do_exactf_utf8;
1457 }
1458 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1459 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1460 goto do_exactf_non_utf8; /* isn't dealt with by these */
1461
6eb5f6b9 1462 case EXACTF:
62bf7766 1463 if (utf8_target) {
77a6d856
KW
1464
1465 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1466 utf8_fold_flags = 0;
fac1af77
KW
1467 goto do_exactf_utf8;
1468 }
1469 fold_array = PL_fold;
1470 folder = foldEQ;
1471 goto do_exactf_non_utf8;
1472
1473 case EXACTFL:
1474 if (UTF_PATTERN || utf8_target) {
17580e7a 1475 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1476 goto do_exactf_utf8;
1477 }
1478 fold_array = PL_fold_locale;
1479 folder = foldEQ_locale;
16d951b7
KW
1480 goto do_exactf_non_utf8;
1481
3c760661
KW
1482 case EXACTFU_SS:
1483 if (UTF_PATTERN) {
1484 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1485 }
1486 goto do_exactf_utf8;
1487
fab2782b 1488 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1489 case EXACTFU:
1490 if (UTF_PATTERN || utf8_target) {
77a6d856 1491 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1492 goto do_exactf_utf8;
1493 }
1494
1495 /* Any 'ss' in the pattern should have been replaced by regcomp,
1496 * so we don't have to worry here about this single special case
1497 * in the Latin1 range */
1498 fold_array = PL_fold_latin1;
1499 folder = foldEQ_latin1;
fac1af77
KW
1500
1501 /* FALL THROUGH */
1502
62bf7766
KW
1503 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1504 are no glitches with fold-length differences
1505 between the target string and pattern */
fac1af77
KW
1506
1507 /* The idea in the non-utf8 EXACTF* cases is to first find the
1508 * first character of the EXACTF* node and then, if necessary,
1509 * case-insensitively compare the full text of the node. c1 is the
1510 * first character. c2 is its fold. This logic will not work for
1511 * Unicode semantics and the german sharp ss, which hence should
1512 * not be compiled into a node that gets here. */
1513 pat_string = STRING(c);
1514 ln = STR_LEN(c); /* length to match in octets/bytes */
1515
8a90a8fe
KW
1516 /* We know that we have to match at least 'ln' bytes (which is the
1517 * same as characters, since not utf8). If we have to match 3
1518 * characters, and there are only 2 availabe, we know without
1519 * trying that it will fail; so don't start a match past the
1520 * required minimum number from the far end */
fac1af77
KW
1521 e = HOP3c(strend, -((I32)ln), s);
1522
1523 if (!reginfo && e < s) {
1524 e = s; /* Due to minlen logic of intuit() */
1525 }
1526
1527 c1 = *pat_string;
1528 c2 = fold_array[c1];
1529 if (c1 == c2) { /* If char and fold are the same */
1530 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1531 }
1532 else {
1533 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1534 }
1535 break;
1536
1537 do_exactf_utf8:
e067297c
KW
1538 {
1539 unsigned expansion;
1540
fac1af77
KW
1541
1542 /* If one of the operands is in utf8, we can't use the simpler
1543 * folding above, due to the fact that many different characters
1544 * can have the same fold, or portion of a fold, or different-
1545 * length fold */
1546 pat_string = STRING(c);
1547 ln = STR_LEN(c); /* length to match in octets/bytes */
1548 pat_end = pat_string + ln;
1549 lnc = (UTF_PATTERN) /* length to match in characters */
1550 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1551 : ln;
1552
e067297c
KW
1553 /* We have 'lnc' characters to match in the pattern, but because of
1554 * multi-character folding, each character in the target can match
1555 * up to 3 characters (Unicode guarantees it will never exceed
1556 * this) if it is utf8-encoded; and up to 2 if not (based on the
1557 * fact that the Latin 1 folds are already determined, and the
1558 * only multi-char fold in that range is the sharp-s folding to
1559 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1560 * string character. Adjust lnc accordingly, rounding up, so that
1561 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1562 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1563 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1564
1565 /* As in the non-UTF8 case, if we have to match 3 characters, and
1566 * only 2 are left, it's guaranteed to fail, so don't start a
1567 * match that would require us to go beyond the end of the string
1568 */
1569 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1570
1571 if (!reginfo && e < s) {
1572 e = s; /* Due to minlen logic of intuit() */
1573 }
1574
b33105db
KW
1575 /* XXX Note that we could recalculate e to stop the loop earlier,
1576 * as the worst case expansion above will rarely be met, and as we
1577 * go along we would usually find that e moves further to the left.
1578 * This would happen only after we reached the point in the loop
1579 * where if there were no expansion we should fail. Unclear if
1580 * worth the expense */
e067297c 1581
fac1af77
KW
1582 while (s <= e) {
1583 char *my_strend= (char *)strend;
1584 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1585 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1586 && (!reginfo || regtry(reginfo, &s)) )
1587 {
1588 goto got_it;
1589 }
bbdd8bad 1590 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1591 }
1592 break;
e067297c 1593 }
bbce6d69 1594 case BOUNDL:
3280af22 1595 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1596 FBC_BOUND(isALNUM_LC,
1597 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1598 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1599 break;
bbce6d69 1600 case NBOUNDL:
3280af22 1601 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1602 FBC_NBOUND(isALNUM_LC,
1603 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1604 isALNUM_LC_utf8((U8*)s));
1605 break;
1606 case BOUND:
1607 FBC_BOUND(isWORDCHAR,
1608 isALNUM_uni(tmp),
1609 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1610 break;
cfaf538b
KW
1611 case BOUNDA:
1612 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1613 isWORDCHAR_A(tmp),
1614 isWORDCHAR_A((U8*)s));
1615 break;
a0d0e21e 1616 case NBOUND:
63ac0dad
KW
1617 FBC_NBOUND(isWORDCHAR,
1618 isALNUM_uni(tmp),
1619 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1620 break;
cfaf538b
KW
1621 case NBOUNDA:
1622 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1623 isWORDCHAR_A(tmp),
1624 isWORDCHAR_A((U8*)s));
1625 break;
63ac0dad
KW
1626 case BOUNDU:
1627 FBC_BOUND(isWORDCHAR_L1,
1628 isALNUM_uni(tmp),
1629 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1630 break;
1631 case NBOUNDU:
1632 FBC_NBOUND(isWORDCHAR_L1,
1633 isALNUM_uni(tmp),
1634 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1635 break;
bbce6d69 1636 case ALNUML:
4cadc6a9
YO
1637 REXEC_FBC_CSCAN_TAINT(
1638 isALNUM_LC_utf8((U8*)s),
1639 isALNUM_LC(*s)
1640 );
6895a8aa 1641 break;
980866de
KW
1642 case ALNUMU:
1643 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1644 LOAD_UTF8_CHARCLASS_ALNUM(),
1645 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1646 isWORDCHAR_L1((U8) *s)
1647 );
6895a8aa 1648 break;
980866de
KW
1649 case ALNUM:
1650 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1651 LOAD_UTF8_CHARCLASS_ALNUM(),
1652 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1653 isWORDCHAR((U8) *s)
1654 );
6895a8aa 1655 break;
cfaf538b 1656 case ALNUMA:
8e9da4d4
KW
1657 /* Don't need to worry about utf8, as it can match only a single
1658 * byte invariant character */
cfaf538b 1659 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1660 break;
980866de
KW
1661 case NALNUMU:
1662 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1663 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1664 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1665 ! isWORDCHAR_L1((U8) *s)
1666 );
6895a8aa 1667 break;
a0d0e21e 1668 case NALNUM:
4cadc6a9 1669 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1670 LOAD_UTF8_CHARCLASS_ALNUM(),
1671 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1672 ! isALNUM(*s)
4cadc6a9 1673 );
6895a8aa 1674 break;
cfaf538b 1675 case NALNUMA:
8e9da4d4
KW
1676 REXEC_FBC_CSCAN(
1677 !isWORDCHAR_A(*s),
1678 !isWORDCHAR_A(*s)
1679 );
1680 break;
bbce6d69 1681 case NALNUML:
4cadc6a9
YO
1682 REXEC_FBC_CSCAN_TAINT(
1683 !isALNUM_LC_utf8((U8*)s),
1684 !isALNUM_LC(*s)
1685 );
6895a8aa 1686 break;
980866de
KW
1687 case SPACEU:
1688 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1689 LOAD_UTF8_CHARCLASS_SPACE(),
1690 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1691 isSPACE_L1((U8) *s)
1692 );
6895a8aa 1693 break;
a0d0e21e 1694 case SPACE:
4cadc6a9 1695 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1696 LOAD_UTF8_CHARCLASS_SPACE(),
1697 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1698 isSPACE((U8) *s)
4cadc6a9 1699 );
6895a8aa 1700 break;
cfaf538b 1701 case SPACEA:
8e9da4d4
KW
1702 /* Don't need to worry about utf8, as it can match only a single
1703 * byte invariant character */
cfaf538b 1704 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1705 break;
bbce6d69 1706 case SPACEL:
4cadc6a9 1707 REXEC_FBC_CSCAN_TAINT(
6bbba904 1708 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1709 isSPACE_LC(*s)
1710 );
6895a8aa 1711 break;
980866de
KW
1712 case NSPACEU:
1713 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1714 LOAD_UTF8_CHARCLASS_SPACE(),
1715 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1716 ! isSPACE_L1((U8) *s)
1717 );
6895a8aa 1718 break;
a0d0e21e 1719 case NSPACE:
4cadc6a9 1720 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1721 LOAD_UTF8_CHARCLASS_SPACE(),
1722 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1723 ! isSPACE((U8) *s)
4cadc6a9 1724 );
6895a8aa 1725 break;
cfaf538b 1726 case NSPACEA:
8e9da4d4
KW
1727 REXEC_FBC_CSCAN(
1728 !isSPACE_A(*s),
1729 !isSPACE_A(*s)
1730 );
1731 break;
bbce6d69 1732 case NSPACEL:
4cadc6a9 1733 REXEC_FBC_CSCAN_TAINT(
6bbba904 1734 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1735 !isSPACE_LC(*s)
1736 );
6895a8aa 1737 break;
a0d0e21e 1738 case DIGIT:
4cadc6a9 1739 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1740 LOAD_UTF8_CHARCLASS_DIGIT(),
1741 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1742 isDIGIT(*s)
1743 );
6895a8aa 1744 break;
cfaf538b 1745 case DIGITA:
8e9da4d4
KW
1746 /* Don't need to worry about utf8, as it can match only a single
1747 * byte invariant character */
cfaf538b 1748 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1749 break;
b8c5462f 1750 case DIGITL:
4cadc6a9
YO
1751 REXEC_FBC_CSCAN_TAINT(
1752 isDIGIT_LC_utf8((U8*)s),
1753 isDIGIT_LC(*s)
1754 );
6895a8aa 1755 break;
a0d0e21e 1756 case NDIGIT:
4cadc6a9 1757 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1758 LOAD_UTF8_CHARCLASS_DIGIT(),
1759 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1760 !isDIGIT(*s)
1761 );
6895a8aa 1762 break;
cfaf538b 1763 case NDIGITA:
8e9da4d4
KW
1764 REXEC_FBC_CSCAN(
1765 !isDIGIT_A(*s),
1766 !isDIGIT_A(*s)
1767 );
1768 break;
b8c5462f 1769 case NDIGITL:
4cadc6a9
YO
1770 REXEC_FBC_CSCAN_TAINT(
1771 !isDIGIT_LC_utf8((U8*)s),
1772 !isDIGIT_LC(*s)
1773 );
6895a8aa 1774 break;
e1d1eefb
YO
1775 case LNBREAK:
1776 REXEC_FBC_CSCAN(
1777 is_LNBREAK_utf8(s),
1778 is_LNBREAK_latin1(s)
1779 );
6895a8aa 1780 break;
e1d1eefb
YO
1781 case VERTWS:
1782 REXEC_FBC_CSCAN(
1783 is_VERTWS_utf8(s),
1784 is_VERTWS_latin1(s)
1785 );
6895a8aa 1786 break;
e1d1eefb
YO
1787 case NVERTWS:
1788 REXEC_FBC_CSCAN(
1789 !is_VERTWS_utf8(s),
1790 !is_VERTWS_latin1(s)
1791 );
6895a8aa 1792 break;
e1d1eefb
YO
1793 case HORIZWS:
1794 REXEC_FBC_CSCAN(
1795 is_HORIZWS_utf8(s),
1796 is_HORIZWS_latin1(s)
1797 );
6895a8aa 1798 break;
e1d1eefb
YO
1799 case NHORIZWS:
1800 REXEC_FBC_CSCAN(
1801 !is_HORIZWS_utf8(s),
1802 !is_HORIZWS_latin1(s)
1803 );
6895a8aa 1804 break;
0658cdde
KW
1805 case POSIXA:
1806 /* Don't need to worry about utf8, as it can match only a single
1807 * byte invariant character. The flag in this node type is the
1808 * class number to pass to _generic_isCC() to build a mask for
1809 * searching in PL_charclass[] */
1810 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1811 break;
1812 case NPOSIXA:
1813 REXEC_FBC_CSCAN(
1814 !_generic_isCC_A(*s, FLAGS(c)),
1815 !_generic_isCC_A(*s, FLAGS(c))
1816 );
1817 break;
1818
1de06328
YO
1819 case AHOCORASICKC:
1820 case AHOCORASICK:
07be1b83 1821 {
a0a388a1 1822 DECL_TRIE_TYPE(c);
07be1b83
YO
1823 /* what trie are we using right now */
1824 reg_ac_data *aho
f8fc2ecf 1825 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1826 reg_trie_data *trie
1827 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1828 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1829
1830 const char *last_start = strend - trie->minlen;
6148ee25 1831#ifdef DEBUGGING
07be1b83 1832 const char *real_start = s;
6148ee25 1833#endif
07be1b83 1834 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1835 SV *sv_points;
1836 U8 **points; /* map of where we were in the input string
786e8c11 1837 when reading a given char. For ASCII this
be8e71aa 1838 is unnecessary overhead as the relationship
38a44b82
NC
1839 is always 1:1, but for Unicode, especially
1840 case folded Unicode this is not true. */
f9e705e8 1841 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1842 U8 *bitmap=NULL;
1843
07be1b83
YO
1844
1845 GET_RE_DEBUG_FLAGS_DECL;
1846
be8e71aa
YO
1847 /* We can't just allocate points here. We need to wrap it in
1848 * an SV so it gets freed properly if there is a croak while
1849 * running the match */
1850 ENTER;
1851 SAVETMPS;
1852 sv_points=newSV(maxlen * sizeof(U8 *));
1853 SvCUR_set(sv_points,
1854 maxlen * sizeof(U8 *));
1855 SvPOK_on(sv_points);
1856 sv_2mortal(sv_points);
1857 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1858 if ( trie_type != trie_utf8_fold
1859 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1860 {
786e8c11
YO
1861 if (trie->bitmap)
1862 bitmap=(U8*)trie->bitmap;
1863 else
1864 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1865 }
786e8c11
YO
1866 /* this is the Aho-Corasick algorithm modified a touch
1867 to include special handling for long "unknown char"
1868 sequences. The basic idea being that we use AC as long
1869 as we are dealing with a possible matching char, when
1870 we encounter an unknown char (and we have not encountered
1871 an accepting state) we scan forward until we find a legal
1872 starting char.
1873 AC matching is basically that of trie matching, except
1874 that when we encounter a failing transition, we fall back
1875 to the current states "fail state", and try the current char
1876 again, a process we repeat until we reach the root state,
1877 state 1, or a legal transition. If we fail on the root state
1878 then we can either terminate if we have reached an accepting
1879 state previously, or restart the entire process from the beginning
1880 if we have not.
1881
1882 */
07be1b83
YO
1883 while (s <= last_start) {
1884 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885 U8 *uc = (U8*)s;
1886 U16 charid = 0;
1887 U32 base = 1;
1888 U32 state = 1;
1889 UV uvc = 0;
1890 STRLEN len = 0;
1891 STRLEN foldlen = 0;
1892 U8 *uscan = (U8*)NULL;
1893 U8 *leftmost = NULL;
786e8c11
YO
1894#ifdef DEBUGGING
1895 U32 accepted_word= 0;
1896#endif
07be1b83
YO
1897 U32 pointpos = 0;
1898
1899 while ( state && uc <= (U8*)strend ) {
1900 int failed=0;
786e8c11
YO
1901 U32 word = aho->states[ state ].wordnum;
1902
1de06328
YO
1903 if( state==1 ) {
1904 if ( bitmap ) {
1905 DEBUG_TRIE_EXECUTE_r(
1906 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1908 (char *)uc, utf8_target );
1de06328
YO
1909 PerlIO_printf( Perl_debug_log,
1910 " Scanning for legal start char...\n");
1911 }
d085b490
YO
1912 );
1913 if (utf8_target) {
1914 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915 uc += UTF8SKIP(uc);
1916 }
1917 } else {
1918 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 uc++;
1920 }
1921 }
1de06328 1922 s= (char *)uc;
786e8c11 1923 }
786e8c11
YO
1924 if (uc >(U8*)last_start) break;
1925 }
1926
1927 if ( word ) {
2e64971a 1928 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1929 if (!leftmost || lpos < leftmost) {
1930 DEBUG_r(accepted_word=word);
07be1b83 1931 leftmost= lpos;
786e8c11 1932 }
07be1b83 1933 if (base==0) break;
786e8c11 1934
07be1b83
YO
1935 }
1936 points[pointpos++ % maxlen]= uc;
55eed653
NC
1937 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1938 uscan, len, uvc, charid, foldlen,
1939 foldbuf, uniflags);
786e8c11
YO
1940 DEBUG_TRIE_EXECUTE_r({
1941 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1942 s, utf8_target );
07be1b83 1943 PerlIO_printf(Perl_debug_log,
786e8c11
YO
1944 " Charid:%3u CP:%4"UVxf" ",
1945 charid, uvc);
1946 });
07be1b83
YO
1947
1948 do {
6148ee25 1949#ifdef DEBUGGING
786e8c11 1950 word = aho->states[ state ].wordnum;
6148ee25 1951#endif
07be1b83
YO
1952 base = aho->states[ state ].trans.base;
1953
786e8c11
YO
1954 DEBUG_TRIE_EXECUTE_r({
1955 if (failed)
1956 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1957 s, utf8_target );
07be1b83 1958 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1959 "%sState: %4"UVxf", word=%"UVxf,
1960 failed ? " Fail transition to " : "",
1961 (UV)state, (UV)word);
1962 });
07be1b83
YO
1963 if ( base ) {
1964 U32 tmp;
6dd2be57 1965 I32 offset;
07be1b83 1966 if (charid &&
6dd2be57
DM
1967 ( ((offset = base + charid
1968 - 1 - trie->uniquecharcount)) >= 0)
1969 && ((U32)offset < trie->lasttrans)
1970 && trie->trans[offset].check == state
1971 && (tmp=trie->trans[offset].next))
07be1b83 1972 {
786e8c11
YO
1973 DEBUG_TRIE_EXECUTE_r(
1974 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
1975 state = tmp;
1976 break;
1977 }
1978 else {
786e8c11
YO
1979 DEBUG_TRIE_EXECUTE_r(
1980 PerlIO_printf( Perl_debug_log," - fail\n"));
1981 failed = 1;
1982 state = aho->fail[state];
07be1b83
YO
1983 }
1984 }
1985 else {
1986 /* we must be accepting here */
786e8c11
YO
1987 DEBUG_TRIE_EXECUTE_r(
1988 PerlIO_printf( Perl_debug_log," - accepting\n"));
1989 failed = 1;
07be1b83
YO
1990 break;
1991 }
1992 } while(state);
786e8c11 1993 uc += len;
07be1b83
YO
1994 if (failed) {
1995 if (leftmost)
1996 break;
786e8c11 1997 if (!state) state = 1;
07be1b83
YO
1998 }
1999 }
2000 if ( aho->states[ state ].wordnum ) {
2e64971a 2001 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2002 if (!leftmost || lpos < leftmost) {
2003 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2004 leftmost = lpos;
786e8c11 2005 }
07be1b83 2006 }
07be1b83
YO
2007 if (leftmost) {
2008 s = (char*)leftmost;
786e8c11
YO
2009 DEBUG_TRIE_EXECUTE_r({
2010 PerlIO_printf(
70685ca0
JH
2011 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2012 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2013 );
2014 });
24b23f37 2015 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2016 FREETMPS;
2017 LEAVE;
07be1b83 2018 goto got_it;
be8e71aa 2019 }
07be1b83 2020 s = HOPc(s,1);
786e8c11
YO
2021 DEBUG_TRIE_EXECUTE_r({
2022 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2023 });
07be1b83 2024 } else {
786e8c11
YO
2025 DEBUG_TRIE_EXECUTE_r(
2026 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2027 break;
2028 }
2029 }
be8e71aa
YO
2030 FREETMPS;
2031 LEAVE;
07be1b83
YO
2032 }
2033 break;
b3c9acc1 2034 default:
3c3eec57
GS
2035 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2036 break;
d6a28714 2037 }
6eb5f6b9
JH
2038 return 0;
2039 got_it:
2040 return s;
2041}
2042
fae667d5 2043
6eb5f6b9
JH
2044/*
2045 - regexec_flags - match a regexp against a string
2046 */
2047I32
288b8c02 2048Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2049 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2050/* stringarg: the point in the string at which to begin matching */
2051/* strend: pointer to null at end of string */
2052/* strbeg: real beginning of string */
2053/* minend: end of match must be >= minend bytes after stringarg. */
2054/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2055 * itself is accessed via the pointers above */
2056/* data: May be used for some additional optimizations.
2057 Currently its only used, with a U32 cast, for transmitting
2058 the ganch offset when doing a /g match. This will change */
2059/* nosave: For optimizations. */
2060
6eb5f6b9 2061{
97aff369 2062 dVAR;
288b8c02 2063 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2064 /*register*/ char *s;
eb578fdb 2065 regnode *c;
24b23f37 2066 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2067 I32 minlen; /* must match at least this many chars */
2068 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2069 I32 end_shift = 0; /* Same for the end. */ /* CC */
2070 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2071 char *scream_olds = NULL;
f2ed9b32 2072 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2073 I32 multiline;
f8fc2ecf 2074 RXi_GET_DECL(prog,progi);
3b0527fe 2075 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2076 regexp_paren_pair *swap = NULL;
a3621e74
YO
2077 GET_RE_DEBUG_FLAGS_DECL;
2078
7918f24d 2079 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2080 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2081
2082 /* Be paranoid... */
2083 if (prog == NULL || startpos == NULL) {
2084 Perl_croak(aTHX_ "NULL regexp parameter");
2085 return 0;
2086 }
2087
bbe252da 2088 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2089 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2090
f2ed9b32 2091 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2092 DEBUG_EXECUTE_r(
f2ed9b32 2093 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2094 "Matching");
2095 );
bac06658 2096
6eb5f6b9 2097 minlen = prog->minlen;
1de06328
YO
2098
2099 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2101 "String too short [regexec_flags]...\n"));
2102 goto phooey;
1aa99e6b 2103 }
6eb5f6b9 2104
1de06328 2105
6eb5f6b9 2106 /* Check validity of program. */
f8fc2ecf 2107 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2108 Perl_croak(aTHX_ "corrupted regexp program");
2109 }
2110
2111 PL_reg_flags = 0;
ed301438 2112 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2113 PL_reg_maxiter = 0;
2114
3c8556c3 2115 if (RX_UTF8(rx))
6eb5f6b9
JH
2116 PL_reg_flags |= RF_utf8;
2117
2118 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2119 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2120 PL_bostr = strbeg;
3b0527fe 2121 reginfo.sv = sv;
6eb5f6b9
JH
2122
2123 /* Mark end of line for $ (and such) */
2124 PL_regeol = strend;
2125
2126 /* see how far we have to get to not match where we matched before */
3b0527fe 2127 reginfo.till = startpos+minend;
6eb5f6b9 2128
6eb5f6b9
JH
2129 /* If there is a "must appear" string, look for it. */
2130 s = startpos;
2131
bbe252da 2132 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2133 MAGIC *mg;
2c296965 2134 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2135 reginfo.ganch = startpos + prog->gofs;
2c296965 2136 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2137 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2138 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2139 && SvMAGIC(sv)
14befaf4
DM
2140 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2141 && mg->mg_len >= 0) {
3b0527fe 2142 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2143 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2144 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2145
bbe252da 2146 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2147 if (s > reginfo.ganch)
6eb5f6b9 2148 goto phooey;
58e23c8d 2149 s = reginfo.ganch - prog->gofs;
2c296965 2150 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2151 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2152 if (s < strbeg)
2153 goto phooey;
6eb5f6b9
JH
2154 }
2155 }
58e23c8d 2156 else if (data) {
70685ca0 2157 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2160
2161 } else { /* pos() not defined */
3b0527fe 2162 reginfo.ganch = strbeg;
2c296965
YO
2163 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2164 "GPOS: reginfo.ganch = strbeg\n"));
2165 }
6eb5f6b9 2166 }
288b8c02 2167 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2168 /* We have to be careful. If the previous successful match
2169 was from this regex we don't want a subsequent partially
2170 successful match to clobber the old results.
2171 So when we detect this possibility we add a swap buffer
2172 to the re, and switch the buffer each match. If we fail
2173 we switch it back, otherwise we leave it swapped.
2174 */
2175 swap = prog->offs;
2176 /* do we need a save destructor here for eval dies? */
2177 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2178 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2179 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2180 PTR2UV(prog),
2181 PTR2UV(swap),
2182 PTR2UV(prog->offs)
2183 ));
c74340f9 2184 }
a0714e2c 2185 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2186 re_scream_pos_data d;
2187
2188 d.scream_olds = &scream_olds;
2189 d.scream_pos = &scream_pos;
288b8c02 2190 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2191 if (!s) {
a3621e74 2192 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2193 goto phooey; /* not present */
3fa9c3d7 2194 }
6eb5f6b9
JH
2195 }
2196
1de06328 2197
6eb5f6b9
JH
2198
2199 /* Simplest case: anchored match need be tried only once. */
2200 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2201 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2202 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2203 goto got_it;
bbe252da
YO
2204 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2205 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2206 {
2207 char *end;
2208
2209 if (minlen)
2210 dontbother = minlen - 1;
1aa99e6b 2211 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2212 /* for multiline we only have to try after newlines */
33b8afdf 2213 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2214 /* because of the goto we can not easily reuse the macros for bifurcating the
2215 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2216 if (utf8_target) {
2217 if (s == startpos)
2218 goto after_try_utf8;
2219 while (1) {
2220 if (regtry(&reginfo, &s)) {
2221 goto got_it;
2222 }
2223 after_try_utf8:
2224 if (s > end) {
2225 goto phooey;
2226 }
2227 if (prog->extflags & RXf_USE_INTUIT) {
2228 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2229 if (!s) {
2230 goto phooey;
2231 }
2232 }
2233 else {
2234 s += UTF8SKIP(s);
2235 }
2236 }
2237 } /* end search for check string in unicode */
2238 else {
2239 if (s == startpos) {
2240 goto after_try_latin;
2241 }
2242 while (1) {
2243 if (regtry(&reginfo, &s)) {
2244 goto got_it;
2245 }
2246 after_try_latin:
2247 if (s > end) {
2248 goto phooey;
2249 }
2250 if (prog->extflags & RXf_USE_INTUIT) {
2251 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2252 if (!s) {
2253 goto phooey;
2254 }
2255 }
2256 else {
2257 s++;
2258 }
2259 }
2260 } /* end search for check string in latin*/
2261 } /* end search for check string */
2262 else { /* search for newline */
2263 if (s > startpos) {
2264 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2265 s--;
92f3d482 2266 }
21eede78
YO
2267 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2268 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2269 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2270 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2271 goto got_it;
2272 }
92f3d482
YO
2273 }
2274 } /* end search for newline */
2275 } /* end anchored/multiline check string search */
6eb5f6b9 2276 goto phooey;
bbe252da 2277 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2278 {
486ec47a 2279 /* the warning about reginfo.ganch being used without initialization
bbe252da 2280 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2281 and we only enter this block when the same bit is set. */
58e23c8d 2282 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2283
2284 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2285 goto got_it;
2286 goto phooey;
2287 }
2288
2289 /* Messy cases: unanchored match. */
bbe252da 2290 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2291 /* we have /x+whatever/ */
f2ed9b32 2292 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2293 char ch;
bf93d4cc
GS
2294#ifdef DEBUGGING
2295 int did_match = 0;
2296#endif
f2ed9b32
KW
2297 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2298 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2299 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 2300
f2ed9b32 2301 if (utf8_target) {
4cadc6a9 2302 REXEC_FBC_SCAN(
6eb5f6b9 2303 if (*s == ch) {
a3621e74 2304 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2305 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2306 s += UTF8SKIP(s);
2307 while (s < strend && *s == ch)
2308 s += UTF8SKIP(s);
2309 }
4cadc6a9 2310 );
6eb5f6b9
JH
2311 }
2312 else {
4cadc6a9 2313 REXEC_FBC_SCAN(
6eb5f6b9 2314 if (*s == ch) {
a3621e74 2315 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2316 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2317 s++;
2318 while (s < strend && *s == ch)
2319 s++;
2320 }
4cadc6a9 2321 );
6eb5f6b9 2322 }
a3621e74 2323 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2324 PerlIO_printf(Perl_debug_log,
b7953727
JH
2325 "Did not find anchored character...\n")
2326 );
6eb5f6b9 2327 }
a0714e2c
SS
2328 else if (prog->anchored_substr != NULL
2329 || prog->anchored_utf8 != NULL
2330 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2331 && prog->float_max_offset < strend - s)) {
2332 SV *must;
2333 I32 back_max;
2334 I32 back_min;
2335 char *last;
6eb5f6b9 2336 char *last1; /* Last position checked before */
bf93d4cc
GS
2337#ifdef DEBUGGING
2338 int did_match = 0;
2339#endif
33b8afdf 2340 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32
KW
2341 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2342 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2343 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
2344 back_max = back_min = prog->anchored_offset;
2345 } else {
f2ed9b32
KW
2346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2347 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2348 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
2349 back_max = prog->float_max_offset;
2350 back_min = prog->float_min_offset;
2351 }
1de06328
YO
2352
2353
33b8afdf
JH
2354 if (must == &PL_sv_undef)
2355 /* could not downgrade utf8 check substring, so must fail */
2356 goto phooey;
2357
1de06328
YO
2358 if (back_min<0) {
2359 last = strend;
2360 } else {
2361 last = HOP3c(strend, /* Cannot start after this */
2362 -(I32)(CHR_SVLEN(must)
2363 - (SvTAIL(must) != 0) + back_min), strbeg);
2364 }
6eb5f6b9
JH
2365 if (s > PL_bostr)
2366 last1 = HOPc(s, -1);
2367 else
2368 last1 = s - 1; /* bogus */
2369
a0288114 2370 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2371 check_substr==must. */
2372 scream_pos = -1;
2373 dontbother = end_shift;
2374 strend = HOPc(strend, -dontbother);
2375 while ( (s <= last) &&
c33e64f0 2376 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2377 (unsigned char*)strend, must,
c33e64f0 2378 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2379 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2380 if (HOPc(s, -back_max) > last1) {
2381 last1 = HOPc(s, -back_min);
2382 s = HOPc(s, -back_max);
2383 }
2384 else {
52657f30 2385 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2386
2387 last1 = HOPc(s, -back_min);
52657f30 2388 s = t;
6eb5f6b9 2389 }
f2ed9b32 2390 if (utf8_target) {
6eb5f6b9 2391 while (s <= last1) {
24b23f37 2392 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2393 goto got_it;
2394 s += UTF8SKIP(s);
2395 }
2396 }
2397 else {
2398 while (s <= last1) {
24b23f37 2399 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2400 goto got_it;
2401 s++;
2402 }
2403 }
2404 }
ab3bbdeb 2405 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2406 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2407 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2408 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2409 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2410 ? "anchored" : "floating"),
ab3bbdeb
YO
2411 quoted, RE_SV_TAIL(must));
2412 });
6eb5f6b9
JH
2413 goto phooey;
2414 }
f8fc2ecf 2415 else if ( (c = progi->regstclass) ) {
f14c76ed 2416 if (minlen) {
f8fc2ecf 2417 const OPCODE op = OP(progi->regstclass);
66e933ab 2418 /* don't bother with what can't match */
786e8c11 2419 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2420 strend = HOPc(strend, -(minlen - 1));
2421 }
a3621e74 2422 DEBUG_EXECUTE_r({
be8e71aa 2423 SV * const prop = sv_newmortal();
32fc9b6a 2424 regprop(prog, prop, c);
0df25f3d 2425 {
f2ed9b32 2426 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2427 s,strend-s,60);
0df25f3d 2428 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2429 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2430 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2431 quoted, (int)(strend - s));
0df25f3d 2432 }
ffc61ed2 2433 });
3b0527fe 2434 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2435 goto got_it;
07be1b83 2436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2437 }
2438 else {
2439 dontbother = 0;
a0714e2c 2440 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2441 /* Trim the end. */
6af40bd7 2442 char *last= NULL;
33b8afdf 2443 SV* float_real;
c33e64f0
FC
2444 STRLEN len;
2445 const char *little;
33b8afdf 2446
f2ed9b32
KW
2447 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2448 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2449 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
d6a28714 2450
c33e64f0
FC
2451 little = SvPV_const(float_real, len);
2452 if (SvTAIL(float_real)) {
1a13b075
YO
2453 /* This means that float_real contains an artificial \n on the end
2454 * due to the presence of something like this: /foo$/
2455 * where we can match both "foo" and "foo\n" at the end of the string.
2456 * So we have to compare the end of the string first against the float_real
2457 * without the \n and then against the full float_real with the string.
2458 * We have to watch out for cases where the string might be smaller
2459 * than the float_real or the float_real without the \n.
2460 */
2461 char *checkpos= strend - len;
2462 DEBUG_OPTIMISE_r(
2463 PerlIO_printf(Perl_debug_log,
2464 "%sChecking for float_real.%s\n",
2465 PL_colors[4], PL_colors[5]));
2466 if (checkpos + 1 < strbeg) {
2467 /* can't match, even if we remove the trailing \n string is too short to match */
2468 DEBUG_EXECUTE_r(
2469 PerlIO_printf(Perl_debug_log,
2470 "%sString shorter than required trailing substring, cannot match.%s\n",
2471 PL_colors[4], PL_colors[5]));
2472 goto phooey;
2473 } else if (memEQ(checkpos + 1, little, len - 1)) {
2474 /* can match, the end of the string matches without the "\n" */
2475 last = checkpos + 1;
2476 } else if (checkpos < strbeg) {
2477 /* cant match, string is too short when the "\n" is included */
2478 DEBUG_EXECUTE_r(
2479 PerlIO_printf(Perl_debug_log,
2480 "%sString does not contain required trailing substring, cannot match.%s\n",
2481 PL_colors[4], PL_colors[5]));
2482 goto phooey;
2483 } else if (!multiline) {
2484 /* non multiline match, so compare with the "\n" at the end of the string */
2485 if (memEQ(checkpos, little, len)) {
2486 last= checkpos;
2487 } else {
2488 DEBUG_EXECUTE_r(
2489 PerlIO_printf(Perl_debug_log,
2490 "%sString does not contain required trailing substring, cannot match.%s\n",
2491 PL_colors[4], PL_colors[5]));
2492 goto phooey;
2493 }
2494 } else {
2495 /* multiline match, so we have to search for a place where the full string is located */
d6a28714 2496 goto find_last;
1a13b075 2497 }
c33e64f0 2498 } else {
d6a28714 2499 find_last:
9041c2e3 2500 if (len)
d6a28714 2501 last = rninstr(s, strend, little, little + len);
b8c5462f 2502 else
a0288114 2503 last = strend; /* matching "$" */
b8c5462f 2504 }
6af40bd7
YO
2505 if (!last) {
2506 /* at one point this block contained a comment which was probably
2507 * incorrect, which said that this was a "should not happen" case.
2508 * Even if it was true when it was written I am pretty sure it is
2509 * not anymore, so I have removed the comment and replaced it with
2510 * this one. Yves */
6bda09f9
YO
2511 DEBUG_EXECUTE_r(
2512 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2513 "String does not contain required substring, cannot match.\n"
2514 ));
2515 goto phooey;
bf93d4cc 2516 }
d6a28714
JH
2517 dontbother = strend - last + prog->float_min_offset;
2518 }
2519 if (minlen && (dontbother < minlen))
2520 dontbother = minlen - 1;
2521 strend -= dontbother; /* this one's always in bytes! */
2522 /* We don't know much -- general case. */
f2ed9b32 2523 if (utf8_target) {
d6a28714 2524 for (;;) {
24b23f37 2525 if (regtry(&reginfo, &s))
d6a28714
JH
2526 goto got_it;
2527 if (s >= strend)
2528 break;
b8c5462f 2529 s += UTF8SKIP(s);
d6a28714
JH
2530 };
2531 }
2532 else {
2533 do {
24b23f37 2534 if (regtry(&reginfo, &s))
d6a28714
JH
2535 goto got_it;
2536 } while (s++ < strend);
2537 }
2538 }
2539
2540 /* Failure. */
2541 goto phooey;
2542
2543got_it:
495f47a5
DM
2544 DEBUG_BUFFERS_r(
2545 if (swap)
2546 PerlIO_printf(Perl_debug_log,
2547 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2548 PTR2UV(prog),
2549 PTR2UV(swap)
2550 );
2551 );
e9105d30 2552 Safefree(swap);
288b8c02 2553 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2554
ed301438 2555 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2556 restore_pos(aTHX_ prog);
5daac39c
NC
2557 if (RXp_PAREN_NAMES(prog))
2558 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2559
2560 /* make sure $`, $&, $', and $digit will work later */
2561 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2562 if (flags & REXEC_COPY_STR) {
f8c7b90f 2563#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2564 if ((SvIsCOW(sv)
2565 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2566 if (DEBUG_C_TEST) {
2567 PerlIO_printf(Perl_debug_log,
2568 "Copy on write: regexp capture, type %d\n",
2569 (int) SvTYPE(sv));
2570 }
77f8f7c1 2571 RX_MATCH_COPY_FREE(rx);
ed252734 2572 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2573 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2574 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2575 prog->sublen = PL_regeol - strbeg;
2576 prog->suboffset = 0;
2577 prog->subcoffset = 0;
ed252734
NC
2578 } else
2579#endif
2580 {
6502e081
DM
2581 I32 min = 0;
2582 I32 max = PL_regeol - strbeg;
2583 I32 sublen;
2584
2585 if ( (flags & REXEC_COPY_SKIP_POST)
2586 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2587 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2588 ) { /* don't copy $' part of string */
3de645a8 2589 U32 n = 0;
6502e081
DM
2590 max = -1;
2591 /* calculate the right-most part of the string covered
2592 * by a capture. Due to look-ahead, this may be to
2593 * the right of $&, so we have to scan all captures */
2594 while (n <= prog->lastparen) {
2595 if (prog->offs[n].end > max)
2596 max = prog->offs[n].end;
2597 n++;
2598 }
2599 if (max == -1)
2600 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2601 ? prog->offs[0].start
2602 : 0;
2603 assert(max >= 0 && max <= PL_regeol - strbeg);
2604 }
2605
2606 if ( (flags & REXEC_COPY_SKIP_PRE)
2607 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2608 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2609 ) { /* don't copy $` part of string */
3de645a8 2610 U32 n = 0;
6502e081
DM
2611 min = max;
2612 /* calculate the left-most part of the string covered
2613 * by a capture. Due to look-behind, this may be to
2614 * the left of $&, so we have to scan all captures */
2615 while (min && n <= prog->lastparen) {
2616 if ( prog->offs[n].start != -1
2617 && prog->offs[n].start < min)
2618 {
2619 min = prog->offs[n].start;
2620 }
2621 n++;
2622 }
2623 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2624 && min > prog->offs[0].end
2625 )
2626 min = prog->offs[0].end;
2627
2628 }
2629
2630 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2631 sublen = max - min;
2632
2633 if (RX_MATCH_COPIED(rx)) {
2634 if (sublen > prog->sublen)
2635 prog->subbeg =
2636 (char*)saferealloc(prog->subbeg, sublen+1);
2637 }
2638 else
2639 prog->subbeg = (char*)safemalloc(sublen+1);
2640 Copy(strbeg + min, prog->subbeg, sublen, char);
2641 prog->subbeg[sublen] = '\0';
2642 prog->suboffset = min;
2643 prog->sublen = sublen;
77f8f7c1 2644 RX_MATCH_COPIED_on(rx);
6502e081 2645 }
6502e081
DM
2646 prog->subcoffset = prog->suboffset;
2647 if (prog->suboffset && utf8_target) {
2648 /* Convert byte offset to chars.
2649 * XXX ideally should only compute this if @-/@+
2650 * has been seen, a la PL_sawampersand ??? */
2651
2652 /* If there's a direct correspondence between the
2653 * string which we're matching and the original SV,
2654 * then we can use the utf8 len cache associated with
2655 * the SV. In particular, it means that under //g,
2656 * sv_pos_b2u() will use the previously cached
2657 * position to speed up working out the new length of
2658 * subcoffset, rather than counting from the start of
2659 * the string each time. This stops
2660 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2661 * from going quadratic */
2662 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2663 sv_pos_b2u(sv, &(prog->subcoffset));
2664 else
2665 prog->subcoffset = utf8_length((U8*)strbeg,
2666 (U8*)(strbeg+prog->suboffset));
2667 }
d6a28714
JH
2668 }
2669 else {
6502e081 2670 RX_MATCH_COPY_FREE(rx);
d6a28714 2671 prog->subbeg = strbeg;
6502e081
DM
2672 prog->suboffset = 0;
2673 prog->subcoffset = 0;
d6a28714
JH
2674 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2675 }
2676 }
9041c2e3 2677
d6a28714
JH
2678 return 1;
2679
2680phooey:
a3621e74 2681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2682 PL_colors[4], PL_colors[5]));
ed301438 2683 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2684 restore_pos(aTHX_ prog);
e9105d30 2685 if (swap) {
c74340f9 2686 /* we failed :-( roll it back */
495f47a5
DM
2687 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2688 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2689 PTR2UV(prog),
2690 PTR2UV(prog->offs),
2691 PTR2UV(swap)
2692 ));
e9105d30
GG
2693 Safefree(prog->offs);
2694 prog->offs = swap;
2695 }
2696
d6a28714
JH
2697 return 0;
2698}
2699
6bda09f9 2700
ec43f78b
DM
2701/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2702 * Do inc before dec, in case old and new rex are the same */
2703#define SET_reg_curpm(Re2) \
2704 if (PL_reg_state.re_state_eval_setup_done) { \
2705 (void)ReREFCNT_inc(Re2); \
2706 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2707 PM_SETRE((PL_reg_curpm), (Re2)); \
2708 }
2709
2710
d6a28714
JH
2711/*
2712 - regtry - try match at specific point
2713 */
2714STATIC I32 /* 0 failure, 1 success */
f73aaa43 2715S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2716{
97aff369 2717 dVAR;
d6a28714 2718 CHECKPOINT lastcp;
288b8c02
NC
2719 REGEXP *const rx = reginfo->prog;
2720 regexp *const prog = (struct regexp *)SvANY(rx);
f73aaa43 2721 I32 result;
f8fc2ecf 2722 RXi_GET_DECL(prog,progi);
a3621e74 2723 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2724
2725 PERL_ARGS_ASSERT_REGTRY;
2726
24b23f37 2727 reginfo->cutpoint=NULL;
d6a28714 2728
ed301438
DM
2729 if ((prog->extflags & RXf_EVAL_SEEN)
2730 && !PL_reg_state.re_state_eval_setup_done)
2731 {
d6a28714
JH
2732 MAGIC *mg;
2733
ed301438 2734 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2735 if (reginfo->sv) {
d6a28714 2736 /* Make $_ available to executed code. */
3b0527fe 2737 if (reginfo->sv != DEFSV) {
59f00321 2738 SAVE_DEFSV;
414bf5ae 2739 DEFSV_set(reginfo->sv);
b8c5462f 2740 }
d6a28714 2741
3b0527fe
DM
2742 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2743 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2744 /* prepare for quick setting of pos */
d300d9fa 2745#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2746 if (SvIsCOW(reginfo->sv))
2747 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2748#endif
3dab1dad 2749 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2750 &PL_vtbl_mglob, NULL, 0);
d6a28714 2751 mg->mg_len = -1;
b8c5462f 2752 }
d6a28714
JH
2753 PL_reg_magic = mg;
2754 PL_reg_oldpos = mg->mg_len;
4f639d21 2755 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2756 }
09687e5a 2757 if (!PL_reg_curpm) {
a02a5408 2758 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2759#ifdef USE_ITHREADS
2760 {
14a49a24 2761 SV* const repointer = &PL_sv_undef;
92313705
NC
2762 /* this regexp is also owned by the new PL_reg_curpm, which
2763 will try to free it. */
d2ece331 2764 av_push(PL_regex_padav, repointer);
09687e5a
AB
2765 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2766 PL_regex_pad = AvARRAY(PL_regex_padav);
2767 }
2768#endif
2769 }
ec43f78b 2770 SET_reg_curpm(rx);
d6a28714
JH
2771 PL_reg_oldcurpm = PL_curpm;
2772 PL_curpm = PL_reg_curpm;
07bc277f 2773 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2774 /* Here is a serious problem: we cannot rewrite subbeg,
2775 since it may be needed if this match fails. Thus
2776 $` inside (?{}) could fail... */
2777 PL_reg_oldsaved = prog->subbeg;
2778 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2779 PL_reg_oldsavedoffset = prog->suboffset;
2780 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2781#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2782 PL_nrs = prog->saved_copy;
2783#endif
07bc277f 2784 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2785 }
2786 else
bd61b366 2787 PL_reg_oldsaved = NULL;
d6a28714 2788 prog->subbeg = PL_bostr;
6502e081
DM
2789 prog->suboffset = 0;
2790 prog->subcoffset = 0;
d6a28714
JH
2791 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2792 }
97ca13b7 2793#ifdef DEBUGGING
f73aaa43 2794 PL_reg_starttry = *startposp;
97ca13b7 2795#endif
f73aaa43 2796 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2797 prog->lastparen = 0;
03994de8 2798 prog->lastcloseparen = 0;
d6a28714 2799 PL_regsize = 0;
d6a28714
JH
2800
2801 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2802 to do this again and again, prog->lastparen should take care of
3dd2943c 2803 this! --ilya*/
dafc8851
JH
2804
2805 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2806 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2807 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2808 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2809 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2810 * Meanwhile, this code *is* needed for the
daf18116
JH
2811 * above-mentioned test suite tests to succeed. The common theme
2812 * on those tests seems to be returning null fields from matches.
225593e1 2813 * --jhi updated by dapm */
dafc8851 2814#if 1
d6a28714 2815 if (prog->nparens) {
b93070ed 2816 regexp_paren_pair *pp = prog->offs;
eb578fdb 2817 I32 i;
b93070ed 2818 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2819 ++pp;
2820 pp->start = -1;
2821 pp->end = -1;
d6a28714
JH
2822 }
2823 }
dafc8851 2824#endif
02db2b7b 2825 REGCP_SET(lastcp);
f73aaa43
DM
2826 result = regmatch(reginfo, *startposp, progi->program + 1);
2827 if (result != -1) {
2828 prog->offs[0].end = result;
d6a28714
JH
2829 return 1;
2830 }
24b23f37 2831 if (reginfo->cutpoint)
f73aaa43 2832 *startposp= reginfo->cutpoint;
02db2b7b 2833 REGCP_UNWIND(lastcp);
d6a28714
JH
2834 return 0;
2835}
2836
02db2b7b 2837
8ba1375e
MJD
2838#define sayYES goto yes
2839#define sayNO goto no
262b90c4 2840#define sayNO_SILENT goto no_silent
8ba1375e 2841
f9f4320a
YO
2842/* we dont use STMT_START/END here because it leads to
2843 "unreachable code" warnings, which are bogus, but distracting. */
2844#define CACHEsayNO \
c476f425
DM
2845 if (ST.cache_mask) \
2846 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2847 sayNO
3298f257 2848
a3621e74 2849/* this is used to determine how far from the left messages like
265c4333
YO
2850 'failed...' are printed. It should be set such that messages
2851 are inline with the regop output that created them.
a3621e74 2852*/
265c4333 2853#define REPORT_CODE_OFF 32
a3621e74
YO
2854
2855
40a82448
DM
2856#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2857#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2858
86545054
DM
2859#define SLAB_FIRST(s) (&(s)->states[0])
2860#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2861
5d9a96ca
DM
2862/* grab a new slab and return the first slot in it */
2863
2864STATIC regmatch_state *
2865S_push_slab(pTHX)
2866{
a35a87e7 2867#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2868 dMY_CXT;
2869#endif
5d9a96ca
DM
2870 regmatch_slab *s = PL_regmatch_slab->next;
2871 if (!s) {
2872 Newx(s, 1, regmatch_slab);
2873 s->prev = PL_regmatch_slab;
2874 s->next = NULL;
2875 PL_regmatch_slab->next = s;
2876 }
2877 PL_regmatch_slab = s;
86545054 2878 return SLAB_FIRST(s);
5d9a96ca 2879}
5b47454d 2880
95b24440 2881
40a82448
DM
2882/* push a new state then goto it */
2883
4d5016e5
DM
2884#define PUSH_STATE_GOTO(state, node, input) \
2885 pushinput = input; \
40a82448
DM
2886 scan = node; \
2887 st->resume_state = state; \
2888 goto push_state;
2889
2890/* push a new state with success backtracking, then goto it */
2891
4d5016e5
DM
2892#define PUSH_YES_STATE_GOTO(state, node, input) \
2893 pushinput = input; \
40a82448
DM
2894 scan = node; \
2895 st->resume_state = state; \
2896 goto push_yes_state;
2897
aa283a38 2898
aa283a38 2899
4d5016e5 2900
d6a28714 2901/*
95b24440 2902
bf1f174e
DM
2903regmatch() - main matching routine
2904
2905This is basically one big switch statement in a loop. We execute an op,
2906set 'next' to point the next op, and continue. If we come to a point which
2907we may need to backtrack to on failure such as (A|B|C), we push a
2908backtrack state onto the backtrack stack. On failure, we pop the top
2909state, and re-enter the loop at the state indicated. If there are no more
2910states to pop, we return failure.
2911
2912Sometimes we also need to backtrack on success; for example /A+/, where
2913after successfully matching one A, we need to go back and try to
2914match another one; similarly for lookahead assertions: if the assertion
2915completes successfully, we backtrack to the state just before the assertion
2916and then carry on. In these cases, the pushed state is marked as
2917'backtrack on success too'. This marking is in fact done by a chain of
2918pointers, each pointing to the previous 'yes' state. On success, we pop to
2919the nearest yes state, discarding any intermediate failure-only states.
2920Sometimes a yes state is pushed just to force some cleanup code to be
2921called at the end of a successful match or submatch; e.g. (??{$re}) uses
2922it to free the inner regex.
2923
2924Note that failure backtracking rewinds the cursor position, while
2925success backtracking leaves it alone.
2926
2927A pattern is complete when the END op is executed, while a subpattern
2928such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2929ops trigger the "pop to last yes state if any, otherwise return true"
2930behaviour.
2931
2932A common convention in this function is to use A and B to refer to the two
2933subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2934the subpattern to be matched possibly multiple times, while B is the entire
2935rest of the pattern. Variable and state names reflect this convention.
2936
2937The states in the main switch are the union of ops and failure/success of
2938substates associated with with that op. For example, IFMATCH is the op
2939that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2940'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2941successfully matched A and IFMATCH_A_fail is a state saying that we have
2942just failed to match A. Resume states always come in pairs. The backtrack
2943state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2944at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2945on success or failure.
2946
2947The struct that holds a backtracking state is actually a big union, with
2948one variant for each major type of op. The variable st points to the
2949top-most backtrack struct. To make the code clearer, within each
2950block of code we #define ST to alias the relevant union.
2951
2952Here's a concrete example of a (vastly oversimplified) IFMATCH
2953implementation:
2954
2955 switch (state) {
2956 ....
2957
2958#define ST st->u.ifmatch
2959
2960 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2961 ST.foo = ...; // some state we wish to save
95b24440 2962 ...
bf1f174e
DM
2963 // push a yes backtrack state with a resume value of
2964 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2965 // first node of A:
4d5016e5 2966 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
2967 // NOTREACHED
2968
2969 case IFMATCH_A: // we have successfully executed A; now continue with B
2970 next = B;
2971 bar = ST.foo; // do something with the preserved value
2972 break;
2973
2974 case IFMATCH_A_fail: // A failed, so the assertion failed
2975 ...; // do some housekeeping, then ...
2976 sayNO; // propagate the failure
2977
2978#undef ST
95b24440 2979
bf1f174e
DM
2980 ...
2981 }
95b24440 2982
bf1f174e
DM
2983For any old-timers reading this who are familiar with the old recursive
2984approach, the code above is equivalent to:
95b24440 2985
bf1f174e
DM
2986 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2987 {
2988 int foo = ...
95b24440 2989 ...
bf1f174e
DM
2990 if (regmatch(A)) {
2991 next = B;
2992 bar = foo;
2993 break;
95b24440 2994 }
bf1f174e
DM
2995 ...; // do some housekeeping, then ...
2996 sayNO; // propagate the failure
95b24440 2997 }
bf1f174e
DM
2998
2999The topmost backtrack state, pointed to by st, is usually free. If you
3000want to claim it, populate any ST.foo fields in it with values you wish to
3001save, then do one of
3002
4d5016e5
DM
3003 PUSH_STATE_GOTO(resume_state, node, newinput);
3004 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3005
3006which sets that backtrack state's resume value to 'resume_state', pushes a
3007new free entry to the top of the backtrack stack, then goes to 'node'.
3008On backtracking, the free slot is popped, and the saved state becomes the
3009new free state. An ST.foo field in this new top state can be temporarily
3010accessed to retrieve values, but once the main loop is re-entered, it
3011becomes available for reuse.
3012
3013Note that the depth of the backtrack stack constantly increases during the
3014left-to-right execution of the pattern, rather than going up and down with
3015the pattern nesting. For example the stack is at its maximum at Z at the
3016end of the pattern, rather than at X in the following:
3017
3018 /(((X)+)+)+....(Y)+....Z/
3019
3020The only exceptions to this are lookahead/behind assertions and the cut,
3021(?>A), which pop all the backtrack states associated with A before
3022continuing.
3023
486ec47a 3024Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3025PL_regmatch_state and st always point to the currently active state,
3026and PL_regmatch_slab points to the slab currently containing
3027PL_regmatch_state. The first time regmatch() is called, the first slab is
3028allocated, and is never freed until interpreter destruction. When the slab
3029is full, a new one is allocated and chained to the end. At exit from
3030regmatch(), slabs allocated since entry are freed.
3031
3032*/
95b24440 3033
40a82448 3034
5bc10b2c 3035#define DEBUG_STATE_pp(pp) \
265c4333 3036 DEBUG_STATE_r({ \
f2ed9b32 3037 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3038 PerlIO_printf(Perl_debug_log, \
5d458dd8 3039 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3040 depth*2, "", \
13d6edb4 3041 PL_reg_name[st->resume_state], \
5d458dd8
YO
3042 ((st==yes_state||st==mark_state) ? "[" : ""), \
3043 ((st==yes_state) ? "Y" : ""), \
3044 ((st==mark_state) ? "M" : ""), \
3045 ((st==yes_state||st==mark_state) ? "]" : "") \
3046 ); \
265c4333 3047 });
5bc10b2c 3048
40a82448 3049
3dab1dad 3050#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3051
3df15adc 3052#ifdef DEBUGGING
5bc10b2c 3053
ab3bbdeb 3054STATIC void
f2ed9b32 3055S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3056 const char *start, const char *end, const char *blurb)
3057{
efd26800 3058 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3059
3060 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3061
ab3bbdeb
YO
3062 if (!PL_colorset)
3063 reginitcolors();
3064 {
3065 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3066 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3067
f2ed9b32 3068 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3069 start, end - start, 60);
3070
3071 PerlIO_printf(Perl_debug_log,
3072 "%s%s REx%s %s against %s\n",
3073 PL_colors[4], blurb, PL_colors[5], s0, s1);
3074
f2ed9b32 3075 if (utf8_target||utf8_pat)
1de06328
YO
3076 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3077 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3078 utf8_pat && utf8_target ? " and " : "",
3079 utf8_target ? "string" : ""
ab3bbdeb
YO
3080 );
3081 }
3082}
3df15adc
YO
3083
3084STATIC void
786e8c11
YO
3085S_dump_exec_pos(pTHX_ const char *locinput,
3086 const regnode *scan,
3087 const char *loc_regeol,
3088 const char *loc_bostr,
3089 const char *loc_reg_starttry,
f2ed9b32 3090 const bool utf8_target)
07be1b83 3091{
786e8c11 3092 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3093 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3094 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3095 /* The part of the string before starttry has one color
3096 (pref0_len chars), between starttry and current
3097 position another one (pref_len - pref0_len chars),
3098 after the current position the third one.
3099 We assume that pref0_len <= pref_len, otherwise we
3100 decrease pref0_len. */
786e8c11
YO
3101 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3102 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3103 int pref0_len;
3104
7918f24d
NC
3105 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3106
f2ed9b32 3107 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3108 pref_len++;
786e8c11
YO
3109 pref0_len = pref_len - (locinput - loc_reg_starttry);
3110 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3111 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3112 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3113 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3114 l--;
3115 if (pref0_len < 0)
3116 pref0_len = 0;
3117 if (pref0_len > pref_len)
3118 pref0_len = pref_len;
3119 {
f2ed9b32 3120 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3121
ab3bbdeb 3122 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3123 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3124
ab3bbdeb 3125 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3126 (locinput - pref_len + pref0_len),
1de06328 3127 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3128
ab3bbdeb 3129 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3130 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3131
1de06328 3132 const STRLEN tlen=len0+len1+len2;
3df15adc 3133 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3134 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3135 (IV)(locinput - loc_bostr),
07be1b83 3136 len0, s0,
07be1b83 3137 len1, s1,
07be1b83 3138 (docolor ? "" : "> <"),
07be1b83 3139 len2, s2,
f9f4320a 3140 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3141 "");
3142 }
3143}
3df15adc 3144
07be1b83
YO
3145#endif
3146
0a4db386
YO
3147/* reg_check_named_buff_matched()
3148 * Checks to see if a named buffer has matched. The data array of
3149 * buffer numbers corresponding to the buffer is expected to reside
3150 * in the regexp->data->data array in the slot stored in the ARG() of
3151 * node involved. Note that this routine doesn't actually care about the
3152 * name, that information is not preserved from compilation to execution.
3153 * Returns the index of the leftmost defined buffer with the given name
3154 * or 0 if non of the buffers matched.
3155 */
3156STATIC I32
7918f24d
NC
3157S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3158{
0a4db386 3159 I32 n;
f8fc2ecf 3160 RXi_GET_DECL(rex,rexi);
ad64d0ec 3161 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3162 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3163
3164 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3165
0a4db386 3166 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3167 if ((I32)rex->lastparen >= nums[n] &&
3168 rex->offs[nums[n]].end != -1)
0a4db386
YO
3169 {
3170 return nums[n];
3171 }
3172 }
3173 return 0;
3174}
3175
2f554ef7
DM
3176
3177/* free all slabs above current one - called during LEAVE_SCOPE */
3178
3179STATIC void
3180S_clear_backtrack_stack(pTHX_ void *p)
3181{
3182 regmatch_slab *s = PL_regmatch_slab->next;
3183 PERL_UNUSED_ARG(p);
3184
3185 if (!s)
3186 return;
3187 PL_regmatch_slab->next = NULL;
3188 while (s) {
3189 regmatch_slab * const osl = s;
3190 s = s->next;
3191 Safefree(osl);
3192 }
3193}
3194
3195
f73aaa43
DM
3196/* returns -1 on failure, $+[0] on success */
3197STATIC I32
3198S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3199{
a35a87e7 3200#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3201 dMY_CXT;
3202#endif
27da23d5 3203 dVAR;
eb578fdb 3204 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3205 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3206 REGEXP *rex_sv = reginfo->prog;
3207 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3208 RXi_GET_DECL(rex,rexi);
2f554ef7 3209 I32 oldsave;
5d9a96ca 3210 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3211 regmatch_state *st;
5d9a96ca 3212 /* cache heavy used fields of st in registers */
eb578fdb
KW
3213 regnode *scan;
3214 regnode *next;
3215 U32 n = 0; /* general value; init to avoid compiler warning */
3216 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3217 char *locinput = startpos;
4d5016e5 3218 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3219 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3220
b69b0499 3221 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3222 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3223 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3224 const U32 max_nochange_depth =
3225 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3226 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3227 regmatch_state *yes_state = NULL; /* state to pop to on success of
3228 subpattern */
e2e6a0f1
YO
3229 /* mark_state piggy backs on the yes_state logic so that when we unwind
3230 the stack on success we can update the mark_state as we go */
3231 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3232 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3233 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3234 U32 state_num;
5d458dd8
YO
3235 bool no_final = 0; /* prevent failure from backtracking? */
3236 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3237 char *startpoint = locinput;
5d458dd8
YO
3238 SV *popmark = NULL; /* are we looking for a mark? */
3239 SV *sv_commit = NULL; /* last mark name seen in failure */
3240 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3241 during a successful match */
5d458dd8
YO
3242 U32 lastopen = 0; /* last open we saw */
3243 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3244 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3245 /* these three flags are set by various ops to signal information to
3246 * the very next op. They have a useful lifetime of exactly one loop
3247 * iteration, and are not preserved or restored by state pushes/pops
3248 */
3249 bool sw = 0; /* the condition value in (?(cond)a|b) */
3250 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3251 int logical = 0; /* the following EVAL is:
3252 0: (?{...})
3253 1: (?(?{...})X|Y)
3254 2: (??{...})
3255 or the following IFMATCH/UNLESSM is:
3256 false: plain (?=foo)
3257 true: used as a condition: (?(?=foo))
3258 */
81ed78b2
DM
3259 PAD* last_pad = NULL;
3260 dMULTICALL;
3261 I32 gimme = G_SCALAR;
3262 CV *caller_cv = NULL; /* who called us */
3263 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3264 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3265
95b24440 3266#ifdef DEBUGGING
e68ec53f 3267 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3268#endif
3269
81ed78b2
DM
3270 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3271 multicall_oldcatch = 0;
3272 multicall_cv = NULL;
3273 cx = NULL;
4f8dbb2d
JL
3274 PERL_UNUSED_VAR(multicall_cop);
3275 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3276
3277
7918f24d
NC
3278 PERL_ARGS_ASSERT_REGMATCH;
3279
3b57cd43 3280 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3281 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3282 }));
5d9a96ca
DM
3283 /* on first ever call to regmatch, allocate first slab */
3284 if (!PL_regmatch_slab) {
3285 Newx(PL_regmatch_slab, 1, regmatch_slab);
3286 PL_regmatch_slab->prev = NULL;
3287 PL_regmatch_slab->next = NULL;
86545054 3288 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3289 }
3290
2f554ef7
DM
3291 oldsave = PL_savestack_ix;
3292 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3293 SAVEVPTR(PL_regmatch_slab);
3294 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3295
3296 /* grab next free state slot */
3297 st = ++PL_regmatch_state;
86545054 3298 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3299 st = PL_regmatch_state = S_push_slab(aTHX);
3300
d6a28714
JH
3301 /* Note that nextchr is a byte even in UTF */
3302 nextchr = UCHARAT(locinput);
3303 scan = prog;
3304 while (scan != NULL) {
8ba1375e 3305
a3621e74 3306 DEBUG_EXECUTE_r( {
6136c704 3307 SV * const prop = sv_newmortal();
1de06328 3308 regnode *rnext=regnext(scan);
f2ed9b32 3309 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3310 regprop(rex, prop, scan);
07be1b83
YO
3311
3312 PerlIO_printf(Perl_debug_log,
3313 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3314 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3315 SvPVX_const(prop),
1de06328 3316 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3317 0 : (IV)(rnext - rexi->program));
2a782b5b 3318 });
d6a28714
JH
3319
3320 next = scan + NEXT_OFF(scan);
3321 if (next == scan)
3322 next = NULL;
40a82448 3323 state_num = OP(scan);
d6a28714 3324
40a82448 3325 reenter_switch:
34a81e2b 3326
40a82448 3327 switch (state_num) {
d6a28714 3328 case BOL:
7fba1cd6 3329 if (locinput == PL_bostr)
d6a28714 3330 {
3b0527fe 3331 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3332 break;
3333 }
d6a28714
JH
3334 sayNO;
3335 case MBOL:
12d33761
HS
3336 if (locinput == PL_bostr ||
3337 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 3338 {
b8c5462f
JH
3339 break;
3340 }
d6a28714
JH
3341 sayNO;
3342 case SBOL:
c2a73568 3343 if (locinput == PL_bostr)
b8c5462f 3344 break;
d6a28714
JH
3345 sayNO;
3346 case GPOS:
3b0527fe 3347 if (locinput == reginfo->ganch)
d6a28714
JH
3348 break;
3349 sayNO;
ee9b8eae
YO
3350
3351 case KEEPS:
3352 /* update the startpoint */
b93070ed 3353 st->u.keeper.val = rex->offs[0].start;
b93070ed 3354 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3355 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
ee9b8eae
YO
3356 /*NOT-REACHED*/
3357 case KEEPS_next_fail:
3358 /* rollback the start point change */
b93070ed 3359 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3360 sayNO_SILENT;
3361 /*NOT-REACHED*/
d6a28714 3362 case EOL:
d6a28714
JH
3363 goto seol;
3364 case MEOL:
d6a28714 3365 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3366 sayNO;
b8c5462f 3367 break;
d6a28714
JH
3368 case SEOL:
3369 seol:
3370 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 3371 sayNO;
d6a28714 3372 if (PL_regeol - locinput > 1)
b8c5462f 3373 sayNO;
b8c5462f 3374 break;
d6a28714
JH
3375 case EOS:
3376 if (PL_regeol != locinput)
b8c5462f 3377 sayNO;
d6a28714 3378 break;
ffc61ed2 3379 case SANY:
d6a28714 3380 if (!nextchr && locinput >= PL_regeol)
4633a7c4 3381 sayNO;
f2ed9b32 3382 if (utf8_target) {
f33976b4
DB
3383 locinput += PL_utf8skip[nextchr];
3384 if (locinput > PL_regeol)
3385 sayNO;
3386 nextchr = UCHARAT(locinput);
3387 }
3388 else
3389 nextchr = UCHARAT(++locinput);
3390 break;
3391 case CANY:
3392 if (!nextchr && locinput >= PL_regeol)
3393 sayNO;
b8c5462f 3394 nextchr = UCHARAT(++locinput);
a0d0e21e 3395 break;
ffc61ed2 3396 case REG_ANY:
1aa99e6b
IH
3397 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3398 sayNO;
f2ed9b32 3399 if (utf8_target) {
b8c5462f 3400 locinput += PL_utf8skip[nextchr];
d6a28714
JH
3401 if (locinput > PL_regeol)
3402 sayNO;
a0ed51b3 3403 nextchr = UCHARAT(locinput);
a0ed51b3 3404 }
1aa99e6b
IH
3405 else
3406 nextchr = UCHARAT(++locinput);
a0ed51b3 3407 break;
166ba7cd
DM
3408
3409#undef ST
3410#define ST st->u.trie
786e8c11
YO
3411 case TRIEC:
3412 /* In this case the charclass data is available inline so
3413 we can fail fast without a lot of extra overhead.
3414 */
fab2782b
YO
3415 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3416 DEBUG_EXECUTE_r(
3417 PerlIO_printf(Perl_debug_log,
3418 "%*s %sfailed to match trie start class...%s\n",
3419 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3420 );
3421 sayNO_SILENT;
118e2215 3422 assert(0); /* NOTREACHED */
786e8c11
YO
3423 }
3424 /* FALL THROUGH */
5b47454d 3425 case TRIE:
2e64971a
DM
3426 /* the basic plan of execution of the trie is:
3427 * At the beginning, run though all the states, and
3428 * find the longest-matching word. Also remember the position
3429 * of the shortest matching word. For example, this pattern:
3430 * 1 2 3 4 5
3431 * ab|a|x|abcd|abc
3432 * when matched against the string "abcde", will generate
3433 * accept states for all words except 3, with the longest
3434 * matching word being 4, and the shortest being 1 (with
3435 * the position being after char 1 of the string).
3436 *
3437 * Then for each matching word, in word order (i.e. 1,2,4,5),
3438 * we run the remainder of the pattern; on each try setting
3439 * the current position to the character following the word,
3440 * returning to try the next word on failure.
3441 *
3442 * We avoid having to build a list of words at runtime by
3443 * using a compile-time structure, wordinfo[].prev, which
3444 * gives, for each word, the previous accepting word (if any).
3445 * In the case above it would contain the mappings 1->2, 2->0,
3446 * 3->0, 4->5, 5->1. We can use this table to generate, from
3447 * the longest word (4 above), a list of all words, by
3448 * following the list of prev pointers; this gives us the
3449 * unordered list 4,5,1,2. Then given the current word we have
3450 * just tried, we can go through the list and find the
3451 * next-biggest word to try (so if we just failed on word 2,
3452 * the next in the list is 4).
3453 *
3454 * Since at runtime we don't record the matching position in
3455 * the string for each word, we have to work that out for
3456 * each word we're about to process. The wordinfo table holds
3457 * the character length of each word; given that we recorded
3458 * at the start: the position of the shortest word and its
3459 * length in chars, we just need to move the pointer the
3460 * difference between the two char lengths. Depending on
3461 * Unicode status and folding, that's cheap or expensive.
3462 *
3463 * This algorithm is optimised for the case where are only a
3464 * small number of accept states, i.e. 0,1, or maybe 2.
3465 * With lots of accepts states, and having to try all of them,
3466 * it becomes quadratic on number of accept states to find all
3467 * the next words.
3468 */
3469
3dab1dad 3470 {
07be1b83 3471 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3472 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3473
3474 /* what trie are we using right now */
be8e71aa 3475 reg_trie_data * const trie
f8fc2ecf 3476 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3477 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3478 U32 state = trie->startstate;
166ba7cd 3479
fab2782b 3480 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3dab1dad
YO
3481 if (trie->states[ state ].wordnum) {
3482 DEBUG_EXECUTE_r(
3483 PerlIO_printf(Perl_debug_log,
3484 "%*s %smatched empty string...%s\n",
5bc10b2c 3485 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3486 );
20dbff7c
YO
3487 if (!trie->jump)
3488 break;
3dab1dad
YO
3489 } else {
3490 DEBUG_EXECUTE_r(
3491 PerlIO_printf(Perl_debug_log,
786e8c11 3492 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3493 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3494 );
3495 sayNO_SILENT;
3496 }
3497 }
166ba7cd 3498
786e8c11
YO
3499 {
3500 U8 *uc = ( U8* )locinput;
3501
3502 STRLEN len = 0;
3503 STRLEN foldlen = 0;
3504 U8 *uscan = (U8*)NULL;
786e8c11 3505 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3506 U32 charcount = 0; /* how many input chars we have matched */
3507 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3508
786e8c11 3509 ST.jump = trie->jump;
786e8c11 3510 ST.me = scan;
2e64971a
DM
3511 ST.firstpos = NULL;
3512 ST.longfold = FALSE; /* char longer if folded => it's harder */
3513 ST.nextword = 0;
3514
3515 /* fully traverse the TRIE; note the position of the
3516 shortest accept state and the wordnum of the longest
3517 accept state */
07be1b83 3518
a3621e74 3519 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3520 U32 base = trie->states[ state ].trans.base;
f9f4320a 3521 UV uvc = 0;
acb909b4 3522 U16 charid = 0;
2e64971a
DM
3523 U16 wordnum;
3524 wordnum = trie->states[ state ].wordnum;
3525
3526 if (wordnum) { /* it's an accept state */
3527 if (!accepted) {
3528 accepted = 1;
3529 /* record first match position */
3530 if (ST.longfold) {
3531 ST.firstpos = (U8*)locinput;
3532 ST.firstchars = 0;
5b47454d 3533 }
2e64971a
DM
3534 else {
3535 ST.firstpos = uc;
3536 ST.firstchars = charcount;
3537 }
3538 }
3539 if (!ST.nextword || wordnum < ST.nextword)
3540 ST.nextword = wordnum;
3541 ST.topword = wordnum;
786e8c11 3542 }
a3621e74 3543
07be1b83 3544 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3545 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3546 PerlIO_printf( Perl_debug_log,
2e64971a 3547 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3548 2+depth * 2, "", PL_colors[4],
2e64971a 3549 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3550 });
a3621e74 3551
2e64971a 3552 /* read a char and goto next state */
a3621e74 3553 if ( base ) {
6dd2be57 3554 I32 offset;
55eed653
NC
3555 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3556 uscan, len, uvc, charid, foldlen,
3557 foldbuf, uniflags);
2e64971a
DM
3558 charcount++;
3559 if (foldlen>0)
3560 ST.longfold = TRUE;
5b47454d 3561 if (charid &&
6dd2be57
DM
3562 ( ((offset =
3563 base + charid - 1 - trie->uniquecharcount)) >= 0)
3564
3565 && ((U32)offset < trie->lasttrans)
3566 && trie->trans[offset].check == state)
5b47454d 3567 {
6dd2be57 3568 state = trie->trans[offset].next;
5b47454d
DM
3569 }
3570 else {
3571 state = 0;
3572 }
3573 uc += len;
3574
3575 }
3576 else {
a3621e74
YO
3577 state = 0;
3578 }
3579 DEBUG_TRIE_EXECUTE_r(
e4584336 3580 PerlIO_printf( Perl_debug_log,
786e8c11 3581 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3582 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3583 );
3584 }
2e64971a 3585 if (!accepted)
a3621e74 3586 sayNO;
a3621e74 3587
2e64971a
DM
3588 /* calculate total number of accept states */
3589 {
3590 U16 w = ST.topword;
3591 accepted = 0;
3592 while (w) {
3593 w = trie->wordinfo[w].prev;
3594 accepted++;
3595 }
3596 ST.accepted = accepted;
3597 }
3598
166ba7cd
DM
3599 DEBUG_EXECUTE_r(
3600 PerlIO_printf( Perl_debug_log,
3601 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3602 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3603 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3604 );
2e64971a 3605 goto trie_first_try; /* jump into the fail handler */
786e8c11 3606 }}
118e2215 3607 assert(0); /* NOTREACHED */
2e64971a
DM
3608
3609 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3610 {
3611 U8 *uc;
fae667d5
YO
3612 if ( ST.jump) {
3613 REGCP_UNWIND(ST.cp);
a8d1f4b4 3614 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3615 }
2e64971a
DM
3616 if (!--ST.accepted) {
3617 DEBUG_EXECUTE_r({
3618 PerlIO_printf( Perl_debug_log,
3619 "%*s %sTRIE failed...%s\n",
3620 REPORT_CODE_OFF+depth*2, "",
3621 PL_colors[4],
3622 PL_colors[5] );
3623 });
3624 sayNO_SILENT;
3625 }
3626 {
3627 /* Find next-highest word to process. Note that this code
3628 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3629 U16 min = 0;
3630 U16 word;
3631 U16 const nextword = ST.nextword;
3632 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3633 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3634 for (word=ST.topword; word; word=wordinfo[word].prev) {
3635 if (word > nextword && (!min || word < min))
3636 min = word;
3637 }
3638 ST.nextword = min;
3639 }
3640
fae667d5 3641 trie_first_try:
5d458dd8
YO
3642 if (do_cutgroup) {
3643 do_cutgroup = 0;
3644 no_final = 0;
3645 }
fae667d5
YO
3646
3647 if ( ST.jump) {
b93070ed 3648 ST.lastparen = rex->lastparen;
f6033a9d 3649 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3650 REGCP_SET(ST.cp);
2e64971a 3651 }
a3621e74 3652
2e64971a 3653 /* find start char of end of current word */
166ba7cd 3654 {
2e64971a 3655 U32 chars; /* how many chars to skip */
2e64971a
DM
3656 reg_trie_data * const trie
3657 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3658
3659 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3660 >= ST.firstchars);
3661 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3662 - ST.firstchars;
a059a757 3663 uc = ST.firstpos;
2e64971a
DM
3664
3665 if (ST.longfold) {
3666 /* the hard option - fold each char in turn and find
3667 * its folded length (which may be different */
3668 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3669 STRLEN foldlen;
3670 STRLEN len;
d9a396a3 3671 UV uvc;
2e64971a
DM
3672 U8 *uscan;
3673
3674 while (chars) {
f2ed9b32 3675 if (utf8_target) {
2e64971a
DM
3676 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3677 uniflags);
3678 uc += len;
3679 }
3680 else {
3681 uvc = *uc;
3682 uc++;
3683 }
3684 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3685 uscan = foldbuf;
3686 while (foldlen) {
3687 if (!--chars)
3688 break;
3689 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3690 uniflags);
3691 uscan += len;
3692 foldlen -= len;
3693 }
3694 }
a3621e74 3695 }
2e64971a 3696 else {
f2ed9b32 3697 if (utf8_target)
2e64971a
DM
3698 while (chars--)
3699 uc += UTF8SKIP(uc);
3700 else
3701 uc += chars;
3702 }
2e64971a 3703 }
166ba7cd 3704
6603fe3e
DM
3705 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3706 ? ST.jump[ST.nextword]
3707 : NEXT_OFF(ST.me));
166ba7cd 3708
2e64971a
DM
3709 DEBUG_EXECUTE_r({
3710 PerlIO_printf( Perl_debug_log,
3711 "%*s %sTRIE matched word #%d, continuing%s\n",
3712 REPORT_CODE_OFF+depth*2, "",
3713 PL_colors[4],
3714 ST.nextword,
3715 PL_colors[5]
3716 );
3717 });
3718
3719 if (ST.accepted > 1 || has_cutgroup) {
a059a757 3720 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 3721 assert(0); /* NOTREACHED */
166ba7cd 3722 }
2e64971a
DM
3723 /* only one choice left - just continue */
3724 DEBUG_EXECUTE_r({
3725 AV *const trie_words
3726 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3727 SV ** const tmp = av_fetch( trie_words,
3728 ST.nextword-1, 0 );
3729 SV *sv= tmp ? sv_newmortal() : NULL;
3730
3731 PerlIO_printf( Perl_debug_log,
3732 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3733 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3734 ST.nextword,
3735 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3736 PL_colors[0], PL_colors[1],
c89df6cf 3737 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3738 )
3739 : "not compiled under -Dr",
3740 PL_colors[5] );
3741 });
3742
a059a757 3743 locinput = (char*)uc;
2e64971a
DM
3744 nextchr = UCHARAT(locinput);
3745 continue; /* execute rest of RE */
118e2215 3746 assert(0); /* NOTREACHED */
a059a757 3747 }
166ba7cd
DM
3748#undef ST
3749
95b24440
DM
3750 case EXACT: {
3751 char *s = STRING(scan);
24d3c4a9 3752 ln = STR_LEN(scan);
f2ed9b32 3753 if (utf8_target != UTF_PATTERN) {
bc517b45 3754 /* The target and the pattern have differing utf8ness. */
1aa99e6b 3755 char *l = locinput;
24d3c4a9 3756 const char * const e = s + ln;
a72c7584 3757
f2ed9b32 3758 if (utf8_target) {
e6a3850e
KW
3759 /* The target is utf8, the pattern is not utf8.
3760 * Above-Latin1 code points can't match the pattern;
3761 * invariants match exactly, and the other Latin1 ones need
3762 * to be downgraded to a single byte in order to do the
3763 * comparison. (If we could be confident that the target
3764 * is not malformed, this could be refactored to have fewer
3765 * tests by just assuming that if the first bytes match, it
3766 * is an invariant, but there are tests in the test suite
3767 * dealing with (??{...}) which violate this) */
1aa99e6b
IH
3768 while (s < e) {
3769 if (l >= PL_regeol)
5ff6fc6d 3770 sayNO;
e6a3850e
KW
3771 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
3772 sayNO;
3773 }
3774 if (UTF8_IS_INVARIANT(*(U8*)l)) {
3775 if (*l != *s) {
3776 sayNO;
3777 }
3778 l++;
3779 }
3780 else {
3781 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
3782 sayNO;
3783 }
3784 l += 2;
3785 }
3786 s++;
1aa99e6b 3787 }
5ff6fc6d
JH
3788 }
3789 else {
3790 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 3791 while (s < e) {
e6a3850e
KW
3792 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
3793 {
3794 sayNO;
3795 }
3796 if (UTF8_IS_INVARIANT(*(U8*)s)) {
3797 if (*s != *l) {
3798 sayNO;
3799 }
3800 s++;
3801 }
3802 else {
3803 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
3804 sayNO;
3805 }
3806 s += 2;
3807 }
3808 l++;
1aa99e6b 3809 }
5ff6fc6d 3810 }
1aa99e6b
IH
3811 locinput = l;
3812 nextchr = UCHARAT(locinput);
3813 break;
3814 }
bc517b45 3815 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
3816 /* Inline the first character, for speed. */
3817 if (UCHARAT(s) != nextchr)
3818 sayNO;
24d3c4a9 3819 if (PL_regeol - locinput < ln)
d6a28714 3820 sayNO;
24d3c4a9 3821 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 3822 sayNO;
24d3c4a9 3823 locinput += ln;
d6a28714
JH
3824 nextchr = UCHARAT(locinput);
3825 break;
95b24440 3826 }
9a5a5549 3827 case EXACTFL: {
a932d541 3828 re_fold_t folder;
9a5a5549
KW
3829 const U8 * fold_array;
3830 const char * s;
d513472c 3831 U32 fold_utf8_flags;
9a5a5549 3832
b8c5462f 3833 PL_reg_flags |= RF_tainted;
9a5a5549
KW
3834 folder = foldEQ_locale;
3835 fold_array = PL_fold_locale;
17580e7a 3836 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
3837 goto do_exactf;
3838
3c760661 3839 case EXACTFU_SS:
fab2782b 3840 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
3841 case EXACTFU:
3842 folder = foldEQ_latin1;
3843 fold_array = PL_fold_latin1;
2daa8fee 3844 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
3845 goto do_exactf;
3846
2f7f8cb1
KW
3847 case EXACTFA:
3848 folder = foldEQ_latin1;
3849 fold_array = PL_fold_latin1;
57014d77 3850 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
3851 goto do_exactf;
3852
9a5a5549
KW
3853 case EXACTF:
3854 folder = foldEQ;
3855 fold_array = PL_fold;
62bf7766 3856 fold_utf8_flags = 0;
9a5a5549
KW
3857
3858 do_exactf:
3859 s = STRING(scan);
24d3c4a9 3860 ln = STR_LEN(scan);
d6a28714 3861
3c760661
KW
3862 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3863 /* Either target or the pattern are utf8, or has the issue where
3864 * the fold lengths may differ. */
be8e71aa 3865 const char * const l = locinput;
d07ddd77 3866 char *e = PL_regeol;
bc517b45 3867
d513472c 3868 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 3869 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
3870 {
3871 sayNO;
5486206c 3872 }
d07ddd77
JH
3873 locinput = e;
3874 nextchr = UCHARAT(locinput);
3875 break;
a0ed51b3 3876 }
d6a28714 3877
0a138b74 3878 /* Neither the target nor the pattern are utf8 */
d6a28714 3879 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
3880 UCHARAT(s) != fold_array[nextchr])
3881 {
a0ed51b3 3882 sayNO;
9a5a5549 3883 }
24d3c4a9 3884 if (PL_regeol - locinput < ln)
b8c5462f 3885 sayNO;
9a5a5549 3886 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 3887 sayNO;
24d3c4a9 3888 locinput += ln;
d6a28714 3889 nextchr = UCHARAT(locinput);
a0d0e21e 3890 break;
9a5a5549 3891 }
63ac0dad
KW
3892
3893 /* XXX Could improve efficiency by separating these all out using a
3894 * macro or in-line function. At that point regcomp.c would no longer
3895 * have to set the FLAGS fields of these */
b2680017
YO
3896 case BOUNDL:
3897 case NBOUNDL:
3898 PL_reg_flags |= RF_tainted;
3899 /* FALL THROUGH */
3900 case BOUND:
63ac0dad 3901 case BOUNDU:
cfaf538b 3902 case BOUNDA:
b2680017 3903 case NBOUND:
63ac0dad 3904 case NBOUNDU:
cfaf538b 3905 case NBOUNDA:
b2680017 3906 /* was last char in word? */
f2e96b5d
KW
3907 if (utf8_target
3908 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3909 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3910 {
b2680017
YO
3911 if (locinput == PL_bostr)
3912 ln = '\n';
3913 else {
3914 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3915
3916 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3917 }
63ac0dad 3918 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017
YO
3919 ln = isALNUM_uni(ln);
3920 LOAD_UTF8_CHARCLASS_ALNUM();
f2ed9b32 3921 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
b2680017
YO
3922 }
3923 else {
3924 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3925 n = isALNUM_LC_utf8((U8*)locinput);
3926 }
3927 }
3928 else {
cfaf538b
KW
3929
3930 /* Here the string isn't utf8, or is utf8 and only ascii
3931 * characters are to match \w. In the latter case looking at
3932 * the byte just prior to the current one may be just the final
3933 * byte of a multi-byte character. This is ok. There are two
3934 * cases:
3935 * 1) it is a single byte character, and then the test is doing
3936 * just what it's supposed to.
3937 * 2) it is a multi-byte character, in which case the final
3938 * byte is never mistakable for ASCII, and so the test
3939 * will say it is not a word character, which is the
3940 * correct answer. */
b2680017
YO
3941 ln = (locinput != PL_bostr) ?
3942 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
3943 switch (FLAGS(scan)) {
3944 case REGEX_UNICODE_CHARSET:
3945 ln = isWORDCHAR_L1(ln);
3946 n = isWORDCHAR_L1(nextchr);
3947 break;
3948 case REGEX_LOCALE_CHARSET:
3949 ln = isALNUM_LC(ln);
3950 n = isALNUM_LC(nextchr);
3951 break;
3952 case REGEX_DEPENDS_CHARSET:
3953 ln = isALNUM(ln);
3954 n = isALNUM(nextchr);
3955 break;
cfaf538b 3956 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 3957 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
3958 ln = isWORDCHAR_A(ln);
3959 n = isWORDCHAR_A(nextchr);
3960 break;
63ac0dad
KW
3961 default:
3962 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3963 break;
b2680017
YO
3964 }
3965 }
63ac0dad
KW
3966 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3967 * regcomp.sym */
3968 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
3969 sayNO;
3970 break;
f56b6394 3971 case ANYOFV:
d6a28714 3972 case ANYOF:
f56b6394 3973 if (utf8_target || state_num == ANYOFV) {
9e55ce06 3974 STRLEN inclasslen = PL_regeol - locinput;
20ed0b26
KW
3975 if (locinput >= PL_regeol)
3976 sayNO;
9e55ce06 3977
f2ed9b32 3978 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 3979 sayNO;
b32d7d3e 3980 locinput += inclasslen;
b8c5462f 3981 nextchr = UCHARAT(locinput);
e0f9d4a8 3982 break;
ffc61ed2
JH
3983 }
3984 else {
3985 if (nextchr < 0)
3986 nextchr = UCHARAT(locinput);
ffc61ed2
JH
3987 if (!nextchr && locinput >= PL_regeol)
3988 sayNO;
20ed0b26 3989 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 3990 sayNO;
ffc61ed2 3991 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
3992 break;
3993 }
b8c5462f 3994 break;
20d0b1e9 3995 /* Special char classes - The defines start on line 129 or so */
ee9a90b8
KW
3996 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3997 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3998 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 3999 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 4000 alnum, "a");
ee9a90b8
KW
4001
4002 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4003 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4004 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 4005 SPACEA, NSPACEA, isSPACE_A,
779d7b58 4006 space, " ");
ee9a90b8
KW
4007
4008 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4009 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 4010 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 4011 digit, "0");
20d0b1e9 4012
0658cdde
KW
4013 case POSIXA:
4014 if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4015 sayNO;
4016 }
4017 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4018 nextchr = UCHARAT(++locinput);
4019 break;
4020 case NPOSIXA:
4021 if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
4022 sayNO;
4023 }
4024 if (utf8_target) {
4025 locinput += PL_utf8skip[nextchr];
4026 nextchr = UCHARAT(locinput);
4027 }
4028 else {
4029 nextchr = UCHARAT(++locinput);
4030 }
4031 break;
4032
37e2e78e
KW
4033 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4034 a Unicode extended Grapheme Cluster */
4035 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4036 extended Grapheme Cluster is:
4037
4038 CR LF
4039 | Prepend* Begin Extend*
4040 | .
4041
1e958ea9
KW
4042 Begin is: ( Special_Begin | ! Control )
4043 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4044 Extend is: ( Grapheme_Extend | Spacing_Mark )
4045 Control is: [ GCB_Control CR LF ]
4046 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4047
27d4fc33
KW
4048 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4049 we can rewrite
4050
4051 Begin is ( Regular_Begin + Special Begin )
4052
4053 It turns out that 98.4% of all Unicode code points match
4054 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4055 the previous implementation for almost all Unicode code points.
27d4fc33 4056
37e2e78e
KW
4057 There is a subtlety with Prepend* which showed up in testing.
4058 Note that the Begin, and only the Begin is required in:
4059 | Prepend* Begin Extend*
cc3b396d
KW
4060 Also, Begin contains '! Control'. A Prepend must be a
4061 '! Control', which means it must also be a Begin. What it
4062 comes down to is that if we match Prepend* and then find no
4063 suitable Begin afterwards, that if we backtrack the last
4064 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4065 */
4066
b7c83a7e 4067 if (locinput >= PL_regeol)
a0ed51b3 4068 sayNO;
f2ed9b32 4069 if (! utf8_target) {
37e2e78e
KW
4070
4071 /* Match either CR LF or '.', as all the other possibilities
4072 * require utf8 */
4073 locinput++; /* Match the . or CR */
cc3b396d
KW
4074 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4075 match the LF */
37e2e78e
KW
4076 && locinput < PL_regeol
4077 && UCHARAT(locinput) == '\n') locinput++;
4078 }
4079 else {
4080
4081 /* Utf8: See if is ( CR LF ); already know that locinput <
4082 * PL_regeol, so locinput+1 is in bounds */
4083 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
4084 locinput += 2;
4085 }
4086 else {
45fdf108
KW
4087 STRLEN len;
4088
37e2e78e
KW
4089 /* In case have to backtrack to beginning, then match '.' */
4090 char *starting = locinput;
4091
4092 /* In case have to backtrack the last prepend */
4093 char *previous_prepend = 0;
4094
4095 LOAD_UTF8_CHARCLASS_GCB();
4096
45fdf108
KW
4097 /* Match (prepend)* */
4098 while (locinput < PL_regeol
4099 && (len = is_GCB_Prepend_utf8(locinput)))
4100 {
4101 previous_prepend = locinput;
4102 locinput += len;
a1853d78 4103 }
37e2e78e
KW
4104
4105 /* As noted above, if we matched a prepend character, but
4106 * the next thing won't match, back off the last prepend we
4107 * matched, as it is guaranteed to match the begin */
4108 if (previous_prepend
4109 && (locinput >= PL_regeol
c101f46d
KW
4110 || (! swash_fetch(PL_utf8_X_regular_begin,
4111 (U8*)locinput, utf8_target)
45fdf108 4112 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
c101f46d 4113 )
37e2e78e
KW
4114 {
4115 locinput = previous_prepend;
4116 }
4117
4118 /* Note that here we know PL_regeol > locinput, as we
4119 * tested that upon input to this switch case, and if we
4120 * moved locinput forward, we tested the result just above
4121 * and it either passed, or we backed off so that it will
4122 * now pass */
11dfcd49
KW
4123 if (swash_fetch(PL_utf8_X_regular_begin,
4124 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4125 locinput += UTF8SKIP(locinput);
4126 }
45fdf108 4127 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
37e2e78e
KW
4128
4129 /* Here did not match the required 'Begin' in the
4130 * second term. So just match the very first
4131 * character, the '.' of the final term of the regex */
4132 locinput = starting + UTF8SKIP(starting);
27d4fc33 4133 goto exit_utf8;
37e2e78e
KW
4134 } else {
4135
11dfcd49
KW
4136 /* Here is a special begin. It can be composed of
4137 * several individual characters. One possibility is
4138 * RI+ */
45fdf108
KW
4139 if ((len = is_GCB_RI_utf8(locinput))) {
4140 locinput += len;
11dfcd49 4141 while (locinput < PL_regeol
45fdf108 4142 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4143 {
45fdf108 4144 locinput += len;
11dfcd49 4145 }
45fdf108
KW
4146 } else if ((len = is_GCB_T_utf8(locinput))) {
4147 /* Another possibility is T+ */
4148 locinput += len;
11dfcd49 4149 while (locinput < PL_regeol
45fdf108 4150 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4151 {
45fdf108 4152 locinput += len;
11dfcd49
KW
4153 }
4154 } else {
4155
4156 /* Here, neither RI+ nor T+; must be some other
4157 * Hangul. That means it is one of the others: L,
4158 * LV, LVT or V, and matches:
4159 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4160
4161 /* Match L* */
4162 while (locinput < PL_regeol
45fdf108 4163 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4164 {
45fdf108 4165 locinput += len;
11dfcd49 4166 }
37e2e78e 4167
11dfcd49
KW
4168 /* Here, have exhausted L*. If the next character
4169 * is not an LV, LVT nor V, it means we had to have
4170 * at least one L, so matches L+ in the original
4171 * equation, we have a complete hangul syllable.
4172 * Are done. */
4173
4174 if (locinput < PL_regeol
45fdf108 4175 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49
KW
4176 {
4177
4178 /* Otherwise keep going. Must be LV, LVT or V.
4179 * See if LVT */
4180 if (is_utf8_X_LVT((U8*)locinput)) {
4181 locinput += UTF8SKIP(locinput);
4182 } else {
4183
4184 /* Must be V or LV. Take it, then match
4185 * V* */
4186 locinput += UTF8SKIP(locinput);
4187 while (locinput < PL_regeol
45fdf108 4188 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4189 {
45fdf108 4190 locinput += len;
11dfcd49
KW
4191 }
4192 }
37e2e78e 4193
11dfcd49 4194 /* And any of LV, LVT, or V can be followed
45fdf108 4195 * by T* */
11dfcd49 4196 while (locinput < PL_regeol
45fdf108 4197 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4198 {
45fdf108 4199 locinput += len;
11dfcd49
KW
4200 }
4201 }
cd94d768 4202 }
11dfcd49 4203 }
37e2e78e 4204
11dfcd49
KW
4205 /* Match any extender */
4206 while (locinput < PL_regeol
4207 && swash_fetch(PL_utf8_X_extend,
4208 (U8*)locinput, utf8_target))
4209 {
4210 locinput += UTF8SKIP(locinput);
4211 }
37e2e78e 4212 }
27d4fc33 4213 exit_utf8:
37e2e78e
KW
4214 if (locinput > PL_regeol) sayNO;
4215 }
a0ed51b3
LW
4216 nextchr = UCHARAT(locinput);
4217 break;
81714fb9
YO
4218
4219 case NREFFL:
d7ef4b73
KW
4220 { /* The capture buffer cases. The ones beginning with N for the
4221 named buffers just convert to the equivalent numbered and
4222 pretend they were called as the corresponding numbered buffer
4223 op. */
26ecd678
TC
4224 /* don't initialize these in the declaration, it makes C++
4225 unhappy */
81714fb9 4226 char *s;
ff1157ca 4227 char type;
8368298a
TC
4228 re_fold_t folder;
4229 const U8 *fold_array;
26ecd678 4230 UV utf8_fold_flags;
8368298a 4231
81714fb9 4232 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4233 folder = foldEQ_locale;
4234 fold_array = PL_fold_locale;
4235 type = REFFL;
17580e7a 4236 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4237 goto do_nref;
4238
2f7f8cb1
KW
4239 case NREFFA:
4240 folder = foldEQ_latin1;
4241 fold_array = PL_fold_latin1;
4242 type = REFFA;
4243 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4244 goto do_nref;
4245
d7ef4b73
KW
4246 case NREFFU:
4247 folder = foldEQ_latin1;
4248 fold_array = PL_fold_latin1;
4249 type = REFFU;
d513472c 4250 utf8_fold_flags = 0;
d7ef4b73
KW
4251 goto do_nref;
4252
81714fb9 4253 case NREFF:
d7ef4b73
KW
4254 folder = foldEQ;
4255 fold_array = PL_fold;
4256 type = REFF;
d513472c 4257 utf8_fold_flags = 0;
d7ef4b73
KW
4258 goto do_nref;
4259
4260 case NREF:
4261 type = REF;
83d7b90b
KW
4262 folder = NULL;
4263 fold_array = NULL;
d513472c 4264 utf8_fold_flags = 0;
d7ef4b73
KW
4265 do_nref:
4266
4267 /* For the named back references, find the corresponding buffer
4268 * number */
0a4db386
YO
4269 n = reg_check_named_buff_matched(rex,scan);
4270
d7ef4b73 4271 if ( ! n ) {
81714fb9 4272 sayNO;
d7ef4b73
KW
4273 }
4274 goto do_nref_ref_common;
4275
c8756f30 4276 case REFFL:
3280af22 4277 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4278 folder = foldEQ_locale;
4279 fold_array = PL_fold_locale;
17580e7a 4280 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4281 goto do_ref;
4282
2f7f8cb1
KW
4283 case REFFA:
4284 folder = foldEQ_latin1;
4285 fold_array = PL_fold_latin1;
4286 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4287 goto do_ref;
4288
d7ef4b73
KW
4289 case REFFU:
4290 folder = foldEQ_latin1;
4291 fold_array = PL_fold_latin1;
d513472c 4292 utf8_fold_flags = 0;
d7ef4b73
KW
4293 goto do_ref;
4294
4295 case REFF:
4296 folder = foldEQ;
4297 fold_array = PL_fold;
d513472c 4298 utf8_fold_flags = 0;
83d7b90b 4299 goto do_ref;
d7ef4b73 4300
c277df42 4301 case REF:
83d7b90b
KW
4302 folder = NULL;
4303 fold_array = NULL;
d513472c 4304 utf8_fold_flags = 0;
83d7b90b 4305
d7ef4b73 4306 do_ref:
81714fb9 4307 type = OP(scan);
d7ef4b73
KW
4308 n = ARG(scan); /* which paren pair */
4309
4310 do_nref_ref_common:
b93070ed 4311 ln = rex->offs[n].start;
2c2d71f5 4312 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4313 if (rex->lastparen < n || ln == -1)
af3f8c16 4314 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4315 if (ln == rex->offs[n].end)
a0d0e21e 4316 break;
a0ed51b3 4317
24d3c4a9 4318 s = PL_bostr + ln;
d7ef4b73 4319 if (type != REF /* REF can do byte comparison */
2f65c56d 4320 && (utf8_target || type == REFFU))
d7ef4b73
KW
4321 { /* XXX handle REFFL better */
4322 char * limit = PL_regeol;
4323
4324 /* This call case insensitively compares the entire buffer
4325 * at s, with the current input starting at locinput, but
4326 * not going off the end given by PL_regeol, and returns in
4327 * limit upon success, how much of the current input was
4328 * matched */
b93070ed 4329 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4330 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4331 {
4332 sayNO;
a0ed51b3 4333 }
d7ef4b73 4334 locinput = limit;
a0ed51b3
LW
4335 nextchr = UCHARAT(locinput);
4336 break;
4337 }
4338
d7ef4b73 4339 /* Not utf8: Inline the first character, for speed. */
76e3520e 4340 if (UCHARAT(s) != nextchr &&
81714fb9 4341 (type == REF ||
d7ef4b73 4342 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4343 sayNO;
b93070ed 4344 ln = rex->offs[n].end - ln;
24d3c4a9 4345 if (locinput + ln > PL_regeol)
4633a7c4 4346 sayNO;
81714fb9 4347 if (ln > 1 && (type == REF
24d3c4a9 4348 ? memNE(s, locinput, ln)
d7ef4b73 4349 : ! folder(s, locinput, ln)))
4633a7c4 4350 sayNO;
24d3c4a9 4351 locinput += ln;
76e3520e 4352 nextchr = UCHARAT(locinput);
a0d0e21e 4353 break;
81714fb9 4354 }
a0d0e21e 4355 case NOTHING:
c277df42 4356 case TAIL:
a0d0e21e
LW
4357 break;
4358 case BACK:
4359 break;
40a82448
DM
4360
4361#undef ST
4362#define ST st->u.eval
c277df42 4363 {
c277df42 4364 SV *ret;
d2f13c59 4365 REGEXP *re_sv;
6bda09f9 4366 regexp *re;
f8fc2ecf 4367 regexp_internal *rei;
1a147d38
YO
4368 regnode *startpoint;
4369
4370 case GOSTART:
e7707071
YO
4371 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4372 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4373 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4374 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4375 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4376 Perl_croak(aTHX_
4377 "Pattern subroutine nesting without pos change"
4378 " exceeded limit in regex");
6bda09f9
YO
4379 } else {
4380 nochange_depth = 0;
1a147d38 4381 }
288b8c02 4382 re_sv = rex_sv;
6bda09f9 4383 re = rex;
f8fc2ecf 4384 rei = rexi;
1a147d38 4385 if (OP(scan)==GOSUB) {
6bda09f9
YO
4386 startpoint = scan + ARG2L(scan);
4387 ST.close_paren = ARG(scan);
4388 } else {
f8fc2ecf 4389 startpoint = rei->program+1;
6bda09f9
YO
4390 ST.close_paren = 0;
4391 }
4392 goto eval_recurse_doit;
118e2215 4393 assert(0); /* NOTREACHED */
6bda09f9
YO
4394 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4395 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4396 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4397 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4398 } else {
4399 nochange_depth = 0;
4400 }
8e5e9ebe 4401 {
4aabdb9b 4402 /* execute the code in the {...} */
81ed78b2 4403
4aabdb9b 4404 dSP;
81ed78b2 4405 SV ** before;
1f4d1a1e 4406 OP * const oop = PL_op;
4aabdb9b 4407 COP * const ocurcop = PL_curcop;
81ed78b2 4408 OP *nop;
d80618d2 4409 char *saved_regeol = PL_regeol;
91332126 4410 struct re_save_state saved_state;
81ed78b2 4411 CV *newcv;
91332126 4412
74088413
DM
4413 /* save *all* paren positions */
4414 regcppush(rex, 0);
4415 REGCP_SET(runops_cp);
4416
6562f1c4 4417 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4418 * eval we would normally put it on the save stack, like with
4419 * save_re_context. However, re-evals have a weird scoping so we
4420 * can't just add ENTER/LEAVE here. With that, things like
4421 *
4422 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4423 *
4424 * would break, as they expect the localisation to be unwound
4425 * only when the re-engine backtracks through the bit that
4426 * localised it.
4427 *
4428 * What we do instead is just saving the state in a local c
4429 * variable.
4430 */
91332126 4431 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4432
d24ca0c5 4433 PL_reg_state.re_reparsing = FALSE;
91332126 4434
81ed78b2
DM
4435 if (!caller_cv)
4436 caller_cv = find_runcv(NULL);
4437
4aabdb9b 4438 n = ARG(scan);
81ed78b2 4439
b30fcab9 4440 if (rexi->data->what[n] == 'r') { /* code from an external qr */
81ed78b2 4441 newcv = ((struct regexp *)SvANY(
b30fcab9
DM
4442 (REGEXP*)(rexi->data->data[n])
4443 ))->qr_anoncv
81ed78b2
DM
4444 ;
4445 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4446 }
4447 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4448 newcv = caller_cv;
4449 nop = (OP*)rexi->data->data[n];
4450 assert(CvDEPTH(newcv));
68e2671b
DM
4451 }
4452 else {
d24ca0c5
DM
4453 /* literal with own CV */
4454 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4455 newcv = rex->qr_anoncv;
4456 nop = (OP*)rexi->data->data[n];
68e2671b 4457 }
81ed78b2 4458
0e458318
DM
4459 /* normally if we're about to execute code from the same
4460 * CV that we used previously, we just use the existing
4461 * CX stack entry. However, its possible that in the
4462 * meantime we may have backtracked, popped from the save
4463 * stack, and undone the SAVECOMPPAD(s) associated with
4464 * PUSH_MULTICALL; in which case PL_comppad no longer
4465 * points to newcv's pad. */
4466 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4467 {
4468 I32 depth = (newcv == caller_cv) ? 0 : 1;
4469 if (last_pushed_cv) {
4470 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4471 }
4472 else {
4473 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4474 }
4475 last_pushed_cv = newcv;
4476 }
4477 last_pad = PL_comppad;
4478
2e2e3f36
DM
4479 /* the initial nextstate you would normally execute
4480 * at the start of an eval (which would cause error
4481 * messages to come from the eval), may be optimised
4482 * away from the execution path in the regex code blocks;
4483 * so manually set PL_curcop to it initially */
4484 {
81ed78b2 4485 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4486 assert(o->op_type == OP_NULL);
4487 if (o->op_targ == OP_SCOPE) {
4488 o = cUNOPo->op_first;
4489 }
4490 else {
4491 assert(o->op_targ == OP_LEAVE);
4492 o = cUNOPo->op_first;
4493 assert(o->op_type == OP_ENTER);
4494 o = o->op_sibling;
4495 }
4496
4497 if (o->op_type != OP_STUB) {
4498 assert( o->op_type == OP_NEXTSTATE
4499 || o->op_type == OP_DBSTATE
4500 || (o->op_type == OP_NULL
4501 && ( o->op_targ == OP_NEXTSTATE
4502 || o->op_targ == OP_DBSTATE
4503 )
4504 )
4505 );
4506 PL_curcop = (COP*)o;
4507 }
4508 }
81ed78b2 4509 nop = nop->op_next;
2e2e3f36 4510
24b23f37 4511 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4512 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4513
b93070ed 4514 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4515
2bf803e2
YO
4516 if (sv_yes_mark) {
4517 SV *sv_mrk = get_sv("REGMARK", 1);
4518 sv_setsv(sv_mrk, sv_yes_mark);
4519 }
4520
81ed78b2
DM
4521 /* we don't use MULTICALL here as we want to call the
4522 * first op of the block of interest, rather than the
4523 * first op of the sub */
4524 before = SP;
4525 PL_op = nop;
8e5e9ebe
RGS
4526 CALLRUNOPS(aTHX); /* Scalar context. */
4527 SPAGAIN;
4528 if (SP == before)
075aa684 4529 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4530 else {
4531 ret = POPs;
4532 PUTBACK;
4533 }
4aabdb9b 4534
e4bfbed3
DM
4535 /* before restoring everything, evaluate the returned
4536 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4537 * PL_op or pad. Also need to process any magic vars
4538 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4539
4540 PL_op = NULL;
4541
5e98dac2 4542 re_sv = NULL;
e4bfbed3
DM
4543 if (logical == 0) /* (?{})/ */
4544 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4545 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4546 sw = cBOOL(SvTRUE(ret));
4547 logical = 0;
4548 }
4549 else { /* /(??{}) */
497d0a96
DM
4550 /* if its overloaded, let the regex compiler handle
4551 * it; otherwise extract regex, or stringify */
4552 if (!SvAMAGIC(ret)) {
4553 SV *sv = ret;
4554 if (SvROK(sv))
4555 sv = SvRV(sv);
4556 if (SvTYPE(sv) == SVt_REGEXP)
4557 re_sv = (REGEXP*) sv;
4558 else if (SvSMAGICAL(sv)) {
4559 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4560 if (mg)
4561 re_sv = (REGEXP *) mg->mg_obj;
4562 }
e4bfbed3 4563
497d0a96
DM
4564 /* force any magic, undef warnings here */
4565 if (!re_sv) {
4566 ret = sv_mortalcopy(ret);
4567 (void) SvPV_force_nolen(ret);
4568 }
e4bfbed3
DM
4569 }
4570
4571 }
4572
91332126
FR
4573 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4574
81ed78b2
DM
4575 /* *** Note that at this point we don't restore
4576 * PL_comppad, (or pop the CxSUB) on the assumption it may
4577 * be used again soon. This is safe as long as nothing
4578 * in the regexp code uses the pad ! */
4aabdb9b 4579 PL_op = oop;
4aabdb9b 4580 PL_curcop = ocurcop;
d80618d2 4581 PL_regeol = saved_regeol;
e4bfbed3
DM
4582 S_regcp_restore(aTHX_ rex, runops_cp);
4583
4584 if (logical != 2)
4aabdb9b 4585 break;
8e5e9ebe 4586 }
e4bfbed3
DM
4587
4588 /* only /(??{})/ from now on */
24d3c4a9 4589 logical = 0;
4aabdb9b 4590 {
4f639d21
DM
4591 /* extract RE object from returned value; compiling if
4592 * necessary */
5c35adbb 4593
575c37f6
DM
4594 if (re_sv) {
4595 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4596 }
0f5d15d6 4597 else {
c737faaf 4598 U32 pm_flags = 0;
a3b680e6 4599 const I32 osize = PL_regsize;
0f5d15d6 4600
9753d940
DM
4601 if (SvUTF8(ret) && IN_BYTES) {
4602 /* In use 'bytes': make a copy of the octet
4603 * sequence, but without the flag on */
b9ad30b4
NC
4604 STRLEN len;
4605 const char *const p = SvPV(ret, len);
4606 ret = newSVpvn_flags(p, len, SVs_TEMP);
4607 }
732caac7
DM
4608 if (rex->intflags & PREGf_USE_RE_EVAL)
4609 pm_flags |= PMf_USE_RE_EVAL;
4610
4611 /* if we got here, it should be an engine which
4612 * supports compiling code blocks and stuff */
4613 assert(rex->engine && rex->engine->op_comp);
ec841a27 4614 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4615 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4616 rex->engine, NULL, NULL,
4617 /* copy /msix etc to inner pattern */
4618 scan->flags,
4619 pm_flags);
732caac7 4620
9041c2e3 4621 if (!(SvFLAGS(ret)
faf82a0b 4622 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4623 | SVs_GMG))) {
a2794585
NC
4624 /* This isn't a first class regexp. Instead, it's
4625 caching a regexp onto an existing, Perl visible
4626 scalar. */
575c37f6 4627 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4628 }
0f5d15d6 4629 PL_regsize = osize;
74088413
DM
4630 /* safe to do now that any $1 etc has been
4631 * interpolated into the new pattern string and
4632 * compiled */
4633 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4634 }
575c37f6 4635 re = (struct regexp *)SvANY(re_sv);
4aabdb9b 4636 }
07bc277f 4637 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4638 re->subbeg = rex->subbeg;
4639 re->sublen = rex->sublen;
6502e081
DM
4640 re->suboffset = rex->suboffset;
4641 re->subcoffset = rex->subcoffset;
f8fc2ecf 4642 rei = RXi_GET(re);
6bda09f9 4643 DEBUG_EXECUTE_r(
f2ed9b32 4644 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4645 "Matching embedded");
4646 );
f8fc2ecf 4647 startpoint = rei->program + 1;
1a147d38 4648 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4649
1a147d38 4650 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4651 /* run the pattern returned from (??{...}) */
b93070ed 4652 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4653 REGCP_SET(ST.lastcp);
6bda09f9 4654
0357f1fd
ML
4655 re->lastparen = 0;
4656 re->lastcloseparen = 0;
4657
ae0beba1 4658 PL_regsize = 0;
4aabdb9b
DM
4659
4660 /* XXXX This is too dramatic a measure... */
4661 PL_reg_maxiter = 0;
4662
faec1544 4663 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4664 if (RX_UTF8(re_sv))
faec1544
DM
4665 PL_reg_flags |= RF_utf8;
4666 else
4667 PL_reg_flags &= ~RF_utf8;
4668 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4669
288b8c02 4670 ST.prev_rex = rex_sv;
faec1544 4671 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
4672 rex_sv = re_sv;
4673 SET_reg_curpm(rex_sv);
288b8c02 4674 rex = re;
f8fc2ecf 4675 rexi = rei;
faec1544 4676 cur_curlyx = NULL;
40a82448 4677 ST.B = next;
faec1544
DM
4678 ST.prev_eval = cur_eval;
4679 cur_eval = st;
faec1544 4680 /* now continue from first node in postoned RE */
4d5016e5 4681 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 4682 assert(0); /* NOTREACHED */
c277df42 4683 }
40a82448 4684
faec1544
DM
4685 case EVAL_AB: /* cleanup after a successful (??{A})B */
4686 /* note: this is called twice; first after popping B, then A */
4687 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4688 rex_sv = ST.prev_rex;
4689 SET_reg_curpm(rex_sv);
288b8c02 4690 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4691 rexi = RXi_GET(rex);
faec1544
DM
4692 regcpblow(ST.cp);
4693 cur_eval = ST.prev_eval;
4694 cur_curlyx = ST.prev_curlyx;
34a81e2b 4695
40a82448
DM
4696 /* XXXX This is too dramatic a measure... */
4697 PL_reg_maxiter = 0;
e7707071 4698 if ( nochange_depth )
4b196cd4 4699 nochange_depth--;
262b90c4 4700 sayYES;
40a82448 4701
40a82448 4702
faec1544
DM
4703 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4704 /* note: this is called twice; first after popping B, then A */
4705 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4706 rex_sv = ST.prev_rex;
4707 SET_reg_curpm(rex_sv);
288b8c02 4708 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4709 rexi = RXi_GET(rex);
0357f1fd 4710
40a82448
DM
4711 REGCP_UNWIND(ST.lastcp);
4712 regcppop(rex);
faec1544
DM
4713 cur_eval = ST.prev_eval;
4714 cur_curlyx = ST.prev_curlyx;
4715 /* XXXX This is too dramatic a measure... */
4716 PL_reg_maxiter = 0;
e7707071 4717 if ( nochange_depth )
4b196cd4 4718 nochange_depth--;
40a82448 4719 sayNO_SILENT;
40a82448
DM
4720#undef ST
4721
a0d0e21e 4722 case OPEN:
c277df42 4723 n = ARG(scan); /* which paren pair */
1ca2007e 4724 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
4725 if (n > PL_regsize)
4726 PL_regsize = n;
495f47a5
DM
4727 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4728 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4729 PTR2UV(rex),
4730 PTR2UV(rex->offs),
4731 (UV)n,
4732 (IV)rex->offs[n].start_tmp,
4733 (UV)PL_regsize
4734 ));
e2e6a0f1 4735 lastopen = n;
a0d0e21e 4736 break;
495f47a5
DM
4737
4738/* XXX really need to log other places start/end are set too */
4739#define CLOSE_CAPTURE \
4740 rex->offs[n].start = rex->offs[n].start_tmp; \
4741 rex->offs[n].end = locinput - PL_bostr; \
4742 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4743 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4744 PTR2UV(rex), \
4745 PTR2UV(rex->offs), \
4746 (UV)n, \
4747 (IV)rex->offs[n].start, \
4748 (IV)rex->offs[n].end \
4749 ))
4750
a0d0e21e 4751 case CLOSE:
c277df42 4752 n = ARG(scan); /* which paren pair */
495f47a5 4753 CLOSE_CAPTURE;
7f69552c
YO
4754 /*if (n > PL_regsize)
4755 PL_regsize = n;*/
b93070ed
DM
4756 if (n > rex->lastparen)
4757 rex->lastparen = n;
4758 rex->lastcloseparen = n;
3b6647e0 4759 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
4760 goto fake_end;
4761 }
a0d0e21e 4762 break;
e2e6a0f1
YO
4763 case ACCEPT:
4764 if (ARG(scan)){
4765 regnode *cursor;
4766 for (cursor=scan;
4767 cursor && OP(cursor)!=END;
4768 cursor=regnext(cursor))
4769 {
4770 if ( OP(cursor)==CLOSE ){
4771 n = ARG(cursor);
4772 if ( n <= lastopen ) {
495f47a5 4773 CLOSE_CAPTURE;
e2e6a0f1
YO
4774 /*if (n > PL_regsize)
4775 PL_regsize = n;*/
b93070ed
DM
4776 if (n > rex->lastparen)
4777 rex->lastparen = n;
4778 rex->lastcloseparen = n;
3b6647e0
RB
4779 if ( n == ARG(scan) || (cur_eval &&
4780 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
4781 break;
4782 }
4783 }
4784 }
4785 }
4786 goto fake_end;
4787 /*NOTREACHED*/
c277df42
IZ
4788 case GROUPP:
4789 n = ARG(scan); /* which paren pair */
b93070ed 4790 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 4791 break;
0a4db386
YO
4792 case NGROUPP:
4793 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 4794 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 4795 break;
1a147d38 4796 case INSUBP:
0a4db386 4797 n = ARG(scan);
3b6647e0 4798 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386
YO
4799 break;
4800 case DEFINEP:
4801 sw = 0;
4802 break;
c277df42 4803 case IFTHEN:
2c2d71f5 4804 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 4805 if (sw)
c277df42
IZ
4806 next = NEXTOPER(NEXTOPER(scan));
4807 else {
4808 next = scan + ARG(scan);
4809 if (OP(next) == IFTHEN) /* Fake one. */
4810 next = NEXTOPER(NEXTOPER(next));
4811 }
4812 break;
4813 case LOGICAL:
24d3c4a9 4814 logical = scan->flags;
c277df42 4815 break;
c476f425 4816
2ab05381 4817/*******************************************************************
2ab05381 4818
c476f425
DM
4819The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4820pattern, where A and B are subpatterns. (For simple A, CURLYM or
4821STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 4822
c476f425 4823A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 4824
c476f425
DM
4825On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4826state, which contains the current count, initialised to -1. It also sets
4827cur_curlyx to point to this state, with any previous value saved in the
4828state block.
2ab05381 4829
c476f425
DM
4830CURLYX then jumps straight to the WHILEM op, rather than executing A,
4831since the pattern may possibly match zero times (i.e. it's a while {} loop
4832rather than a do {} while loop).
2ab05381 4833
c476f425
DM
4834Each entry to WHILEM represents a successful match of A. The count in the
4835CURLYX block is incremented, another WHILEM state is pushed, and execution
4836passes to A or B depending on greediness and the current count.
2ab05381 4837
c476f425
DM
4838For example, if matching against the string a1a2a3b (where the aN are
4839substrings that match /A/), then the match progresses as follows: (the
4840pushed states are interspersed with the bits of strings matched so far):
2ab05381 4841
c476f425
DM
4842 <CURLYX cnt=-1>
4843 <CURLYX cnt=0><WHILEM>
4844 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4845 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4846 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4847 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 4848
c476f425
DM
4849(Contrast this with something like CURLYM, which maintains only a single
4850backtrack state:
2ab05381 4851
c476f425
DM
4852 <CURLYM cnt=0> a1
4853 a1 <CURLYM cnt=1> a2
4854 a1 a2 <CURLYM cnt=2> a3
4855 a1 a2 a3 <CURLYM cnt=3> b
4856)
2ab05381 4857
c476f425
DM
4858Each WHILEM state block marks a point to backtrack to upon partial failure
4859of A or B, and also contains some minor state data related to that
4860iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4861overall state, such as the count, and pointers to the A and B ops.
2ab05381 4862
c476f425
DM
4863This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4864must always point to the *current* CURLYX block, the rules are:
2ab05381 4865
c476f425
DM
4866When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4867and set cur_curlyx to point the new block.
2ab05381 4868
c476f425
DM
4869When popping the CURLYX block after a successful or unsuccessful match,
4870restore the previous cur_curlyx.
2ab05381 4871
c476f425
DM
4872When WHILEM is about to execute B, save the current cur_curlyx, and set it
4873to the outer one saved in the CURLYX block.
2ab05381 4874
c476f425
DM
4875When popping the WHILEM block after a successful or unsuccessful B match,
4876restore the previous cur_curlyx.
2ab05381 4877
c476f425
DM
4878Here's an example for the pattern (AI* BI)*BO
4879I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 4880
c476f425
DM
4881cur_
4882curlyx backtrack stack
4883------ ---------------
4884NULL
4885CO <CO prev=NULL> <WO>
4886CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4887CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4888NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 4889
c476f425
DM
4890At this point the pattern succeeds, and we work back down the stack to
4891clean up, restoring as we go:
95b24440 4892
c476f425
DM
4893CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4894CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4895CO <CO prev=NULL> <WO>
4896NULL
a0374537 4897
c476f425
DM
4898*******************************************************************/
4899
4900#define ST st->u.curlyx
4901
4902 case CURLYX: /* start of /A*B/ (for complex A) */
4903 {
4904 /* No need to save/restore up to this paren */
4905 I32 parenfloor = scan->flags;
4906
4907 assert(next); /* keep Coverity happy */
4908 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4909 next += ARG(next);
4910
4911 /* XXXX Probably it is better to teach regpush to support
4912 parenfloor > PL_regsize... */
b93070ed
DM
4913 if (parenfloor > (I32)rex->lastparen)
4914 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
4915
4916 ST.prev_curlyx= cur_curlyx;
4917 cur_curlyx = st;
4918 ST.cp = PL_savestack_ix;
4919
4920 /* these fields contain the state of the current curly.
4921 * they are accessed by subsequent WHILEMs */
4922 ST.parenfloor = parenfloor;
d02d6d97 4923 ST.me = scan;
c476f425 4924 ST.B = next;
24d3c4a9
DM
4925 ST.minmod = minmod;
4926 minmod = 0;
c476f425
DM
4927 ST.count = -1; /* this will be updated by WHILEM */
4928 ST.lastloc = NULL; /* this will be updated by WHILEM */
4929
4d5016e5 4930 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 4931 assert(0); /* NOTREACHED */
c476f425 4932 }
a0d0e21e 4933
c476f425 4934 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
4935 cur_curlyx = ST.prev_curlyx;
4936 sayYES;
118e2215 4937 assert(0); /* NOTREACHED */
a0d0e21e 4938
c476f425
DM
4939 case CURLYX_end_fail: /* just failed to match all of A*B */
4940 regcpblow(ST.cp);
4941 cur_curlyx = ST.prev_curlyx;
4942 sayNO;
118e2215 4943 assert(0); /* NOTREACHED */
4633a7c4 4944
a0d0e21e 4945
c476f425
DM
4946#undef ST
4947#define ST st->u.whilem
4948
4949 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4950 {
4951 /* see the discussion above about CURLYX/WHILEM */
c476f425 4952 I32 n;
d02d6d97
DM
4953 int min = ARG1(cur_curlyx->u.curlyx.me);
4954 int max = ARG2(cur_curlyx->u.curlyx.me);
4955 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4956
c476f425
DM
4957 assert(cur_curlyx); /* keep Coverity happy */
4958 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4959 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4960 ST.cache_offset = 0;
4961 ST.cache_mask = 0;
4962
c476f425
DM
4963
4964 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
4965 "%*s whilem: matched %ld out of %d..%d\n",
4966 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 4967 );
a0d0e21e 4968
c476f425 4969 /* First just match a string of min A's. */
a0d0e21e 4970
d02d6d97 4971 if (n < min) {
b93070ed 4972 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 4973 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
4974 REGCP_SET(ST.lastcp);
4975
4d5016e5 4976 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 4977 assert(0); /* NOTREACHED */
c476f425
DM
4978 }
4979
4980 /* If degenerate A matches "", assume A done. */
4981
4982 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4983 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4984 "%*s whilem: empty match detected, trying continuation...\n",
4985 REPORT_CODE_OFF+depth*2, "")
4986 );
4987 goto do_whilem_B_max;
4988 }
4989
4990 /* super-linear cache processing */
4991
4992 if (scan->flags) {
a0d0e21e 4993
2c2d71f5 4994 if (!PL_reg_maxiter) {
c476f425
DM
4995 /* start the countdown: Postpone detection until we
4996 * know the match is not *that* much linear. */
2c2d71f5 4997 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
4998 /* possible overflow for long strings and many CURLYX's */
4999 if (PL_reg_maxiter < 0)
5000 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5001 PL_reg_leftiter = PL_reg_maxiter;
5002 }
c476f425 5003
2c2d71f5 5004 if (PL_reg_leftiter-- == 0) {
c476f425 5005 /* initialise cache */
3298f257 5006 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5007 if (PL_reg_poscache) {
eb160463 5008 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5009 Renew(PL_reg_poscache, size, char);
5010 PL_reg_poscache_size = size;
5011 }
5012 Zero(PL_reg_poscache, size, char);
5013 }
5014 else {
5015 PL_reg_poscache_size = size;
a02a5408 5016 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5017 }
c476f425
DM
5018 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5019 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5020 PL_colors[4], PL_colors[5])
5021 );
2c2d71f5 5022 }
c476f425 5023
2c2d71f5 5024 if (PL_reg_leftiter < 0) {
c476f425
DM
5025 /* have we already failed at this position? */
5026 I32 offset, mask;
5027 offset = (scan->flags & 0xf) - 1
5028 + (locinput - PL_bostr) * (scan->flags>>4);
5029 mask = 1 << (offset % 8);
5030 offset /= 8;
5031 if (PL_reg_poscache[offset] & mask) {
5032 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5033 "%*s whilem: (cache) already tried at this position...\n",
5034 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5035 );
3298f257 5036 sayNO; /* cache records failure */
2c2d71f5 5037 }
c476f425
DM
5038 ST.cache_offset = offset;
5039 ST.cache_mask = mask;
2c2d71f5 5040 }
c476f425 5041 }
2c2d71f5 5042
c476f425 5043 /* Prefer B over A for minimal matching. */
a687059c 5044
c476f425
DM
5045 if (cur_curlyx->u.curlyx.minmod) {
5046 ST.save_curlyx = cur_curlyx;
5047 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 5048 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425 5049 REGCP_SET(ST.lastcp);
4d5016e5
DM
5050 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5051 locinput);
118e2215 5052 assert(0); /* NOTREACHED */
c476f425 5053 }
a0d0e21e 5054
c476f425
DM
5055 /* Prefer A over B for maximal matching. */
5056
d02d6d97 5057 if (n < max) { /* More greed allowed? */
b93070ed 5058 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
5059 cur_curlyx->u.curlyx.lastloc = locinput;
5060 REGCP_SET(ST.lastcp);
4d5016e5 5061 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5062 assert(0); /* NOTREACHED */
c476f425
DM
5063 }
5064 goto do_whilem_B_max;
5065 }
118e2215 5066 assert(0); /* NOTREACHED */
c476f425
DM
5067
5068 case WHILEM_B_min: /* just matched B in a minimal match */
5069 case WHILEM_B_max: /* just matched B in a maximal match */
5070 cur_curlyx = ST.save_curlyx;
5071 sayYES;
118e2215 5072 assert(0); /* NOTREACHED */
c476f425
DM
5073
5074 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5075 cur_curlyx = ST.save_curlyx;
5076 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5077 cur_curlyx->u.curlyx.count--;
5078 CACHEsayNO;
118e2215 5079 assert(0); /* NOTREACHED */
c476f425
DM
5080
5081 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5082 /* FALL THROUGH */
5083 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
5084 REGCP_UNWIND(ST.lastcp);
5085 regcppop(rex);
c476f425
DM
5086 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5087 cur_curlyx->u.curlyx.count--;
5088 CACHEsayNO;
118e2215 5089 assert(0); /* NOTREACHED */
c476f425
DM
5090
5091 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5092 REGCP_UNWIND(ST.lastcp);
5093 regcppop(rex); /* Restore some previous $<digit>s? */
c476f425
DM
5094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5095 "%*s whilem: failed, trying continuation...\n",
5096 REPORT_CODE_OFF+depth*2, "")
5097 );
5098 do_whilem_B_max:
5099 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5100 && ckWARN(WARN_REGEXP)
5101 && !(PL_reg_flags & RF_warned))
5102 {
5103 PL_reg_flags |= RF_warned;
dcbac5bb
FC
5104 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5105 "Complex regular subexpression recursion limit (%d) "
5106 "exceeded",
c476f425
DM
5107 REG_INFTY - 1);
5108 }
5109
5110 /* now try B */
5111 ST.save_curlyx = cur_curlyx;
5112 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5113 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5114 locinput);
118e2215 5115 assert(0); /* NOTREACHED */
c476f425
DM
5116
5117 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5118 cur_curlyx = ST.save_curlyx;
5119 REGCP_UNWIND(ST.lastcp);
5120 regcppop(rex);
5121
d02d6d97 5122 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5123 /* Maximum greed exceeded */
5124 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5125 && ckWARN(WARN_REGEXP)
5126 && !(PL_reg_flags & RF_warned))
5127 {
3280af22 5128 PL_reg_flags |= RF_warned;
c476f425 5129 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5130 "Complex regular subexpression recursion "
5131 "limit (%d) exceeded",
c476f425 5132 REG_INFTY - 1);
a0d0e21e 5133 }
c476f425 5134 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5135 CACHEsayNO;
a0d0e21e 5136 }
c476f425
DM
5137
5138 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5139 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5140 );
5141 /* Try grabbing another A and see if it helps. */
c476f425 5142 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5143 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5144 REGCP_SET(ST.lastcp);
d02d6d97 5145 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5146 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5147 locinput);
118e2215 5148 assert(0); /* NOTREACHED */
40a82448
DM
5149
5150#undef ST
5151#define ST st->u.branch
5152
5153 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5154 next = scan + ARG(scan);
5155 if (next == scan)
5156 next = NULL;
40a82448
DM
5157 scan = NEXTOPER(scan);
5158 /* FALL THROUGH */
c277df42 5159
40a82448
DM
5160 case BRANCH: /* /(...|A|...)/ */
5161 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5162 ST.lastparen = rex->lastparen;
f6033a9d 5163 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5164 ST.next_branch = next;
5165 REGCP_SET(ST.cp);
02db2b7b 5166
40a82448 5167 /* Now go into the branch */
5d458dd8 5168 if (has_cutgroup) {
4d5016e5 5169 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5170 } else {
4d5016e5 5171 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5172 }
118e2215 5173 assert(0); /* NOTREACHED */
5d458dd8 5174 case CUTGROUP:
5d458dd8 5175 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5176 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5177 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5178 assert(0); /* NOTREACHED */
5d458dd8
YO
5179 case CUTGROUP_next_fail:
5180 do_cutgroup = 1;
5181 no_final = 1;
5182 if (st->u.mark.mark_name)
5183 sv_commit = st->u.mark.mark_name;
5184 sayNO;
118e2215 5185 assert(0); /* NOTREACHED */
5d458dd8
YO
5186 case BRANCH_next:
5187 sayYES;
118e2215 5188 assert(0); /* NOTREACHED */
40a82448 5189 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5190 if (do_cutgroup) {
5191 do_cutgroup = 0;
5192 no_final = 0;
5193 }
40a82448 5194 REGCP_UNWIND(ST.cp);
a8d1f4b4 5195 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5196 scan = ST.next_branch;
5197 /* no more branches? */
5d458dd8
YO
5198 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5199 DEBUG_EXECUTE_r({
5200 PerlIO_printf( Perl_debug_log,
5201 "%*s %sBRANCH failed...%s\n",
5202 REPORT_CODE_OFF+depth*2, "",
5203 PL_colors[4],
5204 PL_colors[5] );
5205 });
5206 sayNO_SILENT;
5207 }
40a82448 5208 continue; /* execute next BRANCH[J] op */
118e2215 5209 assert(0); /* NOTREACHED */
40a82448 5210
a0d0e21e 5211 case MINMOD:
24d3c4a9 5212 minmod = 1;
a0d0e21e 5213 break;
40a82448
DM
5214
5215#undef ST
5216#define ST st->u.curlym
5217
5218 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5219
5220 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5221 * only a single backtracking state, no matter how many matches
40a82448
DM
5222 * there are in {m,n}. It relies on the pattern being constant
5223 * length, with no parens to influence future backrefs
5224 */
5225
5226 ST.me = scan;
dc45a647 5227 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5228
f6033a9d
DM
5229 ST.lastparen = rex->lastparen;
5230 ST.lastcloseparen = rex->lastcloseparen;
5231
40a82448
DM
5232 /* if paren positive, emulate an OPEN/CLOSE around A */
5233 if (ST.me->flags) {
3b6647e0 5234 U32 paren = ST.me->flags;
40a82448
DM
5235 if (paren > PL_regsize)
5236 PL_regsize = paren;
c277df42 5237 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5238 }
40a82448
DM
5239 ST.A = scan;
5240 ST.B = next;
5241 ST.alen = 0;
5242 ST.count = 0;
24d3c4a9
DM
5243 ST.minmod = minmod;
5244 minmod = 0;
40a82448
DM
5245 ST.c1 = CHRTEST_UNINIT;
5246 REGCP_SET(ST.cp);
6407bf3b 5247
40a82448
DM
5248 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5249 goto curlym_do_B;
5250
5251 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5252 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5253 assert(0); /* NOTREACHED */
5f80c4cf 5254
40a82448 5255 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5256 ST.count++;
5257 /* after first match, determine A's length: u.curlym.alen */
5258 if (ST.count == 1) {
5259 if (PL_reg_match_utf8) {
c07e9d7b
DM
5260 char *s = st->locinput;
5261 while (s < locinput) {
40a82448
DM
5262 ST.alen++;
5263 s += UTF8SKIP(s);
5264 }
5265 }
5266 else {
c07e9d7b 5267 ST.alen = locinput - st->locinput;
40a82448
DM
5268 }
5269 if (ST.alen == 0)
5270 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5271 }
0cadcf80
DM
5272 DEBUG_EXECUTE_r(
5273 PerlIO_printf(Perl_debug_log,
40a82448 5274 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5275 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5276 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5277 );
5278
0a4db386 5279 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5280 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5281 goto fake_end;
5282
c966426a
DM
5283 {
5284 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5285 if ( max == REG_INFTY || ST.count < max )
5286 goto curlym_do_A; /* try to match another A */
5287 }
40a82448 5288 goto curlym_do_B; /* try to match B */
5f80c4cf 5289
40a82448
DM
5290 case CURLYM_A_fail: /* just failed to match an A */
5291 REGCP_UNWIND(ST.cp);
0a4db386
YO
5292
5293 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5294 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5295 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5296 sayNO;
0cadcf80 5297
40a82448 5298 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5299 if (ST.c1 == CHRTEST_UNINIT) {
5300 /* calculate c1 and c2 for possible match of 1st char
5301 * following curly */
5302 ST.c1 = ST.c2 = CHRTEST_VOID;
5303 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5304 regnode *text_node = ST.B;
5305 if (! HAS_TEXT(text_node))
5306 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5307 /* this used to be
5308
5309 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5310
5311 But the former is redundant in light of the latter.
5312
5313 if this changes back then the macro for
5314 IS_TEXT and friends need to change.
5315 */
5316 if (PL_regkind[OP(text_node)] == EXACT)
40a82448 5317 {
ee9b8eae 5318
40a82448 5319 ST.c1 = (U8)*STRING(text_node);
9a5a5549
KW
5320 switch (OP(text_node)) {
5321 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5322 case EXACTFA:
3c760661 5323 case EXACTFU_SS:
fab2782b 5324 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5325 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5326 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5327 default: ST.c2 = ST.c1;
5328 }
c277df42 5329 }
c277df42 5330 }
40a82448
DM
5331 }
5332
5333 DEBUG_EXECUTE_r(
5334 PerlIO_printf(Perl_debug_log,
5335 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5336 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5337 "", (IV)ST.count)
5338 );
5339 if (ST.c1 != CHRTEST_VOID
c07e9d7b
DM
5340 && nextchr != ST.c1
5341 && nextchr != ST.c2)
40a82448
DM
5342 {
5343 /* simulate B failing */
3e901dc0
YO
5344 DEBUG_OPTIMISE_r(
5345 PerlIO_printf(Perl_debug_log,
5346 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5347 (int)(REPORT_CODE_OFF+(depth*2)),"",
5348 (IV)ST.c1,(IV)ST.c2
5349 ));
40a82448
DM
5350 state_num = CURLYM_B_fail;
5351 goto reenter_switch;
5352 }
5353
5354 if (ST.me->flags) {
f6033a9d 5355 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5356 I32 paren = ST.me->flags;
5357 if (ST.count) {
b93070ed 5358 rex->offs[paren].start
c07e9d7b
DM
5359 = HOPc(locinput, -ST.alen) - PL_bostr;
5360 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5361 if ((U32)paren > rex->lastparen)
5362 rex->lastparen = paren;
5363 rex->lastcloseparen = paren;
c277df42 5364 }
40a82448 5365 else
b93070ed 5366 rex->offs[paren].end = -1;
0a4db386 5367 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5368 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5369 {
5370 if (ST.count)
5371 goto fake_end;
5372 else
5373 sayNO;
5374 }
c277df42 5375 }
0a4db386 5376
4d5016e5 5377 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5378 assert(0); /* NOTREACHED */
40a82448
DM
5379
5380 case CURLYM_B_fail: /* just failed to match a B */
5381 REGCP_UNWIND(ST.cp);
a8d1f4b4 5382 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5383 if (ST.minmod) {
84d2fa14
HS
5384 I32 max = ARG2(ST.me);
5385 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5386 sayNO;
5387 goto curlym_do_A; /* try to match a further A */
5388 }
5389 /* backtrack one A */
5390 if (ST.count == ARG1(ST.me) /* min */)
5391 sayNO;
5392 ST.count--;
5393 locinput = HOPc(locinput, -ST.alen);
c07e9d7b 5394 nextchr = UCHARAT(locinput);
40a82448
DM
5395 goto curlym_do_B; /* try to match B */
5396
c255a977
DM
5397#undef ST
5398#define ST st->u.curly
40a82448 5399
c255a977
DM
5400#define CURLY_SETPAREN(paren, success) \
5401 if (paren) { \
5402 if (success) { \
b93070ed
DM
5403 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5404 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5405 if (paren > rex->lastparen) \
5406 rex->lastparen = paren; \
b93070ed 5407 rex->lastcloseparen = paren; \
c255a977 5408 } \
f6033a9d 5409 else { \
b93070ed 5410 rex->offs[paren].end = -1; \
f6033a9d
DM
5411 rex->lastparen = ST.lastparen; \
5412 rex->lastcloseparen = ST.lastcloseparen; \
5413 } \
c255a977
DM
5414 }
5415
5416 case STAR: /* /A*B/ where A is width 1 */
5417 ST.paren = 0;
5418 ST.min = 0;
5419 ST.max = REG_INFTY;
a0d0e21e
LW
5420 scan = NEXTOPER(scan);
5421 goto repeat;
c255a977
DM
5422 case PLUS: /* /A+B/ where A is width 1 */
5423 ST.paren = 0;
5424 ST.min = 1;
5425 ST.max = REG_INFTY;
c277df42 5426 scan = NEXTOPER(scan);
c255a977
DM
5427 goto repeat;
5428 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5429 ST.paren = scan->flags; /* Which paren to set */
f6033a9d
DM
5430 ST.lastparen = rex->lastparen;
5431 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5432 if (ST.paren > PL_regsize)
5433 PL_regsize = ST.paren;
c255a977
DM
5434 ST.min = ARG1(scan); /* min to match */
5435 ST.max = ARG2(scan); /* max to match */
0a4db386 5436 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5437 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5438 ST.min=1;
5439 ST.max=1;
5440 }
c255a977
DM
5441 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5442 goto repeat;
5443 case CURLY: /* /A{m,n}B/ where A is width 1 */
5444 ST.paren = 0;
5445 ST.min = ARG1(scan); /* min to match */
5446 ST.max = ARG2(scan); /* max to match */
5447 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5448 repeat:
a0d0e21e
LW
5449 /*
5450 * Lookahead to avoid useless match attempts
5451 * when we know what character comes next.
c255a977 5452 *
5f80c4cf
JP
5453 * Used to only do .*x and .*?x, but now it allows
5454 * for )'s, ('s and (?{ ... })'s to be in the way
5455 * of the quantifier and the EXACT-like node. -- japhy
5456 */
5457
c255a977
DM
5458 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5459 sayNO;
cca55fe3 5460 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
5461 U8 *s;
5462 regnode *text_node = next;
5463
3dab1dad
YO
5464 if (! HAS_TEXT(text_node))
5465 FIND_NEXT_IMPT(text_node);
5f80c4cf 5466
9e137952 5467 if (! HAS_TEXT(text_node))
c255a977 5468 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5469 else {
ee9b8eae 5470 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5471 ST.c1 = ST.c2 = CHRTEST_VOID;
44a68960 5472 goto assume_ok_easy;
cca55fe3 5473 }
be8e71aa
YO
5474 else
5475 s = (U8*)STRING(text_node);
ee9b8eae
YO
5476
5477 /* Currently we only get here when
5478
5479 PL_rekind[OP(text_node)] == EXACT
5480
5481 if this changes back then the macro for IS_TEXT and
5482 friends need to change. */
f2ed9b32 5483 if (!UTF_PATTERN) {
9a5a5549
KW
5484 ST.c1 = *s;
5485 switch (OP(text_node)) {
5486 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
2f7f8cb1 5487 case EXACTFA:
3c760661 5488 case EXACTFU_SS:
fab2782b 5489 case EXACTFU_TRICKYFOLD:
9a5a5549
KW
5490 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5491 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5492 default: ST.c2 = ST.c1; break;
5493 }
1aa99e6b 5494 }
f2ed9b32 5495 else { /* UTF_PATTERN */
9a5a5549 5496 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
b0573d8b
KW
5497 STRLEN ulen;
5498 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5499
5500 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5501 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
e294cc5d 5502 uniflags);
5f80c4cf
JP
5503 }
5504 else {
c255a977 5505 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
041457d9 5506 uniflags);
5f80c4cf 5507 }
1aa99e6b
IH
5508 }
5509 }
bbce6d69 5510 }
a0d0e21e 5511 else
c255a977 5512 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5513 assume_ok_easy:
c255a977
DM
5514
5515 ST.A = scan;
5516 ST.B = next;
24d3c4a9 5517 if (minmod) {
eb72505d 5518 char *li = locinput;
24d3c4a9 5519 minmod = 0;
eb72505d 5520 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
4633a7c4 5521 sayNO;
eb72505d
DM
5522 locinput = li;
5523 nextchr = UCHARAT(locinput);
c255a977 5524 ST.count = ST.min;
c255a977
DM
5525 REGCP_SET(ST.cp);
5526 if (ST.c1 == CHRTEST_VOID)
5527 goto curly_try_B_min;
5528
5529 ST.oldloc = locinput;
5530
5531 /* set ST.maxpos to the furthest point along the
5532 * string that could possibly match */
5533 if (ST.max == REG_INFTY) {
5534 ST.maxpos = PL_regeol - 1;
f2ed9b32 5535 if (utf8_target)
c255a977
DM
5536 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5537 ST.maxpos--;
5538 }
f2ed9b32 5539 else if (utf8_target) {
c255a977
DM
5540 int m = ST.max - ST.min;
5541 for (ST.maxpos = locinput;
5542 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5543 ST.maxpos += UTF8SKIP(ST.maxpos);
5544 }
5545 else {
5546 ST.maxpos = locinput + ST.max - ST.min;
5547 if (ST.maxpos >= PL_regeol)
5548 ST.maxpos = PL_regeol - 1;
5549 }
5550 goto curly_try_B_min_known;
5551
5552 }
5553 else {
eb72505d
DM
5554 /* avoid taking address of locinput, so it can remain
5555 * a register var */
5556 char *li = locinput;
5557 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
c255a977
DM
5558 if (ST.count < ST.min)
5559 sayNO;
eb72505d
DM
5560 locinput = li;
5561 nextchr = UCHARAT(locinput);
c255a977
DM
5562 if ((ST.count > ST.min)
5563 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5564 {
5565 /* A{m,n} must come at the end of the string, there's
5566 * no point in backing off ... */
5567 ST.min = ST.count;
5568 /* ...except that $ and \Z can match before *and* after
5569 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5570 We may back off by one in this case. */
eb72505d 5571 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
5572 ST.min--;
5573 }
5574 REGCP_SET(ST.cp);
5575 goto curly_try_B_max;
5576 }
118e2215 5577 assert(0); /* NOTREACHED */
c255a977
DM
5578
5579
5580 case CURLY_B_min_known_fail:
5581 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 5582
c255a977 5583 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5584 if (ST.paren) {
5585 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5586 }
c255a977
DM
5587 /* Couldn't or didn't -- move forward. */
5588 ST.oldloc = locinput;
f2ed9b32 5589 if (utf8_target)
c255a977
DM
5590 locinput += UTF8SKIP(locinput);
5591 else
5592 locinput++;
5593 ST.count++;
5594 curly_try_B_min_known:
5595 /* find the next place where 'B' could work, then call B */
5596 {
5597 int n;
f2ed9b32 5598 if (utf8_target) {
c255a977
DM
5599 n = (ST.oldloc == locinput) ? 0 : 1;
5600 if (ST.c1 == ST.c2) {
5601 STRLEN len;
5602 /* set n to utf8_distance(oldloc, locinput) */
5603 while (locinput <= ST.maxpos &&
5604 utf8n_to_uvchr((U8*)locinput,
5605 UTF8_MAXBYTES, &len,
5606 uniflags) != (UV)ST.c1) {
5607 locinput += len;
5608 n++;
5609 }
1aa99e6b
IH
5610 }
5611 else {
c255a977
DM
5612 /* set n to utf8_distance(oldloc, locinput) */
5613 while (locinput <= ST.maxpos) {
5614 STRLEN len;
5615 const UV c = utf8n_to_uvchr((U8*)locinput,
5616 UTF8_MAXBYTES, &len,
5617 uniflags);
5618 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5619 break;
5620 locinput += len;
5621 n++;
1aa99e6b 5622 }
0fe9bf95
IZ
5623 }
5624 }
c255a977
DM
5625 else {
5626 if (ST.c1 == ST.c2) {
5627 while (locinput <= ST.maxpos &&
5628 UCHARAT(locinput) != ST.c1)
5629 locinput++;
bbce6d69 5630 }
c255a977
DM
5631 else {
5632 while (locinput <= ST.maxpos
5633 && UCHARAT(locinput) != ST.c1
5634 && UCHARAT(locinput) != ST.c2)
5635 locinput++;
a0ed51b3 5636 }
c255a977
DM
5637 n = locinput - ST.oldloc;
5638 }
5639 if (locinput > ST.maxpos)
5640 sayNO;
c255a977 5641 if (n) {
eb72505d
DM
5642 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5643 * at b; check that everything between oldloc and
5644 * locinput matches */
5645 char *li = ST.oldloc;
c255a977 5646 ST.count += n;
eb72505d 5647 if (regrepeat(rex, &li, ST.A, n, depth) < n)
4633a7c4 5648 sayNO;
eb72505d 5649 assert(n == REG_INFTY || locinput == li);
a0d0e21e 5650 }
c255a977 5651 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5652 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5653 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5654 goto fake_end;
5655 }
4d5016e5 5656 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 5657 }
118e2215 5658 assert(0); /* NOTREACHED */
c255a977
DM
5659
5660
5661 case CURLY_B_min_fail:
5662 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5663
5664 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5665 if (ST.paren) {
5666 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5667 }
c255a977 5668 /* failed -- move forward one */
f73aaa43 5669 {
eb72505d
DM
5670 char *li = locinput;
5671 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
f73aaa43
DM
5672 sayNO;
5673 }
eb72505d 5674 locinput = li;
f73aaa43
DM
5675 }
5676 {
c255a977 5677 ST.count++;
c255a977
DM
5678 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5679 ST.count > 0)) /* count overflow ? */
15272685 5680 {
c255a977
DM
5681 curly_try_B_min:
5682 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5683 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5684 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5685 goto fake_end;
5686 }
4d5016e5 5687 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
5688 }
5689 }
118e2215 5690 assert(0); /* NOTREACHED */
c255a977
DM
5691
5692
5693 curly_try_B_max:
5694 /* a successful greedy match: now try to match B */
40d049e4 5695 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5696 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5697 goto fake_end;
5698 }
c255a977
DM
5699 {
5700 UV c = 0;
5701 if (ST.c1 != CHRTEST_VOID)
eb72505d 5702 c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
c255a977 5703 UTF8_MAXBYTES, 0, uniflags)
eb72505d 5704 : (UV) UCHARAT(locinput);
c255a977
DM
5705 /* If it could work, try it. */
5706 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5707 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 5708 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 5709 assert(0); /* NOTREACHED */
c255a977
DM
5710 }
5711 }
5712 /* FALL THROUGH */
5713 case CURLY_B_max_fail:
5714 /* failed to find B in a greedy match */
c255a977
DM
5715
5716 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5717 if (ST.paren) {
5718 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5719 }
c255a977
DM
5720 /* back up. */
5721 if (--ST.count < ST.min)
5722 sayNO;
eb72505d 5723 locinput = HOPc(locinput, -1);
c255a977
DM
5724 goto curly_try_B_max;
5725
5726#undef ST
5727
a0d0e21e 5728 case END:
6bda09f9 5729 fake_end:
faec1544
DM
5730 if (cur_eval) {
5731 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
5732 st->u.eval.toggle_reg_flags
5733 = cur_eval->u.eval.toggle_reg_flags;
5734 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5735
288b8c02 5736 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 5737 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
5738 rex_sv = cur_eval->u.eval.prev_rex;
5739 SET_reg_curpm(rex_sv);
288b8c02 5740 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5741 rexi = RXi_GET(rex);
faec1544 5742 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 5743
faec1544 5744 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
5745
5746 /* Restore parens of the outer rex without popping the
5747 * savestack */
74088413 5748 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
5749
5750 st->u.eval.prev_eval = cur_eval;
5751 cur_eval = cur_eval->u.eval.prev_eval;
5752 DEBUG_EXECUTE_r(
2a49f0f5
JH
5753 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5754 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5755 if ( nochange_depth )
5756 nochange_depth--;
5757
4d5016e5
DM
5758 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5759 locinput); /* match B */
faec1544
DM
5760 }
5761
3b0527fe 5762 if (locinput < reginfo->till) {
a3621e74 5763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
5764 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5765 PL_colors[4],
5766 (long)(locinput - PL_reg_starttry),
3b0527fe 5767 (long)(reginfo->till - PL_reg_starttry),
7821416a 5768 PL_colors[5]));
58e23c8d 5769
262b90c4 5770 sayNO_SILENT; /* Cannot match: too short. */
7821416a 5771 }
262b90c4 5772 sayYES; /* Success! */
dad79028
DM
5773
5774 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5775 DEBUG_EXECUTE_r(
5776 PerlIO_printf(Perl_debug_log,
5777 "%*s %ssubpattern success...%s\n",
5bc10b2c 5778 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 5779 sayYES; /* Success! */
dad79028 5780
40a82448
DM
5781#undef ST
5782#define ST st->u.ifmatch
5783
37f53970
DM
5784 {
5785 char *newstart;
5786
40a82448
DM
5787 case SUSPEND: /* (?>A) */
5788 ST.wanted = 1;
37f53970 5789 newstart = locinput;
9041c2e3 5790 goto do_ifmatch;
dad79028 5791
40a82448
DM
5792 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5793 ST.wanted = 0;
dad79028
DM
5794 goto ifmatch_trivial_fail_test;
5795
40a82448
DM
5796 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5797 ST.wanted = 1;
dad79028 5798 ifmatch_trivial_fail_test:
a0ed51b3 5799 if (scan->flags) {
52657f30 5800 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
5801 if (!s) {
5802 /* trivial fail */
24d3c4a9
DM
5803 if (logical) {
5804 logical = 0;
f2338a2e 5805 sw = 1 - cBOOL(ST.wanted);
dad79028 5806 }
40a82448 5807 else if (ST.wanted)
dad79028
DM
5808 sayNO;
5809 next = scan + ARG(scan);
5810 if (next == scan)
5811 next = NULL;
5812 break;
5813 }
37f53970 5814 newstart = s;
a0ed51b3
LW
5815 }
5816 else
37f53970 5817 newstart = locinput;
a0ed51b3 5818
c277df42 5819 do_ifmatch:
40a82448 5820 ST.me = scan;
24d3c4a9 5821 ST.logical = logical;
24d786f4
YO
5822 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5823
40a82448 5824 /* execute body of (?...A) */
37f53970 5825 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 5826 assert(0); /* NOTREACHED */
37f53970 5827 }
40a82448
DM
5828
5829 case IFMATCH_A_fail: /* body of (?...A) failed */
5830 ST.wanted = !ST.wanted;
5831 /* FALL THROUGH */
5832
5833 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 5834 if (ST.logical) {
f2338a2e 5835 sw = cBOOL(ST.wanted);
40a82448
DM
5836 }
5837 else if (!ST.wanted)
5838 sayNO;
5839
37f53970
DM
5840 if (OP(ST.me) != SUSPEND) {
5841 /* restore old position except for (?>...) */
5842 locinput = st->locinput;
40a82448
DM
5843 nextchr = UCHARAT(locinput);
5844 }
5845 scan = ST.me + ARG(ST.me);
5846 if (scan == ST.me)
5847 scan = NULL;
5848 continue; /* execute B */
5849
5850#undef ST
dad79028 5851
c277df42 5852 case LONGJMP:
c277df42
IZ
5853 next = scan + ARG(scan);
5854 if (next == scan)
5855 next = NULL;
a0d0e21e 5856 break;
54612592 5857 case COMMIT:
e2e6a0f1
YO
5858 reginfo->cutpoint = PL_regeol;
5859 /* FALLTHROUGH */
5d458dd8 5860 case PRUNE:
e2e6a0f1 5861 if (!scan->flags)
ad64d0ec 5862 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5863 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 5864 assert(0); /* NOTREACHED */
54612592
YO
5865 case COMMIT_next_fail:
5866 no_final = 1;
5867 /* FALLTHROUGH */
7f69552c
YO
5868 case OPFAIL:
5869 sayNO;
118e2215 5870 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5871
5872#define ST st->u.mark
5873 case MARKPOINT:
5874 ST.prev_mark = mark_state;
5d458dd8 5875 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 5876 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 5877 mark_state = st;
4d5016e5
DM
5878 ST.mark_loc = locinput;
5879 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 5880 assert(0); /* NOTREACHED */
e2e6a0f1
YO
5881 case MARKPOINT_next:
5882 mark_state = ST.prev_mark;
5883 sayYES;
118e2215 5884 assert(0); /* NOTREACHED */
e2e6a0f1 5885 case MARKPOINT_next_fail:
5d458dd8 5886 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
5887 {
5888 if (ST.mark_loc > startpoint)
5889 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5890 popmark = NULL; /* we found our mark */
5891 sv_commit = ST.mark_name;
5892
5893 DEBUG_EXECUTE_r({
5d458dd8 5894 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
5895 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5896 REPORT_CODE_OFF+depth*2, "",
be2597df 5897 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
5898 });
5899 }
5900 mark_state = ST.prev_mark;
5d458dd8
YO
5901 sv_yes_mark = mark_state ?
5902 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 5903 sayNO;
118e2215 5904 assert(0); /* NOTREACHED */
5d458dd8 5905 case SKIP:
5d458dd8 5906 if (scan->flags) {
2bf803e2 5907 /* (*SKIP) : if we fail we cut here*/
5d458dd8 5908 ST.mark_name = NULL;
e2e6a0f1 5909 ST.mark_loc = locinput;
4d5016e5 5910 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 5911 } else {
2bf803e2 5912 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
5913 otherwise do nothing. Meaning we need to scan
5914 */
5915 regmatch_state *cur = mark_state;
ad64d0ec 5916 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
5917
5918 while (cur) {
5919 if ( sv_eq( cur->u.mark.mark_name,
5920 find ) )
5921 {
5922 ST.mark_name = find;
4d5016e5 5923 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
5924 }
5925 cur = cur->u.mark.prev_mark;
5926 }
e2e6a0f1 5927 }
2bf803e2 5928 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8
YO
5929 break;
5930 case SKIP_next_fail:
5931 if (ST.mark_name) {
5932 /* (*CUT:NAME) - Set up to search for the name as we
5933 collapse the stack*/
5934 popmark = ST.mark_name;
5935 } else {
5936 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
5937 if (ST.mark_loc > startpoint)
5938 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
5939 /* but we set sv_commit to latest mark_name if there
5940 is one so they can test to see how things lead to this
5941 cut */
5942 if (mark_state)
5943 sv_commit=mark_state->u.mark.mark_name;
5944 }
e2e6a0f1
YO
5945 no_final = 1;
5946 sayNO;
118e2215 5947 assert(0); /* NOTREACHED */
e2e6a0f1 5948#undef ST
e1d1eefb 5949 case LNBREAK:
f2ed9b32 5950 if ((n=is_LNBREAK(locinput,utf8_target))) {
e1d1eefb
YO
5951 locinput += n;
5952 nextchr = UCHARAT(locinput);
5953 } else
5954 sayNO;
5955 break;
5956
5957#define CASE_CLASS(nAmE) \
5958 case nAmE: \
1380b51d
DM
5959 if (locinput >= PL_regeol) \
5960 sayNO; \
f2ed9b32 5961 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5962 locinput += n; \
5963 nextchr = UCHARAT(locinput); \
5964 } else \
5965 sayNO; \
5966 break; \
5967 case N##nAmE: \
1380b51d
DM
5968 if (locinput >= PL_regeol) \
5969 sayNO; \
f2ed9b32 5970 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
5971 sayNO; \
5972 } else { \
5973 locinput += UTF8SKIP(locinput); \
5974 nextchr = UCHARAT(locinput); \
5975 } \
5976 break
5977
5978 CASE_CLASS(VERTWS);
5979 CASE_CLASS(HORIZWS);
5980#undef CASE_CLASS
5981
a0d0e21e 5982 default:
b900a521 5983 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 5984 PTR2UV(scan), OP(scan));
cea2e8a9 5985 Perl_croak(aTHX_ "regexp memory corruption");
5d458dd8
YO
5986
5987 } /* end switch */
95b24440 5988
5d458dd8
YO
5989 /* switch break jumps here */
5990 scan = next; /* prepare to execute the next op and ... */
5991 continue; /* ... jump back to the top, reusing st */
118e2215 5992 assert(0); /* NOTREACHED */
95b24440 5993
40a82448
DM
5994 push_yes_state:
5995 /* push a state that backtracks on success */
5996 st->u.yes.prev_yes_state = yes_state;
5997 yes_state = st;
5998 /* FALL THROUGH */
5999 push_state:
6000 /* push a new regex state, then continue at scan */
6001 {
6002 regmatch_state *newst;
6003
24b23f37
YO
6004 DEBUG_STACK_r({
6005 regmatch_state *cur = st;
6006 regmatch_state *curyes = yes_state;
6007 int curd = depth;
6008 regmatch_slab *slab = PL_regmatch_slab;
6009 for (;curd > -1;cur--,curd--) {
6010 if (cur < SLAB_FIRST(slab)) {
6011 slab = slab->prev;
6012 cur = SLAB_LAST(slab);
6013 }
6014 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6015 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6016 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6017 (curyes == cur) ? "yes" : ""
6018 );
6019 if (curyes == cur)
6020 curyes = cur->u.yes.prev_yes_state;
6021 }
6022 } else
6023 DEBUG_STATE_pp("push")
6024 );
40a82448 6025 depth++;
40a82448
DM
6026 st->locinput = locinput;
6027 newst = st+1;
6028 if (newst > SLAB_LAST(PL_regmatch_slab))
6029 newst = S_push_slab(aTHX);
6030 PL_regmatch_state = newst;
786e8c11 6031
4d5016e5 6032 locinput = pushinput;
40a82448
DM
6033 nextchr = UCHARAT(locinput);
6034 st = newst;
6035 continue;
118e2215 6036 assert(0); /* NOTREACHED */
40a82448 6037 }
a0d0e21e 6038 }
a687059c 6039
a0d0e21e
LW
6040 /*
6041 * We get here only if there's trouble -- normally "case END" is
6042 * the terminating point.
6043 */
cea2e8a9 6044 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6045 /*NOTREACHED*/
4633a7c4
LW
6046 sayNO;
6047
262b90c4 6048yes:
77cb431f
DM
6049 if (yes_state) {
6050 /* we have successfully completed a subexpression, but we must now
6051 * pop to the state marked by yes_state and continue from there */
77cb431f 6052 assert(st != yes_state);
5bc10b2c
DM
6053#ifdef DEBUGGING
6054 while (st != yes_state) {
6055 st--;
6056 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6057 PL_regmatch_slab = PL_regmatch_slab->prev;
6058 st = SLAB_LAST(PL_regmatch_slab);
6059 }
e2e6a0f1 6060 DEBUG_STATE_r({
54612592
YO
6061 if (no_final) {
6062 DEBUG_STATE_pp("pop (no final)");
6063 } else {
6064 DEBUG_STATE_pp("pop (yes)");
6065 }
e2e6a0f1 6066 });
5bc10b2c
DM
6067 depth--;
6068 }
6069#else
77cb431f
DM
6070 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6071 || yes_state > SLAB_LAST(PL_regmatch_slab))
6072 {
6073 /* not in this slab, pop slab */
6074 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6075 PL_regmatch_slab = PL_regmatch_slab->prev;
6076 st = SLAB_LAST(PL_regmatch_slab);
6077 }
6078 depth -= (st - yes_state);
5bc10b2c 6079#endif
77cb431f
DM
6080 st = yes_state;
6081 yes_state = st->u.yes.prev_yes_state;
6082 PL_regmatch_state = st;
24b23f37 6083
5d458dd8
YO
6084 if (no_final) {
6085 locinput= st->locinput;
6086 nextchr = UCHARAT(locinput);
6087 }
54612592 6088 state_num = st->resume_state + no_final;
24d3c4a9 6089 goto reenter_switch;
77cb431f
DM
6090 }
6091
a3621e74 6092 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6093 PL_colors[4], PL_colors[5]));
02db2b7b 6094
ed301438 6095 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6096 /* each successfully executed (?{...}) block does the equivalent of
6097 * local $^R = do {...}
6098 * When popping the save stack, all these locals would be undone;
6099 * bypass this by setting the outermost saved $^R to the latest
6100 * value */
6101 if (oreplsv != GvSV(PL_replgv))
6102 sv_setsv(oreplsv, GvSV(PL_replgv));
6103 }
95b24440 6104 result = 1;
aa283a38 6105 goto final_exit;
4633a7c4
LW
6106
6107no:
a3621e74 6108 DEBUG_EXECUTE_r(
7821416a 6109 PerlIO_printf(Perl_debug_log,
786e8c11 6110 "%*s %sfailed...%s\n",
5bc10b2c 6111 REPORT_CODE_OFF+depth*2, "",
786e8c11 6112 PL_colors[4], PL_colors[5])
7821416a 6113 );
aa283a38 6114
262b90c4 6115no_silent:
54612592
YO
6116 if (no_final) {
6117 if (yes_state) {
6118 goto yes;
6119 } else {
6120 goto final_exit;
6121 }
6122 }
aa283a38
DM
6123 if (depth) {
6124 /* there's a previous state to backtrack to */
40a82448
DM
6125 st--;
6126 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6127 PL_regmatch_slab = PL_regmatch_slab->prev;
6128 st = SLAB_LAST(PL_regmatch_slab);
6129 }
6130 PL_regmatch_state = st;
40a82448
DM
6131 locinput= st->locinput;
6132 nextchr = UCHARAT(locinput);
6133
5bc10b2c
DM
6134 DEBUG_STATE_pp("pop");
6135 depth--;
262b90c4
DM
6136 if (yes_state == st)
6137 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6138
24d3c4a9
DM
6139 state_num = st->resume_state + 1; /* failure = success + 1 */
6140 goto reenter_switch;
95b24440 6141 }
24d3c4a9 6142 result = 0;
aa283a38 6143
262b90c4 6144 final_exit:
bbe252da 6145 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6146 SV *sv_err = get_sv("REGERROR", 1);
6147 SV *sv_mrk = get_sv("REGMARK", 1);
6148 if (result) {
e2e6a0f1 6149 sv_commit = &PL_sv_no;
5d458dd8
YO
6150 if (!sv_yes_mark)
6151 sv_yes_mark = &PL_sv_yes;
6152 } else {
6153 if (!sv_commit)
6154 sv_commit = &PL_sv_yes;
6155 sv_yes_mark = &PL_sv_no;
6156 }
6157 sv_setsv(sv_err, sv_commit);
6158 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6159 }
19b95bf0 6160
81ed78b2
DM
6161
6162 if (last_pushed_cv) {
6163 dSP;
6164 POP_MULTICALL;
4f8dbb2d 6165 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6166 }
6167
2f554ef7
DM
6168 /* clean up; in particular, free all slabs above current one */
6169 LEAVE_SCOPE(oldsave);
5d9a96ca 6170
730f4c74
DM
6171 assert(!result || locinput - PL_bostr >= 0);
6172 return result ? locinput - PL_bostr : -1;
a687059c
LW
6173}
6174
6175/*
6176 - regrepeat - repeatedly match something simple, report how many
d60de1d1
DM
6177 *
6178 * startposp - pointer a pointer to the start position. This is updated
6179 * to point to the byte following the highest successful
6180 * match.
6181 * p - the regnode to be repeatedly matched against.
6182 * max - maximum number of characters to match.
6183 * depth - (for debugging) backtracking depth.
a687059c 6184 */
76e3520e 6185STATIC I32
f73aaa43 6186S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
a687059c 6187{
27da23d5 6188 dVAR;
eb578fdb
KW
6189 char *scan;
6190 I32 c;
6191 char *loceol = PL_regeol;
6192 I32 hardcount = 0;
6193 bool utf8_target = PL_reg_match_utf8;
d513472c 6194 UV utf8_flags;
4f55667c
SP
6195#ifndef DEBUGGING
6196 PERL_UNUSED_ARG(depth);
6197#endif
a0d0e21e 6198
7918f24d
NC
6199 PERL_ARGS_ASSERT_REGREPEAT;
6200
f73aaa43 6201 scan = *startposp;
faf11cac
HS
6202 if (max == REG_INFTY)
6203 max = I32_MAX;
6204 else if (max < loceol - scan)
7f596f4c 6205 loceol = scan + max;
a0d0e21e 6206 switch (OP(p)) {
22c35a8c 6207 case REG_ANY:
f2ed9b32 6208 if (utf8_target) {
ffc61ed2 6209 loceol = PL_regeol;
1aa99e6b 6210 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6211 scan += UTF8SKIP(scan);
6212 hardcount++;
6213 }
6214 } else {
6215 while (scan < loceol && *scan != '\n')
6216 scan++;
a0ed51b3
LW
6217 }
6218 break;
ffc61ed2 6219 case SANY:
f2ed9b32 6220 if (utf8_target) {
def8e4ea 6221 loceol = PL_regeol;
a0804c9e 6222 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6223 scan += UTF8SKIP(scan);
6224 hardcount++;
6225 }
6226 }
6227 else
6228 scan = loceol;
a0ed51b3 6229 break;
f33976b4
DB
6230 case CANY:
6231 scan = loceol;
6232 break;
59d32103
KW
6233 case EXACT:
6234 /* To get here, EXACTish nodes must have *byte* length == 1. That
6235 * means they match only characters in the string that can be expressed
6236 * as a single byte. For non-utf8 strings, that means a simple match.
6237 * For utf8 strings, the character matched must be an invariant, or
6238 * downgradable to a single byte. The pattern's utf8ness is
6239 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6240 * it is, it's an invariant */
6241
6242 c = (U8)*STRING(p);
6243 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6244
6245 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6246 while (scan < loceol && UCHARAT(scan) == c) {
6247 scan++;
6248 }
6249 }
6250 else {
6251
6252 /* Here, the string is utf8, and the pattern char is different
6253 * in utf8 than not, so can't compare them directly. Outside the
5908807f 6254 * loop, find the two utf8 bytes that represent c, and then
59d32103
KW
6255 * look for those in sequence in the utf8 string */
6256 U8 high = UTF8_TWO_BYTE_HI(c);
6257 U8 low = UTF8_TWO_BYTE_LO(c);
6258 loceol = PL_regeol;
6259
6260 while (hardcount < max
6261 && scan + 1 < loceol
6262 && UCHARAT(scan) == high
6263 && UCHARAT(scan + 1) == low)
6264 {
6265 scan += 2;
6266 hardcount++;
6267 }
6268 }
6269 break;
2f7f8cb1
KW
6270 case EXACTFA:
6271 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6272 goto do_exactf;
6273
d4e0b827
KW
6274 case EXACTFL:
6275 PL_reg_flags |= RF_tainted;
17580e7a
KW
6276 utf8_flags = FOLDEQ_UTF8_LOCALE;
6277 goto do_exactf;
6278
d4e0b827 6279 case EXACTF:
62bf7766
KW
6280 utf8_flags = 0;
6281 goto do_exactf;
6282
3c760661 6283 case EXACTFU_SS:
fab2782b 6284 case EXACTFU_TRICKYFOLD:
9a5a5549 6285 case EXACTFU:
05f861a2 6286 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6287
2be3e190
KW
6288 /* The comments for the EXACT case above apply as well to these fold
6289 * ones */
634c83a2 6290
2f7f8cb1 6291 do_exactf:
090f7165 6292 c = (U8)*STRING(p);
634c83a2 6293 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
d4e0b827 6294
3c760661 6295 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
59d32103
KW
6296 char *tmpeol = loceol;
6297 while (hardcount < max
d513472c
KW
6298 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6299 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6300 {
6301 scan = tmpeol;
6302 tmpeol = loceol;
6303 hardcount++;
6304 }
634c83a2 6305
59d32103
KW
6306 /* XXX Note that the above handles properly the German sharp s in
6307 * the pattern matching ss in the string. But it doesn't handle
6308 * properly cases where the string contains say 'LIGATURE ff' and
6309 * the pattern is 'f+'. This would require, say, a new function or
6310 * revised interface to foldEQ_utf8(), in which the maximum number
6311 * of characters to match could be passed and it would return how
6312 * many actually did. This is just one of many cases where
6313 * multi-char folds don't work properly, and so the fix is being
6314 * deferred */
6315 }
6316 else {
87381386 6317 U8 folded;
59d32103 6318
2be3e190
KW
6319 /* Here, the string isn't utf8 and c is a single byte; and either
6320 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6321 * doesn't affect c. Can just do simple comparisons for exact or
6322 * fold matching. */
d4e0b827 6323 switch (OP(p)) {
87381386 6324 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6325 case EXACTFA:
fab2782b 6326 case EXACTFU_TRICKYFOLD:
9a5a5549 6327 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6328 case EXACTFL: folded = PL_fold_locale[c]; break;
6329 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6330 }
6331 while (scan < loceol &&
6332 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6333 {
6334 scan++;
634c83a2
KW
6335 }
6336 }
bbce6d69 6337 break;
f56b6394 6338 case ANYOFV:
a0d0e21e 6339 case ANYOF:
4e8910e0
KW
6340 if (utf8_target || OP(p) == ANYOFV) {
6341 STRLEN inclasslen;
ffc61ed2 6342 loceol = PL_regeol;
4e8910e0
KW
6343 inclasslen = loceol - scan;
6344 while (hardcount < max
6345 && ((inclasslen = loceol - scan) > 0)
6346 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6347 {
6348 scan += inclasslen;
ffc61ed2
JH
6349 hardcount++;
6350 }
6351 } else {
32fc9b6a 6352 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6353 scan++;
6354 }
a0d0e21e 6355 break;
980866de 6356 case ALNUMU:
f2ed9b32 6357 if (utf8_target) {
980866de 6358 utf8_wordchar:
ffc61ed2 6359 loceol = PL_regeol;
1a4fad37 6360 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6361 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6362 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6363 {
ffc61ed2
JH
6364 scan += UTF8SKIP(scan);
6365 hardcount++;
6366 }
980866de 6367 } else {
a12cf05f
KW
6368 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6369 scan++;
6370 }
980866de
KW
6371 }
6372 break;
6373 case ALNUM:
6374 if (utf8_target)
6375 goto utf8_wordchar;
6376 while (scan < loceol && isALNUM((U8) *scan)) {
6377 scan++;
a0ed51b3
LW
6378 }
6379 break;
cfaf538b
KW
6380 case ALNUMA:
6381 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6382 scan++;
6383 }
6384 break;
bbce6d69 6385 case ALNUML:
3280af22 6386 PL_reg_flags |= RF_tainted;
f2ed9b32 6387 if (utf8_target) {
ffc61ed2 6388 loceol = PL_regeol;
1aa99e6b
IH
6389 while (hardcount < max && scan < loceol &&
6390 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6391 scan += UTF8SKIP(scan);
6392 hardcount++;
6393 }
6394 } else {
6395 while (scan < loceol && isALNUM_LC(*scan))
6396 scan++;
a0ed51b3
LW
6397 }
6398 break;
980866de 6399 case NALNUMU:
f2ed9b32 6400 if (utf8_target) {
980866de
KW
6401
6402 utf8_Nwordchar:
6403
ffc61ed2 6404 loceol = PL_regeol;
1a4fad37 6405 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6406 while (hardcount < max && scan < loceol &&
980866de 6407 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6408 {
ffc61ed2
JH
6409 scan += UTF8SKIP(scan);
6410 hardcount++;
6411 }
980866de 6412 } else {
a12cf05f
KW
6413 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6414 scan++;
6415 }
980866de
KW
6416 }
6417 break;
6418 case NALNUM:
6419 if (utf8_target)
6420 goto utf8_Nwordchar;
6421 while (scan < loceol && ! isALNUM((U8) *scan)) {
6422 scan++;
a0ed51b3
LW
6423 }
6424 break;
0658cdde
KW
6425
6426 case POSIXA:
6427 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6428 scan++;
6429 }
6430 break;
6431 case NPOSIXA:
6432 if (utf8_target) {
6433 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6434 scan += UTF8SKIP(scan);
6435 }
6436 }
6437 else {
6438 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6439 scan++;
6440 }
6441 }
6442 break;
cfaf538b
KW
6443 case NALNUMA:
6444 if (utf8_target) {
6445 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6446 scan += UTF8SKIP(scan);
6447 }
6448 }
6449 else {
6450 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6451 scan++;
6452 }
6453 }
6454 break;
bbce6d69 6455 case NALNUML:
3280af22 6456 PL_reg_flags |= RF_tainted;
f2ed9b32 6457 if (utf8_target) {
ffc61ed2 6458 loceol = PL_regeol;
1aa99e6b
IH
6459 while (hardcount < max && scan < loceol &&
6460 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6461 scan += UTF8SKIP(scan);
6462 hardcount++;
6463 }
6464 } else {
6465 while (scan < loceol && !isALNUM_LC(*scan))
6466 scan++;
a0ed51b3
LW
6467 }
6468 break;
980866de 6469 case SPACEU:
f2ed9b32 6470 if (utf8_target) {
980866de
KW
6471
6472 utf8_space:
6473
ffc61ed2 6474 loceol = PL_regeol;
1a4fad37 6475 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6476 while (hardcount < max && scan < loceol &&
3568d838 6477 (*scan == ' ' ||
a12cf05f
KW
6478 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6479 {
ffc61ed2
JH
6480 scan += UTF8SKIP(scan);
6481 hardcount++;
6482 }
980866de
KW
6483 break;
6484 }
6485 else {
a12cf05f
KW
6486 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6487 scan++;
6488 }
980866de
KW
6489 break;
6490 }
6491 case SPACE:
6492 if (utf8_target)
6493 goto utf8_space;
6494
6495 while (scan < loceol && isSPACE((U8) *scan)) {
6496 scan++;
a0ed51b3
LW
6497 }
6498 break;
cfaf538b
KW
6499 case SPACEA:
6500 while (scan < loceol && isSPACE_A((U8) *scan)) {
6501 scan++;
6502 }
6503 break;
bbce6d69 6504 case SPACEL:
3280af22 6505 PL_reg_flags |= RF_tainted;
f2ed9b32 6506 if (utf8_target) {
ffc61ed2 6507 loceol = PL_regeol;
1aa99e6b 6508 while (hardcount < max && scan < loceol &&
6bbba904 6509 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6510 scan += UTF8SKIP(scan);
6511 hardcount++;
6512 }
6513 } else {
6514 while (scan < loceol && isSPACE_LC(*scan))
6515 scan++;
a0ed51b3
LW
6516 }
6517 break;
980866de 6518 case NSPACEU:
f2ed9b32 6519 if (utf8_target) {
980866de
KW
6520
6521 utf8_Nspace:
6522
ffc61ed2 6523 loceol = PL_regeol;
1a4fad37 6524 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6525 while (hardcount < max && scan < loceol &&
980866de
KW
6526 ! (*scan == ' ' ||
6527 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6528 {
ffc61ed2
JH
6529 scan += UTF8SKIP(scan);
6530 hardcount++;
6531 }
980866de
KW
6532 break;
6533 }
6534 else {
a12cf05f
KW
6535 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6536 scan++;
6537 }
980866de
KW
6538 }
6539 break;
6540 case NSPACE:
6541 if (utf8_target)
6542 goto utf8_Nspace;
6543
6544 while (scan < loceol && ! isSPACE((U8) *scan)) {
6545 scan++;
a0ed51b3 6546 }
0008a298 6547 break;
cfaf538b
KW
6548 case NSPACEA:
6549 if (utf8_target) {
6550 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6551 scan += UTF8SKIP(scan);
6552 }
6553 }
6554 else {
6555 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6556 scan++;
6557 }
6558 }
6559 break;
bbce6d69 6560 case NSPACEL:
3280af22 6561 PL_reg_flags |= RF_tainted;
f2ed9b32 6562 if (utf8_target) {
ffc61ed2 6563 loceol = PL_regeol;
1aa99e6b 6564 while (hardcount < max && scan < loceol &&
6bbba904 6565 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6566 scan += UTF8SKIP(scan);
6567 hardcount++;
6568 }
6569 } else {
6570 while (scan < loceol && !isSPACE_LC(*scan))
6571 scan++;
a0ed51b3
LW
6572 }
6573 break;
a0d0e21e 6574 case DIGIT:
f2ed9b32 6575 if (utf8_target) {
ffc61ed2 6576 loceol = PL_regeol;
1a4fad37 6577 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6578 while (hardcount < max && scan < loceol &&
f2ed9b32 6579 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6580 scan += UTF8SKIP(scan);
6581 hardcount++;
6582 }
6583 } else {
6584 while (scan < loceol && isDIGIT(*scan))
6585 scan++;
a0ed51b3
LW
6586 }
6587 break;
cfaf538b
KW
6588 case DIGITA:
6589 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6590 scan++;
6591 }
6592 break;
b77393f6
KW
6593 case DIGITL:
6594 PL_reg_flags |= RF_tainted;
6595 if (utf8_target) {
6596 loceol = PL_regeol;
6597 while (hardcount < max && scan < loceol &&
6598 isDIGIT_LC_utf8((U8*)scan)) {
6599 scan += UTF8SKIP(scan);
6600 hardcount++;
6601 }
6602 } else {
6603 while (scan < loceol && isDIGIT_LC(*scan))
6604 scan++;
6605 }
6606 break;
a0d0e21e 6607 case NDIGIT:
f2ed9b32 6608 if (utf8_target) {
ffc61ed2 6609 loceol = PL_regeol;
1a4fad37 6610 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6611 while (hardcount < max && scan < loceol &&
f2ed9b32 6612 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6613 scan += UTF8SKIP(scan);
6614 hardcount++;
6615 }
6616 } else {
6617 while (scan < loceol && !isDIGIT(*scan))
6618 scan++;
a0ed51b3 6619 }
cfaf538b
KW
6620 break;
6621 case NDIGITA:
6622 if (utf8_target) {
6623 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6624 scan += UTF8SKIP(scan);
6625 }
6626 }
6627 else {
6628 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6629 scan++;
6630 }
6631 }
6632 break;
b77393f6
KW
6633 case NDIGITL:
6634 PL_reg_flags |= RF_tainted;
6635 if (utf8_target) {
6636 loceol = PL_regeol;
6637 while (hardcount < max && scan < loceol &&
6638 !isDIGIT_LC_utf8((U8*)scan)) {
6639 scan += UTF8SKIP(scan);
6640 hardcount++;
6641 }
6642 } else {
6643 while (scan < loceol && !isDIGIT_LC(*scan))
6644 scan++;
6645 }
6646 break;
e1d1eefb 6647 case LNBREAK:
f2ed9b32 6648 if (utf8_target) {
e1d1eefb
YO
6649 loceol = PL_regeol;
6650 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6651 scan += c;
6652 hardcount++;
6653 }
6654 } else {
6655 /*
6656 LNBREAK can match two latin chars, which is ok,
6657 because we have a null terminated string, but we
6658 have to use hardcount in this situation
6659 */
6660 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6661 scan+=c;
6662 hardcount++;
6663 }
6664 }
6665 break;
6666 case HORIZWS:
f2ed9b32 6667 if (utf8_target) {
e1d1eefb
YO
6668 loceol = PL_regeol;
6669 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6670 scan += c;
6671 hardcount++;
6672 }
6673 } else {
6674 while (scan < loceol && is_HORIZWS_latin1(scan))
6675 scan++;
6676 }
a0ed51b3 6677 break;
e1d1eefb 6678 case NHORIZWS:
f2ed9b32 6679 if (utf8_target) {
e1d1eefb
YO
6680 loceol = PL_regeol;
6681 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6682 scan += UTF8SKIP(scan);
6683 hardcount++;
6684 }
6685 } else {
6686 while (scan < loceol && !is_HORIZWS_latin1(scan))
6687 scan++;
6688
6689 }
6690 break;
6691 case VERTWS:
f2ed9b32 6692 if (utf8_target) {
e1d1eefb
YO
6693 loceol = PL_regeol;
6694 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6695 scan += c;
6696 hardcount++;
6697 }
6698 } else {
6699 while (scan < loceol && is_VERTWS_latin1(scan))
6700 scan++;
6701
6702 }
6703 break;
6704 case NVERTWS:
f2ed9b32 6705 if (utf8_target) {
e1d1eefb
YO
6706 loceol = PL_regeol;
6707 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6708 scan += UTF8SKIP(scan);
6709 hardcount++;
6710 }
6711 } else {
6712 while (scan < loceol && !is_VERTWS_latin1(scan))
6713 scan++;
6714
6715 }
6716 break;
6717
a0d0e21e
LW
6718 default: /* Called on something of 0 width. */
6719 break; /* So match right here or not at all. */
6720 }
a687059c 6721
a0ed51b3
LW
6722 if (hardcount)
6723 c = hardcount;
6724 else
f73aaa43
DM
6725 c = scan - *startposp;
6726 *startposp = scan;
a687059c 6727
a3621e74 6728 DEBUG_r({
e68ec53f 6729 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 6730 DEBUG_EXECUTE_r({
e68ec53f
YO
6731 SV * const prop = sv_newmortal();
6732 regprop(prog, prop, p);
6733 PerlIO_printf(Perl_debug_log,
be8e71aa 6734 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 6735 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 6736 });
be8e71aa 6737 });
9041c2e3 6738
a0d0e21e 6739 return(c);
a687059c
LW
6740}
6741
c277df42 6742
be8e71aa 6743#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 6744/*
6c6525b8
KW
6745- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6746create a copy so that changes the caller makes won't change the shared one
6747 */
ffc61ed2 6748SV *
32fc9b6a 6749Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 6750{
6c6525b8
KW
6751 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6752 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6753}
6754#endif
6755
6756STATIC SV *
6757S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6758{
8c9eb58f
KW
6759 /* Returns the swash for the input 'node' in the regex 'prog'.
6760 * If <doinit> is true, will attempt to create the swash if not already
6761 * done.
6762 * If <listsvp> is non-null, will return the swash initialization string in
6763 * it.
6764 * If <altsvp> is non-null, will return the alternates to the regular swash
6765 * in it
6766 * Tied intimately to how regcomp.c sets up the data structure */
6767
97aff369 6768 dVAR;
9e55ce06
JH
6769 SV *sw = NULL;
6770 SV *si = NULL;
6771 SV *alt = NULL;
7a6c6baa
KW
6772 SV* invlist = NULL;
6773
f8fc2ecf
YO
6774 RXi_GET_DECL(prog,progi);
6775 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 6776
6c6525b8 6777 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 6778
ccb2541c
KW
6779 assert(ANYOF_NONBITMAP(node));
6780
4f639d21 6781 if (data && data->count) {
a3b680e6 6782 const U32 n = ARG(node);
ffc61ed2 6783
4f639d21 6784 if (data->what[n] == 's') {
ad64d0ec
NC
6785 SV * const rv = MUTABLE_SV(data->data[n]);
6786 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 6787 SV **const ary = AvARRAY(av);
87367d5f 6788 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 6789
8c9eb58f 6790 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 6791
7a6c6baa
KW
6792 /* Elements 3 and 4 are either both present or both absent. [3] is
6793 * any inversion list generated at compile time; [4] indicates if
6794 * that inversion list has any user-defined properties in it. */
6795 if (av_len(av) >= 3) {
6796 invlist = ary[3];
83199d38
KW
6797 if (SvUV(ary[4])) {
6798 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6799 }
7a6c6baa
KW
6800 }
6801 else {
6802 invlist = NULL;
7a6c6baa
KW
6803 }
6804
8c9eb58f
KW
6805 /* Element [1] is reserved for the set-up swash. If already there,
6806 * return it; if not, create it and store it there */
f192cf32
KW
6807 if (SvROK(ary[1])) {
6808 sw = ary[1];
6809 }
ffc61ed2 6810 else if (si && doinit) {
7a6c6baa
KW
6811
6812 sw = _core_swash_init("utf8", /* the utf8 package */
6813 "", /* nameless */
6814 si,
6815 1, /* binary */
6816 0, /* not from tr/// */
7a6c6baa 6817 invlist,
83199d38 6818 &swash_init_flags);
ffc61ed2
JH
6819 (void)av_store(av, 1, sw);
6820 }
8c9eb58f
KW
6821
6822 /* Element [2] is for any multi-char folds. Note that is a
6823 * fundamentally flawed design, because can't backtrack and try
6824 * again. See [perl #89774] */
f192cf32
KW
6825 if (SvTYPE(ary[2]) == SVt_PVAV) {
6826 alt = ary[2];
6827 }
ffc61ed2
JH
6828 }
6829 }
6830
7a6c6baa
KW
6831 if (listsvp) {
6832 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
6833
6834 /* Use the swash, if any, which has to have incorporated into it all
6835 * possibilities */
872dd7e0
KW
6836 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6837 && (si && si != &PL_sv_undef))
6838 {
7a6c6baa 6839
872dd7e0 6840 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
6841 sv_catsv(matches_string, si);
6842 }
6843
6844 /* Add the inversion list to whatever we have. This may have come from
6845 * the swash, or from an input parameter */
6846 if (invlist) {
6847 sv_catsv(matches_string, _invlist_contents(invlist));
6848 }
6849 *listsvp = matches_string;
6850 }
6851
9e55ce06
JH
6852 if (altsvp)
6853 *altsvp = alt;
ffc61ed2
JH
6854
6855 return sw;
6856}
6857
6858/*
ba7b4546 6859 - reginclass - determine if a character falls into a character class
832705d4 6860
6698fab5
KW
6861 n is the ANYOF regnode
6862 p is the target string
6863 lenp is pointer to the maximum number of bytes of how far to go in p
6864 (This is assumed wthout checking to always be at least the current
6865 character's size)
6866 utf8_target tells whether p is in UTF-8.
832705d4 6867
4b3cda86
KW
6868 Returns true if matched; false otherwise. If lenp is not NULL, on return
6869 from a successful match, the value it points to will be updated to how many
6870 bytes in p were matched. If there was no match, the value is undefined,
6871 possibly changed from the input.
eba1359e 6872
d5788240
KW
6873 Note that this can be a synthetic start class, a combination of various
6874 nodes, so things you think might be mutually exclusive, such as locale,
6875 aren't. It can match both locale and non-locale
6876
bbce6d69 6877 */
6878
76e3520e 6879STATIC bool
f6ad78d8 6880S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 6881{
27da23d5 6882 dVAR;
a3b680e6 6883 const char flags = ANYOF_FLAGS(n);
bbce6d69 6884 bool match = FALSE;
cc07378b 6885 UV c = *p;
f7ab54c6 6886 STRLEN c_len = 0;
6698fab5 6887 STRLEN maxlen;
1aa99e6b 6888
7918f24d
NC
6889 PERL_ARGS_ASSERT_REGINCLASS;
6890
4b3cda86 6891 /* If c is not already the code point, get it */
f2ed9b32 6892 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 6893 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
6894 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6895 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6896 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6897 * UTF8_ALLOW_FFFF */
f7ab54c6 6898 if (c_len == (STRLEN)-1)
e8a70c6f 6899 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 6900 }
a5a291f5
KW
6901 else {
6902 c_len = 1;
6903 }
bbce6d69 6904
a5a291f5
KW
6905 /* Use passed in max length, or one character if none passed in or less
6906 * than one character. And assume will match just one character. This is
6907 * overwritten later if matched more. */
4b3cda86 6908 if (lenp) {
a5a291f5
KW
6909 maxlen = (*lenp > c_len) ? *lenp : c_len;
6910 *lenp = c_len;
4b3cda86
KW
6911
6912 }
6913 else {
a5a291f5 6914 maxlen = c_len;
4b3cda86
KW
6915 }
6916
7cdde544
KW
6917 /* If this character is potentially in the bitmap, check it */
6918 if (c < 256) {
ffc61ed2
JH
6919 if (ANYOF_BITMAP_TEST(n, c))
6920 match = TRUE;
11454c59
KW
6921 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6922 && ! utf8_target
6923 && ! isASCII(c))
6924 {
6925 match = TRUE;
6926 }
a0ed51b3 6927
78969a98
KW
6928 else if (flags & ANYOF_LOCALE) {
6929 PL_reg_flags |= RF_tainted;
6930
6931 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6932 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6933 {
ffc61ed2 6934 match = TRUE;
78969a98 6935 }
040aea3a
KW
6936 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6937 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6938 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6939 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6940 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6941 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6942 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6943 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6944 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6945 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6946 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
6947 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6948 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
6949 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6950 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6951 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6952 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6953 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6954 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6955 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6956 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6957 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6958 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6959 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6960 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6961 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6962 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6963 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6964 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
6965 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6966 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 6967 ) /* How's that for a conditional? */
78969a98 6968 ) {
ffc61ed2
JH
6969 match = TRUE;
6970 }
a0ed51b3 6971 }
a0ed51b3
LW
6972 }
6973
7cdde544 6974 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
6975 * bitmap could match, try that. Locale nodes specifiy completely the
6976 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 6977 * cause them to be treated as Unicode and not locale), except in
de87c4fe 6978 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
6979 * could be a combination of locale and non-locale nodes. So allow locale
6980 * to match for the synthetic start class, which will give a false
6981 * positive that will be resolved when the match is done again as not part
6982 * of the synthetic start class */
ef87b810 6983 if (!match) {
10ee90d2
KW
6984 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6985 match = TRUE; /* Everything above 255 matches */
e051a21d 6986 }
6f8d7d0d
KW
6987 else if (ANYOF_NONBITMAP(n)
6988 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6989 || (utf8_target
6990 && (c >=256
6991 || (! (flags & ANYOF_LOCALE))
6992 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 6993 {
7cdde544 6994 AV *av;
210e6c47 6995 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7cdde544
KW
6996
6997 if (sw) {
6998 U8 * utf8_p;
6999 if (utf8_target) {
7000 utf8_p = (U8 *) p;
7001 } else {
f56b6394
KW
7002
7003 /* Not utf8. Convert as much of the string as available up
7004 * to the limit of how far the (single) character in the
7005 * pattern can possibly match (no need to go further). If
7006 * the node is a straight ANYOF or not folding, it can't
7007 * match more than one. Otherwise, It can match up to how
7008 * far a single char can fold to. Since not utf8, each
7009 * character is a single byte, so the max it can be in
7010 * bytes is the same as the max it can be in characters */
7011 STRLEN len = (OP(n) == ANYOF
7012 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7013 ? 1
7014 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7015 ? maxlen
7016 : UTF8_MAX_FOLD_CHAR_EXPAND;
7cdde544
KW
7017 utf8_p = bytes_to_utf8(p, &len);
7018 }
f56b6394 7019
f7eb7452 7020 if (swash_fetch(sw, utf8_p, TRUE))
7cdde544 7021 match = TRUE;
39065660 7022 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
f56b6394
KW
7023
7024 /* Here, we need to test if the fold of the target string
8d5d17fa
KW
7025 * matches. The non-multi char folds have all been moved to
7026 * the compilation phase, and the multi-char folds have
7027 * been stored by regcomp into 'av'; we linearly check to
7028 * see if any match the target string (folded). We know
7029 * that the originals were each one character, but we don't
7030 * currently know how many characters/bytes each folded to,
7031 * except we do know that there are small limits imposed by
7032 * Unicode. XXX A performance enhancement would be to have
7033 * regcomp.c store the max number of chars/bytes that are
7034 * in an av entry, as, say the 0th element. Even better
7035 * would be to have a hash of the few characters that can
7036 * start a multi-char fold to the max number of chars of
7037 * those folds.
f56b6394
KW
7038 *
7039 * If there is a match, we will need to advance (if lenp is
7040 * specified) the match pointer in the target string. But
7041 * what we are comparing here isn't that string directly,
7042 * but its fold, whose length may differ from the original.
7043 * As we go along in constructing the fold, therefore, we
7044 * create a map so that we know how many bytes in the
7045 * source to advance given that we have matched a certain
7046 * number of bytes in the fold. This map is stored in
6438af90
KW
7047 * 'map_fold_len_back'. Let n mean the number of bytes in
7048 * the fold of the first character that we are folding.
7049 * Then map_fold_len_back[n] is set to the number of bytes
7050 * in that first character. Similarly let m be the
7051 * corresponding number for the second character to be
7052 * folded. Then map_fold_len_back[n+m] is set to the
7053 * number of bytes occupied by the first two source
7054 * characters. ... */
7055 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
f56b6394
KW
7056 U8 folded[UTF8_MAXBYTES_CASE+1];
7057 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
deba3d96 7058 STRLEN total_foldlen = 0; /* num bytes in fold of all
6438af90 7059 chars */
f56b6394
KW
7060
7061 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7062
7063 /* Here, only need to fold the first char of the target
6438af90 7064 * string. It the source wasn't utf8, is 1 byte long */
f56b6394 7065 to_utf8_fold(utf8_p, folded, &foldlen);
deba3d96 7066 total_foldlen = foldlen;
6438af90
KW
7067 map_fold_len_back[foldlen] = (utf8_target)
7068 ? UTF8SKIP(utf8_p)
7069 : 1;
f56b6394
KW
7070 }
7071 else {
7072
7073 /* Here, need to fold more than the first char. Do so
7074 * up to the limits */
f56b6394
KW
7075 U8* source_ptr = utf8_p; /* The source for the fold
7076 is the regex target
7077 string */
7078 U8* folded_ptr = folded;
7079 U8* e = utf8_p + maxlen; /* Can't go beyond last
7080 available byte in the
7081 target string */
6438af90
KW
7082 U8 i;
7083 for (i = 0;
7084 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7085 i++)
f56b6394
KW
7086 {
7087
7088 /* Fold the next character */
7089 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7090 STRLEN this_char_foldlen;
7091 to_utf8_fold(source_ptr,
7092 this_char_folded,
7093 &this_char_foldlen);
7094
7095 /* Bail if it would exceed the byte limit for
7096 * folding a single char. */
7097 if (this_char_foldlen + folded_ptr - folded >
7098 UTF8_MAXBYTES_CASE)
7099 {
7100 break;
7101 }
7102
6438af90 7103 /* Add the fold of this character */
f56b6394
KW
7104 Copy(this_char_folded,
7105 folded_ptr,
7106 this_char_foldlen,
7107 U8);
f56b6394 7108 source_ptr += UTF8SKIP(source_ptr);
6438af90 7109 folded_ptr += this_char_foldlen;
deba3d96 7110 total_foldlen = folded_ptr - folded;
6438af90
KW
7111
7112 /* Create map from the number of bytes in the fold
7113 * back to the number of bytes in the source. If
7114 * the source isn't utf8, the byte count is just
7115 * the number of characters so far */
deba3d96 7116 map_fold_len_back[total_foldlen]
6438af90
KW
7117 = (utf8_target)
7118 ? source_ptr - utf8_p
7119 : i + 1;
f56b6394
KW
7120 }
7121 *folded_ptr = '\0';
f56b6394
KW
7122 }
7123
7124
7125 /* Do the linear search to see if the fold is in the list
8f2655f7
KW
7126 * of multi-char folds. */
7127 if (av) {
7cdde544
KW
7128 I32 i;
7129 for (i = 0; i <= av_len(av); i++) {
7130 SV* const sv = *av_fetch(av, i, FALSE);
7131 STRLEN len;
7132 const char * const s = SvPV_const(sv, len);
6438af90 7133
f9126265
KW
7134 if (len <= total_foldlen
7135 && memEQ(s, (char*)folded, len)
7136
7137 /* If 0, means matched a partial char. See
7138 * [perl #90536] */
7139 && map_fold_len_back[len])
f56b6394
KW
7140 {
7141
7142 /* Advance the target string ptr to account for
7143 * this fold, but have to translate from the
7144 * folded length to the corresponding source
6438af90 7145 * length. */
8f2655f7
KW
7146 if (lenp) {
7147 *lenp = map_fold_len_back[len];
8f2655f7 7148 }
7cdde544
KW
7149 match = TRUE;
7150 break;
7151 }
7152 }
7153 }
7cdde544
KW
7154 }
7155
7156 /* If we allocated a string above, free it */
7157 if (! utf8_target) Safefree(utf8_p);
7158 }
7159 }
5073ffbd
KW
7160
7161 if (UNICODE_IS_SUPER(c)
7162 && (flags & ANYOF_WARN_SUPER)
7163 && ckWARN_d(WARN_NON_UNICODE))
7164 {
7165 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7166 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7167 }
7cdde544
KW
7168 }
7169
f0fdc1c9
KW
7170 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7171 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7172}
161b471a 7173
dfe13c55 7174STATIC U8 *
0ce71af7 7175S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7176{
6af86488
KW
7177 /* return the position 'off' UTF-8 characters away from 's', forward if
7178 * 'off' >= 0, backwards if negative. But don't go outside of position
7179 * 'lim', which better be < s if off < 0 */
7180
97aff369 7181 dVAR;
7918f24d
NC
7182
7183 PERL_ARGS_ASSERT_REGHOP3;
7184
a0ed51b3 7185 if (off >= 0) {
1aa99e6b 7186 while (off-- && s < lim) {
ffc61ed2 7187 /* XXX could check well-formedness here */
a0ed51b3 7188 s += UTF8SKIP(s);
ffc61ed2 7189 }
a0ed51b3
LW
7190 }
7191 else {
1de06328
YO
7192 while (off++ && s > lim) {
7193 s--;
7194 if (UTF8_IS_CONTINUED(*s)) {
7195 while (s > lim && UTF8_IS_CONTINUATION(*s))
7196 s--;
a0ed51b3 7197 }
1de06328 7198 /* XXX could check well-formedness here */
a0ed51b3
LW
7199 }
7200 }
7201 return s;
7202}
161b471a 7203
f9f4320a
YO
7204#ifdef XXX_dmq
7205/* there are a bunch of places where we use two reghop3's that should
7206 be replaced with this routine. but since thats not done yet
7207 we ifdef it out - dmq
7208*/
dfe13c55 7209STATIC U8 *
1de06328
YO
7210S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7211{
7212 dVAR;
7918f24d
NC
7213
7214 PERL_ARGS_ASSERT_REGHOP4;
7215
1de06328
YO
7216 if (off >= 0) {
7217 while (off-- && s < rlim) {
7218 /* XXX could check well-formedness here */
7219 s += UTF8SKIP(s);
7220 }
7221 }
7222 else {
7223 while (off++ && s > llim) {
7224 s--;
7225 if (UTF8_IS_CONTINUED(*s)) {
7226 while (s > llim && UTF8_IS_CONTINUATION(*s))
7227 s--;
7228 }
7229 /* XXX could check well-formedness here */
7230 }
7231 }
7232 return s;
7233}
f9f4320a 7234#endif
1de06328
YO
7235
7236STATIC U8 *
0ce71af7 7237S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7238{
97aff369 7239 dVAR;
7918f24d
NC
7240
7241 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7242
a0ed51b3 7243 if (off >= 0) {
1aa99e6b 7244 while (off-- && s < lim) {
ffc61ed2 7245 /* XXX could check well-formedness here */
a0ed51b3 7246 s += UTF8SKIP(s);
ffc61ed2 7247 }
a0ed51b3 7248 if (off >= 0)
3dab1dad 7249 return NULL;
a0ed51b3
LW
7250 }
7251 else {
1de06328
YO
7252 while (off++ && s > lim) {
7253 s--;
7254 if (UTF8_IS_CONTINUED(*s)) {
7255 while (s > lim && UTF8_IS_CONTINUATION(*s))
7256 s--;
a0ed51b3 7257 }
1de06328 7258 /* XXX could check well-formedness here */
a0ed51b3
LW
7259 }
7260 if (off <= 0)
3dab1dad 7261 return NULL;
a0ed51b3
LW
7262 }
7263 return s;
7264}
51371543 7265
51371543 7266static void
acfe0abc 7267restore_pos(pTHX_ void *arg)
51371543 7268{
97aff369 7269 dVAR;
097eb12c 7270 regexp * const rex = (regexp *)arg;
ed301438 7271 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7272 if (PL_reg_oldsaved) {
4f639d21
DM
7273 rex->subbeg = PL_reg_oldsaved;
7274 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7275 rex->suboffset = PL_reg_oldsavedoffset;
7276 rex->subcoffset = PL_reg_oldsavedcoffset;
f8c7b90f 7277#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7278 rex->saved_copy = PL_nrs;
ed252734 7279#endif
07bc277f 7280 RXp_MATCH_COPIED_on(rex);
51371543
GS
7281 }
7282 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7283 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7284 PL_curpm = PL_reg_oldcurpm;
7285 }
7286}
33b8afdf
JH
7287
7288STATIC void
7289S_to_utf8_substr(pTHX_ register regexp *prog)
7290{
a1cac82e 7291 int i = 1;
7918f24d
NC
7292
7293 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7294
a1cac82e
NC
7295 do {
7296 if (prog->substrs->data[i].substr
7297 && !prog->substrs->data[i].utf8_substr) {
7298 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7299 prog->substrs->data[i].utf8_substr = sv;
7300 sv_utf8_upgrade(sv);
610460f9 7301 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7302 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7303 /* Trim the trailing \n that fbm_compile added last
7304 time. */
7305 SvCUR_set(sv, SvCUR(sv) - 1);
7306 /* Whilst this makes the SV technically "invalid" (as its
7307 buffer is no longer followed by "\0") when fbm_compile()
7308 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7309 fbm_compile(sv, FBMcf_TAIL);
7310 } else
7311 fbm_compile(sv, 0);
610460f9 7312 }
a1cac82e
NC
7313 if (prog->substrs->data[i].substr == prog->check_substr)
7314 prog->check_utf8 = sv;
7315 }
7316 } while (i--);
33b8afdf
JH
7317}
7318
7319STATIC void
7320S_to_byte_substr(pTHX_ register regexp *prog)
7321{
97aff369 7322 dVAR;
a1cac82e 7323 int i = 1;
7918f24d
NC
7324
7325 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7326
a1cac82e
NC
7327 do {
7328 if (prog->substrs->data[i].utf8_substr
7329 && !prog->substrs->data[i].substr) {
7330 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7331 if (sv_utf8_downgrade(sv, TRUE)) {
610460f9 7332 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
cffe132d 7333 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
610460f9
NC
7334 /* Trim the trailing \n that fbm_compile added last
7335 time. */
7336 SvCUR_set(sv, SvCUR(sv) - 1);
cffe132d
NC
7337 fbm_compile(sv, FBMcf_TAIL);
7338 } else
7339 fbm_compile(sv, 0);
7340 }
a1cac82e
NC
7341 } else {
7342 SvREFCNT_dec(sv);
7343 sv = &PL_sv_undef;
7344 }
7345 prog->substrs->data[i].substr = sv;
7346 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7347 prog->check_substr = sv;
33b8afdf 7348 }
a1cac82e 7349 } while (i--);
33b8afdf 7350}
66610fdd 7351
a0e786e5
KW
7352/* These constants are for finding GCB=LV and GCB=LVT. These are for the
7353 * pre-composed Hangul syllables, which are all in a contiguous block and
7354 * arranged there in such a way so as to facilitate alorithmic determination of
7355 * their characteristics. As such, they don't need a swash, but can be
7356 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7357 * is a GCB=LV */
7358#define SBASE 0xAC00 /* Start of block */
7359#define SCount 11172 /* Length of block */
7360#define TCount 28
7361
7362#if 0 /* This routine is not currently used */
7363PERL_STATIC_INLINE bool
7364S_is_utf8_X_LV(pTHX_ const U8 *p)
7365{
7366 /* Unlike most other similarly named routines here, this does not create a
7367 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7368
7369 dVAR;
7370
7371 UV cp = valid_utf8_to_uvchr(p, NULL);
7372
7373 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7374
7375 /* The earliest Unicode releases did not have these precomposed Hangul
7376 * syllables. Set to point to undef in that case, so will return false on
7377 * every call */
7378 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7379 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7380 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7381 SvREFCNT_dec(PL_utf8_X_LV);
7382 PL_utf8_X_LV = &PL_sv_undef;
7383 }
7384 }
7385
7386 return (PL_utf8_X_LV != &PL_sv_undef
7387 && cp >= SBASE && cp < SBASE + SCount
7388 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7389}
7390#endif
7391
7392PERL_STATIC_INLINE bool
7393S_is_utf8_X_LVT(pTHX_ const U8 *p)
7394{
7395 /* Unlike most other similarly named routines here, this does not create a
7396 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7397
7398 dVAR;
7399
7400 UV cp = valid_utf8_to_uvchr(p, NULL);
7401
7402 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7403
7404 /* The earliest Unicode releases did not have these precomposed Hangul
7405 * syllables. Set to point to undef in that case, so will return false on
7406 * every call */
7407 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7408 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7409 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7410 SvREFCNT_dec(PL_utf8_X_LVT);
7411 PL_utf8_X_LVT = &PL_sv_undef;
7412 }
7413 }
7414
7415 return (PL_utf8_X_LVT != &PL_sv_undef
7416 && cp >= SBASE && cp < SBASE + SCount
7417 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7418}
7419
66610fdd
RGS
7420/*
7421 * Local variables:
7422 * c-indentation-style: bsd
7423 * c-basic-offset: 4
14d04a33 7424 * indent-tabs-mode: nil
66610fdd
RGS
7425 * End:
7426 *
14d04a33 7427 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7428 */