This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex: Splice out no longer used array element
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
42const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
44
a687059c 45/*
e50aee73 46 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
47 *
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
50 *
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
54 *
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
57 * from defects in it.
58 *
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
61 *
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
64 *
65 **** Alterations to Henry's code are...
66 ****
4bb101f2 67 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
68 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
69 **** by Larry Wall and others
a687059c 70 ****
9ef589d8
LW
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
a687059c
LW
73 *
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
77 */
78#include "EXTERN.h"
864dbfa3 79#define PERL_IN_REGEXEC_C
a687059c 80#include "perl.h"
0f5d15d6 81
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
81e983c1 88#include "inline_invlist.c"
1b0f46bf 89#include "unicode_constants.h"
81e983c1 90
ef07e810 91#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 92#define RF_warned 2 /* warned about big count? */
faec1544 93
ab3bbdeb 94#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 95
f2ed9b32 96#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
e0193e47 104/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
af364d03
KW
107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 115#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad
YO
117#define HOPc(pos,off) \
118 (char *)(PL_reg_match_utf8 \
52657f30 119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
120 : (U8*)(pos + off))
121#define HOPBACKc(pos, off) \
07be1b83
YO
122 (char*)(PL_reg_match_utf8\
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
8e11feef 125 ? (U8*)pos - off \
3dab1dad 126 : NULL)
efb30f32 127
e7409c1b 128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 130
7016d6eb
DM
131
132#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
133#define NEXTCHR_IS_EOS (nextchr < 0)
134
135#define SET_nextchr \
136 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
137
138#define SET_locinput(p) \
139 locinput = (p); \
140 SET_nextchr
141
142
20d0b1e9 143/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 144#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 145 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 146 bool ok; \
9c4fdda1 147 ENTER; save_re_context(); \
cf54f63a
JL
148 ok=CAT2(is_utf8_,class)((const U8*)str); \
149 PERL_UNUSED_VAR(ok); \
150 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
151/* Doesn't do an assert to verify that is correct */
152#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 153 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 154 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
155 ENTER; save_re_context(); \
156 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
157 LEAVE; } } STMT_END
37e2e78e 158
1a4fad37
AL
159#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
160#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
161#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 162
37e2e78e 163#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
164 /* No asserts are done for some of these, in case called on a */ \
165 /* Unicode version in which they map to nothing */ \
27d4fc33 166 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 167 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 168
1dcf4a1b 169#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 170
ee9a90b8
KW
171/* The actual code for CCC_TRY, which uses several variables from the routine
172 * it's callable from. It is designed to be the bulk of a case statement.
173 * FUNC is the macro or function to call on non-utf8 targets that indicate if
174 * nextchr matches the class.
175 * UTF8_TEST is the whole test string to use for utf8 targets
176 * LOAD is what to use to test, and if not present to load in the swash for the
177 * class
178 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
179 * UTF8_TEST test.
180 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
181 * utf8 and a variant, load the swash if necessary and test using the utf8
182 * test. Advance to the next character if test is ok, otherwise fail; If not
183 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
184 * fails, or advance to the next character */
185
186#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
7016d6eb 187 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
188 sayNO; \
189 } \
190 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
191 LOAD_UTF8_CHARCLASS(CLASS, STR); \
192 if (POS_OR_NEG (UTF8_TEST)) { \
193 sayNO; \
194 } \
ee9a90b8 195 } \
28b98f76
DM
196 else if (POS_OR_NEG (FUNC(nextchr))) { \
197 sayNO; \
ee9a90b8 198 } \
28b98f76 199 goto increment_locinput;
980866de 200
ee9a90b8
KW
201/* Handle the non-locale cases for a character class and its complement. It
202 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
203 * This is because that code fails when the test succeeds, so we want to have
204 * the test fail so that the code succeeds. The swash is stored in a
205 * predictable PL_ place */
cfaf538b
KW
206#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
207 CLASS, STR) \
ee9a90b8
KW
208 case NAME: \
209 _CCC_TRY_CODE( !, FUNC, \
210 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
211 (U8*)locinput, TRUE)), \
212 CLASS, STR) \
213 case NNAME: \
1dcf4a1b 214 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218
219/* Generate the case statements for both locale and non-locale character
220 * classes in regmatch for classes that don't have special unicode semantics.
221 * Locales don't use an immediate swash, but an intermediary special locale
222 * function that is called on the pointer to the current place in the input
223 * string. That function will resolve to needing the same swash. One might
224 * think that because we don't know what the locale will match, we shouldn't
225 * check with the swash loading function that it loaded properly; ie, that we
226 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
227 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
228 * irrelevant here */
229#define CCC_TRY(NAME, NNAME, FUNC, \
230 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 231 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
232 CLASS, STR) \
233 case NAMEL: \
234 PL_reg_flags |= RF_tainted; \
235 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
236 case NNAMEL: \
237 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
238 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
239 CLASS, STR) \
cfaf538b 240 case NAMEA: \
7016d6eb 241 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
242 sayNO; \
243 } \
244 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 245 locinput++; \
cfaf538b
KW
246 break; \
247 case NNAMEA: \
7016d6eb 248 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
249 sayNO; \
250 } \
28b98f76 251 goto increment_locinput; \
ee9a90b8
KW
252 /* Generate the non-locale cases */ \
253 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
254
255/* This is like CCC_TRY, but has an extra set of parameters for generating case
256 * statements to handle separate Unicode semantics nodes */
257#define CCC_TRY_U(NAME, NNAME, FUNC, \
258 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
259 NAMEU, NNAMEU, FUNCU, \
cfaf538b 260 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 261 CLASS, STR) \
cfaf538b
KW
262 CCC_TRY(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEA, NNAMEA, FUNCA, \
265 CLASS, STR) \
ee9a90b8 266 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 267
3dab1dad
YO
268/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
269
5f80c4cf 270/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
271/* it would be nice to rework regcomp.sym to generate this stuff. sigh
272 *
273 * NOTE that *nothing* that affects backtracking should be in here, specifically
274 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
275 * node that is in between two EXACT like nodes when ascertaining what the required
276 * "follow" character is. This should probably be moved to regex compile time
277 * although it may be done at run time beause of the REF possibility - more
278 * investigation required. -- demerphq
279*/
3e901dc0
YO
280#define JUMPABLE(rn) ( \
281 OP(rn) == OPEN || \
282 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
283 OP(rn) == EVAL || \
cca55fe3
JP
284 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
285 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 286 OP(rn) == KEEPS || \
3dab1dad 287 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 288)
ee9b8eae 289#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 290
ee9b8eae
YO
291#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
292
293#if 0
294/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
295 we don't need this definition. */
296#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 297#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
298#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
299
300#else
301/* ... so we use this as its faster. */
302#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 303#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
304#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
305#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
306
307#endif
e2d8ce26 308
a84d97b6
HS
309/*
310 Search for mandatory following text node; for lookahead, the text must
311 follow but for lookbehind (rn->flags != 0) we skip to the next step.
312*/
cca55fe3 313#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
314 while (JUMPABLE(rn)) { \
315 const OPCODE type = OP(rn); \
316 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 317 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 318 else if (type == PLUS) \
cca55fe3 319 rn = NEXTOPER(rn); \
3dab1dad 320 else if (type == IFMATCH) \
a84d97b6 321 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 322 else rn += NEXT_OFF(rn); \
3dab1dad 323 } \
5f80c4cf 324} STMT_END
74750237 325
c476f425 326
acfe0abc 327static void restore_pos(pTHX_ void *arg);
51371543 328
87c0511b 329#define REGCP_PAREN_ELEMS 3
f067efbf 330#define REGCP_OTHER_ELEMS 3
e0fa7e2b 331#define REGCP_FRAME_ELEMS 1
620d5b66
NC
332/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
333 * are needed for the regexp context stack bookkeeping. */
334
76e3520e 335STATIC CHECKPOINT
b93070ed 336S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 337{
97aff369 338 dVAR;
a3b680e6 339 const int retval = PL_savestack_ix;
a3b680e6 340 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
341 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
342 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 343 I32 p;
40a82448 344 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 345
b93070ed
DM
346 PERL_ARGS_ASSERT_REGCPPUSH;
347
e49a9654 348 if (paren_elems_to_push < 0)
5637ef5b
NC
349 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
350 paren_elems_to_push);
e49a9654 351
e0fa7e2b
NC
352 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
353 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
354 " out of range (%lu-%ld)",
355 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 356
620d5b66 357 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 358
495f47a5
DM
359 DEBUG_BUFFERS_r(
360 if ((int)PL_regsize > (int)parenfloor)
361 PerlIO_printf(Perl_debug_log,
362 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
363 PTR2UV(rex),
364 PTR2UV(rex->offs)
365 );
366 );
87c0511b 367 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 368/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
369 SSPUSHINT(rex->offs[p].end);
370 SSPUSHINT(rex->offs[p].start);
1ca2007e 371 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
374 (UV)p,
375 (IV)rex->offs[p].start,
376 (IV)rex->offs[p].start_tmp,
377 (IV)rex->offs[p].end
40a82448 378 ));
a0d0e21e 379 }
b1ce53c5 380/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 381 SSPUSHINT(PL_regsize);
b93070ed
DM
382 SSPUSHINT(rex->lastparen);
383 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 384 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 385
a0d0e21e
LW
386 return retval;
387}
388
c277df42 389/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
390#define REGCP_SET(cp) \
391 DEBUG_STATE_r( \
ab3bbdeb 392 PerlIO_printf(Perl_debug_log, \
e4f74956 393 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
394 (IV)PL_savestack_ix)); \
395 cp = PL_savestack_ix
c3464db5 396
ab3bbdeb 397#define REGCP_UNWIND(cp) \
e4f74956 398 DEBUG_STATE_r( \
ab3bbdeb 399 if (cp != PL_savestack_ix) \
e4f74956
YO
400 PerlIO_printf(Perl_debug_log, \
401 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
402 (IV)(cp), (IV)PL_savestack_ix)); \
403 regcpblow(cp)
c277df42 404
a8d1f4b4
DM
405#define UNWIND_PAREN(lp, lcp) \
406 for (n = rex->lastparen; n > lp; n--) \
407 rex->offs[n].end = -1; \
408 rex->lastparen = n; \
409 rex->lastcloseparen = lcp;
410
411
f067efbf 412STATIC void
b93070ed 413S_regcppop(pTHX_ regexp *rex)
a0d0e21e 414{
97aff369 415 dVAR;
e0fa7e2b 416 UV i;
87c0511b 417 U32 paren;
a3621e74
YO
418 GET_RE_DEBUG_FLAGS_DECL;
419
7918f24d
NC
420 PERL_ARGS_ASSERT_REGCPPOP;
421
b1ce53c5 422 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 423 i = SSPOPUV;
e0fa7e2b
NC
424 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
425 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
426 rex->lastcloseparen = SSPOPINT;
427 rex->lastparen = SSPOPINT;
3280af22 428 PL_regsize = SSPOPINT;
b1ce53c5 429
620d5b66 430 i -= REGCP_OTHER_ELEMS;
b1ce53c5 431 /* Now restore the parentheses context. */
495f47a5
DM
432 DEBUG_BUFFERS_r(
433 if (i || rex->lastparen + 1 <= rex->nparens)
434 PerlIO_printf(Perl_debug_log,
435 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
436 PTR2UV(rex),
437 PTR2UV(rex->offs)
438 );
439 );
87c0511b 440 paren = PL_regsize;
620d5b66 441 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 442 I32 tmps;
1ca2007e 443 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 444 rex->offs[paren].start = SSPOPINT;
cf93c79d 445 tmps = SSPOPINT;
b93070ed
DM
446 if (paren <= rex->lastparen)
447 rex->offs[paren].end = tmps;
495f47a5
DM
448 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
449 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
450 (UV)paren,
451 (IV)rex->offs[paren].start,
452 (IV)rex->offs[paren].start_tmp,
453 (IV)rex->offs[paren].end,
454 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 455 );
87c0511b 456 paren--;
a0d0e21e 457 }
daf18116 458#if 1
dafc8851
JH
459 /* It would seem that the similar code in regtry()
460 * already takes care of this, and in fact it is in
461 * a better location to since this code can #if 0-ed out
462 * but the code in regtry() is needed or otherwise tests
463 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
464 * (as of patchlevel 7877) will fail. Then again,
465 * this code seems to be necessary or otherwise
225593e1
DM
466 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
467 * --jhi updated by dapm */
b93070ed 468 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 469 if (i > PL_regsize)
b93070ed
DM
470 rex->offs[i].start = -1;
471 rex->offs[i].end = -1;
495f47a5
DM
472 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
473 " \\%"UVuf": %s ..-1 undeffing\n",
474 (UV)i,
475 (i > PL_regsize) ? "-1" : " "
476 ));
a0d0e21e 477 }
dafc8851 478#endif
a0d0e21e
LW
479}
480
74088413
DM
481/* restore the parens and associated vars at savestack position ix,
482 * but without popping the stack */
483
484STATIC void
485S_regcp_restore(pTHX_ regexp *rex, I32 ix)
486{
487 I32 tmpix = PL_savestack_ix;
488 PL_savestack_ix = ix;
489 regcppop(rex);
490 PL_savestack_ix = tmpix;
491}
492
02db2b7b 493#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 494
a687059c 495/*
e50aee73 496 * pregexec and friends
a687059c
LW
497 */
498
76234dfb 499#ifndef PERL_IN_XSUB_RE
a687059c 500/*
c277df42 501 - pregexec - match a regexp against a string
a687059c 502 */
c277df42 503I32
49d7dfbc 504Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 505 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
506/* stringarg: the point in the string at which to begin matching */
507/* strend: pointer to null at end of string */
508/* strbeg: real beginning of string */
509/* minend: end of match must be >= minend bytes after stringarg. */
510/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
511 * itself is accessed via the pointers above */
512/* nosave: For optimizations. */
c277df42 513{
7918f24d
NC
514 PERL_ARGS_ASSERT_PREGEXEC;
515
c277df42 516 return
9041c2e3 517 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
518 nosave ? 0 : REXEC_COPY_STR);
519}
76234dfb 520#endif
22e551b9 521
9041c2e3 522/*
cad2e5aa
JH
523 * Need to implement the following flags for reg_anch:
524 *
525 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
526 * USE_INTUIT_ML
527 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
528 * INTUIT_AUTORITATIVE_ML
529 * INTUIT_ONCE_NOML - Intuit can match in one location only.
530 * INTUIT_ONCE_ML
531 *
532 * Another flag for this function: SECOND_TIME (so that float substrs
533 * with giant delta may be not rechecked).
534 */
535
536/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
537
3f7c398e 538/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
539 Otherwise, only SvCUR(sv) is used to get strbeg. */
540
541/* XXXX We assume that strpos is strbeg unless sv. */
542
6eb5f6b9
JH
543/* XXXX Some places assume that there is a fixed substring.
544 An update may be needed if optimizer marks as "INTUITable"
545 RExen without fixed substrings. Similarly, it is assumed that
546 lengths of all the strings are no more than minlen, thus they
547 cannot come from lookahead.
40d049e4
YO
548 (Or minlen should take into account lookahead.)
549 NOTE: Some of this comment is not correct. minlen does now take account
550 of lookahead/behind. Further research is required. -- demerphq
551
552*/
6eb5f6b9 553
2c2d71f5
JH
554/* A failure to find a constant substring means that there is no need to make
555 an expensive call to REx engine, thus we celebrate a failure. Similarly,
556 finding a substring too deep into the string means that less calls to
30944b6d
IZ
557 regtry() should be needed.
558
559 REx compiler's optimizer found 4 possible hints:
560 a) Anchored substring;
561 b) Fixed substring;
562 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 563 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 564 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
565 string which does not contradict any of them.
566 */
2c2d71f5 567
6eb5f6b9
JH
568/* Most of decisions we do here should have been done at compile time.
569 The nodes of the REx which we used for the search should have been
570 deleted from the finite automaton. */
571
cad2e5aa 572char *
288b8c02 573Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 574 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 575{
97aff369 576 dVAR;
288b8c02 577 struct regexp *const prog = (struct regexp *)SvANY(rx);
eb578fdb 578 I32 start_shift = 0;
cad2e5aa 579 /* Should be nonnegative! */
eb578fdb
KW
580 I32 end_shift = 0;
581 char *s;
582 SV *check;
a1933d95 583 char *strbeg;
cad2e5aa 584 char *t;
f2ed9b32 585 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 586 I32 ml_anch;
eb578fdb 587 char *other_last = NULL; /* other substr checked before this */
bd61b366 588 char *check_at = NULL; /* check substr found at this pos */
d8080198 589 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 590 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 591 RXi_GET_DECL(prog,progi);
30944b6d 592#ifdef DEBUGGING
890ce7af 593 const char * const i_strpos = strpos;
30944b6d 594#endif
a3621e74
YO
595 GET_RE_DEBUG_FLAGS_DECL;
596
7918f24d 597 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
598 PERL_UNUSED_ARG(flags);
599 PERL_UNUSED_ARG(data);
7918f24d 600
f2ed9b32 601 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 602
3c8556c3 603 if (RX_UTF8(rx)) {
b8d68ded
JH
604 PL_reg_flags |= RF_utf8;
605 }
ab3bbdeb 606 DEBUG_EXECUTE_r(
f2ed9b32 607 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
608 sv ? "Guessing start of match in sv for"
609 : "Guessing start of match in string for");
2a782b5b 610 );
cad2e5aa 611
c344f387
JH
612 /* CHR_DIST() would be more correct here but it makes things slow. */
613 if (prog->minlen > strend - strpos) {
a3621e74 614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 615 "String too short... [re_intuit_start]\n"));
cad2e5aa 616 goto fail;
2c2d71f5 617 }
1de06328 618
7016d6eb
DM
619 /* XXX we need to pass strbeg as a separate arg: the following is
620 * guesswork and can be wrong... */
621 if (sv && SvPOK(sv)) {
622 char * p = SvPVX(sv);
623 STRLEN cur = SvCUR(sv);
624 if (p <= strpos && strpos < p + cur) {
625 strbeg = p;
626 assert(p <= strend && strend <= p + cur);
627 }
628 else
629 strbeg = strend - cur;
630 }
631 else
632 strbeg = strpos;
633
1aa99e6b 634 PL_regeol = strend;
f2ed9b32 635 if (utf8_target) {
33b8afdf
JH
636 if (!prog->check_utf8 && prog->check_substr)
637 to_utf8_substr(prog);
638 check = prog->check_utf8;
639 } else {
7e0d5ad7
KW
640 if (!prog->check_substr && prog->check_utf8) {
641 if (! to_byte_substr(prog)) {
642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
643 non_utf8_target_but_utf8_required));
644 goto fail;
645 }
646 }
33b8afdf
JH
647 check = prog->check_substr;
648 }
bbe252da
YO
649 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
650 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
651 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 652 && !multiline ) ); /* Check after \n? */
cad2e5aa 653
7e25d62c 654 if (!ml_anch) {
bbe252da
YO
655 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
656 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 657 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
658 && sv && !SvROK(sv)
659 && (strpos != strbeg)) {
a3621e74 660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
661 goto fail;
662 }
663 if (prog->check_offset_min == prog->check_offset_max &&
bbe252da 664 !(prog->extflags & RXf_CANY_SEEN)) {
2c2d71f5 665 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
666 I32 slen;
667
1aa99e6b 668 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 669
653099ff
GS
670 if (SvTAIL(check)) {
671 slen = SvCUR(check); /* >= 1 */
cad2e5aa 672
9041c2e3 673 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 674 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 676 goto fail_finish;
cad2e5aa
JH
677 }
678 /* Now should match s[0..slen-2] */
679 slen--;
3f7c398e 680 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 681 || (slen > 1
3f7c398e 682 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 683 report_neq:
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
685 goto fail_finish;
686 }
cad2e5aa 687 }
3f7c398e 688 else if (*SvPVX_const(check) != *s
653099ff 689 || ((slen = SvCUR(check)) > 1
3f7c398e 690 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 691 goto report_neq;
c315bfe8 692 check_at = s;
2c2d71f5 693 goto success_at_start;
7e25d62c 694 }
cad2e5aa 695 }
2c2d71f5 696 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 697 s = strpos;
2c2d71f5 698 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
699 end_shift = prog->check_end_shift;
700
2c2d71f5 701 if (!ml_anch) {
a3b680e6 702 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 703 - (SvTAIL(check) != 0);
a3b680e6 704 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
705
706 if (end_shift < eshift)
707 end_shift = eshift;
708 }
cad2e5aa 709 }
2c2d71f5 710 else { /* Can match at random position */
cad2e5aa
JH
711 ml_anch = 0;
712 s = strpos;
1de06328
YO
713 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
714 end_shift = prog->check_end_shift;
715
716 /* end shift should be non negative here */
cad2e5aa
JH
717 }
718
bcdf7404 719#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 720 if (end_shift < 0)
1de06328 721 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 722 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
723#endif
724
2c2d71f5
JH
725 restart:
726 /* Find a possible match in the region s..strend by looking for
727 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
728
729 {
730 I32 srch_start_shift = start_shift;
731 I32 srch_end_shift = end_shift;
c33e64f0
FC
732 U8* start_point;
733 U8* end_point;
1de06328
YO
734 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
735 srch_end_shift -= ((strbeg - s) - srch_start_shift);
736 srch_start_shift = strbeg - s;
737 }
6bda09f9 738 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
739 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
740 (IV)prog->check_offset_min,
741 (IV)srch_start_shift,
742 (IV)srch_end_shift,
743 (IV)prog->check_end_shift);
744 });
745
bbe252da 746 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
747 start_point= (U8*)(s + srch_start_shift);
748 end_point= (U8*)(strend - srch_end_shift);
749 } else {
750 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
751 end_point= HOP3(strend, -srch_end_shift, strbeg);
752 }
6bda09f9 753 DEBUG_OPTIMISE_MORE_r({
56570a2c 754 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 755 (int)(end_point - start_point),
fc8cd66c 756 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
757 start_point);
758 });
759
760 s = fbm_instr( start_point, end_point,
7fba1cd6 761 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 762 }
cad2e5aa
JH
763 /* Update the count-of-usability, remove useless subpatterns,
764 unshift s. */
2c2d71f5 765
ab3bbdeb 766 DEBUG_EXECUTE_r({
f2ed9b32 767 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
768 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
769 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 770 (s ? "Found" : "Did not find"),
f2ed9b32 771 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
772 ? "anchored" : "floating"),
773 quoted,
774 RE_SV_TAIL(check),
775 (s ? " at offset " : "...\n") );
776 });
2c2d71f5
JH
777
778 if (!s)
779 goto fail_finish;
2c2d71f5 780 /* Finish the diagnostic message */
a3621e74 781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 782
1de06328
YO
783 /* XXX dmq: first branch is for positive lookbehind...
784 Our check string is offset from the beginning of the pattern.
785 So we need to do any stclass tests offset forward from that
786 point. I think. :-(
787 */
788
789
790
791 check_at=s;
792
793
2c2d71f5
JH
794 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
795 Start with the other substr.
796 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 797 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
798 *always* match. Probably should be marked during compile...
799 Probably it is right to do no SCREAM here...
800 */
801
f2ed9b32 802 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
803 : (prog->float_substr && prog->anchored_substr))
804 {
30944b6d 805 /* Take into account the "other" substring. */
2c2d71f5
JH
806 /* XXXX May be hopelessly wrong for UTF... */
807 if (!other_last)
6eb5f6b9 808 other_last = strpos;
f2ed9b32 809 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
810 do_other_anchored:
811 {
890ce7af
AL
812 char * const last = HOP3c(s, -start_shift, strbeg);
813 char *last1, *last2;
be8e71aa 814 char * const saved_s = s;
33b8afdf 815 SV* must;
2c2d71f5 816
2c2d71f5
JH
817 t = s - prog->check_offset_max;
818 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 819 && (!utf8_target
0ce71af7 820 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 821 && t > strpos)))
6f207bd3 822 NOOP;
2c2d71f5
JH
823 else
824 t = strpos;
1aa99e6b 825 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
826 if (t < other_last) /* These positions already checked */
827 t = other_last;
1aa99e6b 828 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
829 if (last < last1)
830 last1 = last;
1de06328
YO
831 /* XXXX It is not documented what units *_offsets are in.
832 We assume bytes, but this is clearly wrong.
833 Meaning this code needs to be carefully reviewed for errors.
834 dmq.
835 */
836
2c2d71f5 837 /* On end-of-str: see comment below. */
f2ed9b32 838 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
839 if (must == &PL_sv_undef) {
840 s = (char*)NULL;
1de06328 841 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
842 }
843 else
844 s = fbm_instr(
845 (unsigned char*)t,
846 HOP3(HOP3(last1, prog->anchored_offset, strend)
847 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
848 must,
7fba1cd6 849 multiline ? FBMrf_MULTILINE : 0
33b8afdf 850 );
ab3bbdeb 851 DEBUG_EXECUTE_r({
f2ed9b32 852 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
853 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
854 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 855 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
856 quoted, RE_SV_TAIL(must));
857 });
858
859
2c2d71f5
JH
860 if (!s) {
861 if (last1 >= last2) {
a3621e74 862 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
863 ", giving up...\n"));
864 goto fail_finish;
865 }
a3621e74 866 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 867 ", trying floating at offset %ld...\n",
be8e71aa 868 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
869 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
870 s = HOP3c(last, 1, strend);
2c2d71f5
JH
871 goto restart;
872 }
873 else {
a3621e74 874 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 875 (long)(s - i_strpos)));
1aa99e6b
IH
876 t = HOP3c(s, -prog->anchored_offset, strbeg);
877 other_last = HOP3c(s, 1, strend);
be8e71aa 878 s = saved_s;
2c2d71f5
JH
879 if (t == strpos)
880 goto try_at_start;
2c2d71f5
JH
881 goto try_at_offset;
882 }
30944b6d 883 }
2c2d71f5
JH
884 }
885 else { /* Take into account the floating substring. */
33b8afdf 886 char *last, *last1;
be8e71aa 887 char * const saved_s = s;
33b8afdf
JH
888 SV* must;
889
890 t = HOP3c(s, -start_shift, strbeg);
891 last1 = last =
892 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
893 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
894 last = HOP3c(t, prog->float_max_offset, strend);
895 s = HOP3c(t, prog->float_min_offset, strend);
896 if (s < other_last)
897 s = other_last;
2c2d71f5 898 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 899 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
900 /* fbm_instr() takes into account exact value of end-of-str
901 if the check is SvTAIL(ed). Since false positives are OK,
902 and end-of-str is not later than strend we are OK. */
903 if (must == &PL_sv_undef) {
904 s = (char*)NULL;
1de06328 905 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
906 }
907 else
2c2d71f5 908 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
909 (unsigned char*)last + SvCUR(must)
910 - (SvTAIL(must)!=0),
7fba1cd6 911 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 912 DEBUG_EXECUTE_r({
f2ed9b32 913 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
914 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
915 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 916 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
917 quoted, RE_SV_TAIL(must));
918 });
33b8afdf
JH
919 if (!s) {
920 if (last1 == last) {
a3621e74 921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
922 ", giving up...\n"));
923 goto fail_finish;
2c2d71f5 924 }
a3621e74 925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 926 ", trying anchored starting at offset %ld...\n",
be8e71aa 927 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
928 other_last = last;
929 s = HOP3c(t, 1, strend);
930 goto restart;
931 }
932 else {
a3621e74 933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
934 (long)(s - i_strpos)));
935 other_last = s; /* Fix this later. --Hugo */
be8e71aa 936 s = saved_s;
33b8afdf
JH
937 if (t == strpos)
938 goto try_at_start;
939 goto try_at_offset;
940 }
2c2d71f5 941 }
cad2e5aa 942 }
2c2d71f5 943
1de06328 944
9ef43ace 945 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 946
6bda09f9 947 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
948 PerlIO_printf(Perl_debug_log,
949 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
950 (IV)prog->check_offset_min,
951 (IV)prog->check_offset_max,
952 (IV)(s-strpos),
953 (IV)(t-strpos),
954 (IV)(t-s),
955 (IV)(strend-strpos)
956 )
957 );
958
2c2d71f5 959 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 960 && (!utf8_target
9ef43ace 961 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
962 && t > strpos)))
963 {
2c2d71f5
JH
964 /* Fixed substring is found far enough so that the match
965 cannot start at strpos. */
966 try_at_offset:
cad2e5aa 967 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
968 /* Eventually fbm_*() should handle this, but often
969 anchored_offset is not 0, so this check will not be wasted. */
970 /* XXXX In the code below we prefer to look for "^" even in
971 presence of anchored substrings. And we search even
972 beyond the found float position. These pessimizations
973 are historical artefacts only. */
974 find_anchor:
2c2d71f5 975 while (t < strend - prog->minlen) {
cad2e5aa 976 if (*t == '\n') {
4ee3650e 977 if (t < check_at - prog->check_offset_min) {
f2ed9b32 978 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
979 /* Since we moved from the found position,
980 we definitely contradict the found anchored
30944b6d
IZ
981 substr. Due to the above check we do not
982 contradict "check" substr.
983 Thus we can arrive here only if check substr
984 is float. Redo checking for "other"=="fixed".
985 */
9041c2e3 986 strpos = t + 1;
a3621e74 987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 988 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
989 goto do_other_anchored;
990 }
4ee3650e
GS
991 /* We don't contradict the found floating substring. */
992 /* XXXX Why not check for STCLASS? */
cad2e5aa 993 s = t + 1;
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
996 goto set_useful;
997 }
4ee3650e
GS
998 /* Position contradicts check-string */
999 /* XXXX probably better to look for check-string
1000 than for "\n", so one should lower the limit for t? */
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1002 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1003 other_last = strpos = s = t + 1;
cad2e5aa
JH
1004 goto restart;
1005 }
1006 t++;
1007 }
a3621e74 1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1009 PL_colors[0], PL_colors[1]));
2c2d71f5 1010 goto fail_finish;
cad2e5aa 1011 }
f5952150 1012 else {
a3621e74 1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1014 PL_colors[0], PL_colors[1]));
f5952150 1015 }
cad2e5aa
JH
1016 s = t;
1017 set_useful:
f2ed9b32 1018 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1019 }
1020 else {
f5952150 1021 /* The found string does not prohibit matching at strpos,
2c2d71f5 1022 - no optimization of calling REx engine can be performed,
f5952150
GS
1023 unless it was an MBOL and we are not after MBOL,
1024 or a future STCLASS check will fail this. */
2c2d71f5
JH
1025 try_at_start:
1026 /* Even in this situation we may use MBOL flag if strpos is offset
1027 wrt the start of the string. */
05b4157f 1028 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1029 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1030 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1031 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1032 {
cad2e5aa
JH
1033 t = strpos;
1034 goto find_anchor;
1035 }
a3621e74 1036 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1037 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1038 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1039 );
2c2d71f5 1040 success_at_start:
bbe252da 1041 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1042 && (utf8_target ? (
33b8afdf
JH
1043 prog->check_utf8 /* Could be deleted already */
1044 && --BmUSEFUL(prog->check_utf8) < 0
1045 && (prog->check_utf8 == prog->float_utf8)
1046 ) : (
1047 prog->check_substr /* Could be deleted already */
1048 && --BmUSEFUL(prog->check_substr) < 0
1049 && (prog->check_substr == prog->float_substr)
1050 )))
66e933ab 1051 {
cad2e5aa 1052 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1053 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1054 /* XXX Does the destruction order has to change with utf8_target? */
1055 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1056 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1057 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1058 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1059 check = NULL; /* abort */
cad2e5aa 1060 s = strpos;
486ec47a 1061 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1062 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1063 if (prog->intflags & PREGf_IMPLICIT)
1064 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1065 /* XXXX This is a remnant of the old implementation. It
1066 looks wasteful, since now INTUIT can use many
6eb5f6b9 1067 other heuristics. */
bbe252da 1068 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1069 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1070 }
1071 else
1072 s = strpos;
1073 }
1074
6eb5f6b9
JH
1075 /* Last resort... */
1076 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1077 /* trie stclasses are too expensive to use here, we are better off to
1078 leave it to regmatch itself */
f8fc2ecf 1079 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1080 /* minlen == 0 is possible if regstclass is \b or \B,
1081 and the fixed substr is ''$.
1082 Since minlen is already taken into account, s+1 is before strend;
1083 accidentally, minlen >= 1 guaranties no false positives at s + 1
1084 even for \b or \B. But (minlen? 1 : 0) below assumes that
1085 regstclass does not come from lookahead... */
1086 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1087 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1088 const U8* const str = (U8*)STRING(progi->regstclass);
1089 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1090 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1091 : 1);
1de06328
YO
1092 char * endpos;
1093 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1094 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1095 else if (prog->float_substr || prog->float_utf8)
1096 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1097 else
1098 endpos= strend;
1099
d8080198
YO
1100 if (checked_upto < s)
1101 checked_upto = s;
1102 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104
6eb5f6b9 1105 t = s;
d8080198
YO
1106 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1107 if (s) {
1108 checked_upto = s;
1109 } else {
6eb5f6b9 1110#ifdef DEBUGGING
cbbf8932 1111 const char *what = NULL;
6eb5f6b9
JH
1112#endif
1113 if (endpos == strend) {
a3621e74 1114 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1115 "Could not match STCLASS...\n") );
1116 goto fail;
1117 }
a3621e74 1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1119 "This position contradicts STCLASS...\n") );
bbe252da 1120 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1121 goto fail;
d8080198
YO
1122 checked_upto = HOPBACKc(endpos, start_shift);
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1124 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1125 /* Contradict one of substrings */
33b8afdf 1126 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1127 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1128 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1129 hop_and_restart:
1aa99e6b 1130 s = HOP3c(t, 1, strend);
66e933ab
GS
1131 if (s + start_shift + end_shift > strend) {
1132 /* XXXX Should be taken into account earlier? */
a3621e74 1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1134 "Could not match STCLASS...\n") );
1135 goto fail;
1136 }
5e39e1e5
HS
1137 if (!check)
1138 goto giveup;
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1140 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1141 what, (long)(s + start_shift - i_strpos)) );
1142 goto restart;
1143 }
66e933ab 1144 /* Have both, check_string is floating */
6eb5f6b9
JH
1145 if (t + start_shift >= check_at) /* Contradicts floating=check */
1146 goto retry_floating_check;
1147 /* Recheck anchored substring, but not floating... */
9041c2e3 1148 s = check_at;
5e39e1e5
HS
1149 if (!check)
1150 goto giveup;
a3621e74 1151 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1152 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1153 (long)(other_last - i_strpos)) );
1154 goto do_other_anchored;
1155 }
60e71179
GS
1156 /* Another way we could have checked stclass at the
1157 current position only: */
1158 if (ml_anch) {
1159 s = t = t + 1;
5e39e1e5
HS
1160 if (!check)
1161 goto giveup;
a3621e74 1162 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1163 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1164 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1165 goto try_at_offset;
66e933ab 1166 }
f2ed9b32 1167 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1168 goto fail;
486ec47a 1169 /* Check is floating substring. */
6eb5f6b9
JH
1170 retry_floating_check:
1171 t = check_at - start_shift;
a3621e74 1172 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1173 goto hop_and_restart;
1174 }
b7953727 1175 if (t != s) {
a3621e74 1176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1177 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1178 (long)(t - i_strpos), (long)(s - i_strpos))
1179 );
1180 }
1181 else {
a3621e74 1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1183 "Does not contradict STCLASS...\n");
1184 );
1185 }
6eb5f6b9 1186 }
5e39e1e5 1187 giveup:
a3621e74 1188 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1189 PL_colors[4], (check ? "Guessed" : "Giving up"),
1190 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1191 return s;
2c2d71f5
JH
1192
1193 fail_finish: /* Substring not found */
33b8afdf 1194 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1195 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1196 fail:
a3621e74 1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1198 PL_colors[4], PL_colors[5]));
bd61b366 1199 return NULL;
cad2e5aa 1200}
9661b544 1201
a0a388a1
YO
1202#define DECL_TRIE_TYPE(scan) \
1203 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1204 trie_type = ((scan->flags == EXACT) \
1205 ? (utf8_target ? trie_utf8 : trie_plain) \
1206 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1207
1208#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1209uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1210 STRLEN skiplen; \
1211 switch (trie_type) { \
1212 case trie_utf8_fold: \
1213 if ( foldlen>0 ) { \
1214 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1215 foldlen -= len; \
1216 uscan += len; \
1217 len=0; \
1218 } else { \
1219 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1220 len = UTF8SKIP(uc); \
1221 skiplen = UNISKIP( uvc ); \
1222 foldlen -= skiplen; \
1223 uscan = foldbuf + skiplen; \
1224 } \
1225 break; \
1226 case trie_latin_utf8_fold: \
1227 if ( foldlen>0 ) { \
1228 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1229 foldlen -= len; \
1230 uscan += len; \
1231 len=0; \
1232 } else { \
1233 len = 1; \
1234 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1235 skiplen = UNISKIP( uvc ); \
1236 foldlen -= skiplen; \
1237 uscan = foldbuf + skiplen; \
1238 } \
1239 break; \
1240 case trie_utf8: \
1241 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1242 break; \
1243 case trie_plain: \
1244 uvc = (UV)*uc; \
1245 len = 1; \
1246 } \
1247 if (uvc < 256) { \
1248 charid = trie->charmap[ uvc ]; \
1249 } \
1250 else { \
1251 charid = 0; \
1252 if (widecharmap) { \
1253 SV** const svpp = hv_fetch(widecharmap, \
1254 (char*)&uvc, sizeof(UV), 0); \
1255 if (svpp) \
1256 charid = (U16)SvIV(*svpp); \
1257 } \
1258 } \
4cadc6a9
YO
1259} STMT_END
1260
4cadc6a9
YO
1261#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1262STMT_START { \
1263 while (s <= e) { \
1264 if ( (CoNd) \
fac1af77 1265 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1266 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1267 goto got_it; \
1268 s++; \
1269 } \
1270} STMT_END
1271
1272#define REXEC_FBC_UTF8_SCAN(CoDe) \
1273STMT_START { \
7016d6eb 1274 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1275 CoDe \
1276 s += uskip; \
1277 } \
1278} STMT_END
1279
1280#define REXEC_FBC_SCAN(CoDe) \
1281STMT_START { \
1282 while (s < strend) { \
1283 CoDe \
1284 s++; \
1285 } \
1286} STMT_END
1287
1288#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1289REXEC_FBC_UTF8_SCAN( \
1290 if (CoNd) { \
24b23f37 1291 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1292 goto got_it; \
1293 else \
1294 tmp = doevery; \
1295 } \
1296 else \
1297 tmp = 1; \
1298)
1299
1300#define REXEC_FBC_CLASS_SCAN(CoNd) \
1301REXEC_FBC_SCAN( \
1302 if (CoNd) { \
24b23f37 1303 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1304 goto got_it; \
1305 else \
1306 tmp = doevery; \
1307 } \
1308 else \
1309 tmp = 1; \
1310)
1311
1312#define REXEC_FBC_TRYIT \
24b23f37 1313if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1314 goto got_it
1315
e1d1eefb 1316#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1317 if (utf8_target) { \
e1d1eefb
YO
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1319 } \
1320 else { \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1322 }
e1d1eefb 1323
4cadc6a9 1324#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1325 if (utf8_target) { \
4cadc6a9
YO
1326 UtFpReLoAd; \
1327 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1328 } \
1329 else { \
1330 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1331 }
4cadc6a9
YO
1332
1333#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1334 PL_reg_flags |= RF_tainted; \
f2ed9b32 1335 if (utf8_target) { \
4cadc6a9
YO
1336 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1337 } \
1338 else { \
1339 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1340 }
4cadc6a9 1341
786e8c11
YO
1342#define DUMP_EXEC_POS(li,s,doutf8) \
1343 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1344
cfaf538b
KW
1345
1346#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1347 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1348 tmp = TEST_NON_UTF8(tmp); \
1349 REXEC_FBC_UTF8_SCAN( \
1350 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1351 tmp = !tmp; \
1352 IF_SUCCESS; \
1353 } \
1354 else { \
1355 IF_FAIL; \
1356 } \
1357 ); \
1358
1359#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1360 if (s == PL_bostr) { \
1361 tmp = '\n'; \
1362 } \
1363 else { \
1364 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1365 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1366 } \
1367 tmp = TeSt1_UtF8; \
1368 LOAD_UTF8_CHARCLASS_ALNUM(); \
1369 REXEC_FBC_UTF8_SCAN( \
1370 if (tmp == ! (TeSt2_UtF8)) { \
1371 tmp = !tmp; \
1372 IF_SUCCESS; \
1373 } \
1374 else { \
1375 IF_FAIL; \
1376 } \
1377 ); \
1378
63ac0dad
KW
1379/* The only difference between the BOUND and NBOUND cases is that
1380 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1381 * NBOUND. This is accomplished by passing it in either the if or else clause,
1382 * with the other one being empty */
1383#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1384 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1385
1386#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1387 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1388
1389#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1391
1392#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1394
63ac0dad
KW
1395
1396/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1397 * be passed in completely with the variable name being tested, which isn't
1398 * such a clean interface, but this is easier to read than it was before. We
1399 * are looking for the boundary (or non-boundary between a word and non-word
1400 * character. The utf8 and non-utf8 cases have the same logic, but the details
1401 * must be different. Find the "wordness" of the character just prior to this
1402 * one, and compare it with the wordness of this one. If they differ, we have
1403 * a boundary. At the beginning of the string, pretend that the previous
1404 * character was a new-line */
cfaf538b 1405#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1406 if (utf8_target) { \
cfaf538b 1407 UTF8_CODE \
63ac0dad
KW
1408 } \
1409 else { /* Not utf8 */ \
1410 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1411 tmp = TEST_NON_UTF8(tmp); \
1412 REXEC_FBC_SCAN( \
1413 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1414 tmp = !tmp; \
1415 IF_SUCCESS; \
1416 } \
1417 else { \
1418 IF_FAIL; \
1419 } \
1420 ); \
1421 } \
1422 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1423 goto got_it;
1424
786e8c11
YO
1425/* We know what class REx starts with. Try to find this position... */
1426/* if reginfo is NULL, its a dryrun */
1427/* annoyingly all the vars in this routine have different names from their counterparts
1428 in regmatch. /grrr */
1429
3c3eec57 1430STATIC char *
07be1b83 1431S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1432 const char *strend, regmatch_info *reginfo)
a687059c 1433{
27da23d5 1434 dVAR;
bbe252da 1435 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1436 char *pat_string; /* The pattern's exactish string */
1437 char *pat_end; /* ptr to end char of pat_string */
1438 re_fold_t folder; /* Function for computing non-utf8 folds */
1439 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1440 STRLEN ln;
5dab1207 1441 STRLEN lnc;
eb578fdb 1442 STRLEN uskip;
fac1af77
KW
1443 U8 c1;
1444 U8 c2;
6eb5f6b9 1445 char *e;
eb578fdb
KW
1446 I32 tmp = 1; /* Scratch variable? */
1447 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1448 UV utf8_fold_flags = 0;
f8fc2ecf 1449 RXi_GET_DECL(prog,progi);
7918f24d
NC
1450
1451 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1452
6eb5f6b9
JH
1453 /* We know what class it must start with. */
1454 switch (OP(c)) {
6eb5f6b9 1455 case ANYOF:
e0193e47 1456 if (utf8_target) {
b1e3e569
KW
1457 STRLEN inclasslen = strend - s;
1458 REXEC_FBC_UTF8_CLASS_SCAN(
1459 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
388cc4de
HS
1460 }
1461 else {
6ef69d56 1462 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1463 }
6eb5f6b9 1464 break;
f33976b4 1465 case CANY:
4cadc6a9 1466 REXEC_FBC_SCAN(
24b23f37 1467 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1468 goto got_it;
1469 else
1470 tmp = doevery;
4cadc6a9 1471 );
f33976b4 1472 break;
2f7f8cb1
KW
1473
1474 case EXACTFA:
1475 if (UTF_PATTERN || utf8_target) {
1476 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1477 goto do_exactf_utf8;
1478 }
1479 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1480 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1481 goto do_exactf_non_utf8; /* isn't dealt with by these */
1482
6eb5f6b9 1483 case EXACTF:
62bf7766 1484 if (utf8_target) {
77a6d856
KW
1485
1486 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1487 utf8_fold_flags = 0;
fac1af77
KW
1488 goto do_exactf_utf8;
1489 }
1490 fold_array = PL_fold;
1491 folder = foldEQ;
1492 goto do_exactf_non_utf8;
1493
1494 case EXACTFL:
1495 if (UTF_PATTERN || utf8_target) {
17580e7a 1496 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1497 goto do_exactf_utf8;
1498 }
1499 fold_array = PL_fold_locale;
1500 folder = foldEQ_locale;
16d951b7
KW
1501 goto do_exactf_non_utf8;
1502
3c760661
KW
1503 case EXACTFU_SS:
1504 if (UTF_PATTERN) {
1505 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1506 }
1507 goto do_exactf_utf8;
1508
fab2782b 1509 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1510 case EXACTFU:
1511 if (UTF_PATTERN || utf8_target) {
77a6d856 1512 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1513 goto do_exactf_utf8;
1514 }
1515
1516 /* Any 'ss' in the pattern should have been replaced by regcomp,
1517 * so we don't have to worry here about this single special case
1518 * in the Latin1 range */
1519 fold_array = PL_fold_latin1;
1520 folder = foldEQ_latin1;
fac1af77
KW
1521
1522 /* FALL THROUGH */
1523
62bf7766
KW
1524 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1525 are no glitches with fold-length differences
1526 between the target string and pattern */
fac1af77
KW
1527
1528 /* The idea in the non-utf8 EXACTF* cases is to first find the
1529 * first character of the EXACTF* node and then, if necessary,
1530 * case-insensitively compare the full text of the node. c1 is the
1531 * first character. c2 is its fold. This logic will not work for
1532 * Unicode semantics and the german sharp ss, which hence should
1533 * not be compiled into a node that gets here. */
1534 pat_string = STRING(c);
1535 ln = STR_LEN(c); /* length to match in octets/bytes */
1536
8a90a8fe
KW
1537 /* We know that we have to match at least 'ln' bytes (which is the
1538 * same as characters, since not utf8). If we have to match 3
1539 * characters, and there are only 2 availabe, we know without
1540 * trying that it will fail; so don't start a match past the
1541 * required minimum number from the far end */
fac1af77
KW
1542 e = HOP3c(strend, -((I32)ln), s);
1543
1544 if (!reginfo && e < s) {
1545 e = s; /* Due to minlen logic of intuit() */
1546 }
1547
1548 c1 = *pat_string;
1549 c2 = fold_array[c1];
1550 if (c1 == c2) { /* If char and fold are the same */
1551 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1552 }
1553 else {
1554 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1555 }
1556 break;
1557
1558 do_exactf_utf8:
e067297c
KW
1559 {
1560 unsigned expansion;
1561
fac1af77
KW
1562
1563 /* If one of the operands is in utf8, we can't use the simpler
1564 * folding above, due to the fact that many different characters
1565 * can have the same fold, or portion of a fold, or different-
1566 * length fold */
1567 pat_string = STRING(c);
1568 ln = STR_LEN(c); /* length to match in octets/bytes */
1569 pat_end = pat_string + ln;
1570 lnc = (UTF_PATTERN) /* length to match in characters */
1571 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1572 : ln;
1573
e067297c
KW
1574 /* We have 'lnc' characters to match in the pattern, but because of
1575 * multi-character folding, each character in the target can match
1576 * up to 3 characters (Unicode guarantees it will never exceed
1577 * this) if it is utf8-encoded; and up to 2 if not (based on the
1578 * fact that the Latin 1 folds are already determined, and the
1579 * only multi-char fold in that range is the sharp-s folding to
1580 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1581 * string character. Adjust lnc accordingly, rounding up, so that
1582 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1583 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1584 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1585
1586 /* As in the non-UTF8 case, if we have to match 3 characters, and
1587 * only 2 are left, it's guaranteed to fail, so don't start a
1588 * match that would require us to go beyond the end of the string
1589 */
1590 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1591
1592 if (!reginfo && e < s) {
1593 e = s; /* Due to minlen logic of intuit() */
1594 }
1595
b33105db
KW
1596 /* XXX Note that we could recalculate e to stop the loop earlier,
1597 * as the worst case expansion above will rarely be met, and as we
1598 * go along we would usually find that e moves further to the left.
1599 * This would happen only after we reached the point in the loop
1600 * where if there were no expansion we should fail. Unclear if
1601 * worth the expense */
e067297c 1602
fac1af77
KW
1603 while (s <= e) {
1604 char *my_strend= (char *)strend;
1605 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1606 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1607 && (!reginfo || regtry(reginfo, &s)) )
1608 {
1609 goto got_it;
1610 }
bbdd8bad 1611 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1612 }
1613 break;
e067297c 1614 }
bbce6d69 1615 case BOUNDL:
3280af22 1616 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1617 FBC_BOUND(isALNUM_LC,
1618 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1619 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1620 break;
bbce6d69 1621 case NBOUNDL:
3280af22 1622 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1623 FBC_NBOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
1626 break;
1627 case BOUND:
1628 FBC_BOUND(isWORDCHAR,
1629 isALNUM_uni(tmp),
1630 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1631 break;
cfaf538b
KW
1632 case BOUNDA:
1633 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1634 isWORDCHAR_A(tmp),
1635 isWORDCHAR_A((U8*)s));
1636 break;
a0d0e21e 1637 case NBOUND:
63ac0dad
KW
1638 FBC_NBOUND(isWORDCHAR,
1639 isALNUM_uni(tmp),
1640 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1641 break;
cfaf538b
KW
1642 case NBOUNDA:
1643 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1644 isWORDCHAR_A(tmp),
1645 isWORDCHAR_A((U8*)s));
1646 break;
63ac0dad
KW
1647 case BOUNDU:
1648 FBC_BOUND(isWORDCHAR_L1,
1649 isALNUM_uni(tmp),
1650 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1651 break;
1652 case NBOUNDU:
1653 FBC_NBOUND(isWORDCHAR_L1,
1654 isALNUM_uni(tmp),
1655 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1656 break;
bbce6d69 1657 case ALNUML:
4cadc6a9
YO
1658 REXEC_FBC_CSCAN_TAINT(
1659 isALNUM_LC_utf8((U8*)s),
1660 isALNUM_LC(*s)
1661 );
6895a8aa 1662 break;
980866de
KW
1663 case ALNUMU:
1664 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1665 LOAD_UTF8_CHARCLASS_ALNUM(),
1666 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1667 isWORDCHAR_L1((U8) *s)
1668 );
6895a8aa 1669 break;
980866de
KW
1670 case ALNUM:
1671 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1672 LOAD_UTF8_CHARCLASS_ALNUM(),
1673 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1674 isWORDCHAR((U8) *s)
1675 );
6895a8aa 1676 break;
cfaf538b 1677 case ALNUMA:
8e9da4d4
KW
1678 /* Don't need to worry about utf8, as it can match only a single
1679 * byte invariant character */
cfaf538b 1680 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1681 break;
980866de
KW
1682 case NALNUMU:
1683 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1684 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1685 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1686 ! isWORDCHAR_L1((U8) *s)
1687 );
6895a8aa 1688 break;
a0d0e21e 1689 case NALNUM:
4cadc6a9 1690 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1691 LOAD_UTF8_CHARCLASS_ALNUM(),
1692 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1693 ! isALNUM(*s)
4cadc6a9 1694 );
6895a8aa 1695 break;
cfaf538b 1696 case NALNUMA:
8e9da4d4
KW
1697 REXEC_FBC_CSCAN(
1698 !isWORDCHAR_A(*s),
1699 !isWORDCHAR_A(*s)
1700 );
1701 break;
bbce6d69 1702 case NALNUML:
4cadc6a9
YO
1703 REXEC_FBC_CSCAN_TAINT(
1704 !isALNUM_LC_utf8((U8*)s),
1705 !isALNUM_LC(*s)
1706 );
6895a8aa 1707 break;
980866de
KW
1708 case SPACEU:
1709 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1710 LOAD_UTF8_CHARCLASS_SPACE(),
1711 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1712 isSPACE_L1((U8) *s)
1713 );
6895a8aa 1714 break;
a0d0e21e 1715 case SPACE:
4cadc6a9 1716 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1717 LOAD_UTF8_CHARCLASS_SPACE(),
1718 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1719 isSPACE((U8) *s)
4cadc6a9 1720 );
6895a8aa 1721 break;
cfaf538b 1722 case SPACEA:
8e9da4d4
KW
1723 /* Don't need to worry about utf8, as it can match only a single
1724 * byte invariant character */
cfaf538b 1725 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1726 break;
bbce6d69 1727 case SPACEL:
4cadc6a9 1728 REXEC_FBC_CSCAN_TAINT(
6bbba904 1729 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1730 isSPACE_LC(*s)
1731 );
6895a8aa 1732 break;
980866de
KW
1733 case NSPACEU:
1734 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1735 LOAD_UTF8_CHARCLASS_SPACE(),
1736 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1737 ! isSPACE_L1((U8) *s)
1738 );
6895a8aa 1739 break;
a0d0e21e 1740 case NSPACE:
4cadc6a9 1741 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1742 LOAD_UTF8_CHARCLASS_SPACE(),
1743 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1744 ! isSPACE((U8) *s)
4cadc6a9 1745 );
6895a8aa 1746 break;
cfaf538b 1747 case NSPACEA:
8e9da4d4
KW
1748 REXEC_FBC_CSCAN(
1749 !isSPACE_A(*s),
1750 !isSPACE_A(*s)
1751 );
1752 break;
bbce6d69 1753 case NSPACEL:
4cadc6a9 1754 REXEC_FBC_CSCAN_TAINT(
6bbba904 1755 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1756 !isSPACE_LC(*s)
1757 );
6895a8aa 1758 break;
a0d0e21e 1759 case DIGIT:
4cadc6a9 1760 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1761 LOAD_UTF8_CHARCLASS_DIGIT(),
1762 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1763 isDIGIT(*s)
1764 );
6895a8aa 1765 break;
cfaf538b 1766 case DIGITA:
8e9da4d4
KW
1767 /* Don't need to worry about utf8, as it can match only a single
1768 * byte invariant character */
cfaf538b 1769 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1770 break;
b8c5462f 1771 case DIGITL:
4cadc6a9
YO
1772 REXEC_FBC_CSCAN_TAINT(
1773 isDIGIT_LC_utf8((U8*)s),
1774 isDIGIT_LC(*s)
1775 );
6895a8aa 1776 break;
a0d0e21e 1777 case NDIGIT:
4cadc6a9 1778 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1779 LOAD_UTF8_CHARCLASS_DIGIT(),
1780 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1781 !isDIGIT(*s)
1782 );
6895a8aa 1783 break;
cfaf538b 1784 case NDIGITA:
8e9da4d4
KW
1785 REXEC_FBC_CSCAN(
1786 !isDIGIT_A(*s),
1787 !isDIGIT_A(*s)
1788 );
1789 break;
b8c5462f 1790 case NDIGITL:
4cadc6a9
YO
1791 REXEC_FBC_CSCAN_TAINT(
1792 !isDIGIT_LC_utf8((U8*)s),
1793 !isDIGIT_LC(*s)
1794 );
6895a8aa 1795 break;
e1d1eefb
YO
1796 case LNBREAK:
1797 REXEC_FBC_CSCAN(
7016d6eb
DM
1798 is_LNBREAK_utf8_safe(s, strend),
1799 is_LNBREAK_latin1_safe(s, strend)
e1d1eefb 1800 );
6895a8aa 1801 break;
e1d1eefb
YO
1802 case VERTWS:
1803 REXEC_FBC_CSCAN(
7016d6eb
DM
1804 is_VERTWS_utf8_safe(s, strend),
1805 is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1806 );
6895a8aa 1807 break;
e1d1eefb
YO
1808 case NVERTWS:
1809 REXEC_FBC_CSCAN(
7016d6eb
DM
1810 !is_VERTWS_utf8_safe(s, strend),
1811 !is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1812 );
6895a8aa 1813 break;
e1d1eefb
YO
1814 case HORIZWS:
1815 REXEC_FBC_CSCAN(
7016d6eb
DM
1816 is_HORIZWS_utf8_safe(s, strend),
1817 is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1818 );
6895a8aa 1819 break;
e1d1eefb
YO
1820 case NHORIZWS:
1821 REXEC_FBC_CSCAN(
7016d6eb
DM
1822 !is_HORIZWS_utf8_safe(s, strend),
1823 !is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1824 );
6895a8aa 1825 break;
0658cdde
KW
1826 case POSIXA:
1827 /* Don't need to worry about utf8, as it can match only a single
1828 * byte invariant character. The flag in this node type is the
1829 * class number to pass to _generic_isCC() to build a mask for
1830 * searching in PL_charclass[] */
1831 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1832 break;
1833 case NPOSIXA:
1834 REXEC_FBC_CSCAN(
1835 !_generic_isCC_A(*s, FLAGS(c)),
1836 !_generic_isCC_A(*s, FLAGS(c))
1837 );
1838 break;
1839
1de06328
YO
1840 case AHOCORASICKC:
1841 case AHOCORASICK:
07be1b83 1842 {
a0a388a1 1843 DECL_TRIE_TYPE(c);
07be1b83
YO
1844 /* what trie are we using right now */
1845 reg_ac_data *aho
f8fc2ecf 1846 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1847 reg_trie_data *trie
1848 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1849 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1850
1851 const char *last_start = strend - trie->minlen;
6148ee25 1852#ifdef DEBUGGING
07be1b83 1853 const char *real_start = s;
6148ee25 1854#endif
07be1b83 1855 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1856 SV *sv_points;
1857 U8 **points; /* map of where we were in the input string
786e8c11 1858 when reading a given char. For ASCII this
be8e71aa 1859 is unnecessary overhead as the relationship
38a44b82
NC
1860 is always 1:1, but for Unicode, especially
1861 case folded Unicode this is not true. */
f9e705e8 1862 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1863 U8 *bitmap=NULL;
1864
07be1b83
YO
1865
1866 GET_RE_DEBUG_FLAGS_DECL;
1867
be8e71aa
YO
1868 /* We can't just allocate points here. We need to wrap it in
1869 * an SV so it gets freed properly if there is a croak while
1870 * running the match */
1871 ENTER;
1872 SAVETMPS;
1873 sv_points=newSV(maxlen * sizeof(U8 *));
1874 SvCUR_set(sv_points,
1875 maxlen * sizeof(U8 *));
1876 SvPOK_on(sv_points);
1877 sv_2mortal(sv_points);
1878 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1879 if ( trie_type != trie_utf8_fold
1880 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1881 {
786e8c11
YO
1882 if (trie->bitmap)
1883 bitmap=(U8*)trie->bitmap;
1884 else
1885 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1886 }
786e8c11
YO
1887 /* this is the Aho-Corasick algorithm modified a touch
1888 to include special handling for long "unknown char"
1889 sequences. The basic idea being that we use AC as long
1890 as we are dealing with a possible matching char, when
1891 we encounter an unknown char (and we have not encountered
1892 an accepting state) we scan forward until we find a legal
1893 starting char.
1894 AC matching is basically that of trie matching, except
1895 that when we encounter a failing transition, we fall back
1896 to the current states "fail state", and try the current char
1897 again, a process we repeat until we reach the root state,
1898 state 1, or a legal transition. If we fail on the root state
1899 then we can either terminate if we have reached an accepting
1900 state previously, or restart the entire process from the beginning
1901 if we have not.
1902
1903 */
07be1b83
YO
1904 while (s <= last_start) {
1905 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1906 U8 *uc = (U8*)s;
1907 U16 charid = 0;
1908 U32 base = 1;
1909 U32 state = 1;
1910 UV uvc = 0;
1911 STRLEN len = 0;
1912 STRLEN foldlen = 0;
1913 U8 *uscan = (U8*)NULL;
1914 U8 *leftmost = NULL;
786e8c11
YO
1915#ifdef DEBUGGING
1916 U32 accepted_word= 0;
1917#endif
07be1b83
YO
1918 U32 pointpos = 0;
1919
1920 while ( state && uc <= (U8*)strend ) {
1921 int failed=0;
786e8c11
YO
1922 U32 word = aho->states[ state ].wordnum;
1923
1de06328
YO
1924 if( state==1 ) {
1925 if ( bitmap ) {
1926 DEBUG_TRIE_EXECUTE_r(
1927 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1928 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1929 (char *)uc, utf8_target );
1de06328
YO
1930 PerlIO_printf( Perl_debug_log,
1931 " Scanning for legal start char...\n");
1932 }
d085b490
YO
1933 );
1934 if (utf8_target) {
1935 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1936 uc += UTF8SKIP(uc);
1937 }
1938 } else {
1939 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1940 uc++;
1941 }
1942 }
1de06328 1943 s= (char *)uc;
786e8c11 1944 }
786e8c11
YO
1945 if (uc >(U8*)last_start) break;
1946 }
1947
1948 if ( word ) {
2e64971a 1949 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1950 if (!leftmost || lpos < leftmost) {
1951 DEBUG_r(accepted_word=word);
07be1b83 1952 leftmost= lpos;
786e8c11 1953 }
07be1b83 1954 if (base==0) break;
786e8c11 1955
07be1b83
YO
1956 }
1957 points[pointpos++ % maxlen]= uc;
7016d6eb
DM
1958 if (foldlen || uc < (U8*)strend) {
1959 REXEC_TRIE_READ_CHAR(trie_type, trie,
1960 widecharmap, uc,
55eed653
NC
1961 uscan, len, uvc, charid, foldlen,
1962 foldbuf, uniflags);
7016d6eb
DM
1963 DEBUG_TRIE_EXECUTE_r({
1964 dump_exec_pos( (char *)uc, c, strend,
1965 real_start, s, utf8_target);
1966 PerlIO_printf(Perl_debug_log,
1967 " Charid:%3u CP:%4"UVxf" ",
1968 charid, uvc);
1969 });
1970 }
1971 else {
1972 len = 0;
1973 charid = 0;
1974 }
1975
07be1b83
YO
1976
1977 do {
6148ee25 1978#ifdef DEBUGGING
786e8c11 1979 word = aho->states[ state ].wordnum;
6148ee25 1980#endif
07be1b83
YO
1981 base = aho->states[ state ].trans.base;
1982
786e8c11
YO
1983 DEBUG_TRIE_EXECUTE_r({
1984 if (failed)
1985 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1986 s, utf8_target );
07be1b83 1987 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1988 "%sState: %4"UVxf", word=%"UVxf,
1989 failed ? " Fail transition to " : "",
1990 (UV)state, (UV)word);
1991 });
07be1b83
YO
1992 if ( base ) {
1993 U32 tmp;
6dd2be57 1994 I32 offset;
07be1b83 1995 if (charid &&
6dd2be57
DM
1996 ( ((offset = base + charid
1997 - 1 - trie->uniquecharcount)) >= 0)
1998 && ((U32)offset < trie->lasttrans)
1999 && trie->trans[offset].check == state
2000 && (tmp=trie->trans[offset].next))
07be1b83 2001 {
786e8c11
YO
2002 DEBUG_TRIE_EXECUTE_r(
2003 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
2004 state = tmp;
2005 break;
2006 }
2007 else {
786e8c11
YO
2008 DEBUG_TRIE_EXECUTE_r(
2009 PerlIO_printf( Perl_debug_log," - fail\n"));
2010 failed = 1;
2011 state = aho->fail[state];
07be1b83
YO
2012 }
2013 }
2014 else {
2015 /* we must be accepting here */
786e8c11
YO
2016 DEBUG_TRIE_EXECUTE_r(
2017 PerlIO_printf( Perl_debug_log," - accepting\n"));
2018 failed = 1;
07be1b83
YO
2019 break;
2020 }
2021 } while(state);
786e8c11 2022 uc += len;
07be1b83
YO
2023 if (failed) {
2024 if (leftmost)
2025 break;
786e8c11 2026 if (!state) state = 1;
07be1b83
YO
2027 }
2028 }
2029 if ( aho->states[ state ].wordnum ) {
2e64971a 2030 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2031 if (!leftmost || lpos < leftmost) {
2032 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2033 leftmost = lpos;
786e8c11 2034 }
07be1b83 2035 }
07be1b83
YO
2036 if (leftmost) {
2037 s = (char*)leftmost;
786e8c11
YO
2038 DEBUG_TRIE_EXECUTE_r({
2039 PerlIO_printf(
70685ca0
JH
2040 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2041 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2042 );
2043 });
24b23f37 2044 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2045 FREETMPS;
2046 LEAVE;
07be1b83 2047 goto got_it;
be8e71aa 2048 }
07be1b83 2049 s = HOPc(s,1);
786e8c11
YO
2050 DEBUG_TRIE_EXECUTE_r({
2051 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2052 });
07be1b83 2053 } else {
786e8c11
YO
2054 DEBUG_TRIE_EXECUTE_r(
2055 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2056 break;
2057 }
2058 }
be8e71aa
YO
2059 FREETMPS;
2060 LEAVE;
07be1b83
YO
2061 }
2062 break;
b3c9acc1 2063 default:
3c3eec57
GS
2064 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2065 break;
d6a28714 2066 }
6eb5f6b9
JH
2067 return 0;
2068 got_it:
2069 return s;
2070}
2071
fae667d5 2072
6eb5f6b9
JH
2073/*
2074 - regexec_flags - match a regexp against a string
2075 */
2076I32
288b8c02 2077Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2078 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2079/* stringarg: the point in the string at which to begin matching */
2080/* strend: pointer to null at end of string */
2081/* strbeg: real beginning of string */
2082/* minend: end of match must be >= minend bytes after stringarg. */
2083/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2084 * itself is accessed via the pointers above */
2085/* data: May be used for some additional optimizations.
2086 Currently its only used, with a U32 cast, for transmitting
2087 the ganch offset when doing a /g match. This will change */
2088/* nosave: For optimizations. */
2089
6eb5f6b9 2090{
97aff369 2091 dVAR;
288b8c02 2092 struct regexp *const prog = (struct regexp *)SvANY(rx);
24b23f37 2093 /*register*/ char *s;
eb578fdb 2094 regnode *c;
24b23f37 2095 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2096 I32 minlen; /* must match at least this many chars */
2097 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2098 I32 end_shift = 0; /* Same for the end. */ /* CC */
2099 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2100 char *scream_olds = NULL;
f2ed9b32 2101 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2102 I32 multiline;
f8fc2ecf 2103 RXi_GET_DECL(prog,progi);
3b0527fe 2104 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2105 regexp_paren_pair *swap = NULL;
a3621e74
YO
2106 GET_RE_DEBUG_FLAGS_DECL;
2107
7918f24d 2108 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2109 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2110
2111 /* Be paranoid... */
2112 if (prog == NULL || startpos == NULL) {
2113 Perl_croak(aTHX_ "NULL regexp parameter");
2114 return 0;
2115 }
2116
bbe252da 2117 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2118 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2119
f2ed9b32 2120 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2121 DEBUG_EXECUTE_r(
f2ed9b32 2122 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2123 "Matching");
2124 );
bac06658 2125
6eb5f6b9 2126 minlen = prog->minlen;
1de06328
YO
2127
2128 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2130 "String too short [regexec_flags]...\n"));
2131 goto phooey;
1aa99e6b 2132 }
6eb5f6b9 2133
1de06328 2134
6eb5f6b9 2135 /* Check validity of program. */
f8fc2ecf 2136 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2137 Perl_croak(aTHX_ "corrupted regexp program");
2138 }
2139
2140 PL_reg_flags = 0;
ed301438 2141 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2142 PL_reg_maxiter = 0;
2143
3c8556c3 2144 if (RX_UTF8(rx))
6eb5f6b9
JH
2145 PL_reg_flags |= RF_utf8;
2146
2147 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2148 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2149 PL_bostr = strbeg;
3b0527fe 2150 reginfo.sv = sv;
6eb5f6b9
JH
2151
2152 /* Mark end of line for $ (and such) */
2153 PL_regeol = strend;
2154
2155 /* see how far we have to get to not match where we matched before */
3b0527fe 2156 reginfo.till = startpos+minend;
6eb5f6b9 2157
6eb5f6b9
JH
2158 /* If there is a "must appear" string, look for it. */
2159 s = startpos;
2160
bbe252da 2161 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2162 MAGIC *mg;
2c296965 2163 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2164 reginfo.ganch = startpos + prog->gofs;
2c296965 2165 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2166 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2167 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2168 && SvMAGIC(sv)
14befaf4
DM
2169 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2170 && mg->mg_len >= 0) {
3b0527fe 2171 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2172 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2173 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2174
bbe252da 2175 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2176 if (s > reginfo.ganch)
6eb5f6b9 2177 goto phooey;
58e23c8d 2178 s = reginfo.ganch - prog->gofs;
2c296965 2179 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2180 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2181 if (s < strbeg)
2182 goto phooey;
6eb5f6b9
JH
2183 }
2184 }
58e23c8d 2185 else if (data) {
70685ca0 2186 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2187 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2188 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2189
2190 } else { /* pos() not defined */
3b0527fe 2191 reginfo.ganch = strbeg;
2c296965
YO
2192 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2193 "GPOS: reginfo.ganch = strbeg\n"));
2194 }
6eb5f6b9 2195 }
288b8c02 2196 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2197 /* We have to be careful. If the previous successful match
2198 was from this regex we don't want a subsequent partially
2199 successful match to clobber the old results.
2200 So when we detect this possibility we add a swap buffer
2201 to the re, and switch the buffer each match. If we fail
2202 we switch it back, otherwise we leave it swapped.
2203 */
2204 swap = prog->offs;
2205 /* do we need a save destructor here for eval dies? */
2206 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2207 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2208 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2209 PTR2UV(prog),
2210 PTR2UV(swap),
2211 PTR2UV(prog->offs)
2212 ));
c74340f9 2213 }
a0714e2c 2214 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2215 re_scream_pos_data d;
2216
2217 d.scream_olds = &scream_olds;
2218 d.scream_pos = &scream_pos;
288b8c02 2219 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2220 if (!s) {
a3621e74 2221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2222 goto phooey; /* not present */
3fa9c3d7 2223 }
6eb5f6b9
JH
2224 }
2225
1de06328 2226
6eb5f6b9
JH
2227
2228 /* Simplest case: anchored match need be tried only once. */
2229 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2230 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2231 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2232 goto got_it;
bbe252da
YO
2233 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2234 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2235 {
2236 char *end;
2237
2238 if (minlen)
2239 dontbother = minlen - 1;
1aa99e6b 2240 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2241 /* for multiline we only have to try after newlines */
33b8afdf 2242 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2243 /* because of the goto we can not easily reuse the macros for bifurcating the
2244 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2245 if (utf8_target) {
2246 if (s == startpos)
2247 goto after_try_utf8;
2248 while (1) {
2249 if (regtry(&reginfo, &s)) {
2250 goto got_it;
2251 }
2252 after_try_utf8:
2253 if (s > end) {
2254 goto phooey;
2255 }
2256 if (prog->extflags & RXf_USE_INTUIT) {
2257 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2258 if (!s) {
2259 goto phooey;
2260 }
2261 }
2262 else {
2263 s += UTF8SKIP(s);
2264 }
2265 }
2266 } /* end search for check string in unicode */
2267 else {
2268 if (s == startpos) {
2269 goto after_try_latin;
2270 }
2271 while (1) {
2272 if (regtry(&reginfo, &s)) {
2273 goto got_it;
2274 }
2275 after_try_latin:
2276 if (s > end) {
2277 goto phooey;
2278 }
2279 if (prog->extflags & RXf_USE_INTUIT) {
2280 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2281 if (!s) {
2282 goto phooey;
2283 }
2284 }
2285 else {
2286 s++;
2287 }
2288 }
2289 } /* end search for check string in latin*/
2290 } /* end search for check string */
2291 else { /* search for newline */
2292 if (s > startpos) {
2293 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2294 s--;
92f3d482 2295 }
21eede78
YO
2296 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2297 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2298 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2299 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2300 goto got_it;
2301 }
92f3d482
YO
2302 }
2303 } /* end search for newline */
2304 } /* end anchored/multiline check string search */
6eb5f6b9 2305 goto phooey;
bbe252da 2306 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2307 {
486ec47a 2308 /* the warning about reginfo.ganch being used without initialization
bbe252da 2309 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2310 and we only enter this block when the same bit is set. */
58e23c8d 2311 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2312
2313 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2314 goto got_it;
2315 goto phooey;
2316 }
2317
2318 /* Messy cases: unanchored match. */
bbe252da 2319 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2320 /* we have /x+whatever/ */
f2ed9b32 2321 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2322 char ch;
bf93d4cc
GS
2323#ifdef DEBUGGING
2324 int did_match = 0;
2325#endif
f2ed9b32 2326 if (utf8_target) {
7e0d5ad7
KW
2327 if (! prog->anchored_utf8) {
2328 to_utf8_substr(prog);
2329 }
2330 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2331 REXEC_FBC_SCAN(
6eb5f6b9 2332 if (*s == ch) {
a3621e74 2333 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2334 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2335 s += UTF8SKIP(s);
2336 while (s < strend && *s == ch)
2337 s += UTF8SKIP(s);
2338 }
4cadc6a9 2339 );
7e0d5ad7 2340
6eb5f6b9
JH
2341 }
2342 else {
7e0d5ad7
KW
2343 if (! prog->anchored_substr) {
2344 if (! to_byte_substr(prog)) {
2345 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346 non_utf8_target_but_utf8_required));
2347 goto phooey;
2348 }
2349 }
2350 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2351 REXEC_FBC_SCAN(
6eb5f6b9 2352 if (*s == ch) {
a3621e74 2353 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2354 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2355 s++;
2356 while (s < strend && *s == ch)
2357 s++;
2358 }
4cadc6a9 2359 );
6eb5f6b9 2360 }
a3621e74 2361 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2362 PerlIO_printf(Perl_debug_log,
b7953727
JH
2363 "Did not find anchored character...\n")
2364 );
6eb5f6b9 2365 }
a0714e2c
SS
2366 else if (prog->anchored_substr != NULL
2367 || prog->anchored_utf8 != NULL
2368 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2369 && prog->float_max_offset < strend - s)) {
2370 SV *must;
2371 I32 back_max;
2372 I32 back_min;
2373 char *last;
6eb5f6b9 2374 char *last1; /* Last position checked before */
bf93d4cc
GS
2375#ifdef DEBUGGING
2376 int did_match = 0;
2377#endif
33b8afdf 2378 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2379 if (utf8_target) {
2380 if (! prog->anchored_utf8) {
2381 to_utf8_substr(prog);
2382 }
2383 must = prog->anchored_utf8;
2384 }
2385 else {
2386 if (! prog->anchored_substr) {
2387 if (! to_byte_substr(prog)) {
2388 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2389 non_utf8_target_but_utf8_required));
2390 goto phooey;
2391 }
2392 }
2393 must = prog->anchored_substr;
2394 }
33b8afdf
JH
2395 back_max = back_min = prog->anchored_offset;
2396 } else {
7e0d5ad7
KW
2397 if (utf8_target) {
2398 if (! prog->float_utf8) {
2399 to_utf8_substr(prog);
2400 }
2401 must = prog->float_utf8;
2402 }
2403 else {
2404 if (! prog->float_substr) {
2405 if (! to_byte_substr(prog)) {
2406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2407 non_utf8_target_but_utf8_required));
2408 goto phooey;
2409 }
2410 }
2411 must = prog->float_substr;
2412 }
33b8afdf
JH
2413 back_max = prog->float_max_offset;
2414 back_min = prog->float_min_offset;
2415 }
1de06328 2416
1de06328
YO
2417 if (back_min<0) {
2418 last = strend;
2419 } else {
2420 last = HOP3c(strend, /* Cannot start after this */
2421 -(I32)(CHR_SVLEN(must)
2422 - (SvTAIL(must) != 0) + back_min), strbeg);
2423 }
6eb5f6b9
JH
2424 if (s > PL_bostr)
2425 last1 = HOPc(s, -1);
2426 else
2427 last1 = s - 1; /* bogus */
2428
a0288114 2429 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2430 check_substr==must. */
2431 scream_pos = -1;
2432 dontbother = end_shift;
2433 strend = HOPc(strend, -dontbother);
2434 while ( (s <= last) &&
c33e64f0 2435 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2436 (unsigned char*)strend, must,
c33e64f0 2437 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2438 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2439 if (HOPc(s, -back_max) > last1) {
2440 last1 = HOPc(s, -back_min);
2441 s = HOPc(s, -back_max);
2442 }
2443 else {
52657f30 2444 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2445
2446 last1 = HOPc(s, -back_min);
52657f30 2447 s = t;
6eb5f6b9 2448 }
f2ed9b32 2449 if (utf8_target) {
6eb5f6b9 2450 while (s <= last1) {
24b23f37 2451 if (regtry(&reginfo, &s))
6eb5f6b9 2452 goto got_it;
7016d6eb
DM
2453 if (s >= last1) {
2454 s++; /* to break out of outer loop */
2455 break;
2456 }
2457 s += UTF8SKIP(s);
6eb5f6b9
JH
2458 }
2459 }
2460 else {
2461 while (s <= last1) {
24b23f37 2462 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2463 goto got_it;
2464 s++;
2465 }
2466 }
2467 }
ab3bbdeb 2468 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2469 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2470 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2472 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2473 ? "anchored" : "floating"),
ab3bbdeb
YO
2474 quoted, RE_SV_TAIL(must));
2475 });
6eb5f6b9
JH
2476 goto phooey;
2477 }
f8fc2ecf 2478 else if ( (c = progi->regstclass) ) {
f14c76ed 2479 if (minlen) {
f8fc2ecf 2480 const OPCODE op = OP(progi->regstclass);
66e933ab 2481 /* don't bother with what can't match */
786e8c11 2482 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2483 strend = HOPc(strend, -(minlen - 1));
2484 }
a3621e74 2485 DEBUG_EXECUTE_r({
be8e71aa 2486 SV * const prop = sv_newmortal();
32fc9b6a 2487 regprop(prog, prop, c);
0df25f3d 2488 {
f2ed9b32 2489 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2490 s,strend-s,60);
0df25f3d 2491 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2492 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2493 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2494 quoted, (int)(strend - s));
0df25f3d 2495 }
ffc61ed2 2496 });
3b0527fe 2497 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2498 goto got_it;
07be1b83 2499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2500 }
2501 else {
2502 dontbother = 0;
a0714e2c 2503 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2504 /* Trim the end. */
6af40bd7 2505 char *last= NULL;
33b8afdf 2506 SV* float_real;
c33e64f0
FC
2507 STRLEN len;
2508 const char *little;
33b8afdf 2509
7e0d5ad7
KW
2510 if (utf8_target) {
2511 if (! prog->float_utf8) {
2512 to_utf8_substr(prog);
2513 }
2514 float_real = prog->float_utf8;
2515 }
2516 else {
2517 if (! prog->float_substr) {
2518 if (! to_byte_substr(prog)) {
2519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2520 non_utf8_target_but_utf8_required));
2521 goto phooey;
2522 }
2523 }
2524 float_real = prog->float_substr;
2525 }
d6a28714 2526
c33e64f0
FC
2527 little = SvPV_const(float_real, len);
2528 if (SvTAIL(float_real)) {
7f18ad16
KW
2529 /* This means that float_real contains an artificial \n on
2530 * the end due to the presence of something like this:
2531 * /foo$/ where we can match both "foo" and "foo\n" at the
2532 * end of the string. So we have to compare the end of the
2533 * string first against the float_real without the \n and
2534 * then against the full float_real with the string. We
2535 * have to watch out for cases where the string might be
2536 * smaller than the float_real or the float_real without
2537 * the \n. */
1a13b075
YO
2538 char *checkpos= strend - len;
2539 DEBUG_OPTIMISE_r(
2540 PerlIO_printf(Perl_debug_log,
2541 "%sChecking for float_real.%s\n",
2542 PL_colors[4], PL_colors[5]));
2543 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2544 /* can't match, even if we remove the trailing \n
2545 * string is too short to match */
1a13b075
YO
2546 DEBUG_EXECUTE_r(
2547 PerlIO_printf(Perl_debug_log,
2548 "%sString shorter than required trailing substring, cannot match.%s\n",
2549 PL_colors[4], PL_colors[5]));
2550 goto phooey;
2551 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2552 /* can match, the end of the string matches without the
2553 * "\n" */
1a13b075
YO
2554 last = checkpos + 1;
2555 } else if (checkpos < strbeg) {
7f18ad16
KW
2556 /* cant match, string is too short when the "\n" is
2557 * included */
1a13b075
YO
2558 DEBUG_EXECUTE_r(
2559 PerlIO_printf(Perl_debug_log,
2560 "%sString does not contain required trailing substring, cannot match.%s\n",
2561 PL_colors[4], PL_colors[5]));
2562 goto phooey;
2563 } else if (!multiline) {
7f18ad16
KW
2564 /* non multiline match, so compare with the "\n" at the
2565 * end of the string */
1a13b075
YO
2566 if (memEQ(checkpos, little, len)) {
2567 last= checkpos;
2568 } else {
2569 DEBUG_EXECUTE_r(
2570 PerlIO_printf(Perl_debug_log,
2571 "%sString does not contain required trailing substring, cannot match.%s\n",
2572 PL_colors[4], PL_colors[5]));
2573 goto phooey;
2574 }
2575 } else {
7f18ad16
KW
2576 /* multiline match, so we have to search for a place
2577 * where the full string is located */
d6a28714 2578 goto find_last;
1a13b075 2579 }
c33e64f0 2580 } else {
d6a28714 2581 find_last:
9041c2e3 2582 if (len)
d6a28714 2583 last = rninstr(s, strend, little, little + len);
b8c5462f 2584 else
a0288114 2585 last = strend; /* matching "$" */
b8c5462f 2586 }
6af40bd7 2587 if (!last) {
7f18ad16
KW
2588 /* at one point this block contained a comment which was
2589 * probably incorrect, which said that this was a "should not
2590 * happen" case. Even if it was true when it was written I am
2591 * pretty sure it is not anymore, so I have removed the comment
2592 * and replaced it with this one. Yves */
6bda09f9
YO
2593 DEBUG_EXECUTE_r(
2594 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2595 "String does not contain required substring, cannot match.\n"
2596 ));
2597 goto phooey;
bf93d4cc 2598 }
d6a28714
JH
2599 dontbother = strend - last + prog->float_min_offset;
2600 }
2601 if (minlen && (dontbother < minlen))
2602 dontbother = minlen - 1;
2603 strend -= dontbother; /* this one's always in bytes! */
2604 /* We don't know much -- general case. */
f2ed9b32 2605 if (utf8_target) {
d6a28714 2606 for (;;) {
24b23f37 2607 if (regtry(&reginfo, &s))
d6a28714
JH
2608 goto got_it;
2609 if (s >= strend)
2610 break;
b8c5462f 2611 s += UTF8SKIP(s);
d6a28714
JH
2612 };
2613 }
2614 else {
2615 do {
24b23f37 2616 if (regtry(&reginfo, &s))
d6a28714
JH
2617 goto got_it;
2618 } while (s++ < strend);
2619 }
2620 }
2621
2622 /* Failure. */
2623 goto phooey;
2624
2625got_it:
495f47a5
DM
2626 DEBUG_BUFFERS_r(
2627 if (swap)
2628 PerlIO_printf(Perl_debug_log,
2629 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2630 PTR2UV(prog),
2631 PTR2UV(swap)
2632 );
2633 );
e9105d30 2634 Safefree(swap);
288b8c02 2635 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2636
ed301438 2637 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2638 restore_pos(aTHX_ prog);
5daac39c
NC
2639 if (RXp_PAREN_NAMES(prog))
2640 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2641
2642 /* make sure $`, $&, $', and $digit will work later */
2643 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2644 if (flags & REXEC_COPY_STR) {
f8c7b90f 2645#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2646 if ((SvIsCOW(sv)
2647 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2648 if (DEBUG_C_TEST) {
2649 PerlIO_printf(Perl_debug_log,
2650 "Copy on write: regexp capture, type %d\n",
2651 (int) SvTYPE(sv));
2652 }
77f8f7c1 2653 RX_MATCH_COPY_FREE(rx);
ed252734 2654 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2655 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2656 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2657 prog->sublen = PL_regeol - strbeg;
2658 prog->suboffset = 0;
2659 prog->subcoffset = 0;
ed252734
NC
2660 } else
2661#endif
2662 {
6502e081
DM
2663 I32 min = 0;
2664 I32 max = PL_regeol - strbeg;
2665 I32 sublen;
2666
2667 if ( (flags & REXEC_COPY_SKIP_POST)
2668 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2669 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2670 ) { /* don't copy $' part of string */
3de645a8 2671 U32 n = 0;
6502e081
DM
2672 max = -1;
2673 /* calculate the right-most part of the string covered
2674 * by a capture. Due to look-ahead, this may be to
2675 * the right of $&, so we have to scan all captures */
2676 while (n <= prog->lastparen) {
2677 if (prog->offs[n].end > max)
2678 max = prog->offs[n].end;
2679 n++;
2680 }
2681 if (max == -1)
2682 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2683 ? prog->offs[0].start
2684 : 0;
2685 assert(max >= 0 && max <= PL_regeol - strbeg);
2686 }
2687
2688 if ( (flags & REXEC_COPY_SKIP_PRE)
2689 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2690 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2691 ) { /* don't copy $` part of string */
3de645a8 2692 U32 n = 0;
6502e081
DM
2693 min = max;
2694 /* calculate the left-most part of the string covered
2695 * by a capture. Due to look-behind, this may be to
2696 * the left of $&, so we have to scan all captures */
2697 while (min && n <= prog->lastparen) {
2698 if ( prog->offs[n].start != -1
2699 && prog->offs[n].start < min)
2700 {
2701 min = prog->offs[n].start;
2702 }
2703 n++;
2704 }
2705 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2706 && min > prog->offs[0].end
2707 )
2708 min = prog->offs[0].end;
2709
2710 }
2711
2712 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2713 sublen = max - min;
2714
2715 if (RX_MATCH_COPIED(rx)) {
2716 if (sublen > prog->sublen)
2717 prog->subbeg =
2718 (char*)saferealloc(prog->subbeg, sublen+1);
2719 }
2720 else
2721 prog->subbeg = (char*)safemalloc(sublen+1);
2722 Copy(strbeg + min, prog->subbeg, sublen, char);
2723 prog->subbeg[sublen] = '\0';
2724 prog->suboffset = min;
2725 prog->sublen = sublen;
77f8f7c1 2726 RX_MATCH_COPIED_on(rx);
6502e081 2727 }
6502e081
DM
2728 prog->subcoffset = prog->suboffset;
2729 if (prog->suboffset && utf8_target) {
2730 /* Convert byte offset to chars.
2731 * XXX ideally should only compute this if @-/@+
2732 * has been seen, a la PL_sawampersand ??? */
2733
2734 /* If there's a direct correspondence between the
2735 * string which we're matching and the original SV,
2736 * then we can use the utf8 len cache associated with
2737 * the SV. In particular, it means that under //g,
2738 * sv_pos_b2u() will use the previously cached
2739 * position to speed up working out the new length of
2740 * subcoffset, rather than counting from the start of
2741 * the string each time. This stops
2742 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2743 * from going quadratic */
2744 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2745 sv_pos_b2u(sv, &(prog->subcoffset));
2746 else
2747 prog->subcoffset = utf8_length((U8*)strbeg,
2748 (U8*)(strbeg+prog->suboffset));
2749 }
d6a28714
JH
2750 }
2751 else {
6502e081 2752 RX_MATCH_COPY_FREE(rx);
d6a28714 2753 prog->subbeg = strbeg;
6502e081
DM
2754 prog->suboffset = 0;
2755 prog->subcoffset = 0;
d6a28714
JH
2756 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2757 }
2758 }
9041c2e3 2759
d6a28714
JH
2760 return 1;
2761
2762phooey:
a3621e74 2763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2764 PL_colors[4], PL_colors[5]));
ed301438 2765 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2766 restore_pos(aTHX_ prog);
e9105d30 2767 if (swap) {
c74340f9 2768 /* we failed :-( roll it back */
495f47a5
DM
2769 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2770 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2771 PTR2UV(prog),
2772 PTR2UV(prog->offs),
2773 PTR2UV(swap)
2774 ));
e9105d30
GG
2775 Safefree(prog->offs);
2776 prog->offs = swap;
2777 }
d6a28714
JH
2778 return 0;
2779}
2780
6bda09f9 2781
ec43f78b
DM
2782/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2783 * Do inc before dec, in case old and new rex are the same */
2784#define SET_reg_curpm(Re2) \
2785 if (PL_reg_state.re_state_eval_setup_done) { \
2786 (void)ReREFCNT_inc(Re2); \
2787 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2788 PM_SETRE((PL_reg_curpm), (Re2)); \
2789 }
2790
2791
d6a28714
JH
2792/*
2793 - regtry - try match at specific point
2794 */
2795STATIC I32 /* 0 failure, 1 success */
f73aaa43 2796S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2797{
97aff369 2798 dVAR;
d6a28714 2799 CHECKPOINT lastcp;
288b8c02
NC
2800 REGEXP *const rx = reginfo->prog;
2801 regexp *const prog = (struct regexp *)SvANY(rx);
f73aaa43 2802 I32 result;
f8fc2ecf 2803 RXi_GET_DECL(prog,progi);
a3621e74 2804 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2805
2806 PERL_ARGS_ASSERT_REGTRY;
2807
24b23f37 2808 reginfo->cutpoint=NULL;
d6a28714 2809
ed301438
DM
2810 if ((prog->extflags & RXf_EVAL_SEEN)
2811 && !PL_reg_state.re_state_eval_setup_done)
2812 {
d6a28714
JH
2813 MAGIC *mg;
2814
ed301438 2815 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2816 if (reginfo->sv) {
d6a28714 2817 /* Make $_ available to executed code. */
3b0527fe 2818 if (reginfo->sv != DEFSV) {
59f00321 2819 SAVE_DEFSV;
414bf5ae 2820 DEFSV_set(reginfo->sv);
b8c5462f 2821 }
d6a28714 2822
3b0527fe
DM
2823 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2824 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2825 /* prepare for quick setting of pos */
d300d9fa 2826#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2827 if (SvIsCOW(reginfo->sv))
2828 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2829#endif
3dab1dad 2830 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2831 &PL_vtbl_mglob, NULL, 0);
d6a28714 2832 mg->mg_len = -1;
b8c5462f 2833 }
d6a28714
JH
2834 PL_reg_magic = mg;
2835 PL_reg_oldpos = mg->mg_len;
4f639d21 2836 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2837 }
09687e5a 2838 if (!PL_reg_curpm) {
a02a5408 2839 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2840#ifdef USE_ITHREADS
2841 {
14a49a24 2842 SV* const repointer = &PL_sv_undef;
92313705
NC
2843 /* this regexp is also owned by the new PL_reg_curpm, which
2844 will try to free it. */
d2ece331 2845 av_push(PL_regex_padav, repointer);
09687e5a
AB
2846 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2847 PL_regex_pad = AvARRAY(PL_regex_padav);
2848 }
2849#endif
2850 }
ec43f78b 2851 SET_reg_curpm(rx);
d6a28714
JH
2852 PL_reg_oldcurpm = PL_curpm;
2853 PL_curpm = PL_reg_curpm;
07bc277f 2854 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2855 /* Here is a serious problem: we cannot rewrite subbeg,
2856 since it may be needed if this match fails. Thus
2857 $` inside (?{}) could fail... */
2858 PL_reg_oldsaved = prog->subbeg;
2859 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2860 PL_reg_oldsavedoffset = prog->suboffset;
2861 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2862#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2863 PL_nrs = prog->saved_copy;
2864#endif
07bc277f 2865 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2866 }
2867 else
bd61b366 2868 PL_reg_oldsaved = NULL;
d6a28714 2869 prog->subbeg = PL_bostr;
6502e081
DM
2870 prog->suboffset = 0;
2871 prog->subcoffset = 0;
d6a28714
JH
2872 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2873 }
97ca13b7 2874#ifdef DEBUGGING
f73aaa43 2875 PL_reg_starttry = *startposp;
97ca13b7 2876#endif
f73aaa43 2877 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2878 prog->lastparen = 0;
03994de8 2879 prog->lastcloseparen = 0;
d6a28714 2880 PL_regsize = 0;
d6a28714
JH
2881
2882 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2883 to do this again and again, prog->lastparen should take care of
3dd2943c 2884 this! --ilya*/
dafc8851
JH
2885
2886 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2887 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2888 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2889 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2890 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2891 * Meanwhile, this code *is* needed for the
daf18116
JH
2892 * above-mentioned test suite tests to succeed. The common theme
2893 * on those tests seems to be returning null fields from matches.
225593e1 2894 * --jhi updated by dapm */
dafc8851 2895#if 1
d6a28714 2896 if (prog->nparens) {
b93070ed 2897 regexp_paren_pair *pp = prog->offs;
eb578fdb 2898 I32 i;
b93070ed 2899 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2900 ++pp;
2901 pp->start = -1;
2902 pp->end = -1;
d6a28714
JH
2903 }
2904 }
dafc8851 2905#endif
02db2b7b 2906 REGCP_SET(lastcp);
f73aaa43
DM
2907 result = regmatch(reginfo, *startposp, progi->program + 1);
2908 if (result != -1) {
2909 prog->offs[0].end = result;
d6a28714
JH
2910 return 1;
2911 }
24b23f37 2912 if (reginfo->cutpoint)
f73aaa43 2913 *startposp= reginfo->cutpoint;
02db2b7b 2914 REGCP_UNWIND(lastcp);
d6a28714
JH
2915 return 0;
2916}
2917
02db2b7b 2918
8ba1375e
MJD
2919#define sayYES goto yes
2920#define sayNO goto no
262b90c4 2921#define sayNO_SILENT goto no_silent
8ba1375e 2922
f9f4320a
YO
2923/* we dont use STMT_START/END here because it leads to
2924 "unreachable code" warnings, which are bogus, but distracting. */
2925#define CACHEsayNO \
c476f425
DM
2926 if (ST.cache_mask) \
2927 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2928 sayNO
3298f257 2929
a3621e74 2930/* this is used to determine how far from the left messages like
265c4333
YO
2931 'failed...' are printed. It should be set such that messages
2932 are inline with the regop output that created them.
a3621e74 2933*/
265c4333 2934#define REPORT_CODE_OFF 32
a3621e74
YO
2935
2936
40a82448
DM
2937#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2938#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2939
86545054
DM
2940#define SLAB_FIRST(s) (&(s)->states[0])
2941#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2942
5d9a96ca
DM
2943/* grab a new slab and return the first slot in it */
2944
2945STATIC regmatch_state *
2946S_push_slab(pTHX)
2947{
a35a87e7 2948#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2949 dMY_CXT;
2950#endif
5d9a96ca
DM
2951 regmatch_slab *s = PL_regmatch_slab->next;
2952 if (!s) {
2953 Newx(s, 1, regmatch_slab);
2954 s->prev = PL_regmatch_slab;
2955 s->next = NULL;
2956 PL_regmatch_slab->next = s;
2957 }
2958 PL_regmatch_slab = s;
86545054 2959 return SLAB_FIRST(s);
5d9a96ca 2960}
5b47454d 2961
95b24440 2962
40a82448
DM
2963/* push a new state then goto it */
2964
4d5016e5
DM
2965#define PUSH_STATE_GOTO(state, node, input) \
2966 pushinput = input; \
40a82448
DM
2967 scan = node; \
2968 st->resume_state = state; \
2969 goto push_state;
2970
2971/* push a new state with success backtracking, then goto it */
2972
4d5016e5
DM
2973#define PUSH_YES_STATE_GOTO(state, node, input) \
2974 pushinput = input; \
40a82448
DM
2975 scan = node; \
2976 st->resume_state = state; \
2977 goto push_yes_state;
2978
aa283a38 2979
aa283a38 2980
4d5016e5 2981
d6a28714 2982/*
95b24440 2983
bf1f174e
DM
2984regmatch() - main matching routine
2985
2986This is basically one big switch statement in a loop. We execute an op,
2987set 'next' to point the next op, and continue. If we come to a point which
2988we may need to backtrack to on failure such as (A|B|C), we push a
2989backtrack state onto the backtrack stack. On failure, we pop the top
2990state, and re-enter the loop at the state indicated. If there are no more
2991states to pop, we return failure.
2992
2993Sometimes we also need to backtrack on success; for example /A+/, where
2994after successfully matching one A, we need to go back and try to
2995match another one; similarly for lookahead assertions: if the assertion
2996completes successfully, we backtrack to the state just before the assertion
2997and then carry on. In these cases, the pushed state is marked as
2998'backtrack on success too'. This marking is in fact done by a chain of
2999pointers, each pointing to the previous 'yes' state. On success, we pop to
3000the nearest yes state, discarding any intermediate failure-only states.
3001Sometimes a yes state is pushed just to force some cleanup code to be
3002called at the end of a successful match or submatch; e.g. (??{$re}) uses
3003it to free the inner regex.
3004
3005Note that failure backtracking rewinds the cursor position, while
3006success backtracking leaves it alone.
3007
3008A pattern is complete when the END op is executed, while a subpattern
3009such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3010ops trigger the "pop to last yes state if any, otherwise return true"
3011behaviour.
3012
3013A common convention in this function is to use A and B to refer to the two
3014subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3015the subpattern to be matched possibly multiple times, while B is the entire
3016rest of the pattern. Variable and state names reflect this convention.
3017
3018The states in the main switch are the union of ops and failure/success of
3019substates associated with with that op. For example, IFMATCH is the op
3020that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3021'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3022successfully matched A and IFMATCH_A_fail is a state saying that we have
3023just failed to match A. Resume states always come in pairs. The backtrack
3024state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3025at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3026on success or failure.
3027
3028The struct that holds a backtracking state is actually a big union, with
3029one variant for each major type of op. The variable st points to the
3030top-most backtrack struct. To make the code clearer, within each
3031block of code we #define ST to alias the relevant union.
3032
3033Here's a concrete example of a (vastly oversimplified) IFMATCH
3034implementation:
3035
3036 switch (state) {
3037 ....
3038
3039#define ST st->u.ifmatch
3040
3041 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3042 ST.foo = ...; // some state we wish to save
95b24440 3043 ...
bf1f174e
DM
3044 // push a yes backtrack state with a resume value of
3045 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3046 // first node of A:
4d5016e5 3047 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3048 // NOTREACHED
3049
3050 case IFMATCH_A: // we have successfully executed A; now continue with B
3051 next = B;
3052 bar = ST.foo; // do something with the preserved value
3053 break;
3054
3055 case IFMATCH_A_fail: // A failed, so the assertion failed
3056 ...; // do some housekeeping, then ...
3057 sayNO; // propagate the failure
3058
3059#undef ST
95b24440 3060
bf1f174e
DM
3061 ...
3062 }
95b24440 3063
bf1f174e
DM
3064For any old-timers reading this who are familiar with the old recursive
3065approach, the code above is equivalent to:
95b24440 3066
bf1f174e
DM
3067 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3068 {
3069 int foo = ...
95b24440 3070 ...
bf1f174e
DM
3071 if (regmatch(A)) {
3072 next = B;
3073 bar = foo;
3074 break;
95b24440 3075 }
bf1f174e
DM
3076 ...; // do some housekeeping, then ...
3077 sayNO; // propagate the failure
95b24440 3078 }
bf1f174e
DM
3079
3080The topmost backtrack state, pointed to by st, is usually free. If you
3081want to claim it, populate any ST.foo fields in it with values you wish to
3082save, then do one of
3083
4d5016e5
DM
3084 PUSH_STATE_GOTO(resume_state, node, newinput);
3085 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3086
3087which sets that backtrack state's resume value to 'resume_state', pushes a
3088new free entry to the top of the backtrack stack, then goes to 'node'.
3089On backtracking, the free slot is popped, and the saved state becomes the
3090new free state. An ST.foo field in this new top state can be temporarily
3091accessed to retrieve values, but once the main loop is re-entered, it
3092becomes available for reuse.
3093
3094Note that the depth of the backtrack stack constantly increases during the
3095left-to-right execution of the pattern, rather than going up and down with
3096the pattern nesting. For example the stack is at its maximum at Z at the
3097end of the pattern, rather than at X in the following:
3098
3099 /(((X)+)+)+....(Y)+....Z/
3100
3101The only exceptions to this are lookahead/behind assertions and the cut,
3102(?>A), which pop all the backtrack states associated with A before
3103continuing.
3104
486ec47a 3105Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3106PL_regmatch_state and st always point to the currently active state,
3107and PL_regmatch_slab points to the slab currently containing
3108PL_regmatch_state. The first time regmatch() is called, the first slab is
3109allocated, and is never freed until interpreter destruction. When the slab
3110is full, a new one is allocated and chained to the end. At exit from
3111regmatch(), slabs allocated since entry are freed.
3112
3113*/
95b24440 3114
40a82448 3115
5bc10b2c 3116#define DEBUG_STATE_pp(pp) \
265c4333 3117 DEBUG_STATE_r({ \
f2ed9b32 3118 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3119 PerlIO_printf(Perl_debug_log, \
5d458dd8 3120 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3121 depth*2, "", \
13d6edb4 3122 PL_reg_name[st->resume_state], \
5d458dd8
YO
3123 ((st==yes_state||st==mark_state) ? "[" : ""), \
3124 ((st==yes_state) ? "Y" : ""), \
3125 ((st==mark_state) ? "M" : ""), \
3126 ((st==yes_state||st==mark_state) ? "]" : "") \
3127 ); \
265c4333 3128 });
5bc10b2c 3129
40a82448 3130
3dab1dad 3131#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3132
3df15adc 3133#ifdef DEBUGGING
5bc10b2c 3134
ab3bbdeb 3135STATIC void
f2ed9b32 3136S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3137 const char *start, const char *end, const char *blurb)
3138{
efd26800 3139 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3140
3141 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3142
ab3bbdeb
YO
3143 if (!PL_colorset)
3144 reginitcolors();
3145 {
3146 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3147 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3148
f2ed9b32 3149 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3150 start, end - start, 60);
3151
3152 PerlIO_printf(Perl_debug_log,
3153 "%s%s REx%s %s against %s\n",
3154 PL_colors[4], blurb, PL_colors[5], s0, s1);
3155
f2ed9b32 3156 if (utf8_target||utf8_pat)
1de06328
YO
3157 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3158 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3159 utf8_pat && utf8_target ? " and " : "",
3160 utf8_target ? "string" : ""
ab3bbdeb
YO
3161 );
3162 }
3163}
3df15adc
YO
3164
3165STATIC void
786e8c11
YO
3166S_dump_exec_pos(pTHX_ const char *locinput,
3167 const regnode *scan,
3168 const char *loc_regeol,
3169 const char *loc_bostr,
3170 const char *loc_reg_starttry,
f2ed9b32 3171 const bool utf8_target)
07be1b83 3172{
786e8c11 3173 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3174 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3175 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3176 /* The part of the string before starttry has one color
3177 (pref0_len chars), between starttry and current
3178 position another one (pref_len - pref0_len chars),
3179 after the current position the third one.
3180 We assume that pref0_len <= pref_len, otherwise we
3181 decrease pref0_len. */
786e8c11
YO
3182 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3183 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3184 int pref0_len;
3185
7918f24d
NC
3186 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3187
f2ed9b32 3188 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3189 pref_len++;
786e8c11
YO
3190 pref0_len = pref_len - (locinput - loc_reg_starttry);
3191 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3192 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3193 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3194 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3195 l--;
3196 if (pref0_len < 0)
3197 pref0_len = 0;
3198 if (pref0_len > pref_len)
3199 pref0_len = pref_len;
3200 {
f2ed9b32 3201 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3202
ab3bbdeb 3203 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3204 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3205
ab3bbdeb 3206 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3207 (locinput - pref_len + pref0_len),
1de06328 3208 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3209
ab3bbdeb 3210 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3211 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3212
1de06328 3213 const STRLEN tlen=len0+len1+len2;
3df15adc 3214 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3215 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3216 (IV)(locinput - loc_bostr),
07be1b83 3217 len0, s0,
07be1b83 3218 len1, s1,
07be1b83 3219 (docolor ? "" : "> <"),
07be1b83 3220 len2, s2,
f9f4320a 3221 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3222 "");
3223 }
3224}
3df15adc 3225
07be1b83
YO
3226#endif
3227
0a4db386
YO
3228/* reg_check_named_buff_matched()
3229 * Checks to see if a named buffer has matched. The data array of
3230 * buffer numbers corresponding to the buffer is expected to reside
3231 * in the regexp->data->data array in the slot stored in the ARG() of
3232 * node involved. Note that this routine doesn't actually care about the
3233 * name, that information is not preserved from compilation to execution.
3234 * Returns the index of the leftmost defined buffer with the given name
3235 * or 0 if non of the buffers matched.
3236 */
3237STATIC I32
7918f24d
NC
3238S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3239{
0a4db386 3240 I32 n;
f8fc2ecf 3241 RXi_GET_DECL(rex,rexi);
ad64d0ec 3242 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3243 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3244
3245 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3246
0a4db386 3247 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3248 if ((I32)rex->lastparen >= nums[n] &&
3249 rex->offs[nums[n]].end != -1)
0a4db386
YO
3250 {
3251 return nums[n];
3252 }
3253 }
3254 return 0;
3255}
3256
2f554ef7
DM
3257
3258/* free all slabs above current one - called during LEAVE_SCOPE */
3259
3260STATIC void
3261S_clear_backtrack_stack(pTHX_ void *p)
3262{
3263 regmatch_slab *s = PL_regmatch_slab->next;
3264 PERL_UNUSED_ARG(p);
3265
3266 if (!s)
3267 return;
3268 PL_regmatch_slab->next = NULL;
3269 while (s) {
3270 regmatch_slab * const osl = s;
3271 s = s->next;
3272 Safefree(osl);
3273 }
3274}
c74f6de9
KW
3275static bool
3276S_setup_EXACTISH_ST_c1_c2(pTHX_ regnode *text_node, I32 *c1, I32 *c2)
3277{
3278 /* This sets up a relatively quick check for the initial part of what must
3279 * match after a CURLY-type operation condition (the "B" in A*B), where B
3280 * starts with an EXACTish node, <text_node>. If this check is not met,
3281 * the caller knows that it should continue with the loop. If the check is
3282 * met, the caller must see if all of B is met, before making the decision.
3283 *
3284 * This function sets *<c1> and *<c2> to be the first code point of B. If
3285 * there are two possible such code points (as when the text_node is
3286 * folded), *<c2> is set to the second. If there are more than two (which
3287 * happens for some folds), or there is some other complication, these
3288 * parameters are set to CHRTEST_VOID, to indicate not to do a quick check:
3289 * just try all of B after every time through the loop.
3290 *
3291 * If the routine determines that there is no possible way for there to be
3292 * a match, it returns FALSE.
3293 * */
3294
3295 const bool utf8_target = PL_reg_match_utf8;
3296 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3297 dVAR;
3298
3299 /* First byte from the EXACTish node */
3300 U8 *pat_byte = (U8*)STRING(text_node);
3301
3302 if (! UTF_PATTERN) { /* Not UTF-8: the code point is the byte */
3303 *c1 = *pat_byte;
3304 if (OP(text_node) == EXACT) {
3305 *c2 = *c1;
3306 }
3307 else if (utf8_target
3308 && HAS_NONLATIN1_FOLD_CLOSURE(*c1)
3309 && (OP(text_node) != EXACTFA || ! isASCII(*c1)))
3310 {
3311 /* Here, there could be something above Latin1 in the target which
3312 * folds to this character in the pattern, which means there are
3313 * more than two possible beginnings of B. */
3314 *c1 = *c2 = CHRTEST_VOID;
3315 }
3316 else { /* Here nothing above Latin1 can fold to the pattern character */
3317 switch (OP(text_node)) {
3318
3319 case EXACTFL: /* /l rules */
3320 *c2 = PL_fold_locale[*c1];
3321 break;
3322
3323 case EXACTFU_SS: /* This requires special handling: Don't
3324 shortcut */
3325 *c1 = *c2 = CHRTEST_VOID;
3326 break;
3327
3328 case EXACTF:
3329 if (! utf8_target) { /* /d rules */
3330 *c2 = PL_fold[*c1];
3331 break;
3332 }
3333 /* FALLTHROUGH */
3334 /* /u rules for all these. This happens to work for
3335 * EXACTFA in the ASCII range as nothing in Latin1 folds to
3336 * ASCII */
3337 case EXACTFA:
3338 case EXACTFU_TRICKYFOLD:
3339 case EXACTFU:
3340 *c2 = PL_fold_latin1[*c1];
3341 break;
3342
3343 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3344 }
3345 }
3346 }
3347 else { /* UTF_PATTERN */
3348 if (OP(text_node) == EXACT) {
3349 *c2 = *c1 = utf8n_to_uvchr(pat_byte, UTF8_MAXBYTES, 0, uniflags);
3350 if (*c1 < 0) { /* Overflowed what we can handle */
3351 *c1 = *c2 = CHRTEST_VOID;
3352 }
3353 else if (*c1 > 255 && ! utf8_target) {
3354 return FALSE; /* Can't possibly match */
3355 }
3356 }
3357 else {
3358 if (UTF8_IS_ABOVE_LATIN1(*pat_byte)) {
3359
3360 /* A multi-character fold is complicated, probably has more
3361 * than two possibilities */
3362 if (is_MULTI_CHAR_FOLD_utf8_safe((char*) pat_byte,
3363 (char*) pat_byte + STR_LEN(text_node)))
3364 {
3365 *c1 = *c2 = CHRTEST_VOID;
3366 }
3367 else { /* Not a multi-char fold */
3368
3369 /* Load the folds hash, if not already done */
3370 SV** listp;
3371 if (! PL_utf8_foldclosures) {
3372 if (! PL_utf8_tofold) {
3373 U8 dummy[UTF8_MAXBYTES+1];
3374 STRLEN dummy_len;
3375
3376 /* Force loading this by folding an above-Latin1
3377 * char */
3378 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
3379 assert(PL_utf8_tofold); /* Verify that worked */
3380 }
3381 PL_utf8_foldclosures =
3382 _swash_inversion_hash(PL_utf8_tofold);
3383 }
3384
3385 /* The fold closures data structure is a hash with the keys
3386 * being every character that is folded to, like 'k', and
3387 * the values each an array of everything that folds to its
3388 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
3389 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3390 (char *) pat_byte,
3391 UTF8SKIP(pat_byte),
3392 FALSE))))
3393 {
3394 /* Not found in the hash, therefore there are no folds
3395 * containing it, so there is only a single char
3396 * possible for beginning B */
3397 *c2 = *c1 = utf8n_to_uvchr(pat_byte, STR_LEN(text_node),
3398 0, uniflags);
3399 if (*c1 < 0) { /* Overflowed what we can handle */
3400 *c1 = *c2 = CHRTEST_VOID;
3401 }
3402 }
3403 else {
3404 AV* list = (AV*) *listp;
3405 if (av_len(list) != 1) { /* If there aren't exactly
3406 two folds to this, have
3407 to test B completely */
3408 *c1 = *c2 = CHRTEST_VOID;
3409 }
3410 else { /* There are two. Set *c1 and *c2 to them */
3411 SV** c_p = av_fetch(list, 0, FALSE);
3412 if (c_p == NULL) {
3413 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3414 }
3415 *c1 = SvUV(*c_p);
3416 c_p = av_fetch(list, 1, FALSE);
3417 if (c_p == NULL) {
3418 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3419 }
3420 *c2 = SvUV(*c_p);
3421 }
3422 }
3423 }
3424 }
3425 else {
3426 /* Get the character represented by the UTF-8-encoded byte */
3427 U8 c = (UTF8_IS_INVARIANT(*pat_byte))
3428 ? *pat_byte
3429 : TWO_BYTE_UTF8_TO_UNI(*pat_byte, *(pat_byte+1));
3430
3431 if (HAS_NONLATIN1_FOLD_CLOSURE(c)
3432 && (OP(text_node) != EXACTFA || ! isASCII(c)))
3433 { /* Something above Latin1 folds to this; hence there are
3434 more than 2 possibilities for B to begin with */
3435 *c1 = *c2 = CHRTEST_VOID;
3436 }
3437 else {
3438 *c1 = c;
3439 *c2 = (OP(text_node) == EXACTFL)
3440 ? PL_fold_locale[*c1]
3441 : PL_fold_latin1[*c1];
3442 }
3443 }
3444 }
3445 }
2f554ef7 3446
c74f6de9
KW
3447 return TRUE;
3448}
2f554ef7 3449
f73aaa43
DM
3450/* returns -1 on failure, $+[0] on success */
3451STATIC I32
3452S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3453{
a35a87e7 3454#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3455 dMY_CXT;
3456#endif
27da23d5 3457 dVAR;
eb578fdb 3458 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3459 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02
NC
3460 REGEXP *rex_sv = reginfo->prog;
3461 regexp *rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 3462 RXi_GET_DECL(rex,rexi);
2f554ef7 3463 I32 oldsave;
5d9a96ca 3464 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3465 regmatch_state *st;
5d9a96ca 3466 /* cache heavy used fields of st in registers */
eb578fdb
KW
3467 regnode *scan;
3468 regnode *next;
3469 U32 n = 0; /* general value; init to avoid compiler warning */
3470 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3471 char *locinput = startpos;
4d5016e5 3472 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3473 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3474
b69b0499 3475 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3476 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3477 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3478 const U32 max_nochange_depth =
3479 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3480 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3481 regmatch_state *yes_state = NULL; /* state to pop to on success of
3482 subpattern */
e2e6a0f1
YO
3483 /* mark_state piggy backs on the yes_state logic so that when we unwind
3484 the stack on success we can update the mark_state as we go */
3485 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3486 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3487 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3488 U32 state_num;
5d458dd8
YO
3489 bool no_final = 0; /* prevent failure from backtracking? */
3490 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3491 char *startpoint = locinput;
5d458dd8
YO
3492 SV *popmark = NULL; /* are we looking for a mark? */
3493 SV *sv_commit = NULL; /* last mark name seen in failure */
3494 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3495 during a successful match */
5d458dd8
YO
3496 U32 lastopen = 0; /* last open we saw */
3497 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3498 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3499 /* these three flags are set by various ops to signal information to
3500 * the very next op. They have a useful lifetime of exactly one loop
3501 * iteration, and are not preserved or restored by state pushes/pops
3502 */
3503 bool sw = 0; /* the condition value in (?(cond)a|b) */
3504 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3505 int logical = 0; /* the following EVAL is:
3506 0: (?{...})
3507 1: (?(?{...})X|Y)
3508 2: (??{...})
3509 or the following IFMATCH/UNLESSM is:
3510 false: plain (?=foo)
3511 true: used as a condition: (?(?=foo))
3512 */
81ed78b2
DM
3513 PAD* last_pad = NULL;
3514 dMULTICALL;
3515 I32 gimme = G_SCALAR;
3516 CV *caller_cv = NULL; /* who called us */
3517 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3518 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3519
95b24440 3520#ifdef DEBUGGING
e68ec53f 3521 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3522#endif
3523
81ed78b2
DM
3524 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3525 multicall_oldcatch = 0;
3526 multicall_cv = NULL;
3527 cx = NULL;
4f8dbb2d
JL
3528 PERL_UNUSED_VAR(multicall_cop);
3529 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3530
3531
7918f24d
NC
3532 PERL_ARGS_ASSERT_REGMATCH;
3533
3b57cd43 3534 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3535 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3536 }));
5d9a96ca
DM
3537 /* on first ever call to regmatch, allocate first slab */
3538 if (!PL_regmatch_slab) {
3539 Newx(PL_regmatch_slab, 1, regmatch_slab);
3540 PL_regmatch_slab->prev = NULL;
3541 PL_regmatch_slab->next = NULL;
86545054 3542 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3543 }
3544
2f554ef7
DM
3545 oldsave = PL_savestack_ix;
3546 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3547 SAVEVPTR(PL_regmatch_slab);
3548 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3549
3550 /* grab next free state slot */
3551 st = ++PL_regmatch_state;
86545054 3552 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3553 st = PL_regmatch_state = S_push_slab(aTHX);
3554
d6a28714 3555 /* Note that nextchr is a byte even in UTF */
7016d6eb 3556 SET_nextchr;
d6a28714
JH
3557 scan = prog;
3558 while (scan != NULL) {
8ba1375e 3559
a3621e74 3560 DEBUG_EXECUTE_r( {
6136c704 3561 SV * const prop = sv_newmortal();
1de06328 3562 regnode *rnext=regnext(scan);
f2ed9b32 3563 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3564 regprop(rex, prop, scan);
07be1b83
YO
3565
3566 PerlIO_printf(Perl_debug_log,
3567 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3568 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3569 SvPVX_const(prop),
1de06328 3570 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3571 0 : (IV)(rnext - rexi->program));
2a782b5b 3572 });
d6a28714
JH
3573
3574 next = scan + NEXT_OFF(scan);
3575 if (next == scan)
3576 next = NULL;
40a82448 3577 state_num = OP(scan);
d6a28714 3578
40a82448 3579 reenter_switch:
34a81e2b 3580
7016d6eb 3581 SET_nextchr;
bf798dc4 3582
40a82448 3583 switch (state_num) {
3c0563b9 3584 case BOL: /* /^../ */
7fba1cd6 3585 if (locinput == PL_bostr)
d6a28714 3586 {
3b0527fe 3587 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3588 break;
3589 }
d6a28714 3590 sayNO;
3c0563b9
DM
3591
3592 case MBOL: /* /^../m */
12d33761 3593 if (locinput == PL_bostr ||
7016d6eb 3594 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3595 {
b8c5462f
JH
3596 break;
3597 }
d6a28714 3598 sayNO;
3c0563b9
DM
3599
3600 case SBOL: /* /^../s */
c2a73568 3601 if (locinput == PL_bostr)
b8c5462f 3602 break;
d6a28714 3603 sayNO;
3c0563b9
DM
3604
3605 case GPOS: /* \G */
3b0527fe 3606 if (locinput == reginfo->ganch)
d6a28714
JH
3607 break;
3608 sayNO;
ee9b8eae 3609
3c0563b9 3610 case KEEPS: /* \K */
ee9b8eae 3611 /* update the startpoint */
b93070ed 3612 st->u.keeper.val = rex->offs[0].start;
b93070ed 3613 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3614 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
ee9b8eae
YO
3615 /*NOT-REACHED*/
3616 case KEEPS_next_fail:
3617 /* rollback the start point change */
b93070ed 3618 rex->offs[0].start = st->u.keeper.val;
ee9b8eae
YO
3619 sayNO_SILENT;
3620 /*NOT-REACHED*/
3c0563b9
DM
3621
3622 case EOL: /* /..$/ */
d6a28714 3623 goto seol;
3c0563b9
DM
3624
3625 case MEOL: /* /..$/m */
7016d6eb 3626 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3627 sayNO;
b8c5462f 3628 break;
3c0563b9
DM
3629
3630 case SEOL: /* /..$/s */
d6a28714 3631 seol:
7016d6eb 3632 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3633 sayNO;
d6a28714 3634 if (PL_regeol - locinput > 1)
b8c5462f 3635 sayNO;
b8c5462f 3636 break;
3c0563b9
DM
3637
3638 case EOS: /* \z */
7016d6eb 3639 if (!NEXTCHR_IS_EOS)
b8c5462f 3640 sayNO;
d6a28714 3641 break;
3c0563b9
DM
3642
3643 case SANY: /* /./s */
7016d6eb 3644 if (NEXTCHR_IS_EOS)
4633a7c4 3645 sayNO;
28b98f76 3646 goto increment_locinput;
3c0563b9
DM
3647
3648 case CANY: /* \C */
7016d6eb 3649 if (NEXTCHR_IS_EOS)
f33976b4 3650 sayNO;
3640db6b 3651 locinput++;
a0d0e21e 3652 break;
3c0563b9
DM
3653
3654 case REG_ANY: /* /./ */
7016d6eb 3655 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3656 sayNO;
28b98f76
DM
3657 goto increment_locinput;
3658
166ba7cd
DM
3659
3660#undef ST
3661#define ST st->u.trie
3c0563b9 3662 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3663 /* In this case the charclass data is available inline so
3664 we can fail fast without a lot of extra overhead.
3665 */
7016d6eb 3666 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3667 DEBUG_EXECUTE_r(
3668 PerlIO_printf(Perl_debug_log,
3669 "%*s %sfailed to match trie start class...%s\n",
3670 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3671 );
3672 sayNO_SILENT;
118e2215 3673 assert(0); /* NOTREACHED */
786e8c11
YO
3674 }
3675 /* FALL THROUGH */
3c0563b9 3676 case TRIE: /* (ab|cd) */
2e64971a
DM
3677 /* the basic plan of execution of the trie is:
3678 * At the beginning, run though all the states, and
3679 * find the longest-matching word. Also remember the position
3680 * of the shortest matching word. For example, this pattern:
3681 * 1 2 3 4 5
3682 * ab|a|x|abcd|abc
3683 * when matched against the string "abcde", will generate
3684 * accept states for all words except 3, with the longest
895cc420 3685 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3686 * the position being after char 1 of the string).
3687 *
3688 * Then for each matching word, in word order (i.e. 1,2,4,5),
3689 * we run the remainder of the pattern; on each try setting
3690 * the current position to the character following the word,
3691 * returning to try the next word on failure.
3692 *
3693 * We avoid having to build a list of words at runtime by
3694 * using a compile-time structure, wordinfo[].prev, which
3695 * gives, for each word, the previous accepting word (if any).
3696 * In the case above it would contain the mappings 1->2, 2->0,
3697 * 3->0, 4->5, 5->1. We can use this table to generate, from
3698 * the longest word (4 above), a list of all words, by
3699 * following the list of prev pointers; this gives us the
3700 * unordered list 4,5,1,2. Then given the current word we have
3701 * just tried, we can go through the list and find the
3702 * next-biggest word to try (so if we just failed on word 2,
3703 * the next in the list is 4).
3704 *
3705 * Since at runtime we don't record the matching position in
3706 * the string for each word, we have to work that out for
3707 * each word we're about to process. The wordinfo table holds
3708 * the character length of each word; given that we recorded
3709 * at the start: the position of the shortest word and its
3710 * length in chars, we just need to move the pointer the
3711 * difference between the two char lengths. Depending on
3712 * Unicode status and folding, that's cheap or expensive.
3713 *
3714 * This algorithm is optimised for the case where are only a
3715 * small number of accept states, i.e. 0,1, or maybe 2.
3716 * With lots of accepts states, and having to try all of them,
3717 * it becomes quadratic on number of accept states to find all
3718 * the next words.
3719 */
3720
3dab1dad 3721 {
07be1b83 3722 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3723 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3724
3725 /* what trie are we using right now */
be8e71aa 3726 reg_trie_data * const trie
f8fc2ecf 3727 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3728 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3729 U32 state = trie->startstate;
166ba7cd 3730
7016d6eb
DM
3731 if ( trie->bitmap
3732 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3733 {
3dab1dad
YO
3734 if (trie->states[ state ].wordnum) {
3735 DEBUG_EXECUTE_r(
3736 PerlIO_printf(Perl_debug_log,
3737 "%*s %smatched empty string...%s\n",
5bc10b2c 3738 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3739 );
20dbff7c
YO
3740 if (!trie->jump)
3741 break;
3dab1dad
YO
3742 } else {
3743 DEBUG_EXECUTE_r(
3744 PerlIO_printf(Perl_debug_log,
786e8c11 3745 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3746 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3747 );
3748 sayNO_SILENT;
3749 }
3750 }
166ba7cd 3751
786e8c11
YO
3752 {
3753 U8 *uc = ( U8* )locinput;
3754
3755 STRLEN len = 0;
3756 STRLEN foldlen = 0;
3757 U8 *uscan = (U8*)NULL;
786e8c11 3758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3759 U32 charcount = 0; /* how many input chars we have matched */
3760 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3761
786e8c11 3762 ST.jump = trie->jump;
786e8c11 3763 ST.me = scan;
2e64971a
DM
3764 ST.firstpos = NULL;
3765 ST.longfold = FALSE; /* char longer if folded => it's harder */
3766 ST.nextword = 0;
3767
3768 /* fully traverse the TRIE; note the position of the
3769 shortest accept state and the wordnum of the longest
3770 accept state */
07be1b83 3771
a3621e74 3772 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3773 U32 base = trie->states[ state ].trans.base;
f9f4320a 3774 UV uvc = 0;
acb909b4 3775 U16 charid = 0;
2e64971a
DM
3776 U16 wordnum;
3777 wordnum = trie->states[ state ].wordnum;
3778
3779 if (wordnum) { /* it's an accept state */
3780 if (!accepted) {
3781 accepted = 1;
3782 /* record first match position */
3783 if (ST.longfold) {
3784 ST.firstpos = (U8*)locinput;
3785 ST.firstchars = 0;
5b47454d 3786 }
2e64971a
DM
3787 else {
3788 ST.firstpos = uc;
3789 ST.firstchars = charcount;
3790 }
3791 }
3792 if (!ST.nextword || wordnum < ST.nextword)
3793 ST.nextword = wordnum;
3794 ST.topword = wordnum;
786e8c11 3795 }
a3621e74 3796
07be1b83 3797 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3798 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3799 PerlIO_printf( Perl_debug_log,
2e64971a 3800 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3801 2+depth * 2, "", PL_colors[4],
2e64971a 3802 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3803 });
a3621e74 3804
2e64971a 3805 /* read a char and goto next state */
7016d6eb 3806 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3807 I32 offset;
55eed653
NC
3808 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3809 uscan, len, uvc, charid, foldlen,
3810 foldbuf, uniflags);
2e64971a
DM
3811 charcount++;
3812 if (foldlen>0)
3813 ST.longfold = TRUE;
5b47454d 3814 if (charid &&
6dd2be57
DM
3815 ( ((offset =
3816 base + charid - 1 - trie->uniquecharcount)) >= 0)
3817
3818 && ((U32)offset < trie->lasttrans)
3819 && trie->trans[offset].check == state)
5b47454d 3820 {
6dd2be57 3821 state = trie->trans[offset].next;
5b47454d
DM
3822 }
3823 else {
3824 state = 0;
3825 }
3826 uc += len;
3827
3828 }
3829 else {
a3621e74
YO
3830 state = 0;
3831 }
3832 DEBUG_TRIE_EXECUTE_r(
e4584336 3833 PerlIO_printf( Perl_debug_log,
786e8c11 3834 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3835 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3836 );
3837 }
2e64971a 3838 if (!accepted)
a3621e74 3839 sayNO;
a3621e74 3840
2e64971a
DM
3841 /* calculate total number of accept states */
3842 {
3843 U16 w = ST.topword;
3844 accepted = 0;
3845 while (w) {
3846 w = trie->wordinfo[w].prev;
3847 accepted++;
3848 }
3849 ST.accepted = accepted;
3850 }
3851
166ba7cd
DM
3852 DEBUG_EXECUTE_r(
3853 PerlIO_printf( Perl_debug_log,
3854 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3855 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3856 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3857 );
2e64971a 3858 goto trie_first_try; /* jump into the fail handler */
786e8c11 3859 }}
118e2215 3860 assert(0); /* NOTREACHED */
2e64971a
DM
3861
3862 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3863 {
3864 U8 *uc;
fae667d5
YO
3865 if ( ST.jump) {
3866 REGCP_UNWIND(ST.cp);
a8d1f4b4 3867 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3868 }
2e64971a
DM
3869 if (!--ST.accepted) {
3870 DEBUG_EXECUTE_r({
3871 PerlIO_printf( Perl_debug_log,
3872 "%*s %sTRIE failed...%s\n",
3873 REPORT_CODE_OFF+depth*2, "",
3874 PL_colors[4],
3875 PL_colors[5] );
3876 });
3877 sayNO_SILENT;
3878 }
3879 {
3880 /* Find next-highest word to process. Note that this code
3881 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3882 U16 min = 0;
3883 U16 word;
3884 U16 const nextword = ST.nextword;
3885 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3886 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3887 for (word=ST.topword; word; word=wordinfo[word].prev) {
3888 if (word > nextword && (!min || word < min))
3889 min = word;
3890 }
3891 ST.nextword = min;
3892 }
3893
fae667d5 3894 trie_first_try:
5d458dd8
YO
3895 if (do_cutgroup) {
3896 do_cutgroup = 0;
3897 no_final = 0;
3898 }
fae667d5
YO
3899
3900 if ( ST.jump) {
b93070ed 3901 ST.lastparen = rex->lastparen;
f6033a9d 3902 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3903 REGCP_SET(ST.cp);
2e64971a 3904 }
a3621e74 3905
2e64971a 3906 /* find start char of end of current word */
166ba7cd 3907 {
2e64971a 3908 U32 chars; /* how many chars to skip */
2e64971a
DM
3909 reg_trie_data * const trie
3910 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3911
3912 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3913 >= ST.firstchars);
3914 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3915 - ST.firstchars;
a059a757 3916 uc = ST.firstpos;
2e64971a
DM
3917
3918 if (ST.longfold) {
3919 /* the hard option - fold each char in turn and find
3920 * its folded length (which may be different */
3921 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3922 STRLEN foldlen;
3923 STRLEN len;
d9a396a3 3924 UV uvc;
2e64971a
DM
3925 U8 *uscan;
3926
3927 while (chars) {
f2ed9b32 3928 if (utf8_target) {
2e64971a
DM
3929 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3930 uniflags);
3931 uc += len;
3932 }
3933 else {
3934 uvc = *uc;
3935 uc++;
3936 }
3937 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3938 uscan = foldbuf;
3939 while (foldlen) {
3940 if (!--chars)
3941 break;
3942 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3943 uniflags);
3944 uscan += len;
3945 foldlen -= len;
3946 }
3947 }
a3621e74 3948 }
2e64971a 3949 else {
f2ed9b32 3950 if (utf8_target)
2e64971a
DM
3951 while (chars--)
3952 uc += UTF8SKIP(uc);
3953 else
3954 uc += chars;
3955 }
2e64971a 3956 }
166ba7cd 3957
6603fe3e
DM
3958 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3959 ? ST.jump[ST.nextword]
3960 : NEXT_OFF(ST.me));
166ba7cd 3961
2e64971a
DM
3962 DEBUG_EXECUTE_r({
3963 PerlIO_printf( Perl_debug_log,
3964 "%*s %sTRIE matched word #%d, continuing%s\n",
3965 REPORT_CODE_OFF+depth*2, "",
3966 PL_colors[4],
3967 ST.nextword,
3968 PL_colors[5]
3969 );
3970 });
3971
3972 if (ST.accepted > 1 || has_cutgroup) {
a059a757 3973 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 3974 assert(0); /* NOTREACHED */
166ba7cd 3975 }
2e64971a
DM
3976 /* only one choice left - just continue */
3977 DEBUG_EXECUTE_r({
3978 AV *const trie_words
3979 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3980 SV ** const tmp = av_fetch( trie_words,
3981 ST.nextword-1, 0 );
3982 SV *sv= tmp ? sv_newmortal() : NULL;
3983
3984 PerlIO_printf( Perl_debug_log,
3985 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3986 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3987 ST.nextword,
3988 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3989 PL_colors[0], PL_colors[1],
c89df6cf 3990 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
3991 )
3992 : "not compiled under -Dr",
3993 PL_colors[5] );
3994 });
3995
a059a757 3996 locinput = (char*)uc;
2e64971a 3997 continue; /* execute rest of RE */
118e2215 3998 assert(0); /* NOTREACHED */
a059a757 3999 }
166ba7cd
DM
4000#undef ST
4001
3c0563b9 4002 case EXACT: { /* /abc/ */
95b24440 4003 char *s = STRING(scan);
24d3c4a9 4004 ln = STR_LEN(scan);
f2ed9b32 4005 if (utf8_target != UTF_PATTERN) {
bc517b45 4006 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4007 char *l = locinput;
24d3c4a9 4008 const char * const e = s + ln;
a72c7584 4009
f2ed9b32 4010 if (utf8_target) {
e6a3850e
KW
4011 /* The target is utf8, the pattern is not utf8.
4012 * Above-Latin1 code points can't match the pattern;
4013 * invariants match exactly, and the other Latin1 ones need
4014 * to be downgraded to a single byte in order to do the
4015 * comparison. (If we could be confident that the target
4016 * is not malformed, this could be refactored to have fewer
4017 * tests by just assuming that if the first bytes match, it
4018 * is an invariant, but there are tests in the test suite
4019 * dealing with (??{...}) which violate this) */
1aa99e6b
IH
4020 while (s < e) {
4021 if (l >= PL_regeol)
5ff6fc6d 4022 sayNO;
e6a3850e
KW
4023 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4024 sayNO;
4025 }
4026 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4027 if (*l != *s) {
4028 sayNO;
4029 }
4030 l++;
4031 }
4032 else {
4033 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4034 sayNO;
4035 }
4036 l += 2;
4037 }
4038 s++;
1aa99e6b 4039 }
5ff6fc6d
JH
4040 }
4041 else {
4042 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4043 while (s < e) {
e6a3850e
KW
4044 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4045 {
4046 sayNO;
4047 }
4048 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4049 if (*s != *l) {
4050 sayNO;
4051 }
4052 s++;
4053 }
4054 else {
4055 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4056 sayNO;
4057 }
4058 s += 2;
4059 }
4060 l++;
1aa99e6b 4061 }
5ff6fc6d 4062 }
1aa99e6b 4063 locinput = l;
1aa99e6b
IH
4064 break;
4065 }
bc517b45 4066 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
4067 /* Inline the first character, for speed. */
4068 if (UCHARAT(s) != nextchr)
4069 sayNO;
24d3c4a9 4070 if (PL_regeol - locinput < ln)
d6a28714 4071 sayNO;
24d3c4a9 4072 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 4073 sayNO;
24d3c4a9 4074 locinput += ln;
d6a28714 4075 break;
95b24440 4076 }
7016d6eb 4077
3c0563b9 4078 case EXACTFL: { /* /abc/il */
a932d541 4079 re_fold_t folder;
9a5a5549
KW
4080 const U8 * fold_array;
4081 const char * s;
d513472c 4082 U32 fold_utf8_flags;
9a5a5549 4083
b8c5462f 4084 PL_reg_flags |= RF_tainted;
9a5a5549
KW
4085 folder = foldEQ_locale;
4086 fold_array = PL_fold_locale;
17580e7a 4087 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4088 goto do_exactf;
4089
3c0563b9
DM
4090 case EXACTFU_SS: /* /\x{df}/iu */
4091 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4092 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4093 folder = foldEQ_latin1;
4094 fold_array = PL_fold_latin1;
2daa8fee 4095 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4096 goto do_exactf;
4097
3c0563b9 4098 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4099 folder = foldEQ_latin1;
4100 fold_array = PL_fold_latin1;
57014d77 4101 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4102 goto do_exactf;
4103
3c0563b9 4104 case EXACTF: /* /abc/i */
9a5a5549
KW
4105 folder = foldEQ;
4106 fold_array = PL_fold;
62bf7766 4107 fold_utf8_flags = 0;
9a5a5549
KW
4108
4109 do_exactf:
4110 s = STRING(scan);
24d3c4a9 4111 ln = STR_LEN(scan);
d6a28714 4112
3c760661
KW
4113 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4114 /* Either target or the pattern are utf8, or has the issue where
4115 * the fold lengths may differ. */
be8e71aa 4116 const char * const l = locinput;
d07ddd77 4117 char *e = PL_regeol;
bc517b45 4118
d513472c 4119 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 4120 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4121 {
4122 sayNO;
5486206c 4123 }
d07ddd77 4124 locinput = e;
d07ddd77 4125 break;
a0ed51b3 4126 }
d6a28714 4127
0a138b74 4128 /* Neither the target nor the pattern are utf8 */
d6a28714 4129 if (UCHARAT(s) != nextchr &&
9a5a5549
KW
4130 UCHARAT(s) != fold_array[nextchr])
4131 {
a0ed51b3 4132 sayNO;
9a5a5549 4133 }
24d3c4a9 4134 if (PL_regeol - locinput < ln)
b8c5462f 4135 sayNO;
9a5a5549 4136 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4137 sayNO;
24d3c4a9 4138 locinput += ln;
a0d0e21e 4139 break;
9a5a5549 4140 }
63ac0dad
KW
4141
4142 /* XXX Could improve efficiency by separating these all out using a
4143 * macro or in-line function. At that point regcomp.c would no longer
4144 * have to set the FLAGS fields of these */
3c0563b9
DM
4145 case BOUNDL: /* /\b/l */
4146 case NBOUNDL: /* /\B/l */
b2680017
YO
4147 PL_reg_flags |= RF_tainted;
4148 /* FALL THROUGH */
3c0563b9
DM
4149 case BOUND: /* /\b/ */
4150 case BOUNDU: /* /\b/u */
4151 case BOUNDA: /* /\b/a */
4152 case NBOUND: /* /\B/ */
4153 case NBOUNDU: /* /\B/u */
4154 case NBOUNDA: /* /\B/a */
b2680017 4155 /* was last char in word? */
f2e96b5d
KW
4156 if (utf8_target
4157 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4158 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4159 {
b2680017
YO
4160 if (locinput == PL_bostr)
4161 ln = '\n';
4162 else {
4163 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4164
4165 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4166 }
63ac0dad 4167 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017 4168 ln = isALNUM_uni(ln);
7016d6eb
DM
4169 if (NEXTCHR_IS_EOS)
4170 n = 0;
4171 else {
4172 LOAD_UTF8_CHARCLASS_ALNUM();
4173 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4174 utf8_target);
4175 }
b2680017
YO
4176 }
4177 else {
4178 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
7016d6eb 4179 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
b2680017
YO
4180 }
4181 }
4182 else {
cfaf538b
KW
4183
4184 /* Here the string isn't utf8, or is utf8 and only ascii
4185 * characters are to match \w. In the latter case looking at
4186 * the byte just prior to the current one may be just the final
4187 * byte of a multi-byte character. This is ok. There are two
4188 * cases:
4189 * 1) it is a single byte character, and then the test is doing
4190 * just what it's supposed to.
4191 * 2) it is a multi-byte character, in which case the final
4192 * byte is never mistakable for ASCII, and so the test
4193 * will say it is not a word character, which is the
4194 * correct answer. */
b2680017
YO
4195 ln = (locinput != PL_bostr) ?
4196 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4197 switch (FLAGS(scan)) {
4198 case REGEX_UNICODE_CHARSET:
4199 ln = isWORDCHAR_L1(ln);
7016d6eb 4200 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4201 break;
4202 case REGEX_LOCALE_CHARSET:
4203 ln = isALNUM_LC(ln);
7016d6eb 4204 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
63ac0dad
KW
4205 break;
4206 case REGEX_DEPENDS_CHARSET:
4207 ln = isALNUM(ln);
7016d6eb 4208 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
63ac0dad 4209 break;
cfaf538b 4210 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4211 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4212 ln = isWORDCHAR_A(ln);
7016d6eb 4213 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4214 break;
63ac0dad
KW
4215 default:
4216 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4217 break;
b2680017
YO
4218 }
4219 }
63ac0dad
KW
4220 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4221 * regcomp.sym */
4222 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4223 sayNO;
4224 break;
3c0563b9 4225
3c0563b9 4226 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4227 if (NEXTCHR_IS_EOS)
4228 sayNO;
e0193e47 4229 if (utf8_target) {
9e55ce06 4230 STRLEN inclasslen = PL_regeol - locinput;
f2ed9b32 4231 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
09b08e9b 4232 sayNO;
b32d7d3e 4233 locinput += inclasslen;
e0f9d4a8 4234 break;
ffc61ed2
JH
4235 }
4236 else {
20ed0b26 4237 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4238 sayNO;
3640db6b 4239 locinput++;
e0f9d4a8
JH
4240 break;
4241 }
b8c5462f 4242 break;
3c0563b9
DM
4243
4244 /* Special char classes: \d, \w etc.
4245 * The defines start on line 166 or so */
ee9a90b8
KW
4246 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4247 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4248 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 4249 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 4250 alnum, "a");
ee9a90b8
KW
4251
4252 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4253 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4254 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 4255 SPACEA, NSPACEA, isSPACE_A,
779d7b58 4256 space, " ");
ee9a90b8
KW
4257
4258 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4259 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 4260 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 4261 digit, "0");
20d0b1e9 4262
3c0563b9 4263 case POSIXA: /* /[[:ascii:]]/ etc */
7016d6eb 4264 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4265 sayNO;
4266 }
4267 /* Matched a utf8-invariant, so don't have to worry about utf8 */
3640db6b 4268 locinput++;
0658cdde 4269 break;
3c0563b9
DM
4270
4271 case NPOSIXA: /* /[^[:ascii:]]/ etc */
7016d6eb 4272 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4273 sayNO;
4274 }
28b98f76 4275 goto increment_locinput;
0658cdde 4276
37e2e78e
KW
4277 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4278 a Unicode extended Grapheme Cluster */
4279 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4280 extended Grapheme Cluster is:
4281
4282 CR LF
4283 | Prepend* Begin Extend*
4284 | .
4285
1e958ea9
KW
4286 Begin is: ( Special_Begin | ! Control )
4287 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4288 Extend is: ( Grapheme_Extend | Spacing_Mark )
4289 Control is: [ GCB_Control CR LF ]
4290 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4291
27d4fc33
KW
4292 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4293 we can rewrite
4294
4295 Begin is ( Regular_Begin + Special Begin )
4296
4297 It turns out that 98.4% of all Unicode code points match
4298 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4299 the previous implementation for almost all Unicode code points.
27d4fc33 4300
37e2e78e
KW
4301 There is a subtlety with Prepend* which showed up in testing.
4302 Note that the Begin, and only the Begin is required in:
4303 | Prepend* Begin Extend*
cc3b396d
KW
4304 Also, Begin contains '! Control'. A Prepend must be a
4305 '! Control', which means it must also be a Begin. What it
4306 comes down to is that if we match Prepend* and then find no
4307 suitable Begin afterwards, that if we backtrack the last
4308 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4309 */
4310
7016d6eb 4311 if (NEXTCHR_IS_EOS)
a0ed51b3 4312 sayNO;
f2ed9b32 4313 if (! utf8_target) {
37e2e78e
KW
4314
4315 /* Match either CR LF or '.', as all the other possibilities
4316 * require utf8 */
4317 locinput++; /* Match the . or CR */
cc3b396d
KW
4318 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4319 match the LF */
37e2e78e
KW
4320 && locinput < PL_regeol
4321 && UCHARAT(locinput) == '\n') locinput++;
4322 }
4323 else {
4324
4325 /* Utf8: See if is ( CR LF ); already know that locinput <
4326 * PL_regeol, so locinput+1 is in bounds */
7016d6eb
DM
4327 if ( nextchr == '\r' && locinput+1 < PL_regeol
4328 && UCHARAT(locinput + 1) == '\n')
4329 {
37e2e78e
KW
4330 locinput += 2;
4331 }
4332 else {
45fdf108
KW
4333 STRLEN len;
4334
37e2e78e
KW
4335 /* In case have to backtrack to beginning, then match '.' */
4336 char *starting = locinput;
4337
4338 /* In case have to backtrack the last prepend */
4339 char *previous_prepend = 0;
4340
4341 LOAD_UTF8_CHARCLASS_GCB();
4342
45fdf108
KW
4343 /* Match (prepend)* */
4344 while (locinput < PL_regeol
4345 && (len = is_GCB_Prepend_utf8(locinput)))
4346 {
4347 previous_prepend = locinput;
4348 locinput += len;
a1853d78 4349 }
37e2e78e
KW
4350
4351 /* As noted above, if we matched a prepend character, but
4352 * the next thing won't match, back off the last prepend we
4353 * matched, as it is guaranteed to match the begin */
4354 if (previous_prepend
4355 && (locinput >= PL_regeol
c101f46d
KW
4356 || (! swash_fetch(PL_utf8_X_regular_begin,
4357 (U8*)locinput, utf8_target)
45fdf108 4358 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
c101f46d 4359 )
37e2e78e
KW
4360 {
4361 locinput = previous_prepend;
4362 }
4363
4364 /* Note that here we know PL_regeol > locinput, as we
4365 * tested that upon input to this switch case, and if we
4366 * moved locinput forward, we tested the result just above
4367 * and it either passed, or we backed off so that it will
4368 * now pass */
11dfcd49
KW
4369 if (swash_fetch(PL_utf8_X_regular_begin,
4370 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4371 locinput += UTF8SKIP(locinput);
4372 }
45fdf108 4373 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
37e2e78e
KW
4374
4375 /* Here did not match the required 'Begin' in the
4376 * second term. So just match the very first
4377 * character, the '.' of the final term of the regex */
4378 locinput = starting + UTF8SKIP(starting);
27d4fc33 4379 goto exit_utf8;
37e2e78e
KW
4380 } else {
4381
11dfcd49
KW
4382 /* Here is a special begin. It can be composed of
4383 * several individual characters. One possibility is
4384 * RI+ */
45fdf108
KW
4385 if ((len = is_GCB_RI_utf8(locinput))) {
4386 locinput += len;
11dfcd49 4387 while (locinput < PL_regeol
45fdf108 4388 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4389 {
45fdf108 4390 locinput += len;
11dfcd49 4391 }
45fdf108
KW
4392 } else if ((len = is_GCB_T_utf8(locinput))) {
4393 /* Another possibility is T+ */
4394 locinput += len;
11dfcd49 4395 while (locinput < PL_regeol
45fdf108 4396 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4397 {
45fdf108 4398 locinput += len;
11dfcd49
KW
4399 }
4400 } else {
4401
4402 /* Here, neither RI+ nor T+; must be some other
4403 * Hangul. That means it is one of the others: L,
4404 * LV, LVT or V, and matches:
4405 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4406
4407 /* Match L* */
4408 while (locinput < PL_regeol
45fdf108 4409 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4410 {
45fdf108 4411 locinput += len;
11dfcd49 4412 }
37e2e78e 4413
11dfcd49
KW
4414 /* Here, have exhausted L*. If the next character
4415 * is not an LV, LVT nor V, it means we had to have
4416 * at least one L, so matches L+ in the original
4417 * equation, we have a complete hangul syllable.
4418 * Are done. */
4419
4420 if (locinput < PL_regeol
45fdf108 4421 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49
KW
4422 {
4423
4424 /* Otherwise keep going. Must be LV, LVT or V.
4425 * See if LVT */
4426 if (is_utf8_X_LVT((U8*)locinput)) {
4427 locinput += UTF8SKIP(locinput);
4428 } else {
4429
4430 /* Must be V or LV. Take it, then match
4431 * V* */
4432 locinput += UTF8SKIP(locinput);
4433 while (locinput < PL_regeol
45fdf108 4434 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4435 {
45fdf108 4436 locinput += len;
11dfcd49
KW
4437 }
4438 }
37e2e78e 4439
11dfcd49 4440 /* And any of LV, LVT, or V can be followed
45fdf108 4441 * by T* */
11dfcd49 4442 while (locinput < PL_regeol
45fdf108 4443 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4444 {
45fdf108 4445 locinput += len;
11dfcd49
KW
4446 }
4447 }
cd94d768 4448 }
11dfcd49 4449 }
37e2e78e 4450
11dfcd49
KW
4451 /* Match any extender */
4452 while (locinput < PL_regeol
4453 && swash_fetch(PL_utf8_X_extend,
4454 (U8*)locinput, utf8_target))
4455 {
4456 locinput += UTF8SKIP(locinput);
4457 }
37e2e78e 4458 }
27d4fc33 4459 exit_utf8:
37e2e78e
KW
4460 if (locinput > PL_regeol) sayNO;
4461 }
a0ed51b3 4462 break;
81714fb9 4463
3c0563b9 4464 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4465 { /* The capture buffer cases. The ones beginning with N for the
4466 named buffers just convert to the equivalent numbered and
4467 pretend they were called as the corresponding numbered buffer
4468 op. */
26ecd678
TC
4469 /* don't initialize these in the declaration, it makes C++
4470 unhappy */
81714fb9 4471 char *s;
ff1157ca 4472 char type;
8368298a
TC
4473 re_fold_t folder;
4474 const U8 *fold_array;
26ecd678 4475 UV utf8_fold_flags;
8368298a 4476
81714fb9 4477 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4478 folder = foldEQ_locale;
4479 fold_array = PL_fold_locale;
4480 type = REFFL;
17580e7a 4481 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4482 goto do_nref;
4483
3c0563b9 4484 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4485 folder = foldEQ_latin1;
4486 fold_array = PL_fold_latin1;
4487 type = REFFA;
4488 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4489 goto do_nref;
4490
3c0563b9 4491 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4492 folder = foldEQ_latin1;
4493 fold_array = PL_fold_latin1;
4494 type = REFFU;
d513472c 4495 utf8_fold_flags = 0;
d7ef4b73
KW
4496 goto do_nref;
4497
3c0563b9 4498 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4499 folder = foldEQ;
4500 fold_array = PL_fold;
4501 type = REFF;
d513472c 4502 utf8_fold_flags = 0;
d7ef4b73
KW
4503 goto do_nref;
4504
3c0563b9 4505 case NREF: /* /\g{name}/ */
d7ef4b73 4506 type = REF;
83d7b90b
KW
4507 folder = NULL;
4508 fold_array = NULL;
d513472c 4509 utf8_fold_flags = 0;
d7ef4b73
KW
4510 do_nref:
4511
4512 /* For the named back references, find the corresponding buffer
4513 * number */
0a4db386
YO
4514 n = reg_check_named_buff_matched(rex,scan);
4515
d7ef4b73 4516 if ( ! n ) {
81714fb9 4517 sayNO;
d7ef4b73
KW
4518 }
4519 goto do_nref_ref_common;
4520
3c0563b9 4521 case REFFL: /* /\1/il */
3280af22 4522 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4523 folder = foldEQ_locale;
4524 fold_array = PL_fold_locale;
17580e7a 4525 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4526 goto do_ref;
4527
3c0563b9 4528 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4529 folder = foldEQ_latin1;
4530 fold_array = PL_fold_latin1;
4531 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4532 goto do_ref;
4533
3c0563b9 4534 case REFFU: /* /\1/iu */
d7ef4b73
KW
4535 folder = foldEQ_latin1;
4536 fold_array = PL_fold_latin1;
d513472c 4537 utf8_fold_flags = 0;
d7ef4b73
KW
4538 goto do_ref;
4539
3c0563b9 4540 case REFF: /* /\1/i */
d7ef4b73
KW
4541 folder = foldEQ;
4542 fold_array = PL_fold;
d513472c 4543 utf8_fold_flags = 0;
83d7b90b 4544 goto do_ref;
d7ef4b73 4545
3c0563b9 4546 case REF: /* /\1/ */
83d7b90b
KW
4547 folder = NULL;
4548 fold_array = NULL;
d513472c 4549 utf8_fold_flags = 0;
83d7b90b 4550
d7ef4b73 4551 do_ref:
81714fb9 4552 type = OP(scan);
d7ef4b73
KW
4553 n = ARG(scan); /* which paren pair */
4554
4555 do_nref_ref_common:
b93070ed 4556 ln = rex->offs[n].start;
2c2d71f5 4557 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4558 if (rex->lastparen < n || ln == -1)
af3f8c16 4559 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4560 if (ln == rex->offs[n].end)
a0d0e21e 4561 break;
a0ed51b3 4562
24d3c4a9 4563 s = PL_bostr + ln;
d7ef4b73 4564 if (type != REF /* REF can do byte comparison */
2f65c56d 4565 && (utf8_target || type == REFFU))
d7ef4b73
KW
4566 { /* XXX handle REFFL better */
4567 char * limit = PL_regeol;
4568
4569 /* This call case insensitively compares the entire buffer
4570 * at s, with the current input starting at locinput, but
4571 * not going off the end given by PL_regeol, and returns in
4572 * limit upon success, how much of the current input was
4573 * matched */
b93070ed 4574 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4575 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4576 {
4577 sayNO;
a0ed51b3 4578 }
d7ef4b73 4579 locinput = limit;
a0ed51b3
LW
4580 break;
4581 }
4582
d7ef4b73 4583 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4584 if (!NEXTCHR_IS_EOS &&
4585 UCHARAT(s) != nextchr &&
81714fb9 4586 (type == REF ||
d7ef4b73 4587 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4588 sayNO;
b93070ed 4589 ln = rex->offs[n].end - ln;
24d3c4a9 4590 if (locinput + ln > PL_regeol)
4633a7c4 4591 sayNO;
81714fb9 4592 if (ln > 1 && (type == REF
24d3c4a9 4593 ? memNE(s, locinput, ln)
d7ef4b73 4594 : ! folder(s, locinput, ln)))
4633a7c4 4595 sayNO;
24d3c4a9 4596 locinput += ln;
a0d0e21e 4597 break;
81714fb9 4598 }
3c0563b9
DM
4599
4600 case NOTHING: /* null op; e.g. the 'nothing' following
4601 * the '*' in m{(a+|b)*}' */
4602 break;
4603 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4604 break;
3c0563b9
DM
4605
4606 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4607 break;
40a82448
DM
4608
4609#undef ST
4610#define ST st->u.eval
c277df42 4611 {
c277df42 4612 SV *ret;
d2f13c59 4613 REGEXP *re_sv;
6bda09f9 4614 regexp *re;
f8fc2ecf 4615 regexp_internal *rei;
1a147d38
YO
4616 regnode *startpoint;
4617
3c0563b9 4618 case GOSTART: /* (?R) */
e7707071
YO
4619 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4620 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4621 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4622 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4623 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4624 Perl_croak(aTHX_
4625 "Pattern subroutine nesting without pos change"
4626 " exceeded limit in regex");
6bda09f9
YO
4627 } else {
4628 nochange_depth = 0;
1a147d38 4629 }
288b8c02 4630 re_sv = rex_sv;
6bda09f9 4631 re = rex;
f8fc2ecf 4632 rei = rexi;
1a147d38 4633 if (OP(scan)==GOSUB) {
6bda09f9
YO
4634 startpoint = scan + ARG2L(scan);
4635 ST.close_paren = ARG(scan);
4636 } else {
f8fc2ecf 4637 startpoint = rei->program+1;
6bda09f9
YO
4638 ST.close_paren = 0;
4639 }
4640 goto eval_recurse_doit;
118e2215 4641 assert(0); /* NOTREACHED */
3c0563b9 4642
6bda09f9
YO
4643 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4644 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4645 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4646 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4647 } else {
4648 nochange_depth = 0;
4649 }
8e5e9ebe 4650 {
4aabdb9b 4651 /* execute the code in the {...} */
81ed78b2 4652
4aabdb9b 4653 dSP;
81ed78b2 4654 SV ** before;
1f4d1a1e 4655 OP * const oop = PL_op;
4aabdb9b 4656 COP * const ocurcop = PL_curcop;
81ed78b2 4657 OP *nop;
d80618d2 4658 char *saved_regeol = PL_regeol;
91332126 4659 struct re_save_state saved_state;
81ed78b2 4660 CV *newcv;
91332126 4661
74088413
DM
4662 /* save *all* paren positions */
4663 regcppush(rex, 0);
4664 REGCP_SET(runops_cp);
4665
6562f1c4 4666 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4667 * eval we would normally put it on the save stack, like with
4668 * save_re_context. However, re-evals have a weird scoping so we
4669 * can't just add ENTER/LEAVE here. With that, things like
4670 *
4671 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4672 *
4673 * would break, as they expect the localisation to be unwound
4674 * only when the re-engine backtracks through the bit that
4675 * localised it.
4676 *
4677 * What we do instead is just saving the state in a local c
4678 * variable.
4679 */
91332126 4680 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4681
d24ca0c5 4682 PL_reg_state.re_reparsing = FALSE;
91332126 4683
81ed78b2
DM
4684 if (!caller_cv)
4685 caller_cv = find_runcv(NULL);
4686
4aabdb9b 4687 n = ARG(scan);
81ed78b2 4688
b30fcab9 4689 if (rexi->data->what[n] == 'r') { /* code from an external qr */
81ed78b2 4690 newcv = ((struct regexp *)SvANY(
b30fcab9
DM
4691 (REGEXP*)(rexi->data->data[n])
4692 ))->qr_anoncv
81ed78b2
DM
4693 ;
4694 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4695 }
4696 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4697 newcv = caller_cv;
4698 nop = (OP*)rexi->data->data[n];
4699 assert(CvDEPTH(newcv));
68e2671b
DM
4700 }
4701 else {
d24ca0c5
DM
4702 /* literal with own CV */
4703 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4704 newcv = rex->qr_anoncv;
4705 nop = (OP*)rexi->data->data[n];
68e2671b 4706 }
81ed78b2 4707
0e458318
DM
4708 /* normally if we're about to execute code from the same
4709 * CV that we used previously, we just use the existing
4710 * CX stack entry. However, its possible that in the
4711 * meantime we may have backtracked, popped from the save
4712 * stack, and undone the SAVECOMPPAD(s) associated with
4713 * PUSH_MULTICALL; in which case PL_comppad no longer
4714 * points to newcv's pad. */
4715 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4716 {
4717 I32 depth = (newcv == caller_cv) ? 0 : 1;
4718 if (last_pushed_cv) {
4719 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4720 }
4721 else {
4722 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4723 }
4724 last_pushed_cv = newcv;
4725 }
4726 last_pad = PL_comppad;
4727
2e2e3f36
DM
4728 /* the initial nextstate you would normally execute
4729 * at the start of an eval (which would cause error
4730 * messages to come from the eval), may be optimised
4731 * away from the execution path in the regex code blocks;
4732 * so manually set PL_curcop to it initially */
4733 {
81ed78b2 4734 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4735 assert(o->op_type == OP_NULL);
4736 if (o->op_targ == OP_SCOPE) {
4737 o = cUNOPo->op_first;
4738 }
4739 else {
4740 assert(o->op_targ == OP_LEAVE);
4741 o = cUNOPo->op_first;
4742 assert(o->op_type == OP_ENTER);
4743 o = o->op_sibling;
4744 }
4745
4746 if (o->op_type != OP_STUB) {
4747 assert( o->op_type == OP_NEXTSTATE
4748 || o->op_type == OP_DBSTATE
4749 || (o->op_type == OP_NULL
4750 && ( o->op_targ == OP_NEXTSTATE
4751 || o->op_targ == OP_DBSTATE
4752 )
4753 )
4754 );
4755 PL_curcop = (COP*)o;
4756 }
4757 }
81ed78b2 4758 nop = nop->op_next;
2e2e3f36 4759
24b23f37 4760 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4761 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4762
b93070ed 4763 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4764
2bf803e2
YO
4765 if (sv_yes_mark) {
4766 SV *sv_mrk = get_sv("REGMARK", 1);
4767 sv_setsv(sv_mrk, sv_yes_mark);
4768 }
4769
81ed78b2
DM
4770 /* we don't use MULTICALL here as we want to call the
4771 * first op of the block of interest, rather than the
4772 * first op of the sub */
4773 before = SP;
4774 PL_op = nop;
8e5e9ebe
RGS
4775 CALLRUNOPS(aTHX); /* Scalar context. */
4776 SPAGAIN;
4777 if (SP == before)
075aa684 4778 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4779 else {
4780 ret = POPs;
4781 PUTBACK;
4782 }
4aabdb9b 4783
e4bfbed3
DM
4784 /* before restoring everything, evaluate the returned
4785 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4786 * PL_op or pad. Also need to process any magic vars
4787 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4788
4789 PL_op = NULL;
4790
5e98dac2 4791 re_sv = NULL;
e4bfbed3
DM
4792 if (logical == 0) /* (?{})/ */
4793 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4794 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4795 sw = cBOOL(SvTRUE(ret));
4796 logical = 0;
4797 }
4798 else { /* /(??{}) */
497d0a96
DM
4799 /* if its overloaded, let the regex compiler handle
4800 * it; otherwise extract regex, or stringify */
4801 if (!SvAMAGIC(ret)) {
4802 SV *sv = ret;
4803 if (SvROK(sv))
4804 sv = SvRV(sv);
4805 if (SvTYPE(sv) == SVt_REGEXP)
4806 re_sv = (REGEXP*) sv;
4807 else if (SvSMAGICAL(sv)) {
4808 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4809 if (mg)
4810 re_sv = (REGEXP *) mg->mg_obj;
4811 }
e4bfbed3 4812
497d0a96
DM
4813 /* force any magic, undef warnings here */
4814 if (!re_sv) {
4815 ret = sv_mortalcopy(ret);
4816 (void) SvPV_force_nolen(ret);
4817 }
e4bfbed3
DM
4818 }
4819
4820 }
4821
91332126
FR
4822 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4823
81ed78b2
DM
4824 /* *** Note that at this point we don't restore
4825 * PL_comppad, (or pop the CxSUB) on the assumption it may
4826 * be used again soon. This is safe as long as nothing
4827 * in the regexp code uses the pad ! */
4aabdb9b 4828 PL_op = oop;
4aabdb9b 4829 PL_curcop = ocurcop;
d80618d2 4830 PL_regeol = saved_regeol;
e4bfbed3
DM
4831 S_regcp_restore(aTHX_ rex, runops_cp);
4832
4833 if (logical != 2)
4aabdb9b 4834 break;
8e5e9ebe 4835 }
e4bfbed3
DM
4836
4837 /* only /(??{})/ from now on */
24d3c4a9 4838 logical = 0;
4aabdb9b 4839 {
4f639d21
DM
4840 /* extract RE object from returned value; compiling if
4841 * necessary */
5c35adbb 4842
575c37f6
DM
4843 if (re_sv) {
4844 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4845 }
0f5d15d6 4846 else {
c737faaf 4847 U32 pm_flags = 0;
a3b680e6 4848 const I32 osize = PL_regsize;
0f5d15d6 4849
9753d940
DM
4850 if (SvUTF8(ret) && IN_BYTES) {
4851 /* In use 'bytes': make a copy of the octet
4852 * sequence, but without the flag on */
b9ad30b4
NC
4853 STRLEN len;
4854 const char *const p = SvPV(ret, len);
4855 ret = newSVpvn_flags(p, len, SVs_TEMP);
4856 }
732caac7
DM
4857 if (rex->intflags & PREGf_USE_RE_EVAL)
4858 pm_flags |= PMf_USE_RE_EVAL;
4859
4860 /* if we got here, it should be an engine which
4861 * supports compiling code blocks and stuff */
4862 assert(rex->engine && rex->engine->op_comp);
ec841a27 4863 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4864 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4865 rex->engine, NULL, NULL,
4866 /* copy /msix etc to inner pattern */
4867 scan->flags,
4868 pm_flags);
732caac7 4869
9041c2e3 4870 if (!(SvFLAGS(ret)
faf82a0b 4871 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4872 | SVs_GMG))) {
a2794585
NC
4873 /* This isn't a first class regexp. Instead, it's
4874 caching a regexp onto an existing, Perl visible
4875 scalar. */
575c37f6 4876 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4877 }
0f5d15d6 4878 PL_regsize = osize;
74088413
DM
4879 /* safe to do now that any $1 etc has been
4880 * interpolated into the new pattern string and
4881 * compiled */
4882 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4883 }
575c37f6 4884 re = (struct regexp *)SvANY(re_sv);
4aabdb9b 4885 }
07bc277f 4886 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4887 re->subbeg = rex->subbeg;
4888 re->sublen = rex->sublen;
6502e081
DM
4889 re->suboffset = rex->suboffset;
4890 re->subcoffset = rex->subcoffset;
f8fc2ecf 4891 rei = RXi_GET(re);
6bda09f9 4892 DEBUG_EXECUTE_r(
f2ed9b32 4893 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4894 "Matching embedded");
4895 );
f8fc2ecf 4896 startpoint = rei->program + 1;
1a147d38 4897 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4898
1a147d38 4899 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4900 /* run the pattern returned from (??{...}) */
b93070ed 4901 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4902 REGCP_SET(ST.lastcp);
6bda09f9 4903
0357f1fd
ML
4904 re->lastparen = 0;
4905 re->lastcloseparen = 0;
4906
ae0beba1 4907 PL_regsize = 0;
4aabdb9b
DM
4908
4909 /* XXXX This is too dramatic a measure... */
4910 PL_reg_maxiter = 0;
4911
faec1544 4912 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4913 if (RX_UTF8(re_sv))
faec1544
DM
4914 PL_reg_flags |= RF_utf8;
4915 else
4916 PL_reg_flags &= ~RF_utf8;
4917 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4918
288b8c02 4919 ST.prev_rex = rex_sv;
faec1544 4920 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
4921 rex_sv = re_sv;
4922 SET_reg_curpm(rex_sv);
288b8c02 4923 rex = re;
f8fc2ecf 4924 rexi = rei;
faec1544 4925 cur_curlyx = NULL;
40a82448 4926 ST.B = next;
faec1544
DM
4927 ST.prev_eval = cur_eval;
4928 cur_eval = st;
faec1544 4929 /* now continue from first node in postoned RE */
4d5016e5 4930 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 4931 assert(0); /* NOTREACHED */
c277df42 4932 }
40a82448 4933
faec1544
DM
4934 case EVAL_AB: /* cleanup after a successful (??{A})B */
4935 /* note: this is called twice; first after popping B, then A */
4936 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4937 rex_sv = ST.prev_rex;
4938 SET_reg_curpm(rex_sv);
288b8c02 4939 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4940 rexi = RXi_GET(rex);
faec1544
DM
4941 regcpblow(ST.cp);
4942 cur_eval = ST.prev_eval;
4943 cur_curlyx = ST.prev_curlyx;
34a81e2b 4944
40a82448
DM
4945 /* XXXX This is too dramatic a measure... */
4946 PL_reg_maxiter = 0;
e7707071 4947 if ( nochange_depth )
4b196cd4 4948 nochange_depth--;
262b90c4 4949 sayYES;
40a82448 4950
40a82448 4951
faec1544
DM
4952 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4953 /* note: this is called twice; first after popping B, then A */
4954 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
4955 rex_sv = ST.prev_rex;
4956 SET_reg_curpm(rex_sv);
288b8c02 4957 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 4958 rexi = RXi_GET(rex);
0357f1fd 4959
40a82448
DM
4960 REGCP_UNWIND(ST.lastcp);
4961 regcppop(rex);
faec1544
DM
4962 cur_eval = ST.prev_eval;
4963 cur_curlyx = ST.prev_curlyx;
4964 /* XXXX This is too dramatic a measure... */
4965 PL_reg_maxiter = 0;
e7707071 4966 if ( nochange_depth )
4b196cd4 4967 nochange_depth--;
40a82448 4968 sayNO_SILENT;
40a82448
DM
4969#undef ST
4970
3c0563b9 4971 case OPEN: /* ( */
c277df42 4972 n = ARG(scan); /* which paren pair */
1ca2007e 4973 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
4974 if (n > PL_regsize)
4975 PL_regsize = n;
495f47a5
DM
4976 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4977 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4978 PTR2UV(rex),
4979 PTR2UV(rex->offs),
4980 (UV)n,
4981 (IV)rex->offs[n].start_tmp,
4982 (UV)PL_regsize
4983 ));
e2e6a0f1 4984 lastopen = n;
a0d0e21e 4985 break;
495f47a5
DM
4986
4987/* XXX really need to log other places start/end are set too */
4988#define CLOSE_CAPTURE \
4989 rex->offs[n].start = rex->offs[n].start_tmp; \
4990 rex->offs[n].end = locinput - PL_bostr; \
4991 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4992 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4993 PTR2UV(rex), \
4994 PTR2UV(rex->offs), \
4995 (UV)n, \
4996 (IV)rex->offs[n].start, \
4997 (IV)rex->offs[n].end \
4998 ))
4999
3c0563b9 5000 case CLOSE: /* ) */
c277df42 5001 n = ARG(scan); /* which paren pair */
495f47a5 5002 CLOSE_CAPTURE;
7f69552c
YO
5003 /*if (n > PL_regsize)
5004 PL_regsize = n;*/
b93070ed
DM
5005 if (n > rex->lastparen)
5006 rex->lastparen = n;
5007 rex->lastcloseparen = n;
3b6647e0 5008 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5009 goto fake_end;
5010 }
a0d0e21e 5011 break;
3c0563b9
DM
5012
5013 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5014 if (ARG(scan)){
5015 regnode *cursor;
5016 for (cursor=scan;
5017 cursor && OP(cursor)!=END;
5018 cursor=regnext(cursor))
5019 {
5020 if ( OP(cursor)==CLOSE ){
5021 n = ARG(cursor);
5022 if ( n <= lastopen ) {
495f47a5 5023 CLOSE_CAPTURE;
e2e6a0f1
YO
5024 /*if (n > PL_regsize)
5025 PL_regsize = n;*/
b93070ed
DM
5026 if (n > rex->lastparen)
5027 rex->lastparen = n;
5028 rex->lastcloseparen = n;
3b6647e0
RB
5029 if ( n == ARG(scan) || (cur_eval &&
5030 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5031 break;
5032 }
5033 }
5034 }
5035 }
5036 goto fake_end;
5037 /*NOTREACHED*/
3c0563b9
DM
5038
5039 case GROUPP: /* (?(1)) */
c277df42 5040 n = ARG(scan); /* which paren pair */
b93070ed 5041 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5042 break;
3c0563b9
DM
5043
5044 case NGROUPP: /* (?(<name>)) */
0a4db386 5045 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5046 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5047 break;
3c0563b9
DM
5048
5049 case INSUBP: /* (?(R)) */
0a4db386 5050 n = ARG(scan);
3b6647e0 5051 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5052 break;
3c0563b9
DM
5053
5054 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5055 sw = 0;
5056 break;
3c0563b9
DM
5057
5058 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5059 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5060 if (sw)
c277df42
IZ
5061 next = NEXTOPER(NEXTOPER(scan));
5062 else {
5063 next = scan + ARG(scan);
5064 if (OP(next) == IFTHEN) /* Fake one. */
5065 next = NEXTOPER(NEXTOPER(next));
5066 }
5067 break;
3c0563b9
DM
5068
5069 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5070 logical = scan->flags;
c277df42 5071 break;
c476f425 5072
2ab05381 5073/*******************************************************************
2ab05381 5074
c476f425
DM
5075The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5076pattern, where A and B are subpatterns. (For simple A, CURLYM or
5077STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5078
c476f425 5079A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5080
c476f425
DM
5081On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5082state, which contains the current count, initialised to -1. It also sets
5083cur_curlyx to point to this state, with any previous value saved in the
5084state block.
2ab05381 5085
c476f425
DM
5086CURLYX then jumps straight to the WHILEM op, rather than executing A,
5087since the pattern may possibly match zero times (i.e. it's a while {} loop
5088rather than a do {} while loop).
2ab05381 5089
c476f425
DM
5090Each entry to WHILEM represents a successful match of A. The count in the
5091CURLYX block is incremented, another WHILEM state is pushed, and execution
5092passes to A or B depending on greediness and the current count.
2ab05381 5093
c476f425
DM
5094For example, if matching against the string a1a2a3b (where the aN are
5095substrings that match /A/), then the match progresses as follows: (the
5096pushed states are interspersed with the bits of strings matched so far):
2ab05381 5097
c476f425
DM
5098 <CURLYX cnt=-1>
5099 <CURLYX cnt=0><WHILEM>
5100 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5101 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5102 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5103 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5104
c476f425
DM
5105(Contrast this with something like CURLYM, which maintains only a single
5106backtrack state:
2ab05381 5107
c476f425
DM
5108 <CURLYM cnt=0> a1
5109 a1 <CURLYM cnt=1> a2
5110 a1 a2 <CURLYM cnt=2> a3
5111 a1 a2 a3 <CURLYM cnt=3> b
5112)
2ab05381 5113
c476f425
DM
5114Each WHILEM state block marks a point to backtrack to upon partial failure
5115of A or B, and also contains some minor state data related to that
5116iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5117overall state, such as the count, and pointers to the A and B ops.
2ab05381 5118
c476f425
DM
5119This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5120must always point to the *current* CURLYX block, the rules are:
2ab05381 5121
c476f425
DM
5122When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5123and set cur_curlyx to point the new block.
2ab05381 5124
c476f425
DM
5125When popping the CURLYX block after a successful or unsuccessful match,
5126restore the previous cur_curlyx.
2ab05381 5127
c476f425
DM
5128When WHILEM is about to execute B, save the current cur_curlyx, and set it
5129to the outer one saved in the CURLYX block.
2ab05381 5130
c476f425
DM
5131When popping the WHILEM block after a successful or unsuccessful B match,
5132restore the previous cur_curlyx.
2ab05381 5133
c476f425
DM
5134Here's an example for the pattern (AI* BI)*BO
5135I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5136
c476f425
DM
5137cur_
5138curlyx backtrack stack
5139------ ---------------
5140NULL
5141CO <CO prev=NULL> <WO>
5142CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5143CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5144NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5145
c476f425
DM
5146At this point the pattern succeeds, and we work back down the stack to
5147clean up, restoring as we go:
95b24440 5148
c476f425
DM
5149CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5150CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5151CO <CO prev=NULL> <WO>
5152NULL
a0374537 5153
c476f425
DM
5154*******************************************************************/
5155
5156#define ST st->u.curlyx
5157
5158 case CURLYX: /* start of /A*B/ (for complex A) */
5159 {
5160 /* No need to save/restore up to this paren */
5161 I32 parenfloor = scan->flags;
5162
5163 assert(next); /* keep Coverity happy */
5164 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5165 next += ARG(next);
5166
5167 /* XXXX Probably it is better to teach regpush to support
5168 parenfloor > PL_regsize... */
b93070ed
DM
5169 if (parenfloor > (I32)rex->lastparen)
5170 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5171
5172 ST.prev_curlyx= cur_curlyx;
5173 cur_curlyx = st;
5174 ST.cp = PL_savestack_ix;
5175
5176 /* these fields contain the state of the current curly.
5177 * they are accessed by subsequent WHILEMs */
5178 ST.parenfloor = parenfloor;
d02d6d97 5179 ST.me = scan;
c476f425 5180 ST.B = next;
24d3c4a9
DM
5181 ST.minmod = minmod;
5182 minmod = 0;
c476f425
DM
5183 ST.count = -1; /* this will be updated by WHILEM */
5184 ST.lastloc = NULL; /* this will be updated by WHILEM */
5185
4d5016e5 5186 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5187 assert(0); /* NOTREACHED */
c476f425 5188 }
a0d0e21e 5189
c476f425 5190 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5191 cur_curlyx = ST.prev_curlyx;
5192 sayYES;
118e2215 5193 assert(0); /* NOTREACHED */
a0d0e21e 5194
c476f425
DM
5195 case CURLYX_end_fail: /* just failed to match all of A*B */
5196 regcpblow(ST.cp);
5197 cur_curlyx = ST.prev_curlyx;
5198 sayNO;
118e2215 5199 assert(0); /* NOTREACHED */
4633a7c4 5200
a0d0e21e 5201
c476f425
DM
5202#undef ST
5203#define ST st->u.whilem
5204
5205 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5206 {
5207 /* see the discussion above about CURLYX/WHILEM */
c476f425 5208 I32 n;
d02d6d97
DM
5209 int min = ARG1(cur_curlyx->u.curlyx.me);
5210 int max = ARG2(cur_curlyx->u.curlyx.me);
5211 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5212
c476f425
DM
5213 assert(cur_curlyx); /* keep Coverity happy */
5214 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5215 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5216 ST.cache_offset = 0;
5217 ST.cache_mask = 0;
5218
c476f425
DM
5219
5220 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5221 "%*s whilem: matched %ld out of %d..%d\n",
5222 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5223 );
a0d0e21e 5224
c476f425 5225 /* First just match a string of min A's. */
a0d0e21e 5226
d02d6d97 5227 if (n < min) {
b93070ed 5228 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5229 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5230 REGCP_SET(ST.lastcp);
5231
4d5016e5 5232 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5233 assert(0); /* NOTREACHED */
c476f425
DM
5234 }
5235
5236 /* If degenerate A matches "", assume A done. */
5237
5238 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5239 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5240 "%*s whilem: empty match detected, trying continuation...\n",
5241 REPORT_CODE_OFF+depth*2, "")
5242 );
5243 goto do_whilem_B_max;
5244 }
5245
5246 /* super-linear cache processing */
5247
5248 if (scan->flags) {
a0d0e21e 5249
2c2d71f5 5250 if (!PL_reg_maxiter) {
c476f425
DM
5251 /* start the countdown: Postpone detection until we
5252 * know the match is not *that* much linear. */
2c2d71f5 5253 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5254 /* possible overflow for long strings and many CURLYX's */
5255 if (PL_reg_maxiter < 0)
5256 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5257 PL_reg_leftiter = PL_reg_maxiter;
5258 }
c476f425 5259
2c2d71f5 5260 if (PL_reg_leftiter-- == 0) {
c476f425 5261 /* initialise cache */
3298f257 5262 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5263 if (PL_reg_poscache) {
eb160463 5264 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5265 Renew(PL_reg_poscache, size, char);
5266 PL_reg_poscache_size = size;
5267 }
5268 Zero(PL_reg_poscache, size, char);
5269 }
5270 else {
5271 PL_reg_poscache_size = size;
a02a5408 5272 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5273 }
c476f425
DM
5274 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5275 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5276 PL_colors[4], PL_colors[5])
5277 );
2c2d71f5 5278 }
c476f425 5279
2c2d71f5 5280 if (PL_reg_leftiter < 0) {
c476f425
DM
5281 /* have we already failed at this position? */
5282 I32 offset, mask;
5283 offset = (scan->flags & 0xf) - 1
5284 + (locinput - PL_bostr) * (scan->flags>>4);
5285 mask = 1 << (offset % 8);
5286 offset /= 8;
5287 if (PL_reg_poscache[offset] & mask) {
5288 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5289 "%*s whilem: (cache) already tried at this position...\n",
5290 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5291 );
3298f257 5292 sayNO; /* cache records failure */
2c2d71f5 5293 }
c476f425
DM
5294 ST.cache_offset = offset;
5295 ST.cache_mask = mask;
2c2d71f5 5296 }
c476f425 5297 }
2c2d71f5 5298
c476f425 5299 /* Prefer B over A for minimal matching. */
a687059c 5300
c476f425
DM
5301 if (cur_curlyx->u.curlyx.minmod) {
5302 ST.save_curlyx = cur_curlyx;
5303 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 5304 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425 5305 REGCP_SET(ST.lastcp);
4d5016e5
DM
5306 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5307 locinput);
118e2215 5308 assert(0); /* NOTREACHED */
c476f425 5309 }
a0d0e21e 5310
c476f425
DM
5311 /* Prefer A over B for maximal matching. */
5312
d02d6d97 5313 if (n < max) { /* More greed allowed? */
b93070ed 5314 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
5315 cur_curlyx->u.curlyx.lastloc = locinput;
5316 REGCP_SET(ST.lastcp);
4d5016e5 5317 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5318 assert(0); /* NOTREACHED */
c476f425
DM
5319 }
5320 goto do_whilem_B_max;
5321 }
118e2215 5322 assert(0); /* NOTREACHED */
c476f425
DM
5323
5324 case WHILEM_B_min: /* just matched B in a minimal match */
5325 case WHILEM_B_max: /* just matched B in a maximal match */
5326 cur_curlyx = ST.save_curlyx;
5327 sayYES;
118e2215 5328 assert(0); /* NOTREACHED */
c476f425
DM
5329
5330 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5331 cur_curlyx = ST.save_curlyx;
5332 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5333 cur_curlyx->u.curlyx.count--;
5334 CACHEsayNO;
118e2215 5335 assert(0); /* NOTREACHED */
c476f425
DM
5336
5337 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5338 /* FALL THROUGH */
5339 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
5340 REGCP_UNWIND(ST.lastcp);
5341 regcppop(rex);
c476f425
DM
5342 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5343 cur_curlyx->u.curlyx.count--;
5344 CACHEsayNO;
118e2215 5345 assert(0); /* NOTREACHED */
c476f425
DM
5346
5347 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5348 REGCP_UNWIND(ST.lastcp);
5349 regcppop(rex); /* Restore some previous $<digit>s? */
c476f425
DM
5350 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5351 "%*s whilem: failed, trying continuation...\n",
5352 REPORT_CODE_OFF+depth*2, "")
5353 );
5354 do_whilem_B_max:
5355 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5356 && ckWARN(WARN_REGEXP)
5357 && !(PL_reg_flags & RF_warned))
5358 {
5359 PL_reg_flags |= RF_warned;
dcbac5bb
FC
5360 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5361 "Complex regular subexpression recursion limit (%d) "
5362 "exceeded",
c476f425
DM
5363 REG_INFTY - 1);
5364 }
5365
5366 /* now try B */
5367 ST.save_curlyx = cur_curlyx;
5368 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5369 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5370 locinput);
118e2215 5371 assert(0); /* NOTREACHED */
c476f425
DM
5372
5373 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5374 cur_curlyx = ST.save_curlyx;
5375 REGCP_UNWIND(ST.lastcp);
5376 regcppop(rex);
5377
d02d6d97 5378 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5379 /* Maximum greed exceeded */
5380 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5381 && ckWARN(WARN_REGEXP)
5382 && !(PL_reg_flags & RF_warned))
5383 {
3280af22 5384 PL_reg_flags |= RF_warned;
c476f425 5385 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5386 "Complex regular subexpression recursion "
5387 "limit (%d) exceeded",
c476f425 5388 REG_INFTY - 1);
a0d0e21e 5389 }
c476f425 5390 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5391 CACHEsayNO;
a0d0e21e 5392 }
c476f425
DM
5393
5394 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5395 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5396 );
5397 /* Try grabbing another A and see if it helps. */
c476f425 5398 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5399 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5400 REGCP_SET(ST.lastcp);
d02d6d97 5401 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5402 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5403 locinput);
118e2215 5404 assert(0); /* NOTREACHED */
40a82448
DM
5405
5406#undef ST
5407#define ST st->u.branch
5408
5409 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5410 next = scan + ARG(scan);
5411 if (next == scan)
5412 next = NULL;
40a82448
DM
5413 scan = NEXTOPER(scan);
5414 /* FALL THROUGH */
c277df42 5415
40a82448
DM
5416 case BRANCH: /* /(...|A|...)/ */
5417 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5418 ST.lastparen = rex->lastparen;
f6033a9d 5419 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5420 ST.next_branch = next;
5421 REGCP_SET(ST.cp);
02db2b7b 5422
40a82448 5423 /* Now go into the branch */
5d458dd8 5424 if (has_cutgroup) {
4d5016e5 5425 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5426 } else {
4d5016e5 5427 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5428 }
118e2215 5429 assert(0); /* NOTREACHED */
3c0563b9
DM
5430
5431 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5432 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5433 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5434 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5435 assert(0); /* NOTREACHED */
3c0563b9 5436
5d458dd8
YO
5437 case CUTGROUP_next_fail:
5438 do_cutgroup = 1;
5439 no_final = 1;
5440 if (st->u.mark.mark_name)
5441 sv_commit = st->u.mark.mark_name;
5442 sayNO;
118e2215 5443 assert(0); /* NOTREACHED */
3c0563b9 5444
5d458dd8
YO
5445 case BRANCH_next:
5446 sayYES;
118e2215 5447 assert(0); /* NOTREACHED */
3c0563b9 5448
40a82448 5449 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5450 if (do_cutgroup) {
5451 do_cutgroup = 0;
5452 no_final = 0;
5453 }
40a82448 5454 REGCP_UNWIND(ST.cp);
a8d1f4b4 5455 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5456 scan = ST.next_branch;
5457 /* no more branches? */
5d458dd8
YO
5458 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5459 DEBUG_EXECUTE_r({
5460 PerlIO_printf( Perl_debug_log,
5461 "%*s %sBRANCH failed...%s\n",
5462 REPORT_CODE_OFF+depth*2, "",
5463 PL_colors[4],
5464 PL_colors[5] );
5465 });
5466 sayNO_SILENT;
5467 }
40a82448 5468 continue; /* execute next BRANCH[J] op */
118e2215 5469 assert(0); /* NOTREACHED */
40a82448 5470
3c0563b9 5471 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5472 minmod = 1;
a0d0e21e 5473 break;
40a82448
DM
5474
5475#undef ST
5476#define ST st->u.curlym
5477
5478 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5479
5480 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5481 * only a single backtracking state, no matter how many matches
40a82448
DM
5482 * there are in {m,n}. It relies on the pattern being constant
5483 * length, with no parens to influence future backrefs
5484 */
5485
5486 ST.me = scan;
dc45a647 5487 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5488
f6033a9d
DM
5489 ST.lastparen = rex->lastparen;
5490 ST.lastcloseparen = rex->lastcloseparen;
5491
40a82448
DM
5492 /* if paren positive, emulate an OPEN/CLOSE around A */
5493 if (ST.me->flags) {
3b6647e0 5494 U32 paren = ST.me->flags;
40a82448
DM
5495 if (paren > PL_regsize)
5496 PL_regsize = paren;
c277df42 5497 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5498 }
40a82448
DM
5499 ST.A = scan;
5500 ST.B = next;
5501 ST.alen = 0;
5502 ST.count = 0;
24d3c4a9
DM
5503 ST.minmod = minmod;
5504 minmod = 0;
40a82448
DM
5505 ST.c1 = CHRTEST_UNINIT;
5506 REGCP_SET(ST.cp);
6407bf3b 5507
40a82448
DM
5508 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5509 goto curlym_do_B;
5510
5511 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5512 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5513 assert(0); /* NOTREACHED */
5f80c4cf 5514
40a82448 5515 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5516 ST.count++;
5517 /* after first match, determine A's length: u.curlym.alen */
5518 if (ST.count == 1) {
5519 if (PL_reg_match_utf8) {
c07e9d7b
DM
5520 char *s = st->locinput;
5521 while (s < locinput) {
40a82448
DM
5522 ST.alen++;
5523 s += UTF8SKIP(s);
5524 }
5525 }
5526 else {
c07e9d7b 5527 ST.alen = locinput - st->locinput;
40a82448
DM
5528 }
5529 if (ST.alen == 0)
5530 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5531 }
0cadcf80
DM
5532 DEBUG_EXECUTE_r(
5533 PerlIO_printf(Perl_debug_log,
40a82448 5534 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5535 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5536 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5537 );
5538
0a4db386 5539 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5540 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5541 goto fake_end;
5542
c966426a
DM
5543 {
5544 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5545 if ( max == REG_INFTY || ST.count < max )
5546 goto curlym_do_A; /* try to match another A */
5547 }
40a82448 5548 goto curlym_do_B; /* try to match B */
5f80c4cf 5549
40a82448
DM
5550 case CURLYM_A_fail: /* just failed to match an A */
5551 REGCP_UNWIND(ST.cp);
0a4db386
YO
5552
5553 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5554 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5555 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5556 sayNO;
0cadcf80 5557
40a82448 5558 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5559 if (ST.c1 == CHRTEST_UNINIT) {
5560 /* calculate c1 and c2 for possible match of 1st char
5561 * following curly */
5562 ST.c1 = ST.c2 = CHRTEST_VOID;
5563 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5564 regnode *text_node = ST.B;
5565 if (! HAS_TEXT(text_node))
5566 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5567 /* this used to be
5568
5569 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5570
5571 But the former is redundant in light of the latter.
5572
5573 if this changes back then the macro for
5574 IS_TEXT and friends need to change.
5575 */
c74f6de9
KW
5576 if (PL_regkind[OP(text_node)] == EXACT) {
5577 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
5578 &ST.c1, &ST.c2))
5579 {
5580 sayNO;
5581 }
c277df42 5582 }
c277df42 5583 }
40a82448
DM
5584 }
5585
5586 DEBUG_EXECUTE_r(
5587 PerlIO_printf(Perl_debug_log,
5588 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5589 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5590 "", (IV)ST.count)
5591 );
c74f6de9
KW
5592 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5593 const UV c = (utf8_target)
5594 ? utf8n_to_uvchr((U8*)locinput,
5595 UTF8_MAXBYTES, NULL,
5596 uniflags)
5597 : nextchr;
5598 if (c != (UV) ST.c1 && c != (UV) ST.c2) {
5400f398
KW
5599 /* simulate B failing */
5600 DEBUG_OPTIMISE_r(
5601 PerlIO_printf(Perl_debug_log,
5602 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5603 (int)(REPORT_CODE_OFF+(depth*2)),"",
5604 (IV)ST.c1,(IV)ST.c2
5605 ));
5606 state_num = CURLYM_B_fail;
5607 goto reenter_switch;
5608 }
c74f6de9 5609 }
40a82448
DM
5610
5611 if (ST.me->flags) {
f6033a9d 5612 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5613 I32 paren = ST.me->flags;
5614 if (ST.count) {
b93070ed 5615 rex->offs[paren].start
c07e9d7b
DM
5616 = HOPc(locinput, -ST.alen) - PL_bostr;
5617 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5618 if ((U32)paren > rex->lastparen)
5619 rex->lastparen = paren;
5620 rex->lastcloseparen = paren;
c277df42 5621 }
40a82448 5622 else
b93070ed 5623 rex->offs[paren].end = -1;
0a4db386 5624 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5625 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5626 {
5627 if (ST.count)
5628 goto fake_end;
5629 else
5630 sayNO;
5631 }
c277df42 5632 }
0a4db386 5633
4d5016e5 5634 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5635 assert(0); /* NOTREACHED */
40a82448
DM
5636
5637 case CURLYM_B_fail: /* just failed to match a B */
5638 REGCP_UNWIND(ST.cp);
a8d1f4b4 5639 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5640 if (ST.minmod) {
84d2fa14
HS
5641 I32 max = ARG2(ST.me);
5642 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5643 sayNO;
5644 goto curlym_do_A; /* try to match a further A */
5645 }
5646 /* backtrack one A */
5647 if (ST.count == ARG1(ST.me) /* min */)
5648 sayNO;
5649 ST.count--;
7016d6eb 5650 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5651 goto curlym_do_B; /* try to match B */
5652
c255a977
DM
5653#undef ST
5654#define ST st->u.curly
40a82448 5655
c255a977
DM
5656#define CURLY_SETPAREN(paren, success) \
5657 if (paren) { \
5658 if (success) { \
b93070ed
DM
5659 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5660 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5661 if (paren > rex->lastparen) \
5662 rex->lastparen = paren; \
b93070ed 5663 rex->lastcloseparen = paren; \
c255a977 5664 } \
f6033a9d 5665 else { \
b93070ed 5666 rex->offs[paren].end = -1; \
f6033a9d
DM
5667 rex->lastparen = ST.lastparen; \
5668 rex->lastcloseparen = ST.lastcloseparen; \
5669 } \
c255a977
DM
5670 }
5671
b40a2c17 5672 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5673 ST.paren = 0;
5674 ST.min = 0;
5675 ST.max = REG_INFTY;
a0d0e21e
LW
5676 scan = NEXTOPER(scan);
5677 goto repeat;
3c0563b9 5678
b40a2c17 5679 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5680 ST.paren = 0;
5681 ST.min = 1;
5682 ST.max = REG_INFTY;
c277df42 5683 scan = NEXTOPER(scan);
c255a977 5684 goto repeat;
3c0563b9 5685
b40a2c17 5686 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5687 ST.paren = scan->flags; /* Which paren to set */
5688 ST.lastparen = rex->lastparen;
f6033a9d 5689 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5690 if (ST.paren > PL_regsize)
5691 PL_regsize = ST.paren;
c255a977
DM
5692 ST.min = ARG1(scan); /* min to match */
5693 ST.max = ARG2(scan); /* max to match */
0a4db386 5694 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5695 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5696 ST.min=1;
5697 ST.max=1;
5698 }
c255a977
DM
5699 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5700 goto repeat;
3c0563b9 5701
b40a2c17 5702 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5703 ST.paren = 0;
5704 ST.min = ARG1(scan); /* min to match */
5705 ST.max = ARG2(scan); /* max to match */
5706 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5707 repeat:
a0d0e21e
LW
5708 /*
5709 * Lookahead to avoid useless match attempts
5710 * when we know what character comes next.
c255a977 5711 *
5f80c4cf
JP
5712 * Used to only do .*x and .*?x, but now it allows
5713 * for )'s, ('s and (?{ ... })'s to be in the way
5714 * of the quantifier and the EXACT-like node. -- japhy
5715 */
5716
eb5c1be8 5717 assert(ST.min <= ST.max);
3337dfe3
KW
5718 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5719 ST.c1 = ST.c2 = CHRTEST_VOID;
5720 }
5721 else {
5f80c4cf
JP
5722 regnode *text_node = next;
5723
3dab1dad
YO
5724 if (! HAS_TEXT(text_node))
5725 FIND_NEXT_IMPT(text_node);
5f80c4cf 5726
9e137952 5727 if (! HAS_TEXT(text_node))
c255a977 5728 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5729 else {
ee9b8eae 5730 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5731 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5732 }
c74f6de9 5733 else {
ee9b8eae
YO
5734
5735 /* Currently we only get here when
5736
5737 PL_rekind[OP(text_node)] == EXACT
5738
5739 if this changes back then the macro for IS_TEXT and
5740 friends need to change. */
c74f6de9
KW
5741 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node,
5742 &ST.c1, &ST.c2))
5743 {
5744 sayNO;
5745 }
5746 }
1aa99e6b 5747 }
bbce6d69 5748 }
c255a977
DM
5749
5750 ST.A = scan;
5751 ST.B = next;
24d3c4a9 5752 if (minmod) {
eb72505d 5753 char *li = locinput;
24d3c4a9 5754 minmod = 0;
eb72505d 5755 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
4633a7c4 5756 sayNO;
7016d6eb 5757 SET_locinput(li);
c255a977 5758 ST.count = ST.min;
c255a977
DM
5759 REGCP_SET(ST.cp);
5760 if (ST.c1 == CHRTEST_VOID)
5761 goto curly_try_B_min;
5762
5763 ST.oldloc = locinput;
5764
5765 /* set ST.maxpos to the furthest point along the
5766 * string that could possibly match */
5767 if (ST.max == REG_INFTY) {
5768 ST.maxpos = PL_regeol - 1;
f2ed9b32 5769 if (utf8_target)
c255a977
DM
5770 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5771 ST.maxpos--;
5772 }
f2ed9b32 5773 else if (utf8_target) {
c255a977
DM
5774 int m = ST.max - ST.min;
5775 for (ST.maxpos = locinput;
5776 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5777 ST.maxpos += UTF8SKIP(ST.maxpos);
5778 }
5779 else {
5780 ST.maxpos = locinput + ST.max - ST.min;
5781 if (ST.maxpos >= PL_regeol)
5782 ST.maxpos = PL_regeol - 1;
5783 }
5784 goto curly_try_B_min_known;
5785
5786 }
5787 else {
eb72505d
DM
5788 /* avoid taking address of locinput, so it can remain
5789 * a register var */
5790 char *li = locinput;
5791 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
c255a977
DM
5792 if (ST.count < ST.min)
5793 sayNO;
7016d6eb 5794 SET_locinput(li);
c255a977
DM
5795 if ((ST.count > ST.min)
5796 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5797 {
5798 /* A{m,n} must come at the end of the string, there's
5799 * no point in backing off ... */
5800 ST.min = ST.count;
5801 /* ...except that $ and \Z can match before *and* after
5802 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5803 We may back off by one in this case. */
eb72505d 5804 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
5805 ST.min--;
5806 }
5807 REGCP_SET(ST.cp);
5808 goto curly_try_B_max;
5809 }
118e2215 5810 assert(0); /* NOTREACHED */
c255a977
DM
5811
5812
5813 case CURLY_B_min_known_fail:
5814 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 5815
c255a977 5816 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5817 if (ST.paren) {
5818 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5819 }
c255a977
DM
5820 /* Couldn't or didn't -- move forward. */
5821 ST.oldloc = locinput;
f2ed9b32 5822 if (utf8_target)
c255a977
DM
5823 locinput += UTF8SKIP(locinput);
5824 else
5825 locinput++;
5826 ST.count++;
5827 curly_try_B_min_known:
5828 /* find the next place where 'B' could work, then call B */
5829 {
5830 int n;
f2ed9b32 5831 if (utf8_target) {
c255a977
DM
5832 n = (ST.oldloc == locinput) ? 0 : 1;
5833 if (ST.c1 == ST.c2) {
5834 STRLEN len;
5835 /* set n to utf8_distance(oldloc, locinput) */
5836 while (locinput <= ST.maxpos &&
5837 utf8n_to_uvchr((U8*)locinput,
5838 UTF8_MAXBYTES, &len,
5839 uniflags) != (UV)ST.c1) {
5840 locinput += len;
5841 n++;
5842 }
1aa99e6b
IH
5843 }
5844 else {
c255a977
DM
5845 /* set n to utf8_distance(oldloc, locinput) */
5846 while (locinput <= ST.maxpos) {
5847 STRLEN len;
5848 const UV c = utf8n_to_uvchr((U8*)locinput,
5849 UTF8_MAXBYTES, &len,
5850 uniflags);
5851 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5852 break;
5853 locinput += len;
5854 n++;
1aa99e6b 5855 }
0fe9bf95
IZ
5856 }
5857 }
5400f398 5858 else { /* Not utf8_target */
c255a977
DM
5859 if (ST.c1 == ST.c2) {
5860 while (locinput <= ST.maxpos &&
5861 UCHARAT(locinput) != ST.c1)
5862 locinput++;
bbce6d69 5863 }
c255a977
DM
5864 else {
5865 while (locinput <= ST.maxpos
5866 && UCHARAT(locinput) != ST.c1
5867 && UCHARAT(locinput) != ST.c2)
5868 locinput++;
a0ed51b3 5869 }
c255a977
DM
5870 n = locinput - ST.oldloc;
5871 }
5872 if (locinput > ST.maxpos)
5873 sayNO;
c255a977 5874 if (n) {
eb72505d
DM
5875 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5876 * at b; check that everything between oldloc and
5877 * locinput matches */
5878 char *li = ST.oldloc;
c255a977 5879 ST.count += n;
eb72505d 5880 if (regrepeat(rex, &li, ST.A, n, depth) < n)
4633a7c4 5881 sayNO;
eb72505d 5882 assert(n == REG_INFTY || locinput == li);
a0d0e21e 5883 }
c255a977 5884 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5885 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5886 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5887 goto fake_end;
5888 }
4d5016e5 5889 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 5890 }
118e2215 5891 assert(0); /* NOTREACHED */
c255a977
DM
5892
5893
5894 case CURLY_B_min_fail:
5895 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5896
5897 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5898 if (ST.paren) {
5899 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5900 }
c255a977 5901 /* failed -- move forward one */
f73aaa43 5902 {
eb72505d
DM
5903 char *li = locinput;
5904 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
f73aaa43
DM
5905 sayNO;
5906 }
eb72505d 5907 locinput = li;
f73aaa43
DM
5908 }
5909 {
c255a977 5910 ST.count++;
c255a977
DM
5911 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5912 ST.count > 0)) /* count overflow ? */
15272685 5913 {
c255a977
DM
5914 curly_try_B_min:
5915 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5916 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5917 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5918 goto fake_end;
5919 }
4d5016e5 5920 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
5921 }
5922 }
c74f6de9 5923 sayNO;
118e2215 5924 assert(0); /* NOTREACHED */
c255a977
DM
5925
5926
5927 curly_try_B_max:
5928 /* a successful greedy match: now try to match B */
40d049e4 5929 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5930 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
5931 goto fake_end;
5932 }
c255a977
DM
5933 {
5934 UV c = 0;
7016d6eb 5935 if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol)
eb72505d 5936 c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
c255a977 5937 UTF8_MAXBYTES, 0, uniflags)
eb72505d 5938 : (UV) UCHARAT(locinput);
c255a977 5939 /* If it could work, try it. */
5400f398 5940 if (ST.c1 == CHRTEST_VOID
7016d6eb
DM
5941 || (locinput < PL_regeol &&
5942 (c == (UV)ST.c1 || c == (UV)ST.c2)))
5943 {
c255a977 5944 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 5945 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 5946 assert(0); /* NOTREACHED */
c255a977
DM
5947 }
5948 }
5949 /* FALL THROUGH */
3c0563b9 5950
c255a977
DM
5951 case CURLY_B_max_fail:
5952 /* failed to find B in a greedy match */
c255a977
DM
5953
5954 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5955 if (ST.paren) {
5956 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5957 }
c255a977
DM
5958 /* back up. */
5959 if (--ST.count < ST.min)
5960 sayNO;
eb72505d 5961 locinput = HOPc(locinput, -1);
c255a977
DM
5962 goto curly_try_B_max;
5963
5964#undef ST
5965
3c0563b9 5966 case END: /* last op of main pattern */
6bda09f9 5967 fake_end:
faec1544
DM
5968 if (cur_eval) {
5969 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
5970 st->u.eval.toggle_reg_flags
5971 = cur_eval->u.eval.toggle_reg_flags;
5972 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5973
288b8c02 5974 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 5975 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
5976 rex_sv = cur_eval->u.eval.prev_rex;
5977 SET_reg_curpm(rex_sv);
288b8c02 5978 rex = (struct regexp *)SvANY(rex_sv);
f8fc2ecf 5979 rexi = RXi_GET(rex);
faec1544 5980 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 5981
faec1544 5982 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
5983
5984 /* Restore parens of the outer rex without popping the
5985 * savestack */
74088413 5986 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
5987
5988 st->u.eval.prev_eval = cur_eval;
5989 cur_eval = cur_eval->u.eval.prev_eval;
5990 DEBUG_EXECUTE_r(
2a49f0f5
JH
5991 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5992 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
5993 if ( nochange_depth )
5994 nochange_depth--;
5995
4d5016e5
DM
5996 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5997 locinput); /* match B */
faec1544
DM
5998 }
5999
3b0527fe 6000 if (locinput < reginfo->till) {
a3621e74 6001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6002 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6003 PL_colors[4],
6004 (long)(locinput - PL_reg_starttry),
3b0527fe 6005 (long)(reginfo->till - PL_reg_starttry),
7821416a 6006 PL_colors[5]));
58e23c8d 6007
262b90c4 6008 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6009 }
262b90c4 6010 sayYES; /* Success! */
dad79028
DM
6011
6012 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6013 DEBUG_EXECUTE_r(
6014 PerlIO_printf(Perl_debug_log,
6015 "%*s %ssubpattern success...%s\n",
5bc10b2c 6016 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6017 sayYES; /* Success! */
dad79028 6018
40a82448
DM
6019#undef ST
6020#define ST st->u.ifmatch
6021
37f53970
DM
6022 {
6023 char *newstart;
6024
40a82448
DM
6025 case SUSPEND: /* (?>A) */
6026 ST.wanted = 1;
37f53970 6027 newstart = locinput;
9041c2e3 6028 goto do_ifmatch;
dad79028 6029
40a82448
DM
6030 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6031 ST.wanted = 0;
dad79028
DM
6032 goto ifmatch_trivial_fail_test;
6033
40a82448
DM
6034 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6035 ST.wanted = 1;
dad79028 6036 ifmatch_trivial_fail_test:
a0ed51b3 6037 if (scan->flags) {
52657f30 6038 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6039 if (!s) {
6040 /* trivial fail */
24d3c4a9
DM
6041 if (logical) {
6042 logical = 0;
f2338a2e 6043 sw = 1 - cBOOL(ST.wanted);
dad79028 6044 }
40a82448 6045 else if (ST.wanted)
dad79028
DM
6046 sayNO;
6047 next = scan + ARG(scan);
6048 if (next == scan)
6049 next = NULL;
6050 break;
6051 }
37f53970 6052 newstart = s;
a0ed51b3
LW
6053 }
6054 else
37f53970 6055 newstart = locinput;
a0ed51b3 6056
c277df42 6057 do_ifmatch:
40a82448 6058 ST.me = scan;
24d3c4a9 6059 ST.logical = logical;
24d786f4
YO
6060 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6061
40a82448 6062 /* execute body of (?...A) */
37f53970 6063 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6064 assert(0); /* NOTREACHED */
37f53970 6065 }
40a82448
DM
6066
6067 case IFMATCH_A_fail: /* body of (?...A) failed */
6068 ST.wanted = !ST.wanted;
6069 /* FALL THROUGH */
6070
6071 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6072 if (ST.logical) {
f2338a2e 6073 sw = cBOOL(ST.wanted);
40a82448
DM
6074 }
6075 else if (!ST.wanted)
6076 sayNO;
6077
37f53970
DM
6078 if (OP(ST.me) != SUSPEND) {
6079 /* restore old position except for (?>...) */
6080 locinput = st->locinput;
40a82448
DM
6081 }
6082 scan = ST.me + ARG(ST.me);
6083 if (scan == ST.me)
6084 scan = NULL;
6085 continue; /* execute B */
6086
6087#undef ST
dad79028 6088
3c0563b9
DM
6089 case LONGJMP: /* alternative with many branches compiles to
6090 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6091 next = scan + ARG(scan);
6092 if (next == scan)
6093 next = NULL;
a0d0e21e 6094 break;
3c0563b9
DM
6095
6096 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6097 reginfo->cutpoint = PL_regeol;
6098 /* FALLTHROUGH */
3c0563b9
DM
6099
6100 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6101 if (!scan->flags)
ad64d0ec 6102 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6103 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6104 assert(0); /* NOTREACHED */
3c0563b9 6105
54612592
YO
6106 case COMMIT_next_fail:
6107 no_final = 1;
6108 /* FALLTHROUGH */
3c0563b9
DM
6109
6110 case OPFAIL: /* (*FAIL) */
7f69552c 6111 sayNO;
118e2215 6112 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6113
6114#define ST st->u.mark
3c0563b9 6115 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6116 ST.prev_mark = mark_state;
5d458dd8 6117 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6118 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6119 mark_state = st;
4d5016e5
DM
6120 ST.mark_loc = locinput;
6121 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6122 assert(0); /* NOTREACHED */
3c0563b9 6123
e2e6a0f1
YO
6124 case MARKPOINT_next:
6125 mark_state = ST.prev_mark;
6126 sayYES;
118e2215 6127 assert(0); /* NOTREACHED */
3c0563b9 6128
e2e6a0f1 6129 case MARKPOINT_next_fail:
5d458dd8 6130 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6131 {
6132 if (ST.mark_loc > startpoint)
6133 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6134 popmark = NULL; /* we found our mark */
6135 sv_commit = ST.mark_name;
6136
6137 DEBUG_EXECUTE_r({
5d458dd8 6138 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6139 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6140 REPORT_CODE_OFF+depth*2, "",
be2597df 6141 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6142 });
6143 }
6144 mark_state = ST.prev_mark;
5d458dd8
YO
6145 sv_yes_mark = mark_state ?
6146 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6147 sayNO;
118e2215 6148 assert(0); /* NOTREACHED */
3c0563b9
DM
6149
6150 case SKIP: /* (*SKIP) */
5d458dd8 6151 if (scan->flags) {
2bf803e2 6152 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6153 ST.mark_name = NULL;
e2e6a0f1 6154 ST.mark_loc = locinput;
4d5016e5 6155 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6156 } else {
2bf803e2 6157 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6158 otherwise do nothing. Meaning we need to scan
6159 */
6160 regmatch_state *cur = mark_state;
ad64d0ec 6161 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6162
6163 while (cur) {
6164 if ( sv_eq( cur->u.mark.mark_name,
6165 find ) )
6166 {
6167 ST.mark_name = find;
4d5016e5 6168 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6169 }
6170 cur = cur->u.mark.prev_mark;
6171 }
e2e6a0f1 6172 }
2bf803e2 6173 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6174 break;
3c0563b9 6175
5d458dd8
YO
6176 case SKIP_next_fail:
6177 if (ST.mark_name) {
6178 /* (*CUT:NAME) - Set up to search for the name as we
6179 collapse the stack*/
6180 popmark = ST.mark_name;
6181 } else {
6182 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6183 if (ST.mark_loc > startpoint)
6184 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6185 /* but we set sv_commit to latest mark_name if there
6186 is one so they can test to see how things lead to this
6187 cut */
6188 if (mark_state)
6189 sv_commit=mark_state->u.mark.mark_name;
6190 }
e2e6a0f1
YO
6191 no_final = 1;
6192 sayNO;
118e2215 6193 assert(0); /* NOTREACHED */
e2e6a0f1 6194#undef ST
3c0563b9
DM
6195
6196 case LNBREAK: /* \R */
7016d6eb 6197 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6198 locinput += n;
e1d1eefb
YO
6199 } else
6200 sayNO;
6201 break;
6202
6203#define CASE_CLASS(nAmE) \
6204 case nAmE: \
7016d6eb 6205 if (NEXTCHR_IS_EOS) \
1380b51d 6206 sayNO; \
f2ed9b32 6207 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb 6208 locinput += n; \
e1d1eefb
YO
6209 } else \
6210 sayNO; \
6211 break; \
6212 case N##nAmE: \
7016d6eb 6213 if (NEXTCHR_IS_EOS) \
1380b51d 6214 sayNO; \
f2ed9b32 6215 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
6216 sayNO; \
6217 } else { \
6218 locinput += UTF8SKIP(locinput); \
e1d1eefb
YO
6219 } \
6220 break
6221
3c0563b9
DM
6222 CASE_CLASS(VERTWS); /* \v \V */
6223 CASE_CLASS(HORIZWS); /* \h \H */
e1d1eefb
YO
6224#undef CASE_CLASS
6225
a0d0e21e 6226 default:
b900a521 6227 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6228 PTR2UV(scan), OP(scan));
cea2e8a9 6229 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6230
6231 /* this is a point to jump to in order to increment
6232 * locinput by one character */
6233 increment_locinput:
6234 if (utf8_target) {
6235 locinput += PL_utf8skip[nextchr];
7016d6eb 6236 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6237 if (locinput > PL_regeol)
6238 sayNO;
28b98f76
DM
6239 }
6240 else
3640db6b 6241 locinput++;
28b98f76 6242 break;
5d458dd8
YO
6243
6244 } /* end switch */
95b24440 6245
5d458dd8
YO
6246 /* switch break jumps here */
6247 scan = next; /* prepare to execute the next op and ... */
6248 continue; /* ... jump back to the top, reusing st */
118e2215 6249 assert(0); /* NOTREACHED */
95b24440 6250
40a82448
DM
6251 push_yes_state:
6252 /* push a state that backtracks on success */
6253 st->u.yes.prev_yes_state = yes_state;
6254 yes_state = st;
6255 /* FALL THROUGH */
6256 push_state:
6257 /* push a new regex state, then continue at scan */
6258 {
6259 regmatch_state *newst;
6260
24b23f37
YO
6261 DEBUG_STACK_r({
6262 regmatch_state *cur = st;
6263 regmatch_state *curyes = yes_state;
6264 int curd = depth;
6265 regmatch_slab *slab = PL_regmatch_slab;
6266 for (;curd > -1;cur--,curd--) {
6267 if (cur < SLAB_FIRST(slab)) {
6268 slab = slab->prev;
6269 cur = SLAB_LAST(slab);
6270 }
6271 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6272 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6273 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6274 (curyes == cur) ? "yes" : ""
6275 );
6276 if (curyes == cur)
6277 curyes = cur->u.yes.prev_yes_state;
6278 }
6279 } else
6280 DEBUG_STATE_pp("push")
6281 );
40a82448 6282 depth++;
40a82448
DM
6283 st->locinput = locinput;
6284 newst = st+1;
6285 if (newst > SLAB_LAST(PL_regmatch_slab))
6286 newst = S_push_slab(aTHX);
6287 PL_regmatch_state = newst;
786e8c11 6288
4d5016e5 6289 locinput = pushinput;
40a82448
DM
6290 st = newst;
6291 continue;
118e2215 6292 assert(0); /* NOTREACHED */
40a82448 6293 }
a0d0e21e 6294 }
a687059c 6295
a0d0e21e
LW
6296 /*
6297 * We get here only if there's trouble -- normally "case END" is
6298 * the terminating point.
6299 */
cea2e8a9 6300 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6301 /*NOTREACHED*/
4633a7c4
LW
6302 sayNO;
6303
262b90c4 6304yes:
77cb431f
DM
6305 if (yes_state) {
6306 /* we have successfully completed a subexpression, but we must now
6307 * pop to the state marked by yes_state and continue from there */
77cb431f 6308 assert(st != yes_state);
5bc10b2c
DM
6309#ifdef DEBUGGING
6310 while (st != yes_state) {
6311 st--;
6312 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6313 PL_regmatch_slab = PL_regmatch_slab->prev;
6314 st = SLAB_LAST(PL_regmatch_slab);
6315 }
e2e6a0f1 6316 DEBUG_STATE_r({
54612592
YO
6317 if (no_final) {
6318 DEBUG_STATE_pp("pop (no final)");
6319 } else {
6320 DEBUG_STATE_pp("pop (yes)");
6321 }
e2e6a0f1 6322 });
5bc10b2c
DM
6323 depth--;
6324 }
6325#else
77cb431f
DM
6326 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6327 || yes_state > SLAB_LAST(PL_regmatch_slab))
6328 {
6329 /* not in this slab, pop slab */
6330 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6331 PL_regmatch_slab = PL_regmatch_slab->prev;
6332 st = SLAB_LAST(PL_regmatch_slab);
6333 }
6334 depth -= (st - yes_state);
5bc10b2c 6335#endif
77cb431f
DM
6336 st = yes_state;
6337 yes_state = st->u.yes.prev_yes_state;
6338 PL_regmatch_state = st;
24b23f37 6339
3640db6b 6340 if (no_final)
5d458dd8 6341 locinput= st->locinput;
54612592 6342 state_num = st->resume_state + no_final;
24d3c4a9 6343 goto reenter_switch;
77cb431f
DM
6344 }
6345
a3621e74 6346 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6347 PL_colors[4], PL_colors[5]));
02db2b7b 6348
ed301438 6349 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6350 /* each successfully executed (?{...}) block does the equivalent of
6351 * local $^R = do {...}
6352 * When popping the save stack, all these locals would be undone;
6353 * bypass this by setting the outermost saved $^R to the latest
6354 * value */
6355 if (oreplsv != GvSV(PL_replgv))
6356 sv_setsv(oreplsv, GvSV(PL_replgv));
6357 }
95b24440 6358 result = 1;
aa283a38 6359 goto final_exit;
4633a7c4
LW
6360
6361no:
a3621e74 6362 DEBUG_EXECUTE_r(
7821416a 6363 PerlIO_printf(Perl_debug_log,
786e8c11 6364 "%*s %sfailed...%s\n",
5bc10b2c 6365 REPORT_CODE_OFF+depth*2, "",
786e8c11 6366 PL_colors[4], PL_colors[5])
7821416a 6367 );
aa283a38 6368
262b90c4 6369no_silent:
54612592
YO
6370 if (no_final) {
6371 if (yes_state) {
6372 goto yes;
6373 } else {
6374 goto final_exit;
6375 }
6376 }
aa283a38
DM
6377 if (depth) {
6378 /* there's a previous state to backtrack to */
40a82448
DM
6379 st--;
6380 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6381 PL_regmatch_slab = PL_regmatch_slab->prev;
6382 st = SLAB_LAST(PL_regmatch_slab);
6383 }
6384 PL_regmatch_state = st;
40a82448 6385 locinput= st->locinput;
40a82448 6386
5bc10b2c
DM
6387 DEBUG_STATE_pp("pop");
6388 depth--;
262b90c4
DM
6389 if (yes_state == st)
6390 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6391
24d3c4a9
DM
6392 state_num = st->resume_state + 1; /* failure = success + 1 */
6393 goto reenter_switch;
95b24440 6394 }
24d3c4a9 6395 result = 0;
aa283a38 6396
262b90c4 6397 final_exit:
bbe252da 6398 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6399 SV *sv_err = get_sv("REGERROR", 1);
6400 SV *sv_mrk = get_sv("REGMARK", 1);
6401 if (result) {
e2e6a0f1 6402 sv_commit = &PL_sv_no;
5d458dd8
YO
6403 if (!sv_yes_mark)
6404 sv_yes_mark = &PL_sv_yes;
6405 } else {
6406 if (!sv_commit)
6407 sv_commit = &PL_sv_yes;
6408 sv_yes_mark = &PL_sv_no;
6409 }
6410 sv_setsv(sv_err, sv_commit);
6411 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6412 }
19b95bf0 6413
81ed78b2
DM
6414
6415 if (last_pushed_cv) {
6416 dSP;
6417 POP_MULTICALL;
4f8dbb2d 6418 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6419 }
6420
2f554ef7
DM
6421 /* clean up; in particular, free all slabs above current one */
6422 LEAVE_SCOPE(oldsave);
5d9a96ca 6423
730f4c74
DM
6424 assert(!result || locinput - PL_bostr >= 0);
6425 return result ? locinput - PL_bostr : -1;
a687059c
LW
6426}
6427
6428/*
6429 - regrepeat - repeatedly match something simple, report how many
d60de1d1
DM
6430 *
6431 * startposp - pointer a pointer to the start position. This is updated
6432 * to point to the byte following the highest successful
6433 * match.
6434 * p - the regnode to be repeatedly matched against.
6435 * max - maximum number of characters to match.
6436 * depth - (for debugging) backtracking depth.
a687059c 6437 */
76e3520e 6438STATIC I32
f73aaa43 6439S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
a687059c 6440{
27da23d5 6441 dVAR;
eb578fdb
KW
6442 char *scan;
6443 I32 c;
6444 char *loceol = PL_regeol;
6445 I32 hardcount = 0;
6446 bool utf8_target = PL_reg_match_utf8;
d513472c 6447 UV utf8_flags;
4f55667c
SP
6448#ifndef DEBUGGING
6449 PERL_UNUSED_ARG(depth);
6450#endif
a0d0e21e 6451
7918f24d
NC
6452 PERL_ARGS_ASSERT_REGREPEAT;
6453
f73aaa43 6454 scan = *startposp;
faf11cac
HS
6455 if (max == REG_INFTY)
6456 max = I32_MAX;
6457 else if (max < loceol - scan)
7f596f4c 6458 loceol = scan + max;
a0d0e21e 6459 switch (OP(p)) {
22c35a8c 6460 case REG_ANY:
f2ed9b32 6461 if (utf8_target) {
ffc61ed2 6462 loceol = PL_regeol;
1aa99e6b 6463 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6464 scan += UTF8SKIP(scan);
6465 hardcount++;
6466 }
6467 } else {
6468 while (scan < loceol && *scan != '\n')
6469 scan++;
a0ed51b3
LW
6470 }
6471 break;
ffc61ed2 6472 case SANY:
f2ed9b32 6473 if (utf8_target) {
def8e4ea 6474 loceol = PL_regeol;
a0804c9e 6475 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6476 scan += UTF8SKIP(scan);
6477 hardcount++;
6478 }
6479 }
6480 else
6481 scan = loceol;
a0ed51b3 6482 break;
f33976b4
DB
6483 case CANY:
6484 scan = loceol;
6485 break;
59d32103 6486 case EXACT:
59d32103 6487 c = (U8)*STRING(p);
59d32103 6488
5e4a1da1
KW
6489 /* Can use a simple loop if the pattern char to match on is invariant
6490 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6491 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6492 * true iff it doesn't matter if the argument is in UTF-8 or not */
6493 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
59d32103
KW
6494 while (scan < loceol && UCHARAT(scan) == c) {
6495 scan++;
6496 }
6497 }
b40a2c17 6498 else if (UTF_PATTERN) {
5e4a1da1
KW
6499 if (utf8_target) {
6500 STRLEN scan_char_len;
6501 loceol = PL_regeol;
6502
6503 /* When both target and pattern are UTF-8, we have to do s
6504 * string EQ */
6505 while (hardcount < max
6506 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6507 && scan_char_len <= STR_LEN(p)
6508 && memEQ(scan, STRING(p), scan_char_len))
6509 {
6510 scan += scan_char_len;
6511 hardcount++;
6512 }
6513 }
6514 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6515
5e4a1da1
KW
6516 /* Target isn't utf8; convert the character in the UTF-8
6517 * pattern to non-UTF8, and do a simple loop */
6518 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6519 while (scan < loceol && UCHARAT(scan) == c) {
6520 scan++;
6521 }
6522 } /* else pattern char is above Latin1, can't possibly match the
6523 non-UTF-8 target */
b40a2c17 6524 }
5e4a1da1 6525 else {
59d32103 6526
5e4a1da1
KW
6527 /* Here, the string must be utf8; pattern isn't, and <c> is
6528 * different in utf8 than not, so can't compare them directly.
6529 * Outside the loop, find the two utf8 bytes that represent c, and
6530 * then look for those in sequence in the utf8 string */
59d32103
KW
6531 U8 high = UTF8_TWO_BYTE_HI(c);
6532 U8 low = UTF8_TWO_BYTE_LO(c);
6533 loceol = PL_regeol;
6534
6535 while (hardcount < max
6536 && scan + 1 < loceol
6537 && UCHARAT(scan) == high
6538 && UCHARAT(scan + 1) == low)
6539 {
6540 scan += 2;
6541 hardcount++;
6542 }
6543 }
6544 break;
5e4a1da1 6545
2f7f8cb1
KW
6546 case EXACTFA:
6547 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6548 goto do_exactf;
6549
d4e0b827
KW
6550 case EXACTFL:
6551 PL_reg_flags |= RF_tainted;
17580e7a
KW
6552 utf8_flags = FOLDEQ_UTF8_LOCALE;
6553 goto do_exactf;
6554
d4e0b827 6555 case EXACTF:
62bf7766
KW
6556 utf8_flags = 0;
6557 goto do_exactf;
6558
3c760661 6559 case EXACTFU_SS:
fab2782b 6560 case EXACTFU_TRICKYFOLD:
9a5a5549 6561 case EXACTFU:
05f861a2 6562 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6563
2f7f8cb1 6564 do_exactf:
090f7165 6565 c = (U8)*STRING(p);
d4e0b827 6566
b40a2c17
KW
6567 if (utf8_target
6568 || OP(p) == EXACTFU_SS
6569 || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
6570 {
6571 /* Use full Unicode fold matching */
59d32103 6572 char *tmpeol = loceol;
b40a2c17 6573 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
59d32103 6574 while (hardcount < max
d513472c 6575 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
b40a2c17 6576 STRING(p), NULL, pat_len, cBOOL(UTF_PATTERN), utf8_flags))
59d32103
KW
6577 {
6578 scan = tmpeol;
6579 tmpeol = loceol;
6580 hardcount++;
6581 }
634c83a2 6582
59d32103
KW
6583 /* XXX Note that the above handles properly the German sharp s in
6584 * the pattern matching ss in the string. But it doesn't handle
6585 * properly cases where the string contains say 'LIGATURE ff' and
6586 * the pattern is 'f+'. This would require, say, a new function or
6587 * revised interface to foldEQ_utf8(), in which the maximum number
6588 * of characters to match could be passed and it would return how
6589 * many actually did. This is just one of many cases where
6590 * multi-char folds don't work properly, and so the fix is being
6591 * deferred */
6592 }
6593 else {
87381386 6594 U8 folded;
59d32103 6595
5400f398
KW
6596 /* Here, the string isn't utf8; and either the pattern isn't utf8
6597 * or c is an invariant, so its utf8ness doesn't affect c. Can
6598 * just do simple comparisons for exact or fold matching. */
d4e0b827 6599 switch (OP(p)) {
87381386 6600 case EXACTF: folded = PL_fold[c]; break;
2f7f8cb1 6601 case EXACTFA:
fab2782b 6602 case EXACTFU_TRICKYFOLD:
9a5a5549 6603 case EXACTFU: folded = PL_fold_latin1[c]; break;
87381386
KW
6604 case EXACTFL: folded = PL_fold_locale[c]; break;
6605 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6606 }
6607 while (scan < loceol &&
6608 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6609 {
6610 scan++;
634c83a2
KW
6611 }
6612 }
bbce6d69 6613 break;
a0d0e21e 6614 case ANYOF:
e0193e47 6615 if (utf8_target) {
4e8910e0 6616 STRLEN inclasslen;
ffc61ed2 6617 loceol = PL_regeol;
4e8910e0
KW
6618 inclasslen = loceol - scan;
6619 while (hardcount < max
6620 && ((inclasslen = loceol - scan) > 0)
6621 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6622 {
6623 scan += inclasslen;
ffc61ed2
JH
6624 hardcount++;
6625 }
6626 } else {
32fc9b6a 6627 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6628 scan++;
6629 }
a0d0e21e 6630 break;
980866de 6631 case ALNUMU:
f2ed9b32 6632 if (utf8_target) {
980866de 6633 utf8_wordchar:
ffc61ed2 6634 loceol = PL_regeol;
1a4fad37 6635 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6636 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6637 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6638 {
ffc61ed2
JH
6639 scan += UTF8SKIP(scan);
6640 hardcount++;
6641 }
980866de 6642 } else {
a12cf05f
KW
6643 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6644 scan++;
6645 }
980866de
KW
6646 }
6647 break;
6648 case ALNUM:
6649 if (utf8_target)
6650 goto utf8_wordchar;
6651 while (scan < loceol && isALNUM((U8) *scan)) {
6652 scan++;
a0ed51b3
LW
6653 }
6654 break;
cfaf538b
KW
6655 case ALNUMA:
6656 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6657 scan++;
6658 }
6659 break;
bbce6d69 6660 case ALNUML:
3280af22 6661 PL_reg_flags |= RF_tainted;
f2ed9b32 6662 if (utf8_target) {
ffc61ed2 6663 loceol = PL_regeol;
1aa99e6b
IH
6664 while (hardcount < max && scan < loceol &&
6665 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6666 scan += UTF8SKIP(scan);
6667 hardcount++;
6668 }
6669 } else {
6670 while (scan < loceol && isALNUM_LC(*scan))
6671 scan++;
a0ed51b3
LW
6672 }
6673 break;
980866de 6674 case NALNUMU:
f2ed9b32 6675 if (utf8_target) {
980866de
KW
6676
6677 utf8_Nwordchar:
6678
ffc61ed2 6679 loceol = PL_regeol;
1a4fad37 6680 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6681 while (hardcount < max && scan < loceol &&
980866de 6682 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6683 {
ffc61ed2
JH
6684 scan += UTF8SKIP(scan);
6685 hardcount++;
6686 }
980866de 6687 } else {
a12cf05f
KW
6688 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6689 scan++;
6690 }
980866de
KW
6691 }
6692 break;
6693 case NALNUM:
6694 if (utf8_target)
6695 goto utf8_Nwordchar;
6696 while (scan < loceol && ! isALNUM((U8) *scan)) {
6697 scan++;
a0ed51b3
LW
6698 }
6699 break;
0658cdde
KW
6700
6701 case POSIXA:
6702 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6703 scan++;
6704 }
6705 break;
6706 case NPOSIXA:
6707 if (utf8_target) {
6708 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6709 scan += UTF8SKIP(scan);
6710 }
6711 }
6712 else {
6713 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6714 scan++;
6715 }
6716 }
6717 break;
cfaf538b
KW
6718 case NALNUMA:
6719 if (utf8_target) {
6720 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6721 scan += UTF8SKIP(scan);
6722 }
6723 }
6724 else {
6725 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6726 scan++;
6727 }
6728 }
6729 break;
bbce6d69 6730 case NALNUML:
3280af22 6731 PL_reg_flags |= RF_tainted;
f2ed9b32 6732 if (utf8_target) {
ffc61ed2 6733 loceol = PL_regeol;
1aa99e6b
IH
6734 while (hardcount < max && scan < loceol &&
6735 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6736 scan += UTF8SKIP(scan);
6737 hardcount++;
6738 }
6739 } else {
6740 while (scan < loceol && !isALNUM_LC(*scan))
6741 scan++;
a0ed51b3
LW
6742 }
6743 break;
980866de 6744 case SPACEU:
f2ed9b32 6745 if (utf8_target) {
980866de
KW
6746
6747 utf8_space:
6748
ffc61ed2 6749 loceol = PL_regeol;
1a4fad37 6750 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6751 while (hardcount < max && scan < loceol &&
3568d838 6752 (*scan == ' ' ||
a12cf05f
KW
6753 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6754 {
ffc61ed2
JH
6755 scan += UTF8SKIP(scan);
6756 hardcount++;
6757 }
980866de
KW
6758 break;
6759 }
6760 else {
a12cf05f
KW
6761 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6762 scan++;
6763 }
980866de
KW
6764 break;
6765 }
6766 case SPACE:
6767 if (utf8_target)
6768 goto utf8_space;
6769
6770 while (scan < loceol && isSPACE((U8) *scan)) {
6771 scan++;
a0ed51b3
LW
6772 }
6773 break;
cfaf538b
KW
6774 case SPACEA:
6775 while (scan < loceol && isSPACE_A((U8) *scan)) {
6776 scan++;
6777 }
6778 break;
bbce6d69 6779 case SPACEL:
3280af22 6780 PL_reg_flags |= RF_tainted;
f2ed9b32 6781 if (utf8_target) {
ffc61ed2 6782 loceol = PL_regeol;
1aa99e6b 6783 while (hardcount < max && scan < loceol &&
6bbba904 6784 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6785 scan += UTF8SKIP(scan);
6786 hardcount++;
6787 }
6788 } else {
6789 while (scan < loceol && isSPACE_LC(*scan))
6790 scan++;
a0ed51b3
LW
6791 }
6792 break;
980866de 6793 case NSPACEU:
f2ed9b32 6794 if (utf8_target) {
980866de
KW
6795
6796 utf8_Nspace:
6797
ffc61ed2 6798 loceol = PL_regeol;
1a4fad37 6799 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6800 while (hardcount < max && scan < loceol &&
980866de
KW
6801 ! (*scan == ' ' ||
6802 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6803 {
ffc61ed2
JH
6804 scan += UTF8SKIP(scan);
6805 hardcount++;
6806 }
980866de
KW
6807 break;
6808 }
6809 else {
a12cf05f
KW
6810 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6811 scan++;
6812 }
980866de
KW
6813 }
6814 break;
6815 case NSPACE:
6816 if (utf8_target)
6817 goto utf8_Nspace;
6818
6819 while (scan < loceol && ! isSPACE((U8) *scan)) {
6820 scan++;
a0ed51b3 6821 }
0008a298 6822 break;
cfaf538b
KW
6823 case NSPACEA:
6824 if (utf8_target) {
6825 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6826 scan += UTF8SKIP(scan);
6827 }
6828 }
6829 else {
6830 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6831 scan++;
6832 }
6833 }
6834 break;
bbce6d69 6835 case NSPACEL:
3280af22 6836 PL_reg_flags |= RF_tainted;
f2ed9b32 6837 if (utf8_target) {
ffc61ed2 6838 loceol = PL_regeol;
1aa99e6b 6839 while (hardcount < max && scan < loceol &&
6bbba904 6840 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6841 scan += UTF8SKIP(scan);
6842 hardcount++;
6843 }
6844 } else {
6845 while (scan < loceol && !isSPACE_LC(*scan))
6846 scan++;
a0ed51b3
LW
6847 }
6848 break;
a0d0e21e 6849 case DIGIT:
f2ed9b32 6850 if (utf8_target) {
ffc61ed2 6851 loceol = PL_regeol;
1a4fad37 6852 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6853 while (hardcount < max && scan < loceol &&
f2ed9b32 6854 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6855 scan += UTF8SKIP(scan);
6856 hardcount++;
6857 }
6858 } else {
6859 while (scan < loceol && isDIGIT(*scan))
6860 scan++;
a0ed51b3
LW
6861 }
6862 break;
cfaf538b
KW
6863 case DIGITA:
6864 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6865 scan++;
6866 }
6867 break;
b77393f6
KW
6868 case DIGITL:
6869 PL_reg_flags |= RF_tainted;
6870 if (utf8_target) {
6871 loceol = PL_regeol;
6872 while (hardcount < max && scan < loceol &&
6873 isDIGIT_LC_utf8((U8*)scan)) {
6874 scan += UTF8SKIP(scan);
6875 hardcount++;
6876 }
6877 } else {
6878 while (scan < loceol && isDIGIT_LC(*scan))
6879 scan++;
6880 }
6881 break;
a0d0e21e 6882 case NDIGIT:
f2ed9b32 6883 if (utf8_target) {
ffc61ed2 6884 loceol = PL_regeol;
1a4fad37 6885 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 6886 while (hardcount < max && scan < loceol &&
f2ed9b32 6887 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
6888 scan += UTF8SKIP(scan);
6889 hardcount++;
6890 }
6891 } else {
6892 while (scan < loceol && !isDIGIT(*scan))
6893 scan++;
a0ed51b3 6894 }
cfaf538b
KW
6895 break;
6896 case NDIGITA:
6897 if (utf8_target) {
6898 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6899 scan += UTF8SKIP(scan);
6900 }
6901 }
6902 else {
6903 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6904 scan++;
6905 }
6906 }
6907 break;
b77393f6
KW
6908 case NDIGITL:
6909 PL_reg_flags |= RF_tainted;
6910 if (utf8_target) {
6911 loceol = PL_regeol;
6912 while (hardcount < max && scan < loceol &&
6913 !isDIGIT_LC_utf8((U8*)scan)) {
6914 scan += UTF8SKIP(scan);
6915 hardcount++;
6916 }
6917 } else {
6918 while (scan < loceol && !isDIGIT_LC(*scan))
6919 scan++;
6920 }
6921 break;
e1d1eefb 6922 case LNBREAK:
f2ed9b32 6923 if (utf8_target) {
e1d1eefb 6924 loceol = PL_regeol;
7016d6eb
DM
6925 while (hardcount < max && scan < loceol &&
6926 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
e1d1eefb
YO
6927 scan += c;
6928 hardcount++;
6929 }
6930 } else {
6931 /*
6932 LNBREAK can match two latin chars, which is ok,
6933 because we have a null terminated string, but we
6934 have to use hardcount in this situation
6935 */
7016d6eb 6936 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
e1d1eefb
YO
6937 scan+=c;
6938 hardcount++;
6939 }
6940 }
6941 break;
6942 case HORIZWS:
f2ed9b32 6943 if (utf8_target) {
e1d1eefb 6944 loceol = PL_regeol;
7016d6eb
DM
6945 while (hardcount < max && scan < loceol &&
6946 (c=is_HORIZWS_utf8_safe(scan, loceol)))
6947 {
e1d1eefb
YO
6948 scan += c;
6949 hardcount++;
6950 }
6951 } else {
7016d6eb 6952 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
6953 scan++;
6954 }
a0ed51b3 6955 break;
e1d1eefb 6956 case NHORIZWS:
f2ed9b32 6957 if (utf8_target) {
e1d1eefb 6958 loceol = PL_regeol;
7016d6eb
DM
6959 while (hardcount < max && scan < loceol &&
6960 !is_HORIZWS_utf8_safe(scan, loceol))
6961 {
e1d1eefb
YO
6962 scan += UTF8SKIP(scan);
6963 hardcount++;
6964 }
6965 } else {
7016d6eb 6966 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
6967 scan++;
6968
6969 }
6970 break;
6971 case VERTWS:
f2ed9b32 6972 if (utf8_target) {
e1d1eefb 6973 loceol = PL_regeol;
7016d6eb
DM
6974 while (hardcount < max && scan < loceol &&
6975 (c=is_VERTWS_utf8_safe(scan, loceol)))
6976 {
e1d1eefb
YO
6977 scan += c;
6978 hardcount++;
6979 }
6980 } else {
7016d6eb 6981 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
6982 scan++;
6983
6984 }
6985 break;
6986 case NVERTWS:
f2ed9b32 6987 if (utf8_target) {
e1d1eefb 6988 loceol = PL_regeol;
7016d6eb
DM
6989 while (hardcount < max && scan < loceol &&
6990 !is_VERTWS_utf8_safe(scan, loceol))
6991 {
e1d1eefb
YO
6992 scan += UTF8SKIP(scan);
6993 hardcount++;
6994 }
6995 } else {
7016d6eb 6996 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
6997 scan++;
6998
6999 }
7000 break;
7001
a0d0e21e
LW
7002 default: /* Called on something of 0 width. */
7003 break; /* So match right here or not at all. */
7004 }
a687059c 7005
a0ed51b3
LW
7006 if (hardcount)
7007 c = hardcount;
7008 else
f73aaa43
DM
7009 c = scan - *startposp;
7010 *startposp = scan;
a687059c 7011
a3621e74 7012 DEBUG_r({
e68ec53f 7013 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7014 DEBUG_EXECUTE_r({
e68ec53f
YO
7015 SV * const prop = sv_newmortal();
7016 regprop(prog, prop, p);
7017 PerlIO_printf(Perl_debug_log,
be8e71aa 7018 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7019 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7020 });
be8e71aa 7021 });
9041c2e3 7022
a0d0e21e 7023 return(c);
a687059c
LW
7024}
7025
c277df42 7026
be8e71aa 7027#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7028/*
6c6525b8 7029- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7030create a copy so that changes the caller makes won't change the shared one.
7031If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7032 */
ffc61ed2 7033SV *
32fc9b6a 7034Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7035{
6c6525b8 7036 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7037
7038 if (altsvp) {
7039 *altsvp = NULL;
7040 }
7041
7042 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7043}
7044#endif
7045
7046STATIC SV *
e0193e47 7047S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7048{
8c9eb58f
KW
7049 /* Returns the swash for the input 'node' in the regex 'prog'.
7050 * If <doinit> is true, will attempt to create the swash if not already
7051 * done.
7052 * If <listsvp> is non-null, will return the swash initialization string in
7053 * it.
8c9eb58f
KW
7054 * Tied intimately to how regcomp.c sets up the data structure */
7055
97aff369 7056 dVAR;
9e55ce06
JH
7057 SV *sw = NULL;
7058 SV *si = NULL;
7a6c6baa
KW
7059 SV* invlist = NULL;
7060
f8fc2ecf
YO
7061 RXi_GET_DECL(prog,progi);
7062 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7063
6c6525b8 7064 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7065
ccb2541c
KW
7066 assert(ANYOF_NONBITMAP(node));
7067
4f639d21 7068 if (data && data->count) {
a3b680e6 7069 const U32 n = ARG(node);
ffc61ed2 7070
4f639d21 7071 if (data->what[n] == 's') {
ad64d0ec
NC
7072 SV * const rv = MUTABLE_SV(data->data[n]);
7073 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7074 SV **const ary = AvARRAY(av);
87367d5f 7075 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7076
8c9eb58f 7077 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7078
88675427
KW
7079 /* Elements 2 and 3 are either both present or both absent. [2] is
7080 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7081 * that inversion list has any user-defined properties in it. */
88675427
KW
7082 if (av_len(av) >= 2) {
7083 invlist = ary[2];
7084 if (SvUV(ary[3])) {
83199d38
KW
7085 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7086 }
7a6c6baa
KW
7087 }
7088 else {
7089 invlist = NULL;
7a6c6baa
KW
7090 }
7091
8c9eb58f
KW
7092 /* Element [1] is reserved for the set-up swash. If already there,
7093 * return it; if not, create it and store it there */
f192cf32
KW
7094 if (SvROK(ary[1])) {
7095 sw = ary[1];
7096 }
ffc61ed2 7097 else if (si && doinit) {
7a6c6baa
KW
7098
7099 sw = _core_swash_init("utf8", /* the utf8 package */
7100 "", /* nameless */
7101 si,
7102 1, /* binary */
7103 0, /* not from tr/// */
7a6c6baa 7104 invlist,
83199d38 7105 &swash_init_flags);
ffc61ed2
JH
7106 (void)av_store(av, 1, sw);
7107 }
7108 }
7109 }
7110
7a6c6baa
KW
7111 if (listsvp) {
7112 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7113
7114 /* Use the swash, if any, which has to have incorporated into it all
7115 * possibilities */
872dd7e0
KW
7116 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7117 && (si && si != &PL_sv_undef))
7118 {
7a6c6baa 7119
872dd7e0 7120 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7121 sv_catsv(matches_string, si);
7122 }
7123
7124 /* Add the inversion list to whatever we have. This may have come from
7125 * the swash, or from an input parameter */
7126 if (invlist) {
7127 sv_catsv(matches_string, _invlist_contents(invlist));
7128 }
7129 *listsvp = matches_string;
7130 }
7131
ffc61ed2
JH
7132 return sw;
7133}
7134
7135/*
ba7b4546 7136 - reginclass - determine if a character falls into a character class
832705d4 7137
6698fab5
KW
7138 n is the ANYOF regnode
7139 p is the target string
7140 lenp is pointer to the maximum number of bytes of how far to go in p
7141 (This is assumed wthout checking to always be at least the current
7142 character's size)
7143 utf8_target tells whether p is in UTF-8.
832705d4 7144
4b3cda86
KW
7145 Returns true if matched; false otherwise. If lenp is not NULL, on return
7146 from a successful match, the value it points to will be updated to how many
7147 bytes in p were matched. If there was no match, the value is undefined,
7148 possibly changed from the input.
eba1359e 7149
d5788240
KW
7150 Note that this can be a synthetic start class, a combination of various
7151 nodes, so things you think might be mutually exclusive, such as locale,
7152 aren't. It can match both locale and non-locale
7153
bbce6d69 7154 */
7155
76e3520e 7156STATIC bool
f6ad78d8 7157S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
bbce6d69 7158{
27da23d5 7159 dVAR;
a3b680e6 7160 const char flags = ANYOF_FLAGS(n);
bbce6d69 7161 bool match = FALSE;
cc07378b 7162 UV c = *p;
f7ab54c6 7163 STRLEN c_len = 0;
6698fab5 7164 STRLEN maxlen;
1aa99e6b 7165
7918f24d
NC
7166 PERL_ARGS_ASSERT_REGINCLASS;
7167
4b3cda86 7168 /* If c is not already the code point, get it */
f2ed9b32 7169 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
f7ab54c6 7170 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7171 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7172 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7173 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7174 * UTF8_ALLOW_FFFF */
f7ab54c6 7175 if (c_len == (STRLEN)-1)
e8a70c6f 7176 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7177 }
a5a291f5
KW
7178 else {
7179 c_len = 1;
7180 }
bbce6d69 7181
a5a291f5
KW
7182 /* Use passed in max length, or one character if none passed in or less
7183 * than one character. And assume will match just one character. This is
7184 * overwritten later if matched more. */
4b3cda86 7185 if (lenp) {
a5a291f5
KW
7186 maxlen = (*lenp > c_len) ? *lenp : c_len;
7187 *lenp = c_len;
4b3cda86
KW
7188
7189 }
7190 else {
a5a291f5 7191 maxlen = c_len;
4b3cda86
KW
7192 }
7193
7cdde544
KW
7194 /* If this character is potentially in the bitmap, check it */
7195 if (c < 256) {
ffc61ed2
JH
7196 if (ANYOF_BITMAP_TEST(n, c))
7197 match = TRUE;
11454c59
KW
7198 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7199 && ! utf8_target
7200 && ! isASCII(c))
7201 {
7202 match = TRUE;
7203 }
a0ed51b3 7204
78969a98
KW
7205 else if (flags & ANYOF_LOCALE) {
7206 PL_reg_flags |= RF_tainted;
7207
7208 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
7209 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7210 {
ffc61ed2 7211 match = TRUE;
78969a98 7212 }
040aea3a
KW
7213 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7214 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7215 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7216 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7217 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7218 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7219 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7220 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7221 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7222 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7223 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
7224 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7225 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
7226 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7227 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7228 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7229 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7230 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7231 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7232 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7233 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7234 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7235 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7236 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7237 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7238 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7239 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7240 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7241 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
7242 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7243 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 7244 ) /* How's that for a conditional? */
78969a98 7245 ) {
ffc61ed2
JH
7246 match = TRUE;
7247 }
a0ed51b3 7248 }
a0ed51b3
LW
7249 }
7250
7cdde544 7251 /* If the bitmap didn't (or couldn't) match, and something outside the
de87c4fe
KW
7252 * bitmap could match, try that. Locale nodes specifiy completely the
7253 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7254 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7255 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7256 * could be a combination of locale and non-locale nodes. So allow locale
7257 * to match for the synthetic start class, which will give a false
7258 * positive that will be resolved when the match is done again as not part
7259 * of the synthetic start class */
ef87b810 7260 if (!match) {
10ee90d2
KW
7261 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7262 match = TRUE; /* Everything above 255 matches */
e051a21d 7263 }
6f8d7d0d
KW
7264 else if (ANYOF_NONBITMAP(n)
7265 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7266 || (utf8_target
7267 && (c >=256
7268 || (! (flags & ANYOF_LOCALE))
7269 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 7270 {
e0193e47 7271 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7272 if (sw) {
7273 U8 * utf8_p;
7274 if (utf8_target) {
7275 utf8_p = (U8 *) p;
e0193e47
KW
7276 } else { /* Convert to utf8 */
7277 STRLEN len = 1;
7cdde544
KW
7278 utf8_p = bytes_to_utf8(p, &len);
7279 }
f56b6394 7280
e0193e47 7281 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7282 match = TRUE;
e0193e47 7283 }
7cdde544
KW
7284
7285 /* If we allocated a string above, free it */
7286 if (! utf8_target) Safefree(utf8_p);
7287 }
7288 }
5073ffbd
KW
7289
7290 if (UNICODE_IS_SUPER(c)
7291 && (flags & ANYOF_WARN_SUPER)
7292 && ckWARN_d(WARN_NON_UNICODE))
7293 {
7294 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7295 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7296 }
7cdde544
KW
7297 }
7298
f0fdc1c9
KW
7299 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7300 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7301}
161b471a 7302
dfe13c55 7303STATIC U8 *
0ce71af7 7304S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7305{
6af86488
KW
7306 /* return the position 'off' UTF-8 characters away from 's', forward if
7307 * 'off' >= 0, backwards if negative. But don't go outside of position
7308 * 'lim', which better be < s if off < 0 */
7309
97aff369 7310 dVAR;
7918f24d
NC
7311
7312 PERL_ARGS_ASSERT_REGHOP3;
7313
a0ed51b3 7314 if (off >= 0) {
1aa99e6b 7315 while (off-- && s < lim) {
ffc61ed2 7316 /* XXX could check well-formedness here */
a0ed51b3 7317 s += UTF8SKIP(s);
ffc61ed2 7318 }
a0ed51b3
LW
7319 }
7320 else {
1de06328
YO
7321 while (off++ && s > lim) {
7322 s--;
7323 if (UTF8_IS_CONTINUED(*s)) {
7324 while (s > lim && UTF8_IS_CONTINUATION(*s))
7325 s--;
a0ed51b3 7326 }
1de06328 7327 /* XXX could check well-formedness here */
a0ed51b3
LW
7328 }
7329 }
7330 return s;
7331}
161b471a 7332
f9f4320a
YO
7333#ifdef XXX_dmq
7334/* there are a bunch of places where we use two reghop3's that should
7335 be replaced with this routine. but since thats not done yet
7336 we ifdef it out - dmq
7337*/
dfe13c55 7338STATIC U8 *
1de06328
YO
7339S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7340{
7341 dVAR;
7918f24d
NC
7342
7343 PERL_ARGS_ASSERT_REGHOP4;
7344
1de06328
YO
7345 if (off >= 0) {
7346 while (off-- && s < rlim) {
7347 /* XXX could check well-formedness here */
7348 s += UTF8SKIP(s);
7349 }
7350 }
7351 else {
7352 while (off++ && s > llim) {
7353 s--;
7354 if (UTF8_IS_CONTINUED(*s)) {
7355 while (s > llim && UTF8_IS_CONTINUATION(*s))
7356 s--;
7357 }
7358 /* XXX could check well-formedness here */
7359 }
7360 }
7361 return s;
7362}
f9f4320a 7363#endif
1de06328
YO
7364
7365STATIC U8 *
0ce71af7 7366S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7367{
97aff369 7368 dVAR;
7918f24d
NC
7369
7370 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7371
a0ed51b3 7372 if (off >= 0) {
1aa99e6b 7373 while (off-- && s < lim) {
ffc61ed2 7374 /* XXX could check well-formedness here */
a0ed51b3 7375 s += UTF8SKIP(s);
ffc61ed2 7376 }
a0ed51b3 7377 if (off >= 0)
3dab1dad 7378 return NULL;
a0ed51b3
LW
7379 }
7380 else {
1de06328
YO
7381 while (off++ && s > lim) {
7382 s--;
7383 if (UTF8_IS_CONTINUED(*s)) {
7384 while (s > lim && UTF8_IS_CONTINUATION(*s))
7385 s--;
a0ed51b3 7386 }
1de06328 7387 /* XXX could check well-formedness here */
a0ed51b3
LW
7388 }
7389 if (off <= 0)
3dab1dad 7390 return NULL;
a0ed51b3
LW
7391 }
7392 return s;
7393}
51371543 7394
51371543 7395static void
acfe0abc 7396restore_pos(pTHX_ void *arg)
51371543 7397{
97aff369 7398 dVAR;
097eb12c 7399 regexp * const rex = (regexp *)arg;
ed301438 7400 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7401 if (PL_reg_oldsaved) {
4f639d21
DM
7402 rex->subbeg = PL_reg_oldsaved;
7403 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7404 rex->suboffset = PL_reg_oldsavedoffset;
7405 rex->subcoffset = PL_reg_oldsavedcoffset;
f8c7b90f 7406#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7407 rex->saved_copy = PL_nrs;
ed252734 7408#endif
07bc277f 7409 RXp_MATCH_COPIED_on(rex);
51371543
GS
7410 }
7411 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7412 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7413 PL_curpm = PL_reg_oldcurpm;
7414 }
7415}
33b8afdf
JH
7416
7417STATIC void
7418S_to_utf8_substr(pTHX_ register regexp *prog)
7419{
7e0d5ad7
KW
7420 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7421 * on the converted value */
7422
a1cac82e 7423 int i = 1;
7918f24d
NC
7424
7425 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7426
a1cac82e
NC
7427 do {
7428 if (prog->substrs->data[i].substr
7429 && !prog->substrs->data[i].utf8_substr) {
7430 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7431 prog->substrs->data[i].utf8_substr = sv;
7432 sv_utf8_upgrade(sv);
610460f9 7433 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7434 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7435 /* Trim the trailing \n that fbm_compile added last
7436 time. */
7437 SvCUR_set(sv, SvCUR(sv) - 1);
7438 /* Whilst this makes the SV technically "invalid" (as its
7439 buffer is no longer followed by "\0") when fbm_compile()
7440 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7441 fbm_compile(sv, FBMcf_TAIL);
7442 } else
7443 fbm_compile(sv, 0);
610460f9 7444 }
a1cac82e
NC
7445 if (prog->substrs->data[i].substr == prog->check_substr)
7446 prog->check_utf8 = sv;
7447 }
7448 } while (i--);
33b8afdf
JH
7449}
7450
7e0d5ad7 7451STATIC bool
33b8afdf
JH
7452S_to_byte_substr(pTHX_ register regexp *prog)
7453{
7e0d5ad7
KW
7454 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7455 * on the converted value; returns FALSE if can't be converted. */
7456
97aff369 7457 dVAR;
a1cac82e 7458 int i = 1;
7918f24d
NC
7459
7460 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7461
a1cac82e
NC
7462 do {
7463 if (prog->substrs->data[i].utf8_substr
7464 && !prog->substrs->data[i].substr) {
7465 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7466 if (! sv_utf8_downgrade(sv, TRUE)) {
7467 return FALSE;
7468 }
5400f398
KW
7469 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7470 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7471 /* Trim the trailing \n that fbm_compile added last
7472 time. */
7473 SvCUR_set(sv, SvCUR(sv) - 1);
7474 fbm_compile(sv, FBMcf_TAIL);
7475 } else
7476 fbm_compile(sv, 0);
7477 }
a1cac82e
NC
7478 prog->substrs->data[i].substr = sv;
7479 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7480 prog->check_substr = sv;
33b8afdf 7481 }
a1cac82e 7482 } while (i--);
7e0d5ad7
KW
7483
7484 return TRUE;
33b8afdf 7485}
66610fdd 7486
a0e786e5
KW
7487/* These constants are for finding GCB=LV and GCB=LVT. These are for the
7488 * pre-composed Hangul syllables, which are all in a contiguous block and
7489 * arranged there in such a way so as to facilitate alorithmic determination of
7490 * their characteristics. As such, they don't need a swash, but can be
7491 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7492 * is a GCB=LV */
7493#define SBASE 0xAC00 /* Start of block */
7494#define SCount 11172 /* Length of block */
7495#define TCount 28
7496
7497#if 0 /* This routine is not currently used */
7498PERL_STATIC_INLINE bool
7499S_is_utf8_X_LV(pTHX_ const U8 *p)
7500{
7501 /* Unlike most other similarly named routines here, this does not create a
7502 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7503
7504 dVAR;
7505
7506 UV cp = valid_utf8_to_uvchr(p, NULL);
7507
7508 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7509
7510 /* The earliest Unicode releases did not have these precomposed Hangul
7511 * syllables. Set to point to undef in that case, so will return false on
7512 * every call */
7513 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7514 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7515 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7516 SvREFCNT_dec(PL_utf8_X_LV);
7517 PL_utf8_X_LV = &PL_sv_undef;
7518 }
7519 }
7520
7521 return (PL_utf8_X_LV != &PL_sv_undef
7522 && cp >= SBASE && cp < SBASE + SCount
7523 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7524}
7525#endif
7526
7527PERL_STATIC_INLINE bool
7528S_is_utf8_X_LVT(pTHX_ const U8 *p)
7529{
7530 /* Unlike most other similarly named routines here, this does not create a
7531 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7532
7533 dVAR;
7534
7535 UV cp = valid_utf8_to_uvchr(p, NULL);
7536
7537 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7538
7539 /* The earliest Unicode releases did not have these precomposed Hangul
7540 * syllables. Set to point to undef in that case, so will return false on
7541 * every call */
7542 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7543 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7544 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7545 SvREFCNT_dec(PL_utf8_X_LVT);
7546 PL_utf8_X_LVT = &PL_sv_undef;
7547 }
7548 }
7549
7550 return (PL_utf8_X_LVT != &PL_sv_undef
7551 && cp >= SBASE && cp < SBASE + SCount
7552 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7553}
7554
66610fdd
RGS
7555/*
7556 * Local variables:
7557 * c-indentation-style: bsd
7558 * c-basic-offset: 4
14d04a33 7559 * indent-tabs-mode: nil
66610fdd
RGS
7560 * End:
7561 *
14d04a33 7562 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7563 */