This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
test the resetting of refcnt for immortals
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 97#define RF_warned 2 /* warned about big count? */
faec1544 98
ab3bbdeb 99#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 100
f2ed9b32 101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 102
c74f6de9
KW
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
a687059c
LW
105#ifndef STATIC
106#define STATIC static
107#endif
108
e0193e47 109/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
635cd5d4 112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 113 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 114
c277df42
IZ
115/*
116 * Forwards.
117 */
118
f2ed9b32 119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 121
3dab1dad
YO
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
52657f30 124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
07be1b83
YO
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
8e11feef 130 ? (U8*)pos - off \
3dab1dad 131 : NULL)
efb30f32 132
e7409c1b 133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 135
7016d6eb
DM
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
20d0b1e9 148/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 150 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 151 bool ok; \
9c4fdda1 152 ENTER; save_re_context(); \
cf54f63a
JL
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 158 if (!CAT2(PL_utf8_,class)) { \
cb65b687
DM
159 bool throw_away; \
160 PERL_UNUSED_VAR(throw_away); \
9c4fdda1
RB
161 ENTER; save_re_context(); \
162 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
c1976674 163 PERL_UNUSED_VAR(throw_away); \
9c4fdda1 164 LEAVE; } } STMT_END
37e2e78e 165
1a4fad37
AL
166#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
167#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
51371543 168
37e2e78e 169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
170 /* No asserts are done for some of these, in case called on a */ \
171 /* Unicode version in which they map to nothing */ \
9566f14a 172 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 173 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 174
1dcf4a1b 175#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 176
ee9a90b8
KW
177/* The actual code for CCC_TRY, which uses several variables from the routine
178 * it's callable from. It is designed to be the bulk of a case statement.
179 * FUNC is the macro or function to call on non-utf8 targets that indicate if
180 * nextchr matches the class.
181 * UTF8_TEST is the whole test string to use for utf8 targets
182 * LOAD is what to use to test, and if not present to load in the swash for the
183 * class
184 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
185 * UTF8_TEST test.
186 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
187 * utf8 and a variant, load the swash if necessary and test using the utf8
188 * test. Advance to the next character if test is ok, otherwise fail; If not
189 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
190 * fails, or advance to the next character */
191
192#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
9566f14a 193 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
194 sayNO; \
195 } \
196 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
197 LOAD_UTF8_CHARCLASS(CLASS, STR); \
198 if (POS_OR_NEG (UTF8_TEST)) { \
199 sayNO; \
200 } \
ee9a90b8 201 } \
28b98f76
DM
202 else if (POS_OR_NEG (FUNC(nextchr))) { \
203 sayNO; \
ee9a90b8 204 } \
28b98f76 205 goto increment_locinput;
980866de 206
ee9a90b8
KW
207/* Handle the non-locale cases for a character class and its complement. It
208 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
209 * This is because that code fails when the test succeeds, so we want to have
210 * the test fail so that the code succeeds. The swash is stored in a
211 * predictable PL_ place */
cfaf538b
KW
212#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
213 CLASS, STR) \
ee9a90b8
KW
214 case NAME: \
215 _CCC_TRY_CODE( !, FUNC, \
216 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
217 (U8*)locinput, TRUE)), \
218 CLASS, STR) \
219 case NNAME: \
1dcf4a1b 220 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
221 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
222 (U8*)locinput, TRUE)), \
9566f14a 223 CLASS, STR)
ee9a90b8
KW
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 236 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
cfaf538b 245 case NAMEA: \
7016d6eb 246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 250 locinput++; \
cfaf538b
KW
251 break; \
252 case NNAMEA: \
7016d6eb 253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
254 sayNO; \
255 } \
28b98f76 256 goto increment_locinput; \
ee9a90b8
KW
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
cfaf538b 265 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 266 CLASS, STR) \
cfaf538b
KW
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
ee9a90b8 271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 272
3dab1dad
YO
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
5f80c4cf 275/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
3e901dc0
YO
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
cca55fe3
JP
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 291 OP(rn) == KEEPS || \
3dab1dad 292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 293)
ee9b8eae 294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 295
ee9b8eae
YO
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
e2d8ce26 313
a84d97b6
HS
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
cca55fe3 318#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 322 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 323 else if (type == PLUS) \
cca55fe3 324 rn = NEXTOPER(rn); \
3dab1dad 325 else if (type == IFMATCH) \
a84d97b6 326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 327 else rn += NEXT_OFF(rn); \
3dab1dad 328 } \
5f80c4cf 329} STMT_END
74750237 330
22913b96
KW
331/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
332 * These are for the pre-composed Hangul syllables, which are all in a
333 * contiguous block and arranged there in such a way so as to facilitate
334 * alorithmic determination of their characteristics. As such, they don't need
335 * a swash, but can be determined by simple arithmetic. Almost all are
336 * GCB=LVT, but every 28th one is a GCB=LV */
337#define SBASE 0xAC00 /* Start of block */
338#define SCount 11172 /* Length of block */
339#define TCount 28
c476f425 340
acfe0abc 341static void restore_pos(pTHX_ void *arg);
51371543 342
87c0511b 343#define REGCP_PAREN_ELEMS 3
f067efbf 344#define REGCP_OTHER_ELEMS 3
e0fa7e2b 345#define REGCP_FRAME_ELEMS 1
620d5b66
NC
346/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
347 * are needed for the regexp context stack bookkeeping. */
348
76e3520e 349STATIC CHECKPOINT
92da3157 350S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 351{
97aff369 352 dVAR;
a3b680e6 353 const int retval = PL_savestack_ix;
92da3157
DM
354 const int paren_elems_to_push =
355 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
356 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
357 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 358 I32 p;
40a82448 359 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 360
b93070ed
DM
361 PERL_ARGS_ASSERT_REGCPPUSH;
362
e49a9654 363 if (paren_elems_to_push < 0)
5637ef5b
NC
364 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
365 paren_elems_to_push);
e49a9654 366
e0fa7e2b
NC
367 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
368 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 369 " out of range (%lu-%ld)",
92da3157
DM
370 total_elems,
371 (unsigned long)maxopenparen,
372 (long)parenfloor);
e0fa7e2b 373
620d5b66 374 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 375
495f47a5 376 DEBUG_BUFFERS_r(
92da3157 377 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
378 PerlIO_printf(Perl_debug_log,
379 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
380 PTR2UV(rex),
381 PTR2UV(rex->offs)
382 );
383 );
92da3157 384 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 385/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
386 SSPUSHINT(rex->offs[p].end);
387 SSPUSHINT(rex->offs[p].start);
1ca2007e 388 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 389 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
390 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
391 (UV)p,
392 (IV)rex->offs[p].start,
393 (IV)rex->offs[p].start_tmp,
394 (IV)rex->offs[p].end
40a82448 395 ));
a0d0e21e 396 }
b1ce53c5 397/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 398 SSPUSHINT(maxopenparen);
b93070ed
DM
399 SSPUSHINT(rex->lastparen);
400 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 401 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 402
a0d0e21e
LW
403 return retval;
404}
405
c277df42 406/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
407#define REGCP_SET(cp) \
408 DEBUG_STATE_r( \
ab3bbdeb 409 PerlIO_printf(Perl_debug_log, \
e4f74956 410 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
411 (IV)PL_savestack_ix)); \
412 cp = PL_savestack_ix
c3464db5 413
ab3bbdeb 414#define REGCP_UNWIND(cp) \
e4f74956 415 DEBUG_STATE_r( \
ab3bbdeb 416 if (cp != PL_savestack_ix) \
e4f74956
YO
417 PerlIO_printf(Perl_debug_log, \
418 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
419 (IV)(cp), (IV)PL_savestack_ix)); \
420 regcpblow(cp)
c277df42 421
a8d1f4b4
DM
422#define UNWIND_PAREN(lp, lcp) \
423 for (n = rex->lastparen; n > lp; n--) \
424 rex->offs[n].end = -1; \
425 rex->lastparen = n; \
426 rex->lastcloseparen = lcp;
427
428
f067efbf 429STATIC void
92da3157 430S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 431{
97aff369 432 dVAR;
e0fa7e2b 433 UV i;
87c0511b 434 U32 paren;
a3621e74
YO
435 GET_RE_DEBUG_FLAGS_DECL;
436
7918f24d
NC
437 PERL_ARGS_ASSERT_REGCPPOP;
438
b1ce53c5 439 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 440 i = SSPOPUV;
e0fa7e2b
NC
441 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
442 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
443 rex->lastcloseparen = SSPOPINT;
444 rex->lastparen = SSPOPINT;
92da3157 445 *maxopenparen_p = SSPOPINT;
b1ce53c5 446
620d5b66 447 i -= REGCP_OTHER_ELEMS;
b1ce53c5 448 /* Now restore the parentheses context. */
495f47a5
DM
449 DEBUG_BUFFERS_r(
450 if (i || rex->lastparen + 1 <= rex->nparens)
451 PerlIO_printf(Perl_debug_log,
452 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
453 PTR2UV(rex),
454 PTR2UV(rex->offs)
455 );
456 );
92da3157 457 paren = *maxopenparen_p;
620d5b66 458 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 459 I32 tmps;
1ca2007e 460 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 461 rex->offs[paren].start = SSPOPINT;
cf93c79d 462 tmps = SSPOPINT;
b93070ed
DM
463 if (paren <= rex->lastparen)
464 rex->offs[paren].end = tmps;
495f47a5
DM
465 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
466 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
467 (UV)paren,
468 (IV)rex->offs[paren].start,
469 (IV)rex->offs[paren].start_tmp,
470 (IV)rex->offs[paren].end,
471 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 472 );
87c0511b 473 paren--;
a0d0e21e 474 }
daf18116 475#if 1
dafc8851
JH
476 /* It would seem that the similar code in regtry()
477 * already takes care of this, and in fact it is in
478 * a better location to since this code can #if 0-ed out
479 * but the code in regtry() is needed or otherwise tests
480 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
481 * (as of patchlevel 7877) will fail. Then again,
482 * this code seems to be necessary or otherwise
225593e1
DM
483 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
484 * --jhi updated by dapm */
b93070ed 485 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 486 if (i > *maxopenparen_p)
b93070ed
DM
487 rex->offs[i].start = -1;
488 rex->offs[i].end = -1;
495f47a5
DM
489 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
490 " \\%"UVuf": %s ..-1 undeffing\n",
491 (UV)i,
92da3157 492 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 493 ));
a0d0e21e 494 }
dafc8851 495#endif
a0d0e21e
LW
496}
497
74088413
DM
498/* restore the parens and associated vars at savestack position ix,
499 * but without popping the stack */
500
501STATIC void
92da3157 502S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
503{
504 I32 tmpix = PL_savestack_ix;
505 PL_savestack_ix = ix;
92da3157 506 regcppop(rex, maxopenparen_p);
74088413
DM
507 PL_savestack_ix = tmpix;
508}
509
02db2b7b 510#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 511
31c7f561
KW
512STATIC bool
513S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
514{
515 /* Returns a boolean as to whether or not 'character' is a member of the
516 * Posix character class given by 'classnum' that should be equivalent to a
517 * value in the typedef '_char_class_number'.
518 *
519 * Ideally this could be replaced by a just an array of function pointers
520 * to the C library functions that implement the macros this calls.
521 * However, to compile, the precise function signatures are required, and
522 * these may vary from platform to to platform. To avoid having to figure
523 * out what those all are on each platform, I (khw) am using this method,
524 * which adds an extra layer of function call overhead. But we don't
525 * particularly care about performance with locales anyway. */
526
527 switch ((_char_class_number) classnum) {
528 case _CC_ENUM_ALNUMC: return isALNUMC_LC(character);
529 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
530 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
531 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
532 case _CC_ENUM_LOWER: return isLOWER_LC(character);
533 case _CC_ENUM_PRINT: return isPRINT_LC(character);
534 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
535 case _CC_ENUM_UPPER: return isUPPER_LC(character);
536 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
537 case _CC_ENUM_SPACE: return isSPACE_LC(character);
538 case _CC_ENUM_BLANK: return isBLANK_LC(character);
539 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
540 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
541 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
542 case _CC_ENUM_ASCII: return isASCII_LC(character);
543 default: /* VERTSPACE should never occur in locales */
544 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
545 }
546
547 assert(0); /* NOTREACHED */
548 return FALSE;
549}
550
a687059c 551/*
e50aee73 552 * pregexec and friends
a687059c
LW
553 */
554
76234dfb 555#ifndef PERL_IN_XSUB_RE
a687059c 556/*
c277df42 557 - pregexec - match a regexp against a string
a687059c 558 */
c277df42 559I32
5aaab254 560Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 561 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
562/* stringarg: the point in the string at which to begin matching */
563/* strend: pointer to null at end of string */
564/* strbeg: real beginning of string */
565/* minend: end of match must be >= minend bytes after stringarg. */
566/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
567 * itself is accessed via the pointers above */
568/* nosave: For optimizations. */
c277df42 569{
7918f24d
NC
570 PERL_ARGS_ASSERT_PREGEXEC;
571
c277df42 572 return
9041c2e3 573 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
574 nosave ? 0 : REXEC_COPY_STR);
575}
76234dfb 576#endif
22e551b9 577
9041c2e3 578/*
cad2e5aa
JH
579 * Need to implement the following flags for reg_anch:
580 *
581 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
582 * USE_INTUIT_ML
583 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
584 * INTUIT_AUTORITATIVE_ML
585 * INTUIT_ONCE_NOML - Intuit can match in one location only.
586 * INTUIT_ONCE_ML
587 *
588 * Another flag for this function: SECOND_TIME (so that float substrs
589 * with giant delta may be not rechecked).
590 */
591
592/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
593
3f7c398e 594/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
595 Otherwise, only SvCUR(sv) is used to get strbeg. */
596
597/* XXXX We assume that strpos is strbeg unless sv. */
598
6eb5f6b9
JH
599/* XXXX Some places assume that there is a fixed substring.
600 An update may be needed if optimizer marks as "INTUITable"
601 RExen without fixed substrings. Similarly, it is assumed that
602 lengths of all the strings are no more than minlen, thus they
603 cannot come from lookahead.
40d049e4
YO
604 (Or minlen should take into account lookahead.)
605 NOTE: Some of this comment is not correct. minlen does now take account
606 of lookahead/behind. Further research is required. -- demerphq
607
608*/
6eb5f6b9 609
2c2d71f5
JH
610/* A failure to find a constant substring means that there is no need to make
611 an expensive call to REx engine, thus we celebrate a failure. Similarly,
612 finding a substring too deep into the string means that less calls to
30944b6d
IZ
613 regtry() should be needed.
614
615 REx compiler's optimizer found 4 possible hints:
616 a) Anchored substring;
617 b) Fixed substring;
618 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 619 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 620 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
621 string which does not contradict any of them.
622 */
2c2d71f5 623
6eb5f6b9
JH
624/* Most of decisions we do here should have been done at compile time.
625 The nodes of the REx which we used for the search should have been
626 deleted from the finite automaton. */
627
cad2e5aa 628char *
288b8c02 629Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 630 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 631{
97aff369 632 dVAR;
8d919b0a 633 struct regexp *const prog = ReANY(rx);
eb578fdb 634 I32 start_shift = 0;
cad2e5aa 635 /* Should be nonnegative! */
eb578fdb
KW
636 I32 end_shift = 0;
637 char *s;
638 SV *check;
a1933d95 639 char *strbeg;
cad2e5aa 640 char *t;
f2ed9b32 641 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 642 I32 ml_anch;
eb578fdb 643 char *other_last = NULL; /* other substr checked before this */
bd61b366 644 char *check_at = NULL; /* check substr found at this pos */
d8080198 645 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 646 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 647 RXi_GET_DECL(prog,progi);
30944b6d 648#ifdef DEBUGGING
890ce7af 649 const char * const i_strpos = strpos;
30944b6d 650#endif
a3621e74
YO
651 GET_RE_DEBUG_FLAGS_DECL;
652
7918f24d 653 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
654 PERL_UNUSED_ARG(flags);
655 PERL_UNUSED_ARG(data);
7918f24d 656
f2ed9b32 657 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 658
3c8556c3 659 if (RX_UTF8(rx)) {
b8d68ded
JH
660 PL_reg_flags |= RF_utf8;
661 }
ab3bbdeb 662 DEBUG_EXECUTE_r(
f2ed9b32 663 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
664 sv ? "Guessing start of match in sv for"
665 : "Guessing start of match in string for");
2a782b5b 666 );
cad2e5aa 667
c344f387
JH
668 /* CHR_DIST() would be more correct here but it makes things slow. */
669 if (prog->minlen > strend - strpos) {
a3621e74 670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 671 "String too short... [re_intuit_start]\n"));
cad2e5aa 672 goto fail;
2c2d71f5 673 }
1de06328 674
7016d6eb
DM
675 /* XXX we need to pass strbeg as a separate arg: the following is
676 * guesswork and can be wrong... */
677 if (sv && SvPOK(sv)) {
678 char * p = SvPVX(sv);
679 STRLEN cur = SvCUR(sv);
680 if (p <= strpos && strpos < p + cur) {
681 strbeg = p;
682 assert(p <= strend && strend <= p + cur);
683 }
684 else
685 strbeg = strend - cur;
686 }
687 else
688 strbeg = strpos;
689
1aa99e6b 690 PL_regeol = strend;
f2ed9b32 691 if (utf8_target) {
33b8afdf
JH
692 if (!prog->check_utf8 && prog->check_substr)
693 to_utf8_substr(prog);
694 check = prog->check_utf8;
695 } else {
7e0d5ad7
KW
696 if (!prog->check_substr && prog->check_utf8) {
697 if (! to_byte_substr(prog)) {
6b54ddc5 698 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
699 }
700 }
33b8afdf
JH
701 check = prog->check_substr;
702 }
bbe252da
YO
703 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
704 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
705 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 706 && !multiline ) ); /* Check after \n? */
cad2e5aa 707
7e25d62c 708 if (!ml_anch) {
bbe252da
YO
709 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
710 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 711 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
712 && sv && !SvROK(sv)
713 && (strpos != strbeg)) {
a3621e74 714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
715 goto fail;
716 }
d46b78c6
KW
717 if (prog->check_offset_min == prog->check_offset_max
718 && !(prog->extflags & RXf_CANY_SEEN)
719 && ! multiline) /* /m can cause \n's to match that aren't
720 accounted for in the string max length.
721 See [perl #115242] */
722 {
2c2d71f5 723 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
724 I32 slen;
725
1aa99e6b 726 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 727
653099ff
GS
728 if (SvTAIL(check)) {
729 slen = SvCUR(check); /* >= 1 */
cad2e5aa 730
9041c2e3 731 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 732 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 734 goto fail_finish;
cad2e5aa
JH
735 }
736 /* Now should match s[0..slen-2] */
737 slen--;
3f7c398e 738 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 739 || (slen > 1
3f7c398e 740 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 741 report_neq:
a3621e74 742 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
743 goto fail_finish;
744 }
cad2e5aa 745 }
3f7c398e 746 else if (*SvPVX_const(check) != *s
653099ff 747 || ((slen = SvCUR(check)) > 1
3f7c398e 748 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 749 goto report_neq;
c315bfe8 750 check_at = s;
2c2d71f5 751 goto success_at_start;
7e25d62c 752 }
cad2e5aa 753 }
2c2d71f5 754 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 755 s = strpos;
2c2d71f5 756 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
757 end_shift = prog->check_end_shift;
758
2c2d71f5 759 if (!ml_anch) {
a3b680e6 760 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 761 - (SvTAIL(check) != 0);
a3b680e6 762 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
763
764 if (end_shift < eshift)
765 end_shift = eshift;
766 }
cad2e5aa 767 }
2c2d71f5 768 else { /* Can match at random position */
cad2e5aa
JH
769 ml_anch = 0;
770 s = strpos;
1de06328
YO
771 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
772 end_shift = prog->check_end_shift;
773
774 /* end shift should be non negative here */
cad2e5aa
JH
775 }
776
bcdf7404 777#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 778 if (end_shift < 0)
1de06328 779 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 780 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
781#endif
782
2c2d71f5
JH
783 restart:
784 /* Find a possible match in the region s..strend by looking for
785 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
786
787 {
788 I32 srch_start_shift = start_shift;
789 I32 srch_end_shift = end_shift;
c33e64f0
FC
790 U8* start_point;
791 U8* end_point;
1de06328
YO
792 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
793 srch_end_shift -= ((strbeg - s) - srch_start_shift);
794 srch_start_shift = strbeg - s;
795 }
6bda09f9 796 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
797 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
798 (IV)prog->check_offset_min,
799 (IV)srch_start_shift,
800 (IV)srch_end_shift,
801 (IV)prog->check_end_shift);
802 });
803
bbe252da 804 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
805 start_point= (U8*)(s + srch_start_shift);
806 end_point= (U8*)(strend - srch_end_shift);
807 } else {
808 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
809 end_point= HOP3(strend, -srch_end_shift, strbeg);
810 }
6bda09f9 811 DEBUG_OPTIMISE_MORE_r({
56570a2c 812 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 813 (int)(end_point - start_point),
fc8cd66c 814 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
815 start_point);
816 });
817
818 s = fbm_instr( start_point, end_point,
7fba1cd6 819 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 820 }
cad2e5aa
JH
821 /* Update the count-of-usability, remove useless subpatterns,
822 unshift s. */
2c2d71f5 823
ab3bbdeb 824 DEBUG_EXECUTE_r({
f2ed9b32 825 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
826 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
827 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 828 (s ? "Found" : "Did not find"),
f2ed9b32 829 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
830 ? "anchored" : "floating"),
831 quoted,
832 RE_SV_TAIL(check),
833 (s ? " at offset " : "...\n") );
834 });
2c2d71f5
JH
835
836 if (!s)
837 goto fail_finish;
2c2d71f5 838 /* Finish the diagnostic message */
a3621e74 839 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 840
1de06328
YO
841 /* XXX dmq: first branch is for positive lookbehind...
842 Our check string is offset from the beginning of the pattern.
843 So we need to do any stclass tests offset forward from that
844 point. I think. :-(
845 */
846
847
848
849 check_at=s;
850
851
2c2d71f5
JH
852 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
853 Start with the other substr.
854 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 855 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
856 *always* match. Probably should be marked during compile...
857 Probably it is right to do no SCREAM here...
858 */
859
f2ed9b32 860 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
861 : (prog->float_substr && prog->anchored_substr))
862 {
30944b6d 863 /* Take into account the "other" substring. */
2c2d71f5
JH
864 /* XXXX May be hopelessly wrong for UTF... */
865 if (!other_last)
6eb5f6b9 866 other_last = strpos;
f2ed9b32 867 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
868 do_other_anchored:
869 {
890ce7af
AL
870 char * const last = HOP3c(s, -start_shift, strbeg);
871 char *last1, *last2;
be8e71aa 872 char * const saved_s = s;
33b8afdf 873 SV* must;
2c2d71f5 874
2c2d71f5
JH
875 t = s - prog->check_offset_max;
876 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 877 && (!utf8_target
0ce71af7 878 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 879 && t > strpos)))
6f207bd3 880 NOOP;
2c2d71f5
JH
881 else
882 t = strpos;
1aa99e6b 883 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
884 if (t < other_last) /* These positions already checked */
885 t = other_last;
1aa99e6b 886 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
887 if (last < last1)
888 last1 = last;
1de06328
YO
889 /* XXXX It is not documented what units *_offsets are in.
890 We assume bytes, but this is clearly wrong.
891 Meaning this code needs to be carefully reviewed for errors.
892 dmq.
893 */
894
2c2d71f5 895 /* On end-of-str: see comment below. */
f2ed9b32 896 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
897 if (must == &PL_sv_undef) {
898 s = (char*)NULL;
1de06328 899 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
900 }
901 else
902 s = fbm_instr(
903 (unsigned char*)t,
904 HOP3(HOP3(last1, prog->anchored_offset, strend)
905 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
906 must,
7fba1cd6 907 multiline ? FBMrf_MULTILINE : 0
33b8afdf 908 );
ab3bbdeb 909 DEBUG_EXECUTE_r({
f2ed9b32 910 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
911 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
912 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 913 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
914 quoted, RE_SV_TAIL(must));
915 });
916
917
2c2d71f5
JH
918 if (!s) {
919 if (last1 >= last2) {
a3621e74 920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
921 ", giving up...\n"));
922 goto fail_finish;
923 }
a3621e74 924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 925 ", trying floating at offset %ld...\n",
be8e71aa 926 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
927 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
928 s = HOP3c(last, 1, strend);
2c2d71f5
JH
929 goto restart;
930 }
931 else {
a3621e74 932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 933 (long)(s - i_strpos)));
1aa99e6b
IH
934 t = HOP3c(s, -prog->anchored_offset, strbeg);
935 other_last = HOP3c(s, 1, strend);
be8e71aa 936 s = saved_s;
2c2d71f5
JH
937 if (t == strpos)
938 goto try_at_start;
2c2d71f5
JH
939 goto try_at_offset;
940 }
30944b6d 941 }
2c2d71f5
JH
942 }
943 else { /* Take into account the floating substring. */
33b8afdf 944 char *last, *last1;
be8e71aa 945 char * const saved_s = s;
33b8afdf
JH
946 SV* must;
947
948 t = HOP3c(s, -start_shift, strbeg);
949 last1 = last =
950 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
951 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
952 last = HOP3c(t, prog->float_max_offset, strend);
953 s = HOP3c(t, prog->float_min_offset, strend);
954 if (s < other_last)
955 s = other_last;
2c2d71f5 956 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 957 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
958 /* fbm_instr() takes into account exact value of end-of-str
959 if the check is SvTAIL(ed). Since false positives are OK,
960 and end-of-str is not later than strend we are OK. */
961 if (must == &PL_sv_undef) {
962 s = (char*)NULL;
1de06328 963 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
964 }
965 else
2c2d71f5 966 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
967 (unsigned char*)last + SvCUR(must)
968 - (SvTAIL(must)!=0),
7fba1cd6 969 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 970 DEBUG_EXECUTE_r({
f2ed9b32 971 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
972 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
973 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 974 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
975 quoted, RE_SV_TAIL(must));
976 });
33b8afdf
JH
977 if (!s) {
978 if (last1 == last) {
a3621e74 979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
980 ", giving up...\n"));
981 goto fail_finish;
2c2d71f5 982 }
a3621e74 983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 984 ", trying anchored starting at offset %ld...\n",
be8e71aa 985 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
986 other_last = last;
987 s = HOP3c(t, 1, strend);
988 goto restart;
989 }
990 else {
a3621e74 991 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
992 (long)(s - i_strpos)));
993 other_last = s; /* Fix this later. --Hugo */
be8e71aa 994 s = saved_s;
33b8afdf
JH
995 if (t == strpos)
996 goto try_at_start;
997 goto try_at_offset;
998 }
2c2d71f5 999 }
cad2e5aa 1000 }
2c2d71f5 1001
1de06328 1002
9ef43ace 1003 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 1004
6bda09f9 1005 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
1006 PerlIO_printf(Perl_debug_log,
1007 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1008 (IV)prog->check_offset_min,
1009 (IV)prog->check_offset_max,
1010 (IV)(s-strpos),
1011 (IV)(t-strpos),
1012 (IV)(t-s),
1013 (IV)(strend-strpos)
1014 )
1015 );
1016
2c2d71f5 1017 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 1018 && (!utf8_target
9ef43ace 1019 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
1020 && t > strpos)))
1021 {
2c2d71f5
JH
1022 /* Fixed substring is found far enough so that the match
1023 cannot start at strpos. */
1024 try_at_offset:
cad2e5aa 1025 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
1026 /* Eventually fbm_*() should handle this, but often
1027 anchored_offset is not 0, so this check will not be wasted. */
1028 /* XXXX In the code below we prefer to look for "^" even in
1029 presence of anchored substrings. And we search even
1030 beyond the found float position. These pessimizations
1031 are historical artefacts only. */
1032 find_anchor:
2c2d71f5 1033 while (t < strend - prog->minlen) {
cad2e5aa 1034 if (*t == '\n') {
4ee3650e 1035 if (t < check_at - prog->check_offset_min) {
f2ed9b32 1036 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
1037 /* Since we moved from the found position,
1038 we definitely contradict the found anchored
30944b6d
IZ
1039 substr. Due to the above check we do not
1040 contradict "check" substr.
1041 Thus we can arrive here only if check substr
1042 is float. Redo checking for "other"=="fixed".
1043 */
9041c2e3 1044 strpos = t + 1;
a3621e74 1045 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1046 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1047 goto do_other_anchored;
1048 }
4ee3650e
GS
1049 /* We don't contradict the found floating substring. */
1050 /* XXXX Why not check for STCLASS? */
cad2e5aa 1051 s = t + 1;
a3621e74 1052 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1053 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1054 goto set_useful;
1055 }
4ee3650e
GS
1056 /* Position contradicts check-string */
1057 /* XXXX probably better to look for check-string
1058 than for "\n", so one should lower the limit for t? */
a3621e74 1059 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1060 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1061 other_last = strpos = s = t + 1;
cad2e5aa
JH
1062 goto restart;
1063 }
1064 t++;
1065 }
a3621e74 1066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1067 PL_colors[0], PL_colors[1]));
2c2d71f5 1068 goto fail_finish;
cad2e5aa 1069 }
f5952150 1070 else {
a3621e74 1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1072 PL_colors[0], PL_colors[1]));
f5952150 1073 }
cad2e5aa
JH
1074 s = t;
1075 set_useful:
f2ed9b32 1076 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1077 }
1078 else {
f5952150 1079 /* The found string does not prohibit matching at strpos,
2c2d71f5 1080 - no optimization of calling REx engine can be performed,
f5952150
GS
1081 unless it was an MBOL and we are not after MBOL,
1082 or a future STCLASS check will fail this. */
2c2d71f5
JH
1083 try_at_start:
1084 /* Even in this situation we may use MBOL flag if strpos is offset
1085 wrt the start of the string. */
05b4157f 1086 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1087 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1088 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1089 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1090 {
cad2e5aa
JH
1091 t = strpos;
1092 goto find_anchor;
1093 }
a3621e74 1094 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1095 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1096 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1097 );
2c2d71f5 1098 success_at_start:
bbe252da 1099 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1100 && (utf8_target ? (
33b8afdf
JH
1101 prog->check_utf8 /* Could be deleted already */
1102 && --BmUSEFUL(prog->check_utf8) < 0
1103 && (prog->check_utf8 == prog->float_utf8)
1104 ) : (
1105 prog->check_substr /* Could be deleted already */
1106 && --BmUSEFUL(prog->check_substr) < 0
1107 && (prog->check_substr == prog->float_substr)
1108 )))
66e933ab 1109 {
cad2e5aa 1110 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1111 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1112 /* XXX Does the destruction order has to change with utf8_target? */
1113 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1114 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1115 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1116 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1117 check = NULL; /* abort */
cad2e5aa 1118 s = strpos;
486ec47a 1119 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1120 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1121 if (prog->intflags & PREGf_IMPLICIT)
1122 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1123 /* XXXX This is a remnant of the old implementation. It
1124 looks wasteful, since now INTUIT can use many
6eb5f6b9 1125 other heuristics. */
bbe252da 1126 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1127 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1128 }
1129 else
1130 s = strpos;
1131 }
1132
6eb5f6b9
JH
1133 /* Last resort... */
1134 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1135 /* trie stclasses are too expensive to use here, we are better off to
1136 leave it to regmatch itself */
f8fc2ecf 1137 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1138 /* minlen == 0 is possible if regstclass is \b or \B,
1139 and the fixed substr is ''$.
1140 Since minlen is already taken into account, s+1 is before strend;
1141 accidentally, minlen >= 1 guaranties no false positives at s + 1
1142 even for \b or \B. But (minlen? 1 : 0) below assumes that
1143 regstclass does not come from lookahead... */
1144 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1145 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1146 const U8* const str = (U8*)STRING(progi->regstclass);
1147 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1148 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1149 : 1);
1de06328
YO
1150 char * endpos;
1151 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1152 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1153 else if (prog->float_substr || prog->float_utf8)
1154 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1155 else
1156 endpos= strend;
1157
d8080198
YO
1158 if (checked_upto < s)
1159 checked_upto = s;
1160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1161 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1162
6eb5f6b9 1163 t = s;
d8080198
YO
1164 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1165 if (s) {
1166 checked_upto = s;
1167 } else {
6eb5f6b9 1168#ifdef DEBUGGING
cbbf8932 1169 const char *what = NULL;
6eb5f6b9
JH
1170#endif
1171 if (endpos == strend) {
a3621e74 1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1173 "Could not match STCLASS...\n") );
1174 goto fail;
1175 }
a3621e74 1176 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1177 "This position contradicts STCLASS...\n") );
bbe252da 1178 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1179 goto fail;
d8080198
YO
1180 checked_upto = HOPBACKc(endpos, start_shift);
1181 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1182 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1183 /* Contradict one of substrings */
33b8afdf 1184 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1185 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1186 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1187 hop_and_restart:
1aa99e6b 1188 s = HOP3c(t, 1, strend);
66e933ab
GS
1189 if (s + start_shift + end_shift > strend) {
1190 /* XXXX Should be taken into account earlier? */
a3621e74 1191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1192 "Could not match STCLASS...\n") );
1193 goto fail;
1194 }
5e39e1e5
HS
1195 if (!check)
1196 goto giveup;
a3621e74 1197 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1198 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1199 what, (long)(s + start_shift - i_strpos)) );
1200 goto restart;
1201 }
66e933ab 1202 /* Have both, check_string is floating */
6eb5f6b9
JH
1203 if (t + start_shift >= check_at) /* Contradicts floating=check */
1204 goto retry_floating_check;
1205 /* Recheck anchored substring, but not floating... */
9041c2e3 1206 s = check_at;
5e39e1e5
HS
1207 if (!check)
1208 goto giveup;
a3621e74 1209 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1210 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1211 (long)(other_last - i_strpos)) );
1212 goto do_other_anchored;
1213 }
60e71179
GS
1214 /* Another way we could have checked stclass at the
1215 current position only: */
1216 if (ml_anch) {
1217 s = t = t + 1;
5e39e1e5
HS
1218 if (!check)
1219 goto giveup;
a3621e74 1220 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1221 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1222 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1223 goto try_at_offset;
66e933ab 1224 }
f2ed9b32 1225 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1226 goto fail;
486ec47a 1227 /* Check is floating substring. */
6eb5f6b9
JH
1228 retry_floating_check:
1229 t = check_at - start_shift;
a3621e74 1230 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1231 goto hop_and_restart;
1232 }
b7953727 1233 if (t != s) {
a3621e74 1234 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1235 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1236 (long)(t - i_strpos), (long)(s - i_strpos))
1237 );
1238 }
1239 else {
a3621e74 1240 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1241 "Does not contradict STCLASS...\n");
1242 );
1243 }
6eb5f6b9 1244 }
5e39e1e5 1245 giveup:
a3621e74 1246 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1247 PL_colors[4], (check ? "Guessed" : "Giving up"),
1248 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1249 return s;
2c2d71f5
JH
1250
1251 fail_finish: /* Substring not found */
33b8afdf 1252 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1253 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1254 fail:
a3621e74 1255 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1256 PL_colors[4], PL_colors[5]));
bd61b366 1257 return NULL;
cad2e5aa 1258}
9661b544 1259
a0a388a1
YO
1260#define DECL_TRIE_TYPE(scan) \
1261 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1262 trie_type = ((scan->flags == EXACT) \
1263 ? (utf8_target ? trie_utf8 : trie_plain) \
1264 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1265
1266#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1267uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1268 STRLEN skiplen; \
1269 switch (trie_type) { \
1270 case trie_utf8_fold: \
1271 if ( foldlen>0 ) { \
1272 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1273 foldlen -= len; \
1274 uscan += len; \
1275 len=0; \
1276 } else { \
1277 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1278 len = UTF8SKIP(uc); \
1279 skiplen = UNISKIP( uvc ); \
1280 foldlen -= skiplen; \
1281 uscan = foldbuf + skiplen; \
1282 } \
1283 break; \
1284 case trie_latin_utf8_fold: \
1285 if ( foldlen>0 ) { \
1286 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1287 foldlen -= len; \
1288 uscan += len; \
1289 len=0; \
1290 } else { \
1291 len = 1; \
1292 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1293 skiplen = UNISKIP( uvc ); \
1294 foldlen -= skiplen; \
1295 uscan = foldbuf + skiplen; \
1296 } \
1297 break; \
1298 case trie_utf8: \
1299 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1300 break; \
1301 case trie_plain: \
1302 uvc = (UV)*uc; \
1303 len = 1; \
1304 } \
1305 if (uvc < 256) { \
1306 charid = trie->charmap[ uvc ]; \
1307 } \
1308 else { \
1309 charid = 0; \
1310 if (widecharmap) { \
1311 SV** const svpp = hv_fetch(widecharmap, \
1312 (char*)&uvc, sizeof(UV), 0); \
1313 if (svpp) \
1314 charid = (U16)SvIV(*svpp); \
1315 } \
1316 } \
4cadc6a9
YO
1317} STMT_END
1318
4cadc6a9
YO
1319#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1320STMT_START { \
1321 while (s <= e) { \
1322 if ( (CoNd) \
fac1af77 1323 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1324 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1325 goto got_it; \
1326 s++; \
1327 } \
1328} STMT_END
1329
1330#define REXEC_FBC_UTF8_SCAN(CoDe) \
1331STMT_START { \
7016d6eb 1332 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1333 CoDe \
1334 s += uskip; \
1335 } \
1336} STMT_END
1337
1338#define REXEC_FBC_SCAN(CoDe) \
1339STMT_START { \
1340 while (s < strend) { \
1341 CoDe \
1342 s++; \
1343 } \
1344} STMT_END
1345
1346#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1347REXEC_FBC_UTF8_SCAN( \
1348 if (CoNd) { \
24b23f37 1349 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1350 goto got_it; \
1351 else \
1352 tmp = doevery; \
1353 } \
1354 else \
1355 tmp = 1; \
1356)
1357
1358#define REXEC_FBC_CLASS_SCAN(CoNd) \
1359REXEC_FBC_SCAN( \
1360 if (CoNd) { \
24b23f37 1361 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1362 goto got_it; \
1363 else \
1364 tmp = doevery; \
1365 } \
1366 else \
1367 tmp = 1; \
1368)
1369
1370#define REXEC_FBC_TRYIT \
24b23f37 1371if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1372 goto got_it
1373
e1d1eefb 1374#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1375 if (utf8_target) { \
e1d1eefb
YO
1376 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1377 } \
1378 else { \
1379 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1380 }
e1d1eefb 1381
4cadc6a9 1382#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1383 if (utf8_target) { \
4cadc6a9
YO
1384 UtFpReLoAd; \
1385 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1386 } \
1387 else { \
1388 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1389 }
4cadc6a9
YO
1390
1391#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1392 PL_reg_flags |= RF_tainted; \
f2ed9b32 1393 if (utf8_target) { \
4cadc6a9
YO
1394 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1395 } \
1396 else { \
1397 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1398 }
4cadc6a9 1399
786e8c11
YO
1400#define DUMP_EXEC_POS(li,s,doutf8) \
1401 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1402
cfaf538b
KW
1403
1404#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1405 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1406 tmp = TEST_NON_UTF8(tmp); \
1407 REXEC_FBC_UTF8_SCAN( \
1408 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1409 tmp = !tmp; \
1410 IF_SUCCESS; \
1411 } \
1412 else { \
1413 IF_FAIL; \
1414 } \
1415 ); \
1416
1417#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1418 if (s == PL_bostr) { \
1419 tmp = '\n'; \
1420 } \
1421 else { \
1422 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1423 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1424 } \
1425 tmp = TeSt1_UtF8; \
1426 LOAD_UTF8_CHARCLASS_ALNUM(); \
1427 REXEC_FBC_UTF8_SCAN( \
1428 if (tmp == ! (TeSt2_UtF8)) { \
1429 tmp = !tmp; \
1430 IF_SUCCESS; \
1431 } \
1432 else { \
1433 IF_FAIL; \
1434 } \
1435 ); \
1436
63ac0dad
KW
1437/* The only difference between the BOUND and NBOUND cases is that
1438 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1439 * NBOUND. This is accomplished by passing it in either the if or else clause,
1440 * with the other one being empty */
1441#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1442 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1443
1444#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1445 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1446
1447#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1448 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1449
1450#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1451 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1452
63ac0dad
KW
1453
1454/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1455 * be passed in completely with the variable name being tested, which isn't
1456 * such a clean interface, but this is easier to read than it was before. We
1457 * are looking for the boundary (or non-boundary between a word and non-word
1458 * character. The utf8 and non-utf8 cases have the same logic, but the details
1459 * must be different. Find the "wordness" of the character just prior to this
1460 * one, and compare it with the wordness of this one. If they differ, we have
1461 * a boundary. At the beginning of the string, pretend that the previous
1462 * character was a new-line */
cfaf538b 1463#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1464 if (utf8_target) { \
cfaf538b 1465 UTF8_CODE \
63ac0dad
KW
1466 } \
1467 else { /* Not utf8 */ \
1468 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1469 tmp = TEST_NON_UTF8(tmp); \
1470 REXEC_FBC_SCAN( \
1471 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1472 tmp = !tmp; \
1473 IF_SUCCESS; \
1474 } \
1475 else { \
1476 IF_FAIL; \
1477 } \
1478 ); \
1479 } \
1480 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1481 goto got_it;
1482
786e8c11
YO
1483/* We know what class REx starts with. Try to find this position... */
1484/* if reginfo is NULL, its a dryrun */
1485/* annoyingly all the vars in this routine have different names from their counterparts
1486 in regmatch. /grrr */
1487
3c3eec57 1488STATIC char *
07be1b83 1489S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1490 const char *strend, regmatch_info *reginfo)
a687059c 1491{
73104a1b
KW
1492 dVAR;
1493 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1494 char *pat_string; /* The pattern's exactish string */
1495 char *pat_end; /* ptr to end char of pat_string */
1496 re_fold_t folder; /* Function for computing non-utf8 folds */
1497 const U8 *fold_array; /* array for folding ords < 256 */
1498 STRLEN ln;
1499 STRLEN lnc;
1500 STRLEN uskip;
1501 U8 c1;
1502 U8 c2;
1503 char *e;
1504 I32 tmp = 1; /* Scratch variable? */
1505 const bool utf8_target = PL_reg_match_utf8;
1506 UV utf8_fold_flags = 0;
1507 RXi_GET_DECL(prog,progi);
2f7f8cb1 1508
73104a1b 1509 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1510
73104a1b
KW
1511 /* We know what class it must start with. */
1512 switch (OP(c)) {
1513 case ANYOF:
1514 if (utf8_target) {
1515 REXEC_FBC_UTF8_CLASS_SCAN(
1516 reginclass(prog, c, (U8*)s, utf8_target));
1517 }
1518 else {
1519 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1520 }
1521 break;
1522 case CANY:
1523 REXEC_FBC_SCAN(
1524 if (tmp && (!reginfo || regtry(reginfo, &s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 );
1529 break;
1530
1531 case EXACTFA:
1532 if (UTF_PATTERN || utf8_target) {
1533 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1534 goto do_exactf_utf8;
1535 }
1536 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1537 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1538 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1539
73104a1b
KW
1540 case EXACTF:
1541 if (utf8_target) {
16d951b7 1542
73104a1b
KW
1543 /* regcomp.c already folded this if pattern is in UTF-8 */
1544 utf8_fold_flags = 0;
1545 goto do_exactf_utf8;
1546 }
1547 fold_array = PL_fold;
1548 folder = foldEQ;
1549 goto do_exactf_non_utf8;
1550
1551 case EXACTFL:
1552 if (UTF_PATTERN || utf8_target) {
1553 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1554 goto do_exactf_utf8;
1555 }
1556 fold_array = PL_fold_locale;
1557 folder = foldEQ_locale;
1558 goto do_exactf_non_utf8;
3c760661 1559
73104a1b
KW
1560 case EXACTFU_SS:
1561 if (UTF_PATTERN) {
1562 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1563 }
1564 goto do_exactf_utf8;
16d951b7 1565
73104a1b
KW
1566 case EXACTFU_TRICKYFOLD:
1567 case EXACTFU:
1568 if (UTF_PATTERN || utf8_target) {
1569 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1570 goto do_exactf_utf8;
1571 }
fac1af77 1572
73104a1b
KW
1573 /* Any 'ss' in the pattern should have been replaced by regcomp,
1574 * so we don't have to worry here about this single special case
1575 * in the Latin1 range */
1576 fold_array = PL_fold_latin1;
1577 folder = foldEQ_latin1;
1578
1579 /* FALL THROUGH */
1580
1581 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1582 are no glitches with fold-length differences
1583 between the target string and pattern */
1584
1585 /* The idea in the non-utf8 EXACTF* cases is to first find the
1586 * first character of the EXACTF* node and then, if necessary,
1587 * case-insensitively compare the full text of the node. c1 is the
1588 * first character. c2 is its fold. This logic will not work for
1589 * Unicode semantics and the german sharp ss, which hence should
1590 * not be compiled into a node that gets here. */
1591 pat_string = STRING(c);
1592 ln = STR_LEN(c); /* length to match in octets/bytes */
1593
1594 /* We know that we have to match at least 'ln' bytes (which is the
1595 * same as characters, since not utf8). If we have to match 3
1596 * characters, and there are only 2 availabe, we know without
1597 * trying that it will fail; so don't start a match past the
1598 * required minimum number from the far end */
1599 e = HOP3c(strend, -((I32)ln), s);
1600
1601 if (!reginfo && e < s) {
1602 e = s; /* Due to minlen logic of intuit() */
1603 }
fac1af77 1604
73104a1b
KW
1605 c1 = *pat_string;
1606 c2 = fold_array[c1];
1607 if (c1 == c2) { /* If char and fold are the same */
1608 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1609 }
1610 else {
1611 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1612 }
1613 break;
fac1af77 1614
73104a1b
KW
1615 do_exactf_utf8:
1616 {
1617 unsigned expansion;
1618
1619 /* If one of the operands is in utf8, we can't use the simpler folding
1620 * above, due to the fact that many different characters can have the
1621 * same fold, or portion of a fold, or different- length fold */
1622 pat_string = STRING(c);
1623 ln = STR_LEN(c); /* length to match in octets/bytes */
1624 pat_end = pat_string + ln;
1625 lnc = (UTF_PATTERN) /* length to match in characters */
1626 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1627 : ln;
1628
1629 /* We have 'lnc' characters to match in the pattern, but because of
1630 * multi-character folding, each character in the target can match
1631 * up to 3 characters (Unicode guarantees it will never exceed
1632 * this) if it is utf8-encoded; and up to 2 if not (based on the
1633 * fact that the Latin 1 folds are already determined, and the
1634 * only multi-char fold in that range is the sharp-s folding to
1635 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1636 * string character. Adjust lnc accordingly, rounding up, so that
1637 * if we need to match at least 4+1/3 chars, that really is 5. */
1638 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1639 lnc = (lnc + expansion - 1) / expansion;
1640
1641 /* As in the non-UTF8 case, if we have to match 3 characters, and
1642 * only 2 are left, it's guaranteed to fail, so don't start a
1643 * match that would require us to go beyond the end of the string
1644 */
1645 e = HOP3c(strend, -((I32)lnc), s);
1646
1647 if (!reginfo && e < s) {
1648 e = s; /* Due to minlen logic of intuit() */
1649 }
0658cdde 1650
73104a1b
KW
1651 /* XXX Note that we could recalculate e to stop the loop earlier,
1652 * as the worst case expansion above will rarely be met, and as we
1653 * go along we would usually find that e moves further to the left.
1654 * This would happen only after we reached the point in the loop
1655 * where if there were no expansion we should fail. Unclear if
1656 * worth the expense */
1657
1658 while (s <= e) {
1659 char *my_strend= (char *)strend;
1660 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1661 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1662 && (!reginfo || regtry(reginfo, &s)) )
1663 {
1664 goto got_it;
1665 }
1666 s += (utf8_target) ? UTF8SKIP(s) : 1;
1667 }
1668 break;
1669 }
1670 case BOUNDL:
1671 PL_reg_flags |= RF_tainted;
1672 FBC_BOUND(isALNUM_LC,
1673 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1674 isALNUM_LC_utf8((U8*)s));
1675 break;
1676 case NBOUNDL:
1677 PL_reg_flags |= RF_tainted;
1678 FBC_NBOUND(isALNUM_LC,
1679 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1680 isALNUM_LC_utf8((U8*)s));
1681 break;
1682 case BOUND:
1683 FBC_BOUND(isWORDCHAR,
1684 isALNUM_uni(tmp),
1685 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1686 break;
1687 case BOUNDA:
1688 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1689 isWORDCHAR_A(tmp),
1690 isWORDCHAR_A((U8*)s));
1691 break;
1692 case NBOUND:
1693 FBC_NBOUND(isWORDCHAR,
1694 isALNUM_uni(tmp),
1695 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1696 break;
1697 case NBOUNDA:
1698 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1699 isWORDCHAR_A(tmp),
1700 isWORDCHAR_A((U8*)s));
1701 break;
1702 case BOUNDU:
1703 FBC_BOUND(isWORDCHAR_L1,
1704 isALNUM_uni(tmp),
1705 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1706 break;
1707 case NBOUNDU:
1708 FBC_NBOUND(isWORDCHAR_L1,
1709 isALNUM_uni(tmp),
1710 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1711 break;
1712 case ALNUML:
1713 REXEC_FBC_CSCAN_TAINT(
1714 isALNUM_LC_utf8((U8*)s),
1715 isALNUM_LC(*s)
1716 );
1717 break;
1718 case ALNUMU:
1719 REXEC_FBC_CSCAN_PRELOAD(
1720 LOAD_UTF8_CHARCLASS_ALNUM(),
1721 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1722 isWORDCHAR_L1((U8) *s)
1723 );
1724 break;
1725 case ALNUM:
1726 REXEC_FBC_CSCAN_PRELOAD(
1727 LOAD_UTF8_CHARCLASS_ALNUM(),
1728 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1729 isWORDCHAR((U8) *s)
1730 );
1731 break;
1732 case ALNUMA:
1733 /* Don't need to worry about utf8, as it can match only a single
1734 * byte invariant character */
1735 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1736 break;
1737 case NALNUMU:
1738 REXEC_FBC_CSCAN_PRELOAD(
1739 LOAD_UTF8_CHARCLASS_ALNUM(),
1740 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1741 ! isWORDCHAR_L1((U8) *s)
1742 );
1743 break;
1744 case NALNUM:
1745 REXEC_FBC_CSCAN_PRELOAD(
1746 LOAD_UTF8_CHARCLASS_ALNUM(),
1747 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1748 ! isALNUM(*s)
1749 );
1750 break;
1751 case NALNUMA:
1752 REXEC_FBC_CSCAN(
1753 !isWORDCHAR_A(*s),
1754 !isWORDCHAR_A(*s)
1755 );
1756 break;
1757 case NALNUML:
1758 REXEC_FBC_CSCAN_TAINT(
1759 !isALNUM_LC_utf8((U8*)s),
1760 !isALNUM_LC(*s)
1761 );
1762 break;
1763 case SPACEU:
1764 REXEC_FBC_CSCAN(
1765 is_XPERLSPACE_utf8(s),
1766 isSPACE_L1((U8) *s)
1767 );
1768 break;
1769 case SPACE:
1770 REXEC_FBC_CSCAN(
1771 is_XPERLSPACE_utf8(s),
1772 isSPACE((U8) *s)
1773 );
1774 break;
1775 case SPACEA:
1776 /* Don't need to worry about utf8, as it can match only a single
1777 * byte invariant character */
1778 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1779 break;
1780 case SPACEL:
1781 REXEC_FBC_CSCAN_TAINT(
1782 isSPACE_LC_utf8((U8*)s),
1783 isSPACE_LC(*s)
1784 );
1785 break;
1786 case NSPACEU:
1787 REXEC_FBC_CSCAN(
1788 ! is_XPERLSPACE_utf8(s),
1789 ! isSPACE_L1((U8) *s)
1790 );
1791 break;
1792 case NSPACE:
1793 REXEC_FBC_CSCAN(
1794 ! is_XPERLSPACE_utf8(s),
1795 ! isSPACE((U8) *s)
1796 );
1797 break;
1798 case NSPACEA:
1799 REXEC_FBC_CSCAN(
1800 !isSPACE_A(*s),
1801 !isSPACE_A(*s)
1802 );
1803 break;
1804 case NSPACEL:
1805 REXEC_FBC_CSCAN_TAINT(
1806 !isSPACE_LC_utf8((U8*)s),
1807 !isSPACE_LC(*s)
1808 );
1809 break;
1810 case DIGIT:
1811 REXEC_FBC_CSCAN_PRELOAD(
1812 LOAD_UTF8_CHARCLASS_DIGIT(),
1813 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1814 isDIGIT(*s)
1815 );
1816 break;
1817 case DIGITA:
1818 /* Don't need to worry about utf8, as it can match only a single
1819 * byte invariant character */
1820 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1821 break;
1822 case DIGITL:
1823 REXEC_FBC_CSCAN_TAINT(
1824 isDIGIT_LC_utf8((U8*)s),
1825 isDIGIT_LC(*s)
1826 );
1827 break;
1828 case NDIGIT:
1829 REXEC_FBC_CSCAN_PRELOAD(
1830 LOAD_UTF8_CHARCLASS_DIGIT(),
1831 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1832 !isDIGIT(*s)
1833 );
1834 break;
1835 case NDIGITA:
1836 REXEC_FBC_CSCAN(
1837 !isDIGIT_A(*s),
1838 !isDIGIT_A(*s)
1839 );
1840 break;
1841 case NDIGITL:
1842 REXEC_FBC_CSCAN_TAINT(
1843 !isDIGIT_LC_utf8((U8*)s),
1844 !isDIGIT_LC(*s)
1845 );
1846 break;
1847 case LNBREAK:
1848 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1849 is_LNBREAK_latin1_safe(s, strend)
1850 );
1851 break;
1852 case VERTWS:
1853 REXEC_FBC_CSCAN(
1854 is_VERTWS_utf8_safe(s, strend),
1855 is_VERTWS_latin1_safe(s, strend)
1856 );
1857 break;
1858 case NVERTWS:
1859 REXEC_FBC_CSCAN(
1860 !is_VERTWS_utf8_safe(s, strend),
1861 !is_VERTWS_latin1_safe(s, strend)
1862 );
1863 break;
1864 case HORIZWS:
1865 REXEC_FBC_CSCAN(
1866 is_HORIZWS_utf8_safe(s, strend),
1867 is_HORIZWS_latin1_safe(s, strend)
1868 );
1869 break;
1870 case NHORIZWS:
1871 REXEC_FBC_CSCAN(
1872 !is_HORIZWS_utf8_safe(s, strend),
1873 !is_HORIZWS_latin1_safe(s, strend)
1874 );
1875 break;
1876 case POSIXA:
1877 /* Don't need to worry about utf8, as it can match only a single
1878 * byte invariant character. The flag in this node type is the
1879 * class number to pass to _generic_isCC() to build a mask for
1880 * searching in PL_charclass[] */
1881 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1882 break;
1883 case NPOSIXA:
1884 REXEC_FBC_CSCAN(
1885 !_generic_isCC_A(*s, FLAGS(c)),
1886 !_generic_isCC_A(*s, FLAGS(c))
1887 );
1888 break;
1889
1890 case AHOCORASICKC:
1891 case AHOCORASICK:
1892 {
1893 DECL_TRIE_TYPE(c);
1894 /* what trie are we using right now */
1895 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1896 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1897 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1898
1899 const char *last_start = strend - trie->minlen;
6148ee25 1900#ifdef DEBUGGING
73104a1b 1901 const char *real_start = s;
6148ee25 1902#endif
73104a1b
KW
1903 STRLEN maxlen = trie->maxlen;
1904 SV *sv_points;
1905 U8 **points; /* map of where we were in the input string
1906 when reading a given char. For ASCII this
1907 is unnecessary overhead as the relationship
1908 is always 1:1, but for Unicode, especially
1909 case folded Unicode this is not true. */
1910 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1911 U8 *bitmap=NULL;
1912
1913
1914 GET_RE_DEBUG_FLAGS_DECL;
1915
1916 /* We can't just allocate points here. We need to wrap it in
1917 * an SV so it gets freed properly if there is a croak while
1918 * running the match */
1919 ENTER;
1920 SAVETMPS;
1921 sv_points=newSV(maxlen * sizeof(U8 *));
1922 SvCUR_set(sv_points,
1923 maxlen * sizeof(U8 *));
1924 SvPOK_on(sv_points);
1925 sv_2mortal(sv_points);
1926 points=(U8**)SvPV_nolen(sv_points );
1927 if ( trie_type != trie_utf8_fold
1928 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1929 {
1930 if (trie->bitmap)
1931 bitmap=(U8*)trie->bitmap;
1932 else
1933 bitmap=(U8*)ANYOF_BITMAP(c);
1934 }
1935 /* this is the Aho-Corasick algorithm modified a touch
1936 to include special handling for long "unknown char" sequences.
1937 The basic idea being that we use AC as long as we are dealing
1938 with a possible matching char, when we encounter an unknown char
1939 (and we have not encountered an accepting state) we scan forward
1940 until we find a legal starting char.
1941 AC matching is basically that of trie matching, except that when
1942 we encounter a failing transition, we fall back to the current
1943 states "fail state", and try the current char again, a process
1944 we repeat until we reach the root state, state 1, or a legal
1945 transition. If we fail on the root state then we can either
1946 terminate if we have reached an accepting state previously, or
1947 restart the entire process from the beginning if we have not.
1948
1949 */
1950 while (s <= last_start) {
1951 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1952 U8 *uc = (U8*)s;
1953 U16 charid = 0;
1954 U32 base = 1;
1955 U32 state = 1;
1956 UV uvc = 0;
1957 STRLEN len = 0;
1958 STRLEN foldlen = 0;
1959 U8 *uscan = (U8*)NULL;
1960 U8 *leftmost = NULL;
1961#ifdef DEBUGGING
1962 U32 accepted_word= 0;
786e8c11 1963#endif
73104a1b
KW
1964 U32 pointpos = 0;
1965
1966 while ( state && uc <= (U8*)strend ) {
1967 int failed=0;
1968 U32 word = aho->states[ state ].wordnum;
1969
1970 if( state==1 ) {
1971 if ( bitmap ) {
1972 DEBUG_TRIE_EXECUTE_r(
1973 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1974 dump_exec_pos( (char *)uc, c, strend, real_start,
1975 (char *)uc, utf8_target );
1976 PerlIO_printf( Perl_debug_log,
1977 " Scanning for legal start char...\n");
1978 }
1979 );
1980 if (utf8_target) {
1981 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1982 uc += UTF8SKIP(uc);
1983 }
1984 } else {
1985 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1986 uc++;
1987 }
786e8c11 1988 }
73104a1b 1989 s= (char *)uc;
07be1b83 1990 }
73104a1b
KW
1991 if (uc >(U8*)last_start) break;
1992 }
1993
1994 if ( word ) {
1995 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1996 if (!leftmost || lpos < leftmost) {
1997 DEBUG_r(accepted_word=word);
1998 leftmost= lpos;
7016d6eb 1999 }
73104a1b 2000 if (base==0) break;
7016d6eb 2001
73104a1b
KW
2002 }
2003 points[pointpos++ % maxlen]= uc;
2004 if (foldlen || uc < (U8*)strend) {
2005 REXEC_TRIE_READ_CHAR(trie_type, trie,
2006 widecharmap, uc,
2007 uscan, len, uvc, charid, foldlen,
2008 foldbuf, uniflags);
2009 DEBUG_TRIE_EXECUTE_r({
2010 dump_exec_pos( (char *)uc, c, strend,
2011 real_start, s, utf8_target);
2012 PerlIO_printf(Perl_debug_log,
2013 " Charid:%3u CP:%4"UVxf" ",
2014 charid, uvc);
2015 });
2016 }
2017 else {
2018 len = 0;
2019 charid = 0;
2020 }
07be1b83 2021
73104a1b
KW
2022
2023 do {
6148ee25 2024#ifdef DEBUGGING
73104a1b 2025 word = aho->states[ state ].wordnum;
6148ee25 2026#endif
73104a1b
KW
2027 base = aho->states[ state ].trans.base;
2028
2029 DEBUG_TRIE_EXECUTE_r({
2030 if (failed)
2031 dump_exec_pos( (char *)uc, c, strend, real_start,
2032 s, utf8_target );
2033 PerlIO_printf( Perl_debug_log,
2034 "%sState: %4"UVxf", word=%"UVxf,
2035 failed ? " Fail transition to " : "",
2036 (UV)state, (UV)word);
2037 });
2038 if ( base ) {
2039 U32 tmp;
2040 I32 offset;
2041 if (charid &&
2042 ( ((offset = base + charid
2043 - 1 - trie->uniquecharcount)) >= 0)
2044 && ((U32)offset < trie->lasttrans)
2045 && trie->trans[offset].check == state
2046 && (tmp=trie->trans[offset].next))
2047 {
2048 DEBUG_TRIE_EXECUTE_r(
2049 PerlIO_printf( Perl_debug_log," - legal\n"));
2050 state = tmp;
2051 break;
07be1b83
YO
2052 }
2053 else {
786e8c11 2054 DEBUG_TRIE_EXECUTE_r(
73104a1b 2055 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2056 failed = 1;
73104a1b 2057 state = aho->fail[state];
07be1b83 2058 }
07be1b83 2059 }
73104a1b
KW
2060 else {
2061 /* we must be accepting here */
2062 DEBUG_TRIE_EXECUTE_r(
2063 PerlIO_printf( Perl_debug_log," - accepting\n"));
2064 failed = 1;
2065 break;
786e8c11 2066 }
73104a1b
KW
2067 } while(state);
2068 uc += len;
2069 if (failed) {
2070 if (leftmost)
2071 break;
2072 if (!state) state = 1;
07be1b83 2073 }
73104a1b
KW
2074 }
2075 if ( aho->states[ state ].wordnum ) {
2076 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2077 if (!leftmost || lpos < leftmost) {
2078 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2079 leftmost = lpos;
07be1b83
YO
2080 }
2081 }
73104a1b
KW
2082 if (leftmost) {
2083 s = (char*)leftmost;
2084 DEBUG_TRIE_EXECUTE_r({
2085 PerlIO_printf(
2086 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2087 (UV)accepted_word, (IV)(s - real_start)
2088 );
2089 });
2090 if (!reginfo || regtry(reginfo, &s)) {
2091 FREETMPS;
2092 LEAVE;
2093 goto got_it;
2094 }
2095 s = HOPc(s,1);
2096 DEBUG_TRIE_EXECUTE_r({
2097 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2098 });
2099 } else {
2100 DEBUG_TRIE_EXECUTE_r(
2101 PerlIO_printf( Perl_debug_log,"No match.\n"));
2102 break;
2103 }
2104 }
2105 FREETMPS;
2106 LEAVE;
2107 }
2108 break;
2109 default:
2110 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2111 break;
2112 }
2113 return 0;
2114 got_it:
2115 return s;
6eb5f6b9
JH
2116}
2117
fae667d5 2118
6eb5f6b9
JH
2119/*
2120 - regexec_flags - match a regexp against a string
2121 */
2122I32
5aaab254 2123Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2124 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2125/* stringarg: the point in the string at which to begin matching */
2126/* strend: pointer to null at end of string */
2127/* strbeg: real beginning of string */
2128/* minend: end of match must be >= minend bytes after stringarg. */
2129/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2130 * itself is accessed via the pointers above */
2131/* data: May be used for some additional optimizations.
2132 Currently its only used, with a U32 cast, for transmitting
2133 the ganch offset when doing a /g match. This will change */
2134/* nosave: For optimizations. */
2135
6eb5f6b9 2136{
97aff369 2137 dVAR;
8d919b0a 2138 struct regexp *const prog = ReANY(rx);
5aaab254 2139 char *s;
eb578fdb 2140 regnode *c;
5aaab254 2141 char *startpos = stringarg;
6eb5f6b9
JH
2142 I32 minlen; /* must match at least this many chars */
2143 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2144 I32 end_shift = 0; /* Same for the end. */ /* CC */
2145 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2146 char *scream_olds = NULL;
f2ed9b32 2147 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2148 I32 multiline;
f8fc2ecf 2149 RXi_GET_DECL(prog,progi);
3b0527fe 2150 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2151 regexp_paren_pair *swap = NULL;
a3621e74
YO
2152 GET_RE_DEBUG_FLAGS_DECL;
2153
7918f24d 2154 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2155 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2156
2157 /* Be paranoid... */
2158 if (prog == NULL || startpos == NULL) {
2159 Perl_croak(aTHX_ "NULL regexp parameter");
2160 return 0;
2161 }
2162
bbe252da 2163 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2164 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2165
f2ed9b32 2166 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2167 DEBUG_EXECUTE_r(
f2ed9b32 2168 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2169 "Matching");
2170 );
bac06658 2171
6eb5f6b9 2172 minlen = prog->minlen;
1de06328
YO
2173
2174 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2176 "String too short [regexec_flags]...\n"));
2177 goto phooey;
1aa99e6b 2178 }
6eb5f6b9 2179
1de06328 2180
6eb5f6b9 2181 /* Check validity of program. */
f8fc2ecf 2182 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2183 Perl_croak(aTHX_ "corrupted regexp program");
2184 }
2185
2186 PL_reg_flags = 0;
ed301438 2187 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2188 PL_reg_maxiter = 0;
2189
3c8556c3 2190 if (RX_UTF8(rx))
6eb5f6b9
JH
2191 PL_reg_flags |= RF_utf8;
2192
2193 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2194 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2195 PL_bostr = strbeg;
3b0527fe 2196 reginfo.sv = sv;
6eb5f6b9
JH
2197
2198 /* Mark end of line for $ (and such) */
2199 PL_regeol = strend;
2200
2201 /* see how far we have to get to not match where we matched before */
3b0527fe 2202 reginfo.till = startpos+minend;
6eb5f6b9 2203
6eb5f6b9
JH
2204 /* If there is a "must appear" string, look for it. */
2205 s = startpos;
2206
bbe252da 2207 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2208 MAGIC *mg;
2c296965 2209 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2210 reginfo.ganch = startpos + prog->gofs;
2c296965 2211 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2212 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2213 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2214 && SvMAGIC(sv)
14befaf4
DM
2215 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2216 && mg->mg_len >= 0) {
3b0527fe 2217 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2218 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2219 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2220
bbe252da 2221 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2222 if (s > reginfo.ganch)
6eb5f6b9 2223 goto phooey;
58e23c8d 2224 s = reginfo.ganch - prog->gofs;
2c296965 2225 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2226 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2227 if (s < strbeg)
2228 goto phooey;
6eb5f6b9
JH
2229 }
2230 }
58e23c8d 2231 else if (data) {
70685ca0 2232 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2233 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2234 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2235
2236 } else { /* pos() not defined */
3b0527fe 2237 reginfo.ganch = strbeg;
2c296965
YO
2238 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2239 "GPOS: reginfo.ganch = strbeg\n"));
2240 }
6eb5f6b9 2241 }
288b8c02 2242 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2243 /* We have to be careful. If the previous successful match
2244 was from this regex we don't want a subsequent partially
2245 successful match to clobber the old results.
2246 So when we detect this possibility we add a swap buffer
2247 to the re, and switch the buffer each match. If we fail
2248 we switch it back, otherwise we leave it swapped.
2249 */
2250 swap = prog->offs;
2251 /* do we need a save destructor here for eval dies? */
2252 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2253 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2254 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2255 PTR2UV(prog),
2256 PTR2UV(swap),
2257 PTR2UV(prog->offs)
2258 ));
c74340f9 2259 }
a0714e2c 2260 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2261 re_scream_pos_data d;
2262
2263 d.scream_olds = &scream_olds;
2264 d.scream_pos = &scream_pos;
288b8c02 2265 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2266 if (!s) {
a3621e74 2267 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2268 goto phooey; /* not present */
3fa9c3d7 2269 }
6eb5f6b9
JH
2270 }
2271
1de06328 2272
6eb5f6b9
JH
2273
2274 /* Simplest case: anchored match need be tried only once. */
2275 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2276 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2277 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2278 goto got_it;
bbe252da
YO
2279 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2280 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2281 {
2282 char *end;
2283
2284 if (minlen)
2285 dontbother = minlen - 1;
1aa99e6b 2286 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2287 /* for multiline we only have to try after newlines */
33b8afdf 2288 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2289 /* because of the goto we can not easily reuse the macros for bifurcating the
2290 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2291 if (utf8_target) {
2292 if (s == startpos)
2293 goto after_try_utf8;
2294 while (1) {
2295 if (regtry(&reginfo, &s)) {
2296 goto got_it;
2297 }
2298 after_try_utf8:
2299 if (s > end) {
2300 goto phooey;
2301 }
2302 if (prog->extflags & RXf_USE_INTUIT) {
2303 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2304 if (!s) {
2305 goto phooey;
2306 }
2307 }
2308 else {
2309 s += UTF8SKIP(s);
2310 }
2311 }
2312 } /* end search for check string in unicode */
2313 else {
2314 if (s == startpos) {
2315 goto after_try_latin;
2316 }
2317 while (1) {
2318 if (regtry(&reginfo, &s)) {
2319 goto got_it;
2320 }
2321 after_try_latin:
2322 if (s > end) {
2323 goto phooey;
2324 }
2325 if (prog->extflags & RXf_USE_INTUIT) {
2326 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2327 if (!s) {
2328 goto phooey;
2329 }
2330 }
2331 else {
2332 s++;
2333 }
2334 }
2335 } /* end search for check string in latin*/
2336 } /* end search for check string */
2337 else { /* search for newline */
2338 if (s > startpos) {
2339 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2340 s--;
92f3d482 2341 }
21eede78
YO
2342 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2343 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2344 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2345 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2346 goto got_it;
2347 }
92f3d482
YO
2348 }
2349 } /* end search for newline */
2350 } /* end anchored/multiline check string search */
6eb5f6b9 2351 goto phooey;
bbe252da 2352 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2353 {
486ec47a 2354 /* the warning about reginfo.ganch being used without initialization
bbe252da 2355 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2356 and we only enter this block when the same bit is set. */
58e23c8d 2357 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2358
2359 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2360 goto got_it;
2361 goto phooey;
2362 }
2363
2364 /* Messy cases: unanchored match. */
bbe252da 2365 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2366 /* we have /x+whatever/ */
f2ed9b32 2367 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2368 char ch;
bf93d4cc
GS
2369#ifdef DEBUGGING
2370 int did_match = 0;
2371#endif
f2ed9b32 2372 if (utf8_target) {
7e0d5ad7
KW
2373 if (! prog->anchored_utf8) {
2374 to_utf8_substr(prog);
2375 }
2376 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2377 REXEC_FBC_SCAN(
6eb5f6b9 2378 if (*s == ch) {
a3621e74 2379 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2380 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2381 s += UTF8SKIP(s);
2382 while (s < strend && *s == ch)
2383 s += UTF8SKIP(s);
2384 }
4cadc6a9 2385 );
7e0d5ad7 2386
6eb5f6b9
JH
2387 }
2388 else {
7e0d5ad7
KW
2389 if (! prog->anchored_substr) {
2390 if (! to_byte_substr(prog)) {
6b54ddc5 2391 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2392 }
2393 }
2394 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2395 REXEC_FBC_SCAN(
6eb5f6b9 2396 if (*s == ch) {
a3621e74 2397 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2398 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2399 s++;
2400 while (s < strend && *s == ch)
2401 s++;
2402 }
4cadc6a9 2403 );
6eb5f6b9 2404 }
a3621e74 2405 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2406 PerlIO_printf(Perl_debug_log,
b7953727
JH
2407 "Did not find anchored character...\n")
2408 );
6eb5f6b9 2409 }
a0714e2c
SS
2410 else if (prog->anchored_substr != NULL
2411 || prog->anchored_utf8 != NULL
2412 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2413 && prog->float_max_offset < strend - s)) {
2414 SV *must;
2415 I32 back_max;
2416 I32 back_min;
2417 char *last;
6eb5f6b9 2418 char *last1; /* Last position checked before */
bf93d4cc
GS
2419#ifdef DEBUGGING
2420 int did_match = 0;
2421#endif
33b8afdf 2422 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2423 if (utf8_target) {
2424 if (! prog->anchored_utf8) {
2425 to_utf8_substr(prog);
2426 }
2427 must = prog->anchored_utf8;
2428 }
2429 else {
2430 if (! prog->anchored_substr) {
2431 if (! to_byte_substr(prog)) {
6b54ddc5 2432 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2433 }
2434 }
2435 must = prog->anchored_substr;
2436 }
33b8afdf
JH
2437 back_max = back_min = prog->anchored_offset;
2438 } else {
7e0d5ad7
KW
2439 if (utf8_target) {
2440 if (! prog->float_utf8) {
2441 to_utf8_substr(prog);
2442 }
2443 must = prog->float_utf8;
2444 }
2445 else {
2446 if (! prog->float_substr) {
2447 if (! to_byte_substr(prog)) {
6b54ddc5 2448 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2449 }
2450 }
2451 must = prog->float_substr;
2452 }
33b8afdf
JH
2453 back_max = prog->float_max_offset;
2454 back_min = prog->float_min_offset;
2455 }
1de06328 2456
1de06328
YO
2457 if (back_min<0) {
2458 last = strend;
2459 } else {
2460 last = HOP3c(strend, /* Cannot start after this */
2461 -(I32)(CHR_SVLEN(must)
2462 - (SvTAIL(must) != 0) + back_min), strbeg);
2463 }
6eb5f6b9
JH
2464 if (s > PL_bostr)
2465 last1 = HOPc(s, -1);
2466 else
2467 last1 = s - 1; /* bogus */
2468
a0288114 2469 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2470 check_substr==must. */
2471 scream_pos = -1;
2472 dontbother = end_shift;
2473 strend = HOPc(strend, -dontbother);
2474 while ( (s <= last) &&
c33e64f0 2475 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2476 (unsigned char*)strend, must,
c33e64f0 2477 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2478 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2479 if (HOPc(s, -back_max) > last1) {
2480 last1 = HOPc(s, -back_min);
2481 s = HOPc(s, -back_max);
2482 }
2483 else {
52657f30 2484 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2485
2486 last1 = HOPc(s, -back_min);
52657f30 2487 s = t;
6eb5f6b9 2488 }
f2ed9b32 2489 if (utf8_target) {
6eb5f6b9 2490 while (s <= last1) {
24b23f37 2491 if (regtry(&reginfo, &s))
6eb5f6b9 2492 goto got_it;
7016d6eb
DM
2493 if (s >= last1) {
2494 s++; /* to break out of outer loop */
2495 break;
2496 }
2497 s += UTF8SKIP(s);
6eb5f6b9
JH
2498 }
2499 }
2500 else {
2501 while (s <= last1) {
24b23f37 2502 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2503 goto got_it;
2504 s++;
2505 }
2506 }
2507 }
ab3bbdeb 2508 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2509 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2510 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2511 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2512 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2513 ? "anchored" : "floating"),
ab3bbdeb
YO
2514 quoted, RE_SV_TAIL(must));
2515 });
6eb5f6b9
JH
2516 goto phooey;
2517 }
f8fc2ecf 2518 else if ( (c = progi->regstclass) ) {
f14c76ed 2519 if (minlen) {
f8fc2ecf 2520 const OPCODE op = OP(progi->regstclass);
66e933ab 2521 /* don't bother with what can't match */
786e8c11 2522 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2523 strend = HOPc(strend, -(minlen - 1));
2524 }
a3621e74 2525 DEBUG_EXECUTE_r({
be8e71aa 2526 SV * const prop = sv_newmortal();
32fc9b6a 2527 regprop(prog, prop, c);
0df25f3d 2528 {
f2ed9b32 2529 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2530 s,strend-s,60);
0df25f3d 2531 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2532 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2533 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2534 quoted, (int)(strend - s));
0df25f3d 2535 }
ffc61ed2 2536 });
3b0527fe 2537 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2538 goto got_it;
07be1b83 2539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2540 }
2541 else {
2542 dontbother = 0;
a0714e2c 2543 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2544 /* Trim the end. */
6af40bd7 2545 char *last= NULL;
33b8afdf 2546 SV* float_real;
c33e64f0
FC
2547 STRLEN len;
2548 const char *little;
33b8afdf 2549
7e0d5ad7
KW
2550 if (utf8_target) {
2551 if (! prog->float_utf8) {
2552 to_utf8_substr(prog);
2553 }
2554 float_real = prog->float_utf8;
2555 }
2556 else {
2557 if (! prog->float_substr) {
2558 if (! to_byte_substr(prog)) {
6b54ddc5 2559 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2560 }
2561 }
2562 float_real = prog->float_substr;
2563 }
d6a28714 2564
c33e64f0
FC
2565 little = SvPV_const(float_real, len);
2566 if (SvTAIL(float_real)) {
7f18ad16
KW
2567 /* This means that float_real contains an artificial \n on
2568 * the end due to the presence of something like this:
2569 * /foo$/ where we can match both "foo" and "foo\n" at the
2570 * end of the string. So we have to compare the end of the
2571 * string first against the float_real without the \n and
2572 * then against the full float_real with the string. We
2573 * have to watch out for cases where the string might be
2574 * smaller than the float_real or the float_real without
2575 * the \n. */
1a13b075
YO
2576 char *checkpos= strend - len;
2577 DEBUG_OPTIMISE_r(
2578 PerlIO_printf(Perl_debug_log,
2579 "%sChecking for float_real.%s\n",
2580 PL_colors[4], PL_colors[5]));
2581 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2582 /* can't match, even if we remove the trailing \n
2583 * string is too short to match */
1a13b075
YO
2584 DEBUG_EXECUTE_r(
2585 PerlIO_printf(Perl_debug_log,
2586 "%sString shorter than required trailing substring, cannot match.%s\n",
2587 PL_colors[4], PL_colors[5]));
2588 goto phooey;
2589 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2590 /* can match, the end of the string matches without the
2591 * "\n" */
1a13b075
YO
2592 last = checkpos + 1;
2593 } else if (checkpos < strbeg) {
7f18ad16
KW
2594 /* cant match, string is too short when the "\n" is
2595 * included */
1a13b075
YO
2596 DEBUG_EXECUTE_r(
2597 PerlIO_printf(Perl_debug_log,
2598 "%sString does not contain required trailing substring, cannot match.%s\n",
2599 PL_colors[4], PL_colors[5]));
2600 goto phooey;
2601 } else if (!multiline) {
7f18ad16
KW
2602 /* non multiline match, so compare with the "\n" at the
2603 * end of the string */
1a13b075
YO
2604 if (memEQ(checkpos, little, len)) {
2605 last= checkpos;
2606 } else {
2607 DEBUG_EXECUTE_r(
2608 PerlIO_printf(Perl_debug_log,
2609 "%sString does not contain required trailing substring, cannot match.%s\n",
2610 PL_colors[4], PL_colors[5]));
2611 goto phooey;
2612 }
2613 } else {
7f18ad16
KW
2614 /* multiline match, so we have to search for a place
2615 * where the full string is located */
d6a28714 2616 goto find_last;
1a13b075 2617 }
c33e64f0 2618 } else {
d6a28714 2619 find_last:
9041c2e3 2620 if (len)
d6a28714 2621 last = rninstr(s, strend, little, little + len);
b8c5462f 2622 else
a0288114 2623 last = strend; /* matching "$" */
b8c5462f 2624 }
6af40bd7 2625 if (!last) {
7f18ad16
KW
2626 /* at one point this block contained a comment which was
2627 * probably incorrect, which said that this was a "should not
2628 * happen" case. Even if it was true when it was written I am
2629 * pretty sure it is not anymore, so I have removed the comment
2630 * and replaced it with this one. Yves */
6bda09f9
YO
2631 DEBUG_EXECUTE_r(
2632 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2633 "String does not contain required substring, cannot match.\n"
2634 ));
2635 goto phooey;
bf93d4cc 2636 }
d6a28714
JH
2637 dontbother = strend - last + prog->float_min_offset;
2638 }
2639 if (minlen && (dontbother < minlen))
2640 dontbother = minlen - 1;
2641 strend -= dontbother; /* this one's always in bytes! */
2642 /* We don't know much -- general case. */
f2ed9b32 2643 if (utf8_target) {
d6a28714 2644 for (;;) {
24b23f37 2645 if (regtry(&reginfo, &s))
d6a28714
JH
2646 goto got_it;
2647 if (s >= strend)
2648 break;
b8c5462f 2649 s += UTF8SKIP(s);
d6a28714
JH
2650 };
2651 }
2652 else {
2653 do {
24b23f37 2654 if (regtry(&reginfo, &s))
d6a28714
JH
2655 goto got_it;
2656 } while (s++ < strend);
2657 }
2658 }
2659
2660 /* Failure. */
2661 goto phooey;
2662
2663got_it:
495f47a5
DM
2664 DEBUG_BUFFERS_r(
2665 if (swap)
2666 PerlIO_printf(Perl_debug_log,
2667 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2668 PTR2UV(prog),
2669 PTR2UV(swap)
2670 );
2671 );
e9105d30 2672 Safefree(swap);
288b8c02 2673 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2674
ed301438 2675 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2676 restore_pos(aTHX_ prog);
5daac39c
NC
2677 if (RXp_PAREN_NAMES(prog))
2678 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2679
2680 /* make sure $`, $&, $', and $digit will work later */
2681 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2682 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2683#ifdef PERL_ANY_COW
2684 if (SvCANCOW(sv)) {
ed252734
NC
2685 if (DEBUG_C_TEST) {
2686 PerlIO_printf(Perl_debug_log,
2687 "Copy on write: regexp capture, type %d\n",
2688 (int) SvTYPE(sv));
2689 }
77f8f7c1 2690 RX_MATCH_COPY_FREE(rx);
ed252734 2691 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2692 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2693 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2694 prog->sublen = PL_regeol - strbeg;
2695 prog->suboffset = 0;
2696 prog->subcoffset = 0;
ed252734
NC
2697 } else
2698#endif
2699 {
6502e081
DM
2700 I32 min = 0;
2701 I32 max = PL_regeol - strbeg;
2702 I32 sublen;
2703
2704 if ( (flags & REXEC_COPY_SKIP_POST)
2705 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2706 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2707 ) { /* don't copy $' part of string */
3de645a8 2708 U32 n = 0;
6502e081
DM
2709 max = -1;
2710 /* calculate the right-most part of the string covered
2711 * by a capture. Due to look-ahead, this may be to
2712 * the right of $&, so we have to scan all captures */
2713 while (n <= prog->lastparen) {
2714 if (prog->offs[n].end > max)
2715 max = prog->offs[n].end;
2716 n++;
2717 }
2718 if (max == -1)
2719 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2720 ? prog->offs[0].start
2721 : 0;
2722 assert(max >= 0 && max <= PL_regeol - strbeg);
2723 }
2724
2725 if ( (flags & REXEC_COPY_SKIP_PRE)
2726 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2727 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2728 ) { /* don't copy $` part of string */
3de645a8 2729 U32 n = 0;
6502e081
DM
2730 min = max;
2731 /* calculate the left-most part of the string covered
2732 * by a capture. Due to look-behind, this may be to
2733 * the left of $&, so we have to scan all captures */
2734 while (min && n <= prog->lastparen) {
2735 if ( prog->offs[n].start != -1
2736 && prog->offs[n].start < min)
2737 {
2738 min = prog->offs[n].start;
2739 }
2740 n++;
2741 }
2742 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2743 && min > prog->offs[0].end
2744 )
2745 min = prog->offs[0].end;
2746
2747 }
2748
2749 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2750 sublen = max - min;
2751
2752 if (RX_MATCH_COPIED(rx)) {
2753 if (sublen > prog->sublen)
2754 prog->subbeg =
2755 (char*)saferealloc(prog->subbeg, sublen+1);
2756 }
2757 else
2758 prog->subbeg = (char*)safemalloc(sublen+1);
2759 Copy(strbeg + min, prog->subbeg, sublen, char);
2760 prog->subbeg[sublen] = '\0';
2761 prog->suboffset = min;
2762 prog->sublen = sublen;
77f8f7c1 2763 RX_MATCH_COPIED_on(rx);
6502e081 2764 }
6502e081
DM
2765 prog->subcoffset = prog->suboffset;
2766 if (prog->suboffset && utf8_target) {
2767 /* Convert byte offset to chars.
2768 * XXX ideally should only compute this if @-/@+
2769 * has been seen, a la PL_sawampersand ??? */
2770
2771 /* If there's a direct correspondence between the
2772 * string which we're matching and the original SV,
2773 * then we can use the utf8 len cache associated with
2774 * the SV. In particular, it means that under //g,
2775 * sv_pos_b2u() will use the previously cached
2776 * position to speed up working out the new length of
2777 * subcoffset, rather than counting from the start of
2778 * the string each time. This stops
2779 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2780 * from going quadratic */
2781 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2782 sv_pos_b2u(sv, &(prog->subcoffset));
2783 else
2784 prog->subcoffset = utf8_length((U8*)strbeg,
2785 (U8*)(strbeg+prog->suboffset));
2786 }
d6a28714
JH
2787 }
2788 else {
6502e081 2789 RX_MATCH_COPY_FREE(rx);
d6a28714 2790 prog->subbeg = strbeg;
6502e081
DM
2791 prog->suboffset = 0;
2792 prog->subcoffset = 0;
d6a28714
JH
2793 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2794 }
2795 }
9041c2e3 2796
d6a28714
JH
2797 return 1;
2798
2799phooey:
a3621e74 2800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2801 PL_colors[4], PL_colors[5]));
ed301438 2802 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2803 restore_pos(aTHX_ prog);
e9105d30 2804 if (swap) {
c74340f9 2805 /* we failed :-( roll it back */
495f47a5
DM
2806 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2808 PTR2UV(prog),
2809 PTR2UV(prog->offs),
2810 PTR2UV(swap)
2811 ));
e9105d30
GG
2812 Safefree(prog->offs);
2813 prog->offs = swap;
2814 }
d6a28714
JH
2815 return 0;
2816}
2817
6bda09f9 2818
ec43f78b
DM
2819/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2820 * Do inc before dec, in case old and new rex are the same */
2821#define SET_reg_curpm(Re2) \
2822 if (PL_reg_state.re_state_eval_setup_done) { \
2823 (void)ReREFCNT_inc(Re2); \
2824 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2825 PM_SETRE((PL_reg_curpm), (Re2)); \
2826 }
2827
2828
d6a28714
JH
2829/*
2830 - regtry - try match at specific point
2831 */
2832STATIC I32 /* 0 failure, 1 success */
f73aaa43 2833S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2834{
97aff369 2835 dVAR;
d6a28714 2836 CHECKPOINT lastcp;
288b8c02 2837 REGEXP *const rx = reginfo->prog;
8d919b0a 2838 regexp *const prog = ReANY(rx);
f73aaa43 2839 I32 result;
f8fc2ecf 2840 RXi_GET_DECL(prog,progi);
a3621e74 2841 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2842
2843 PERL_ARGS_ASSERT_REGTRY;
2844
24b23f37 2845 reginfo->cutpoint=NULL;
d6a28714 2846
ed301438
DM
2847 if ((prog->extflags & RXf_EVAL_SEEN)
2848 && !PL_reg_state.re_state_eval_setup_done)
2849 {
d6a28714
JH
2850 MAGIC *mg;
2851
ed301438 2852 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2853 if (reginfo->sv) {
d6a28714 2854 /* Make $_ available to executed code. */
3b0527fe 2855 if (reginfo->sv != DEFSV) {
59f00321 2856 SAVE_DEFSV;
414bf5ae 2857 DEFSV_set(reginfo->sv);
b8c5462f 2858 }
d6a28714 2859
3b0527fe
DM
2860 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2861 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2862 /* prepare for quick setting of pos */
d300d9fa 2863#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2864 if (SvIsCOW(reginfo->sv))
2865 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2866#endif
3dab1dad 2867 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2868 &PL_vtbl_mglob, NULL, 0);
d6a28714 2869 mg->mg_len = -1;
b8c5462f 2870 }
d6a28714
JH
2871 PL_reg_magic = mg;
2872 PL_reg_oldpos = mg->mg_len;
4f639d21 2873 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2874 }
09687e5a 2875 if (!PL_reg_curpm) {
a02a5408 2876 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2877#ifdef USE_ITHREADS
2878 {
14a49a24 2879 SV* const repointer = &PL_sv_undef;
92313705
NC
2880 /* this regexp is also owned by the new PL_reg_curpm, which
2881 will try to free it. */
d2ece331 2882 av_push(PL_regex_padav, repointer);
09687e5a
AB
2883 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2884 PL_regex_pad = AvARRAY(PL_regex_padav);
2885 }
2886#endif
2887 }
ec43f78b 2888 SET_reg_curpm(rx);
d6a28714
JH
2889 PL_reg_oldcurpm = PL_curpm;
2890 PL_curpm = PL_reg_curpm;
07bc277f 2891 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2892 /* Here is a serious problem: we cannot rewrite subbeg,
2893 since it may be needed if this match fails. Thus
2894 $` inside (?{}) could fail... */
2895 PL_reg_oldsaved = prog->subbeg;
2896 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2897 PL_reg_oldsavedoffset = prog->suboffset;
2898 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2899#ifdef PERL_ANY_COW
ed252734
NC
2900 PL_nrs = prog->saved_copy;
2901#endif
07bc277f 2902 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2903 }
2904 else
bd61b366 2905 PL_reg_oldsaved = NULL;
d6a28714 2906 prog->subbeg = PL_bostr;
6502e081
DM
2907 prog->suboffset = 0;
2908 prog->subcoffset = 0;
d6a28714
JH
2909 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2910 }
97ca13b7 2911#ifdef DEBUGGING
f73aaa43 2912 PL_reg_starttry = *startposp;
97ca13b7 2913#endif
f73aaa43 2914 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2915 prog->lastparen = 0;
03994de8 2916 prog->lastcloseparen = 0;
d6a28714
JH
2917
2918 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2919 to do this again and again, prog->lastparen should take care of
3dd2943c 2920 this! --ilya*/
dafc8851
JH
2921
2922 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2923 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2924 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2925 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2926 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2927 * Meanwhile, this code *is* needed for the
daf18116
JH
2928 * above-mentioned test suite tests to succeed. The common theme
2929 * on those tests seems to be returning null fields from matches.
225593e1 2930 * --jhi updated by dapm */
dafc8851 2931#if 1
d6a28714 2932 if (prog->nparens) {
b93070ed 2933 regexp_paren_pair *pp = prog->offs;
eb578fdb 2934 I32 i;
b93070ed 2935 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2936 ++pp;
2937 pp->start = -1;
2938 pp->end = -1;
d6a28714
JH
2939 }
2940 }
dafc8851 2941#endif
02db2b7b 2942 REGCP_SET(lastcp);
f73aaa43
DM
2943 result = regmatch(reginfo, *startposp, progi->program + 1);
2944 if (result != -1) {
2945 prog->offs[0].end = result;
d6a28714
JH
2946 return 1;
2947 }
24b23f37 2948 if (reginfo->cutpoint)
f73aaa43 2949 *startposp= reginfo->cutpoint;
02db2b7b 2950 REGCP_UNWIND(lastcp);
d6a28714
JH
2951 return 0;
2952}
2953
02db2b7b 2954
8ba1375e
MJD
2955#define sayYES goto yes
2956#define sayNO goto no
262b90c4 2957#define sayNO_SILENT goto no_silent
8ba1375e 2958
f9f4320a
YO
2959/* we dont use STMT_START/END here because it leads to
2960 "unreachable code" warnings, which are bogus, but distracting. */
2961#define CACHEsayNO \
c476f425
DM
2962 if (ST.cache_mask) \
2963 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2964 sayNO
3298f257 2965
a3621e74 2966/* this is used to determine how far from the left messages like
265c4333
YO
2967 'failed...' are printed. It should be set such that messages
2968 are inline with the regop output that created them.
a3621e74 2969*/
265c4333 2970#define REPORT_CODE_OFF 32
a3621e74
YO
2971
2972
40a82448
DM
2973#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2974#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2975#define CHRTEST_NOT_A_CP_1 -999
2976#define CHRTEST_NOT_A_CP_2 -998
9e137952 2977
86545054
DM
2978#define SLAB_FIRST(s) (&(s)->states[0])
2979#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2980
5d9a96ca
DM
2981/* grab a new slab and return the first slot in it */
2982
2983STATIC regmatch_state *
2984S_push_slab(pTHX)
2985{
a35a87e7 2986#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2987 dMY_CXT;
2988#endif
5d9a96ca
DM
2989 regmatch_slab *s = PL_regmatch_slab->next;
2990 if (!s) {
2991 Newx(s, 1, regmatch_slab);
2992 s->prev = PL_regmatch_slab;
2993 s->next = NULL;
2994 PL_regmatch_slab->next = s;
2995 }
2996 PL_regmatch_slab = s;
86545054 2997 return SLAB_FIRST(s);
5d9a96ca 2998}
5b47454d 2999
95b24440 3000
40a82448
DM
3001/* push a new state then goto it */
3002
4d5016e5
DM
3003#define PUSH_STATE_GOTO(state, node, input) \
3004 pushinput = input; \
40a82448
DM
3005 scan = node; \
3006 st->resume_state = state; \
3007 goto push_state;
3008
3009/* push a new state with success backtracking, then goto it */
3010
4d5016e5
DM
3011#define PUSH_YES_STATE_GOTO(state, node, input) \
3012 pushinput = input; \
40a82448
DM
3013 scan = node; \
3014 st->resume_state = state; \
3015 goto push_yes_state;
3016
aa283a38 3017
aa283a38 3018
4d5016e5 3019
d6a28714 3020/*
95b24440 3021
bf1f174e
DM
3022regmatch() - main matching routine
3023
3024This is basically one big switch statement in a loop. We execute an op,
3025set 'next' to point the next op, and continue. If we come to a point which
3026we may need to backtrack to on failure such as (A|B|C), we push a
3027backtrack state onto the backtrack stack. On failure, we pop the top
3028state, and re-enter the loop at the state indicated. If there are no more
3029states to pop, we return failure.
3030
3031Sometimes we also need to backtrack on success; for example /A+/, where
3032after successfully matching one A, we need to go back and try to
3033match another one; similarly for lookahead assertions: if the assertion
3034completes successfully, we backtrack to the state just before the assertion
3035and then carry on. In these cases, the pushed state is marked as
3036'backtrack on success too'. This marking is in fact done by a chain of
3037pointers, each pointing to the previous 'yes' state. On success, we pop to
3038the nearest yes state, discarding any intermediate failure-only states.
3039Sometimes a yes state is pushed just to force some cleanup code to be
3040called at the end of a successful match or submatch; e.g. (??{$re}) uses
3041it to free the inner regex.
3042
3043Note that failure backtracking rewinds the cursor position, while
3044success backtracking leaves it alone.
3045
3046A pattern is complete when the END op is executed, while a subpattern
3047such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3048ops trigger the "pop to last yes state if any, otherwise return true"
3049behaviour.
3050
3051A common convention in this function is to use A and B to refer to the two
3052subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3053the subpattern to be matched possibly multiple times, while B is the entire
3054rest of the pattern. Variable and state names reflect this convention.
3055
3056The states in the main switch are the union of ops and failure/success of
3057substates associated with with that op. For example, IFMATCH is the op
3058that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3059'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3060successfully matched A and IFMATCH_A_fail is a state saying that we have
3061just failed to match A. Resume states always come in pairs. The backtrack
3062state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3063at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3064on success or failure.
3065
3066The struct that holds a backtracking state is actually a big union, with
3067one variant for each major type of op. The variable st points to the
3068top-most backtrack struct. To make the code clearer, within each
3069block of code we #define ST to alias the relevant union.
3070
3071Here's a concrete example of a (vastly oversimplified) IFMATCH
3072implementation:
3073
3074 switch (state) {
3075 ....
3076
3077#define ST st->u.ifmatch
3078
3079 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3080 ST.foo = ...; // some state we wish to save
95b24440 3081 ...
bf1f174e
DM
3082 // push a yes backtrack state with a resume value of
3083 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3084 // first node of A:
4d5016e5 3085 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3086 // NOTREACHED
3087
3088 case IFMATCH_A: // we have successfully executed A; now continue with B
3089 next = B;
3090 bar = ST.foo; // do something with the preserved value
3091 break;
3092
3093 case IFMATCH_A_fail: // A failed, so the assertion failed
3094 ...; // do some housekeeping, then ...
3095 sayNO; // propagate the failure
3096
3097#undef ST
95b24440 3098
bf1f174e
DM
3099 ...
3100 }
95b24440 3101
bf1f174e
DM
3102For any old-timers reading this who are familiar with the old recursive
3103approach, the code above is equivalent to:
95b24440 3104
bf1f174e
DM
3105 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3106 {
3107 int foo = ...
95b24440 3108 ...
bf1f174e
DM
3109 if (regmatch(A)) {
3110 next = B;
3111 bar = foo;
3112 break;
95b24440 3113 }
bf1f174e
DM
3114 ...; // do some housekeeping, then ...
3115 sayNO; // propagate the failure
95b24440 3116 }
bf1f174e
DM
3117
3118The topmost backtrack state, pointed to by st, is usually free. If you
3119want to claim it, populate any ST.foo fields in it with values you wish to
3120save, then do one of
3121
4d5016e5
DM
3122 PUSH_STATE_GOTO(resume_state, node, newinput);
3123 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3124
3125which sets that backtrack state's resume value to 'resume_state', pushes a
3126new free entry to the top of the backtrack stack, then goes to 'node'.
3127On backtracking, the free slot is popped, and the saved state becomes the
3128new free state. An ST.foo field in this new top state can be temporarily
3129accessed to retrieve values, but once the main loop is re-entered, it
3130becomes available for reuse.
3131
3132Note that the depth of the backtrack stack constantly increases during the
3133left-to-right execution of the pattern, rather than going up and down with
3134the pattern nesting. For example the stack is at its maximum at Z at the
3135end of the pattern, rather than at X in the following:
3136
3137 /(((X)+)+)+....(Y)+....Z/
3138
3139The only exceptions to this are lookahead/behind assertions and the cut,
3140(?>A), which pop all the backtrack states associated with A before
3141continuing.
3142
486ec47a 3143Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3144PL_regmatch_state and st always point to the currently active state,
3145and PL_regmatch_slab points to the slab currently containing
3146PL_regmatch_state. The first time regmatch() is called, the first slab is
3147allocated, and is never freed until interpreter destruction. When the slab
3148is full, a new one is allocated and chained to the end. At exit from
3149regmatch(), slabs allocated since entry are freed.
3150
3151*/
95b24440 3152
40a82448 3153
5bc10b2c 3154#define DEBUG_STATE_pp(pp) \
265c4333 3155 DEBUG_STATE_r({ \
f2ed9b32 3156 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3157 PerlIO_printf(Perl_debug_log, \
5d458dd8 3158 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3159 depth*2, "", \
13d6edb4 3160 PL_reg_name[st->resume_state], \
5d458dd8
YO
3161 ((st==yes_state||st==mark_state) ? "[" : ""), \
3162 ((st==yes_state) ? "Y" : ""), \
3163 ((st==mark_state) ? "M" : ""), \
3164 ((st==yes_state||st==mark_state) ? "]" : "") \
3165 ); \
265c4333 3166 });
5bc10b2c 3167
40a82448 3168
3dab1dad 3169#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3170
3df15adc 3171#ifdef DEBUGGING
5bc10b2c 3172
ab3bbdeb 3173STATIC void
f2ed9b32 3174S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3175 const char *start, const char *end, const char *blurb)
3176{
efd26800 3177 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3178
3179 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3180
ab3bbdeb
YO
3181 if (!PL_colorset)
3182 reginitcolors();
3183 {
3184 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3185 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3186
f2ed9b32 3187 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3188 start, end - start, 60);
3189
3190 PerlIO_printf(Perl_debug_log,
3191 "%s%s REx%s %s against %s\n",
3192 PL_colors[4], blurb, PL_colors[5], s0, s1);
3193
f2ed9b32 3194 if (utf8_target||utf8_pat)
1de06328
YO
3195 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3196 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3197 utf8_pat && utf8_target ? " and " : "",
3198 utf8_target ? "string" : ""
ab3bbdeb
YO
3199 );
3200 }
3201}
3df15adc
YO
3202
3203STATIC void
786e8c11
YO
3204S_dump_exec_pos(pTHX_ const char *locinput,
3205 const regnode *scan,
3206 const char *loc_regeol,
3207 const char *loc_bostr,
3208 const char *loc_reg_starttry,
f2ed9b32 3209 const bool utf8_target)
07be1b83 3210{
786e8c11 3211 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3212 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3213 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3214 /* The part of the string before starttry has one color
3215 (pref0_len chars), between starttry and current
3216 position another one (pref_len - pref0_len chars),
3217 after the current position the third one.
3218 We assume that pref0_len <= pref_len, otherwise we
3219 decrease pref0_len. */
786e8c11
YO
3220 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3221 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3222 int pref0_len;
3223
7918f24d
NC
3224 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3225
f2ed9b32 3226 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3227 pref_len++;
786e8c11
YO
3228 pref0_len = pref_len - (locinput - loc_reg_starttry);
3229 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3230 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3231 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3232 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3233 l--;
3234 if (pref0_len < 0)
3235 pref0_len = 0;
3236 if (pref0_len > pref_len)
3237 pref0_len = pref_len;
3238 {
f2ed9b32 3239 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3240
ab3bbdeb 3241 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3242 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3243
ab3bbdeb 3244 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3245 (locinput - pref_len + pref0_len),
1de06328 3246 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3247
ab3bbdeb 3248 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3249 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3250
1de06328 3251 const STRLEN tlen=len0+len1+len2;
3df15adc 3252 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3253 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3254 (IV)(locinput - loc_bostr),
07be1b83 3255 len0, s0,
07be1b83 3256 len1, s1,
07be1b83 3257 (docolor ? "" : "> <"),
07be1b83 3258 len2, s2,
f9f4320a 3259 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3260 "");
3261 }
3262}
3df15adc 3263
07be1b83
YO
3264#endif
3265
0a4db386
YO
3266/* reg_check_named_buff_matched()
3267 * Checks to see if a named buffer has matched. The data array of
3268 * buffer numbers corresponding to the buffer is expected to reside
3269 * in the regexp->data->data array in the slot stored in the ARG() of
3270 * node involved. Note that this routine doesn't actually care about the
3271 * name, that information is not preserved from compilation to execution.
3272 * Returns the index of the leftmost defined buffer with the given name
3273 * or 0 if non of the buffers matched.
3274 */
3275STATIC I32
7918f24d
NC
3276S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3277{
0a4db386 3278 I32 n;
f8fc2ecf 3279 RXi_GET_DECL(rex,rexi);
ad64d0ec 3280 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3281 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3282
3283 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3284
0a4db386 3285 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3286 if ((I32)rex->lastparen >= nums[n] &&
3287 rex->offs[nums[n]].end != -1)
0a4db386
YO
3288 {
3289 return nums[n];
3290 }
3291 }
3292 return 0;
3293}
3294
2f554ef7
DM
3295
3296/* free all slabs above current one - called during LEAVE_SCOPE */
3297
3298STATIC void
3299S_clear_backtrack_stack(pTHX_ void *p)
3300{
3301 regmatch_slab *s = PL_regmatch_slab->next;
3302 PERL_UNUSED_ARG(p);
3303
3304 if (!s)
3305 return;
3306 PL_regmatch_slab->next = NULL;
3307 while (s) {
3308 regmatch_slab * const osl = s;
3309 s = s->next;
3310 Safefree(osl);
3311 }
3312}
c74f6de9 3313static bool
79a2a0e8 3314S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3315{
79a2a0e8
KW
3316 /* This function determines if there are one or two characters that match
3317 * the first character of the passed-in EXACTish node <text_node>, and if
3318 * so, returns them in the passed-in pointers.
c74f6de9 3319 *
79a2a0e8
KW
3320 * If it determines that no possible character in the target string can
3321 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3322 * the first character in <text_node> requires UTF-8 to represent, and the
3323 * target string isn't in UTF-8.)
c74f6de9 3324 *
79a2a0e8
KW
3325 * If there are more than two characters that could match the beginning of
3326 * <text_node>, or if more context is required to determine a match or not,
3327 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3328 *
3329 * The motiviation behind this function is to allow the caller to set up
3330 * tight loops for matching. If <text_node> is of type EXACT, there is
3331 * only one possible character that can match its first character, and so
3332 * the situation is quite simple. But things get much more complicated if
3333 * folding is involved. It may be that the first character of an EXACTFish
3334 * node doesn't participate in any possible fold, e.g., punctuation, so it
3335 * can be matched only by itself. The vast majority of characters that are
3336 * in folds match just two things, their lower and upper-case equivalents.
3337 * But not all are like that; some have multiple possible matches, or match
3338 * sequences of more than one character. This function sorts all that out.
3339 *
3340 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3341 * loop of trying to match A*, we know we can't exit where the thing
3342 * following it isn't a B. And something can't be a B unless it is the
3343 * beginning of B. By putting a quick test for that beginning in a tight
3344 * loop, we can rule out things that can't possibly be B without having to
3345 * break out of the loop, thus avoiding work. Similarly, if A is a single
3346 * character, we can make a tight loop matching A*, using the outputs of
3347 * this function.
3348 *
3349 * If the target string to match isn't in UTF-8, and there aren't
3350 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3351 * the one or two possible octets (which are characters in this situation)
3352 * that can match. In all cases, if there is only one character that can
3353 * match, *<c1p> and *<c2p> will be identical.
3354 *
3355 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3356 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3357 * can match the beginning of <text_node>. They should be declared with at
3358 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3359 * undefined what these contain.) If one or both of the buffers are
3360 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3361 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3362 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3363 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3364 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3365
3366 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3367
ddb0d839
KW
3368 UV c1 = CHRTEST_NOT_A_CP_1;
3369 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3370 bool use_chrtest_void = FALSE;
3371
3372 /* Used when we have both utf8 input and utf8 output, to avoid converting
3373 * to/from code points */
3374 bool utf8_has_been_setup = FALSE;
3375
c74f6de9
KW
3376 dVAR;
3377
b4291290 3378 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3379
79a2a0e8
KW
3380 if (OP(text_node) == EXACT) {
3381
3382 /* In an exact node, only one thing can be matched, that first
3383 * character. If both the pat and the target are UTF-8, we can just
3384 * copy the input to the output, avoiding finding the code point of
3385 * that character */
3386 if (! UTF_PATTERN) {
3387 c2 = c1 = *pat;
3388 }
3389 else if (utf8_target) {
3390 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3391 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3392 utf8_has_been_setup = TRUE;
3393 }
3394 else {
3395 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3396 }
79a2a0e8
KW
3397 }
3398 else /* an EXACTFish node */
3399 if ((UTF_PATTERN
3400 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3401 pat + STR_LEN(text_node)))
3402 || (! UTF_PATTERN
3403 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3404 pat + STR_LEN(text_node))))
3405 {
3406 /* Multi-character folds require more context to sort out. Also
3407 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3408 * handled outside this routine */
3409 use_chrtest_void = TRUE;
3410 }
3411 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3412 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3413 if (c1 > 256) {
3414 /* Load the folds hash, if not already done */
3415 SV** listp;
3416 if (! PL_utf8_foldclosures) {
3417 if (! PL_utf8_tofold) {
3418 U8 dummy[UTF8_MAXBYTES+1];
3419
3420 /* Force loading this by folding an above-Latin1 char */
3421 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3422 assert(PL_utf8_tofold); /* Verify that worked */
3423 }
3424 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3425 }
3426
3427 /* The fold closures data structure is a hash with the keys being
3428 * the UTF-8 of every character that is folded to, like 'k', and
3429 * the values each an array of all code points that fold to its
3430 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3431 * not included */
3432 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3433 (char *) pat,
3434 UTF8SKIP(pat),
3435 FALSE))))
3436 {
3437 /* Not found in the hash, therefore there are no folds
3438 * containing it, so there is only a single character that
3439 * could match */
3440 c2 = c1;
3441 }
3442 else { /* Does participate in folds */
3443 AV* list = (AV*) *listp;
3444 if (av_len(list) != 1) {
3445
3446 /* If there aren't exactly two folds to this, it is outside
3447 * the scope of this function */
3448 use_chrtest_void = TRUE;
3449 }
3450 else { /* There are two. Get them */
3451 SV** c_p = av_fetch(list, 0, FALSE);
3452 if (c_p == NULL) {
3453 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3454 }
3455 c1 = SvUV(*c_p);
3456
3457 c_p = av_fetch(list, 1, FALSE);
3458 if (c_p == NULL) {
3459 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3460 }
3461 c2 = SvUV(*c_p);
3462
3463 /* Folds that cross the 255/256 boundary are forbidden if
3464 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3465 * pattern character is above 256, and its only other match
3466 * is below 256, the only legal match will be to itself.
3467 * We have thrown away the original, so have to compute
3468 * which is the one above 255 */
3469 if ((c1 < 256) != (c2 < 256)) {
3470 if (OP(text_node) == EXACTFL
3471 || (OP(text_node) == EXACTFA
3472 && (isASCII(c1) || isASCII(c2))))
3473 {
3474 if (c1 < 256) {
3475 c1 = c2;
3476 }
3477 else {
3478 c2 = c1;
3479 }
3480 }
3481 }
3482 }
3483 }
3484 }
3485 else /* Here, c1 is < 255 */
3486 if (utf8_target
3487 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3488 && OP(text_node) != EXACTFL
3489 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3490 {
3491 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3492 * folds to this character in the pattern. All such cases except
3493 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3494 * involved in their folds, so are outside the scope of this
3495 * function */
3496 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3497 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3498 }
3499 else {
3500 use_chrtest_void = TRUE;
3501 }
c74f6de9
KW
3502 }
3503 else { /* Here nothing above Latin1 can fold to the pattern character */
3504 switch (OP(text_node)) {
3505
3506 case EXACTFL: /* /l rules */
79a2a0e8 3507 c2 = PL_fold_locale[c1];
c74f6de9
KW
3508 break;
3509
3510 case EXACTF:
3511 if (! utf8_target) { /* /d rules */
79a2a0e8 3512 c2 = PL_fold[c1];
c74f6de9
KW
3513 break;
3514 }
3515 /* FALLTHROUGH */
3516 /* /u rules for all these. This happens to work for
79a2a0e8 3517 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3518 case EXACTFA:
3519 case EXACTFU_TRICKYFOLD:
79a2a0e8 3520 case EXACTFU_SS:
c74f6de9 3521 case EXACTFU:
79a2a0e8 3522 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3523 break;
3524
878531d3
KW
3525 default:
3526 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3527 assert(0); /* NOTREACHED */
c74f6de9
KW
3528 }
3529 }
3530 }
79a2a0e8
KW
3531
3532 /* Here have figured things out. Set up the returns */
3533 if (use_chrtest_void) {
3534 *c2p = *c1p = CHRTEST_VOID;
3535 }
3536 else if (utf8_target) {
3537 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3538 uvchr_to_utf8(c1_utf8, c1);
3539 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3540 }
c74f6de9 3541
79a2a0e8
KW
3542 /* Invariants are stored in both the utf8 and byte outputs; Use
3543 * negative numbers otherwise for the byte ones. Make sure that the
3544 * byte ones are the same iff the utf8 ones are the same */
3545 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3546 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3547 ? *c2_utf8
3548 : (c1 == c2)
3549 ? CHRTEST_NOT_A_CP_1
3550 : CHRTEST_NOT_A_CP_2;
3551 }
3552 else if (c1 > 255) {
3553 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3554 can represent */
3555 return FALSE;
3556 }
c74f6de9 3557
79a2a0e8
KW
3558 *c1p = *c2p = c2; /* c2 is the only representable value */
3559 }
3560 else { /* c1 is representable; see about c2 */
3561 *c1p = c1;
3562 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3563 }
2f554ef7 3564
c74f6de9
KW
3565 return TRUE;
3566}
2f554ef7 3567
f73aaa43
DM
3568/* returns -1 on failure, $+[0] on success */
3569STATIC I32
3570S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3571{
a35a87e7 3572#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3573 dMY_CXT;
3574#endif
27da23d5 3575 dVAR;
eb578fdb 3576 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3577 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3578 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3579 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3580 RXi_GET_DECL(rex,rexi);
2f554ef7 3581 I32 oldsave;
5d9a96ca 3582 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3583 regmatch_state *st;
5d9a96ca 3584 /* cache heavy used fields of st in registers */
eb578fdb
KW
3585 regnode *scan;
3586 regnode *next;
3587 U32 n = 0; /* general value; init to avoid compiler warning */
3588 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3589 char *locinput = startpos;
4d5016e5 3590 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3591 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3592
b69b0499 3593 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3594 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3595 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3596 const U32 max_nochange_depth =
3597 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3598 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3599 regmatch_state *yes_state = NULL; /* state to pop to on success of
3600 subpattern */
e2e6a0f1
YO
3601 /* mark_state piggy backs on the yes_state logic so that when we unwind
3602 the stack on success we can update the mark_state as we go */
3603 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3604 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3605 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3606 U32 state_num;
5d458dd8
YO
3607 bool no_final = 0; /* prevent failure from backtracking? */
3608 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3609 char *startpoint = locinput;
5d458dd8
YO
3610 SV *popmark = NULL; /* are we looking for a mark? */
3611 SV *sv_commit = NULL; /* last mark name seen in failure */
3612 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3613 during a successful match */
5d458dd8
YO
3614 U32 lastopen = 0; /* last open we saw */
3615 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3616 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3617 /* these three flags are set by various ops to signal information to
3618 * the very next op. They have a useful lifetime of exactly one loop
3619 * iteration, and are not preserved or restored by state pushes/pops
3620 */
3621 bool sw = 0; /* the condition value in (?(cond)a|b) */
3622 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3623 int logical = 0; /* the following EVAL is:
3624 0: (?{...})
3625 1: (?(?{...})X|Y)
3626 2: (??{...})
3627 or the following IFMATCH/UNLESSM is:
3628 false: plain (?=foo)
3629 true: used as a condition: (?(?=foo))
3630 */
81ed78b2
DM
3631 PAD* last_pad = NULL;
3632 dMULTICALL;
3633 I32 gimme = G_SCALAR;
3634 CV *caller_cv = NULL; /* who called us */
3635 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3636 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3637 U32 maxopenparen = 0; /* max '(' index seen so far */
81ed78b2 3638
95b24440 3639#ifdef DEBUGGING
e68ec53f 3640 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3641#endif
3642
81ed78b2
DM
3643 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3644 multicall_oldcatch = 0;
3645 multicall_cv = NULL;
3646 cx = NULL;
4f8dbb2d
JL
3647 PERL_UNUSED_VAR(multicall_cop);
3648 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3649
3650
7918f24d
NC
3651 PERL_ARGS_ASSERT_REGMATCH;
3652
3b57cd43 3653 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3654 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3655 }));
5d9a96ca
DM
3656 /* on first ever call to regmatch, allocate first slab */
3657 if (!PL_regmatch_slab) {
3658 Newx(PL_regmatch_slab, 1, regmatch_slab);
3659 PL_regmatch_slab->prev = NULL;
3660 PL_regmatch_slab->next = NULL;
86545054 3661 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3662 }
3663
2f554ef7
DM
3664 oldsave = PL_savestack_ix;
3665 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3666 SAVEVPTR(PL_regmatch_slab);
3667 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3668
3669 /* grab next free state slot */
3670 st = ++PL_regmatch_state;
86545054 3671 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3672 st = PL_regmatch_state = S_push_slab(aTHX);
3673
d6a28714 3674 /* Note that nextchr is a byte even in UTF */
7016d6eb 3675 SET_nextchr;
d6a28714
JH
3676 scan = prog;
3677 while (scan != NULL) {
8ba1375e 3678
a3621e74 3679 DEBUG_EXECUTE_r( {
6136c704 3680 SV * const prop = sv_newmortal();
1de06328 3681 regnode *rnext=regnext(scan);
f2ed9b32 3682 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3683 regprop(rex, prop, scan);
07be1b83
YO
3684
3685 PerlIO_printf(Perl_debug_log,
3686 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3687 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3688 SvPVX_const(prop),
1de06328 3689 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3690 0 : (IV)(rnext - rexi->program));
2a782b5b 3691 });
d6a28714
JH
3692
3693 next = scan + NEXT_OFF(scan);
3694 if (next == scan)
3695 next = NULL;
40a82448 3696 state_num = OP(scan);
d6a28714 3697
40a82448 3698 reenter_switch:
34a81e2b 3699
7016d6eb 3700 SET_nextchr;
e6ca698c 3701 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3702
40a82448 3703 switch (state_num) {
3c0563b9 3704 case BOL: /* /^../ */
7fba1cd6 3705 if (locinput == PL_bostr)
d6a28714 3706 {
3b0527fe 3707 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3708 break;
3709 }
d6a28714 3710 sayNO;
3c0563b9
DM
3711
3712 case MBOL: /* /^../m */
12d33761 3713 if (locinput == PL_bostr ||
7016d6eb 3714 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3715 {
b8c5462f
JH
3716 break;
3717 }
d6a28714 3718 sayNO;
3c0563b9
DM
3719
3720 case SBOL: /* /^../s */
c2a73568 3721 if (locinput == PL_bostr)
b8c5462f 3722 break;
d6a28714 3723 sayNO;
3c0563b9
DM
3724
3725 case GPOS: /* \G */
3b0527fe 3726 if (locinput == reginfo->ganch)
d6a28714
JH
3727 break;
3728 sayNO;
ee9b8eae 3729
3c0563b9 3730 case KEEPS: /* \K */
ee9b8eae 3731 /* update the startpoint */
b93070ed 3732 st->u.keeper.val = rex->offs[0].start;
b93070ed 3733 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3734 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3735 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3736 case KEEPS_next_fail:
3737 /* rollback the start point change */
b93070ed 3738 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3739 sayNO_SILENT;
878531d3 3740 assert(0); /*NOTREACHED*/
3c0563b9
DM
3741
3742 case EOL: /* /..$/ */
d6a28714 3743 goto seol;
3c0563b9
DM
3744
3745 case MEOL: /* /..$/m */
7016d6eb 3746 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3747 sayNO;
b8c5462f 3748 break;
3c0563b9
DM
3749
3750 case SEOL: /* /..$/s */
d6a28714 3751 seol:
7016d6eb 3752 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3753 sayNO;
d6a28714 3754 if (PL_regeol - locinput > 1)
b8c5462f 3755 sayNO;
b8c5462f 3756 break;
3c0563b9
DM
3757
3758 case EOS: /* \z */
7016d6eb 3759 if (!NEXTCHR_IS_EOS)
b8c5462f 3760 sayNO;
d6a28714 3761 break;
3c0563b9
DM
3762
3763 case SANY: /* /./s */
7016d6eb 3764 if (NEXTCHR_IS_EOS)
4633a7c4 3765 sayNO;
28b98f76 3766 goto increment_locinput;
3c0563b9
DM
3767
3768 case CANY: /* \C */
7016d6eb 3769 if (NEXTCHR_IS_EOS)
f33976b4 3770 sayNO;
3640db6b 3771 locinput++;
a0d0e21e 3772 break;
3c0563b9
DM
3773
3774 case REG_ANY: /* /./ */
7016d6eb 3775 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3776 sayNO;
28b98f76
DM
3777 goto increment_locinput;
3778
166ba7cd
DM
3779
3780#undef ST
3781#define ST st->u.trie
3c0563b9 3782 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3783 /* In this case the charclass data is available inline so
3784 we can fail fast without a lot of extra overhead.
3785 */
7016d6eb 3786 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3787 DEBUG_EXECUTE_r(
3788 PerlIO_printf(Perl_debug_log,
3789 "%*s %sfailed to match trie start class...%s\n",
3790 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3791 );
3792 sayNO_SILENT;
118e2215 3793 assert(0); /* NOTREACHED */
786e8c11
YO
3794 }
3795 /* FALL THROUGH */
3c0563b9 3796 case TRIE: /* (ab|cd) */
2e64971a
DM
3797 /* the basic plan of execution of the trie is:
3798 * At the beginning, run though all the states, and
3799 * find the longest-matching word. Also remember the position
3800 * of the shortest matching word. For example, this pattern:
3801 * 1 2 3 4 5
3802 * ab|a|x|abcd|abc
3803 * when matched against the string "abcde", will generate
3804 * accept states for all words except 3, with the longest
895cc420 3805 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3806 * the position being after char 1 of the string).
3807 *
3808 * Then for each matching word, in word order (i.e. 1,2,4,5),
3809 * we run the remainder of the pattern; on each try setting
3810 * the current position to the character following the word,
3811 * returning to try the next word on failure.
3812 *
3813 * We avoid having to build a list of words at runtime by
3814 * using a compile-time structure, wordinfo[].prev, which
3815 * gives, for each word, the previous accepting word (if any).
3816 * In the case above it would contain the mappings 1->2, 2->0,
3817 * 3->0, 4->5, 5->1. We can use this table to generate, from
3818 * the longest word (4 above), a list of all words, by
3819 * following the list of prev pointers; this gives us the
3820 * unordered list 4,5,1,2. Then given the current word we have
3821 * just tried, we can go through the list and find the
3822 * next-biggest word to try (so if we just failed on word 2,
3823 * the next in the list is 4).
3824 *
3825 * Since at runtime we don't record the matching position in
3826 * the string for each word, we have to work that out for
3827 * each word we're about to process. The wordinfo table holds
3828 * the character length of each word; given that we recorded
3829 * at the start: the position of the shortest word and its
3830 * length in chars, we just need to move the pointer the
3831 * difference between the two char lengths. Depending on
3832 * Unicode status and folding, that's cheap or expensive.
3833 *
3834 * This algorithm is optimised for the case where are only a
3835 * small number of accept states, i.e. 0,1, or maybe 2.
3836 * With lots of accepts states, and having to try all of them,
3837 * it becomes quadratic on number of accept states to find all
3838 * the next words.
3839 */
3840
3dab1dad 3841 {
07be1b83 3842 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3843 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3844
3845 /* what trie are we using right now */
be8e71aa 3846 reg_trie_data * const trie
f8fc2ecf 3847 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3848 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3849 U32 state = trie->startstate;
166ba7cd 3850
7016d6eb
DM
3851 if ( trie->bitmap
3852 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3853 {
3dab1dad
YO
3854 if (trie->states[ state ].wordnum) {
3855 DEBUG_EXECUTE_r(
3856 PerlIO_printf(Perl_debug_log,
3857 "%*s %smatched empty string...%s\n",
5bc10b2c 3858 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3859 );
20dbff7c
YO
3860 if (!trie->jump)
3861 break;
3dab1dad
YO
3862 } else {
3863 DEBUG_EXECUTE_r(
3864 PerlIO_printf(Perl_debug_log,
786e8c11 3865 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3866 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3867 );
3868 sayNO_SILENT;
3869 }
3870 }
166ba7cd 3871
786e8c11
YO
3872 {
3873 U8 *uc = ( U8* )locinput;
3874
3875 STRLEN len = 0;
3876 STRLEN foldlen = 0;
3877 U8 *uscan = (U8*)NULL;
786e8c11 3878 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3879 U32 charcount = 0; /* how many input chars we have matched */
3880 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3881
786e8c11 3882 ST.jump = trie->jump;
786e8c11 3883 ST.me = scan;
2e64971a
DM
3884 ST.firstpos = NULL;
3885 ST.longfold = FALSE; /* char longer if folded => it's harder */
3886 ST.nextword = 0;
3887
3888 /* fully traverse the TRIE; note the position of the
3889 shortest accept state and the wordnum of the longest
3890 accept state */
07be1b83 3891
a3621e74 3892 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3893 U32 base = trie->states[ state ].trans.base;
f9f4320a 3894 UV uvc = 0;
acb909b4 3895 U16 charid = 0;
2e64971a
DM
3896 U16 wordnum;
3897 wordnum = trie->states[ state ].wordnum;
3898
3899 if (wordnum) { /* it's an accept state */
3900 if (!accepted) {
3901 accepted = 1;
3902 /* record first match position */
3903 if (ST.longfold) {
3904 ST.firstpos = (U8*)locinput;
3905 ST.firstchars = 0;
5b47454d 3906 }
2e64971a
DM
3907 else {
3908 ST.firstpos = uc;
3909 ST.firstchars = charcount;
3910 }
3911 }
3912 if (!ST.nextword || wordnum < ST.nextword)
3913 ST.nextword = wordnum;
3914 ST.topword = wordnum;
786e8c11 3915 }
a3621e74 3916
07be1b83 3917 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3918 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3919 PerlIO_printf( Perl_debug_log,
2e64971a 3920 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3921 2+depth * 2, "", PL_colors[4],
2e64971a 3922 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3923 });
a3621e74 3924
2e64971a 3925 /* read a char and goto next state */
7016d6eb 3926 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3927 I32 offset;
55eed653
NC
3928 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3929 uscan, len, uvc, charid, foldlen,
3930 foldbuf, uniflags);
2e64971a
DM
3931 charcount++;
3932 if (foldlen>0)
3933 ST.longfold = TRUE;
5b47454d 3934 if (charid &&
6dd2be57
DM
3935 ( ((offset =
3936 base + charid - 1 - trie->uniquecharcount)) >= 0)
3937
3938 && ((U32)offset < trie->lasttrans)
3939 && trie->trans[offset].check == state)
5b47454d 3940 {
6dd2be57 3941 state = trie->trans[offset].next;
5b47454d
DM
3942 }
3943 else {
3944 state = 0;
3945 }
3946 uc += len;
3947
3948 }
3949 else {
a3621e74
YO
3950 state = 0;
3951 }
3952 DEBUG_TRIE_EXECUTE_r(
e4584336 3953 PerlIO_printf( Perl_debug_log,
786e8c11 3954 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3955 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3956 );
3957 }
2e64971a 3958 if (!accepted)
a3621e74 3959 sayNO;
a3621e74 3960
2e64971a
DM
3961 /* calculate total number of accept states */
3962 {
3963 U16 w = ST.topword;
3964 accepted = 0;
3965 while (w) {
3966 w = trie->wordinfo[w].prev;
3967 accepted++;
3968 }
3969 ST.accepted = accepted;
3970 }
3971
166ba7cd
DM
3972 DEBUG_EXECUTE_r(
3973 PerlIO_printf( Perl_debug_log,
3974 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3975 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3976 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3977 );
2e64971a 3978 goto trie_first_try; /* jump into the fail handler */
786e8c11 3979 }}
118e2215 3980 assert(0); /* NOTREACHED */
2e64971a
DM
3981
3982 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3983 {
3984 U8 *uc;
fae667d5
YO
3985 if ( ST.jump) {
3986 REGCP_UNWIND(ST.cp);
a8d1f4b4 3987 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3988 }
2e64971a
DM
3989 if (!--ST.accepted) {
3990 DEBUG_EXECUTE_r({
3991 PerlIO_printf( Perl_debug_log,
3992 "%*s %sTRIE failed...%s\n",
3993 REPORT_CODE_OFF+depth*2, "",
3994 PL_colors[4],
3995 PL_colors[5] );
3996 });
3997 sayNO_SILENT;
3998 }
3999 {
4000 /* Find next-highest word to process. Note that this code
4001 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
4002 U16 min = 0;
4003 U16 word;
4004 U16 const nextword = ST.nextword;
4005 reg_trie_wordinfo * const wordinfo
2e64971a
DM
4006 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4007 for (word=ST.topword; word; word=wordinfo[word].prev) {
4008 if (word > nextword && (!min || word < min))
4009 min = word;
4010 }
4011 ST.nextword = min;
4012 }
4013
fae667d5 4014 trie_first_try:
5d458dd8
YO
4015 if (do_cutgroup) {
4016 do_cutgroup = 0;
4017 no_final = 0;
4018 }
fae667d5
YO
4019
4020 if ( ST.jump) {
b93070ed 4021 ST.lastparen = rex->lastparen;
f6033a9d 4022 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 4023 REGCP_SET(ST.cp);
2e64971a 4024 }
a3621e74 4025
2e64971a 4026 /* find start char of end of current word */
166ba7cd 4027 {
2e64971a 4028 U32 chars; /* how many chars to skip */
2e64971a
DM
4029 reg_trie_data * const trie
4030 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4031
4032 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4033 >= ST.firstchars);
4034 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035 - ST.firstchars;
a059a757 4036 uc = ST.firstpos;
2e64971a
DM
4037
4038 if (ST.longfold) {
4039 /* the hard option - fold each char in turn and find
4040 * its folded length (which may be different */
4041 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4042 STRLEN foldlen;
4043 STRLEN len;
d9a396a3 4044 UV uvc;
2e64971a
DM
4045 U8 *uscan;
4046
4047 while (chars) {
f2ed9b32 4048 if (utf8_target) {
2e64971a
DM
4049 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4050 uniflags);
4051 uc += len;
4052 }
4053 else {
4054 uvc = *uc;
4055 uc++;
4056 }
4057 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4058 uscan = foldbuf;
4059 while (foldlen) {
4060 if (!--chars)
4061 break;
4062 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4063 uniflags);
4064 uscan += len;
4065 foldlen -= len;
4066 }
4067 }
a3621e74 4068 }
2e64971a 4069 else {
f2ed9b32 4070 if (utf8_target)
2e64971a
DM
4071 while (chars--)
4072 uc += UTF8SKIP(uc);
4073 else
4074 uc += chars;
4075 }
2e64971a 4076 }
166ba7cd 4077
6603fe3e
DM
4078 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4079 ? ST.jump[ST.nextword]
4080 : NEXT_OFF(ST.me));
166ba7cd 4081
2e64971a
DM
4082 DEBUG_EXECUTE_r({
4083 PerlIO_printf( Perl_debug_log,
4084 "%*s %sTRIE matched word #%d, continuing%s\n",
4085 REPORT_CODE_OFF+depth*2, "",
4086 PL_colors[4],
4087 ST.nextword,
4088 PL_colors[5]
4089 );
4090 });
4091
4092 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4093 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4094 assert(0); /* NOTREACHED */
166ba7cd 4095 }
2e64971a
DM
4096 /* only one choice left - just continue */
4097 DEBUG_EXECUTE_r({
4098 AV *const trie_words
4099 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4100 SV ** const tmp = av_fetch( trie_words,
4101 ST.nextword-1, 0 );
4102 SV *sv= tmp ? sv_newmortal() : NULL;
4103
4104 PerlIO_printf( Perl_debug_log,
4105 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4106 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4107 ST.nextword,
4108 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4109 PL_colors[0], PL_colors[1],
c89df6cf 4110 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4111 )
4112 : "not compiled under -Dr",
4113 PL_colors[5] );
4114 });
4115
a059a757 4116 locinput = (char*)uc;
2e64971a 4117 continue; /* execute rest of RE */
118e2215 4118 assert(0); /* NOTREACHED */
a059a757 4119 }
166ba7cd
DM
4120#undef ST
4121
3c0563b9 4122 case EXACT: { /* /abc/ */
95b24440 4123 char *s = STRING(scan);
24d3c4a9 4124 ln = STR_LEN(scan);
f2ed9b32 4125 if (utf8_target != UTF_PATTERN) {
bc517b45 4126 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4127 char *l = locinput;
24d3c4a9 4128 const char * const e = s + ln;
a72c7584 4129
f2ed9b32 4130 if (utf8_target) {
e6a3850e
KW
4131 /* The target is utf8, the pattern is not utf8.
4132 * Above-Latin1 code points can't match the pattern;
4133 * invariants match exactly, and the other Latin1 ones need
4134 * to be downgraded to a single byte in order to do the
4135 * comparison. (If we could be confident that the target
4136 * is not malformed, this could be refactored to have fewer
4137 * tests by just assuming that if the first bytes match, it
4138 * is an invariant, but there are tests in the test suite
4139 * dealing with (??{...}) which violate this) */
1aa99e6b
IH
4140 while (s < e) {
4141 if (l >= PL_regeol)
5ff6fc6d 4142 sayNO;
e6a3850e
KW
4143 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4144 sayNO;
4145 }
4146 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4147 if (*l != *s) {
4148 sayNO;
4149 }
4150 l++;
4151 }
4152 else {
4153 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4154 sayNO;
4155 }
4156 l += 2;
4157 }
4158 s++;
1aa99e6b 4159 }
5ff6fc6d
JH
4160 }
4161 else {
4162 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4163 while (s < e) {
e6a3850e
KW
4164 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4165 {
4166 sayNO;
4167 }
4168 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4169 if (*s != *l) {
4170 sayNO;
4171 }
4172 s++;
4173 }
4174 else {
4175 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4176 sayNO;
4177 }
4178 s += 2;
4179 }
4180 l++;
1aa99e6b 4181 }
5ff6fc6d 4182 }
1aa99e6b 4183 locinput = l;
1aa99e6b
IH
4184 break;
4185 }
bc517b45 4186 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
4187 /* Inline the first character, for speed. */
4188 if (UCHARAT(s) != nextchr)
4189 sayNO;
24d3c4a9 4190 if (PL_regeol - locinput < ln)
d6a28714 4191 sayNO;
24d3c4a9 4192 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 4193 sayNO;
24d3c4a9 4194 locinput += ln;
d6a28714 4195 break;
95b24440 4196 }
7016d6eb 4197
3c0563b9 4198 case EXACTFL: { /* /abc/il */
a932d541 4199 re_fold_t folder;
9a5a5549
KW
4200 const U8 * fold_array;
4201 const char * s;
d513472c 4202 U32 fold_utf8_flags;
9a5a5549 4203
b8c5462f 4204 PL_reg_flags |= RF_tainted;
f67f9e53
KW
4205 folder = foldEQ_locale;
4206 fold_array = PL_fold_locale;
17580e7a 4207 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4208 goto do_exactf;
4209
3c0563b9
DM
4210 case EXACTFU_SS: /* /\x{df}/iu */
4211 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4212 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4213 folder = foldEQ_latin1;
4214 fold_array = PL_fold_latin1;
2daa8fee 4215 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4216 goto do_exactf;
4217
3c0563b9 4218 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4219 folder = foldEQ_latin1;
4220 fold_array = PL_fold_latin1;
57014d77 4221 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4222 goto do_exactf;
4223
3c0563b9 4224 case EXACTF: /* /abc/i */
9a5a5549
KW
4225 folder = foldEQ;
4226 fold_array = PL_fold;
62bf7766 4227 fold_utf8_flags = 0;
9a5a5549
KW
4228
4229 do_exactf:
4230 s = STRING(scan);
24d3c4a9 4231 ln = STR_LEN(scan);
d6a28714 4232
3c760661
KW
4233 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4234 /* Either target or the pattern are utf8, or has the issue where
4235 * the fold lengths may differ. */
be8e71aa 4236 const char * const l = locinput;
d07ddd77 4237 char *e = PL_regeol;
bc517b45 4238
d513472c 4239 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 4240 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4241 {
4242 sayNO;
5486206c 4243 }
d07ddd77 4244 locinput = e;
d07ddd77 4245 break;
a0ed51b3 4246 }
d6a28714 4247
0a138b74 4248 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4249 if (UCHARAT(s) != nextchr
4250 && !NEXTCHR_IS_EOS
4251 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4252 {
a0ed51b3 4253 sayNO;
9a5a5549 4254 }
24d3c4a9 4255 if (PL_regeol - locinput < ln)
b8c5462f 4256 sayNO;
9a5a5549 4257 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4258 sayNO;
24d3c4a9 4259 locinput += ln;
a0d0e21e 4260 break;
9a5a5549 4261 }
63ac0dad
KW
4262
4263 /* XXX Could improve efficiency by separating these all out using a
4264 * macro or in-line function. At that point regcomp.c would no longer
4265 * have to set the FLAGS fields of these */
3c0563b9
DM
4266 case BOUNDL: /* /\b/l */
4267 case NBOUNDL: /* /\B/l */
b2680017
YO
4268 PL_reg_flags |= RF_tainted;
4269 /* FALL THROUGH */
3c0563b9
DM
4270 case BOUND: /* /\b/ */
4271 case BOUNDU: /* /\b/u */
4272 case BOUNDA: /* /\b/a */
4273 case NBOUND: /* /\B/ */
4274 case NBOUNDU: /* /\B/u */
4275 case NBOUNDA: /* /\B/a */
b2680017 4276 /* was last char in word? */
f2e96b5d
KW
4277 if (utf8_target
4278 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4279 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4280 {
b2680017
YO
4281 if (locinput == PL_bostr)
4282 ln = '\n';
4283 else {
4284 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4285
4286 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4287 }
63ac0dad 4288 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017 4289 ln = isALNUM_uni(ln);
7016d6eb
DM
4290 if (NEXTCHR_IS_EOS)
4291 n = 0;
4292 else {
4293 LOAD_UTF8_CHARCLASS_ALNUM();
4294 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4295 utf8_target);
4296 }
b2680017
YO
4297 }
4298 else {
4299 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
7016d6eb 4300 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
b2680017
YO
4301 }
4302 }
4303 else {
cfaf538b
KW
4304
4305 /* Here the string isn't utf8, or is utf8 and only ascii
4306 * characters are to match \w. In the latter case looking at
4307 * the byte just prior to the current one may be just the final
4308 * byte of a multi-byte character. This is ok. There are two
4309 * cases:
4310 * 1) it is a single byte character, and then the test is doing
4311 * just what it's supposed to.
4312 * 2) it is a multi-byte character, in which case the final
4313 * byte is never mistakable for ASCII, and so the test
4314 * will say it is not a word character, which is the
4315 * correct answer. */
b2680017
YO
4316 ln = (locinput != PL_bostr) ?
4317 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4318 switch (FLAGS(scan)) {
4319 case REGEX_UNICODE_CHARSET:
4320 ln = isWORDCHAR_L1(ln);
7016d6eb 4321 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4322 break;
4323 case REGEX_LOCALE_CHARSET:
4324 ln = isALNUM_LC(ln);
7016d6eb 4325 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
63ac0dad
KW
4326 break;
4327 case REGEX_DEPENDS_CHARSET:
4328 ln = isALNUM(ln);
7016d6eb 4329 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
63ac0dad 4330 break;
cfaf538b 4331 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4332 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4333 ln = isWORDCHAR_A(ln);
7016d6eb 4334 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4335 break;
63ac0dad
KW
4336 default:
4337 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4338 break;
b2680017
YO
4339 }
4340 }
63ac0dad
KW
4341 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4342 * regcomp.sym */
4343 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4344 sayNO;
4345 break;
3c0563b9 4346
3c0563b9 4347 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4348 if (NEXTCHR_IS_EOS)
4349 sayNO;
e0193e47 4350 if (utf8_target) {
635cd5d4 4351 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4352 sayNO;
635cd5d4 4353 locinput += UTF8SKIP(locinput);
e0f9d4a8 4354 break;
ffc61ed2
JH
4355 }
4356 else {
20ed0b26 4357 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4358 sayNO;
3640db6b 4359 locinput++;
e0f9d4a8
JH
4360 break;
4361 }
b8c5462f 4362 break;
3c0563b9
DM
4363
4364 /* Special char classes: \d, \w etc.
4365 * The defines start on line 166 or so */
ee9a90b8
KW
4366 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4367 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4368 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 4369 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 4370 alnum, "a");
ee9a90b8 4371
bedac28b
KW
4372 case SPACEL:
4373 PL_reg_flags |= RF_tainted;
4374 if (NEXTCHR_IS_EOS) {
4375 sayNO;
4376 }
4377 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4378 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4379 sayNO;
4380 }
4381 }
4382 else if (! isSPACE_LC((U8) nextchr)) {
4383 sayNO;
4384 }
4385 goto increment_locinput;
4386
4387 case NSPACEL:
4388 PL_reg_flags |= RF_tainted;
4389 if (NEXTCHR_IS_EOS) {
4390 sayNO;
4391 }
4392 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4393 if (isSPACE_LC_utf8((U8 *) locinput)) {
4394 sayNO;
4395 }
4396 }
4397 else if (isSPACE_LC(nextchr)) {
4398 sayNO;
4399 }
4400 goto increment_locinput;
4401
4402 case SPACE:
4403 if (utf8_target) {
4404 goto utf8_space;
4405 }
4406 /* FALL THROUGH */
4407 case SPACEA:
4408 if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4409 sayNO;
4410 }
4411 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4412 locinput++;
4413 break;
4414
4415 case NSPACE:
4416 if (utf8_target) {
4417 goto utf8_nspace;
4418 }
4419 /* FALL THROUGH */
4420 case NSPACEA:
4421 if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4422 sayNO;
4423 }
4424 goto increment_locinput;
4425
4426 case SPACEU:
4427 utf8_space:
4428 if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4429 sayNO;
4430 }
4431 goto increment_locinput;
4432
4433 case NSPACEU:
4434 utf8_nspace:
4435 if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4436 sayNO;
4437 }
4438 goto increment_locinput;
ee9a90b8
KW
4439
4440 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4441 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 4442 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 4443 digit, "0");
20d0b1e9 4444
3c0563b9 4445 case POSIXA: /* /[[:ascii:]]/ etc */
7016d6eb 4446 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4447 sayNO;
4448 }
4449 /* Matched a utf8-invariant, so don't have to worry about utf8 */
3640db6b 4450 locinput++;
0658cdde 4451 break;
3c0563b9
DM
4452
4453 case NPOSIXA: /* /[^[:ascii:]]/ etc */
7016d6eb 4454 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4455 sayNO;
4456 }
28b98f76 4457 goto increment_locinput;
0658cdde 4458
37e2e78e
KW
4459 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4460 a Unicode extended Grapheme Cluster */
4461 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4462 extended Grapheme Cluster is:
4463
4464 CR LF
4465 | Prepend* Begin Extend*
4466 | .
4467
1e958ea9
KW
4468 Begin is: ( Special_Begin | ! Control )
4469 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4470 Extend is: ( Grapheme_Extend | Spacing_Mark )
f67f9e53 4471 Control is: [ GCB_Control | CR | LF ]
1e958ea9 4472 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4473
27d4fc33
KW
4474 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4475 we can rewrite
4476
4477 Begin is ( Regular_Begin + Special Begin )
4478
4479 It turns out that 98.4% of all Unicode code points match
4480 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4481 the previous implementation for almost all Unicode code points.
27d4fc33 4482
37e2e78e
KW
4483 There is a subtlety with Prepend* which showed up in testing.
4484 Note that the Begin, and only the Begin is required in:
4485 | Prepend* Begin Extend*
cc3b396d
KW
4486 Also, Begin contains '! Control'. A Prepend must be a
4487 '! Control', which means it must also be a Begin. What it
4488 comes down to is that if we match Prepend* and then find no
4489 suitable Begin afterwards, that if we backtrack the last
4490 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4491 */
4492
7016d6eb 4493 if (NEXTCHR_IS_EOS)
a0ed51b3 4494 sayNO;
f2ed9b32 4495 if (! utf8_target) {
37e2e78e
KW
4496
4497 /* Match either CR LF or '.', as all the other possibilities
4498 * require utf8 */
4499 locinput++; /* Match the . or CR */
cc3b396d
KW
4500 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4501 match the LF */
37e2e78e 4502 && locinput < PL_regeol
e699a1d5
KW
4503 && UCHARAT(locinput) == '\n')
4504 {
4505 locinput++;
4506 }
37e2e78e
KW
4507 }
4508 else {
4509
4510 /* Utf8: See if is ( CR LF ); already know that locinput <
4511 * PL_regeol, so locinput+1 is in bounds */
7016d6eb 4512 if ( nextchr == '\r' && locinput+1 < PL_regeol
f67f9e53 4513 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4514 {
37e2e78e
KW
4515 locinput += 2;
4516 }
4517 else {
45fdf108
KW
4518 STRLEN len;
4519
37e2e78e
KW
4520 /* In case have to backtrack to beginning, then match '.' */
4521 char *starting = locinput;
4522
4523 /* In case have to backtrack the last prepend */
e699a1d5 4524 char *previous_prepend = NULL;
37e2e78e
KW
4525
4526 LOAD_UTF8_CHARCLASS_GCB();
4527
45fdf108
KW
4528 /* Match (prepend)* */
4529 while (locinput < PL_regeol
4530 && (len = is_GCB_Prepend_utf8(locinput)))
4531 {
4532 previous_prepend = locinput;
4533 locinput += len;
a1853d78 4534 }
37e2e78e
KW
4535
4536 /* As noted above, if we matched a prepend character, but
4537 * the next thing won't match, back off the last prepend we
4538 * matched, as it is guaranteed to match the begin */
4539 if (previous_prepend
4540 && (locinput >= PL_regeol
c101f46d
KW
4541 || (! swash_fetch(PL_utf8_X_regular_begin,
4542 (U8*)locinput, utf8_target)
bff53399 4543 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4544 )
37e2e78e
KW
4545 {
4546 locinput = previous_prepend;
4547 }
4548
4549 /* Note that here we know PL_regeol > locinput, as we
4550 * tested that upon input to this switch case, and if we
4551 * moved locinput forward, we tested the result just above
4552 * and it either passed, or we backed off so that it will
4553 * now pass */
11dfcd49
KW
4554 if (swash_fetch(PL_utf8_X_regular_begin,
4555 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4556 locinput += UTF8SKIP(locinput);
4557 }
bff53399 4558 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4559
4560 /* Here did not match the required 'Begin' in the
4561 * second term. So just match the very first
4562 * character, the '.' of the final term of the regex */
4563 locinput = starting + UTF8SKIP(starting);
27d4fc33 4564 goto exit_utf8;
37e2e78e
KW
4565 } else {
4566
11dfcd49
KW
4567 /* Here is a special begin. It can be composed of
4568 * several individual characters. One possibility is
4569 * RI+ */
45fdf108
KW
4570 if ((len = is_GCB_RI_utf8(locinput))) {
4571 locinput += len;
11dfcd49 4572 while (locinput < PL_regeol
45fdf108 4573 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4574 {
45fdf108 4575 locinput += len;
11dfcd49 4576 }
45fdf108
KW
4577 } else if ((len = is_GCB_T_utf8(locinput))) {
4578 /* Another possibility is T+ */
4579 locinput += len;
11dfcd49 4580 while (locinput < PL_regeol
45fdf108 4581 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4582 {
45fdf108 4583 locinput += len;
11dfcd49
KW
4584 }
4585 } else {
4586
4587 /* Here, neither RI+ nor T+; must be some other
4588 * Hangul. That means it is one of the others: L,
4589 * LV, LVT or V, and matches:
4590 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4591
4592 /* Match L* */
4593 while (locinput < PL_regeol
45fdf108 4594 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4595 {
45fdf108 4596 locinput += len;
11dfcd49 4597 }
37e2e78e 4598
11dfcd49
KW
4599 /* Here, have exhausted L*. If the next character
4600 * is not an LV, LVT nor V, it means we had to have
4601 * at least one L, so matches L+ in the original
4602 * equation, we have a complete hangul syllable.
4603 * Are done. */
4604
4605 if (locinput < PL_regeol
45fdf108 4606 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4607 {
11dfcd49 4608 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4609 * See if LVT, by first ruling out V, then LV */
4610 if (! is_GCB_V_utf8(locinput)
4611 /* All but every TCount one is LV */
4612 && (valid_utf8_to_uvchr((U8 *) locinput,
4613 NULL)
4614 - SBASE)
4615 % TCount != 0)
4616 {
11dfcd49
KW
4617 locinput += UTF8SKIP(locinput);
4618 } else {
4619
4620 /* Must be V or LV. Take it, then match
4621 * V* */
4622 locinput += UTF8SKIP(locinput);
4623 while (locinput < PL_regeol
45fdf108 4624 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4625 {
45fdf108 4626 locinput += len;
11dfcd49
KW
4627 }
4628 }
37e2e78e 4629
11dfcd49 4630 /* And any of LV, LVT, or V can be followed
45fdf108 4631 * by T* */
11dfcd49 4632 while (locinput < PL_regeol
45fdf108 4633 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4634 {
45fdf108 4635 locinput += len;
11dfcd49
KW
4636 }
4637 }
cd94d768 4638 }
11dfcd49 4639 }
37e2e78e 4640
11dfcd49
KW
4641 /* Match any extender */
4642 while (locinput < PL_regeol
4643 && swash_fetch(PL_utf8_X_extend,
4644 (U8*)locinput, utf8_target))
4645 {
4646 locinput += UTF8SKIP(locinput);
4647 }
37e2e78e 4648 }
27d4fc33 4649 exit_utf8:
37e2e78e
KW
4650 if (locinput > PL_regeol) sayNO;
4651 }
a0ed51b3 4652 break;
81714fb9 4653
3c0563b9 4654 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4655 { /* The capture buffer cases. The ones beginning with N for the
4656 named buffers just convert to the equivalent numbered and
4657 pretend they were called as the corresponding numbered buffer
4658 op. */
26ecd678
TC
4659 /* don't initialize these in the declaration, it makes C++
4660 unhappy */
81714fb9 4661 char *s;
ff1157ca 4662 char type;
8368298a
TC
4663 re_fold_t folder;
4664 const U8 *fold_array;
26ecd678 4665 UV utf8_fold_flags;
8368298a 4666
81714fb9 4667 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4668 folder = foldEQ_locale;
4669 fold_array = PL_fold_locale;
4670 type = REFFL;
17580e7a 4671 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4672 goto do_nref;
4673
3c0563b9 4674 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4675 folder = foldEQ_latin1;
4676 fold_array = PL_fold_latin1;
4677 type = REFFA;
4678 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4679 goto do_nref;
4680
3c0563b9 4681 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4682 folder = foldEQ_latin1;
4683 fold_array = PL_fold_latin1;
4684 type = REFFU;
d513472c 4685 utf8_fold_flags = 0;
d7ef4b73
KW
4686 goto do_nref;
4687
3c0563b9 4688 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4689 folder = foldEQ;
4690 fold_array = PL_fold;
4691 type = REFF;
d513472c 4692 utf8_fold_flags = 0;
d7ef4b73
KW
4693 goto do_nref;
4694
3c0563b9 4695 case NREF: /* /\g{name}/ */
d7ef4b73 4696 type = REF;
83d7b90b
KW
4697 folder = NULL;
4698 fold_array = NULL;
d513472c 4699 utf8_fold_flags = 0;
d7ef4b73
KW
4700 do_nref:
4701
4702 /* For the named back references, find the corresponding buffer
4703 * number */
0a4db386
YO
4704 n = reg_check_named_buff_matched(rex,scan);
4705
d7ef4b73 4706 if ( ! n ) {
81714fb9 4707 sayNO;
d7ef4b73
KW
4708 }
4709 goto do_nref_ref_common;
4710
3c0563b9 4711 case REFFL: /* /\1/il */
3280af22 4712 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4713 folder = foldEQ_locale;
4714 fold_array = PL_fold_locale;
17580e7a 4715 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4716 goto do_ref;
4717
3c0563b9 4718 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4719 folder = foldEQ_latin1;
4720 fold_array = PL_fold_latin1;
4721 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4722 goto do_ref;
4723
3c0563b9 4724 case REFFU: /* /\1/iu */
d7ef4b73
KW
4725 folder = foldEQ_latin1;
4726 fold_array = PL_fold_latin1;
d513472c 4727 utf8_fold_flags = 0;
d7ef4b73
KW
4728 goto do_ref;
4729
3c0563b9 4730 case REFF: /* /\1/i */
d7ef4b73
KW
4731 folder = foldEQ;
4732 fold_array = PL_fold;
d513472c 4733 utf8_fold_flags = 0;
83d7b90b 4734 goto do_ref;
d7ef4b73 4735
3c0563b9 4736 case REF: /* /\1/ */
83d7b90b
KW
4737 folder = NULL;
4738 fold_array = NULL;
d513472c 4739 utf8_fold_flags = 0;
83d7b90b 4740
d7ef4b73 4741 do_ref:
81714fb9 4742 type = OP(scan);
d7ef4b73
KW
4743 n = ARG(scan); /* which paren pair */
4744
4745 do_nref_ref_common:
b93070ed 4746 ln = rex->offs[n].start;
2c2d71f5 4747 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4748 if (rex->lastparen < n || ln == -1)
af3f8c16 4749 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4750 if (ln == rex->offs[n].end)
a0d0e21e 4751 break;
a0ed51b3 4752
24d3c4a9 4753 s = PL_bostr + ln;
d7ef4b73 4754 if (type != REF /* REF can do byte comparison */
2f65c56d 4755 && (utf8_target || type == REFFU))
d7ef4b73
KW
4756 { /* XXX handle REFFL better */
4757 char * limit = PL_regeol;
4758
4759 /* This call case insensitively compares the entire buffer
4760 * at s, with the current input starting at locinput, but
4761 * not going off the end given by PL_regeol, and returns in
2db97d41 4762 * <limit> upon success, how much of the current input was
d7ef4b73 4763 * matched */
b93070ed 4764 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4765 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4766 {
4767 sayNO;
a0ed51b3 4768 }
d7ef4b73 4769 locinput = limit;
a0ed51b3
LW
4770 break;
4771 }
4772
d7ef4b73 4773 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4774 if (!NEXTCHR_IS_EOS &&
4775 UCHARAT(s) != nextchr &&
81714fb9 4776 (type == REF ||
d7ef4b73 4777 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4778 sayNO;
b93070ed 4779 ln = rex->offs[n].end - ln;
24d3c4a9 4780 if (locinput + ln > PL_regeol)
4633a7c4 4781 sayNO;
81714fb9 4782 if (ln > 1 && (type == REF
24d3c4a9 4783 ? memNE(s, locinput, ln)
d7ef4b73 4784 : ! folder(s, locinput, ln)))
4633a7c4 4785 sayNO;
24d3c4a9 4786 locinput += ln;
a0d0e21e 4787 break;
81714fb9 4788 }
3c0563b9
DM
4789
4790 case NOTHING: /* null op; e.g. the 'nothing' following
4791 * the '*' in m{(a+|b)*}' */
4792 break;
4793 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4794 break;
3c0563b9
DM
4795
4796 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4797 break;
40a82448
DM
4798
4799#undef ST
4800#define ST st->u.eval
c277df42 4801 {
c277df42 4802 SV *ret;
d2f13c59 4803 REGEXP *re_sv;
6bda09f9 4804 regexp *re;
f8fc2ecf 4805 regexp_internal *rei;
1a147d38
YO
4806 regnode *startpoint;
4807
3c0563b9 4808 case GOSTART: /* (?R) */
e7707071
YO
4809 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4810 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4811 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4812 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4813 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4814 Perl_croak(aTHX_
4815 "Pattern subroutine nesting without pos change"
4816 " exceeded limit in regex");
6bda09f9
YO
4817 } else {
4818 nochange_depth = 0;
1a147d38 4819 }
288b8c02 4820 re_sv = rex_sv;
6bda09f9 4821 re = rex;
f8fc2ecf 4822 rei = rexi;
1a147d38 4823 if (OP(scan)==GOSUB) {
6bda09f9
YO
4824 startpoint = scan + ARG2L(scan);
4825 ST.close_paren = ARG(scan);
4826 } else {
f8fc2ecf 4827 startpoint = rei->program+1;
6bda09f9
YO
4828 ST.close_paren = 0;
4829 }
4830 goto eval_recurse_doit;
118e2215 4831 assert(0); /* NOTREACHED */
3c0563b9 4832
6bda09f9
YO
4833 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4834 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4835 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4836 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4837 } else {
4838 nochange_depth = 0;
4839 }
8e5e9ebe 4840 {
4aabdb9b 4841 /* execute the code in the {...} */
81ed78b2 4842
4aabdb9b 4843 dSP;
a6dc34f1 4844 IV before;
1f4d1a1e 4845 OP * const oop = PL_op;
4aabdb9b 4846 COP * const ocurcop = PL_curcop;
81ed78b2 4847 OP *nop;
d80618d2 4848 char *saved_regeol = PL_regeol;
91332126 4849 struct re_save_state saved_state;
81ed78b2 4850 CV *newcv;
91332126 4851
74088413 4852 /* save *all* paren positions */
92da3157 4853 regcppush(rex, 0, maxopenparen);
74088413
DM
4854 REGCP_SET(runops_cp);
4855
6562f1c4 4856 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4857 * eval we would normally put it on the save stack, like with
4858 * save_re_context. However, re-evals have a weird scoping so we
4859 * can't just add ENTER/LEAVE here. With that, things like
4860 *
4861 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4862 *
4863 * would break, as they expect the localisation to be unwound
4864 * only when the re-engine backtracks through the bit that
4865 * localised it.
4866 *
4867 * What we do instead is just saving the state in a local c
4868 * variable.
4869 */
91332126 4870 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4871
d24ca0c5 4872 PL_reg_state.re_reparsing = FALSE;
91332126 4873
81ed78b2
DM
4874 if (!caller_cv)
4875 caller_cv = find_runcv(NULL);
4876
4aabdb9b 4877 n = ARG(scan);
81ed78b2 4878
b30fcab9 4879 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4880 newcv = (ReANY(
b30fcab9
DM
4881 (REGEXP*)(rexi->data->data[n])
4882 ))->qr_anoncv
81ed78b2
DM
4883 ;
4884 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4885 }
4886 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4887 newcv = caller_cv;
4888 nop = (OP*)rexi->data->data[n];
4889 assert(CvDEPTH(newcv));
68e2671b
DM
4890 }
4891 else {
d24ca0c5
DM
4892 /* literal with own CV */
4893 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4894 newcv = rex->qr_anoncv;
4895 nop = (OP*)rexi->data->data[n];
68e2671b 4896 }
81ed78b2 4897
0e458318
DM
4898 /* normally if we're about to execute code from the same
4899 * CV that we used previously, we just use the existing
4900 * CX stack entry. However, its possible that in the
4901 * meantime we may have backtracked, popped from the save
4902 * stack, and undone the SAVECOMPPAD(s) associated with
4903 * PUSH_MULTICALL; in which case PL_comppad no longer
4904 * points to newcv's pad. */
4905 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4906 {
4907 I32 depth = (newcv == caller_cv) ? 0 : 1;
4908 if (last_pushed_cv) {
4909 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4910 }
4911 else {
4912 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4913 }
4914 last_pushed_cv = newcv;
4915 }
c31ee3bb
DM
4916 else {
4917 /* these assignments are just to silence compiler
4918 * warnings */
4919 multicall_cop = NULL;
4920 newsp = NULL;
4921 }
0e458318
DM
4922 last_pad = PL_comppad;
4923
2e2e3f36
DM
4924 /* the initial nextstate you would normally execute
4925 * at the start of an eval (which would cause error
4926 * messages to come from the eval), may be optimised
4927 * away from the execution path in the regex code blocks;
4928 * so manually set PL_curcop to it initially */
4929 {
81ed78b2 4930 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4931 assert(o->op_type == OP_NULL);
4932 if (o->op_targ == OP_SCOPE) {
4933 o = cUNOPo->op_first;
4934 }
4935 else {
4936 assert(o->op_targ == OP_LEAVE);
4937 o = cUNOPo->op_first;
4938 assert(o->op_type == OP_ENTER);
4939 o = o->op_sibling;
4940 }
4941
4942 if (o->op_type != OP_STUB) {
4943 assert( o->op_type == OP_NEXTSTATE
4944 || o->op_type == OP_DBSTATE
4945 || (o->op_type == OP_NULL
4946 && ( o->op_targ == OP_NEXTSTATE
4947 || o->op_targ == OP_DBSTATE
4948 )
4949 )
4950 );
4951 PL_curcop = (COP*)o;
4952 }
4953 }
81ed78b2 4954 nop = nop->op_next;
2e2e3f36 4955
24b23f37 4956 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4957 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4958
b93070ed 4959 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4960
2bf803e2
YO
4961 if (sv_yes_mark) {
4962 SV *sv_mrk = get_sv("REGMARK", 1);
4963 sv_setsv(sv_mrk, sv_yes_mark);
4964 }
4965
81ed78b2
DM
4966 /* we don't use MULTICALL here as we want to call the
4967 * first op of the block of interest, rather than the
4968 * first op of the sub */
a6dc34f1 4969 before = (IV)(SP-PL_stack_base);
81ed78b2 4970 PL_op = nop;
8e5e9ebe
RGS
4971 CALLRUNOPS(aTHX); /* Scalar context. */
4972 SPAGAIN;
a6dc34f1 4973 if ((IV)(SP-PL_stack_base) == before)
075aa684 4974 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4975 else {
4976 ret = POPs;
4977 PUTBACK;
4978 }
4aabdb9b 4979
e4bfbed3
DM
4980 /* before restoring everything, evaluate the returned
4981 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4982 * PL_op or pad. Also need to process any magic vars
4983 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4984
4985 PL_op = NULL;
4986
5e98dac2 4987 re_sv = NULL;
e4bfbed3
DM
4988 if (logical == 0) /* (?{})/ */
4989 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4990 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4991 sw = cBOOL(SvTRUE(ret));
4992 logical = 0;
4993 }
4994 else { /* /(??{}) */
497d0a96
DM
4995 /* if its overloaded, let the regex compiler handle
4996 * it; otherwise extract regex, or stringify */
4997 if (!SvAMAGIC(ret)) {
4998 SV *sv = ret;
4999 if (SvROK(sv))
5000 sv = SvRV(sv);
5001 if (SvTYPE(sv) == SVt_REGEXP)
5002 re_sv = (REGEXP*) sv;
5003 else if (SvSMAGICAL(sv)) {
5004 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5005 if (mg)
5006 re_sv = (REGEXP *) mg->mg_obj;
5007 }
e4bfbed3 5008
497d0a96
DM
5009 /* force any magic, undef warnings here */
5010 if (!re_sv) {
5011 ret = sv_mortalcopy(ret);
5012 (void) SvPV_force_nolen(ret);
5013 }
e4bfbed3
DM
5014 }
5015
5016 }
5017
91332126
FR
5018 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5019
81ed78b2
DM
5020 /* *** Note that at this point we don't restore
5021 * PL_comppad, (or pop the CxSUB) on the assumption it may
5022 * be used again soon. This is safe as long as nothing
5023 * in the regexp code uses the pad ! */
4aabdb9b 5024 PL_op = oop;
4aabdb9b 5025 PL_curcop = ocurcop;
d80618d2 5026 PL_regeol = saved_regeol;
92da3157 5027 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
e4bfbed3
DM
5028
5029 if (logical != 2)
4aabdb9b 5030 break;
8e5e9ebe 5031 }
e4bfbed3
DM
5032
5033 /* only /(??{})/ from now on */
24d3c4a9 5034 logical = 0;
4aabdb9b 5035 {
4f639d21
DM
5036 /* extract RE object from returned value; compiling if
5037 * necessary */
5c35adbb 5038
575c37f6
DM
5039 if (re_sv) {
5040 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5041 }
0f5d15d6 5042 else {
c737faaf 5043 U32 pm_flags = 0;
0f5d15d6 5044
9753d940
DM
5045 if (SvUTF8(ret) && IN_BYTES) {
5046 /* In use 'bytes': make a copy of the octet
5047 * sequence, but without the flag on */
b9ad30b4
NC
5048 STRLEN len;
5049 const char *const p = SvPV(ret, len);
5050 ret = newSVpvn_flags(p, len, SVs_TEMP);
5051 }
732caac7
DM
5052 if (rex->intflags & PREGf_USE_RE_EVAL)
5053 pm_flags |= PMf_USE_RE_EVAL;
5054
5055 /* if we got here, it should be an engine which
5056 * supports compiling code blocks and stuff */
5057 assert(rex->engine && rex->engine->op_comp);
ec841a27 5058 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5059 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5060 rex->engine, NULL, NULL,
5061 /* copy /msix etc to inner pattern */
5062 scan->flags,
5063 pm_flags);
732caac7 5064
9041c2e3 5065 if (!(SvFLAGS(ret)
faf82a0b 5066 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 5067 | SVs_GMG))) {
a2794585
NC
5068 /* This isn't a first class regexp. Instead, it's
5069 caching a regexp onto an existing, Perl visible
5070 scalar. */
575c37f6 5071 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5072 }
74088413
DM
5073 /* safe to do now that any $1 etc has been
5074 * interpolated into the new pattern string and
5075 * compiled */
92da3157 5076 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
0f5d15d6 5077 }
e1ff3a88 5078 SAVEFREESV(re_sv);
8d919b0a 5079 re = ReANY(re_sv);
4aabdb9b 5080 }
07bc277f 5081 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5082 re->subbeg = rex->subbeg;
5083 re->sublen = rex->sublen;
6502e081
DM
5084 re->suboffset = rex->suboffset;
5085 re->subcoffset = rex->subcoffset;
f8fc2ecf 5086 rei = RXi_GET(re);
6bda09f9 5087 DEBUG_EXECUTE_r(
f2ed9b32 5088 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
5089 "Matching embedded");
5090 );
f8fc2ecf 5091 startpoint = rei->program + 1;
1a147d38 5092 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5093
1a147d38 5094 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5095 /* run the pattern returned from (??{...}) */
92da3157
DM
5096
5097 /* Save *all* the positions. */
5098 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5099 REGCP_SET(ST.lastcp);
6bda09f9 5100
0357f1fd
ML
5101 re->lastparen = 0;
5102 re->lastcloseparen = 0;
5103
92da3157 5104 maxopenparen = 0;
4aabdb9b
DM
5105
5106 /* XXXX This is too dramatic a measure... */
5107 PL_reg_maxiter = 0;
5108
faec1544 5109 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 5110 if (RX_UTF8(re_sv))
faec1544
DM
5111 PL_reg_flags |= RF_utf8;
5112 else
5113 PL_reg_flags &= ~RF_utf8;
5114 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5115
288b8c02 5116 ST.prev_rex = rex_sv;
faec1544 5117 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5118 rex_sv = re_sv;
5119 SET_reg_curpm(rex_sv);
288b8c02 5120 rex = re;
f8fc2ecf 5121 rexi = rei;
faec1544 5122 cur_curlyx = NULL;
40a82448 5123 ST.B = next;
faec1544
DM
5124 ST.prev_eval = cur_eval;
5125 cur_eval = st;
faec1544 5126 /* now continue from first node in postoned RE */
4d5016e5 5127 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5128 assert(0); /* NOTREACHED */
c277df42 5129 }
40a82448 5130
faec1544
DM
5131 case EVAL_AB: /* cleanup after a successful (??{A})B */
5132 /* note: this is called twice; first after popping B, then A */
5133 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
5134 rex_sv = ST.prev_rex;
5135 SET_reg_curpm(rex_sv);
8d919b0a 5136 rex = ReANY(rex_sv);
f8fc2ecf 5137 rexi = RXi_GET(rex);
faec1544
DM
5138 regcpblow(ST.cp);
5139 cur_eval = ST.prev_eval;
5140 cur_curlyx = ST.prev_curlyx;
34a81e2b 5141
40a82448
DM
5142 /* XXXX This is too dramatic a measure... */
5143 PL_reg_maxiter = 0;
e7707071 5144 if ( nochange_depth )
4b196cd4 5145 nochange_depth--;
262b90c4 5146 sayYES;
40a82448 5147
40a82448 5148
faec1544
DM
5149 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5150 /* note: this is called twice; first after popping B, then A */
5151 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
5152 rex_sv = ST.prev_rex;
5153 SET_reg_curpm(rex_sv);
8d919b0a 5154 rex = ReANY(rex_sv);
f8fc2ecf 5155 rexi = RXi_GET(rex);
0357f1fd 5156
40a82448 5157 REGCP_UNWIND(ST.lastcp);
92da3157 5158 regcppop(rex, &maxopenparen);
faec1544
DM
5159 cur_eval = ST.prev_eval;
5160 cur_curlyx = ST.prev_curlyx;
5161 /* XXXX This is too dramatic a measure... */
5162 PL_reg_maxiter = 0;
e7707071 5163 if ( nochange_depth )
4b196cd4 5164 nochange_depth--;
40a82448 5165 sayNO_SILENT;
40a82448
DM
5166#undef ST
5167
3c0563b9 5168 case OPEN: /* ( */
c277df42 5169 n = ARG(scan); /* which paren pair */
1ca2007e 5170 rex->offs[n].start_tmp = locinput - PL_bostr;
92da3157
DM
5171 if (n > maxopenparen)
5172 maxopenparen = n;
495f47a5 5173 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5174 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5175 PTR2UV(rex),
5176 PTR2UV(rex->offs),
5177 (UV)n,
5178 (IV)rex->offs[n].start_tmp,
92da3157 5179 (UV)maxopenparen
495f47a5 5180 ));
e2e6a0f1 5181 lastopen = n;
a0d0e21e 5182 break;
495f47a5
DM
5183
5184/* XXX really need to log other places start/end are set too */
5185#define CLOSE_CAPTURE \
5186 rex->offs[n].start = rex->offs[n].start_tmp; \
5187 rex->offs[n].end = locinput - PL_bostr; \
5188 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5189 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5190 PTR2UV(rex), \
5191 PTR2UV(rex->offs), \
5192 (UV)n, \
5193 (IV)rex->offs[n].start, \
5194 (IV)rex->offs[n].end \
5195 ))
5196
3c0563b9 5197 case CLOSE: /* ) */
c277df42 5198 n = ARG(scan); /* which paren pair */
495f47a5 5199 CLOSE_CAPTURE;
b93070ed
DM
5200 if (n > rex->lastparen)
5201 rex->lastparen = n;
5202 rex->lastcloseparen = n;
3b6647e0 5203 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5204 goto fake_end;
5205 }
a0d0e21e 5206 break;
3c0563b9
DM
5207
5208 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5209 if (ARG(scan)){
5210 regnode *cursor;
5211 for (cursor=scan;
5212 cursor && OP(cursor)!=END;
5213 cursor=regnext(cursor))
5214 {
5215 if ( OP(cursor)==CLOSE ){
5216 n = ARG(cursor);
5217 if ( n <= lastopen ) {
495f47a5 5218 CLOSE_CAPTURE;
b93070ed
DM
5219 if (n > rex->lastparen)
5220 rex->lastparen = n;
5221 rex->lastcloseparen = n;
3b6647e0
RB
5222 if ( n == ARG(scan) || (cur_eval &&
5223 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5224 break;
5225 }
5226 }
5227 }
5228 }
5229 goto fake_end;
5230 /*NOTREACHED*/
3c0563b9
DM
5231
5232 case GROUPP: /* (?(1)) */
c277df42 5233 n = ARG(scan); /* which paren pair */
b93070ed 5234 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5235 break;
3c0563b9
DM
5236
5237 case NGROUPP: /* (?(<name>)) */
0a4db386 5238 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5239 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5240 break;
3c0563b9
DM
5241
5242 case INSUBP: /* (?(R)) */
0a4db386 5243 n = ARG(scan);
3b6647e0 5244 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5245 break;
3c0563b9
DM
5246
5247 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5248 sw = 0;
5249 break;
3c0563b9
DM
5250
5251 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5252 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5253 if (sw)
c277df42
IZ
5254 next = NEXTOPER(NEXTOPER(scan));
5255 else {
5256 next = scan + ARG(scan);
5257 if (OP(next) == IFTHEN) /* Fake one. */
5258 next = NEXTOPER(NEXTOPER(next));
5259 }
5260 break;
3c0563b9
DM
5261
5262 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5263 logical = scan->flags;
c277df42 5264 break;
c476f425 5265
2ab05381 5266/*******************************************************************
2ab05381 5267
c476f425
DM
5268The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5269pattern, where A and B are subpatterns. (For simple A, CURLYM or
5270STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5271
c476f425 5272A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5273
c476f425
DM
5274On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5275state, which contains the current count, initialised to -1. It also sets
5276cur_curlyx to point to this state, with any previous value saved in the
5277state block.
2ab05381 5278
c476f425
DM
5279CURLYX then jumps straight to the WHILEM op, rather than executing A,
5280since the pattern may possibly match zero times (i.e. it's a while {} loop
5281rather than a do {} while loop).
2ab05381 5282
c476f425
DM
5283Each entry to WHILEM represents a successful match of A. The count in the
5284CURLYX block is incremented, another WHILEM state is pushed, and execution
5285passes to A or B depending on greediness and the current count.
2ab05381 5286
c476f425
DM
5287For example, if matching against the string a1a2a3b (where the aN are
5288substrings that match /A/), then the match progresses as follows: (the
5289pushed states are interspersed with the bits of strings matched so far):
2ab05381 5290
c476f425
DM
5291 <CURLYX cnt=-1>
5292 <CURLYX cnt=0><WHILEM>
5293 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5294 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5295 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5296 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5297
c476f425
DM
5298(Contrast this with something like CURLYM, which maintains only a single
5299backtrack state:
2ab05381 5300
c476f425
DM
5301 <CURLYM cnt=0> a1
5302 a1 <CURLYM cnt=1> a2
5303 a1 a2 <CURLYM cnt=2> a3
5304 a1 a2 a3 <CURLYM cnt=3> b
5305)
2ab05381 5306
c476f425
DM
5307Each WHILEM state block marks a point to backtrack to upon partial failure
5308of A or B, and also contains some minor state data related to that
5309iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5310overall state, such as the count, and pointers to the A and B ops.
2ab05381 5311
c476f425
DM
5312This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5313must always point to the *current* CURLYX block, the rules are:
2ab05381 5314
c476f425
DM
5315When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5316and set cur_curlyx to point the new block.
2ab05381 5317
c476f425
DM
5318When popping the CURLYX block after a successful or unsuccessful match,
5319restore the previous cur_curlyx.
2ab05381 5320
c476f425
DM
5321When WHILEM is about to execute B, save the current cur_curlyx, and set it
5322to the outer one saved in the CURLYX block.
2ab05381 5323
c476f425
DM
5324When popping the WHILEM block after a successful or unsuccessful B match,
5325restore the previous cur_curlyx.
2ab05381 5326
c476f425
DM
5327Here's an example for the pattern (AI* BI)*BO
5328I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5329
c476f425
DM
5330cur_
5331curlyx backtrack stack
5332------ ---------------
5333NULL
5334CO <CO prev=NULL> <WO>
5335CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5336CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5337NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5338
c476f425
DM
5339At this point the pattern succeeds, and we work back down the stack to
5340clean up, restoring as we go:
95b24440 5341
c476f425
DM
5342CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5343CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5344CO <CO prev=NULL> <WO>
5345NULL
a0374537 5346
c476f425
DM
5347*******************************************************************/
5348
5349#define ST st->u.curlyx
5350
5351 case CURLYX: /* start of /A*B/ (for complex A) */
5352 {
5353 /* No need to save/restore up to this paren */
5354 I32 parenfloor = scan->flags;
5355
5356 assert(next); /* keep Coverity happy */
5357 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5358 next += ARG(next);
5359
5360 /* XXXX Probably it is better to teach regpush to support
92da3157 5361 parenfloor > maxopenparen ... */
b93070ed
DM
5362 if (parenfloor > (I32)rex->lastparen)
5363 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5364
5365 ST.prev_curlyx= cur_curlyx;
5366 cur_curlyx = st;
5367 ST.cp = PL_savestack_ix;
5368
5369 /* these fields contain the state of the current curly.
5370 * they are accessed by subsequent WHILEMs */
5371 ST.parenfloor = parenfloor;
d02d6d97 5372 ST.me = scan;
c476f425 5373 ST.B = next;
24d3c4a9
DM
5374 ST.minmod = minmod;
5375 minmod = 0;
c476f425
DM
5376 ST.count = -1; /* this will be updated by WHILEM */
5377 ST.lastloc = NULL; /* this will be updated by WHILEM */
5378
4d5016e5 5379 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5380 assert(0); /* NOTREACHED */
c476f425 5381 }
a0d0e21e 5382
c476f425 5383 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5384 cur_curlyx = ST.prev_curlyx;
5385 sayYES;
118e2215 5386 assert(0); /* NOTREACHED */
a0d0e21e 5387
c476f425
DM
5388 case CURLYX_end_fail: /* just failed to match all of A*B */
5389 regcpblow(ST.cp);
5390 cur_curlyx = ST.prev_curlyx;
5391 sayNO;
118e2215 5392 assert(0); /* NOTREACHED */
4633a7c4 5393
a0d0e21e 5394
c476f425
DM
5395#undef ST
5396#define ST st->u.whilem
5397
5398 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5399 {
5400 /* see the discussion above about CURLYX/WHILEM */
c476f425 5401 I32 n;
d02d6d97
DM
5402 int min = ARG1(cur_curlyx->u.curlyx.me);
5403 int max = ARG2(cur_curlyx->u.curlyx.me);
5404 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5405
c476f425
DM
5406 assert(cur_curlyx); /* keep Coverity happy */
5407 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5408 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5409 ST.cache_offset = 0;
5410 ST.cache_mask = 0;
5411
c476f425
DM
5412
5413 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5414 "%*s whilem: matched %ld out of %d..%d\n",
5415 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5416 );
a0d0e21e 5417
c476f425 5418 /* First just match a string of min A's. */
a0d0e21e 5419
d02d6d97 5420 if (n < min) {
92da3157
DM
5421 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5422 maxopenparen);
c476f425 5423 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5424 REGCP_SET(ST.lastcp);
5425
4d5016e5 5426 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5427 assert(0); /* NOTREACHED */
c476f425
DM
5428 }
5429
5430 /* If degenerate A matches "", assume A done. */
5431
5432 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5433 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5434 "%*s whilem: empty match detected, trying continuation...\n",
5435 REPORT_CODE_OFF+depth*2, "")
5436 );
5437 goto do_whilem_B_max;
5438 }
5439
5440 /* super-linear cache processing */
5441
5442 if (scan->flags) {
a0d0e21e 5443
2c2d71f5 5444 if (!PL_reg_maxiter) {
c476f425
DM
5445 /* start the countdown: Postpone detection until we
5446 * know the match is not *that* much linear. */
2c2d71f5 5447 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5448 /* possible overflow for long strings and many CURLYX's */
5449 if (PL_reg_maxiter < 0)
5450 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5451 PL_reg_leftiter = PL_reg_maxiter;
5452 }
c476f425 5453
2c2d71f5 5454 if (PL_reg_leftiter-- == 0) {
c476f425 5455 /* initialise cache */
3298f257 5456 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5457 if (PL_reg_poscache) {
eb160463 5458 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5459 Renew(PL_reg_poscache, size, char);
5460 PL_reg_poscache_size = size;
5461 }
5462 Zero(PL_reg_poscache, size, char);
5463 }
5464 else {
5465 PL_reg_poscache_size = size;
a02a5408 5466 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5467 }
c476f425
DM
5468 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470 PL_colors[4], PL_colors[5])
5471 );
2c2d71f5 5472 }
c476f425 5473
2c2d71f5 5474 if (PL_reg_leftiter < 0) {
c476f425
DM
5475 /* have we already failed at this position? */
5476 I32 offset, mask;
5477 offset = (scan->flags & 0xf) - 1
5478 + (locinput - PL_bostr) * (scan->flags>>4);
5479 mask = 1 << (offset % 8);
5480 offset /= 8;
5481 if (PL_reg_poscache[offset] & mask) {
5482 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5483 "%*s whilem: (cache) already tried at this position...\n",
5484 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5485 );
3298f257 5486 sayNO; /* cache records failure */
2c2d71f5 5487 }
c476f425
DM
5488 ST.cache_offset = offset;
5489 ST.cache_mask = mask;
2c2d71f5 5490 }
c476f425 5491 }
2c2d71f5 5492
c476f425 5493 /* Prefer B over A for minimal matching. */
a687059c 5494
c476f425
DM
5495 if (cur_curlyx->u.curlyx.minmod) {
5496 ST.save_curlyx = cur_curlyx;
5497 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5498 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5499 maxopenparen);
c476f425 5500 REGCP_SET(ST.lastcp);
4d5016e5
DM
5501 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5502 locinput);
118e2215 5503 assert(0); /* NOTREACHED */
c476f425 5504 }
a0d0e21e 5505
c476f425
DM
5506 /* Prefer A over B for maximal matching. */
5507
d02d6d97 5508 if (n < max) { /* More greed allowed? */
92da3157
DM
5509 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5510 maxopenparen);
c476f425
DM
5511 cur_curlyx->u.curlyx.lastloc = locinput;
5512 REGCP_SET(ST.lastcp);
4d5016e5 5513 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5514 assert(0); /* NOTREACHED */
c476f425
DM
5515 }
5516 goto do_whilem_B_max;
5517 }
118e2215 5518 assert(0); /* NOTREACHED */
c476f425
DM
5519
5520 case WHILEM_B_min: /* just matched B in a minimal match */
5521 case WHILEM_B_max: /* just matched B in a maximal match */
5522 cur_curlyx = ST.save_curlyx;
5523 sayYES;
118e2215 5524 assert(0); /* NOTREACHED */
c476f425
DM
5525
5526 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5527 cur_curlyx = ST.save_curlyx;
5528 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5529 cur_curlyx->u.curlyx.count--;
5530 CACHEsayNO;
118e2215 5531 assert(0); /* NOTREACHED */
c476f425
DM
5532
5533 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5534 /* FALL THROUGH */
5535 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5536 REGCP_UNWIND(ST.lastcp);
92da3157 5537 regcppop(rex, &maxopenparen);
c476f425
DM
5538 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5539 cur_curlyx->u.curlyx.count--;
5540 CACHEsayNO;
118e2215 5541 assert(0); /* NOTREACHED */
c476f425
DM
5542
5543 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5544 REGCP_UNWIND(ST.lastcp);
92da3157 5545 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5546 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5547 "%*s whilem: failed, trying continuation...\n",
5548 REPORT_CODE_OFF+depth*2, "")
5549 );
5550 do_whilem_B_max:
5551 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5552 && ckWARN(WARN_REGEXP)
5553 && !(PL_reg_flags & RF_warned))
5554 {
5555 PL_reg_flags |= RF_warned;
dcbac5bb
FC
5556 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5557 "Complex regular subexpression recursion limit (%d) "
5558 "exceeded",
c476f425
DM
5559 REG_INFTY - 1);
5560 }
5561
5562 /* now try B */
5563 ST.save_curlyx = cur_curlyx;
5564 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5565 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5566 locinput);
118e2215 5567 assert(0); /* NOTREACHED */
c476f425
DM
5568
5569 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5570 cur_curlyx = ST.save_curlyx;
5571 REGCP_UNWIND(ST.lastcp);
92da3157 5572 regcppop(rex, &maxopenparen);
c476f425 5573
d02d6d97 5574 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5575 /* Maximum greed exceeded */
5576 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5577 && ckWARN(WARN_REGEXP)
5578 && !(PL_reg_flags & RF_warned))
5579 {
3280af22 5580 PL_reg_flags |= RF_warned;
c476f425 5581 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5582 "Complex regular subexpression recursion "
5583 "limit (%d) exceeded",
c476f425 5584 REG_INFTY - 1);
a0d0e21e 5585 }
c476f425 5586 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5587 CACHEsayNO;
a0d0e21e 5588 }
c476f425
DM
5589
5590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5591 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5592 );
5593 /* Try grabbing another A and see if it helps. */
c476f425 5594 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5595 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5596 maxopenparen);
c476f425 5597 REGCP_SET(ST.lastcp);
d02d6d97 5598 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5599 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5600 locinput);
118e2215 5601 assert(0); /* NOTREACHED */
40a82448
DM
5602
5603#undef ST
5604#define ST st->u.branch
5605
5606 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5607 next = scan + ARG(scan);
5608 if (next == scan)
5609 next = NULL;
40a82448
DM
5610 scan = NEXTOPER(scan);
5611 /* FALL THROUGH */
c277df42 5612
40a82448
DM
5613 case BRANCH: /* /(...|A|...)/ */
5614 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5615 ST.lastparen = rex->lastparen;
f6033a9d 5616 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5617 ST.next_branch = next;
5618 REGCP_SET(ST.cp);
02db2b7b 5619
40a82448 5620 /* Now go into the branch */
5d458dd8 5621 if (has_cutgroup) {
4d5016e5 5622 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5623 } else {
4d5016e5 5624 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5625 }
118e2215 5626 assert(0); /* NOTREACHED */
3c0563b9
DM
5627
5628 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5629 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5630 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5631 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5632 assert(0); /* NOTREACHED */
3c0563b9 5633
5d458dd8
YO
5634 case CUTGROUP_next_fail:
5635 do_cutgroup = 1;
5636 no_final = 1;
5637 if (st->u.mark.mark_name)
5638 sv_commit = st->u.mark.mark_name;
5639 sayNO;
118e2215 5640 assert(0); /* NOTREACHED */
3c0563b9 5641
5d458dd8
YO
5642 case BRANCH_next:
5643 sayYES;
118e2215 5644 assert(0); /* NOTREACHED */
3c0563b9 5645
40a82448 5646 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5647 if (do_cutgroup) {
5648 do_cutgroup = 0;
5649 no_final = 0;
5650 }
40a82448 5651 REGCP_UNWIND(ST.cp);
a8d1f4b4 5652 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5653 scan = ST.next_branch;
5654 /* no more branches? */
5d458dd8
YO
5655 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5656 DEBUG_EXECUTE_r({
5657 PerlIO_printf( Perl_debug_log,
5658 "%*s %sBRANCH failed...%s\n",
5659 REPORT_CODE_OFF+depth*2, "",
5660 PL_colors[4],
5661 PL_colors[5] );
5662 });
5663 sayNO_SILENT;
5664 }
40a82448 5665 continue; /* execute next BRANCH[J] op */
118e2215 5666 assert(0); /* NOTREACHED */
40a82448 5667
3c0563b9 5668 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5669 minmod = 1;
a0d0e21e 5670 break;
40a82448
DM
5671
5672#undef ST
5673#define ST st->u.curlym
5674
5675 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5676
5677 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5678 * only a single backtracking state, no matter how many matches
40a82448
DM
5679 * there are in {m,n}. It relies on the pattern being constant
5680 * length, with no parens to influence future backrefs
5681 */
5682
5683 ST.me = scan;
dc45a647 5684 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5685
f6033a9d
DM
5686 ST.lastparen = rex->lastparen;
5687 ST.lastcloseparen = rex->lastcloseparen;
5688
40a82448
DM
5689 /* if paren positive, emulate an OPEN/CLOSE around A */
5690 if (ST.me->flags) {
3b6647e0 5691 U32 paren = ST.me->flags;
92da3157
DM
5692 if (paren > maxopenparen)
5693 maxopenparen = paren;
c277df42 5694 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5695 }
40a82448
DM
5696 ST.A = scan;
5697 ST.B = next;
5698 ST.alen = 0;
5699 ST.count = 0;
24d3c4a9
DM
5700 ST.minmod = minmod;
5701 minmod = 0;
40a82448
DM
5702 ST.c1 = CHRTEST_UNINIT;
5703 REGCP_SET(ST.cp);
6407bf3b 5704
40a82448
DM
5705 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5706 goto curlym_do_B;
5707
5708 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5709 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5710 assert(0); /* NOTREACHED */
5f80c4cf 5711
40a82448 5712 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5713 ST.count++;
5714 /* after first match, determine A's length: u.curlym.alen */
5715 if (ST.count == 1) {
5716 if (PL_reg_match_utf8) {
c07e9d7b
DM
5717 char *s = st->locinput;
5718 while (s < locinput) {
40a82448
DM
5719 ST.alen++;
5720 s += UTF8SKIP(s);
5721 }
5722 }
5723 else {
c07e9d7b 5724 ST.alen = locinput - st->locinput;
40a82448
DM
5725 }
5726 if (ST.alen == 0)
5727 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5728 }
0cadcf80
DM
5729 DEBUG_EXECUTE_r(
5730 PerlIO_printf(Perl_debug_log,
40a82448 5731 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5732 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5733 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5734 );
5735
0a4db386 5736 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5737 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5738 goto fake_end;
5739
c966426a
DM
5740 {
5741 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5742 if ( max == REG_INFTY || ST.count < max )
5743 goto curlym_do_A; /* try to match another A */
5744 }
40a82448 5745 goto curlym_do_B; /* try to match B */
5f80c4cf 5746
40a82448
DM
5747 case CURLYM_A_fail: /* just failed to match an A */
5748 REGCP_UNWIND(ST.cp);
0a4db386
YO
5749
5750 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5751 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5752 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5753 sayNO;
0cadcf80 5754
40a82448 5755 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5756 if (ST.c1 == CHRTEST_UNINIT) {
5757 /* calculate c1 and c2 for possible match of 1st char
5758 * following curly */
5759 ST.c1 = ST.c2 = CHRTEST_VOID;
5760 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5761 regnode *text_node = ST.B;
5762 if (! HAS_TEXT(text_node))
5763 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5764 /* this used to be
5765
5766 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5767
5768 But the former is redundant in light of the latter.
5769
5770 if this changes back then the macro for
5771 IS_TEXT and friends need to change.
5772 */
c74f6de9 5773 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8
KW
5774 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5775 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
c74f6de9
KW
5776 {
5777 sayNO;
5778 }
c277df42 5779 }
c277df42 5780 }
40a82448
DM
5781 }
5782
5783 DEBUG_EXECUTE_r(
5784 PerlIO_printf(Perl_debug_log,
5785 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5786 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5787 "", (IV)ST.count)
5788 );
c74f6de9 5789 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5790 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5791 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5792 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5793 {
5794 /* simulate B failing */
5795 DEBUG_OPTIMISE_r(
5796 PerlIO_printf(Perl_debug_log,
5797 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5798 (int)(REPORT_CODE_OFF+(depth*2)),"",
5799 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5800 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5801 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5802 );
5803 state_num = CURLYM_B_fail;
5804 goto reenter_switch;
5805 }
5806 }
5807 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5808 /* simulate B failing */
5809 DEBUG_OPTIMISE_r(
5810 PerlIO_printf(Perl_debug_log,
79a2a0e8 5811 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5812 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5813 (int) nextchr, ST.c1, ST.c2)
5814 );
5400f398
KW
5815 state_num = CURLYM_B_fail;
5816 goto reenter_switch;
5817 }
c74f6de9 5818 }
40a82448
DM
5819
5820 if (ST.me->flags) {
f6033a9d 5821 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5822 I32 paren = ST.me->flags;
5823 if (ST.count) {
b93070ed 5824 rex->offs[paren].start
c07e9d7b
DM
5825 = HOPc(locinput, -ST.alen) - PL_bostr;
5826 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5827 if ((U32)paren > rex->lastparen)
5828 rex->lastparen = paren;
5829 rex->lastcloseparen = paren;
c277df42 5830 }
40a82448 5831 else
b93070ed 5832 rex->offs[paren].end = -1;
0a4db386 5833 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5834 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5835 {
5836 if (ST.count)
5837 goto fake_end;
5838 else
5839 sayNO;
5840 }
c277df42 5841 }
0a4db386 5842
4d5016e5 5843 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5844 assert(0); /* NOTREACHED */
40a82448
DM
5845
5846 case CURLYM_B_fail: /* just failed to match a B */
5847 REGCP_UNWIND(ST.cp);
a8d1f4b4 5848 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5849 if (ST.minmod) {
84d2fa14
HS
5850 I32 max = ARG2(ST.me);
5851 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5852 sayNO;
5853 goto curlym_do_A; /* try to match a further A */
5854 }
5855 /* backtrack one A */
5856 if (ST.count == ARG1(ST.me) /* min */)
5857 sayNO;
5858 ST.count--;
7016d6eb 5859 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5860 goto curlym_do_B; /* try to match B */
5861
c255a977
DM
5862#undef ST
5863#define ST st->u.curly
40a82448 5864
c255a977
DM
5865#define CURLY_SETPAREN(paren, success) \
5866 if (paren) { \
5867 if (success) { \
b93070ed
DM
5868 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5869 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5870 if (paren > rex->lastparen) \
5871 rex->lastparen = paren; \
b93070ed 5872 rex->lastcloseparen = paren; \
c255a977 5873 } \
f6033a9d 5874 else { \
b93070ed 5875 rex->offs[paren].end = -1; \
f6033a9d
DM
5876 rex->lastparen = ST.lastparen; \
5877 rex->lastcloseparen = ST.lastcloseparen; \
5878 } \
c255a977
DM
5879 }
5880
b40a2c17 5881 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5882 ST.paren = 0;
5883 ST.min = 0;
5884 ST.max = REG_INFTY;
a0d0e21e
LW
5885 scan = NEXTOPER(scan);
5886 goto repeat;
3c0563b9 5887
b40a2c17 5888 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5889 ST.paren = 0;
5890 ST.min = 1;
5891 ST.max = REG_INFTY;
c277df42 5892 scan = NEXTOPER(scan);
c255a977 5893 goto repeat;
3c0563b9 5894
b40a2c17 5895 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5896 ST.paren = scan->flags; /* Which paren to set */
5897 ST.lastparen = rex->lastparen;
f6033a9d 5898 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
5899 if (ST.paren > maxopenparen)
5900 maxopenparen = ST.paren;
c255a977
DM
5901 ST.min = ARG1(scan); /* min to match */
5902 ST.max = ARG2(scan); /* max to match */
0a4db386 5903 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5904 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5905 ST.min=1;
5906 ST.max=1;
5907 }
c255a977
DM
5908 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5909 goto repeat;
3c0563b9 5910
b40a2c17 5911 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5912 ST.paren = 0;
5913 ST.min = ARG1(scan); /* min to match */
5914 ST.max = ARG2(scan); /* max to match */
5915 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5916 repeat:
a0d0e21e
LW
5917 /*
5918 * Lookahead to avoid useless match attempts
5919 * when we know what character comes next.
c255a977 5920 *
5f80c4cf
JP
5921 * Used to only do .*x and .*?x, but now it allows
5922 * for )'s, ('s and (?{ ... })'s to be in the way
5923 * of the quantifier and the EXACT-like node. -- japhy
5924 */
5925
eb5c1be8 5926 assert(ST.min <= ST.max);
3337dfe3
KW
5927 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5928 ST.c1 = ST.c2 = CHRTEST_VOID;
5929 }
5930 else {
5f80c4cf
JP
5931 regnode *text_node = next;
5932
3dab1dad
YO
5933 if (! HAS_TEXT(text_node))
5934 FIND_NEXT_IMPT(text_node);
5f80c4cf 5935
9e137952 5936 if (! HAS_TEXT(text_node))
c255a977 5937 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5938 else {
ee9b8eae 5939 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5940 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5941 }
c74f6de9 5942 else {
ee9b8eae
YO
5943
5944 /* Currently we only get here when
5945
5946 PL_rekind[OP(text_node)] == EXACT
5947
5948 if this changes back then the macro for IS_TEXT and
5949 friends need to change. */
79a2a0e8
KW
5950 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5951 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
c74f6de9
KW
5952 {
5953 sayNO;
5954 }
5955 }
1aa99e6b 5956 }
bbce6d69 5957 }
c255a977
DM
5958
5959 ST.A = scan;
5960 ST.B = next;
24d3c4a9 5961 if (minmod) {
eb72505d 5962 char *li = locinput;
24d3c4a9 5963 minmod = 0;
eb72505d 5964 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
4633a7c4 5965 sayNO;
7016d6eb 5966 SET_locinput(li);
c255a977 5967 ST.count = ST.min;
c255a977
DM
5968 REGCP_SET(ST.cp);
5969 if (ST.c1 == CHRTEST_VOID)
5970 goto curly_try_B_min;
5971
5972 ST.oldloc = locinput;
5973
5974 /* set ST.maxpos to the furthest point along the
5975 * string that could possibly match */
5976 if (ST.max == REG_INFTY) {
5977 ST.maxpos = PL_regeol - 1;
f2ed9b32 5978 if (utf8_target)
c255a977
DM
5979 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5980 ST.maxpos--;
5981 }
f2ed9b32 5982 else if (utf8_target) {
c255a977
DM
5983 int m = ST.max - ST.min;
5984 for (ST.maxpos = locinput;
5985 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5986 ST.maxpos += UTF8SKIP(ST.maxpos);
5987 }
5988 else {
5989 ST.maxpos = locinput + ST.max - ST.min;
5990 if (ST.maxpos >= PL_regeol)
5991 ST.maxpos = PL_regeol - 1;
5992 }
5993 goto curly_try_B_min_known;
5994
5995 }
5996 else {
eb72505d
DM
5997 /* avoid taking address of locinput, so it can remain
5998 * a register var */
5999 char *li = locinput;
6000 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
c255a977
DM
6001 if (ST.count < ST.min)
6002 sayNO;
7016d6eb 6003 SET_locinput(li);
c255a977
DM
6004 if ((ST.count > ST.min)
6005 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6006 {
6007 /* A{m,n} must come at the end of the string, there's
6008 * no point in backing off ... */
6009 ST.min = ST.count;
6010 /* ...except that $ and \Z can match before *and* after
6011 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6012 We may back off by one in this case. */
eb72505d 6013 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6014 ST.min--;
6015 }
6016 REGCP_SET(ST.cp);
6017 goto curly_try_B_max;
6018 }
118e2215 6019 assert(0); /* NOTREACHED */
c255a977
DM
6020
6021
6022 case CURLY_B_min_known_fail:
6023 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6024
c255a977 6025 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6026 if (ST.paren) {
6027 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6028 }
c255a977
DM
6029 /* Couldn't or didn't -- move forward. */
6030 ST.oldloc = locinput;
f2ed9b32 6031 if (utf8_target)
c255a977
DM
6032 locinput += UTF8SKIP(locinput);
6033 else
6034 locinput++;
6035 ST.count++;
6036 curly_try_B_min_known:
6037 /* find the next place where 'B' could work, then call B */
6038 {
6039 int n;
f2ed9b32 6040 if (utf8_target) {
c255a977
DM
6041 n = (ST.oldloc == locinput) ? 0 : 1;
6042 if (ST.c1 == ST.c2) {
c255a977 6043 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6044 while (locinput <= ST.maxpos
6045 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6046 {
6047 locinput += UTF8SKIP(locinput);
c255a977
DM
6048 n++;
6049 }
1aa99e6b
IH
6050 }
6051 else {
c255a977 6052 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6053 while (locinput <= ST.maxpos
6054 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6055 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6056 {
6057 locinput += UTF8SKIP(locinput);
c255a977 6058 n++;
1aa99e6b 6059 }
0fe9bf95
IZ
6060 }
6061 }
5400f398 6062 else { /* Not utf8_target */
c255a977
DM
6063 if (ST.c1 == ST.c2) {
6064 while (locinput <= ST.maxpos &&
6065 UCHARAT(locinput) != ST.c1)
6066 locinput++;
bbce6d69 6067 }
c255a977
DM
6068 else {
6069 while (locinput <= ST.maxpos
6070 && UCHARAT(locinput) != ST.c1
6071 && UCHARAT(locinput) != ST.c2)
6072 locinput++;
a0ed51b3 6073 }
c255a977
DM
6074 n = locinput - ST.oldloc;
6075 }
6076 if (locinput > ST.maxpos)
6077 sayNO;
c255a977 6078 if (n) {
eb72505d
DM
6079 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6080 * at b; check that everything between oldloc and
6081 * locinput matches */
6082 char *li = ST.oldloc;
c255a977 6083 ST.count += n;
eb72505d 6084 if (regrepeat(rex, &li, ST.A, n, depth) < n)
4633a7c4 6085 sayNO;
eb72505d 6086 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6087 }
c255a977 6088 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6089 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6090 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6091 goto fake_end;
6092 }
4d5016e5 6093 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6094 }
118e2215 6095 assert(0); /* NOTREACHED */
c255a977
DM
6096
6097
6098 case CURLY_B_min_fail:
6099 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6100
6101 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6102 if (ST.paren) {
6103 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6104 }
c255a977 6105 /* failed -- move forward one */
f73aaa43 6106 {
eb72505d
DM
6107 char *li = locinput;
6108 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
f73aaa43
DM
6109 sayNO;
6110 }
eb72505d 6111 locinput = li;
f73aaa43
DM
6112 }
6113 {
c255a977 6114 ST.count++;
c255a977
DM
6115 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6116 ST.count > 0)) /* count overflow ? */
15272685 6117 {
c255a977
DM
6118 curly_try_B_min:
6119 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6120 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6121 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6122 goto fake_end;
6123 }
4d5016e5 6124 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6125 }
6126 }
c74f6de9 6127 sayNO;
118e2215 6128 assert(0); /* NOTREACHED */
c255a977
DM
6129
6130
6131 curly_try_B_max:
6132 /* a successful greedy match: now try to match B */
40d049e4 6133 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6134 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6135 goto fake_end;
6136 }
c255a977 6137 {
79a2a0e8
KW
6138 bool could_match = locinput < PL_regeol;
6139
c255a977 6140 /* If it could work, try it. */
79a2a0e8
KW
6141 if (ST.c1 != CHRTEST_VOID && could_match) {
6142 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6143 {
6144 could_match = memEQ(locinput,
6145 ST.c1_utf8,
6146 UTF8SKIP(locinput))
6147 || memEQ(locinput,
6148 ST.c2_utf8,
6149 UTF8SKIP(locinput));
6150 }
6151 else {
6152 could_match = UCHARAT(locinput) == ST.c1
6153 || UCHARAT(locinput) == ST.c2;
6154 }
6155 }
6156 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6157 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6158 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6159 assert(0); /* NOTREACHED */
c255a977
DM
6160 }
6161 }
6162 /* FALL THROUGH */
3c0563b9 6163
c255a977
DM
6164 case CURLY_B_max_fail:
6165 /* failed to find B in a greedy match */
c255a977
DM
6166
6167 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6168 if (ST.paren) {
6169 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6170 }
c255a977
DM
6171 /* back up. */
6172 if (--ST.count < ST.min)
6173 sayNO;
eb72505d 6174 locinput = HOPc(locinput, -1);
c255a977
DM
6175 goto curly_try_B_max;
6176
6177#undef ST
6178
3c0563b9 6179 case END: /* last op of main pattern */
6bda09f9 6180 fake_end:
faec1544
DM
6181 if (cur_eval) {
6182 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
6183 st->u.eval.toggle_reg_flags
6184 = cur_eval->u.eval.toggle_reg_flags;
6185 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6186
288b8c02 6187 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6188
6189 /* Save *all* the positions. */
6190 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b
DM
6191 rex_sv = cur_eval->u.eval.prev_rex;
6192 SET_reg_curpm(rex_sv);
8d919b0a 6193 rex = ReANY(rex_sv);
f8fc2ecf 6194 rexi = RXi_GET(rex);
faec1544 6195 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6196
faec1544 6197 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6198
6199 /* Restore parens of the outer rex without popping the
6200 * savestack */
92da3157
DM
6201 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6202 &maxopenparen);
faec1544
DM
6203
6204 st->u.eval.prev_eval = cur_eval;
6205 cur_eval = cur_eval->u.eval.prev_eval;
6206 DEBUG_EXECUTE_r(
2a49f0f5
JH
6207 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6208 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6209 if ( nochange_depth )
6210 nochange_depth--;
6211
4d5016e5
DM
6212 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6213 locinput); /* match B */
faec1544
DM
6214 }
6215
3b0527fe 6216 if (locinput < reginfo->till) {
a3621e74 6217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6218 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6219 PL_colors[4],
6220 (long)(locinput - PL_reg_starttry),
3b0527fe 6221 (long)(reginfo->till - PL_reg_starttry),
7821416a 6222 PL_colors[5]));
58e23c8d 6223
262b90c4 6224 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6225 }
262b90c4 6226 sayYES; /* Success! */
dad79028
DM
6227
6228 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6229 DEBUG_EXECUTE_r(
6230 PerlIO_printf(Perl_debug_log,
6231 "%*s %ssubpattern success...%s\n",
5bc10b2c 6232 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6233 sayYES; /* Success! */
dad79028 6234
40a82448
DM
6235#undef ST
6236#define ST st->u.ifmatch
6237
37f53970
DM
6238 {
6239 char *newstart;
6240
40a82448
DM
6241 case SUSPEND: /* (?>A) */
6242 ST.wanted = 1;
37f53970 6243 newstart = locinput;
9041c2e3 6244 goto do_ifmatch;
dad79028 6245
40a82448
DM
6246 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6247 ST.wanted = 0;
dad79028
DM
6248 goto ifmatch_trivial_fail_test;
6249
40a82448
DM
6250 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6251 ST.wanted = 1;
dad79028 6252 ifmatch_trivial_fail_test:
a0ed51b3 6253 if (scan->flags) {
52657f30 6254 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6255 if (!s) {
6256 /* trivial fail */
24d3c4a9
DM
6257 if (logical) {
6258 logical = 0;
f2338a2e 6259 sw = 1 - cBOOL(ST.wanted);
dad79028 6260 }
40a82448 6261 else if (ST.wanted)
dad79028
DM
6262 sayNO;
6263 next = scan + ARG(scan);
6264 if (next == scan)
6265 next = NULL;
6266 break;
6267 }
37f53970 6268 newstart = s;
a0ed51b3
LW
6269 }
6270 else
37f53970 6271 newstart = locinput;
a0ed51b3 6272
c277df42 6273 do_ifmatch:
40a82448 6274 ST.me = scan;
24d3c4a9 6275 ST.logical = logical;
24d786f4
YO
6276 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6277
40a82448 6278 /* execute body of (?...A) */
37f53970 6279 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6280 assert(0); /* NOTREACHED */
37f53970 6281 }
40a82448
DM
6282
6283 case IFMATCH_A_fail: /* body of (?...A) failed */
6284 ST.wanted = !ST.wanted;
6285 /* FALL THROUGH */
6286
6287 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6288 if (ST.logical) {
f2338a2e 6289 sw = cBOOL(ST.wanted);
40a82448
DM
6290 }
6291 else if (!ST.wanted)
6292 sayNO;
6293
37f53970
DM
6294 if (OP(ST.me) != SUSPEND) {
6295 /* restore old position except for (?>...) */
6296 locinput = st->locinput;
40a82448
DM
6297 }
6298 scan = ST.me + ARG(ST.me);
6299 if (scan == ST.me)
6300 scan = NULL;
6301 continue; /* execute B */
6302
6303#undef ST
dad79028 6304
3c0563b9
DM
6305 case LONGJMP: /* alternative with many branches compiles to
6306 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6307 next = scan + ARG(scan);
6308 if (next == scan)
6309 next = NULL;
a0d0e21e 6310 break;
3c0563b9
DM
6311
6312 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6313 reginfo->cutpoint = PL_regeol;
6314 /* FALLTHROUGH */
3c0563b9
DM
6315
6316 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6317 if (!scan->flags)
ad64d0ec 6318 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6319 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6320 assert(0); /* NOTREACHED */
3c0563b9 6321
54612592
YO
6322 case COMMIT_next_fail:
6323 no_final = 1;
6324 /* FALLTHROUGH */
3c0563b9
DM
6325
6326 case OPFAIL: /* (*FAIL) */
7f69552c 6327 sayNO;
118e2215 6328 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6329
6330#define ST st->u.mark
3c0563b9 6331 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6332 ST.prev_mark = mark_state;
5d458dd8 6333 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6334 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6335 mark_state = st;
4d5016e5
DM
6336 ST.mark_loc = locinput;
6337 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6338 assert(0); /* NOTREACHED */
3c0563b9 6339
e2e6a0f1
YO
6340 case MARKPOINT_next:
6341 mark_state = ST.prev_mark;
6342 sayYES;
118e2215 6343 assert(0); /* NOTREACHED */
3c0563b9 6344
e2e6a0f1 6345 case MARKPOINT_next_fail:
5d458dd8 6346 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6347 {
6348 if (ST.mark_loc > startpoint)
6349 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6350 popmark = NULL; /* we found our mark */
6351 sv_commit = ST.mark_name;
6352
6353 DEBUG_EXECUTE_r({
5d458dd8 6354 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6355 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6356 REPORT_CODE_OFF+depth*2, "",
be2597df 6357 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6358 });
6359 }
6360 mark_state = ST.prev_mark;
5d458dd8
YO
6361 sv_yes_mark = mark_state ?
6362 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6363 sayNO;
118e2215 6364 assert(0); /* NOTREACHED */
3c0563b9
DM
6365
6366 case SKIP: /* (*SKIP) */
5d458dd8 6367 if (scan->flags) {
2bf803e2 6368 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6369 ST.mark_name = NULL;
e2e6a0f1 6370 ST.mark_loc = locinput;
4d5016e5 6371 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6372 } else {
2bf803e2 6373 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6374 otherwise do nothing. Meaning we need to scan
6375 */
6376 regmatch_state *cur = mark_state;
ad64d0ec 6377 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6378
6379 while (cur) {
6380 if ( sv_eq( cur->u.mark.mark_name,
6381 find ) )
6382 {
6383 ST.mark_name = find;
4d5016e5 6384 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6385 }
6386 cur = cur->u.mark.prev_mark;
6387 }
e2e6a0f1 6388 }
2bf803e2 6389 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6390 break;
3c0563b9 6391
5d458dd8
YO
6392 case SKIP_next_fail:
6393 if (ST.mark_name) {
6394 /* (*CUT:NAME) - Set up to search for the name as we
6395 collapse the stack*/
6396 popmark = ST.mark_name;
6397 } else {
6398 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6399 if (ST.mark_loc > startpoint)
6400 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6401 /* but we set sv_commit to latest mark_name if there
6402 is one so they can test to see how things lead to this
6403 cut */
6404 if (mark_state)
6405 sv_commit=mark_state->u.mark.mark_name;
6406 }
e2e6a0f1
YO
6407 no_final = 1;
6408 sayNO;
118e2215 6409 assert(0); /* NOTREACHED */
e2e6a0f1 6410#undef ST
3c0563b9
DM
6411
6412 case LNBREAK: /* \R */
7016d6eb 6413 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6414 locinput += n;
e1d1eefb
YO
6415 } else
6416 sayNO;
6417 break;
6418
6419#define CASE_CLASS(nAmE) \
6420 case nAmE: \
7016d6eb 6421 if (NEXTCHR_IS_EOS) \
1380b51d 6422 sayNO; \
f2ed9b32 6423 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb 6424 locinput += n; \
e1d1eefb
YO
6425 } else \
6426 sayNO; \
6427 break; \
6428 case N##nAmE: \
7016d6eb 6429 if (NEXTCHR_IS_EOS) \
1380b51d 6430 sayNO; \
f2ed9b32 6431 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
6432 sayNO; \
6433 } else { \
6434 locinput += UTF8SKIP(locinput); \
e1d1eefb
YO
6435 } \
6436 break
6437
3c0563b9
DM
6438 CASE_CLASS(VERTWS); /* \v \V */
6439 CASE_CLASS(HORIZWS); /* \h \H */
e1d1eefb
YO
6440#undef CASE_CLASS
6441
a0d0e21e 6442 default:
b900a521 6443 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6444 PTR2UV(scan), OP(scan));
cea2e8a9 6445 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6446
6447 /* this is a point to jump to in order to increment
6448 * locinput by one character */
6449 increment_locinput:
e6ca698c 6450 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6451 if (utf8_target) {
6452 locinput += PL_utf8skip[nextchr];
7016d6eb 6453 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6454 if (locinput > PL_regeol)
6455 sayNO;
28b98f76
DM
6456 }
6457 else
3640db6b 6458 locinput++;
28b98f76 6459 break;
5d458dd8
YO
6460
6461 } /* end switch */
95b24440 6462
5d458dd8
YO
6463 /* switch break jumps here */
6464 scan = next; /* prepare to execute the next op and ... */
6465 continue; /* ... jump back to the top, reusing st */
118e2215 6466 assert(0); /* NOTREACHED */
95b24440 6467
40a82448
DM
6468 push_yes_state:
6469 /* push a state that backtracks on success */
6470 st->u.yes.prev_yes_state = yes_state;
6471 yes_state = st;
6472 /* FALL THROUGH */
6473 push_state:
6474 /* push a new regex state, then continue at scan */
6475 {
6476 regmatch_state *newst;
6477
24b23f37
YO
6478 DEBUG_STACK_r({
6479 regmatch_state *cur = st;
6480 regmatch_state *curyes = yes_state;
6481 int curd = depth;
6482 regmatch_slab *slab = PL_regmatch_slab;
6483 for (;curd > -1;cur--,curd--) {
6484 if (cur < SLAB_FIRST(slab)) {
6485 slab = slab->prev;
6486 cur = SLAB_LAST(slab);
6487 }
6488 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6489 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6490 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6491 (curyes == cur) ? "yes" : ""
6492 );
6493 if (curyes == cur)
6494 curyes = cur->u.yes.prev_yes_state;
6495 }
6496 } else
6497 DEBUG_STATE_pp("push")
6498 );
40a82448 6499 depth++;
40a82448
DM
6500 st->locinput = locinput;
6501 newst = st+1;
6502 if (newst > SLAB_LAST(PL_regmatch_slab))
6503 newst = S_push_slab(aTHX);
6504 PL_regmatch_state = newst;
786e8c11 6505
4d5016e5 6506 locinput = pushinput;
40a82448
DM
6507 st = newst;
6508 continue;
118e2215 6509 assert(0); /* NOTREACHED */
40a82448 6510 }
a0d0e21e 6511 }
a687059c 6512
a0d0e21e
LW
6513 /*
6514 * We get here only if there's trouble -- normally "case END" is
6515 * the terminating point.
6516 */
cea2e8a9 6517 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6518 /*NOTREACHED*/
4633a7c4
LW
6519 sayNO;
6520
262b90c4 6521yes:
77cb431f
DM
6522 if (yes_state) {
6523 /* we have successfully completed a subexpression, but we must now
6524 * pop to the state marked by yes_state and continue from there */
77cb431f 6525 assert(st != yes_state);
5bc10b2c
DM
6526#ifdef DEBUGGING
6527 while (st != yes_state) {
6528 st--;
6529 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6530 PL_regmatch_slab = PL_regmatch_slab->prev;
6531 st = SLAB_LAST(PL_regmatch_slab);
6532 }
e2e6a0f1 6533 DEBUG_STATE_r({
54612592
YO
6534 if (no_final) {
6535 DEBUG_STATE_pp("pop (no final)");
6536 } else {
6537 DEBUG_STATE_pp("pop (yes)");
6538 }
e2e6a0f1 6539 });
5bc10b2c
DM
6540 depth--;
6541 }
6542#else
77cb431f
DM
6543 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6544 || yes_state > SLAB_LAST(PL_regmatch_slab))
6545 {
6546 /* not in this slab, pop slab */
6547 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6548 PL_regmatch_slab = PL_regmatch_slab->prev;
6549 st = SLAB_LAST(PL_regmatch_slab);
6550 }
6551 depth -= (st - yes_state);
5bc10b2c 6552#endif
77cb431f
DM
6553 st = yes_state;
6554 yes_state = st->u.yes.prev_yes_state;
6555 PL_regmatch_state = st;
24b23f37 6556
3640db6b 6557 if (no_final)
5d458dd8 6558 locinput= st->locinput;
54612592 6559 state_num = st->resume_state + no_final;
24d3c4a9 6560 goto reenter_switch;
77cb431f
DM
6561 }
6562
a3621e74 6563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6564 PL_colors[4], PL_colors[5]));
02db2b7b 6565
ed301438 6566 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6567 /* each successfully executed (?{...}) block does the equivalent of
6568 * local $^R = do {...}
6569 * When popping the save stack, all these locals would be undone;
6570 * bypass this by setting the outermost saved $^R to the latest
6571 * value */
6572 if (oreplsv != GvSV(PL_replgv))
6573 sv_setsv(oreplsv, GvSV(PL_replgv));
6574 }
95b24440 6575 result = 1;
aa283a38 6576 goto final_exit;
4633a7c4
LW
6577
6578no:
a3621e74 6579 DEBUG_EXECUTE_r(
7821416a 6580 PerlIO_printf(Perl_debug_log,
786e8c11 6581 "%*s %sfailed...%s\n",
5bc10b2c 6582 REPORT_CODE_OFF+depth*2, "",
786e8c11 6583 PL_colors[4], PL_colors[5])
7821416a 6584 );
aa283a38 6585
262b90c4 6586no_silent:
54612592
YO
6587 if (no_final) {
6588 if (yes_state) {
6589 goto yes;
6590 } else {
6591 goto final_exit;
6592 }
6593 }
aa283a38
DM
6594 if (depth) {
6595 /* there's a previous state to backtrack to */
40a82448
DM
6596 st--;
6597 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6598 PL_regmatch_slab = PL_regmatch_slab->prev;
6599 st = SLAB_LAST(PL_regmatch_slab);
6600 }
6601 PL_regmatch_state = st;
40a82448 6602 locinput= st->locinput;
40a82448 6603
5bc10b2c
DM
6604 DEBUG_STATE_pp("pop");
6605 depth--;
262b90c4
DM
6606 if (yes_state == st)
6607 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6608
24d3c4a9
DM
6609 state_num = st->resume_state + 1; /* failure = success + 1 */
6610 goto reenter_switch;
95b24440 6611 }
24d3c4a9 6612 result = 0;
aa283a38 6613
262b90c4 6614 final_exit:
bbe252da 6615 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6616 SV *sv_err = get_sv("REGERROR", 1);
6617 SV *sv_mrk = get_sv("REGMARK", 1);
6618 if (result) {
e2e6a0f1 6619 sv_commit = &PL_sv_no;
5d458dd8
YO
6620 if (!sv_yes_mark)
6621 sv_yes_mark = &PL_sv_yes;
6622 } else {
6623 if (!sv_commit)
6624 sv_commit = &PL_sv_yes;
6625 sv_yes_mark = &PL_sv_no;
6626 }
6627 sv_setsv(sv_err, sv_commit);
6628 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6629 }
19b95bf0 6630
81ed78b2
DM
6631
6632 if (last_pushed_cv) {
6633 dSP;
6634 POP_MULTICALL;
4f8dbb2d 6635 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6636 }
6637
2f554ef7
DM
6638 /* clean up; in particular, free all slabs above current one */
6639 LEAVE_SCOPE(oldsave);
5d9a96ca 6640
730f4c74
DM
6641 assert(!result || locinput - PL_bostr >= 0);
6642 return result ? locinput - PL_bostr : -1;
a687059c
LW
6643}
6644
6645/*
6646 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6647 *
e64f369d
KW
6648 * What 'simple' means is a node which can be the operand of a quantifier like
6649 * '+', or {1,3}
6650 *
d60de1d1
DM
6651 * startposp - pointer a pointer to the start position. This is updated
6652 * to point to the byte following the highest successful
6653 * match.
6654 * p - the regnode to be repeatedly matched against.
4063ade8 6655 * max - maximum number of things to match.
d60de1d1 6656 * depth - (for debugging) backtracking depth.
a687059c 6657 */
76e3520e 6658STATIC I32
f73aaa43 6659S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
a687059c 6660{
27da23d5 6661 dVAR;
4063ade8 6662 char *scan; /* Pointer to current position in target string */
eb578fdb 6663 I32 c;
4063ade8
KW
6664 char *loceol = PL_regeol; /* local version */
6665 I32 hardcount = 0; /* How many matches so far */
eb578fdb 6666 bool utf8_target = PL_reg_match_utf8;
d513472c 6667 UV utf8_flags;
4f55667c
SP
6668#ifndef DEBUGGING
6669 PERL_UNUSED_ARG(depth);
6670#endif
a0d0e21e 6671
7918f24d
NC
6672 PERL_ARGS_ASSERT_REGREPEAT;
6673
f73aaa43 6674 scan = *startposp;
faf11cac
HS
6675 if (max == REG_INFTY)
6676 max = I32_MAX;
4063ade8 6677 else if (! utf8_target && scan + max < loceol)
7f596f4c 6678 loceol = scan + max;
4063ade8
KW
6679
6680 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6681 * to the maximum of how far we should go in it (leaving it set to the real
6682 * end, if the maximum permissible would take us beyond that). This allows
6683 * us to make the loop exit condition that we haven't gone past <loceol> to
6684 * also mean that we haven't exceeded the max permissible count, saving a
6685 * test each time through the loop. But it assumes that the OP matches a
6686 * single byte, which is true for most of the OPs below when applied to a
6687 * non-UTF-8 target. Those relatively few OPs that don't have this
6688 * characteristic will have to compensate.
6689 *
6690 * There is no adjustment for UTF-8 targets, as the number of bytes per
6691 * character varies. OPs will have to test both that the count is less
6692 * than the max permissible (using <hardcount> to keep track), and that we
6693 * are still within the bounds of the string (using <loceol>. A few OPs
6694 * match a single byte no matter what the encoding. They can omit the max
6695 * test if, for the UTF-8 case, they do the adjustment that was skipped
6696 * above.
6697 *
6698 * Thus, the code above sets things up for the common case; and exceptional
6699 * cases need extra work; the common case is to make sure <scan> doesn't
6700 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6701 * count doesn't exceed the maximum permissible */
6702
a0d0e21e 6703 switch (OP(p)) {
22c35a8c 6704 case REG_ANY:
f2ed9b32 6705 if (utf8_target) {
1aa99e6b 6706 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6707 scan += UTF8SKIP(scan);
6708 hardcount++;
6709 }
6710 } else {
6711 while (scan < loceol && *scan != '\n')
6712 scan++;
a0ed51b3
LW
6713 }
6714 break;
ffc61ed2 6715 case SANY:
f2ed9b32 6716 if (utf8_target) {
a0804c9e 6717 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6718 scan += UTF8SKIP(scan);
6719 hardcount++;
6720 }
6721 }
6722 else
6723 scan = loceol;
a0ed51b3 6724 break;
4063ade8
KW
6725 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6726 if (utf8_target && scan + max < loceol) {
6727
6728 /* <loceol> hadn't been adjusted in the UTF-8 case */
6729 scan += max;
6730 }
6731 else {
6732 scan = loceol;
6733 }
f33976b4 6734 break;
59d32103 6735 case EXACT:
613a425d
KW
6736 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6737
59d32103 6738 c = (U8)*STRING(p);
59d32103 6739
5e4a1da1
KW
6740 /* Can use a simple loop if the pattern char to match on is invariant
6741 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6742 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6743 * true iff it doesn't matter if the argument is in UTF-8 or not */
6744 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
4063ade8
KW
6745 if (utf8_target && scan + max < loceol) {
6746 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6747 * since here, to match at all, 1 char == 1 byte */
6748 loceol = scan + max;
6749 }
59d32103
KW
6750 while (scan < loceol && UCHARAT(scan) == c) {
6751 scan++;
6752 }
6753 }
b40a2c17 6754 else if (UTF_PATTERN) {
5e4a1da1
KW
6755 if (utf8_target) {
6756 STRLEN scan_char_len;
5e4a1da1 6757
4063ade8 6758 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6759 * string EQ */
6760 while (hardcount < max
6761 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6762 && scan_char_len <= STR_LEN(p)
6763 && memEQ(scan, STRING(p), scan_char_len))
6764 {
6765 scan += scan_char_len;
6766 hardcount++;
6767 }
6768 }
6769 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6770
5e4a1da1
KW
6771 /* Target isn't utf8; convert the character in the UTF-8
6772 * pattern to non-UTF8, and do a simple loop */
6773 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6774 while (scan < loceol && UCHARAT(scan) == c) {
6775 scan++;
6776 }
6777 } /* else pattern char is above Latin1, can't possibly match the
6778 non-UTF-8 target */
b40a2c17 6779 }
5e4a1da1 6780 else {
59d32103 6781
5e4a1da1
KW
6782 /* Here, the string must be utf8; pattern isn't, and <c> is
6783 * different in utf8 than not, so can't compare them directly.
6784 * Outside the loop, find the two utf8 bytes that represent c, and
6785 * then look for those in sequence in the utf8 string */
59d32103
KW
6786 U8 high = UTF8_TWO_BYTE_HI(c);
6787 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6788
6789 while (hardcount < max
6790 && scan + 1 < loceol
6791 && UCHARAT(scan) == high
6792 && UCHARAT(scan + 1) == low)
6793 {
6794 scan += 2;
6795 hardcount++;
6796 }
6797 }
6798 break;
5e4a1da1 6799
2f7f8cb1
KW
6800 case EXACTFA:
6801 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6802 goto do_exactf;
6803
d4e0b827
KW
6804 case EXACTFL:
6805 PL_reg_flags |= RF_tainted;
17580e7a
KW
6806 utf8_flags = FOLDEQ_UTF8_LOCALE;
6807 goto do_exactf;
6808
d4e0b827 6809 case EXACTF:
62bf7766
KW
6810 utf8_flags = 0;
6811 goto do_exactf;
6812
3c760661 6813 case EXACTFU_SS:
fab2782b 6814 case EXACTFU_TRICKYFOLD:
9a5a5549 6815 case EXACTFU:
05f861a2 6816 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6817
613a425d
KW
6818 do_exactf: {
6819 int c1, c2;
6820 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6821
613a425d
KW
6822 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6823
6824 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6825 if (c1 == CHRTEST_VOID) {
49b95fad 6826 /* Use full Unicode fold matching */
4063ade8 6827 char *tmpeol = PL_regeol;
49b95fad
KW
6828 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6829 while (hardcount < max
6830 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6831 STRING(p), NULL, pat_len,
6832 cBOOL(UTF_PATTERN), utf8_flags))
6833 {
6834 scan = tmpeol;
4063ade8 6835 tmpeol = PL_regeol;
49b95fad
KW
6836 hardcount++;
6837 }
613a425d
KW
6838 }
6839 else if (utf8_target) {
6840 if (c1 == c2) {
4063ade8
KW
6841 while (scan < loceol
6842 && hardcount < max
613a425d
KW
6843 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6844 {
6845 scan += UTF8SKIP(scan);
6846 hardcount++;
6847 }
6848 }
6849 else {
4063ade8
KW
6850 while (scan < loceol
6851 && hardcount < max
613a425d
KW
6852 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6853 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6854 {
6855 scan += UTF8SKIP(scan);
6856 hardcount++;
6857 }
6858 }
6859 }
6860 else if (c1 == c2) {
6861 while (scan < loceol && UCHARAT(scan) == c1) {
6862 scan++;
6863 }
6864 }
6865 else {
6866 while (scan < loceol &&
6867 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6868 {
6869 scan++;
6870 }
6871 }
634c83a2 6872 }
bbce6d69 6873 break;
613a425d 6874 }
a0d0e21e 6875 case ANYOF:
e0193e47 6876 if (utf8_target) {
4e8910e0 6877 STRLEN inclasslen;
4e8910e0 6878 while (hardcount < max
635cd5d4
KW
6879 && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6880 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0
KW
6881 {
6882 scan += inclasslen;
ffc61ed2
JH
6883 hardcount++;
6884 }
6885 } else {
32fc9b6a 6886 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6887 scan++;
6888 }
a0d0e21e 6889 break;
980866de 6890 case ALNUMU:
f2ed9b32 6891 if (utf8_target) {
980866de 6892 utf8_wordchar:
1a4fad37 6893 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6894 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6895 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6896 {
ffc61ed2
JH
6897 scan += UTF8SKIP(scan);
6898 hardcount++;
6899 }
980866de 6900 } else {
a12cf05f
KW
6901 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6902 scan++;
6903 }
980866de
KW
6904 }
6905 break;
6906 case ALNUM:
6907 if (utf8_target)
6908 goto utf8_wordchar;
6909 while (scan < loceol && isALNUM((U8) *scan)) {
6910 scan++;
a0ed51b3
LW
6911 }
6912 break;
cfaf538b 6913 case ALNUMA:
4063ade8
KW
6914 if (utf8_target && scan + max < loceol) {
6915
6916 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6917 * since here, to match, 1 char == 1 byte */
6918 loceol = scan + max;
6919 }
cfaf538b
KW
6920 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6921 scan++;
6922 }
6923 break;
bbce6d69 6924 case ALNUML:
3280af22 6925 PL_reg_flags |= RF_tainted;
f2ed9b32 6926 if (utf8_target) {
1aa99e6b
IH
6927 while (hardcount < max && scan < loceol &&
6928 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6929 scan += UTF8SKIP(scan);
6930 hardcount++;
6931 }
6932 } else {
6933 while (scan < loceol && isALNUM_LC(*scan))
6934 scan++;
a0ed51b3
LW
6935 }
6936 break;
980866de 6937 case NALNUMU:
f2ed9b32 6938 if (utf8_target) {
980866de
KW
6939
6940 utf8_Nwordchar:
6941
1a4fad37 6942 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6943 while (hardcount < max && scan < loceol &&
980866de 6944 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6945 {
ffc61ed2
JH
6946 scan += UTF8SKIP(scan);
6947 hardcount++;
6948 }
980866de 6949 } else {
a12cf05f
KW
6950 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6951 scan++;
6952 }
980866de
KW
6953 }
6954 break;
6955 case NALNUM:
6956 if (utf8_target)
6957 goto utf8_Nwordchar;
6958 while (scan < loceol && ! isALNUM((U8) *scan)) {
6959 scan++;
a0ed51b3
LW
6960 }
6961 break;
0658cdde
KW
6962
6963 case POSIXA:
4063ade8
KW
6964 if (utf8_target && scan + max < loceol) {
6965
6966 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6967 * since here, to match, 1 char == 1 byte */
6968 loceol = scan + max;
6969 }
6970 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
6971 scan++;
6972 }
6973 break;
6974 case NPOSIXA:
6975 if (utf8_target) {
4063ade8
KW
6976 while (scan < loceol && hardcount < max
6977 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6978 {
0658cdde 6979 scan += UTF8SKIP(scan);
4063ade8 6980 hardcount++;
0658cdde
KW
6981 }
6982 }
6983 else {
6984 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6985 scan++;
6986 }
6987 }
6988 break;
cfaf538b
KW
6989 case NALNUMA:
6990 if (utf8_target) {
4063ade8
KW
6991 while (scan < loceol && hardcount < max
6992 && ! isWORDCHAR_A((U8) *scan))
6993 {
cfaf538b 6994 scan += UTF8SKIP(scan);
4063ade8 6995 hardcount++;
cfaf538b
KW
6996 }
6997 }
6998 else {
6999 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
7000 scan++;
7001 }
7002 }
7003 break;
bbce6d69 7004 case NALNUML:
3280af22 7005 PL_reg_flags |= RF_tainted;
f2ed9b32 7006 if (utf8_target) {
1aa99e6b
IH
7007 while (hardcount < max && scan < loceol &&
7008 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
7009 scan += UTF8SKIP(scan);
7010 hardcount++;
7011 }
7012 } else {
7013 while (scan < loceol && !isALNUM_LC(*scan))
7014 scan++;
a0ed51b3
LW
7015 }
7016 break;
980866de 7017 case SPACEU:
f2ed9b32 7018 if (utf8_target) {
980866de
KW
7019
7020 utf8_space:
7021
bedac28b
KW
7022 while (hardcount < max && scan < loceol
7023 && is_XPERLSPACE_utf8((U8*)scan))
a12cf05f 7024 {
ffc61ed2
JH
7025 scan += UTF8SKIP(scan);
7026 hardcount++;
7027 }
980866de
KW
7028 break;
7029 }
7030 else {
a12cf05f
KW
7031 while (scan < loceol && isSPACE_L1((U8) *scan)) {
7032 scan++;
7033 }
980866de
KW
7034 break;
7035 }
7036 case SPACE:
7037 if (utf8_target)
7038 goto utf8_space;
7039
7040 while (scan < loceol && isSPACE((U8) *scan)) {
7041 scan++;
a0ed51b3
LW
7042 }
7043 break;
cfaf538b 7044 case SPACEA:
4063ade8
KW
7045 if (utf8_target && scan + max < loceol) {
7046
7047 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7048 * since here, to match, 1 char == 1 byte */
7049 loceol = scan + max;
7050 }
cfaf538b
KW
7051 while (scan < loceol && isSPACE_A((U8) *scan)) {
7052 scan++;
7053 }
7054 break;
bbce6d69 7055 case SPACEL:
3280af22 7056 PL_reg_flags |= RF_tainted;
f2ed9b32 7057 if (utf8_target) {
1aa99e6b 7058 while (hardcount < max && scan < loceol &&
6bbba904 7059 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
7060 scan += UTF8SKIP(scan);
7061 hardcount++;
7062 }
7063 } else {
7064 while (scan < loceol && isSPACE_LC(*scan))
7065 scan++;
a0ed51b3
LW
7066 }
7067 break;
980866de 7068 case NSPACEU:
f2ed9b32 7069 if (utf8_target) {
980866de
KW
7070
7071 utf8_Nspace:
7072
bedac28b
KW
7073 while (hardcount < max && scan < loceol
7074 && ! is_XPERLSPACE_utf8((U8*)scan))
a12cf05f 7075 {
ffc61ed2
JH
7076 scan += UTF8SKIP(scan);
7077 hardcount++;
7078 }
980866de
KW
7079 break;
7080 }
7081 else {
a12cf05f
KW
7082 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7083 scan++;
7084 }
980866de
KW
7085 }
7086 break;
7087 case NSPACE:
7088 if (utf8_target)
7089 goto utf8_Nspace;
7090
7091 while (scan < loceol && ! isSPACE((U8) *scan)) {
7092 scan++;
a0ed51b3 7093 }
0008a298 7094 break;
cfaf538b
KW
7095 case NSPACEA:
7096 if (utf8_target) {
4063ade8
KW
7097 while (hardcount < max && scan < loceol
7098 && ! isSPACE_A((U8) *scan))
7099 {
cfaf538b 7100 scan += UTF8SKIP(scan);
4063ade8 7101 hardcount++;
cfaf538b
KW
7102 }
7103 }
7104 else {
7105 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7106 scan++;
7107 }
7108 }
7109 break;
bbce6d69 7110 case NSPACEL:
3280af22 7111 PL_reg_flags |= RF_tainted;
f2ed9b32 7112 if (utf8_target) {
1aa99e6b 7113 while (hardcount < max && scan < loceol &&
6bbba904 7114 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
7115 scan += UTF8SKIP(scan);
7116 hardcount++;
7117 }
7118 } else {
7119 while (scan < loceol && !isSPACE_LC(*scan))
7120 scan++;
a0ed51b3
LW
7121 }
7122 break;
a0d0e21e 7123 case DIGIT:
f2ed9b32 7124 if (utf8_target) {
1a4fad37 7125 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 7126 while (hardcount < max && scan < loceol &&
f2ed9b32 7127 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
7128 scan += UTF8SKIP(scan);
7129 hardcount++;
7130 }
7131 } else {
7132 while (scan < loceol && isDIGIT(*scan))
7133 scan++;
a0ed51b3
LW
7134 }
7135 break;
cfaf538b 7136 case DIGITA:
4063ade8
KW
7137 if (utf8_target && scan + max < loceol) {
7138
7139 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7140 * since here, to match, 1 char == 1 byte */
7141 loceol = scan + max;
7142 }
cfaf538b
KW
7143 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7144 scan++;
7145 }
7146 break;
b77393f6
KW
7147 case DIGITL:
7148 PL_reg_flags |= RF_tainted;
7149 if (utf8_target) {
b77393f6
KW
7150 while (hardcount < max && scan < loceol &&
7151 isDIGIT_LC_utf8((U8*)scan)) {
7152 scan += UTF8SKIP(scan);
7153 hardcount++;
7154 }
7155 } else {
7156 while (scan < loceol && isDIGIT_LC(*scan))
7157 scan++;
7158 }
7159 break;
a0d0e21e 7160 case NDIGIT:
f2ed9b32 7161 if (utf8_target) {
1a4fad37 7162 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 7163 while (hardcount < max && scan < loceol &&
f2ed9b32 7164 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
7165 scan += UTF8SKIP(scan);
7166 hardcount++;
7167 }
7168 } else {
7169 while (scan < loceol && !isDIGIT(*scan))
7170 scan++;
a0ed51b3 7171 }
cfaf538b
KW
7172 break;
7173 case NDIGITA:
7174 if (utf8_target) {
4063ade8
KW
7175 while (hardcount < max && scan < loceol
7176 && ! isDIGIT_A((U8) *scan)) {
cfaf538b 7177 scan += UTF8SKIP(scan);
4063ade8 7178 hardcount++;
cfaf538b
KW
7179 }
7180 }
7181 else {
7182 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7183 scan++;
7184 }
7185 }
7186 break;
b77393f6
KW
7187 case NDIGITL:
7188 PL_reg_flags |= RF_tainted;
7189 if (utf8_target) {
b77393f6
KW
7190 while (hardcount < max && scan < loceol &&
7191 !isDIGIT_LC_utf8((U8*)scan)) {
7192 scan += UTF8SKIP(scan);
7193 hardcount++;
7194 }
7195 } else {
7196 while (scan < loceol && !isDIGIT_LC(*scan))
7197 scan++;
7198 }
7199 break;
e1d1eefb 7200 case LNBREAK:
e64f369d
KW
7201 if (utf8_target) {
7202 while (hardcount < max && scan < loceol &&
7203 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7204 scan += c;
7205 hardcount++;
7206 }
7207 } else {
7208 /* LNBREAK can match one or two latin chars, which is ok, but we
7209 * have to use hardcount in this situation, and throw away the
7210 * adjustment to <loceol> done before the switch statement */
7211 loceol = PL_regeol;
7212 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7213 scan+=c;
7214 hardcount++;
7215 }
7216 }
7217 break;
e1d1eefb 7218 case HORIZWS:
f2ed9b32 7219 if (utf8_target) {
7016d6eb
DM
7220 while (hardcount < max && scan < loceol &&
7221 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7222 {
e1d1eefb
YO
7223 scan += c;
7224 hardcount++;
7225 }
7226 } else {
7016d6eb 7227 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7228 scan++;
7229 }
a0ed51b3 7230 break;
e1d1eefb 7231 case NHORIZWS:
f2ed9b32 7232 if (utf8_target) {
7016d6eb
DM
7233 while (hardcount < max && scan < loceol &&
7234 !is_HORIZWS_utf8_safe(scan, loceol))
7235 {
e1d1eefb
YO
7236 scan += UTF8SKIP(scan);
7237 hardcount++;
7238 }
7239 } else {
7016d6eb 7240 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7241 scan++;
7242
7243 }
7244 break;
7245 case VERTWS:
f2ed9b32 7246 if (utf8_target) {
7016d6eb
DM
7247 while (hardcount < max && scan < loceol &&
7248 (c=is_VERTWS_utf8_safe(scan, loceol)))
7249 {
e1d1eefb
YO
7250 scan += c;
7251 hardcount++;
7252 }
7253 } else {
7016d6eb 7254 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7255 scan++;
7256
7257 }
7258 break;
7259 case NVERTWS:
f2ed9b32 7260 if (utf8_target) {
7016d6eb
DM
7261 while (hardcount < max && scan < loceol &&
7262 !is_VERTWS_utf8_safe(scan, loceol))
7263 {
e1d1eefb
YO
7264 scan += UTF8SKIP(scan);
7265 hardcount++;
7266 }
7267 } else {
7016d6eb 7268 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7269 scan++;
7270
7271 }
7272 break;
7273
584b1f02
KW
7274 case BOUND:
7275 case BOUNDA:
7276 case BOUNDL:
7277 case BOUNDU:
7278 case EOS:
7279 case GPOS:
7280 case KEEPS:
7281 case NBOUND:
7282 case NBOUNDA:
7283 case NBOUNDL:
7284 case NBOUNDU:
7285 case OPFAIL:
7286 case SBOL:
7287 case SEOL:
7288 /* These are all 0 width, so match right here or not at all. */
7289 break;
7290
7291 default:
7292 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7293 assert(0); /* NOTREACHED */
7294
a0d0e21e 7295 }
a687059c 7296
a0ed51b3
LW
7297 if (hardcount)
7298 c = hardcount;
7299 else
f73aaa43
DM
7300 c = scan - *startposp;
7301 *startposp = scan;
a687059c 7302
a3621e74 7303 DEBUG_r({
e68ec53f 7304 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7305 DEBUG_EXECUTE_r({
e68ec53f
YO
7306 SV * const prop = sv_newmortal();
7307 regprop(prog, prop, p);
7308 PerlIO_printf(Perl_debug_log,
be8e71aa 7309 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7310 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7311 });
be8e71aa 7312 });
9041c2e3 7313
a0d0e21e 7314 return(c);
a687059c
LW
7315}
7316
c277df42 7317
be8e71aa 7318#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7319/*
6c6525b8 7320- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7321create a copy so that changes the caller makes won't change the shared one.
7322If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7323 */
ffc61ed2 7324SV *
5aaab254 7325Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7326{
6c6525b8 7327 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7328
7329 if (altsvp) {
7330 *altsvp = NULL;
7331 }
7332
7333 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7334}
7335#endif
7336
7337STATIC SV *
5aaab254 7338S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7339{
8c9eb58f
KW
7340 /* Returns the swash for the input 'node' in the regex 'prog'.
7341 * If <doinit> is true, will attempt to create the swash if not already
7342 * done.
7343 * If <listsvp> is non-null, will return the swash initialization string in
7344 * it.
8c9eb58f
KW
7345 * Tied intimately to how regcomp.c sets up the data structure */
7346
97aff369 7347 dVAR;
9e55ce06
JH
7348 SV *sw = NULL;
7349 SV *si = NULL;
7a6c6baa
KW
7350 SV* invlist = NULL;
7351
f8fc2ecf
YO
7352 RXi_GET_DECL(prog,progi);
7353 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7354
6c6525b8 7355 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7356
ccb2541c
KW
7357 assert(ANYOF_NONBITMAP(node));
7358
4f639d21 7359 if (data && data->count) {
a3b680e6 7360 const U32 n = ARG(node);
ffc61ed2 7361
4f639d21 7362 if (data->what[n] == 's') {
ad64d0ec
NC
7363 SV * const rv = MUTABLE_SV(data->data[n]);
7364 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7365 SV **const ary = AvARRAY(av);
87367d5f 7366 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7367
8c9eb58f 7368 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7369
88675427
KW
7370 /* Elements 2 and 3 are either both present or both absent. [2] is
7371 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7372 * that inversion list has any user-defined properties in it. */
88675427
KW
7373 if (av_len(av) >= 2) {
7374 invlist = ary[2];
7375 if (SvUV(ary[3])) {
83199d38
KW
7376 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7377 }
7a6c6baa
KW
7378 }
7379 else {
7380 invlist = NULL;
7a6c6baa
KW
7381 }
7382
8c9eb58f
KW
7383 /* Element [1] is reserved for the set-up swash. If already there,
7384 * return it; if not, create it and store it there */
f192cf32
KW
7385 if (SvROK(ary[1])) {
7386 sw = ary[1];
7387 }
ffc61ed2 7388 else if (si && doinit) {
7a6c6baa
KW
7389
7390 sw = _core_swash_init("utf8", /* the utf8 package */
7391 "", /* nameless */
7392 si,
7393 1, /* binary */
7394 0, /* not from tr/// */
7a6c6baa 7395 invlist,
83199d38 7396 &swash_init_flags);
ffc61ed2
JH
7397 (void)av_store(av, 1, sw);
7398 }
7399 }
7400 }
7401
7a6c6baa
KW
7402 if (listsvp) {
7403 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7404
7405 /* Use the swash, if any, which has to have incorporated into it all
7406 * possibilities */
872dd7e0
KW
7407 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7408 && (si && si != &PL_sv_undef))
7409 {
7a6c6baa 7410
872dd7e0 7411 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7412 sv_catsv(matches_string, si);
7413 }
7414
7415 /* Add the inversion list to whatever we have. This may have come from
7416 * the swash, or from an input parameter */
7417 if (invlist) {
7418 sv_catsv(matches_string, _invlist_contents(invlist));
7419 }
7420 *listsvp = matches_string;
7421 }
7422
ffc61ed2
JH
7423 return sw;
7424}
7425
7426/*
ba7b4546 7427 - reginclass - determine if a character falls into a character class
832705d4 7428
6698fab5
KW
7429 n is the ANYOF regnode
7430 p is the target string
6698fab5 7431 utf8_target tells whether p is in UTF-8.
832705d4 7432
635cd5d4 7433 Returns true if matched; false otherwise.
eba1359e 7434
d5788240
KW
7435 Note that this can be a synthetic start class, a combination of various
7436 nodes, so things you think might be mutually exclusive, such as locale,
7437 aren't. It can match both locale and non-locale
7438
bbce6d69 7439 */
7440
76e3520e 7441STATIC bool
5aaab254 7442S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
bbce6d69 7443{
27da23d5 7444 dVAR;
a3b680e6 7445 const char flags = ANYOF_FLAGS(n);
bbce6d69 7446 bool match = FALSE;
cc07378b 7447 UV c = *p;
1aa99e6b 7448
7918f24d
NC
7449 PERL_ARGS_ASSERT_REGINCLASS;
7450
afd2eb18
KW
7451 /* If c is not already the code point, get it. Note that
7452 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7453 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7454 STRLEN c_len = 0;
f7ab54c6 7455 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7456 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7457 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7458 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7459 * UTF8_ALLOW_FFFF */
f7ab54c6 7460 if (c_len == (STRLEN)-1)
e8a70c6f 7461 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7462 }
4b3cda86 7463
7cdde544
KW
7464 /* If this character is potentially in the bitmap, check it */
7465 if (c < 256) {
ffc61ed2
JH
7466 if (ANYOF_BITMAP_TEST(n, c))
7467 match = TRUE;
11454c59
KW
7468 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7469 && ! utf8_target
7470 && ! isASCII(c))
7471 {
7472 match = TRUE;
7473 }
78969a98
KW
7474 else if (flags & ANYOF_LOCALE) {
7475 PL_reg_flags |= RF_tainted;
7476
538b546e 7477 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7478 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7479 {
ffc61ed2 7480 match = TRUE;
78969a98 7481 }
31c7f561
KW
7482 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7483
7484 /* The data structure is arranged so bits 0, 2, 4, ... are set
7485 * if the class includes the Posix character class given by
7486 * bit/2; and 1, 3, 5, ... are set if the class includes the
7487 * complemented Posix class given by int(bit/2). So we loop
7488 * through the bits, each time changing whether we complement
7489 * the result or not. Suppose for the sake of illustration
7490 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7491 * is set, it means there is a match for this ANYOF node if the
7492 * character is in the class given by the expression (0 / 2 = 0
7493 * = \w). If it is in that class, isFOO_lc() will return 1,
7494 * and since 'to_complement' is 0, the result will stay TRUE,
7495 * and we exit the loop. Suppose instead that bit 0 is 0, but
7496 * bit 1 is 1. That means there is a match if the character
7497 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7498 * but will on bit 1. On the second iteration 'to_complement'
7499 * will be 1, so the exclusive or will reverse things, so we
7500 * are testing for \W. On the third iteration, 'to_complement'
7501 * will be 0, and we would be testing for \s; the fourth
7502 * iteration would test for \S, etc. */
7503
7504 int count = 0;
7505 int to_complement = 0;
7506 while (count < ANYOF_MAX) {
7507 if (ANYOF_CLASS_TEST(n, count)
7508 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7509 {
7510 match = TRUE;
7511 break;
7512 }
7513 count++;
7514 to_complement ^= 1;
7515 }
ffc61ed2 7516 }
a0ed51b3 7517 }
a0ed51b3
LW
7518 }
7519
7cdde544 7520 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7521 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7522 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7523 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7524 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7525 * could be a combination of locale and non-locale nodes. So allow locale
7526 * to match for the synthetic start class, which will give a false
7527 * positive that will be resolved when the match is done again as not part
7528 * of the synthetic start class */
ef87b810 7529 if (!match) {
10ee90d2
KW
7530 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7531 match = TRUE; /* Everything above 255 matches */
e051a21d 7532 }
6f8d7d0d
KW
7533 else if (ANYOF_NONBITMAP(n)
7534 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7535 || (utf8_target
7536 && (c >=256
7537 || (! (flags & ANYOF_LOCALE))
7538 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 7539 {
e0193e47 7540 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7541 if (sw) {
7542 U8 * utf8_p;
7543 if (utf8_target) {
7544 utf8_p = (U8 *) p;
e0193e47
KW
7545 } else { /* Convert to utf8 */
7546 STRLEN len = 1;
7cdde544
KW
7547 utf8_p = bytes_to_utf8(p, &len);
7548 }
f56b6394 7549
e0193e47 7550 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7551 match = TRUE;
e0193e47 7552 }
7cdde544
KW
7553
7554 /* If we allocated a string above, free it */
7555 if (! utf8_target) Safefree(utf8_p);
7556 }
7557 }
5073ffbd
KW
7558
7559 if (UNICODE_IS_SUPER(c)
7560 && (flags & ANYOF_WARN_SUPER)
7561 && ckWARN_d(WARN_NON_UNICODE))
7562 {
7563 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7564 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7565 }
7cdde544
KW
7566 }
7567
f0fdc1c9
KW
7568 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7569 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7570}
161b471a 7571
dfe13c55 7572STATIC U8 *
0ce71af7 7573S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7574{
6af86488
KW
7575 /* return the position 'off' UTF-8 characters away from 's', forward if
7576 * 'off' >= 0, backwards if negative. But don't go outside of position
7577 * 'lim', which better be < s if off < 0 */
7578
97aff369 7579 dVAR;
7918f24d
NC
7580
7581 PERL_ARGS_ASSERT_REGHOP3;
7582
a0ed51b3 7583 if (off >= 0) {
1aa99e6b 7584 while (off-- && s < lim) {
ffc61ed2 7585 /* XXX could check well-formedness here */
a0ed51b3 7586 s += UTF8SKIP(s);
ffc61ed2 7587 }
a0ed51b3
LW
7588 }
7589 else {
1de06328
YO
7590 while (off++ && s > lim) {
7591 s--;
7592 if (UTF8_IS_CONTINUED(*s)) {
7593 while (s > lim && UTF8_IS_CONTINUATION(*s))
7594 s--;
a0ed51b3 7595 }
1de06328 7596 /* XXX could check well-formedness here */
a0ed51b3
LW
7597 }
7598 }
7599 return s;
7600}
161b471a 7601
f9f4320a
YO
7602#ifdef XXX_dmq
7603/* there are a bunch of places where we use two reghop3's that should
7604 be replaced with this routine. but since thats not done yet
7605 we ifdef it out - dmq
7606*/
dfe13c55 7607STATIC U8 *
1de06328
YO
7608S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7609{
7610 dVAR;
7918f24d
NC
7611
7612 PERL_ARGS_ASSERT_REGHOP4;
7613
1de06328
YO
7614 if (off >= 0) {
7615 while (off-- && s < rlim) {
7616 /* XXX could check well-formedness here */
7617 s += UTF8SKIP(s);
7618 }
7619 }
7620 else {
7621 while (off++ && s > llim) {
7622 s--;
7623 if (UTF8_IS_CONTINUED(*s)) {
7624 while (s > llim && UTF8_IS_CONTINUATION(*s))
7625 s--;
7626 }
7627 /* XXX could check well-formedness here */
7628 }
7629 }
7630 return s;
7631}
f9f4320a 7632#endif
1de06328
YO
7633
7634STATIC U8 *
0ce71af7 7635S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7636{
97aff369 7637 dVAR;
7918f24d
NC
7638
7639 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7640
a0ed51b3 7641 if (off >= 0) {
1aa99e6b 7642 while (off-- && s < lim) {
ffc61ed2 7643 /* XXX could check well-formedness here */
a0ed51b3 7644 s += UTF8SKIP(s);
ffc61ed2 7645 }
a0ed51b3 7646 if (off >= 0)
3dab1dad 7647 return NULL;
a0ed51b3
LW
7648 }
7649 else {
1de06328
YO
7650 while (off++ && s > lim) {
7651 s--;
7652 if (UTF8_IS_CONTINUED(*s)) {
7653 while (s > lim && UTF8_IS_CONTINUATION(*s))
7654 s--;
a0ed51b3 7655 }
1de06328 7656 /* XXX could check well-formedness here */
a0ed51b3
LW
7657 }
7658 if (off <= 0)
3dab1dad 7659 return NULL;
a0ed51b3
LW
7660 }
7661 return s;
7662}
51371543 7663
51371543 7664static void
acfe0abc 7665restore_pos(pTHX_ void *arg)
51371543 7666{
97aff369 7667 dVAR;
097eb12c 7668 regexp * const rex = (regexp *)arg;
ed301438 7669 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7670 if (PL_reg_oldsaved) {
4f639d21
DM
7671 rex->subbeg = PL_reg_oldsaved;
7672 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7673 rex->suboffset = PL_reg_oldsavedoffset;
7674 rex->subcoffset = PL_reg_oldsavedcoffset;
db2c6cb3 7675#ifdef PERL_ANY_COW
4f639d21 7676 rex->saved_copy = PL_nrs;
ed252734 7677#endif
07bc277f 7678 RXp_MATCH_COPIED_on(rex);
51371543
GS
7679 }
7680 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7681 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7682 PL_curpm = PL_reg_oldcurpm;
7683 }
7684}
33b8afdf
JH
7685
7686STATIC void
5aaab254 7687S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7688{
7e0d5ad7
KW
7689 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7690 * on the converted value */
7691
a1cac82e 7692 int i = 1;
7918f24d
NC
7693
7694 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7695
a1cac82e
NC
7696 do {
7697 if (prog->substrs->data[i].substr
7698 && !prog->substrs->data[i].utf8_substr) {
7699 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7700 prog->substrs->data[i].utf8_substr = sv;
7701 sv_utf8_upgrade(sv);
610460f9 7702 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7703 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7704 /* Trim the trailing \n that fbm_compile added last
7705 time. */
7706 SvCUR_set(sv, SvCUR(sv) - 1);
7707 /* Whilst this makes the SV technically "invalid" (as its
7708 buffer is no longer followed by "\0") when fbm_compile()
7709 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7710 fbm_compile(sv, FBMcf_TAIL);
7711 } else
7712 fbm_compile(sv, 0);
610460f9 7713 }
a1cac82e
NC
7714 if (prog->substrs->data[i].substr == prog->check_substr)
7715 prog->check_utf8 = sv;
7716 }
7717 } while (i--);
33b8afdf
JH
7718}
7719
7e0d5ad7 7720STATIC bool
5aaab254 7721S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7722{
7e0d5ad7
KW
7723 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7724 * on the converted value; returns FALSE if can't be converted. */
7725
97aff369 7726 dVAR;
a1cac82e 7727 int i = 1;
7918f24d
NC
7728
7729 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7730
a1cac82e
NC
7731 do {
7732 if (prog->substrs->data[i].utf8_substr
7733 && !prog->substrs->data[i].substr) {
7734 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7735 if (! sv_utf8_downgrade(sv, TRUE)) {
7736 return FALSE;
7737 }
5400f398
KW
7738 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7739 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7740 /* Trim the trailing \n that fbm_compile added last
7741 time. */
7742 SvCUR_set(sv, SvCUR(sv) - 1);
7743 fbm_compile(sv, FBMcf_TAIL);
7744 } else
7745 fbm_compile(sv, 0);
7746 }
a1cac82e
NC
7747 prog->substrs->data[i].substr = sv;
7748 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7749 prog->check_substr = sv;
33b8afdf 7750 }
a1cac82e 7751 } while (i--);
7e0d5ad7
KW
7752
7753 return TRUE;
33b8afdf 7754}
66610fdd
RGS
7755
7756/*
7757 * Local variables:
7758 * c-indentation-style: bsd
7759 * c-basic-offset: 4
14d04a33 7760 * indent-tabs-mode: nil
66610fdd
RGS
7761 * End:
7762 *
14d04a33 7763 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7764 */