This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
ext/B/t/OptreeCheck.pm: fix hint stripping
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
c277df42 97#define RF_warned 2 /* warned about big count? */
faec1544 98
ab3bbdeb 99#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 100
f2ed9b32 101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
ce862d02 102
c74f6de9
KW
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
a687059c
LW
105#ifndef STATIC
106#define STATIC static
107#endif
108
e0193e47 109/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
635cd5d4 112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 113 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 114
c277df42
IZ
115/*
116 * Forwards.
117 */
118
f2ed9b32 119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 121
3dab1dad
YO
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
52657f30 124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
07be1b83
YO
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
8e11feef 130 ? (U8*)pos - off \
3dab1dad 131 : NULL)
efb30f32 132
e7409c1b 133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 135
7016d6eb
DM
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
20d0b1e9 148/* these are unrolled below in the CCC_TRY_XXX defined */
61dad979 149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
9c4fdda1 150 if (!CAT2(PL_utf8_,class)) { \
cf54f63a 151 bool ok; \
9c4fdda1 152 ENTER; save_re_context(); \
cf54f63a
JL
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
37e2e78e
KW
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
9c4fdda1 158 if (!CAT2(PL_utf8_,class)) { \
9d63fa07 159 bool throw_away PERL_UNUSED_DECL; \
9c4fdda1
RB
160 ENTER; save_re_context(); \
161 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
162 LEAVE; } } STMT_END
37e2e78e 163
1a4fad37
AL
164#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
165#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
166#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
51371543 167
37e2e78e 168#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
61dad979
KW
169 /* No asserts are done for some of these, in case called on a */ \
170 /* Unicode version in which they map to nothing */ \
27d4fc33 171 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
61dad979 172 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
20d0b1e9 173
1dcf4a1b 174#define PLACEHOLDER /* Something for the preprocessor to grab onto */
d1eb3177 175
ee9a90b8
KW
176/* The actual code for CCC_TRY, which uses several variables from the routine
177 * it's callable from. It is designed to be the bulk of a case statement.
178 * FUNC is the macro or function to call on non-utf8 targets that indicate if
179 * nextchr matches the class.
180 * UTF8_TEST is the whole test string to use for utf8 targets
181 * LOAD is what to use to test, and if not present to load in the swash for the
182 * class
183 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
184 * UTF8_TEST test.
185 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
186 * utf8 and a variant, load the swash if necessary and test using the utf8
187 * test. Advance to the next character if test is ok, otherwise fail; If not
188 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
189 * fails, or advance to the next character */
190
191#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
7016d6eb 192 if (NEXTCHR_IS_EOS) { \
ee9a90b8
KW
193 sayNO; \
194 } \
195 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
196 LOAD_UTF8_CHARCLASS(CLASS, STR); \
197 if (POS_OR_NEG (UTF8_TEST)) { \
198 sayNO; \
199 } \
ee9a90b8 200 } \
28b98f76
DM
201 else if (POS_OR_NEG (FUNC(nextchr))) { \
202 sayNO; \
ee9a90b8 203 } \
28b98f76 204 goto increment_locinput;
980866de 205
ee9a90b8
KW
206/* Handle the non-locale cases for a character class and its complement. It
207 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
208 * This is because that code fails when the test succeeds, so we want to have
209 * the test fail so that the code succeeds. The swash is stored in a
210 * predictable PL_ place */
cfaf538b
KW
211#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
212 CLASS, STR) \
ee9a90b8
KW
213 case NAME: \
214 _CCC_TRY_CODE( !, FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218 case NNAME: \
1dcf4a1b 219 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
ee9a90b8
KW
220 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
221 (U8*)locinput, TRUE)), \
222 CLASS, STR) \
223
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
cfaf538b 236 NAMEA, NNAMEA, FUNCA, \
ee9a90b8
KW
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
1dcf4a1b
KW
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
cfaf538b 245 case NAMEA: \
7016d6eb 246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
cfaf538b
KW
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
3640db6b 250 locinput++; \
cfaf538b
KW
251 break; \
252 case NNAMEA: \
7016d6eb 253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
cfaf538b
KW
254 sayNO; \
255 } \
28b98f76 256 goto increment_locinput; \
ee9a90b8
KW
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
cfaf538b 265 NAMEA, NNAMEA, FUNCA, \
ee9a90b8 266 CLASS, STR) \
cfaf538b
KW
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
ee9a90b8 271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
d1eb3177 272
3dab1dad
YO
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
5f80c4cf 275/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
3e901dc0
YO
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
cca55fe3
JP
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 291 OP(rn) == KEEPS || \
3dab1dad 292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 293)
ee9b8eae 294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 295
ee9b8eae
YO
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
e2d8ce26 313
a84d97b6
HS
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
cca55fe3 318#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 322 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 323 else if (type == PLUS) \
cca55fe3 324 rn = NEXTOPER(rn); \
3dab1dad 325 else if (type == IFMATCH) \
a84d97b6 326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 327 else rn += NEXT_OFF(rn); \
3dab1dad 328 } \
5f80c4cf 329} STMT_END
74750237 330
c476f425 331
acfe0abc 332static void restore_pos(pTHX_ void *arg);
51371543 333
87c0511b 334#define REGCP_PAREN_ELEMS 3
f067efbf 335#define REGCP_OTHER_ELEMS 3
e0fa7e2b 336#define REGCP_FRAME_ELEMS 1
620d5b66
NC
337/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
338 * are needed for the regexp context stack bookkeeping. */
339
76e3520e 340STATIC CHECKPOINT
b93070ed 341S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
a0d0e21e 342{
97aff369 343 dVAR;
a3b680e6 344 const int retval = PL_savestack_ix;
a3b680e6 345 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
346 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
347 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 348 I32 p;
40a82448 349 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 350
b93070ed
DM
351 PERL_ARGS_ASSERT_REGCPPUSH;
352
e49a9654 353 if (paren_elems_to_push < 0)
5637ef5b
NC
354 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
355 paren_elems_to_push);
e49a9654 356
e0fa7e2b
NC
357 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
358 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0
JH
359 " out of range (%lu-%ld)",
360 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
e0fa7e2b 361
620d5b66 362 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 363
495f47a5
DM
364 DEBUG_BUFFERS_r(
365 if ((int)PL_regsize > (int)parenfloor)
366 PerlIO_printf(Perl_debug_log,
367 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
368 PTR2UV(rex),
369 PTR2UV(rex->offs)
370 );
371 );
87c0511b 372 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
b1ce53c5 373/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
374 SSPUSHINT(rex->offs[p].end);
375 SSPUSHINT(rex->offs[p].start);
1ca2007e 376 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 377 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
378 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
379 (UV)p,
380 (IV)rex->offs[p].start,
381 (IV)rex->offs[p].start_tmp,
382 (IV)rex->offs[p].end
40a82448 383 ));
a0d0e21e 384 }
b1ce53c5 385/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22 386 SSPUSHINT(PL_regsize);
b93070ed
DM
387 SSPUSHINT(rex->lastparen);
388 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 389 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 390
a0d0e21e
LW
391 return retval;
392}
393
c277df42 394/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
395#define REGCP_SET(cp) \
396 DEBUG_STATE_r( \
ab3bbdeb 397 PerlIO_printf(Perl_debug_log, \
e4f74956 398 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
399 (IV)PL_savestack_ix)); \
400 cp = PL_savestack_ix
c3464db5 401
ab3bbdeb 402#define REGCP_UNWIND(cp) \
e4f74956 403 DEBUG_STATE_r( \
ab3bbdeb 404 if (cp != PL_savestack_ix) \
e4f74956
YO
405 PerlIO_printf(Perl_debug_log, \
406 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
407 (IV)(cp), (IV)PL_savestack_ix)); \
408 regcpblow(cp)
c277df42 409
a8d1f4b4
DM
410#define UNWIND_PAREN(lp, lcp) \
411 for (n = rex->lastparen; n > lp; n--) \
412 rex->offs[n].end = -1; \
413 rex->lastparen = n; \
414 rex->lastcloseparen = lcp;
415
416
f067efbf 417STATIC void
b93070ed 418S_regcppop(pTHX_ regexp *rex)
a0d0e21e 419{
97aff369 420 dVAR;
e0fa7e2b 421 UV i;
87c0511b 422 U32 paren;
a3621e74
YO
423 GET_RE_DEBUG_FLAGS_DECL;
424
7918f24d
NC
425 PERL_ARGS_ASSERT_REGCPPOP;
426
b1ce53c5 427 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 428 i = SSPOPUV;
e0fa7e2b
NC
429 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
430 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
431 rex->lastcloseparen = SSPOPINT;
432 rex->lastparen = SSPOPINT;
3280af22 433 PL_regsize = SSPOPINT;
b1ce53c5 434
620d5b66 435 i -= REGCP_OTHER_ELEMS;
b1ce53c5 436 /* Now restore the parentheses context. */
495f47a5
DM
437 DEBUG_BUFFERS_r(
438 if (i || rex->lastparen + 1 <= rex->nparens)
439 PerlIO_printf(Perl_debug_log,
440 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
441 PTR2UV(rex),
442 PTR2UV(rex->offs)
443 );
444 );
87c0511b 445 paren = PL_regsize;
620d5b66 446 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 447 I32 tmps;
1ca2007e 448 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 449 rex->offs[paren].start = SSPOPINT;
cf93c79d 450 tmps = SSPOPINT;
b93070ed
DM
451 if (paren <= rex->lastparen)
452 rex->offs[paren].end = tmps;
495f47a5
DM
453 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
454 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
455 (UV)paren,
456 (IV)rex->offs[paren].start,
457 (IV)rex->offs[paren].start_tmp,
458 (IV)rex->offs[paren].end,
459 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 460 );
87c0511b 461 paren--;
a0d0e21e 462 }
daf18116 463#if 1
dafc8851
JH
464 /* It would seem that the similar code in regtry()
465 * already takes care of this, and in fact it is in
466 * a better location to since this code can #if 0-ed out
467 * but the code in regtry() is needed or otherwise tests
468 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
469 * (as of patchlevel 7877) will fail. Then again,
470 * this code seems to be necessary or otherwise
225593e1
DM
471 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
472 * --jhi updated by dapm */
b93070ed 473 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
097eb12c 474 if (i > PL_regsize)
b93070ed
DM
475 rex->offs[i].start = -1;
476 rex->offs[i].end = -1;
495f47a5
DM
477 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
478 " \\%"UVuf": %s ..-1 undeffing\n",
479 (UV)i,
480 (i > PL_regsize) ? "-1" : " "
481 ));
a0d0e21e 482 }
dafc8851 483#endif
a0d0e21e
LW
484}
485
74088413
DM
486/* restore the parens and associated vars at savestack position ix,
487 * but without popping the stack */
488
489STATIC void
490S_regcp_restore(pTHX_ regexp *rex, I32 ix)
491{
492 I32 tmpix = PL_savestack_ix;
493 PL_savestack_ix = ix;
494 regcppop(rex);
495 PL_savestack_ix = tmpix;
496}
497
02db2b7b 498#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 499
a687059c 500/*
e50aee73 501 * pregexec and friends
a687059c
LW
502 */
503
76234dfb 504#ifndef PERL_IN_XSUB_RE
a687059c 505/*
c277df42 506 - pregexec - match a regexp against a string
a687059c 507 */
c277df42 508I32
49d7dfbc 509Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
c3464db5 510 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
511/* stringarg: the point in the string at which to begin matching */
512/* strend: pointer to null at end of string */
513/* strbeg: real beginning of string */
514/* minend: end of match must be >= minend bytes after stringarg. */
515/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
516 * itself is accessed via the pointers above */
517/* nosave: For optimizations. */
c277df42 518{
7918f24d
NC
519 PERL_ARGS_ASSERT_PREGEXEC;
520
c277df42 521 return
9041c2e3 522 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
523 nosave ? 0 : REXEC_COPY_STR);
524}
76234dfb 525#endif
22e551b9 526
9041c2e3 527/*
cad2e5aa
JH
528 * Need to implement the following flags for reg_anch:
529 *
530 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
531 * USE_INTUIT_ML
532 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
533 * INTUIT_AUTORITATIVE_ML
534 * INTUIT_ONCE_NOML - Intuit can match in one location only.
535 * INTUIT_ONCE_ML
536 *
537 * Another flag for this function: SECOND_TIME (so that float substrs
538 * with giant delta may be not rechecked).
539 */
540
541/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
542
3f7c398e 543/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
544 Otherwise, only SvCUR(sv) is used to get strbeg. */
545
546/* XXXX We assume that strpos is strbeg unless sv. */
547
6eb5f6b9
JH
548/* XXXX Some places assume that there is a fixed substring.
549 An update may be needed if optimizer marks as "INTUITable"
550 RExen without fixed substrings. Similarly, it is assumed that
551 lengths of all the strings are no more than minlen, thus they
552 cannot come from lookahead.
40d049e4
YO
553 (Or minlen should take into account lookahead.)
554 NOTE: Some of this comment is not correct. minlen does now take account
555 of lookahead/behind. Further research is required. -- demerphq
556
557*/
6eb5f6b9 558
2c2d71f5
JH
559/* A failure to find a constant substring means that there is no need to make
560 an expensive call to REx engine, thus we celebrate a failure. Similarly,
561 finding a substring too deep into the string means that less calls to
30944b6d
IZ
562 regtry() should be needed.
563
564 REx compiler's optimizer found 4 possible hints:
565 a) Anchored substring;
566 b) Fixed substring;
567 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 568 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 569 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
570 string which does not contradict any of them.
571 */
2c2d71f5 572
6eb5f6b9
JH
573/* Most of decisions we do here should have been done at compile time.
574 The nodes of the REx which we used for the search should have been
575 deleted from the finite automaton. */
576
cad2e5aa 577char *
288b8c02 578Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 579 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 580{
97aff369 581 dVAR;
8d919b0a 582 struct regexp *const prog = ReANY(rx);
eb578fdb 583 I32 start_shift = 0;
cad2e5aa 584 /* Should be nonnegative! */
eb578fdb
KW
585 I32 end_shift = 0;
586 char *s;
587 SV *check;
a1933d95 588 char *strbeg;
cad2e5aa 589 char *t;
f2ed9b32 590 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 591 I32 ml_anch;
eb578fdb 592 char *other_last = NULL; /* other substr checked before this */
bd61b366 593 char *check_at = NULL; /* check substr found at this pos */
d8080198 594 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 595 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 596 RXi_GET_DECL(prog,progi);
30944b6d 597#ifdef DEBUGGING
890ce7af 598 const char * const i_strpos = strpos;
30944b6d 599#endif
a3621e74
YO
600 GET_RE_DEBUG_FLAGS_DECL;
601
7918f24d 602 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
603 PERL_UNUSED_ARG(flags);
604 PERL_UNUSED_ARG(data);
7918f24d 605
f2ed9b32 606 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 607
3c8556c3 608 if (RX_UTF8(rx)) {
b8d68ded
JH
609 PL_reg_flags |= RF_utf8;
610 }
ab3bbdeb 611 DEBUG_EXECUTE_r(
f2ed9b32 612 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
613 sv ? "Guessing start of match in sv for"
614 : "Guessing start of match in string for");
2a782b5b 615 );
cad2e5aa 616
c344f387
JH
617 /* CHR_DIST() would be more correct here but it makes things slow. */
618 if (prog->minlen > strend - strpos) {
a3621e74 619 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 620 "String too short... [re_intuit_start]\n"));
cad2e5aa 621 goto fail;
2c2d71f5 622 }
1de06328 623
7016d6eb
DM
624 /* XXX we need to pass strbeg as a separate arg: the following is
625 * guesswork and can be wrong... */
626 if (sv && SvPOK(sv)) {
627 char * p = SvPVX(sv);
628 STRLEN cur = SvCUR(sv);
629 if (p <= strpos && strpos < p + cur) {
630 strbeg = p;
631 assert(p <= strend && strend <= p + cur);
632 }
633 else
634 strbeg = strend - cur;
635 }
636 else
637 strbeg = strpos;
638
1aa99e6b 639 PL_regeol = strend;
f2ed9b32 640 if (utf8_target) {
33b8afdf
JH
641 if (!prog->check_utf8 && prog->check_substr)
642 to_utf8_substr(prog);
643 check = prog->check_utf8;
644 } else {
7e0d5ad7
KW
645 if (!prog->check_substr && prog->check_utf8) {
646 if (! to_byte_substr(prog)) {
6b54ddc5 647 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
648 }
649 }
33b8afdf
JH
650 check = prog->check_substr;
651 }
bbe252da
YO
652 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
653 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
654 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 655 && !multiline ) ); /* Check after \n? */
cad2e5aa 656
7e25d62c 657 if (!ml_anch) {
bbe252da
YO
658 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
659 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 660 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
661 && sv && !SvROK(sv)
662 && (strpos != strbeg)) {
a3621e74 663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
664 goto fail;
665 }
d46b78c6
KW
666 if (prog->check_offset_min == prog->check_offset_max
667 && !(prog->extflags & RXf_CANY_SEEN)
668 && ! multiline) /* /m can cause \n's to match that aren't
669 accounted for in the string max length.
670 See [perl #115242] */
671 {
2c2d71f5 672 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
673 I32 slen;
674
1aa99e6b 675 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 676
653099ff
GS
677 if (SvTAIL(check)) {
678 slen = SvCUR(check); /* >= 1 */
cad2e5aa 679
9041c2e3 680 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 681 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 683 goto fail_finish;
cad2e5aa
JH
684 }
685 /* Now should match s[0..slen-2] */
686 slen--;
3f7c398e 687 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 688 || (slen > 1
3f7c398e 689 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 690 report_neq:
a3621e74 691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
692 goto fail_finish;
693 }
cad2e5aa 694 }
3f7c398e 695 else if (*SvPVX_const(check) != *s
653099ff 696 || ((slen = SvCUR(check)) > 1
3f7c398e 697 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 698 goto report_neq;
c315bfe8 699 check_at = s;
2c2d71f5 700 goto success_at_start;
7e25d62c 701 }
cad2e5aa 702 }
2c2d71f5 703 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 704 s = strpos;
2c2d71f5 705 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
706 end_shift = prog->check_end_shift;
707
2c2d71f5 708 if (!ml_anch) {
a3b680e6 709 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 710 - (SvTAIL(check) != 0);
a3b680e6 711 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
712
713 if (end_shift < eshift)
714 end_shift = eshift;
715 }
cad2e5aa 716 }
2c2d71f5 717 else { /* Can match at random position */
cad2e5aa
JH
718 ml_anch = 0;
719 s = strpos;
1de06328
YO
720 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
721 end_shift = prog->check_end_shift;
722
723 /* end shift should be non negative here */
cad2e5aa
JH
724 }
725
bcdf7404 726#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 727 if (end_shift < 0)
1de06328 728 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 729 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
730#endif
731
2c2d71f5
JH
732 restart:
733 /* Find a possible match in the region s..strend by looking for
734 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
735
736 {
737 I32 srch_start_shift = start_shift;
738 I32 srch_end_shift = end_shift;
c33e64f0
FC
739 U8* start_point;
740 U8* end_point;
1de06328
YO
741 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
742 srch_end_shift -= ((strbeg - s) - srch_start_shift);
743 srch_start_shift = strbeg - s;
744 }
6bda09f9 745 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
746 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
747 (IV)prog->check_offset_min,
748 (IV)srch_start_shift,
749 (IV)srch_end_shift,
750 (IV)prog->check_end_shift);
751 });
752
bbe252da 753 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
754 start_point= (U8*)(s + srch_start_shift);
755 end_point= (U8*)(strend - srch_end_shift);
756 } else {
757 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
758 end_point= HOP3(strend, -srch_end_shift, strbeg);
759 }
6bda09f9 760 DEBUG_OPTIMISE_MORE_r({
56570a2c 761 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 762 (int)(end_point - start_point),
fc8cd66c 763 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
764 start_point);
765 });
766
767 s = fbm_instr( start_point, end_point,
7fba1cd6 768 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 769 }
cad2e5aa
JH
770 /* Update the count-of-usability, remove useless subpatterns,
771 unshift s. */
2c2d71f5 772
ab3bbdeb 773 DEBUG_EXECUTE_r({
f2ed9b32 774 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
775 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
776 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 777 (s ? "Found" : "Did not find"),
f2ed9b32 778 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
779 ? "anchored" : "floating"),
780 quoted,
781 RE_SV_TAIL(check),
782 (s ? " at offset " : "...\n") );
783 });
2c2d71f5
JH
784
785 if (!s)
786 goto fail_finish;
2c2d71f5 787 /* Finish the diagnostic message */
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 789
1de06328
YO
790 /* XXX dmq: first branch is for positive lookbehind...
791 Our check string is offset from the beginning of the pattern.
792 So we need to do any stclass tests offset forward from that
793 point. I think. :-(
794 */
795
796
797
798 check_at=s;
799
800
2c2d71f5
JH
801 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
802 Start with the other substr.
803 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 804 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
805 *always* match. Probably should be marked during compile...
806 Probably it is right to do no SCREAM here...
807 */
808
f2ed9b32 809 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
810 : (prog->float_substr && prog->anchored_substr))
811 {
30944b6d 812 /* Take into account the "other" substring. */
2c2d71f5
JH
813 /* XXXX May be hopelessly wrong for UTF... */
814 if (!other_last)
6eb5f6b9 815 other_last = strpos;
f2ed9b32 816 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
817 do_other_anchored:
818 {
890ce7af
AL
819 char * const last = HOP3c(s, -start_shift, strbeg);
820 char *last1, *last2;
be8e71aa 821 char * const saved_s = s;
33b8afdf 822 SV* must;
2c2d71f5 823
2c2d71f5
JH
824 t = s - prog->check_offset_max;
825 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 826 && (!utf8_target
0ce71af7 827 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 828 && t > strpos)))
6f207bd3 829 NOOP;
2c2d71f5
JH
830 else
831 t = strpos;
1aa99e6b 832 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
833 if (t < other_last) /* These positions already checked */
834 t = other_last;
1aa99e6b 835 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
836 if (last < last1)
837 last1 = last;
1de06328
YO
838 /* XXXX It is not documented what units *_offsets are in.
839 We assume bytes, but this is clearly wrong.
840 Meaning this code needs to be carefully reviewed for errors.
841 dmq.
842 */
843
2c2d71f5 844 /* On end-of-str: see comment below. */
f2ed9b32 845 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
846 if (must == &PL_sv_undef) {
847 s = (char*)NULL;
1de06328 848 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
849 }
850 else
851 s = fbm_instr(
852 (unsigned char*)t,
853 HOP3(HOP3(last1, prog->anchored_offset, strend)
854 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
855 must,
7fba1cd6 856 multiline ? FBMrf_MULTILINE : 0
33b8afdf 857 );
ab3bbdeb 858 DEBUG_EXECUTE_r({
f2ed9b32 859 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
860 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
861 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 862 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
863 quoted, RE_SV_TAIL(must));
864 });
865
866
2c2d71f5
JH
867 if (!s) {
868 if (last1 >= last2) {
a3621e74 869 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
870 ", giving up...\n"));
871 goto fail_finish;
872 }
a3621e74 873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 874 ", trying floating at offset %ld...\n",
be8e71aa 875 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
876 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
877 s = HOP3c(last, 1, strend);
2c2d71f5
JH
878 goto restart;
879 }
880 else {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 882 (long)(s - i_strpos)));
1aa99e6b
IH
883 t = HOP3c(s, -prog->anchored_offset, strbeg);
884 other_last = HOP3c(s, 1, strend);
be8e71aa 885 s = saved_s;
2c2d71f5
JH
886 if (t == strpos)
887 goto try_at_start;
2c2d71f5
JH
888 goto try_at_offset;
889 }
30944b6d 890 }
2c2d71f5
JH
891 }
892 else { /* Take into account the floating substring. */
33b8afdf 893 char *last, *last1;
be8e71aa 894 char * const saved_s = s;
33b8afdf
JH
895 SV* must;
896
897 t = HOP3c(s, -start_shift, strbeg);
898 last1 = last =
899 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
900 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
901 last = HOP3c(t, prog->float_max_offset, strend);
902 s = HOP3c(t, prog->float_min_offset, strend);
903 if (s < other_last)
904 s = other_last;
2c2d71f5 905 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 906 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
907 /* fbm_instr() takes into account exact value of end-of-str
908 if the check is SvTAIL(ed). Since false positives are OK,
909 and end-of-str is not later than strend we are OK. */
910 if (must == &PL_sv_undef) {
911 s = (char*)NULL;
1de06328 912 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
913 }
914 else
2c2d71f5 915 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
916 (unsigned char*)last + SvCUR(must)
917 - (SvTAIL(must)!=0),
7fba1cd6 918 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 919 DEBUG_EXECUTE_r({
f2ed9b32 920 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
921 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
922 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 923 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
924 quoted, RE_SV_TAIL(must));
925 });
33b8afdf
JH
926 if (!s) {
927 if (last1 == last) {
a3621e74 928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
929 ", giving up...\n"));
930 goto fail_finish;
2c2d71f5 931 }
a3621e74 932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 933 ", trying anchored starting at offset %ld...\n",
be8e71aa 934 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
935 other_last = last;
936 s = HOP3c(t, 1, strend);
937 goto restart;
938 }
939 else {
a3621e74 940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
941 (long)(s - i_strpos)));
942 other_last = s; /* Fix this later. --Hugo */
be8e71aa 943 s = saved_s;
33b8afdf
JH
944 if (t == strpos)
945 goto try_at_start;
946 goto try_at_offset;
947 }
2c2d71f5 948 }
cad2e5aa 949 }
2c2d71f5 950
1de06328 951
9ef43ace 952 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 953
6bda09f9 954 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
955 PerlIO_printf(Perl_debug_log,
956 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
957 (IV)prog->check_offset_min,
958 (IV)prog->check_offset_max,
959 (IV)(s-strpos),
960 (IV)(t-strpos),
961 (IV)(t-s),
962 (IV)(strend-strpos)
963 )
964 );
965
2c2d71f5 966 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 967 && (!utf8_target
9ef43ace 968 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
969 && t > strpos)))
970 {
2c2d71f5
JH
971 /* Fixed substring is found far enough so that the match
972 cannot start at strpos. */
973 try_at_offset:
cad2e5aa 974 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
975 /* Eventually fbm_*() should handle this, but often
976 anchored_offset is not 0, so this check will not be wasted. */
977 /* XXXX In the code below we prefer to look for "^" even in
978 presence of anchored substrings. And we search even
979 beyond the found float position. These pessimizations
980 are historical artefacts only. */
981 find_anchor:
2c2d71f5 982 while (t < strend - prog->minlen) {
cad2e5aa 983 if (*t == '\n') {
4ee3650e 984 if (t < check_at - prog->check_offset_min) {
f2ed9b32 985 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
986 /* Since we moved from the found position,
987 we definitely contradict the found anchored
30944b6d
IZ
988 substr. Due to the above check we do not
989 contradict "check" substr.
990 Thus we can arrive here only if check substr
991 is float. Redo checking for "other"=="fixed".
992 */
9041c2e3 993 strpos = t + 1;
a3621e74 994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 995 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
996 goto do_other_anchored;
997 }
4ee3650e
GS
998 /* We don't contradict the found floating substring. */
999 /* XXXX Why not check for STCLASS? */
cad2e5aa 1000 s = t + 1;
a3621e74 1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1002 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1003 goto set_useful;
1004 }
4ee3650e
GS
1005 /* Position contradicts check-string */
1006 /* XXXX probably better to look for check-string
1007 than for "\n", so one should lower the limit for t? */
a3621e74 1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1009 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1010 other_last = strpos = s = t + 1;
cad2e5aa
JH
1011 goto restart;
1012 }
1013 t++;
1014 }
a3621e74 1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1016 PL_colors[0], PL_colors[1]));
2c2d71f5 1017 goto fail_finish;
cad2e5aa 1018 }
f5952150 1019 else {
a3621e74 1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1021 PL_colors[0], PL_colors[1]));
f5952150 1022 }
cad2e5aa
JH
1023 s = t;
1024 set_useful:
f2ed9b32 1025 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1026 }
1027 else {
f5952150 1028 /* The found string does not prohibit matching at strpos,
2c2d71f5 1029 - no optimization of calling REx engine can be performed,
f5952150
GS
1030 unless it was an MBOL and we are not after MBOL,
1031 or a future STCLASS check will fail this. */
2c2d71f5
JH
1032 try_at_start:
1033 /* Even in this situation we may use MBOL flag if strpos is offset
1034 wrt the start of the string. */
05b4157f 1035 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1036 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1037 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1038 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1039 {
cad2e5aa
JH
1040 t = strpos;
1041 goto find_anchor;
1042 }
a3621e74 1043 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1044 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1045 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1046 );
2c2d71f5 1047 success_at_start:
bbe252da 1048 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1049 && (utf8_target ? (
33b8afdf
JH
1050 prog->check_utf8 /* Could be deleted already */
1051 && --BmUSEFUL(prog->check_utf8) < 0
1052 && (prog->check_utf8 == prog->float_utf8)
1053 ) : (
1054 prog->check_substr /* Could be deleted already */
1055 && --BmUSEFUL(prog->check_substr) < 0
1056 && (prog->check_substr == prog->float_substr)
1057 )))
66e933ab 1058 {
cad2e5aa 1059 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1061 /* XXX Does the destruction order has to change with utf8_target? */
1062 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1064 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1065 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1066 check = NULL; /* abort */
cad2e5aa 1067 s = strpos;
486ec47a 1068 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1069 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070 if (prog->intflags & PREGf_IMPLICIT)
1071 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1072 /* XXXX This is a remnant of the old implementation. It
1073 looks wasteful, since now INTUIT can use many
6eb5f6b9 1074 other heuristics. */
bbe252da 1075 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1076 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1077 }
1078 else
1079 s = strpos;
1080 }
1081
6eb5f6b9
JH
1082 /* Last resort... */
1083 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1084 /* trie stclasses are too expensive to use here, we are better off to
1085 leave it to regmatch itself */
f8fc2ecf 1086 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1087 /* minlen == 0 is possible if regstclass is \b or \B,
1088 and the fixed substr is ''$.
1089 Since minlen is already taken into account, s+1 is before strend;
1090 accidentally, minlen >= 1 guaranties no false positives at s + 1
1091 even for \b or \B. But (minlen? 1 : 0) below assumes that
1092 regstclass does not come from lookahead... */
1093 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1094 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1095 const U8* const str = (U8*)STRING(progi->regstclass);
1096 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1098 : 1);
1de06328
YO
1099 char * endpos;
1100 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102 else if (prog->float_substr || prog->float_utf8)
1103 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1104 else
1105 endpos= strend;
1106
d8080198
YO
1107 if (checked_upto < s)
1108 checked_upto = s;
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1111
6eb5f6b9 1112 t = s;
d8080198
YO
1113 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1114 if (s) {
1115 checked_upto = s;
1116 } else {
6eb5f6b9 1117#ifdef DEBUGGING
cbbf8932 1118 const char *what = NULL;
6eb5f6b9
JH
1119#endif
1120 if (endpos == strend) {
a3621e74 1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1122 "Could not match STCLASS...\n") );
1123 goto fail;
1124 }
a3621e74 1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1126 "This position contradicts STCLASS...\n") );
bbe252da 1127 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1128 goto fail;
d8080198
YO
1129 checked_upto = HOPBACKc(endpos, start_shift);
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1132 /* Contradict one of substrings */
33b8afdf 1133 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1134 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1135 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1136 hop_and_restart:
1aa99e6b 1137 s = HOP3c(t, 1, strend);
66e933ab
GS
1138 if (s + start_shift + end_shift > strend) {
1139 /* XXXX Should be taken into account earlier? */
a3621e74 1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1141 "Could not match STCLASS...\n") );
1142 goto fail;
1143 }
5e39e1e5
HS
1144 if (!check)
1145 goto giveup;
a3621e74 1146 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1147 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1148 what, (long)(s + start_shift - i_strpos)) );
1149 goto restart;
1150 }
66e933ab 1151 /* Have both, check_string is floating */
6eb5f6b9
JH
1152 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153 goto retry_floating_check;
1154 /* Recheck anchored substring, but not floating... */
9041c2e3 1155 s = check_at;
5e39e1e5
HS
1156 if (!check)
1157 goto giveup;
a3621e74 1158 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1159 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1160 (long)(other_last - i_strpos)) );
1161 goto do_other_anchored;
1162 }
60e71179
GS
1163 /* Another way we could have checked stclass at the
1164 current position only: */
1165 if (ml_anch) {
1166 s = t = t + 1;
5e39e1e5
HS
1167 if (!check)
1168 goto giveup;
a3621e74 1169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1170 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1171 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1172 goto try_at_offset;
66e933ab 1173 }
f2ed9b32 1174 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1175 goto fail;
486ec47a 1176 /* Check is floating substring. */
6eb5f6b9
JH
1177 retry_floating_check:
1178 t = check_at - start_shift;
a3621e74 1179 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1180 goto hop_and_restart;
1181 }
b7953727 1182 if (t != s) {
a3621e74 1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1184 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1185 (long)(t - i_strpos), (long)(s - i_strpos))
1186 );
1187 }
1188 else {
a3621e74 1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1190 "Does not contradict STCLASS...\n");
1191 );
1192 }
6eb5f6b9 1193 }
5e39e1e5 1194 giveup:
a3621e74 1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1196 PL_colors[4], (check ? "Guessed" : "Giving up"),
1197 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1198 return s;
2c2d71f5
JH
1199
1200 fail_finish: /* Substring not found */
33b8afdf 1201 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1202 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1203 fail:
a3621e74 1204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1205 PL_colors[4], PL_colors[5]));
bd61b366 1206 return NULL;
cad2e5aa 1207}
9661b544 1208
a0a388a1
YO
1209#define DECL_TRIE_TYPE(scan) \
1210 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1211 trie_type = ((scan->flags == EXACT) \
1212 ? (utf8_target ? trie_utf8 : trie_plain) \
1213 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1214
1215#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1216uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1217 STRLEN skiplen; \
1218 switch (trie_type) { \
1219 case trie_utf8_fold: \
1220 if ( foldlen>0 ) { \
1221 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222 foldlen -= len; \
1223 uscan += len; \
1224 len=0; \
1225 } else { \
1226 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1227 len = UTF8SKIP(uc); \
1228 skiplen = UNISKIP( uvc ); \
1229 foldlen -= skiplen; \
1230 uscan = foldbuf + skiplen; \
1231 } \
1232 break; \
1233 case trie_latin_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 len = 1; \
1241 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_utf8: \
1248 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1249 break; \
1250 case trie_plain: \
1251 uvc = (UV)*uc; \
1252 len = 1; \
1253 } \
1254 if (uvc < 256) { \
1255 charid = trie->charmap[ uvc ]; \
1256 } \
1257 else { \
1258 charid = 0; \
1259 if (widecharmap) { \
1260 SV** const svpp = hv_fetch(widecharmap, \
1261 (char*)&uvc, sizeof(UV), 0); \
1262 if (svpp) \
1263 charid = (U16)SvIV(*svpp); \
1264 } \
1265 } \
4cadc6a9
YO
1266} STMT_END
1267
4cadc6a9
YO
1268#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1269STMT_START { \
1270 while (s <= e) { \
1271 if ( (CoNd) \
fac1af77 1272 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1273 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1274 goto got_it; \
1275 s++; \
1276 } \
1277} STMT_END
1278
1279#define REXEC_FBC_UTF8_SCAN(CoDe) \
1280STMT_START { \
7016d6eb 1281 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
4cadc6a9
YO
1282 CoDe \
1283 s += uskip; \
1284 } \
1285} STMT_END
1286
1287#define REXEC_FBC_SCAN(CoDe) \
1288STMT_START { \
1289 while (s < strend) { \
1290 CoDe \
1291 s++; \
1292 } \
1293} STMT_END
1294
1295#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1296REXEC_FBC_UTF8_SCAN( \
1297 if (CoNd) { \
24b23f37 1298 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1299 goto got_it; \
1300 else \
1301 tmp = doevery; \
1302 } \
1303 else \
1304 tmp = 1; \
1305)
1306
1307#define REXEC_FBC_CLASS_SCAN(CoNd) \
1308REXEC_FBC_SCAN( \
1309 if (CoNd) { \
24b23f37 1310 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1311 goto got_it; \
1312 else \
1313 tmp = doevery; \
1314 } \
1315 else \
1316 tmp = 1; \
1317)
1318
1319#define REXEC_FBC_TRYIT \
24b23f37 1320if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1321 goto got_it
1322
e1d1eefb 1323#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1324 if (utf8_target) { \
e1d1eefb
YO
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1326 } \
1327 else { \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1329 }
e1d1eefb 1330
4cadc6a9 1331#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
f2ed9b32 1332 if (utf8_target) { \
4cadc6a9
YO
1333 UtFpReLoAd; \
1334 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1335 } \
1336 else { \
1337 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1338 }
4cadc6a9
YO
1339
1340#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1341 PL_reg_flags |= RF_tainted; \
f2ed9b32 1342 if (utf8_target) { \
4cadc6a9
YO
1343 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1344 } \
1345 else { \
1346 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1347 }
4cadc6a9 1348
786e8c11
YO
1349#define DUMP_EXEC_POS(li,s,doutf8) \
1350 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1351
cfaf538b
KW
1352
1353#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1355 tmp = TEST_NON_UTF8(tmp); \
1356 REXEC_FBC_UTF8_SCAN( \
1357 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1358 tmp = !tmp; \
1359 IF_SUCCESS; \
1360 } \
1361 else { \
1362 IF_FAIL; \
1363 } \
1364 ); \
1365
1366#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367 if (s == PL_bostr) { \
1368 tmp = '\n'; \
1369 } \
1370 else { \
1371 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1372 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1373 } \
1374 tmp = TeSt1_UtF8; \
1375 LOAD_UTF8_CHARCLASS_ALNUM(); \
1376 REXEC_FBC_UTF8_SCAN( \
1377 if (tmp == ! (TeSt2_UtF8)) { \
1378 tmp = !tmp; \
1379 IF_SUCCESS; \
1380 } \
1381 else { \
1382 IF_FAIL; \
1383 } \
1384 ); \
1385
63ac0dad
KW
1386/* The only difference between the BOUND and NBOUND cases is that
1387 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388 * NBOUND. This is accomplished by passing it in either the if or else clause,
1389 * with the other one being empty */
1390#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1391 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1392
1393#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1394 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1395
1396#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1397 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1398
1399#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1400 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1401
63ac0dad
KW
1402
1403/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1404 * be passed in completely with the variable name being tested, which isn't
1405 * such a clean interface, but this is easier to read than it was before. We
1406 * are looking for the boundary (or non-boundary between a word and non-word
1407 * character. The utf8 and non-utf8 cases have the same logic, but the details
1408 * must be different. Find the "wordness" of the character just prior to this
1409 * one, and compare it with the wordness of this one. If they differ, we have
1410 * a boundary. At the beginning of the string, pretend that the previous
1411 * character was a new-line */
cfaf538b 1412#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1413 if (utf8_target) { \
cfaf538b 1414 UTF8_CODE \
63ac0dad
KW
1415 } \
1416 else { /* Not utf8 */ \
1417 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1418 tmp = TEST_NON_UTF8(tmp); \
1419 REXEC_FBC_SCAN( \
1420 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1421 tmp = !tmp; \
1422 IF_SUCCESS; \
1423 } \
1424 else { \
1425 IF_FAIL; \
1426 } \
1427 ); \
1428 } \
1429 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1430 goto got_it;
1431
786e8c11
YO
1432/* We know what class REx starts with. Try to find this position... */
1433/* if reginfo is NULL, its a dryrun */
1434/* annoyingly all the vars in this routine have different names from their counterparts
1435 in regmatch. /grrr */
1436
3c3eec57 1437STATIC char *
07be1b83 1438S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24b23f37 1439 const char *strend, regmatch_info *reginfo)
a687059c 1440{
27da23d5 1441 dVAR;
bbe252da 1442 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
fac1af77
KW
1443 char *pat_string; /* The pattern's exactish string */
1444 char *pat_end; /* ptr to end char of pat_string */
1445 re_fold_t folder; /* Function for computing non-utf8 folds */
1446 const U8 *fold_array; /* array for folding ords < 256 */
d8093b23 1447 STRLEN ln;
5dab1207 1448 STRLEN lnc;
eb578fdb 1449 STRLEN uskip;
fac1af77
KW
1450 U8 c1;
1451 U8 c2;
6eb5f6b9 1452 char *e;
eb578fdb
KW
1453 I32 tmp = 1; /* Scratch variable? */
1454 const bool utf8_target = PL_reg_match_utf8;
453bfd44 1455 UV utf8_fold_flags = 0;
f8fc2ecf 1456 RXi_GET_DECL(prog,progi);
7918f24d
NC
1457
1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
f8fc2ecf 1459
6eb5f6b9
JH
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
6eb5f6b9 1462 case ANYOF:
e0193e47 1463 if (utf8_target) {
b1e3e569 1464 REXEC_FBC_UTF8_CLASS_SCAN(
635cd5d4 1465 reginclass(prog, c, (U8*)s, utf8_target));
388cc4de
HS
1466 }
1467 else {
6ef69d56 1468 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
a0d0e21e 1469 }
6eb5f6b9 1470 break;
f33976b4 1471 case CANY:
4cadc6a9 1472 REXEC_FBC_SCAN(
24b23f37 1473 if (tmp && (!reginfo || regtry(reginfo, &s)))
f33976b4
DB
1474 goto got_it;
1475 else
1476 tmp = doevery;
4cadc6a9 1477 );
f33976b4 1478 break;
2f7f8cb1
KW
1479
1480 case EXACTFA:
1481 if (UTF_PATTERN || utf8_target) {
1482 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483 goto do_exactf_utf8;
1484 }
1485 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1486 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1487 goto do_exactf_non_utf8; /* isn't dealt with by these */
1488
6eb5f6b9 1489 case EXACTF:
62bf7766 1490 if (utf8_target) {
77a6d856
KW
1491
1492 /* regcomp.c already folded this if pattern is in UTF-8 */
62bf7766 1493 utf8_fold_flags = 0;
fac1af77
KW
1494 goto do_exactf_utf8;
1495 }
1496 fold_array = PL_fold;
1497 folder = foldEQ;
1498 goto do_exactf_non_utf8;
1499
1500 case EXACTFL:
1501 if (UTF_PATTERN || utf8_target) {
17580e7a 1502 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
fac1af77
KW
1503 goto do_exactf_utf8;
1504 }
1505 fold_array = PL_fold_locale;
1506 folder = foldEQ_locale;
16d951b7
KW
1507 goto do_exactf_non_utf8;
1508
3c760661
KW
1509 case EXACTFU_SS:
1510 if (UTF_PATTERN) {
1511 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512 }
1513 goto do_exactf_utf8;
1514
fab2782b 1515 case EXACTFU_TRICKYFOLD:
16d951b7
KW
1516 case EXACTFU:
1517 if (UTF_PATTERN || utf8_target) {
77a6d856 1518 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
16d951b7
KW
1519 goto do_exactf_utf8;
1520 }
1521
1522 /* Any 'ss' in the pattern should have been replaced by regcomp,
1523 * so we don't have to worry here about this single special case
1524 * in the Latin1 range */
1525 fold_array = PL_fold_latin1;
1526 folder = foldEQ_latin1;
fac1af77
KW
1527
1528 /* FALL THROUGH */
1529
62bf7766
KW
1530 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531 are no glitches with fold-length differences
1532 between the target string and pattern */
fac1af77
KW
1533
1534 /* The idea in the non-utf8 EXACTF* cases is to first find the
1535 * first character of the EXACTF* node and then, if necessary,
1536 * case-insensitively compare the full text of the node. c1 is the
1537 * first character. c2 is its fold. This logic will not work for
1538 * Unicode semantics and the german sharp ss, which hence should
1539 * not be compiled into a node that gets here. */
1540 pat_string = STRING(c);
1541 ln = STR_LEN(c); /* length to match in octets/bytes */
1542
8a90a8fe
KW
1543 /* We know that we have to match at least 'ln' bytes (which is the
1544 * same as characters, since not utf8). If we have to match 3
1545 * characters, and there are only 2 availabe, we know without
1546 * trying that it will fail; so don't start a match past the
1547 * required minimum number from the far end */
fac1af77
KW
1548 e = HOP3c(strend, -((I32)ln), s);
1549
1550 if (!reginfo && e < s) {
1551 e = s; /* Due to minlen logic of intuit() */
1552 }
1553
1554 c1 = *pat_string;
1555 c2 = fold_array[c1];
1556 if (c1 == c2) { /* If char and fold are the same */
1557 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1558 }
1559 else {
1560 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561 }
1562 break;
1563
1564 do_exactf_utf8:
e067297c
KW
1565 {
1566 unsigned expansion;
1567
fac1af77
KW
1568
1569 /* If one of the operands is in utf8, we can't use the simpler
1570 * folding above, due to the fact that many different characters
1571 * can have the same fold, or portion of a fold, or different-
1572 * length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
1576 lnc = (UTF_PATTERN) /* length to match in characters */
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
e067297c
KW
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
89378d8a
KW
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
e067297c 1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
89378d8a 1590 lnc = (lnc + expansion - 1) / expansion;
e067297c
KW
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
fac1af77
KW
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
1601
b33105db
KW
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
e067297c 1608
fac1af77
KW
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1612 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
bbdd8bad 1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
fac1af77
KW
1618 }
1619 break;
e067297c 1620 }
bbce6d69 1621 case BOUNDL:
3280af22 1622 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1623 FBC_BOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
a0ed51b3 1626 break;
bbce6d69 1627 case NBOUNDL:
3280af22 1628 PL_reg_flags |= RF_tainted;
63ac0dad
KW
1629 FBC_NBOUND(isALNUM_LC,
1630 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isALNUM_LC_utf8((U8*)s));
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
1635 isALNUM_uni(tmp),
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637 break;
cfaf538b
KW
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
a0d0e21e 1643 case NBOUND:
63ac0dad
KW
1644 FBC_NBOUND(isWORDCHAR,
1645 isALNUM_uni(tmp),
1646 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647 break;
cfaf538b
KW
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
63ac0dad
KW
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
1655 isALNUM_uni(tmp),
1656 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
1660 isALNUM_uni(tmp),
1661 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
a0ed51b3 1662 break;
bbce6d69 1663 case ALNUML:
4cadc6a9
YO
1664 REXEC_FBC_CSCAN_TAINT(
1665 isALNUM_LC_utf8((U8*)s),
1666 isALNUM_LC(*s)
1667 );
6895a8aa 1668 break;
980866de
KW
1669 case ALNUMU:
1670 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1673 isWORDCHAR_L1((U8) *s)
1674 );
6895a8aa 1675 break;
980866de
KW
1676 case ALNUM:
1677 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1680 isWORDCHAR((U8) *s)
1681 );
6895a8aa 1682 break;
cfaf538b 1683 case ALNUMA:
8e9da4d4
KW
1684 /* Don't need to worry about utf8, as it can match only a single
1685 * byte invariant character */
cfaf538b 1686 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
6895a8aa 1687 break;
980866de
KW
1688 case NALNUMU:
1689 REXEC_FBC_CSCAN_PRELOAD(
779d7b58 1690 LOAD_UTF8_CHARCLASS_ALNUM(),
359960d4 1691 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
980866de
KW
1692 ! isWORDCHAR_L1((U8) *s)
1693 );
6895a8aa 1694 break;
a0d0e21e 1695 case NALNUM:
4cadc6a9 1696 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1697 LOAD_UTF8_CHARCLASS_ALNUM(),
1698 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
980866de 1699 ! isALNUM(*s)
4cadc6a9 1700 );
6895a8aa 1701 break;
cfaf538b 1702 case NALNUMA:
8e9da4d4
KW
1703 REXEC_FBC_CSCAN(
1704 !isWORDCHAR_A(*s),
1705 !isWORDCHAR_A(*s)
1706 );
1707 break;
bbce6d69 1708 case NALNUML:
4cadc6a9
YO
1709 REXEC_FBC_CSCAN_TAINT(
1710 !isALNUM_LC_utf8((U8*)s),
1711 !isALNUM_LC(*s)
1712 );
6895a8aa 1713 break;
980866de
KW
1714 case SPACEU:
1715 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1716 LOAD_UTF8_CHARCLASS_SPACE(),
1717 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de
KW
1718 isSPACE_L1((U8) *s)
1719 );
6895a8aa 1720 break;
a0d0e21e 1721 case SPACE:
4cadc6a9 1722 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1723 LOAD_UTF8_CHARCLASS_SPACE(),
1724 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
980866de 1725 isSPACE((U8) *s)
4cadc6a9 1726 );
6895a8aa 1727 break;
cfaf538b 1728 case SPACEA:
8e9da4d4
KW
1729 /* Don't need to worry about utf8, as it can match only a single
1730 * byte invariant character */
cfaf538b 1731 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
6895a8aa 1732 break;
bbce6d69 1733 case SPACEL:
4cadc6a9 1734 REXEC_FBC_CSCAN_TAINT(
6bbba904 1735 isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1736 isSPACE_LC(*s)
1737 );
6895a8aa 1738 break;
980866de
KW
1739 case NSPACEU:
1740 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1741 LOAD_UTF8_CHARCLASS_SPACE(),
1742 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de
KW
1743 ! isSPACE_L1((U8) *s)
1744 );
6895a8aa 1745 break;
a0d0e21e 1746 case NSPACE:
4cadc6a9 1747 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1748 LOAD_UTF8_CHARCLASS_SPACE(),
1749 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
980866de 1750 ! isSPACE((U8) *s)
4cadc6a9 1751 );
6895a8aa 1752 break;
cfaf538b 1753 case NSPACEA:
8e9da4d4
KW
1754 REXEC_FBC_CSCAN(
1755 !isSPACE_A(*s),
1756 !isSPACE_A(*s)
1757 );
1758 break;
bbce6d69 1759 case NSPACEL:
4cadc6a9 1760 REXEC_FBC_CSCAN_TAINT(
6bbba904 1761 !isSPACE_LC_utf8((U8*)s),
4cadc6a9
YO
1762 !isSPACE_LC(*s)
1763 );
6895a8aa 1764 break;
a0d0e21e 1765 case DIGIT:
4cadc6a9 1766 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1767 LOAD_UTF8_CHARCLASS_DIGIT(),
1768 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1769 isDIGIT(*s)
1770 );
6895a8aa 1771 break;
cfaf538b 1772 case DIGITA:
8e9da4d4
KW
1773 /* Don't need to worry about utf8, as it can match only a single
1774 * byte invariant character */
cfaf538b 1775 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
6895a8aa 1776 break;
b8c5462f 1777 case DIGITL:
4cadc6a9
YO
1778 REXEC_FBC_CSCAN_TAINT(
1779 isDIGIT_LC_utf8((U8*)s),
1780 isDIGIT_LC(*s)
1781 );
6895a8aa 1782 break;
a0d0e21e 1783 case NDIGIT:
4cadc6a9 1784 REXEC_FBC_CSCAN_PRELOAD(
779d7b58
KW
1785 LOAD_UTF8_CHARCLASS_DIGIT(),
1786 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
4cadc6a9
YO
1787 !isDIGIT(*s)
1788 );
6895a8aa 1789 break;
cfaf538b 1790 case NDIGITA:
8e9da4d4
KW
1791 REXEC_FBC_CSCAN(
1792 !isDIGIT_A(*s),
1793 !isDIGIT_A(*s)
1794 );
1795 break;
b8c5462f 1796 case NDIGITL:
4cadc6a9
YO
1797 REXEC_FBC_CSCAN_TAINT(
1798 !isDIGIT_LC_utf8((U8*)s),
1799 !isDIGIT_LC(*s)
1800 );
6895a8aa 1801 break;
e1d1eefb
YO
1802 case LNBREAK:
1803 REXEC_FBC_CSCAN(
7016d6eb
DM
1804 is_LNBREAK_utf8_safe(s, strend),
1805 is_LNBREAK_latin1_safe(s, strend)
e1d1eefb 1806 );
6895a8aa 1807 break;
e1d1eefb
YO
1808 case VERTWS:
1809 REXEC_FBC_CSCAN(
7016d6eb
DM
1810 is_VERTWS_utf8_safe(s, strend),
1811 is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1812 );
6895a8aa 1813 break;
e1d1eefb
YO
1814 case NVERTWS:
1815 REXEC_FBC_CSCAN(
7016d6eb
DM
1816 !is_VERTWS_utf8_safe(s, strend),
1817 !is_VERTWS_latin1_safe(s, strend)
e1d1eefb 1818 );
6895a8aa 1819 break;
e1d1eefb
YO
1820 case HORIZWS:
1821 REXEC_FBC_CSCAN(
7016d6eb
DM
1822 is_HORIZWS_utf8_safe(s, strend),
1823 is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1824 );
6895a8aa 1825 break;
e1d1eefb
YO
1826 case NHORIZWS:
1827 REXEC_FBC_CSCAN(
7016d6eb
DM
1828 !is_HORIZWS_utf8_safe(s, strend),
1829 !is_HORIZWS_latin1_safe(s, strend)
e1d1eefb 1830 );
6895a8aa 1831 break;
0658cdde
KW
1832 case POSIXA:
1833 /* Don't need to worry about utf8, as it can match only a single
1834 * byte invariant character. The flag in this node type is the
1835 * class number to pass to _generic_isCC() to build a mask for
1836 * searching in PL_charclass[] */
1837 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1838 break;
1839 case NPOSIXA:
1840 REXEC_FBC_CSCAN(
1841 !_generic_isCC_A(*s, FLAGS(c)),
1842 !_generic_isCC_A(*s, FLAGS(c))
1843 );
1844 break;
1845
1de06328
YO
1846 case AHOCORASICKC:
1847 case AHOCORASICK:
07be1b83 1848 {
a0a388a1 1849 DECL_TRIE_TYPE(c);
07be1b83
YO
1850 /* what trie are we using right now */
1851 reg_ac_data *aho
f8fc2ecf 1852 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3251b653
NC
1853 reg_trie_data *trie
1854 = (reg_trie_data*)progi->data->data[ aho->trie ];
85fbaab2 1855 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
07be1b83
YO
1856
1857 const char *last_start = strend - trie->minlen;
6148ee25 1858#ifdef DEBUGGING
07be1b83 1859 const char *real_start = s;
6148ee25 1860#endif
07be1b83 1861 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1862 SV *sv_points;
1863 U8 **points; /* map of where we were in the input string
786e8c11 1864 when reading a given char. For ASCII this
be8e71aa 1865 is unnecessary overhead as the relationship
38a44b82
NC
1866 is always 1:1, but for Unicode, especially
1867 case folded Unicode this is not true. */
f9e705e8 1868 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
786e8c11
YO
1869 U8 *bitmap=NULL;
1870
07be1b83
YO
1871
1872 GET_RE_DEBUG_FLAGS_DECL;
1873
be8e71aa
YO
1874 /* We can't just allocate points here. We need to wrap it in
1875 * an SV so it gets freed properly if there is a croak while
1876 * running the match */
1877 ENTER;
1878 SAVETMPS;
1879 sv_points=newSV(maxlen * sizeof(U8 *));
1880 SvCUR_set(sv_points,
1881 maxlen * sizeof(U8 *));
1882 SvPOK_on(sv_points);
1883 sv_2mortal(sv_points);
1884 points=(U8**)SvPV_nolen(sv_points );
1de06328
YO
1885 if ( trie_type != trie_utf8_fold
1886 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1887 {
786e8c11
YO
1888 if (trie->bitmap)
1889 bitmap=(U8*)trie->bitmap;
1890 else
1891 bitmap=(U8*)ANYOF_BITMAP(c);
07be1b83 1892 }
786e8c11
YO
1893 /* this is the Aho-Corasick algorithm modified a touch
1894 to include special handling for long "unknown char"
1895 sequences. The basic idea being that we use AC as long
1896 as we are dealing with a possible matching char, when
1897 we encounter an unknown char (and we have not encountered
1898 an accepting state) we scan forward until we find a legal
1899 starting char.
1900 AC matching is basically that of trie matching, except
1901 that when we encounter a failing transition, we fall back
1902 to the current states "fail state", and try the current char
1903 again, a process we repeat until we reach the root state,
1904 state 1, or a legal transition. If we fail on the root state
1905 then we can either terminate if we have reached an accepting
1906 state previously, or restart the entire process from the beginning
1907 if we have not.
1908
1909 */
07be1b83
YO
1910 while (s <= last_start) {
1911 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1912 U8 *uc = (U8*)s;
1913 U16 charid = 0;
1914 U32 base = 1;
1915 U32 state = 1;
1916 UV uvc = 0;
1917 STRLEN len = 0;
1918 STRLEN foldlen = 0;
1919 U8 *uscan = (U8*)NULL;
1920 U8 *leftmost = NULL;
786e8c11
YO
1921#ifdef DEBUGGING
1922 U32 accepted_word= 0;
1923#endif
07be1b83
YO
1924 U32 pointpos = 0;
1925
1926 while ( state && uc <= (U8*)strend ) {
1927 int failed=0;
786e8c11
YO
1928 U32 word = aho->states[ state ].wordnum;
1929
1de06328
YO
1930 if( state==1 ) {
1931 if ( bitmap ) {
1932 DEBUG_TRIE_EXECUTE_r(
1933 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1934 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1935 (char *)uc, utf8_target );
1de06328
YO
1936 PerlIO_printf( Perl_debug_log,
1937 " Scanning for legal start char...\n");
1938 }
d085b490
YO
1939 );
1940 if (utf8_target) {
1941 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1942 uc += UTF8SKIP(uc);
1943 }
1944 } else {
1945 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1946 uc++;
1947 }
1948 }
1de06328 1949 s= (char *)uc;
786e8c11 1950 }
786e8c11
YO
1951 if (uc >(U8*)last_start) break;
1952 }
1953
1954 if ( word ) {
2e64971a 1955 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
786e8c11
YO
1956 if (!leftmost || lpos < leftmost) {
1957 DEBUG_r(accepted_word=word);
07be1b83 1958 leftmost= lpos;
786e8c11 1959 }
07be1b83 1960 if (base==0) break;
786e8c11 1961
07be1b83
YO
1962 }
1963 points[pointpos++ % maxlen]= uc;
7016d6eb
DM
1964 if (foldlen || uc < (U8*)strend) {
1965 REXEC_TRIE_READ_CHAR(trie_type, trie,
1966 widecharmap, uc,
55eed653
NC
1967 uscan, len, uvc, charid, foldlen,
1968 foldbuf, uniflags);
7016d6eb
DM
1969 DEBUG_TRIE_EXECUTE_r({
1970 dump_exec_pos( (char *)uc, c, strend,
1971 real_start, s, utf8_target);
1972 PerlIO_printf(Perl_debug_log,
1973 " Charid:%3u CP:%4"UVxf" ",
1974 charid, uvc);
1975 });
1976 }
1977 else {
1978 len = 0;
1979 charid = 0;
1980 }
1981
07be1b83
YO
1982
1983 do {
6148ee25 1984#ifdef DEBUGGING
786e8c11 1985 word = aho->states[ state ].wordnum;
6148ee25 1986#endif
07be1b83
YO
1987 base = aho->states[ state ].trans.base;
1988
786e8c11
YO
1989 DEBUG_TRIE_EXECUTE_r({
1990 if (failed)
1991 dump_exec_pos( (char *)uc, c, strend, real_start,
f2ed9b32 1992 s, utf8_target );
07be1b83 1993 PerlIO_printf( Perl_debug_log,
786e8c11
YO
1994 "%sState: %4"UVxf", word=%"UVxf,
1995 failed ? " Fail transition to " : "",
1996 (UV)state, (UV)word);
1997 });
07be1b83
YO
1998 if ( base ) {
1999 U32 tmp;
6dd2be57 2000 I32 offset;
07be1b83 2001 if (charid &&
6dd2be57
DM
2002 ( ((offset = base + charid
2003 - 1 - trie->uniquecharcount)) >= 0)
2004 && ((U32)offset < trie->lasttrans)
2005 && trie->trans[offset].check == state
2006 && (tmp=trie->trans[offset].next))
07be1b83 2007 {
786e8c11
YO
2008 DEBUG_TRIE_EXECUTE_r(
2009 PerlIO_printf( Perl_debug_log," - legal\n"));
07be1b83
YO
2010 state = tmp;
2011 break;
2012 }
2013 else {
786e8c11
YO
2014 DEBUG_TRIE_EXECUTE_r(
2015 PerlIO_printf( Perl_debug_log," - fail\n"));
2016 failed = 1;
2017 state = aho->fail[state];
07be1b83
YO
2018 }
2019 }
2020 else {
2021 /* we must be accepting here */
786e8c11
YO
2022 DEBUG_TRIE_EXECUTE_r(
2023 PerlIO_printf( Perl_debug_log," - accepting\n"));
2024 failed = 1;
07be1b83
YO
2025 break;
2026 }
2027 } while(state);
786e8c11 2028 uc += len;
07be1b83
YO
2029 if (failed) {
2030 if (leftmost)
2031 break;
786e8c11 2032 if (!state) state = 1;
07be1b83
YO
2033 }
2034 }
2035 if ( aho->states[ state ].wordnum ) {
2e64971a 2036 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
786e8c11
YO
2037 if (!leftmost || lpos < leftmost) {
2038 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
07be1b83 2039 leftmost = lpos;
786e8c11 2040 }
07be1b83 2041 }
07be1b83
YO
2042 if (leftmost) {
2043 s = (char*)leftmost;
786e8c11
YO
2044 DEBUG_TRIE_EXECUTE_r({
2045 PerlIO_printf(
70685ca0
JH
2046 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2047 (UV)accepted_word, (IV)(s - real_start)
786e8c11
YO
2048 );
2049 });
24b23f37 2050 if (!reginfo || regtry(reginfo, &s)) {
be8e71aa
YO
2051 FREETMPS;
2052 LEAVE;
07be1b83 2053 goto got_it;
be8e71aa 2054 }
07be1b83 2055 s = HOPc(s,1);
786e8c11
YO
2056 DEBUG_TRIE_EXECUTE_r({
2057 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2058 });
07be1b83 2059 } else {
786e8c11
YO
2060 DEBUG_TRIE_EXECUTE_r(
2061 PerlIO_printf( Perl_debug_log,"No match.\n"));
07be1b83
YO
2062 break;
2063 }
2064 }
be8e71aa
YO
2065 FREETMPS;
2066 LEAVE;
07be1b83
YO
2067 }
2068 break;
b3c9acc1 2069 default:
3c3eec57
GS
2070 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2071 break;
d6a28714 2072 }
6eb5f6b9
JH
2073 return 0;
2074 got_it:
2075 return s;
2076}
2077
fae667d5 2078
6eb5f6b9
JH
2079/*
2080 - regexec_flags - match a regexp against a string
2081 */
2082I32
288b8c02 2083Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
6eb5f6b9 2084 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2085/* stringarg: the point in the string at which to begin matching */
2086/* strend: pointer to null at end of string */
2087/* strbeg: real beginning of string */
2088/* minend: end of match must be >= minend bytes after stringarg. */
2089/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2090 * itself is accessed via the pointers above */
2091/* data: May be used for some additional optimizations.
2092 Currently its only used, with a U32 cast, for transmitting
2093 the ganch offset when doing a /g match. This will change */
2094/* nosave: For optimizations. */
2095
6eb5f6b9 2096{
97aff369 2097 dVAR;
8d919b0a 2098 struct regexp *const prog = ReANY(rx);
24b23f37 2099 /*register*/ char *s;
eb578fdb 2100 regnode *c;
24b23f37 2101 /*register*/ char *startpos = stringarg;
6eb5f6b9
JH
2102 I32 minlen; /* must match at least this many chars */
2103 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2104 I32 end_shift = 0; /* Same for the end. */ /* CC */
2105 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2106 char *scream_olds = NULL;
f2ed9b32 2107 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2108 I32 multiline;
f8fc2ecf 2109 RXi_GET_DECL(prog,progi);
3b0527fe 2110 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2111 regexp_paren_pair *swap = NULL;
a3621e74
YO
2112 GET_RE_DEBUG_FLAGS_DECL;
2113
7918f24d 2114 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2115 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2116
2117 /* Be paranoid... */
2118 if (prog == NULL || startpos == NULL) {
2119 Perl_croak(aTHX_ "NULL regexp parameter");
2120 return 0;
2121 }
2122
bbe252da 2123 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2124 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2125
f2ed9b32 2126 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2127 DEBUG_EXECUTE_r(
f2ed9b32 2128 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2129 "Matching");
2130 );
bac06658 2131
6eb5f6b9 2132 minlen = prog->minlen;
1de06328
YO
2133
2134 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2135 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2136 "String too short [regexec_flags]...\n"));
2137 goto phooey;
1aa99e6b 2138 }
6eb5f6b9 2139
1de06328 2140
6eb5f6b9 2141 /* Check validity of program. */
f8fc2ecf 2142 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2143 Perl_croak(aTHX_ "corrupted regexp program");
2144 }
2145
2146 PL_reg_flags = 0;
ed301438 2147 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2148 PL_reg_maxiter = 0;
2149
3c8556c3 2150 if (RX_UTF8(rx))
6eb5f6b9
JH
2151 PL_reg_flags |= RF_utf8;
2152
2153 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2154 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2155 PL_bostr = strbeg;
3b0527fe 2156 reginfo.sv = sv;
6eb5f6b9
JH
2157
2158 /* Mark end of line for $ (and such) */
2159 PL_regeol = strend;
2160
2161 /* see how far we have to get to not match where we matched before */
3b0527fe 2162 reginfo.till = startpos+minend;
6eb5f6b9 2163
6eb5f6b9
JH
2164 /* If there is a "must appear" string, look for it. */
2165 s = startpos;
2166
bbe252da 2167 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2168 MAGIC *mg;
2c296965 2169 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2170 reginfo.ganch = startpos + prog->gofs;
2c296965 2171 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2172 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2173 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2174 && SvMAGIC(sv)
14befaf4
DM
2175 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2176 && mg->mg_len >= 0) {
3b0527fe 2177 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2178 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2179 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2180
bbe252da 2181 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2182 if (s > reginfo.ganch)
6eb5f6b9 2183 goto phooey;
58e23c8d 2184 s = reginfo.ganch - prog->gofs;
2c296965 2185 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2186 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2187 if (s < strbeg)
2188 goto phooey;
6eb5f6b9
JH
2189 }
2190 }
58e23c8d 2191 else if (data) {
70685ca0 2192 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2193 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2195
2196 } else { /* pos() not defined */
3b0527fe 2197 reginfo.ganch = strbeg;
2c296965
YO
2198 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2199 "GPOS: reginfo.ganch = strbeg\n"));
2200 }
6eb5f6b9 2201 }
288b8c02 2202 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2203 /* We have to be careful. If the previous successful match
2204 was from this regex we don't want a subsequent partially
2205 successful match to clobber the old results.
2206 So when we detect this possibility we add a swap buffer
2207 to the re, and switch the buffer each match. If we fail
2208 we switch it back, otherwise we leave it swapped.
2209 */
2210 swap = prog->offs;
2211 /* do we need a save destructor here for eval dies? */
2212 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2213 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2214 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2215 PTR2UV(prog),
2216 PTR2UV(swap),
2217 PTR2UV(prog->offs)
2218 ));
c74340f9 2219 }
a0714e2c 2220 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2221 re_scream_pos_data d;
2222
2223 d.scream_olds = &scream_olds;
2224 d.scream_pos = &scream_pos;
288b8c02 2225 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2226 if (!s) {
a3621e74 2227 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2228 goto phooey; /* not present */
3fa9c3d7 2229 }
6eb5f6b9
JH
2230 }
2231
1de06328 2232
6eb5f6b9
JH
2233
2234 /* Simplest case: anchored match need be tried only once. */
2235 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2236 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2237 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2238 goto got_it;
bbe252da
YO
2239 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2240 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2241 {
2242 char *end;
2243
2244 if (minlen)
2245 dontbother = minlen - 1;
1aa99e6b 2246 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2247 /* for multiline we only have to try after newlines */
33b8afdf 2248 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2249 /* because of the goto we can not easily reuse the macros for bifurcating the
2250 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2251 if (utf8_target) {
2252 if (s == startpos)
2253 goto after_try_utf8;
2254 while (1) {
2255 if (regtry(&reginfo, &s)) {
2256 goto got_it;
2257 }
2258 after_try_utf8:
2259 if (s > end) {
2260 goto phooey;
2261 }
2262 if (prog->extflags & RXf_USE_INTUIT) {
2263 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2264 if (!s) {
2265 goto phooey;
2266 }
2267 }
2268 else {
2269 s += UTF8SKIP(s);
2270 }
2271 }
2272 } /* end search for check string in unicode */
2273 else {
2274 if (s == startpos) {
2275 goto after_try_latin;
2276 }
2277 while (1) {
2278 if (regtry(&reginfo, &s)) {
2279 goto got_it;
2280 }
2281 after_try_latin:
2282 if (s > end) {
2283 goto phooey;
2284 }
2285 if (prog->extflags & RXf_USE_INTUIT) {
2286 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2287 if (!s) {
2288 goto phooey;
2289 }
2290 }
2291 else {
2292 s++;
2293 }
2294 }
2295 } /* end search for check string in latin*/
2296 } /* end search for check string */
2297 else { /* search for newline */
2298 if (s > startpos) {
2299 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2300 s--;
92f3d482 2301 }
21eede78
YO
2302 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2303 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2304 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2305 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2306 goto got_it;
2307 }
92f3d482
YO
2308 }
2309 } /* end search for newline */
2310 } /* end anchored/multiline check string search */
6eb5f6b9 2311 goto phooey;
bbe252da 2312 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2313 {
486ec47a 2314 /* the warning about reginfo.ganch being used without initialization
bbe252da 2315 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2316 and we only enter this block when the same bit is set. */
58e23c8d 2317 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2318
2319 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2320 goto got_it;
2321 goto phooey;
2322 }
2323
2324 /* Messy cases: unanchored match. */
bbe252da 2325 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2326 /* we have /x+whatever/ */
f2ed9b32 2327 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
33b8afdf 2328 char ch;
bf93d4cc
GS
2329#ifdef DEBUGGING
2330 int did_match = 0;
2331#endif
f2ed9b32 2332 if (utf8_target) {
7e0d5ad7
KW
2333 if (! prog->anchored_utf8) {
2334 to_utf8_substr(prog);
2335 }
2336 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2337 REXEC_FBC_SCAN(
6eb5f6b9 2338 if (*s == ch) {
a3621e74 2339 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2340 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2341 s += UTF8SKIP(s);
2342 while (s < strend && *s == ch)
2343 s += UTF8SKIP(s);
2344 }
4cadc6a9 2345 );
7e0d5ad7 2346
6eb5f6b9
JH
2347 }
2348 else {
7e0d5ad7
KW
2349 if (! prog->anchored_substr) {
2350 if (! to_byte_substr(prog)) {
6b54ddc5 2351 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2352 }
2353 }
2354 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2355 REXEC_FBC_SCAN(
6eb5f6b9 2356 if (*s == ch) {
a3621e74 2357 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2358 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2359 s++;
2360 while (s < strend && *s == ch)
2361 s++;
2362 }
4cadc6a9 2363 );
6eb5f6b9 2364 }
a3621e74 2365 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2366 PerlIO_printf(Perl_debug_log,
b7953727
JH
2367 "Did not find anchored character...\n")
2368 );
6eb5f6b9 2369 }
a0714e2c
SS
2370 else if (prog->anchored_substr != NULL
2371 || prog->anchored_utf8 != NULL
2372 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2373 && prog->float_max_offset < strend - s)) {
2374 SV *must;
2375 I32 back_max;
2376 I32 back_min;
2377 char *last;
6eb5f6b9 2378 char *last1; /* Last position checked before */
bf93d4cc
GS
2379#ifdef DEBUGGING
2380 int did_match = 0;
2381#endif
33b8afdf 2382 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2383 if (utf8_target) {
2384 if (! prog->anchored_utf8) {
2385 to_utf8_substr(prog);
2386 }
2387 must = prog->anchored_utf8;
2388 }
2389 else {
2390 if (! prog->anchored_substr) {
2391 if (! to_byte_substr(prog)) {
6b54ddc5 2392 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2393 }
2394 }
2395 must = prog->anchored_substr;
2396 }
33b8afdf
JH
2397 back_max = back_min = prog->anchored_offset;
2398 } else {
7e0d5ad7
KW
2399 if (utf8_target) {
2400 if (! prog->float_utf8) {
2401 to_utf8_substr(prog);
2402 }
2403 must = prog->float_utf8;
2404 }
2405 else {
2406 if (! prog->float_substr) {
2407 if (! to_byte_substr(prog)) {
6b54ddc5 2408 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2409 }
2410 }
2411 must = prog->float_substr;
2412 }
33b8afdf
JH
2413 back_max = prog->float_max_offset;
2414 back_min = prog->float_min_offset;
2415 }
1de06328 2416
1de06328
YO
2417 if (back_min<0) {
2418 last = strend;
2419 } else {
2420 last = HOP3c(strend, /* Cannot start after this */
2421 -(I32)(CHR_SVLEN(must)
2422 - (SvTAIL(must) != 0) + back_min), strbeg);
2423 }
6eb5f6b9
JH
2424 if (s > PL_bostr)
2425 last1 = HOPc(s, -1);
2426 else
2427 last1 = s - 1; /* bogus */
2428
a0288114 2429 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2430 check_substr==must. */
2431 scream_pos = -1;
2432 dontbother = end_shift;
2433 strend = HOPc(strend, -dontbother);
2434 while ( (s <= last) &&
c33e64f0 2435 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2436 (unsigned char*)strend, must,
c33e64f0 2437 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2438 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2439 if (HOPc(s, -back_max) > last1) {
2440 last1 = HOPc(s, -back_min);
2441 s = HOPc(s, -back_max);
2442 }
2443 else {
52657f30 2444 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2445
2446 last1 = HOPc(s, -back_min);
52657f30 2447 s = t;
6eb5f6b9 2448 }
f2ed9b32 2449 if (utf8_target) {
6eb5f6b9 2450 while (s <= last1) {
24b23f37 2451 if (regtry(&reginfo, &s))
6eb5f6b9 2452 goto got_it;
7016d6eb
DM
2453 if (s >= last1) {
2454 s++; /* to break out of outer loop */
2455 break;
2456 }
2457 s += UTF8SKIP(s);
6eb5f6b9
JH
2458 }
2459 }
2460 else {
2461 while (s <= last1) {
24b23f37 2462 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2463 goto got_it;
2464 s++;
2465 }
2466 }
2467 }
ab3bbdeb 2468 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2469 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2470 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2472 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2473 ? "anchored" : "floating"),
ab3bbdeb
YO
2474 quoted, RE_SV_TAIL(must));
2475 });
6eb5f6b9
JH
2476 goto phooey;
2477 }
f8fc2ecf 2478 else if ( (c = progi->regstclass) ) {
f14c76ed 2479 if (minlen) {
f8fc2ecf 2480 const OPCODE op = OP(progi->regstclass);
66e933ab 2481 /* don't bother with what can't match */
786e8c11 2482 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2483 strend = HOPc(strend, -(minlen - 1));
2484 }
a3621e74 2485 DEBUG_EXECUTE_r({
be8e71aa 2486 SV * const prop = sv_newmortal();
32fc9b6a 2487 regprop(prog, prop, c);
0df25f3d 2488 {
f2ed9b32 2489 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2490 s,strend-s,60);
0df25f3d 2491 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2492 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2493 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2494 quoted, (int)(strend - s));
0df25f3d 2495 }
ffc61ed2 2496 });
3b0527fe 2497 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 2498 goto got_it;
07be1b83 2499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2500 }
2501 else {
2502 dontbother = 0;
a0714e2c 2503 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2504 /* Trim the end. */
6af40bd7 2505 char *last= NULL;
33b8afdf 2506 SV* float_real;
c33e64f0
FC
2507 STRLEN len;
2508 const char *little;
33b8afdf 2509
7e0d5ad7
KW
2510 if (utf8_target) {
2511 if (! prog->float_utf8) {
2512 to_utf8_substr(prog);
2513 }
2514 float_real = prog->float_utf8;
2515 }
2516 else {
2517 if (! prog->float_substr) {
2518 if (! to_byte_substr(prog)) {
6b54ddc5 2519 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2520 }
2521 }
2522 float_real = prog->float_substr;
2523 }
d6a28714 2524
c33e64f0
FC
2525 little = SvPV_const(float_real, len);
2526 if (SvTAIL(float_real)) {
7f18ad16
KW
2527 /* This means that float_real contains an artificial \n on
2528 * the end due to the presence of something like this:
2529 * /foo$/ where we can match both "foo" and "foo\n" at the
2530 * end of the string. So we have to compare the end of the
2531 * string first against the float_real without the \n and
2532 * then against the full float_real with the string. We
2533 * have to watch out for cases where the string might be
2534 * smaller than the float_real or the float_real without
2535 * the \n. */
1a13b075
YO
2536 char *checkpos= strend - len;
2537 DEBUG_OPTIMISE_r(
2538 PerlIO_printf(Perl_debug_log,
2539 "%sChecking for float_real.%s\n",
2540 PL_colors[4], PL_colors[5]));
2541 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2542 /* can't match, even if we remove the trailing \n
2543 * string is too short to match */
1a13b075
YO
2544 DEBUG_EXECUTE_r(
2545 PerlIO_printf(Perl_debug_log,
2546 "%sString shorter than required trailing substring, cannot match.%s\n",
2547 PL_colors[4], PL_colors[5]));
2548 goto phooey;
2549 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2550 /* can match, the end of the string matches without the
2551 * "\n" */
1a13b075
YO
2552 last = checkpos + 1;
2553 } else if (checkpos < strbeg) {
7f18ad16
KW
2554 /* cant match, string is too short when the "\n" is
2555 * included */
1a13b075
YO
2556 DEBUG_EXECUTE_r(
2557 PerlIO_printf(Perl_debug_log,
2558 "%sString does not contain required trailing substring, cannot match.%s\n",
2559 PL_colors[4], PL_colors[5]));
2560 goto phooey;
2561 } else if (!multiline) {
7f18ad16
KW
2562 /* non multiline match, so compare with the "\n" at the
2563 * end of the string */
1a13b075
YO
2564 if (memEQ(checkpos, little, len)) {
2565 last= checkpos;
2566 } else {
2567 DEBUG_EXECUTE_r(
2568 PerlIO_printf(Perl_debug_log,
2569 "%sString does not contain required trailing substring, cannot match.%s\n",
2570 PL_colors[4], PL_colors[5]));
2571 goto phooey;
2572 }
2573 } else {
7f18ad16
KW
2574 /* multiline match, so we have to search for a place
2575 * where the full string is located */
d6a28714 2576 goto find_last;
1a13b075 2577 }
c33e64f0 2578 } else {
d6a28714 2579 find_last:
9041c2e3 2580 if (len)
d6a28714 2581 last = rninstr(s, strend, little, little + len);
b8c5462f 2582 else
a0288114 2583 last = strend; /* matching "$" */
b8c5462f 2584 }
6af40bd7 2585 if (!last) {
7f18ad16
KW
2586 /* at one point this block contained a comment which was
2587 * probably incorrect, which said that this was a "should not
2588 * happen" case. Even if it was true when it was written I am
2589 * pretty sure it is not anymore, so I have removed the comment
2590 * and replaced it with this one. Yves */
6bda09f9
YO
2591 DEBUG_EXECUTE_r(
2592 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2593 "String does not contain required substring, cannot match.\n"
2594 ));
2595 goto phooey;
bf93d4cc 2596 }
d6a28714
JH
2597 dontbother = strend - last + prog->float_min_offset;
2598 }
2599 if (minlen && (dontbother < minlen))
2600 dontbother = minlen - 1;
2601 strend -= dontbother; /* this one's always in bytes! */
2602 /* We don't know much -- general case. */
f2ed9b32 2603 if (utf8_target) {
d6a28714 2604 for (;;) {
24b23f37 2605 if (regtry(&reginfo, &s))
d6a28714
JH
2606 goto got_it;
2607 if (s >= strend)
2608 break;
b8c5462f 2609 s += UTF8SKIP(s);
d6a28714
JH
2610 };
2611 }
2612 else {
2613 do {
24b23f37 2614 if (regtry(&reginfo, &s))
d6a28714
JH
2615 goto got_it;
2616 } while (s++ < strend);
2617 }
2618 }
2619
2620 /* Failure. */
2621 goto phooey;
2622
2623got_it:
495f47a5
DM
2624 DEBUG_BUFFERS_r(
2625 if (swap)
2626 PerlIO_printf(Perl_debug_log,
2627 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2628 PTR2UV(prog),
2629 PTR2UV(swap)
2630 );
2631 );
e9105d30 2632 Safefree(swap);
288b8c02 2633 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2634
ed301438 2635 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2636 restore_pos(aTHX_ prog);
5daac39c
NC
2637 if (RXp_PAREN_NAMES(prog))
2638 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2639
2640 /* make sure $`, $&, $', and $digit will work later */
2641 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2642 if (flags & REXEC_COPY_STR) {
f8c7b90f 2643#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2644 if ((SvIsCOW(sv)
2645 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2646 if (DEBUG_C_TEST) {
2647 PerlIO_printf(Perl_debug_log,
2648 "Copy on write: regexp capture, type %d\n",
2649 (int) SvTYPE(sv));
2650 }
77f8f7c1 2651 RX_MATCH_COPY_FREE(rx);
ed252734 2652 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2653 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2654 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2655 prog->sublen = PL_regeol - strbeg;
2656 prog->suboffset = 0;
2657 prog->subcoffset = 0;
ed252734
NC
2658 } else
2659#endif
2660 {
6502e081
DM
2661 I32 min = 0;
2662 I32 max = PL_regeol - strbeg;
2663 I32 sublen;
2664
2665 if ( (flags & REXEC_COPY_SKIP_POST)
2666 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2667 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2668 ) { /* don't copy $' part of string */
3de645a8 2669 U32 n = 0;
6502e081
DM
2670 max = -1;
2671 /* calculate the right-most part of the string covered
2672 * by a capture. Due to look-ahead, this may be to
2673 * the right of $&, so we have to scan all captures */
2674 while (n <= prog->lastparen) {
2675 if (prog->offs[n].end > max)
2676 max = prog->offs[n].end;
2677 n++;
2678 }
2679 if (max == -1)
2680 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2681 ? prog->offs[0].start
2682 : 0;
2683 assert(max >= 0 && max <= PL_regeol - strbeg);
2684 }
2685
2686 if ( (flags & REXEC_COPY_SKIP_PRE)
2687 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2688 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2689 ) { /* don't copy $` part of string */
3de645a8 2690 U32 n = 0;
6502e081
DM
2691 min = max;
2692 /* calculate the left-most part of the string covered
2693 * by a capture. Due to look-behind, this may be to
2694 * the left of $&, so we have to scan all captures */
2695 while (min && n <= prog->lastparen) {
2696 if ( prog->offs[n].start != -1
2697 && prog->offs[n].start < min)
2698 {
2699 min = prog->offs[n].start;
2700 }
2701 n++;
2702 }
2703 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2704 && min > prog->offs[0].end
2705 )
2706 min = prog->offs[0].end;
2707
2708 }
2709
2710 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2711 sublen = max - min;
2712
2713 if (RX_MATCH_COPIED(rx)) {
2714 if (sublen > prog->sublen)
2715 prog->subbeg =
2716 (char*)saferealloc(prog->subbeg, sublen+1);
2717 }
2718 else
2719 prog->subbeg = (char*)safemalloc(sublen+1);
2720 Copy(strbeg + min, prog->subbeg, sublen, char);
2721 prog->subbeg[sublen] = '\0';
2722 prog->suboffset = min;
2723 prog->sublen = sublen;
77f8f7c1 2724 RX_MATCH_COPIED_on(rx);
6502e081 2725 }
6502e081
DM
2726 prog->subcoffset = prog->suboffset;
2727 if (prog->suboffset && utf8_target) {
2728 /* Convert byte offset to chars.
2729 * XXX ideally should only compute this if @-/@+
2730 * has been seen, a la PL_sawampersand ??? */
2731
2732 /* If there's a direct correspondence between the
2733 * string which we're matching and the original SV,
2734 * then we can use the utf8 len cache associated with
2735 * the SV. In particular, it means that under //g,
2736 * sv_pos_b2u() will use the previously cached
2737 * position to speed up working out the new length of
2738 * subcoffset, rather than counting from the start of
2739 * the string each time. This stops
2740 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2741 * from going quadratic */
2742 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2743 sv_pos_b2u(sv, &(prog->subcoffset));
2744 else
2745 prog->subcoffset = utf8_length((U8*)strbeg,
2746 (U8*)(strbeg+prog->suboffset));
2747 }
d6a28714
JH
2748 }
2749 else {
6502e081 2750 RX_MATCH_COPY_FREE(rx);
d6a28714 2751 prog->subbeg = strbeg;
6502e081
DM
2752 prog->suboffset = 0;
2753 prog->subcoffset = 0;
d6a28714
JH
2754 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2755 }
2756 }
9041c2e3 2757
d6a28714
JH
2758 return 1;
2759
2760phooey:
a3621e74 2761 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2762 PL_colors[4], PL_colors[5]));
ed301438 2763 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2764 restore_pos(aTHX_ prog);
e9105d30 2765 if (swap) {
c74340f9 2766 /* we failed :-( roll it back */
495f47a5
DM
2767 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2768 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2769 PTR2UV(prog),
2770 PTR2UV(prog->offs),
2771 PTR2UV(swap)
2772 ));
e9105d30
GG
2773 Safefree(prog->offs);
2774 prog->offs = swap;
2775 }
d6a28714
JH
2776 return 0;
2777}
2778
6bda09f9 2779
ec43f78b
DM
2780/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2781 * Do inc before dec, in case old and new rex are the same */
2782#define SET_reg_curpm(Re2) \
2783 if (PL_reg_state.re_state_eval_setup_done) { \
2784 (void)ReREFCNT_inc(Re2); \
2785 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2786 PM_SETRE((PL_reg_curpm), (Re2)); \
2787 }
2788
2789
d6a28714
JH
2790/*
2791 - regtry - try match at specific point
2792 */
2793STATIC I32 /* 0 failure, 1 success */
f73aaa43 2794S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2795{
97aff369 2796 dVAR;
d6a28714 2797 CHECKPOINT lastcp;
288b8c02 2798 REGEXP *const rx = reginfo->prog;
8d919b0a 2799 regexp *const prog = ReANY(rx);
f73aaa43 2800 I32 result;
f8fc2ecf 2801 RXi_GET_DECL(prog,progi);
a3621e74 2802 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2803
2804 PERL_ARGS_ASSERT_REGTRY;
2805
24b23f37 2806 reginfo->cutpoint=NULL;
d6a28714 2807
ed301438
DM
2808 if ((prog->extflags & RXf_EVAL_SEEN)
2809 && !PL_reg_state.re_state_eval_setup_done)
2810 {
d6a28714
JH
2811 MAGIC *mg;
2812
ed301438 2813 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2814 if (reginfo->sv) {
d6a28714 2815 /* Make $_ available to executed code. */
3b0527fe 2816 if (reginfo->sv != DEFSV) {
59f00321 2817 SAVE_DEFSV;
414bf5ae 2818 DEFSV_set(reginfo->sv);
b8c5462f 2819 }
d6a28714 2820
3b0527fe
DM
2821 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2822 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2823 /* prepare for quick setting of pos */
d300d9fa 2824#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2825 if (SvIsCOW(reginfo->sv))
2826 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2827#endif
3dab1dad 2828 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2829 &PL_vtbl_mglob, NULL, 0);
d6a28714 2830 mg->mg_len = -1;
b8c5462f 2831 }
d6a28714
JH
2832 PL_reg_magic = mg;
2833 PL_reg_oldpos = mg->mg_len;
4f639d21 2834 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2835 }
09687e5a 2836 if (!PL_reg_curpm) {
a02a5408 2837 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2838#ifdef USE_ITHREADS
2839 {
14a49a24 2840 SV* const repointer = &PL_sv_undef;
92313705
NC
2841 /* this regexp is also owned by the new PL_reg_curpm, which
2842 will try to free it. */
d2ece331 2843 av_push(PL_regex_padav, repointer);
09687e5a
AB
2844 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2845 PL_regex_pad = AvARRAY(PL_regex_padav);
2846 }
2847#endif
2848 }
ec43f78b 2849 SET_reg_curpm(rx);
d6a28714
JH
2850 PL_reg_oldcurpm = PL_curpm;
2851 PL_curpm = PL_reg_curpm;
07bc277f 2852 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2853 /* Here is a serious problem: we cannot rewrite subbeg,
2854 since it may be needed if this match fails. Thus
2855 $` inside (?{}) could fail... */
2856 PL_reg_oldsaved = prog->subbeg;
2857 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2858 PL_reg_oldsavedoffset = prog->suboffset;
2859 PL_reg_oldsavedcoffset = prog->suboffset;
f8c7b90f 2860#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2861 PL_nrs = prog->saved_copy;
2862#endif
07bc277f 2863 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2864 }
2865 else
bd61b366 2866 PL_reg_oldsaved = NULL;
d6a28714 2867 prog->subbeg = PL_bostr;
6502e081
DM
2868 prog->suboffset = 0;
2869 prog->subcoffset = 0;
d6a28714
JH
2870 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2871 }
97ca13b7 2872#ifdef DEBUGGING
f73aaa43 2873 PL_reg_starttry = *startposp;
97ca13b7 2874#endif
f73aaa43 2875 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2876 prog->lastparen = 0;
03994de8 2877 prog->lastcloseparen = 0;
d6a28714 2878 PL_regsize = 0;
d6a28714
JH
2879
2880 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2881 to do this again and again, prog->lastparen should take care of
3dd2943c 2882 this! --ilya*/
dafc8851
JH
2883
2884 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2885 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2886 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2887 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2888 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2889 * Meanwhile, this code *is* needed for the
daf18116
JH
2890 * above-mentioned test suite tests to succeed. The common theme
2891 * on those tests seems to be returning null fields from matches.
225593e1 2892 * --jhi updated by dapm */
dafc8851 2893#if 1
d6a28714 2894 if (prog->nparens) {
b93070ed 2895 regexp_paren_pair *pp = prog->offs;
eb578fdb 2896 I32 i;
b93070ed 2897 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2898 ++pp;
2899 pp->start = -1;
2900 pp->end = -1;
d6a28714
JH
2901 }
2902 }
dafc8851 2903#endif
02db2b7b 2904 REGCP_SET(lastcp);
f73aaa43
DM
2905 result = regmatch(reginfo, *startposp, progi->program + 1);
2906 if (result != -1) {
2907 prog->offs[0].end = result;
d6a28714
JH
2908 return 1;
2909 }
24b23f37 2910 if (reginfo->cutpoint)
f73aaa43 2911 *startposp= reginfo->cutpoint;
02db2b7b 2912 REGCP_UNWIND(lastcp);
d6a28714
JH
2913 return 0;
2914}
2915
02db2b7b 2916
8ba1375e
MJD
2917#define sayYES goto yes
2918#define sayNO goto no
262b90c4 2919#define sayNO_SILENT goto no_silent
8ba1375e 2920
f9f4320a
YO
2921/* we dont use STMT_START/END here because it leads to
2922 "unreachable code" warnings, which are bogus, but distracting. */
2923#define CACHEsayNO \
c476f425
DM
2924 if (ST.cache_mask) \
2925 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2926 sayNO
3298f257 2927
a3621e74 2928/* this is used to determine how far from the left messages like
265c4333
YO
2929 'failed...' are printed. It should be set such that messages
2930 are inline with the regop output that created them.
a3621e74 2931*/
265c4333 2932#define REPORT_CODE_OFF 32
a3621e74
YO
2933
2934
40a82448
DM
2935#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2936#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2937#define CHRTEST_NOT_A_CP_1 -999
2938#define CHRTEST_NOT_A_CP_2 -998
9e137952 2939
86545054
DM
2940#define SLAB_FIRST(s) (&(s)->states[0])
2941#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2942
5d9a96ca
DM
2943/* grab a new slab and return the first slot in it */
2944
2945STATIC regmatch_state *
2946S_push_slab(pTHX)
2947{
a35a87e7 2948#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2949 dMY_CXT;
2950#endif
5d9a96ca
DM
2951 regmatch_slab *s = PL_regmatch_slab->next;
2952 if (!s) {
2953 Newx(s, 1, regmatch_slab);
2954 s->prev = PL_regmatch_slab;
2955 s->next = NULL;
2956 PL_regmatch_slab->next = s;
2957 }
2958 PL_regmatch_slab = s;
86545054 2959 return SLAB_FIRST(s);
5d9a96ca 2960}
5b47454d 2961
95b24440 2962
40a82448
DM
2963/* push a new state then goto it */
2964
4d5016e5
DM
2965#define PUSH_STATE_GOTO(state, node, input) \
2966 pushinput = input; \
40a82448
DM
2967 scan = node; \
2968 st->resume_state = state; \
2969 goto push_state;
2970
2971/* push a new state with success backtracking, then goto it */
2972
4d5016e5
DM
2973#define PUSH_YES_STATE_GOTO(state, node, input) \
2974 pushinput = input; \
40a82448
DM
2975 scan = node; \
2976 st->resume_state = state; \
2977 goto push_yes_state;
2978
aa283a38 2979
aa283a38 2980
4d5016e5 2981
d6a28714 2982/*
95b24440 2983
bf1f174e
DM
2984regmatch() - main matching routine
2985
2986This is basically one big switch statement in a loop. We execute an op,
2987set 'next' to point the next op, and continue. If we come to a point which
2988we may need to backtrack to on failure such as (A|B|C), we push a
2989backtrack state onto the backtrack stack. On failure, we pop the top
2990state, and re-enter the loop at the state indicated. If there are no more
2991states to pop, we return failure.
2992
2993Sometimes we also need to backtrack on success; for example /A+/, where
2994after successfully matching one A, we need to go back and try to
2995match another one; similarly for lookahead assertions: if the assertion
2996completes successfully, we backtrack to the state just before the assertion
2997and then carry on. In these cases, the pushed state is marked as
2998'backtrack on success too'. This marking is in fact done by a chain of
2999pointers, each pointing to the previous 'yes' state. On success, we pop to
3000the nearest yes state, discarding any intermediate failure-only states.
3001Sometimes a yes state is pushed just to force some cleanup code to be
3002called at the end of a successful match or submatch; e.g. (??{$re}) uses
3003it to free the inner regex.
3004
3005Note that failure backtracking rewinds the cursor position, while
3006success backtracking leaves it alone.
3007
3008A pattern is complete when the END op is executed, while a subpattern
3009such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3010ops trigger the "pop to last yes state if any, otherwise return true"
3011behaviour.
3012
3013A common convention in this function is to use A and B to refer to the two
3014subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3015the subpattern to be matched possibly multiple times, while B is the entire
3016rest of the pattern. Variable and state names reflect this convention.
3017
3018The states in the main switch are the union of ops and failure/success of
3019substates associated with with that op. For example, IFMATCH is the op
3020that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3021'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3022successfully matched A and IFMATCH_A_fail is a state saying that we have
3023just failed to match A. Resume states always come in pairs. The backtrack
3024state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3025at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3026on success or failure.
3027
3028The struct that holds a backtracking state is actually a big union, with
3029one variant for each major type of op. The variable st points to the
3030top-most backtrack struct. To make the code clearer, within each
3031block of code we #define ST to alias the relevant union.
3032
3033Here's a concrete example of a (vastly oversimplified) IFMATCH
3034implementation:
3035
3036 switch (state) {
3037 ....
3038
3039#define ST st->u.ifmatch
3040
3041 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3042 ST.foo = ...; // some state we wish to save
95b24440 3043 ...
bf1f174e
DM
3044 // push a yes backtrack state with a resume value of
3045 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3046 // first node of A:
4d5016e5 3047 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3048 // NOTREACHED
3049
3050 case IFMATCH_A: // we have successfully executed A; now continue with B
3051 next = B;
3052 bar = ST.foo; // do something with the preserved value
3053 break;
3054
3055 case IFMATCH_A_fail: // A failed, so the assertion failed
3056 ...; // do some housekeeping, then ...
3057 sayNO; // propagate the failure
3058
3059#undef ST
95b24440 3060
bf1f174e
DM
3061 ...
3062 }
95b24440 3063
bf1f174e
DM
3064For any old-timers reading this who are familiar with the old recursive
3065approach, the code above is equivalent to:
95b24440 3066
bf1f174e
DM
3067 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3068 {
3069 int foo = ...
95b24440 3070 ...
bf1f174e
DM
3071 if (regmatch(A)) {
3072 next = B;
3073 bar = foo;
3074 break;
95b24440 3075 }
bf1f174e
DM
3076 ...; // do some housekeeping, then ...
3077 sayNO; // propagate the failure
95b24440 3078 }
bf1f174e
DM
3079
3080The topmost backtrack state, pointed to by st, is usually free. If you
3081want to claim it, populate any ST.foo fields in it with values you wish to
3082save, then do one of
3083
4d5016e5
DM
3084 PUSH_STATE_GOTO(resume_state, node, newinput);
3085 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3086
3087which sets that backtrack state's resume value to 'resume_state', pushes a
3088new free entry to the top of the backtrack stack, then goes to 'node'.
3089On backtracking, the free slot is popped, and the saved state becomes the
3090new free state. An ST.foo field in this new top state can be temporarily
3091accessed to retrieve values, but once the main loop is re-entered, it
3092becomes available for reuse.
3093
3094Note that the depth of the backtrack stack constantly increases during the
3095left-to-right execution of the pattern, rather than going up and down with
3096the pattern nesting. For example the stack is at its maximum at Z at the
3097end of the pattern, rather than at X in the following:
3098
3099 /(((X)+)+)+....(Y)+....Z/
3100
3101The only exceptions to this are lookahead/behind assertions and the cut,
3102(?>A), which pop all the backtrack states associated with A before
3103continuing.
3104
486ec47a 3105Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3106PL_regmatch_state and st always point to the currently active state,
3107and PL_regmatch_slab points to the slab currently containing
3108PL_regmatch_state. The first time regmatch() is called, the first slab is
3109allocated, and is never freed until interpreter destruction. When the slab
3110is full, a new one is allocated and chained to the end. At exit from
3111regmatch(), slabs allocated since entry are freed.
3112
3113*/
95b24440 3114
40a82448 3115
5bc10b2c 3116#define DEBUG_STATE_pp(pp) \
265c4333 3117 DEBUG_STATE_r({ \
f2ed9b32 3118 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3119 PerlIO_printf(Perl_debug_log, \
5d458dd8 3120 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3121 depth*2, "", \
13d6edb4 3122 PL_reg_name[st->resume_state], \
5d458dd8
YO
3123 ((st==yes_state||st==mark_state) ? "[" : ""), \
3124 ((st==yes_state) ? "Y" : ""), \
3125 ((st==mark_state) ? "M" : ""), \
3126 ((st==yes_state||st==mark_state) ? "]" : "") \
3127 ); \
265c4333 3128 });
5bc10b2c 3129
40a82448 3130
3dab1dad 3131#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3132
3df15adc 3133#ifdef DEBUGGING
5bc10b2c 3134
ab3bbdeb 3135STATIC void
f2ed9b32 3136S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3137 const char *start, const char *end, const char *blurb)
3138{
efd26800 3139 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3140
3141 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3142
ab3bbdeb
YO
3143 if (!PL_colorset)
3144 reginitcolors();
3145 {
3146 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3147 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3148
f2ed9b32 3149 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3150 start, end - start, 60);
3151
3152 PerlIO_printf(Perl_debug_log,
3153 "%s%s REx%s %s against %s\n",
3154 PL_colors[4], blurb, PL_colors[5], s0, s1);
3155
f2ed9b32 3156 if (utf8_target||utf8_pat)
1de06328
YO
3157 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3158 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3159 utf8_pat && utf8_target ? " and " : "",
3160 utf8_target ? "string" : ""
ab3bbdeb
YO
3161 );
3162 }
3163}
3df15adc
YO
3164
3165STATIC void
786e8c11
YO
3166S_dump_exec_pos(pTHX_ const char *locinput,
3167 const regnode *scan,
3168 const char *loc_regeol,
3169 const char *loc_bostr,
3170 const char *loc_reg_starttry,
f2ed9b32 3171 const bool utf8_target)
07be1b83 3172{
786e8c11 3173 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3174 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3175 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3176 /* The part of the string before starttry has one color
3177 (pref0_len chars), between starttry and current
3178 position another one (pref_len - pref0_len chars),
3179 after the current position the third one.
3180 We assume that pref0_len <= pref_len, otherwise we
3181 decrease pref0_len. */
786e8c11
YO
3182 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3183 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3184 int pref0_len;
3185
7918f24d
NC
3186 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3187
f2ed9b32 3188 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3189 pref_len++;
786e8c11
YO
3190 pref0_len = pref_len - (locinput - loc_reg_starttry);
3191 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3192 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3193 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3194 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3195 l--;
3196 if (pref0_len < 0)
3197 pref0_len = 0;
3198 if (pref0_len > pref_len)
3199 pref0_len = pref_len;
3200 {
f2ed9b32 3201 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3202
ab3bbdeb 3203 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3204 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3205
ab3bbdeb 3206 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3207 (locinput - pref_len + pref0_len),
1de06328 3208 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3209
ab3bbdeb 3210 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3211 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3212
1de06328 3213 const STRLEN tlen=len0+len1+len2;
3df15adc 3214 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3215 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3216 (IV)(locinput - loc_bostr),
07be1b83 3217 len0, s0,
07be1b83 3218 len1, s1,
07be1b83 3219 (docolor ? "" : "> <"),
07be1b83 3220 len2, s2,
f9f4320a 3221 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3222 "");
3223 }
3224}
3df15adc 3225
07be1b83
YO
3226#endif
3227
0a4db386
YO
3228/* reg_check_named_buff_matched()
3229 * Checks to see if a named buffer has matched. The data array of
3230 * buffer numbers corresponding to the buffer is expected to reside
3231 * in the regexp->data->data array in the slot stored in the ARG() of
3232 * node involved. Note that this routine doesn't actually care about the
3233 * name, that information is not preserved from compilation to execution.
3234 * Returns the index of the leftmost defined buffer with the given name
3235 * or 0 if non of the buffers matched.
3236 */
3237STATIC I32
7918f24d
NC
3238S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3239{
0a4db386 3240 I32 n;
f8fc2ecf 3241 RXi_GET_DECL(rex,rexi);
ad64d0ec 3242 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3243 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3244
3245 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3246
0a4db386 3247 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3248 if ((I32)rex->lastparen >= nums[n] &&
3249 rex->offs[nums[n]].end != -1)
0a4db386
YO
3250 {
3251 return nums[n];
3252 }
3253 }
3254 return 0;
3255}
3256
2f554ef7
DM
3257
3258/* free all slabs above current one - called during LEAVE_SCOPE */
3259
3260STATIC void
3261S_clear_backtrack_stack(pTHX_ void *p)
3262{
3263 regmatch_slab *s = PL_regmatch_slab->next;
3264 PERL_UNUSED_ARG(p);
3265
3266 if (!s)
3267 return;
3268 PL_regmatch_slab->next = NULL;
3269 while (s) {
3270 regmatch_slab * const osl = s;
3271 s = s->next;
3272 Safefree(osl);
3273 }
3274}
c74f6de9 3275static bool
79a2a0e8 3276S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
c74f6de9 3277{
79a2a0e8
KW
3278 /* This function determines if there are one or two characters that match
3279 * the first character of the passed-in EXACTish node <text_node>, and if
3280 * so, returns them in the passed-in pointers.
c74f6de9 3281 *
79a2a0e8
KW
3282 * If it determines that no possible character in the target string can
3283 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3284 * the first character in <text_node> requires UTF-8 to represent, and the
3285 * target string isn't in UTF-8.)
c74f6de9 3286 *
79a2a0e8
KW
3287 * If there are more than two characters that could match the beginning of
3288 * <text_node>, or if more context is required to determine a match or not,
3289 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3290 *
3291 * The motiviation behind this function is to allow the caller to set up
3292 * tight loops for matching. If <text_node> is of type EXACT, there is
3293 * only one possible character that can match its first character, and so
3294 * the situation is quite simple. But things get much more complicated if
3295 * folding is involved. It may be that the first character of an EXACTFish
3296 * node doesn't participate in any possible fold, e.g., punctuation, so it
3297 * can be matched only by itself. The vast majority of characters that are
3298 * in folds match just two things, their lower and upper-case equivalents.
3299 * But not all are like that; some have multiple possible matches, or match
3300 * sequences of more than one character. This function sorts all that out.
3301 *
3302 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3303 * loop of trying to match A*, we know we can't exit where the thing
3304 * following it isn't a B. And something can't be a B unless it is the
3305 * beginning of B. By putting a quick test for that beginning in a tight
3306 * loop, we can rule out things that can't possibly be B without having to
3307 * break out of the loop, thus avoiding work. Similarly, if A is a single
3308 * character, we can make a tight loop matching A*, using the outputs of
3309 * this function.
3310 *
3311 * If the target string to match isn't in UTF-8, and there aren't
3312 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3313 * the one or two possible octets (which are characters in this situation)
3314 * that can match. In all cases, if there is only one character that can
3315 * match, *<c1p> and *<c2p> will be identical.
3316 *
3317 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3318 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3319 * can match the beginning of <text_node>. They should be declared with at
3320 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3321 * undefined what these contain.) If one or both of the buffers are
3322 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3323 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3324 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3325 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3326 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3327
3328 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3329
ddb0d839
KW
3330 UV c1 = CHRTEST_NOT_A_CP_1;
3331 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3332 bool use_chrtest_void = FALSE;
3333
3334 /* Used when we have both utf8 input and utf8 output, to avoid converting
3335 * to/from code points */
3336 bool utf8_has_been_setup = FALSE;
3337
c74f6de9
KW
3338 dVAR;
3339
b4291290 3340 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3341
79a2a0e8
KW
3342 if (OP(text_node) == EXACT) {
3343
3344 /* In an exact node, only one thing can be matched, that first
3345 * character. If both the pat and the target are UTF-8, we can just
3346 * copy the input to the output, avoiding finding the code point of
3347 * that character */
3348 if (! UTF_PATTERN) {
3349 c2 = c1 = *pat;
3350 }
3351 else if (utf8_target) {
3352 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3353 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3354 utf8_has_been_setup = TRUE;
3355 }
3356 else {
3357 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3358 }
79a2a0e8
KW
3359 }
3360 else /* an EXACTFish node */
3361 if ((UTF_PATTERN
3362 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3363 pat + STR_LEN(text_node)))
3364 || (! UTF_PATTERN
3365 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3366 pat + STR_LEN(text_node))))
3367 {
3368 /* Multi-character folds require more context to sort out. Also
3369 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3370 * handled outside this routine */
3371 use_chrtest_void = TRUE;
3372 }
3373 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3374 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3375 if (c1 > 256) {
3376 /* Load the folds hash, if not already done */
3377 SV** listp;
3378 if (! PL_utf8_foldclosures) {
3379 if (! PL_utf8_tofold) {
3380 U8 dummy[UTF8_MAXBYTES+1];
3381
3382 /* Force loading this by folding an above-Latin1 char */
3383 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3384 assert(PL_utf8_tofold); /* Verify that worked */
3385 }
3386 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3387 }
3388
3389 /* The fold closures data structure is a hash with the keys being
3390 * the UTF-8 of every character that is folded to, like 'k', and
3391 * the values each an array of all code points that fold to its
3392 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3393 * not included */
3394 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3395 (char *) pat,
3396 UTF8SKIP(pat),
3397 FALSE))))
3398 {
3399 /* Not found in the hash, therefore there are no folds
3400 * containing it, so there is only a single character that
3401 * could match */
3402 c2 = c1;
3403 }
3404 else { /* Does participate in folds */
3405 AV* list = (AV*) *listp;
3406 if (av_len(list) != 1) {
3407
3408 /* If there aren't exactly two folds to this, it is outside
3409 * the scope of this function */
3410 use_chrtest_void = TRUE;
3411 }
3412 else { /* There are two. Get them */
3413 SV** c_p = av_fetch(list, 0, FALSE);
3414 if (c_p == NULL) {
3415 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3416 }
3417 c1 = SvUV(*c_p);
3418
3419 c_p = av_fetch(list, 1, FALSE);
3420 if (c_p == NULL) {
3421 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3422 }
3423 c2 = SvUV(*c_p);
3424
3425 /* Folds that cross the 255/256 boundary are forbidden if
3426 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3427 * pattern character is above 256, and its only other match
3428 * is below 256, the only legal match will be to itself.
3429 * We have thrown away the original, so have to compute
3430 * which is the one above 255 */
3431 if ((c1 < 256) != (c2 < 256)) {
3432 if (OP(text_node) == EXACTFL
3433 || (OP(text_node) == EXACTFA
3434 && (isASCII(c1) || isASCII(c2))))
3435 {
3436 if (c1 < 256) {
3437 c1 = c2;
3438 }
3439 else {
3440 c2 = c1;
3441 }
3442 }
3443 }
3444 }
3445 }
3446 }
3447 else /* Here, c1 is < 255 */
3448 if (utf8_target
3449 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3450 && OP(text_node) != EXACTFL
3451 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3452 {
3453 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3454 * folds to this character in the pattern. All such cases except
3455 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3456 * involved in their folds, so are outside the scope of this
3457 * function */
3458 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3459 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3460 }
3461 else {
3462 use_chrtest_void = TRUE;
3463 }
c74f6de9
KW
3464 }
3465 else { /* Here nothing above Latin1 can fold to the pattern character */
3466 switch (OP(text_node)) {
3467
3468 case EXACTFL: /* /l rules */
79a2a0e8 3469 c2 = PL_fold_locale[c1];
c74f6de9
KW
3470 break;
3471
3472 case EXACTF:
3473 if (! utf8_target) { /* /d rules */
79a2a0e8 3474 c2 = PL_fold[c1];
c74f6de9
KW
3475 break;
3476 }
3477 /* FALLTHROUGH */
3478 /* /u rules for all these. This happens to work for
79a2a0e8 3479 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3480 case EXACTFA:
3481 case EXACTFU_TRICKYFOLD:
79a2a0e8 3482 case EXACTFU_SS:
c74f6de9 3483 case EXACTFU:
79a2a0e8 3484 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3485 break;
3486
878531d3
KW
3487 default:
3488 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3489 assert(0); /* NOTREACHED */
c74f6de9
KW
3490 }
3491 }
3492 }
79a2a0e8
KW
3493
3494 /* Here have figured things out. Set up the returns */
3495 if (use_chrtest_void) {
3496 *c2p = *c1p = CHRTEST_VOID;
3497 }
3498 else if (utf8_target) {
3499 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3500 uvchr_to_utf8(c1_utf8, c1);
3501 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3502 }
c74f6de9 3503
79a2a0e8
KW
3504 /* Invariants are stored in both the utf8 and byte outputs; Use
3505 * negative numbers otherwise for the byte ones. Make sure that the
3506 * byte ones are the same iff the utf8 ones are the same */
3507 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3509 ? *c2_utf8
3510 : (c1 == c2)
3511 ? CHRTEST_NOT_A_CP_1
3512 : CHRTEST_NOT_A_CP_2;
3513 }
3514 else if (c1 > 255) {
3515 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3516 can represent */
3517 return FALSE;
3518 }
c74f6de9 3519
79a2a0e8
KW
3520 *c1p = *c2p = c2; /* c2 is the only representable value */
3521 }
3522 else { /* c1 is representable; see about c2 */
3523 *c1p = c1;
3524 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3525 }
2f554ef7 3526
c74f6de9
KW
3527 return TRUE;
3528}
2f554ef7 3529
f73aaa43
DM
3530/* returns -1 on failure, $+[0] on success */
3531STATIC I32
3532S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3533{
a35a87e7 3534#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3535 dMY_CXT;
3536#endif
27da23d5 3537 dVAR;
eb578fdb 3538 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3539 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3540 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3541 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3542 RXi_GET_DECL(rex,rexi);
2f554ef7 3543 I32 oldsave;
5d9a96ca 3544 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3545 regmatch_state *st;
5d9a96ca 3546 /* cache heavy used fields of st in registers */
eb578fdb
KW
3547 regnode *scan;
3548 regnode *next;
3549 U32 n = 0; /* general value; init to avoid compiler warning */
3550 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3551 char *locinput = startpos;
4d5016e5 3552 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3553 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3554
b69b0499 3555 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3556 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3557 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558 const U32 max_nochange_depth =
3559 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3561 regmatch_state *yes_state = NULL; /* state to pop to on success of
3562 subpattern */
e2e6a0f1
YO
3563 /* mark_state piggy backs on the yes_state logic so that when we unwind
3564 the stack on success we can update the mark_state as we go */
3565 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3566 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3567 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3568 U32 state_num;
5d458dd8
YO
3569 bool no_final = 0; /* prevent failure from backtracking? */
3570 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3571 char *startpoint = locinput;
5d458dd8
YO
3572 SV *popmark = NULL; /* are we looking for a mark? */
3573 SV *sv_commit = NULL; /* last mark name seen in failure */
3574 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3575 during a successful match */
5d458dd8
YO
3576 U32 lastopen = 0; /* last open we saw */
3577 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3578 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3579 /* these three flags are set by various ops to signal information to
3580 * the very next op. They have a useful lifetime of exactly one loop
3581 * iteration, and are not preserved or restored by state pushes/pops
3582 */
3583 bool sw = 0; /* the condition value in (?(cond)a|b) */
3584 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3585 int logical = 0; /* the following EVAL is:
3586 0: (?{...})
3587 1: (?(?{...})X|Y)
3588 2: (??{...})
3589 or the following IFMATCH/UNLESSM is:
3590 false: plain (?=foo)
3591 true: used as a condition: (?(?=foo))
3592 */
81ed78b2
DM
3593 PAD* last_pad = NULL;
3594 dMULTICALL;
3595 I32 gimme = G_SCALAR;
3596 CV *caller_cv = NULL; /* who called us */
3597 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3598 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
81ed78b2 3599
95b24440 3600#ifdef DEBUGGING
e68ec53f 3601 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3602#endif
3603
81ed78b2
DM
3604 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605 multicall_oldcatch = 0;
3606 multicall_cv = NULL;
3607 cx = NULL;
4f8dbb2d
JL
3608 PERL_UNUSED_VAR(multicall_cop);
3609 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3610
3611
7918f24d
NC
3612 PERL_ARGS_ASSERT_REGMATCH;
3613
3b57cd43 3614 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3615 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3616 }));
5d9a96ca
DM
3617 /* on first ever call to regmatch, allocate first slab */
3618 if (!PL_regmatch_slab) {
3619 Newx(PL_regmatch_slab, 1, regmatch_slab);
3620 PL_regmatch_slab->prev = NULL;
3621 PL_regmatch_slab->next = NULL;
86545054 3622 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3623 }
3624
2f554ef7
DM
3625 oldsave = PL_savestack_ix;
3626 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627 SAVEVPTR(PL_regmatch_slab);
3628 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3629
3630 /* grab next free state slot */
3631 st = ++PL_regmatch_state;
86545054 3632 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3633 st = PL_regmatch_state = S_push_slab(aTHX);
3634
d6a28714 3635 /* Note that nextchr is a byte even in UTF */
7016d6eb 3636 SET_nextchr;
d6a28714
JH
3637 scan = prog;
3638 while (scan != NULL) {
8ba1375e 3639
a3621e74 3640 DEBUG_EXECUTE_r( {
6136c704 3641 SV * const prop = sv_newmortal();
1de06328 3642 regnode *rnext=regnext(scan);
f2ed9b32 3643 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3644 regprop(rex, prop, scan);
07be1b83
YO
3645
3646 PerlIO_printf(Perl_debug_log,
3647 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3648 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3649 SvPVX_const(prop),
1de06328 3650 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3651 0 : (IV)(rnext - rexi->program));
2a782b5b 3652 });
d6a28714
JH
3653
3654 next = scan + NEXT_OFF(scan);
3655 if (next == scan)
3656 next = NULL;
40a82448 3657 state_num = OP(scan);
d6a28714 3658
40a82448 3659 reenter_switch:
34a81e2b 3660
7016d6eb 3661 SET_nextchr;
e6ca698c 3662 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3663
40a82448 3664 switch (state_num) {
3c0563b9 3665 case BOL: /* /^../ */
7fba1cd6 3666 if (locinput == PL_bostr)
d6a28714 3667 {
3b0527fe 3668 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3669 break;
3670 }
d6a28714 3671 sayNO;
3c0563b9
DM
3672
3673 case MBOL: /* /^../m */
12d33761 3674 if (locinput == PL_bostr ||
7016d6eb 3675 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3676 {
b8c5462f
JH
3677 break;
3678 }
d6a28714 3679 sayNO;
3c0563b9
DM
3680
3681 case SBOL: /* /^../s */
c2a73568 3682 if (locinput == PL_bostr)
b8c5462f 3683 break;
d6a28714 3684 sayNO;
3c0563b9
DM
3685
3686 case GPOS: /* \G */
3b0527fe 3687 if (locinput == reginfo->ganch)
d6a28714
JH
3688 break;
3689 sayNO;
ee9b8eae 3690
3c0563b9 3691 case KEEPS: /* \K */
ee9b8eae 3692 /* update the startpoint */
b93070ed 3693 st->u.keeper.val = rex->offs[0].start;
b93070ed 3694 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3695 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3696 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3697 case KEEPS_next_fail:
3698 /* rollback the start point change */
b93070ed 3699 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3700 sayNO_SILENT;
878531d3 3701 assert(0); /*NOTREACHED*/
3c0563b9
DM
3702
3703 case EOL: /* /..$/ */
d6a28714 3704 goto seol;
3c0563b9
DM
3705
3706 case MEOL: /* /..$/m */
7016d6eb 3707 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3708 sayNO;
b8c5462f 3709 break;
3c0563b9
DM
3710
3711 case SEOL: /* /..$/s */
d6a28714 3712 seol:
7016d6eb 3713 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3714 sayNO;
d6a28714 3715 if (PL_regeol - locinput > 1)
b8c5462f 3716 sayNO;
b8c5462f 3717 break;
3c0563b9
DM
3718
3719 case EOS: /* \z */
7016d6eb 3720 if (!NEXTCHR_IS_EOS)
b8c5462f 3721 sayNO;
d6a28714 3722 break;
3c0563b9
DM
3723
3724 case SANY: /* /./s */
7016d6eb 3725 if (NEXTCHR_IS_EOS)
4633a7c4 3726 sayNO;
28b98f76 3727 goto increment_locinput;
3c0563b9
DM
3728
3729 case CANY: /* \C */
7016d6eb 3730 if (NEXTCHR_IS_EOS)
f33976b4 3731 sayNO;
3640db6b 3732 locinput++;
a0d0e21e 3733 break;
3c0563b9
DM
3734
3735 case REG_ANY: /* /./ */
7016d6eb 3736 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3737 sayNO;
28b98f76
DM
3738 goto increment_locinput;
3739
166ba7cd
DM
3740
3741#undef ST
3742#define ST st->u.trie
3c0563b9 3743 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3744 /* In this case the charclass data is available inline so
3745 we can fail fast without a lot of extra overhead.
3746 */
7016d6eb 3747 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3748 DEBUG_EXECUTE_r(
3749 PerlIO_printf(Perl_debug_log,
3750 "%*s %sfailed to match trie start class...%s\n",
3751 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3752 );
3753 sayNO_SILENT;
118e2215 3754 assert(0); /* NOTREACHED */
786e8c11
YO
3755 }
3756 /* FALL THROUGH */
3c0563b9 3757 case TRIE: /* (ab|cd) */
2e64971a
DM
3758 /* the basic plan of execution of the trie is:
3759 * At the beginning, run though all the states, and
3760 * find the longest-matching word. Also remember the position
3761 * of the shortest matching word. For example, this pattern:
3762 * 1 2 3 4 5
3763 * ab|a|x|abcd|abc
3764 * when matched against the string "abcde", will generate
3765 * accept states for all words except 3, with the longest
895cc420 3766 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3767 * the position being after char 1 of the string).
3768 *
3769 * Then for each matching word, in word order (i.e. 1,2,4,5),
3770 * we run the remainder of the pattern; on each try setting
3771 * the current position to the character following the word,
3772 * returning to try the next word on failure.
3773 *
3774 * We avoid having to build a list of words at runtime by
3775 * using a compile-time structure, wordinfo[].prev, which
3776 * gives, for each word, the previous accepting word (if any).
3777 * In the case above it would contain the mappings 1->2, 2->0,
3778 * 3->0, 4->5, 5->1. We can use this table to generate, from
3779 * the longest word (4 above), a list of all words, by
3780 * following the list of prev pointers; this gives us the
3781 * unordered list 4,5,1,2. Then given the current word we have
3782 * just tried, we can go through the list and find the
3783 * next-biggest word to try (so if we just failed on word 2,
3784 * the next in the list is 4).
3785 *
3786 * Since at runtime we don't record the matching position in
3787 * the string for each word, we have to work that out for
3788 * each word we're about to process. The wordinfo table holds
3789 * the character length of each word; given that we recorded
3790 * at the start: the position of the shortest word and its
3791 * length in chars, we just need to move the pointer the
3792 * difference between the two char lengths. Depending on
3793 * Unicode status and folding, that's cheap or expensive.
3794 *
3795 * This algorithm is optimised for the case where are only a
3796 * small number of accept states, i.e. 0,1, or maybe 2.
3797 * With lots of accepts states, and having to try all of them,
3798 * it becomes quadratic on number of accept states to find all
3799 * the next words.
3800 */
3801
3dab1dad 3802 {
07be1b83 3803 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3804 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3805
3806 /* what trie are we using right now */
be8e71aa 3807 reg_trie_data * const trie
f8fc2ecf 3808 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3809 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3810 U32 state = trie->startstate;
166ba7cd 3811
7016d6eb
DM
3812 if ( trie->bitmap
3813 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3814 {
3dab1dad
YO
3815 if (trie->states[ state ].wordnum) {
3816 DEBUG_EXECUTE_r(
3817 PerlIO_printf(Perl_debug_log,
3818 "%*s %smatched empty string...%s\n",
5bc10b2c 3819 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3820 );
20dbff7c
YO
3821 if (!trie->jump)
3822 break;
3dab1dad
YO
3823 } else {
3824 DEBUG_EXECUTE_r(
3825 PerlIO_printf(Perl_debug_log,
786e8c11 3826 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3827 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3828 );
3829 sayNO_SILENT;
3830 }
3831 }
166ba7cd 3832
786e8c11
YO
3833 {
3834 U8 *uc = ( U8* )locinput;
3835
3836 STRLEN len = 0;
3837 STRLEN foldlen = 0;
3838 U8 *uscan = (U8*)NULL;
786e8c11 3839 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3840 U32 charcount = 0; /* how many input chars we have matched */
3841 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3842
786e8c11 3843 ST.jump = trie->jump;
786e8c11 3844 ST.me = scan;
2e64971a
DM
3845 ST.firstpos = NULL;
3846 ST.longfold = FALSE; /* char longer if folded => it's harder */
3847 ST.nextword = 0;
3848
3849 /* fully traverse the TRIE; note the position of the
3850 shortest accept state and the wordnum of the longest
3851 accept state */
07be1b83 3852
a3621e74 3853 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3854 U32 base = trie->states[ state ].trans.base;
f9f4320a 3855 UV uvc = 0;
acb909b4 3856 U16 charid = 0;
2e64971a
DM
3857 U16 wordnum;
3858 wordnum = trie->states[ state ].wordnum;
3859
3860 if (wordnum) { /* it's an accept state */
3861 if (!accepted) {
3862 accepted = 1;
3863 /* record first match position */
3864 if (ST.longfold) {
3865 ST.firstpos = (U8*)locinput;
3866 ST.firstchars = 0;
5b47454d 3867 }
2e64971a
DM
3868 else {
3869 ST.firstpos = uc;
3870 ST.firstchars = charcount;
3871 }
3872 }
3873 if (!ST.nextword || wordnum < ST.nextword)
3874 ST.nextword = wordnum;
3875 ST.topword = wordnum;
786e8c11 3876 }
a3621e74 3877
07be1b83 3878 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3879 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3880 PerlIO_printf( Perl_debug_log,
2e64971a 3881 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3882 2+depth * 2, "", PL_colors[4],
2e64971a 3883 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3884 });
a3621e74 3885
2e64971a 3886 /* read a char and goto next state */
7016d6eb 3887 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3888 I32 offset;
55eed653
NC
3889 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3890 uscan, len, uvc, charid, foldlen,
3891 foldbuf, uniflags);
2e64971a
DM
3892 charcount++;
3893 if (foldlen>0)
3894 ST.longfold = TRUE;
5b47454d 3895 if (charid &&
6dd2be57
DM
3896 ( ((offset =
3897 base + charid - 1 - trie->uniquecharcount)) >= 0)
3898
3899 && ((U32)offset < trie->lasttrans)
3900 && trie->trans[offset].check == state)
5b47454d 3901 {
6dd2be57 3902 state = trie->trans[offset].next;
5b47454d
DM
3903 }
3904 else {
3905 state = 0;
3906 }
3907 uc += len;
3908
3909 }
3910 else {
a3621e74
YO
3911 state = 0;
3912 }
3913 DEBUG_TRIE_EXECUTE_r(
e4584336 3914 PerlIO_printf( Perl_debug_log,
786e8c11 3915 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3916 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3917 );
3918 }
2e64971a 3919 if (!accepted)
a3621e74 3920 sayNO;
a3621e74 3921
2e64971a
DM
3922 /* calculate total number of accept states */
3923 {
3924 U16 w = ST.topword;
3925 accepted = 0;
3926 while (w) {
3927 w = trie->wordinfo[w].prev;
3928 accepted++;
3929 }
3930 ST.accepted = accepted;
3931 }
3932
166ba7cd
DM
3933 DEBUG_EXECUTE_r(
3934 PerlIO_printf( Perl_debug_log,
3935 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3936 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3937 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3938 );
2e64971a 3939 goto trie_first_try; /* jump into the fail handler */
786e8c11 3940 }}
118e2215 3941 assert(0); /* NOTREACHED */
2e64971a
DM
3942
3943 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3944 {
3945 U8 *uc;
fae667d5
YO
3946 if ( ST.jump) {
3947 REGCP_UNWIND(ST.cp);
a8d1f4b4 3948 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3949 }
2e64971a
DM
3950 if (!--ST.accepted) {
3951 DEBUG_EXECUTE_r({
3952 PerlIO_printf( Perl_debug_log,
3953 "%*s %sTRIE failed...%s\n",
3954 REPORT_CODE_OFF+depth*2, "",
3955 PL_colors[4],
3956 PL_colors[5] );
3957 });
3958 sayNO_SILENT;
3959 }
3960 {
3961 /* Find next-highest word to process. Note that this code
3962 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3963 U16 min = 0;
3964 U16 word;
3965 U16 const nextword = ST.nextword;
3966 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3967 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3968 for (word=ST.topword; word; word=wordinfo[word].prev) {
3969 if (word > nextword && (!min || word < min))
3970 min = word;
3971 }
3972 ST.nextword = min;
3973 }
3974
fae667d5 3975 trie_first_try:
5d458dd8
YO
3976 if (do_cutgroup) {
3977 do_cutgroup = 0;
3978 no_final = 0;
3979 }
fae667d5
YO
3980
3981 if ( ST.jump) {
b93070ed 3982 ST.lastparen = rex->lastparen;
f6033a9d 3983 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3984 REGCP_SET(ST.cp);
2e64971a 3985 }
a3621e74 3986
2e64971a 3987 /* find start char of end of current word */
166ba7cd 3988 {
2e64971a 3989 U32 chars; /* how many chars to skip */
2e64971a
DM
3990 reg_trie_data * const trie
3991 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3992
3993 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3994 >= ST.firstchars);
3995 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3996 - ST.firstchars;
a059a757 3997 uc = ST.firstpos;
2e64971a
DM
3998
3999 if (ST.longfold) {
4000 /* the hard option - fold each char in turn and find
4001 * its folded length (which may be different */
4002 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4003 STRLEN foldlen;
4004 STRLEN len;
d9a396a3 4005 UV uvc;
2e64971a
DM
4006 U8 *uscan;
4007
4008 while (chars) {
f2ed9b32 4009 if (utf8_target) {
2e64971a
DM
4010 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4011 uniflags);
4012 uc += len;
4013 }
4014 else {
4015 uvc = *uc;
4016 uc++;
4017 }
4018 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4019 uscan = foldbuf;
4020 while (foldlen) {
4021 if (!--chars)
4022 break;
4023 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4024 uniflags);
4025 uscan += len;
4026 foldlen -= len;
4027 }
4028 }
a3621e74 4029 }
2e64971a 4030 else {
f2ed9b32 4031 if (utf8_target)
2e64971a
DM
4032 while (chars--)
4033 uc += UTF8SKIP(uc);
4034 else
4035 uc += chars;
4036 }
2e64971a 4037 }
166ba7cd 4038
6603fe3e
DM
4039 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4040 ? ST.jump[ST.nextword]
4041 : NEXT_OFF(ST.me));
166ba7cd 4042
2e64971a
DM
4043 DEBUG_EXECUTE_r({
4044 PerlIO_printf( Perl_debug_log,
4045 "%*s %sTRIE matched word #%d, continuing%s\n",
4046 REPORT_CODE_OFF+depth*2, "",
4047 PL_colors[4],
4048 ST.nextword,
4049 PL_colors[5]
4050 );
4051 });
4052
4053 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4054 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4055 assert(0); /* NOTREACHED */
166ba7cd 4056 }
2e64971a
DM
4057 /* only one choice left - just continue */
4058 DEBUG_EXECUTE_r({
4059 AV *const trie_words
4060 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4061 SV ** const tmp = av_fetch( trie_words,
4062 ST.nextword-1, 0 );
4063 SV *sv= tmp ? sv_newmortal() : NULL;
4064
4065 PerlIO_printf( Perl_debug_log,
4066 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4067 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4068 ST.nextword,
4069 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4070 PL_colors[0], PL_colors[1],
c89df6cf 4071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4072 )
4073 : "not compiled under -Dr",
4074 PL_colors[5] );
4075 });
4076
a059a757 4077 locinput = (char*)uc;
2e64971a 4078 continue; /* execute rest of RE */
118e2215 4079 assert(0); /* NOTREACHED */
a059a757 4080 }
166ba7cd
DM
4081#undef ST
4082
3c0563b9 4083 case EXACT: { /* /abc/ */
95b24440 4084 char *s = STRING(scan);
24d3c4a9 4085 ln = STR_LEN(scan);
f2ed9b32 4086 if (utf8_target != UTF_PATTERN) {
bc517b45 4087 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4088 char *l = locinput;
24d3c4a9 4089 const char * const e = s + ln;
a72c7584 4090
f2ed9b32 4091 if (utf8_target) {
e6a3850e
KW
4092 /* The target is utf8, the pattern is not utf8.
4093 * Above-Latin1 code points can't match the pattern;
4094 * invariants match exactly, and the other Latin1 ones need
4095 * to be downgraded to a single byte in order to do the
4096 * comparison. (If we could be confident that the target
4097 * is not malformed, this could be refactored to have fewer
4098 * tests by just assuming that if the first bytes match, it
4099 * is an invariant, but there are tests in the test suite
4100 * dealing with (??{...}) which violate this) */
1aa99e6b
IH
4101 while (s < e) {
4102 if (l >= PL_regeol)
5ff6fc6d 4103 sayNO;
e6a3850e
KW
4104 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4105 sayNO;
4106 }
4107 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4108 if (*l != *s) {
4109 sayNO;
4110 }
4111 l++;
4112 }
4113 else {
4114 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4115 sayNO;
4116 }
4117 l += 2;
4118 }
4119 s++;
1aa99e6b 4120 }
5ff6fc6d
JH
4121 }
4122 else {
4123 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4124 while (s < e) {
e6a3850e
KW
4125 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4126 {
4127 sayNO;
4128 }
4129 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4130 if (*s != *l) {
4131 sayNO;
4132 }
4133 s++;
4134 }
4135 else {
4136 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4137 sayNO;
4138 }
4139 s += 2;
4140 }
4141 l++;
1aa99e6b 4142 }
5ff6fc6d 4143 }
1aa99e6b 4144 locinput = l;
1aa99e6b
IH
4145 break;
4146 }
bc517b45 4147 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
4148 /* Inline the first character, for speed. */
4149 if (UCHARAT(s) != nextchr)
4150 sayNO;
24d3c4a9 4151 if (PL_regeol - locinput < ln)
d6a28714 4152 sayNO;
24d3c4a9 4153 if (ln > 1 && memNE(s, locinput, ln))
d6a28714 4154 sayNO;
24d3c4a9 4155 locinput += ln;
d6a28714 4156 break;
95b24440 4157 }
7016d6eb 4158
3c0563b9 4159 case EXACTFL: { /* /abc/il */
a932d541 4160 re_fold_t folder;
9a5a5549
KW
4161 const U8 * fold_array;
4162 const char * s;
d513472c 4163 U32 fold_utf8_flags;
9a5a5549 4164
b8c5462f 4165 PL_reg_flags |= RF_tainted;
9a5a5549
KW
4166 folder = foldEQ_locale;
4167 fold_array = PL_fold_locale;
17580e7a 4168 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4169 goto do_exactf;
4170
3c0563b9
DM
4171 case EXACTFU_SS: /* /\x{df}/iu */
4172 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4173 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4174 folder = foldEQ_latin1;
4175 fold_array = PL_fold_latin1;
2daa8fee 4176 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4177 goto do_exactf;
4178
3c0563b9 4179 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4180 folder = foldEQ_latin1;
4181 fold_array = PL_fold_latin1;
57014d77 4182 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4183 goto do_exactf;
4184
3c0563b9 4185 case EXACTF: /* /abc/i */
9a5a5549
KW
4186 folder = foldEQ;
4187 fold_array = PL_fold;
62bf7766 4188 fold_utf8_flags = 0;
9a5a5549
KW
4189
4190 do_exactf:
4191 s = STRING(scan);
24d3c4a9 4192 ln = STR_LEN(scan);
d6a28714 4193
3c760661
KW
4194 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4195 /* Either target or the pattern are utf8, or has the issue where
4196 * the fold lengths may differ. */
be8e71aa 4197 const char * const l = locinput;
d07ddd77 4198 char *e = PL_regeol;
bc517b45 4199
d513472c 4200 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
fa5b1667 4201 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4202 {
4203 sayNO;
5486206c 4204 }
d07ddd77 4205 locinput = e;
d07ddd77 4206 break;
a0ed51b3 4207 }
d6a28714 4208
0a138b74 4209 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4210 if (UCHARAT(s) != nextchr
4211 && !NEXTCHR_IS_EOS
4212 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4213 {
a0ed51b3 4214 sayNO;
9a5a5549 4215 }
24d3c4a9 4216 if (PL_regeol - locinput < ln)
b8c5462f 4217 sayNO;
9a5a5549 4218 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4219 sayNO;
24d3c4a9 4220 locinput += ln;
a0d0e21e 4221 break;
9a5a5549 4222 }
63ac0dad
KW
4223
4224 /* XXX Could improve efficiency by separating these all out using a
4225 * macro or in-line function. At that point regcomp.c would no longer
4226 * have to set the FLAGS fields of these */
3c0563b9
DM
4227 case BOUNDL: /* /\b/l */
4228 case NBOUNDL: /* /\B/l */
b2680017
YO
4229 PL_reg_flags |= RF_tainted;
4230 /* FALL THROUGH */
3c0563b9
DM
4231 case BOUND: /* /\b/ */
4232 case BOUNDU: /* /\b/u */
4233 case BOUNDA: /* /\b/a */
4234 case NBOUND: /* /\B/ */
4235 case NBOUNDU: /* /\B/u */
4236 case NBOUNDA: /* /\B/a */
b2680017 4237 /* was last char in word? */
f2e96b5d
KW
4238 if (utf8_target
4239 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4240 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4241 {
b2680017
YO
4242 if (locinput == PL_bostr)
4243 ln = '\n';
4244 else {
4245 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4246
4247 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4248 }
63ac0dad 4249 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
b2680017 4250 ln = isALNUM_uni(ln);
7016d6eb
DM
4251 if (NEXTCHR_IS_EOS)
4252 n = 0;
4253 else {
4254 LOAD_UTF8_CHARCLASS_ALNUM();
4255 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4256 utf8_target);
4257 }
b2680017
YO
4258 }
4259 else {
4260 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
7016d6eb 4261 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
b2680017
YO
4262 }
4263 }
4264 else {
cfaf538b
KW
4265
4266 /* Here the string isn't utf8, or is utf8 and only ascii
4267 * characters are to match \w. In the latter case looking at
4268 * the byte just prior to the current one may be just the final
4269 * byte of a multi-byte character. This is ok. There are two
4270 * cases:
4271 * 1) it is a single byte character, and then the test is doing
4272 * just what it's supposed to.
4273 * 2) it is a multi-byte character, in which case the final
4274 * byte is never mistakable for ASCII, and so the test
4275 * will say it is not a word character, which is the
4276 * correct answer. */
b2680017
YO
4277 ln = (locinput != PL_bostr) ?
4278 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4279 switch (FLAGS(scan)) {
4280 case REGEX_UNICODE_CHARSET:
4281 ln = isWORDCHAR_L1(ln);
7016d6eb 4282 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4283 break;
4284 case REGEX_LOCALE_CHARSET:
4285 ln = isALNUM_LC(ln);
7016d6eb 4286 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
63ac0dad
KW
4287 break;
4288 case REGEX_DEPENDS_CHARSET:
4289 ln = isALNUM(ln);
7016d6eb 4290 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
63ac0dad 4291 break;
cfaf538b 4292 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4293 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4294 ln = isWORDCHAR_A(ln);
7016d6eb 4295 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4296 break;
63ac0dad
KW
4297 default:
4298 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4299 break;
b2680017
YO
4300 }
4301 }
63ac0dad
KW
4302 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4303 * regcomp.sym */
4304 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4305 sayNO;
4306 break;
3c0563b9 4307
3c0563b9 4308 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4309 if (NEXTCHR_IS_EOS)
4310 sayNO;
e0193e47 4311 if (utf8_target) {
635cd5d4 4312 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4313 sayNO;
635cd5d4 4314 locinput += UTF8SKIP(locinput);
e0f9d4a8 4315 break;
ffc61ed2
JH
4316 }
4317 else {
20ed0b26 4318 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4319 sayNO;
3640db6b 4320 locinput++;
e0f9d4a8
JH
4321 break;
4322 }
b8c5462f 4323 break;
3c0563b9
DM
4324
4325 /* Special char classes: \d, \w etc.
4326 * The defines start on line 166 or so */
ee9a90b8
KW
4327 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4328 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329 ALNUMU, NALNUMU, isWORDCHAR_L1,
cfaf538b 4330 ALNUMA, NALNUMA, isWORDCHAR_A,
779d7b58 4331 alnum, "a");
ee9a90b8
KW
4332
4333 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4334 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335 SPACEU, NSPACEU, isSPACE_L1,
cfaf538b 4336 SPACEA, NSPACEA, isSPACE_A,
779d7b58 4337 space, " ");
ee9a90b8
KW
4338
4339 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4340 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
cfaf538b 4341 DIGITA, NDIGITA, isDIGIT_A,
779d7b58 4342 digit, "0");
20d0b1e9 4343
3c0563b9 4344 case POSIXA: /* /[[:ascii:]]/ etc */
7016d6eb 4345 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4346 sayNO;
4347 }
4348 /* Matched a utf8-invariant, so don't have to worry about utf8 */
3640db6b 4349 locinput++;
0658cdde 4350 break;
3c0563b9
DM
4351
4352 case NPOSIXA: /* /[^[:ascii:]]/ etc */
7016d6eb 4353 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
0658cdde
KW
4354 sayNO;
4355 }
28b98f76 4356 goto increment_locinput;
0658cdde 4357
37e2e78e
KW
4358 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4359 a Unicode extended Grapheme Cluster */
4360 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4361 extended Grapheme Cluster is:
4362
4363 CR LF
4364 | Prepend* Begin Extend*
4365 | .
4366
1e958ea9
KW
4367 Begin is: ( Special_Begin | ! Control )
4368 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4369 Extend is: ( Grapheme_Extend | Spacing_Mark )
4370 Control is: [ GCB_Control CR LF ]
4371 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4372
27d4fc33
KW
4373 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4374 we can rewrite
4375
4376 Begin is ( Regular_Begin + Special Begin )
4377
4378 It turns out that 98.4% of all Unicode code points match
4379 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4380 the previous implementation for almost all Unicode code points.
27d4fc33 4381
37e2e78e
KW
4382 There is a subtlety with Prepend* which showed up in testing.
4383 Note that the Begin, and only the Begin is required in:
4384 | Prepend* Begin Extend*
cc3b396d
KW
4385 Also, Begin contains '! Control'. A Prepend must be a
4386 '! Control', which means it must also be a Begin. What it
4387 comes down to is that if we match Prepend* and then find no
4388 suitable Begin afterwards, that if we backtrack the last
4389 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4390 */
4391
7016d6eb 4392 if (NEXTCHR_IS_EOS)
a0ed51b3 4393 sayNO;
f2ed9b32 4394 if (! utf8_target) {
37e2e78e
KW
4395
4396 /* Match either CR LF or '.', as all the other possibilities
4397 * require utf8 */
4398 locinput++; /* Match the . or CR */
cc3b396d
KW
4399 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4400 match the LF */
37e2e78e
KW
4401 && locinput < PL_regeol
4402 && UCHARAT(locinput) == '\n') locinput++;
4403 }
4404 else {
4405
4406 /* Utf8: See if is ( CR LF ); already know that locinput <
4407 * PL_regeol, so locinput+1 is in bounds */
7016d6eb
DM
4408 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409 && UCHARAT(locinput + 1) == '\n')
4410 {
37e2e78e
KW
4411 locinput += 2;
4412 }
4413 else {
45fdf108
KW
4414 STRLEN len;
4415
37e2e78e
KW
4416 /* In case have to backtrack to beginning, then match '.' */
4417 char *starting = locinput;
4418
4419 /* In case have to backtrack the last prepend */
4420 char *previous_prepend = 0;
4421
4422 LOAD_UTF8_CHARCLASS_GCB();
4423
45fdf108
KW
4424 /* Match (prepend)* */
4425 while (locinput < PL_regeol
4426 && (len = is_GCB_Prepend_utf8(locinput)))
4427 {
4428 previous_prepend = locinput;
4429 locinput += len;
a1853d78 4430 }
37e2e78e
KW
4431
4432 /* As noted above, if we matched a prepend character, but
4433 * the next thing won't match, back off the last prepend we
4434 * matched, as it is guaranteed to match the begin */
4435 if (previous_prepend
4436 && (locinput >= PL_regeol
c101f46d
KW
4437 || (! swash_fetch(PL_utf8_X_regular_begin,
4438 (U8*)locinput, utf8_target)
45fdf108 4439 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
c101f46d 4440 )
37e2e78e
KW
4441 {
4442 locinput = previous_prepend;
4443 }
4444
4445 /* Note that here we know PL_regeol > locinput, as we
4446 * tested that upon input to this switch case, and if we
4447 * moved locinput forward, we tested the result just above
4448 * and it either passed, or we backed off so that it will
4449 * now pass */
11dfcd49
KW
4450 if (swash_fetch(PL_utf8_X_regular_begin,
4451 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4452 locinput += UTF8SKIP(locinput);
4453 }
45fdf108 4454 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
37e2e78e
KW
4455
4456 /* Here did not match the required 'Begin' in the
4457 * second term. So just match the very first
4458 * character, the '.' of the final term of the regex */
4459 locinput = starting + UTF8SKIP(starting);
27d4fc33 4460 goto exit_utf8;
37e2e78e
KW
4461 } else {
4462
11dfcd49
KW
4463 /* Here is a special begin. It can be composed of
4464 * several individual characters. One possibility is
4465 * RI+ */
45fdf108
KW
4466 if ((len = is_GCB_RI_utf8(locinput))) {
4467 locinput += len;
11dfcd49 4468 while (locinput < PL_regeol
45fdf108 4469 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4470 {
45fdf108 4471 locinput += len;
11dfcd49 4472 }
45fdf108
KW
4473 } else if ((len = is_GCB_T_utf8(locinput))) {
4474 /* Another possibility is T+ */
4475 locinput += len;
11dfcd49 4476 while (locinput < PL_regeol
45fdf108 4477 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4478 {
45fdf108 4479 locinput += len;
11dfcd49
KW
4480 }
4481 } else {
4482
4483 /* Here, neither RI+ nor T+; must be some other
4484 * Hangul. That means it is one of the others: L,
4485 * LV, LVT or V, and matches:
4486 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4487
4488 /* Match L* */
4489 while (locinput < PL_regeol
45fdf108 4490 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4491 {
45fdf108 4492 locinput += len;
11dfcd49 4493 }
37e2e78e 4494
11dfcd49
KW
4495 /* Here, have exhausted L*. If the next character
4496 * is not an LV, LVT nor V, it means we had to have
4497 * at least one L, so matches L+ in the original
4498 * equation, we have a complete hangul syllable.
4499 * Are done. */
4500
4501 if (locinput < PL_regeol
45fdf108 4502 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49
KW
4503 {
4504
4505 /* Otherwise keep going. Must be LV, LVT or V.
4506 * See if LVT */
4507 if (is_utf8_X_LVT((U8*)locinput)) {
4508 locinput += UTF8SKIP(locinput);
4509 } else {
4510
4511 /* Must be V or LV. Take it, then match
4512 * V* */
4513 locinput += UTF8SKIP(locinput);
4514 while (locinput < PL_regeol
45fdf108 4515 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4516 {
45fdf108 4517 locinput += len;
11dfcd49
KW
4518 }
4519 }
37e2e78e 4520
11dfcd49 4521 /* And any of LV, LVT, or V can be followed
45fdf108 4522 * by T* */
11dfcd49 4523 while (locinput < PL_regeol
45fdf108 4524 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4525 {
45fdf108 4526 locinput += len;
11dfcd49
KW
4527 }
4528 }
cd94d768 4529 }
11dfcd49 4530 }
37e2e78e 4531
11dfcd49
KW
4532 /* Match any extender */
4533 while (locinput < PL_regeol
4534 && swash_fetch(PL_utf8_X_extend,
4535 (U8*)locinput, utf8_target))
4536 {
4537 locinput += UTF8SKIP(locinput);
4538 }
37e2e78e 4539 }
27d4fc33 4540 exit_utf8:
37e2e78e
KW
4541 if (locinput > PL_regeol) sayNO;
4542 }
a0ed51b3 4543 break;
81714fb9 4544
3c0563b9 4545 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4546 { /* The capture buffer cases. The ones beginning with N for the
4547 named buffers just convert to the equivalent numbered and
4548 pretend they were called as the corresponding numbered buffer
4549 op. */
26ecd678
TC
4550 /* don't initialize these in the declaration, it makes C++
4551 unhappy */
81714fb9 4552 char *s;
ff1157ca 4553 char type;
8368298a
TC
4554 re_fold_t folder;
4555 const U8 *fold_array;
26ecd678 4556 UV utf8_fold_flags;
8368298a 4557
81714fb9 4558 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4559 folder = foldEQ_locale;
4560 fold_array = PL_fold_locale;
4561 type = REFFL;
17580e7a 4562 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4563 goto do_nref;
4564
3c0563b9 4565 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4566 folder = foldEQ_latin1;
4567 fold_array = PL_fold_latin1;
4568 type = REFFA;
4569 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4570 goto do_nref;
4571
3c0563b9 4572 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4573 folder = foldEQ_latin1;
4574 fold_array = PL_fold_latin1;
4575 type = REFFU;
d513472c 4576 utf8_fold_flags = 0;
d7ef4b73
KW
4577 goto do_nref;
4578
3c0563b9 4579 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4580 folder = foldEQ;
4581 fold_array = PL_fold;
4582 type = REFF;
d513472c 4583 utf8_fold_flags = 0;
d7ef4b73
KW
4584 goto do_nref;
4585
3c0563b9 4586 case NREF: /* /\g{name}/ */
d7ef4b73 4587 type = REF;
83d7b90b
KW
4588 folder = NULL;
4589 fold_array = NULL;
d513472c 4590 utf8_fold_flags = 0;
d7ef4b73
KW
4591 do_nref:
4592
4593 /* For the named back references, find the corresponding buffer
4594 * number */
0a4db386
YO
4595 n = reg_check_named_buff_matched(rex,scan);
4596
d7ef4b73 4597 if ( ! n ) {
81714fb9 4598 sayNO;
d7ef4b73
KW
4599 }
4600 goto do_nref_ref_common;
4601
3c0563b9 4602 case REFFL: /* /\1/il */
3280af22 4603 PL_reg_flags |= RF_tainted;
d7ef4b73
KW
4604 folder = foldEQ_locale;
4605 fold_array = PL_fold_locale;
17580e7a 4606 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4607 goto do_ref;
4608
3c0563b9 4609 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4610 folder = foldEQ_latin1;
4611 fold_array = PL_fold_latin1;
4612 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4613 goto do_ref;
4614
3c0563b9 4615 case REFFU: /* /\1/iu */
d7ef4b73
KW
4616 folder = foldEQ_latin1;
4617 fold_array = PL_fold_latin1;
d513472c 4618 utf8_fold_flags = 0;
d7ef4b73
KW
4619 goto do_ref;
4620
3c0563b9 4621 case REFF: /* /\1/i */
d7ef4b73
KW
4622 folder = foldEQ;
4623 fold_array = PL_fold;
d513472c 4624 utf8_fold_flags = 0;
83d7b90b 4625 goto do_ref;
d7ef4b73 4626
3c0563b9 4627 case REF: /* /\1/ */
83d7b90b
KW
4628 folder = NULL;
4629 fold_array = NULL;
d513472c 4630 utf8_fold_flags = 0;
83d7b90b 4631
d7ef4b73 4632 do_ref:
81714fb9 4633 type = OP(scan);
d7ef4b73
KW
4634 n = ARG(scan); /* which paren pair */
4635
4636 do_nref_ref_common:
b93070ed 4637 ln = rex->offs[n].start;
2c2d71f5 4638 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4639 if (rex->lastparen < n || ln == -1)
af3f8c16 4640 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4641 if (ln == rex->offs[n].end)
a0d0e21e 4642 break;
a0ed51b3 4643
24d3c4a9 4644 s = PL_bostr + ln;
d7ef4b73 4645 if (type != REF /* REF can do byte comparison */
2f65c56d 4646 && (utf8_target || type == REFFU))
d7ef4b73
KW
4647 { /* XXX handle REFFL better */
4648 char * limit = PL_regeol;
4649
4650 /* This call case insensitively compares the entire buffer
4651 * at s, with the current input starting at locinput, but
4652 * not going off the end given by PL_regeol, and returns in
2db97d41 4653 * <limit> upon success, how much of the current input was
d7ef4b73 4654 * matched */
b93070ed 4655 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4656 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4657 {
4658 sayNO;
a0ed51b3 4659 }
d7ef4b73 4660 locinput = limit;
a0ed51b3
LW
4661 break;
4662 }
4663
d7ef4b73 4664 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4665 if (!NEXTCHR_IS_EOS &&
4666 UCHARAT(s) != nextchr &&
81714fb9 4667 (type == REF ||
d7ef4b73 4668 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4669 sayNO;
b93070ed 4670 ln = rex->offs[n].end - ln;
24d3c4a9 4671 if (locinput + ln > PL_regeol)
4633a7c4 4672 sayNO;
81714fb9 4673 if (ln > 1 && (type == REF
24d3c4a9 4674 ? memNE(s, locinput, ln)
d7ef4b73 4675 : ! folder(s, locinput, ln)))
4633a7c4 4676 sayNO;
24d3c4a9 4677 locinput += ln;
a0d0e21e 4678 break;
81714fb9 4679 }
3c0563b9
DM
4680
4681 case NOTHING: /* null op; e.g. the 'nothing' following
4682 * the '*' in m{(a+|b)*}' */
4683 break;
4684 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4685 break;
3c0563b9
DM
4686
4687 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4688 break;
40a82448
DM
4689
4690#undef ST
4691#define ST st->u.eval
c277df42 4692 {
c277df42 4693 SV *ret;
d2f13c59 4694 REGEXP *re_sv;
6bda09f9 4695 regexp *re;
f8fc2ecf 4696 regexp_internal *rei;
1a147d38
YO
4697 regnode *startpoint;
4698
3c0563b9 4699 case GOSTART: /* (?R) */
e7707071
YO
4700 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4701 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4702 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4703 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4704 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4705 Perl_croak(aTHX_
4706 "Pattern subroutine nesting without pos change"
4707 " exceeded limit in regex");
6bda09f9
YO
4708 } else {
4709 nochange_depth = 0;
1a147d38 4710 }
288b8c02 4711 re_sv = rex_sv;
6bda09f9 4712 re = rex;
f8fc2ecf 4713 rei = rexi;
1a147d38 4714 if (OP(scan)==GOSUB) {
6bda09f9
YO
4715 startpoint = scan + ARG2L(scan);
4716 ST.close_paren = ARG(scan);
4717 } else {
f8fc2ecf 4718 startpoint = rei->program+1;
6bda09f9
YO
4719 ST.close_paren = 0;
4720 }
4721 goto eval_recurse_doit;
118e2215 4722 assert(0); /* NOTREACHED */
3c0563b9 4723
6bda09f9
YO
4724 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4725 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4726 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4727 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4728 } else {
4729 nochange_depth = 0;
4730 }
8e5e9ebe 4731 {
4aabdb9b 4732 /* execute the code in the {...} */
81ed78b2 4733
4aabdb9b 4734 dSP;
81ed78b2 4735 SV ** before;
1f4d1a1e 4736 OP * const oop = PL_op;
4aabdb9b 4737 COP * const ocurcop = PL_curcop;
81ed78b2 4738 OP *nop;
d80618d2 4739 char *saved_regeol = PL_regeol;
91332126 4740 struct re_save_state saved_state;
81ed78b2 4741 CV *newcv;
91332126 4742
74088413
DM
4743 /* save *all* paren positions */
4744 regcppush(rex, 0);
4745 REGCP_SET(runops_cp);
4746
6562f1c4 4747 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4748 * eval we would normally put it on the save stack, like with
4749 * save_re_context. However, re-evals have a weird scoping so we
4750 * can't just add ENTER/LEAVE here. With that, things like
4751 *
4752 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4753 *
4754 * would break, as they expect the localisation to be unwound
4755 * only when the re-engine backtracks through the bit that
4756 * localised it.
4757 *
4758 * What we do instead is just saving the state in a local c
4759 * variable.
4760 */
91332126 4761 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4762
d24ca0c5 4763 PL_reg_state.re_reparsing = FALSE;
91332126 4764
81ed78b2
DM
4765 if (!caller_cv)
4766 caller_cv = find_runcv(NULL);
4767
4aabdb9b 4768 n = ARG(scan);
81ed78b2 4769
b30fcab9 4770 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4771 newcv = (ReANY(
b30fcab9
DM
4772 (REGEXP*)(rexi->data->data[n])
4773 ))->qr_anoncv
81ed78b2
DM
4774 ;
4775 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4776 }
4777 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4778 newcv = caller_cv;
4779 nop = (OP*)rexi->data->data[n];
4780 assert(CvDEPTH(newcv));
68e2671b
DM
4781 }
4782 else {
d24ca0c5
DM
4783 /* literal with own CV */
4784 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4785 newcv = rex->qr_anoncv;
4786 nop = (OP*)rexi->data->data[n];
68e2671b 4787 }
81ed78b2 4788
0e458318
DM
4789 /* normally if we're about to execute code from the same
4790 * CV that we used previously, we just use the existing
4791 * CX stack entry. However, its possible that in the
4792 * meantime we may have backtracked, popped from the save
4793 * stack, and undone the SAVECOMPPAD(s) associated with
4794 * PUSH_MULTICALL; in which case PL_comppad no longer
4795 * points to newcv's pad. */
4796 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4797 {
4798 I32 depth = (newcv == caller_cv) ? 0 : 1;
4799 if (last_pushed_cv) {
4800 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4801 }
4802 else {
4803 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4804 }
4805 last_pushed_cv = newcv;
4806 }
4807 last_pad = PL_comppad;
4808
2e2e3f36
DM
4809 /* the initial nextstate you would normally execute
4810 * at the start of an eval (which would cause error
4811 * messages to come from the eval), may be optimised
4812 * away from the execution path in the regex code blocks;
4813 * so manually set PL_curcop to it initially */
4814 {
81ed78b2 4815 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4816 assert(o->op_type == OP_NULL);
4817 if (o->op_targ == OP_SCOPE) {
4818 o = cUNOPo->op_first;
4819 }
4820 else {
4821 assert(o->op_targ == OP_LEAVE);
4822 o = cUNOPo->op_first;
4823 assert(o->op_type == OP_ENTER);
4824 o = o->op_sibling;
4825 }
4826
4827 if (o->op_type != OP_STUB) {
4828 assert( o->op_type == OP_NEXTSTATE
4829 || o->op_type == OP_DBSTATE
4830 || (o->op_type == OP_NULL
4831 && ( o->op_targ == OP_NEXTSTATE
4832 || o->op_targ == OP_DBSTATE
4833 )
4834 )
4835 );
4836 PL_curcop = (COP*)o;
4837 }
4838 }
81ed78b2 4839 nop = nop->op_next;
2e2e3f36 4840
24b23f37 4841 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4842 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4843
b93070ed 4844 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4845
2bf803e2
YO
4846 if (sv_yes_mark) {
4847 SV *sv_mrk = get_sv("REGMARK", 1);
4848 sv_setsv(sv_mrk, sv_yes_mark);
4849 }
4850
81ed78b2
DM
4851 /* we don't use MULTICALL here as we want to call the
4852 * first op of the block of interest, rather than the
4853 * first op of the sub */
4854 before = SP;
4855 PL_op = nop;
8e5e9ebe
RGS
4856 CALLRUNOPS(aTHX); /* Scalar context. */
4857 SPAGAIN;
4858 if (SP == before)
075aa684 4859 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4860 else {
4861 ret = POPs;
4862 PUTBACK;
4863 }
4aabdb9b 4864
e4bfbed3
DM
4865 /* before restoring everything, evaluate the returned
4866 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4867 * PL_op or pad. Also need to process any magic vars
4868 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4869
4870 PL_op = NULL;
4871
5e98dac2 4872 re_sv = NULL;
e4bfbed3
DM
4873 if (logical == 0) /* (?{})/ */
4874 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4876 sw = cBOOL(SvTRUE(ret));
4877 logical = 0;
4878 }
4879 else { /* /(??{}) */
497d0a96
DM
4880 /* if its overloaded, let the regex compiler handle
4881 * it; otherwise extract regex, or stringify */
4882 if (!SvAMAGIC(ret)) {
4883 SV *sv = ret;
4884 if (SvROK(sv))
4885 sv = SvRV(sv);
4886 if (SvTYPE(sv) == SVt_REGEXP)
4887 re_sv = (REGEXP*) sv;
4888 else if (SvSMAGICAL(sv)) {
4889 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4890 if (mg)
4891 re_sv = (REGEXP *) mg->mg_obj;
4892 }
e4bfbed3 4893
497d0a96
DM
4894 /* force any magic, undef warnings here */
4895 if (!re_sv) {
4896 ret = sv_mortalcopy(ret);
4897 (void) SvPV_force_nolen(ret);
4898 }
e4bfbed3
DM
4899 }
4900
4901 }
4902
91332126
FR
4903 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4904
81ed78b2
DM
4905 /* *** Note that at this point we don't restore
4906 * PL_comppad, (or pop the CxSUB) on the assumption it may
4907 * be used again soon. This is safe as long as nothing
4908 * in the regexp code uses the pad ! */
4aabdb9b 4909 PL_op = oop;
4aabdb9b 4910 PL_curcop = ocurcop;
d80618d2 4911 PL_regeol = saved_regeol;
e4bfbed3
DM
4912 S_regcp_restore(aTHX_ rex, runops_cp);
4913
4914 if (logical != 2)
4aabdb9b 4915 break;
8e5e9ebe 4916 }
e4bfbed3
DM
4917
4918 /* only /(??{})/ from now on */
24d3c4a9 4919 logical = 0;
4aabdb9b 4920 {
4f639d21
DM
4921 /* extract RE object from returned value; compiling if
4922 * necessary */
5c35adbb 4923
575c37f6
DM
4924 if (re_sv) {
4925 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 4926 }
0f5d15d6 4927 else {
c737faaf 4928 U32 pm_flags = 0;
a3b680e6 4929 const I32 osize = PL_regsize;
0f5d15d6 4930
9753d940
DM
4931 if (SvUTF8(ret) && IN_BYTES) {
4932 /* In use 'bytes': make a copy of the octet
4933 * sequence, but without the flag on */
b9ad30b4
NC
4934 STRLEN len;
4935 const char *const p = SvPV(ret, len);
4936 ret = newSVpvn_flags(p, len, SVs_TEMP);
4937 }
732caac7
DM
4938 if (rex->intflags & PREGf_USE_RE_EVAL)
4939 pm_flags |= PMf_USE_RE_EVAL;
4940
4941 /* if we got here, it should be an engine which
4942 * supports compiling code blocks and stuff */
4943 assert(rex->engine && rex->engine->op_comp);
ec841a27 4944 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 4945 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
4946 rex->engine, NULL, NULL,
4947 /* copy /msix etc to inner pattern */
4948 scan->flags,
4949 pm_flags);
732caac7 4950
9041c2e3 4951 if (!(SvFLAGS(ret)
faf82a0b 4952 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 4953 | SVs_GMG))) {
a2794585
NC
4954 /* This isn't a first class regexp. Instead, it's
4955 caching a regexp onto an existing, Perl visible
4956 scalar. */
575c37f6 4957 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 4958 }
0f5d15d6 4959 PL_regsize = osize;
74088413
DM
4960 /* safe to do now that any $1 etc has been
4961 * interpolated into the new pattern string and
4962 * compiled */
4963 S_regcp_restore(aTHX_ rex, runops_cp);
0f5d15d6 4964 }
8d919b0a 4965 re = ReANY(re_sv);
4aabdb9b 4966 }
07bc277f 4967 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
4968 re->subbeg = rex->subbeg;
4969 re->sublen = rex->sublen;
6502e081
DM
4970 re->suboffset = rex->suboffset;
4971 re->subcoffset = rex->subcoffset;
f8fc2ecf 4972 rei = RXi_GET(re);
6bda09f9 4973 DEBUG_EXECUTE_r(
f2ed9b32 4974 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
4975 "Matching embedded");
4976 );
f8fc2ecf 4977 startpoint = rei->program + 1;
1a147d38 4978 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 4979
1a147d38 4980 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 4981 /* run the pattern returned from (??{...}) */
b93070ed 4982 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
40a82448 4983 REGCP_SET(ST.lastcp);
6bda09f9 4984
0357f1fd
ML
4985 re->lastparen = 0;
4986 re->lastcloseparen = 0;
4987
ae0beba1 4988 PL_regsize = 0;
4aabdb9b
DM
4989
4990 /* XXXX This is too dramatic a measure... */
4991 PL_reg_maxiter = 0;
4992
faec1544 4993 ST.toggle_reg_flags = PL_reg_flags;
3c8556c3 4994 if (RX_UTF8(re_sv))
faec1544
DM
4995 PL_reg_flags |= RF_utf8;
4996 else
4997 PL_reg_flags &= ~RF_utf8;
4998 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4999
288b8c02 5000 ST.prev_rex = rex_sv;
faec1544 5001 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5002 rex_sv = re_sv;
5003 SET_reg_curpm(rex_sv);
288b8c02 5004 rex = re;
f8fc2ecf 5005 rexi = rei;
faec1544 5006 cur_curlyx = NULL;
40a82448 5007 ST.B = next;
faec1544
DM
5008 ST.prev_eval = cur_eval;
5009 cur_eval = st;
faec1544 5010 /* now continue from first node in postoned RE */
4d5016e5 5011 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5012 assert(0); /* NOTREACHED */
c277df42 5013 }
40a82448 5014
faec1544
DM
5015 case EVAL_AB: /* cleanup after a successful (??{A})B */
5016 /* note: this is called twice; first after popping B, then A */
5017 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
5018 rex_sv = ST.prev_rex;
5019 SET_reg_curpm(rex_sv);
8d919b0a 5020 rex = ReANY(rex_sv);
f8fc2ecf 5021 rexi = RXi_GET(rex);
faec1544
DM
5022 regcpblow(ST.cp);
5023 cur_eval = ST.prev_eval;
5024 cur_curlyx = ST.prev_curlyx;
34a81e2b 5025
40a82448
DM
5026 /* XXXX This is too dramatic a measure... */
5027 PL_reg_maxiter = 0;
e7707071 5028 if ( nochange_depth )
4b196cd4 5029 nochange_depth--;
262b90c4 5030 sayYES;
40a82448 5031
40a82448 5032
faec1544
DM
5033 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5034 /* note: this is called twice; first after popping B, then A */
5035 PL_reg_flags ^= ST.toggle_reg_flags;
ec43f78b
DM
5036 rex_sv = ST.prev_rex;
5037 SET_reg_curpm(rex_sv);
8d919b0a 5038 rex = ReANY(rex_sv);
f8fc2ecf 5039 rexi = RXi_GET(rex);
0357f1fd 5040
40a82448
DM
5041 REGCP_UNWIND(ST.lastcp);
5042 regcppop(rex);
faec1544
DM
5043 cur_eval = ST.prev_eval;
5044 cur_curlyx = ST.prev_curlyx;
5045 /* XXXX This is too dramatic a measure... */
5046 PL_reg_maxiter = 0;
e7707071 5047 if ( nochange_depth )
4b196cd4 5048 nochange_depth--;
40a82448 5049 sayNO_SILENT;
40a82448
DM
5050#undef ST
5051
3c0563b9 5052 case OPEN: /* ( */
c277df42 5053 n = ARG(scan); /* which paren pair */
1ca2007e 5054 rex->offs[n].start_tmp = locinput - PL_bostr;
3280af22
NIS
5055 if (n > PL_regsize)
5056 PL_regsize = n;
495f47a5
DM
5057 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5058 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5059 PTR2UV(rex),
5060 PTR2UV(rex->offs),
5061 (UV)n,
5062 (IV)rex->offs[n].start_tmp,
5063 (UV)PL_regsize
5064 ));
e2e6a0f1 5065 lastopen = n;
a0d0e21e 5066 break;
495f47a5
DM
5067
5068/* XXX really need to log other places start/end are set too */
5069#define CLOSE_CAPTURE \
5070 rex->offs[n].start = rex->offs[n].start_tmp; \
5071 rex->offs[n].end = locinput - PL_bostr; \
5072 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5073 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5074 PTR2UV(rex), \
5075 PTR2UV(rex->offs), \
5076 (UV)n, \
5077 (IV)rex->offs[n].start, \
5078 (IV)rex->offs[n].end \
5079 ))
5080
3c0563b9 5081 case CLOSE: /* ) */
c277df42 5082 n = ARG(scan); /* which paren pair */
495f47a5 5083 CLOSE_CAPTURE;
7f69552c
YO
5084 /*if (n > PL_regsize)
5085 PL_regsize = n;*/
b93070ed
DM
5086 if (n > rex->lastparen)
5087 rex->lastparen = n;
5088 rex->lastcloseparen = n;
3b6647e0 5089 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5090 goto fake_end;
5091 }
a0d0e21e 5092 break;
3c0563b9
DM
5093
5094 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5095 if (ARG(scan)){
5096 regnode *cursor;
5097 for (cursor=scan;
5098 cursor && OP(cursor)!=END;
5099 cursor=regnext(cursor))
5100 {
5101 if ( OP(cursor)==CLOSE ){
5102 n = ARG(cursor);
5103 if ( n <= lastopen ) {
495f47a5 5104 CLOSE_CAPTURE;
e2e6a0f1
YO
5105 /*if (n > PL_regsize)
5106 PL_regsize = n;*/
b93070ed
DM
5107 if (n > rex->lastparen)
5108 rex->lastparen = n;
5109 rex->lastcloseparen = n;
3b6647e0
RB
5110 if ( n == ARG(scan) || (cur_eval &&
5111 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5112 break;
5113 }
5114 }
5115 }
5116 }
5117 goto fake_end;
5118 /*NOTREACHED*/
3c0563b9
DM
5119
5120 case GROUPP: /* (?(1)) */
c277df42 5121 n = ARG(scan); /* which paren pair */
b93070ed 5122 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5123 break;
3c0563b9
DM
5124
5125 case NGROUPP: /* (?(<name>)) */
0a4db386 5126 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5127 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5128 break;
3c0563b9
DM
5129
5130 case INSUBP: /* (?(R)) */
0a4db386 5131 n = ARG(scan);
3b6647e0 5132 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5133 break;
3c0563b9
DM
5134
5135 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5136 sw = 0;
5137 break;
3c0563b9
DM
5138
5139 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5140 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5141 if (sw)
c277df42
IZ
5142 next = NEXTOPER(NEXTOPER(scan));
5143 else {
5144 next = scan + ARG(scan);
5145 if (OP(next) == IFTHEN) /* Fake one. */
5146 next = NEXTOPER(NEXTOPER(next));
5147 }
5148 break;
3c0563b9
DM
5149
5150 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5151 logical = scan->flags;
c277df42 5152 break;
c476f425 5153
2ab05381 5154/*******************************************************************
2ab05381 5155
c476f425
DM
5156The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5157pattern, where A and B are subpatterns. (For simple A, CURLYM or
5158STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5159
c476f425 5160A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5161
c476f425
DM
5162On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5163state, which contains the current count, initialised to -1. It also sets
5164cur_curlyx to point to this state, with any previous value saved in the
5165state block.
2ab05381 5166
c476f425
DM
5167CURLYX then jumps straight to the WHILEM op, rather than executing A,
5168since the pattern may possibly match zero times (i.e. it's a while {} loop
5169rather than a do {} while loop).
2ab05381 5170
c476f425
DM
5171Each entry to WHILEM represents a successful match of A. The count in the
5172CURLYX block is incremented, another WHILEM state is pushed, and execution
5173passes to A or B depending on greediness and the current count.
2ab05381 5174
c476f425
DM
5175For example, if matching against the string a1a2a3b (where the aN are
5176substrings that match /A/), then the match progresses as follows: (the
5177pushed states are interspersed with the bits of strings matched so far):
2ab05381 5178
c476f425
DM
5179 <CURLYX cnt=-1>
5180 <CURLYX cnt=0><WHILEM>
5181 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5182 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5183 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5184 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5185
c476f425
DM
5186(Contrast this with something like CURLYM, which maintains only a single
5187backtrack state:
2ab05381 5188
c476f425
DM
5189 <CURLYM cnt=0> a1
5190 a1 <CURLYM cnt=1> a2
5191 a1 a2 <CURLYM cnt=2> a3
5192 a1 a2 a3 <CURLYM cnt=3> b
5193)
2ab05381 5194
c476f425
DM
5195Each WHILEM state block marks a point to backtrack to upon partial failure
5196of A or B, and also contains some minor state data related to that
5197iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5198overall state, such as the count, and pointers to the A and B ops.
2ab05381 5199
c476f425
DM
5200This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5201must always point to the *current* CURLYX block, the rules are:
2ab05381 5202
c476f425
DM
5203When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5204and set cur_curlyx to point the new block.
2ab05381 5205
c476f425
DM
5206When popping the CURLYX block after a successful or unsuccessful match,
5207restore the previous cur_curlyx.
2ab05381 5208
c476f425
DM
5209When WHILEM is about to execute B, save the current cur_curlyx, and set it
5210to the outer one saved in the CURLYX block.
2ab05381 5211
c476f425
DM
5212When popping the WHILEM block after a successful or unsuccessful B match,
5213restore the previous cur_curlyx.
2ab05381 5214
c476f425
DM
5215Here's an example for the pattern (AI* BI)*BO
5216I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5217
c476f425
DM
5218cur_
5219curlyx backtrack stack
5220------ ---------------
5221NULL
5222CO <CO prev=NULL> <WO>
5223CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5224CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5225NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5226
c476f425
DM
5227At this point the pattern succeeds, and we work back down the stack to
5228clean up, restoring as we go:
95b24440 5229
c476f425
DM
5230CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5231CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5232CO <CO prev=NULL> <WO>
5233NULL
a0374537 5234
c476f425
DM
5235*******************************************************************/
5236
5237#define ST st->u.curlyx
5238
5239 case CURLYX: /* start of /A*B/ (for complex A) */
5240 {
5241 /* No need to save/restore up to this paren */
5242 I32 parenfloor = scan->flags;
5243
5244 assert(next); /* keep Coverity happy */
5245 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5246 next += ARG(next);
5247
5248 /* XXXX Probably it is better to teach regpush to support
5249 parenfloor > PL_regsize... */
b93070ed
DM
5250 if (parenfloor > (I32)rex->lastparen)
5251 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5252
5253 ST.prev_curlyx= cur_curlyx;
5254 cur_curlyx = st;
5255 ST.cp = PL_savestack_ix;
5256
5257 /* these fields contain the state of the current curly.
5258 * they are accessed by subsequent WHILEMs */
5259 ST.parenfloor = parenfloor;
d02d6d97 5260 ST.me = scan;
c476f425 5261 ST.B = next;
24d3c4a9
DM
5262 ST.minmod = minmod;
5263 minmod = 0;
c476f425
DM
5264 ST.count = -1; /* this will be updated by WHILEM */
5265 ST.lastloc = NULL; /* this will be updated by WHILEM */
5266
4d5016e5 5267 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5268 assert(0); /* NOTREACHED */
c476f425 5269 }
a0d0e21e 5270
c476f425 5271 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5272 cur_curlyx = ST.prev_curlyx;
5273 sayYES;
118e2215 5274 assert(0); /* NOTREACHED */
a0d0e21e 5275
c476f425
DM
5276 case CURLYX_end_fail: /* just failed to match all of A*B */
5277 regcpblow(ST.cp);
5278 cur_curlyx = ST.prev_curlyx;
5279 sayNO;
118e2215 5280 assert(0); /* NOTREACHED */
4633a7c4 5281
a0d0e21e 5282
c476f425
DM
5283#undef ST
5284#define ST st->u.whilem
5285
5286 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5287 {
5288 /* see the discussion above about CURLYX/WHILEM */
c476f425 5289 I32 n;
d02d6d97
DM
5290 int min = ARG1(cur_curlyx->u.curlyx.me);
5291 int max = ARG2(cur_curlyx->u.curlyx.me);
5292 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5293
c476f425
DM
5294 assert(cur_curlyx); /* keep Coverity happy */
5295 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5296 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5297 ST.cache_offset = 0;
5298 ST.cache_mask = 0;
5299
c476f425
DM
5300
5301 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5302 "%*s whilem: matched %ld out of %d..%d\n",
5303 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5304 );
a0d0e21e 5305
c476f425 5306 /* First just match a string of min A's. */
a0d0e21e 5307
d02d6d97 5308 if (n < min) {
b93070ed 5309 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5310 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5311 REGCP_SET(ST.lastcp);
5312
4d5016e5 5313 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5314 assert(0); /* NOTREACHED */
c476f425
DM
5315 }
5316
5317 /* If degenerate A matches "", assume A done. */
5318
5319 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5320 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5321 "%*s whilem: empty match detected, trying continuation...\n",
5322 REPORT_CODE_OFF+depth*2, "")
5323 );
5324 goto do_whilem_B_max;
5325 }
5326
5327 /* super-linear cache processing */
5328
5329 if (scan->flags) {
a0d0e21e 5330
2c2d71f5 5331 if (!PL_reg_maxiter) {
c476f425
DM
5332 /* start the countdown: Postpone detection until we
5333 * know the match is not *that* much linear. */
2c2d71f5 5334 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5335 /* possible overflow for long strings and many CURLYX's */
5336 if (PL_reg_maxiter < 0)
5337 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5338 PL_reg_leftiter = PL_reg_maxiter;
5339 }
c476f425 5340
2c2d71f5 5341 if (PL_reg_leftiter-- == 0) {
c476f425 5342 /* initialise cache */
3298f257 5343 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5344 if (PL_reg_poscache) {
eb160463 5345 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5346 Renew(PL_reg_poscache, size, char);
5347 PL_reg_poscache_size = size;
5348 }
5349 Zero(PL_reg_poscache, size, char);
5350 }
5351 else {
5352 PL_reg_poscache_size = size;
a02a5408 5353 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5354 }
c476f425
DM
5355 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5356 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5357 PL_colors[4], PL_colors[5])
5358 );
2c2d71f5 5359 }
c476f425 5360
2c2d71f5 5361 if (PL_reg_leftiter < 0) {
c476f425
DM
5362 /* have we already failed at this position? */
5363 I32 offset, mask;
5364 offset = (scan->flags & 0xf) - 1
5365 + (locinput - PL_bostr) * (scan->flags>>4);
5366 mask = 1 << (offset % 8);
5367 offset /= 8;
5368 if (PL_reg_poscache[offset] & mask) {
5369 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5370 "%*s whilem: (cache) already tried at this position...\n",
5371 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5372 );
3298f257 5373 sayNO; /* cache records failure */
2c2d71f5 5374 }
c476f425
DM
5375 ST.cache_offset = offset;
5376 ST.cache_mask = mask;
2c2d71f5 5377 }
c476f425 5378 }
2c2d71f5 5379
c476f425 5380 /* Prefer B over A for minimal matching. */
a687059c 5381
c476f425
DM
5382 if (cur_curlyx->u.curlyx.minmod) {
5383 ST.save_curlyx = cur_curlyx;
5384 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
b93070ed 5385 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
c476f425 5386 REGCP_SET(ST.lastcp);
4d5016e5
DM
5387 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5388 locinput);
118e2215 5389 assert(0); /* NOTREACHED */
c476f425 5390 }
a0d0e21e 5391
c476f425
DM
5392 /* Prefer A over B for maximal matching. */
5393
d02d6d97 5394 if (n < max) { /* More greed allowed? */
b93070ed 5395 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425
DM
5396 cur_curlyx->u.curlyx.lastloc = locinput;
5397 REGCP_SET(ST.lastcp);
4d5016e5 5398 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5399 assert(0); /* NOTREACHED */
c476f425
DM
5400 }
5401 goto do_whilem_B_max;
5402 }
118e2215 5403 assert(0); /* NOTREACHED */
c476f425
DM
5404
5405 case WHILEM_B_min: /* just matched B in a minimal match */
5406 case WHILEM_B_max: /* just matched B in a maximal match */
5407 cur_curlyx = ST.save_curlyx;
5408 sayYES;
118e2215 5409 assert(0); /* NOTREACHED */
c476f425
DM
5410
5411 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5412 cur_curlyx = ST.save_curlyx;
5413 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5414 cur_curlyx->u.curlyx.count--;
5415 CACHEsayNO;
118e2215 5416 assert(0); /* NOTREACHED */
c476f425
DM
5417
5418 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5419 /* FALL THROUGH */
5420 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa
YO
5421 REGCP_UNWIND(ST.lastcp);
5422 regcppop(rex);
c476f425
DM
5423 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5424 cur_curlyx->u.curlyx.count--;
5425 CACHEsayNO;
118e2215 5426 assert(0); /* NOTREACHED */
c476f425
DM
5427
5428 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5429 REGCP_UNWIND(ST.lastcp);
5430 regcppop(rex); /* Restore some previous $<digit>s? */
c476f425
DM
5431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5432 "%*s whilem: failed, trying continuation...\n",
5433 REPORT_CODE_OFF+depth*2, "")
5434 );
5435 do_whilem_B_max:
5436 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5437 && ckWARN(WARN_REGEXP)
5438 && !(PL_reg_flags & RF_warned))
5439 {
5440 PL_reg_flags |= RF_warned;
dcbac5bb
FC
5441 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5442 "Complex regular subexpression recursion limit (%d) "
5443 "exceeded",
c476f425
DM
5444 REG_INFTY - 1);
5445 }
5446
5447 /* now try B */
5448 ST.save_curlyx = cur_curlyx;
5449 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5450 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5451 locinput);
118e2215 5452 assert(0); /* NOTREACHED */
c476f425
DM
5453
5454 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5455 cur_curlyx = ST.save_curlyx;
5456 REGCP_UNWIND(ST.lastcp);
5457 regcppop(rex);
5458
d02d6d97 5459 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5460 /* Maximum greed exceeded */
5461 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5462 && ckWARN(WARN_REGEXP)
5463 && !(PL_reg_flags & RF_warned))
5464 {
3280af22 5465 PL_reg_flags |= RF_warned;
c476f425 5466 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5467 "Complex regular subexpression recursion "
5468 "limit (%d) exceeded",
c476f425 5469 REG_INFTY - 1);
a0d0e21e 5470 }
c476f425 5471 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5472 CACHEsayNO;
a0d0e21e 5473 }
c476f425
DM
5474
5475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5476 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5477 );
5478 /* Try grabbing another A and see if it helps. */
c476f425 5479 cur_curlyx->u.curlyx.lastloc = locinput;
b93070ed 5480 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
c476f425 5481 REGCP_SET(ST.lastcp);
d02d6d97 5482 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5483 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5484 locinput);
118e2215 5485 assert(0); /* NOTREACHED */
40a82448
DM
5486
5487#undef ST
5488#define ST st->u.branch
5489
5490 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5491 next = scan + ARG(scan);
5492 if (next == scan)
5493 next = NULL;
40a82448
DM
5494 scan = NEXTOPER(scan);
5495 /* FALL THROUGH */
c277df42 5496
40a82448
DM
5497 case BRANCH: /* /(...|A|...)/ */
5498 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5499 ST.lastparen = rex->lastparen;
f6033a9d 5500 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5501 ST.next_branch = next;
5502 REGCP_SET(ST.cp);
02db2b7b 5503
40a82448 5504 /* Now go into the branch */
5d458dd8 5505 if (has_cutgroup) {
4d5016e5 5506 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5507 } else {
4d5016e5 5508 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5509 }
118e2215 5510 assert(0); /* NOTREACHED */
3c0563b9
DM
5511
5512 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5513 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5514 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5515 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5516 assert(0); /* NOTREACHED */
3c0563b9 5517
5d458dd8
YO
5518 case CUTGROUP_next_fail:
5519 do_cutgroup = 1;
5520 no_final = 1;
5521 if (st->u.mark.mark_name)
5522 sv_commit = st->u.mark.mark_name;
5523 sayNO;
118e2215 5524 assert(0); /* NOTREACHED */
3c0563b9 5525
5d458dd8
YO
5526 case BRANCH_next:
5527 sayYES;
118e2215 5528 assert(0); /* NOTREACHED */
3c0563b9 5529
40a82448 5530 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5531 if (do_cutgroup) {
5532 do_cutgroup = 0;
5533 no_final = 0;
5534 }
40a82448 5535 REGCP_UNWIND(ST.cp);
a8d1f4b4 5536 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5537 scan = ST.next_branch;
5538 /* no more branches? */
5d458dd8
YO
5539 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5540 DEBUG_EXECUTE_r({
5541 PerlIO_printf( Perl_debug_log,
5542 "%*s %sBRANCH failed...%s\n",
5543 REPORT_CODE_OFF+depth*2, "",
5544 PL_colors[4],
5545 PL_colors[5] );
5546 });
5547 sayNO_SILENT;
5548 }
40a82448 5549 continue; /* execute next BRANCH[J] op */
118e2215 5550 assert(0); /* NOTREACHED */
40a82448 5551
3c0563b9 5552 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5553 minmod = 1;
a0d0e21e 5554 break;
40a82448
DM
5555
5556#undef ST
5557#define ST st->u.curlym
5558
5559 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5560
5561 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5562 * only a single backtracking state, no matter how many matches
40a82448
DM
5563 * there are in {m,n}. It relies on the pattern being constant
5564 * length, with no parens to influence future backrefs
5565 */
5566
5567 ST.me = scan;
dc45a647 5568 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5569
f6033a9d
DM
5570 ST.lastparen = rex->lastparen;
5571 ST.lastcloseparen = rex->lastcloseparen;
5572
40a82448
DM
5573 /* if paren positive, emulate an OPEN/CLOSE around A */
5574 if (ST.me->flags) {
3b6647e0 5575 U32 paren = ST.me->flags;
40a82448
DM
5576 if (paren > PL_regsize)
5577 PL_regsize = paren;
c277df42 5578 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5579 }
40a82448
DM
5580 ST.A = scan;
5581 ST.B = next;
5582 ST.alen = 0;
5583 ST.count = 0;
24d3c4a9
DM
5584 ST.minmod = minmod;
5585 minmod = 0;
40a82448
DM
5586 ST.c1 = CHRTEST_UNINIT;
5587 REGCP_SET(ST.cp);
6407bf3b 5588
40a82448
DM
5589 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5590 goto curlym_do_B;
5591
5592 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5593 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5594 assert(0); /* NOTREACHED */
5f80c4cf 5595
40a82448 5596 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5597 ST.count++;
5598 /* after first match, determine A's length: u.curlym.alen */
5599 if (ST.count == 1) {
5600 if (PL_reg_match_utf8) {
c07e9d7b
DM
5601 char *s = st->locinput;
5602 while (s < locinput) {
40a82448
DM
5603 ST.alen++;
5604 s += UTF8SKIP(s);
5605 }
5606 }
5607 else {
c07e9d7b 5608 ST.alen = locinput - st->locinput;
40a82448
DM
5609 }
5610 if (ST.alen == 0)
5611 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5612 }
0cadcf80
DM
5613 DEBUG_EXECUTE_r(
5614 PerlIO_printf(Perl_debug_log,
40a82448 5615 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5616 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5617 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5618 );
5619
0a4db386 5620 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5621 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5622 goto fake_end;
5623
c966426a
DM
5624 {
5625 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5626 if ( max == REG_INFTY || ST.count < max )
5627 goto curlym_do_A; /* try to match another A */
5628 }
40a82448 5629 goto curlym_do_B; /* try to match B */
5f80c4cf 5630
40a82448
DM
5631 case CURLYM_A_fail: /* just failed to match an A */
5632 REGCP_UNWIND(ST.cp);
0a4db386
YO
5633
5634 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5635 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5636 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5637 sayNO;
0cadcf80 5638
40a82448 5639 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5640 if (ST.c1 == CHRTEST_UNINIT) {
5641 /* calculate c1 and c2 for possible match of 1st char
5642 * following curly */
5643 ST.c1 = ST.c2 = CHRTEST_VOID;
5644 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5645 regnode *text_node = ST.B;
5646 if (! HAS_TEXT(text_node))
5647 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5648 /* this used to be
5649
5650 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5651
5652 But the former is redundant in light of the latter.
5653
5654 if this changes back then the macro for
5655 IS_TEXT and friends need to change.
5656 */
c74f6de9 5657 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8
KW
5658 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5659 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
c74f6de9
KW
5660 {
5661 sayNO;
5662 }
c277df42 5663 }
c277df42 5664 }
40a82448
DM
5665 }
5666
5667 DEBUG_EXECUTE_r(
5668 PerlIO_printf(Perl_debug_log,
5669 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5670 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5671 "", (IV)ST.count)
5672 );
c74f6de9 5673 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5674 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5675 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5676 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5677 {
5678 /* simulate B failing */
5679 DEBUG_OPTIMISE_r(
5680 PerlIO_printf(Perl_debug_log,
5681 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5682 (int)(REPORT_CODE_OFF+(depth*2)),"",
5683 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5684 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5685 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5686 );
5687 state_num = CURLYM_B_fail;
5688 goto reenter_switch;
5689 }
5690 }
5691 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5692 /* simulate B failing */
5693 DEBUG_OPTIMISE_r(
5694 PerlIO_printf(Perl_debug_log,
79a2a0e8 5695 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5696 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5697 (int) nextchr, ST.c1, ST.c2)
5698 );
5400f398
KW
5699 state_num = CURLYM_B_fail;
5700 goto reenter_switch;
5701 }
c74f6de9 5702 }
40a82448
DM
5703
5704 if (ST.me->flags) {
f6033a9d 5705 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5706 I32 paren = ST.me->flags;
5707 if (ST.count) {
b93070ed 5708 rex->offs[paren].start
c07e9d7b
DM
5709 = HOPc(locinput, -ST.alen) - PL_bostr;
5710 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5711 if ((U32)paren > rex->lastparen)
5712 rex->lastparen = paren;
5713 rex->lastcloseparen = paren;
c277df42 5714 }
40a82448 5715 else
b93070ed 5716 rex->offs[paren].end = -1;
0a4db386 5717 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5718 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5719 {
5720 if (ST.count)
5721 goto fake_end;
5722 else
5723 sayNO;
5724 }
c277df42 5725 }
0a4db386 5726
4d5016e5 5727 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5728 assert(0); /* NOTREACHED */
40a82448
DM
5729
5730 case CURLYM_B_fail: /* just failed to match a B */
5731 REGCP_UNWIND(ST.cp);
a8d1f4b4 5732 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5733 if (ST.minmod) {
84d2fa14
HS
5734 I32 max = ARG2(ST.me);
5735 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5736 sayNO;
5737 goto curlym_do_A; /* try to match a further A */
5738 }
5739 /* backtrack one A */
5740 if (ST.count == ARG1(ST.me) /* min */)
5741 sayNO;
5742 ST.count--;
7016d6eb 5743 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5744 goto curlym_do_B; /* try to match B */
5745
c255a977
DM
5746#undef ST
5747#define ST st->u.curly
40a82448 5748
c255a977
DM
5749#define CURLY_SETPAREN(paren, success) \
5750 if (paren) { \
5751 if (success) { \
b93070ed
DM
5752 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5753 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5754 if (paren > rex->lastparen) \
5755 rex->lastparen = paren; \
b93070ed 5756 rex->lastcloseparen = paren; \
c255a977 5757 } \
f6033a9d 5758 else { \
b93070ed 5759 rex->offs[paren].end = -1; \
f6033a9d
DM
5760 rex->lastparen = ST.lastparen; \
5761 rex->lastcloseparen = ST.lastcloseparen; \
5762 } \
c255a977
DM
5763 }
5764
b40a2c17 5765 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5766 ST.paren = 0;
5767 ST.min = 0;
5768 ST.max = REG_INFTY;
a0d0e21e
LW
5769 scan = NEXTOPER(scan);
5770 goto repeat;
3c0563b9 5771
b40a2c17 5772 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5773 ST.paren = 0;
5774 ST.min = 1;
5775 ST.max = REG_INFTY;
c277df42 5776 scan = NEXTOPER(scan);
c255a977 5777 goto repeat;
3c0563b9 5778
b40a2c17 5779 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5780 ST.paren = scan->flags; /* Which paren to set */
5781 ST.lastparen = rex->lastparen;
f6033a9d 5782 ST.lastcloseparen = rex->lastcloseparen;
c255a977
DM
5783 if (ST.paren > PL_regsize)
5784 PL_regsize = ST.paren;
c255a977
DM
5785 ST.min = ARG1(scan); /* min to match */
5786 ST.max = ARG2(scan); /* max to match */
0a4db386 5787 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5788 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5789 ST.min=1;
5790 ST.max=1;
5791 }
c255a977
DM
5792 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5793 goto repeat;
3c0563b9 5794
b40a2c17 5795 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5796 ST.paren = 0;
5797 ST.min = ARG1(scan); /* min to match */
5798 ST.max = ARG2(scan); /* max to match */
5799 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5800 repeat:
a0d0e21e
LW
5801 /*
5802 * Lookahead to avoid useless match attempts
5803 * when we know what character comes next.
c255a977 5804 *
5f80c4cf
JP
5805 * Used to only do .*x and .*?x, but now it allows
5806 * for )'s, ('s and (?{ ... })'s to be in the way
5807 * of the quantifier and the EXACT-like node. -- japhy
5808 */
5809
eb5c1be8 5810 assert(ST.min <= ST.max);
3337dfe3
KW
5811 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5812 ST.c1 = ST.c2 = CHRTEST_VOID;
5813 }
5814 else {
5f80c4cf
JP
5815 regnode *text_node = next;
5816
3dab1dad
YO
5817 if (! HAS_TEXT(text_node))
5818 FIND_NEXT_IMPT(text_node);
5f80c4cf 5819
9e137952 5820 if (! HAS_TEXT(text_node))
c255a977 5821 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5822 else {
ee9b8eae 5823 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5824 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5825 }
c74f6de9 5826 else {
ee9b8eae
YO
5827
5828 /* Currently we only get here when
5829
5830 PL_rekind[OP(text_node)] == EXACT
5831
5832 if this changes back then the macro for IS_TEXT and
5833 friends need to change. */
79a2a0e8
KW
5834 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5835 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
c74f6de9
KW
5836 {
5837 sayNO;
5838 }
5839 }
1aa99e6b 5840 }
bbce6d69 5841 }
c255a977
DM
5842
5843 ST.A = scan;
5844 ST.B = next;
24d3c4a9 5845 if (minmod) {
eb72505d 5846 char *li = locinput;
24d3c4a9 5847 minmod = 0;
eb72505d 5848 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
4633a7c4 5849 sayNO;
7016d6eb 5850 SET_locinput(li);
c255a977 5851 ST.count = ST.min;
c255a977
DM
5852 REGCP_SET(ST.cp);
5853 if (ST.c1 == CHRTEST_VOID)
5854 goto curly_try_B_min;
5855
5856 ST.oldloc = locinput;
5857
5858 /* set ST.maxpos to the furthest point along the
5859 * string that could possibly match */
5860 if (ST.max == REG_INFTY) {
5861 ST.maxpos = PL_regeol - 1;
f2ed9b32 5862 if (utf8_target)
c255a977
DM
5863 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5864 ST.maxpos--;
5865 }
f2ed9b32 5866 else if (utf8_target) {
c255a977
DM
5867 int m = ST.max - ST.min;
5868 for (ST.maxpos = locinput;
5869 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5870 ST.maxpos += UTF8SKIP(ST.maxpos);
5871 }
5872 else {
5873 ST.maxpos = locinput + ST.max - ST.min;
5874 if (ST.maxpos >= PL_regeol)
5875 ST.maxpos = PL_regeol - 1;
5876 }
5877 goto curly_try_B_min_known;
5878
5879 }
5880 else {
eb72505d
DM
5881 /* avoid taking address of locinput, so it can remain
5882 * a register var */
5883 char *li = locinput;
5884 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
c255a977
DM
5885 if (ST.count < ST.min)
5886 sayNO;
7016d6eb 5887 SET_locinput(li);
c255a977
DM
5888 if ((ST.count > ST.min)
5889 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5890 {
5891 /* A{m,n} must come at the end of the string, there's
5892 * no point in backing off ... */
5893 ST.min = ST.count;
5894 /* ...except that $ and \Z can match before *and* after
5895 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5896 We may back off by one in this case. */
eb72505d 5897 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
5898 ST.min--;
5899 }
5900 REGCP_SET(ST.cp);
5901 goto curly_try_B_max;
5902 }
118e2215 5903 assert(0); /* NOTREACHED */
c255a977
DM
5904
5905
5906 case CURLY_B_min_known_fail:
5907 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 5908
c255a977 5909 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5910 if (ST.paren) {
5911 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5912 }
c255a977
DM
5913 /* Couldn't or didn't -- move forward. */
5914 ST.oldloc = locinput;
f2ed9b32 5915 if (utf8_target)
c255a977
DM
5916 locinput += UTF8SKIP(locinput);
5917 else
5918 locinput++;
5919 ST.count++;
5920 curly_try_B_min_known:
5921 /* find the next place where 'B' could work, then call B */
5922 {
5923 int n;
f2ed9b32 5924 if (utf8_target) {
c255a977
DM
5925 n = (ST.oldloc == locinput) ? 0 : 1;
5926 if (ST.c1 == ST.c2) {
c255a977 5927 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
5928 while (locinput <= ST.maxpos
5929 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5930 {
5931 locinput += UTF8SKIP(locinput);
c255a977
DM
5932 n++;
5933 }
1aa99e6b
IH
5934 }
5935 else {
c255a977 5936 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
5937 while (locinput <= ST.maxpos
5938 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5939 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5940 {
5941 locinput += UTF8SKIP(locinput);
c255a977 5942 n++;
1aa99e6b 5943 }
0fe9bf95
IZ
5944 }
5945 }
5400f398 5946 else { /* Not utf8_target */
c255a977
DM
5947 if (ST.c1 == ST.c2) {
5948 while (locinput <= ST.maxpos &&
5949 UCHARAT(locinput) != ST.c1)
5950 locinput++;
bbce6d69 5951 }
c255a977
DM
5952 else {
5953 while (locinput <= ST.maxpos
5954 && UCHARAT(locinput) != ST.c1
5955 && UCHARAT(locinput) != ST.c2)
5956 locinput++;
a0ed51b3 5957 }
c255a977
DM
5958 n = locinput - ST.oldloc;
5959 }
5960 if (locinput > ST.maxpos)
5961 sayNO;
c255a977 5962 if (n) {
eb72505d
DM
5963 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5964 * at b; check that everything between oldloc and
5965 * locinput matches */
5966 char *li = ST.oldloc;
c255a977 5967 ST.count += n;
eb72505d 5968 if (regrepeat(rex, &li, ST.A, n, depth) < n)
4633a7c4 5969 sayNO;
eb72505d 5970 assert(n == REG_INFTY || locinput == li);
a0d0e21e 5971 }
c255a977 5972 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 5973 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5974 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5975 goto fake_end;
5976 }
4d5016e5 5977 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 5978 }
118e2215 5979 assert(0); /* NOTREACHED */
c255a977
DM
5980
5981
5982 case CURLY_B_min_fail:
5983 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
5984
5985 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
5986 if (ST.paren) {
5987 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5988 }
c255a977 5989 /* failed -- move forward one */
f73aaa43 5990 {
eb72505d
DM
5991 char *li = locinput;
5992 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
f73aaa43
DM
5993 sayNO;
5994 }
eb72505d 5995 locinput = li;
f73aaa43
DM
5996 }
5997 {
c255a977 5998 ST.count++;
c255a977
DM
5999 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6000 ST.count > 0)) /* count overflow ? */
15272685 6001 {
c255a977
DM
6002 curly_try_B_min:
6003 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6004 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6005 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6006 goto fake_end;
6007 }
4d5016e5 6008 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6009 }
6010 }
c74f6de9 6011 sayNO;
118e2215 6012 assert(0); /* NOTREACHED */
c255a977
DM
6013
6014
6015 curly_try_B_max:
6016 /* a successful greedy match: now try to match B */
40d049e4 6017 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6018 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6019 goto fake_end;
6020 }
c255a977 6021 {
79a2a0e8
KW
6022 bool could_match = locinput < PL_regeol;
6023
c255a977 6024 /* If it could work, try it. */
79a2a0e8
KW
6025 if (ST.c1 != CHRTEST_VOID && could_match) {
6026 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6027 {
6028 could_match = memEQ(locinput,
6029 ST.c1_utf8,
6030 UTF8SKIP(locinput))
6031 || memEQ(locinput,
6032 ST.c2_utf8,
6033 UTF8SKIP(locinput));
6034 }
6035 else {
6036 could_match = UCHARAT(locinput) == ST.c1
6037 || UCHARAT(locinput) == ST.c2;
6038 }
6039 }
6040 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6041 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6042 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6043 assert(0); /* NOTREACHED */
c255a977
DM
6044 }
6045 }
6046 /* FALL THROUGH */
3c0563b9 6047
c255a977
DM
6048 case CURLY_B_max_fail:
6049 /* failed to find B in a greedy match */
c255a977
DM
6050
6051 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6052 if (ST.paren) {
6053 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6054 }
c255a977
DM
6055 /* back up. */
6056 if (--ST.count < ST.min)
6057 sayNO;
eb72505d 6058 locinput = HOPc(locinput, -1);
c255a977
DM
6059 goto curly_try_B_max;
6060
6061#undef ST
6062
3c0563b9 6063 case END: /* last op of main pattern */
6bda09f9 6064 fake_end:
faec1544
DM
6065 if (cur_eval) {
6066 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544
DM
6067 st->u.eval.toggle_reg_flags
6068 = cur_eval->u.eval.toggle_reg_flags;
6069 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6070
288b8c02 6071 st->u.eval.prev_rex = rex_sv; /* inner */
b93070ed 6072 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
ec43f78b
DM
6073 rex_sv = cur_eval->u.eval.prev_rex;
6074 SET_reg_curpm(rex_sv);
8d919b0a 6075 rex = ReANY(rex_sv);
f8fc2ecf 6076 rexi = RXi_GET(rex);
faec1544 6077 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6078
faec1544 6079 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6080
6081 /* Restore parens of the outer rex without popping the
6082 * savestack */
74088413 6083 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
faec1544
DM
6084
6085 st->u.eval.prev_eval = cur_eval;
6086 cur_eval = cur_eval->u.eval.prev_eval;
6087 DEBUG_EXECUTE_r(
2a49f0f5
JH
6088 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6089 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6090 if ( nochange_depth )
6091 nochange_depth--;
6092
4d5016e5
DM
6093 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6094 locinput); /* match B */
faec1544
DM
6095 }
6096
3b0527fe 6097 if (locinput < reginfo->till) {
a3621e74 6098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6099 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6100 PL_colors[4],
6101 (long)(locinput - PL_reg_starttry),
3b0527fe 6102 (long)(reginfo->till - PL_reg_starttry),
7821416a 6103 PL_colors[5]));
58e23c8d 6104
262b90c4 6105 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6106 }
262b90c4 6107 sayYES; /* Success! */
dad79028
DM
6108
6109 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6110 DEBUG_EXECUTE_r(
6111 PerlIO_printf(Perl_debug_log,
6112 "%*s %ssubpattern success...%s\n",
5bc10b2c 6113 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6114 sayYES; /* Success! */
dad79028 6115
40a82448
DM
6116#undef ST
6117#define ST st->u.ifmatch
6118
37f53970
DM
6119 {
6120 char *newstart;
6121
40a82448
DM
6122 case SUSPEND: /* (?>A) */
6123 ST.wanted = 1;
37f53970 6124 newstart = locinput;
9041c2e3 6125 goto do_ifmatch;
dad79028 6126
40a82448
DM
6127 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6128 ST.wanted = 0;
dad79028
DM
6129 goto ifmatch_trivial_fail_test;
6130
40a82448
DM
6131 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6132 ST.wanted = 1;
dad79028 6133 ifmatch_trivial_fail_test:
a0ed51b3 6134 if (scan->flags) {
52657f30 6135 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6136 if (!s) {
6137 /* trivial fail */
24d3c4a9
DM
6138 if (logical) {
6139 logical = 0;
f2338a2e 6140 sw = 1 - cBOOL(ST.wanted);
dad79028 6141 }
40a82448 6142 else if (ST.wanted)
dad79028
DM
6143 sayNO;
6144 next = scan + ARG(scan);
6145 if (next == scan)
6146 next = NULL;
6147 break;
6148 }
37f53970 6149 newstart = s;
a0ed51b3
LW
6150 }
6151 else
37f53970 6152 newstart = locinput;
a0ed51b3 6153
c277df42 6154 do_ifmatch:
40a82448 6155 ST.me = scan;
24d3c4a9 6156 ST.logical = logical;
24d786f4
YO
6157 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6158
40a82448 6159 /* execute body of (?...A) */
37f53970 6160 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6161 assert(0); /* NOTREACHED */
37f53970 6162 }
40a82448
DM
6163
6164 case IFMATCH_A_fail: /* body of (?...A) failed */
6165 ST.wanted = !ST.wanted;
6166 /* FALL THROUGH */
6167
6168 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6169 if (ST.logical) {
f2338a2e 6170 sw = cBOOL(ST.wanted);
40a82448
DM
6171 }
6172 else if (!ST.wanted)
6173 sayNO;
6174
37f53970
DM
6175 if (OP(ST.me) != SUSPEND) {
6176 /* restore old position except for (?>...) */
6177 locinput = st->locinput;
40a82448
DM
6178 }
6179 scan = ST.me + ARG(ST.me);
6180 if (scan == ST.me)
6181 scan = NULL;
6182 continue; /* execute B */
6183
6184#undef ST
dad79028 6185
3c0563b9
DM
6186 case LONGJMP: /* alternative with many branches compiles to
6187 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6188 next = scan + ARG(scan);
6189 if (next == scan)
6190 next = NULL;
a0d0e21e 6191 break;
3c0563b9
DM
6192
6193 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6194 reginfo->cutpoint = PL_regeol;
6195 /* FALLTHROUGH */
3c0563b9
DM
6196
6197 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6198 if (!scan->flags)
ad64d0ec 6199 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6200 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6201 assert(0); /* NOTREACHED */
3c0563b9 6202
54612592
YO
6203 case COMMIT_next_fail:
6204 no_final = 1;
6205 /* FALLTHROUGH */
3c0563b9
DM
6206
6207 case OPFAIL: /* (*FAIL) */
7f69552c 6208 sayNO;
118e2215 6209 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6210
6211#define ST st->u.mark
3c0563b9 6212 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6213 ST.prev_mark = mark_state;
5d458dd8 6214 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6215 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6216 mark_state = st;
4d5016e5
DM
6217 ST.mark_loc = locinput;
6218 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6219 assert(0); /* NOTREACHED */
3c0563b9 6220
e2e6a0f1
YO
6221 case MARKPOINT_next:
6222 mark_state = ST.prev_mark;
6223 sayYES;
118e2215 6224 assert(0); /* NOTREACHED */
3c0563b9 6225
e2e6a0f1 6226 case MARKPOINT_next_fail:
5d458dd8 6227 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6228 {
6229 if (ST.mark_loc > startpoint)
6230 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6231 popmark = NULL; /* we found our mark */
6232 sv_commit = ST.mark_name;
6233
6234 DEBUG_EXECUTE_r({
5d458dd8 6235 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6236 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6237 REPORT_CODE_OFF+depth*2, "",
be2597df 6238 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6239 });
6240 }
6241 mark_state = ST.prev_mark;
5d458dd8
YO
6242 sv_yes_mark = mark_state ?
6243 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6244 sayNO;
118e2215 6245 assert(0); /* NOTREACHED */
3c0563b9
DM
6246
6247 case SKIP: /* (*SKIP) */
5d458dd8 6248 if (scan->flags) {
2bf803e2 6249 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6250 ST.mark_name = NULL;
e2e6a0f1 6251 ST.mark_loc = locinput;
4d5016e5 6252 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6253 } else {
2bf803e2 6254 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6255 otherwise do nothing. Meaning we need to scan
6256 */
6257 regmatch_state *cur = mark_state;
ad64d0ec 6258 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6259
6260 while (cur) {
6261 if ( sv_eq( cur->u.mark.mark_name,
6262 find ) )
6263 {
6264 ST.mark_name = find;
4d5016e5 6265 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6266 }
6267 cur = cur->u.mark.prev_mark;
6268 }
e2e6a0f1 6269 }
2bf803e2 6270 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6271 break;
3c0563b9 6272
5d458dd8
YO
6273 case SKIP_next_fail:
6274 if (ST.mark_name) {
6275 /* (*CUT:NAME) - Set up to search for the name as we
6276 collapse the stack*/
6277 popmark = ST.mark_name;
6278 } else {
6279 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6280 if (ST.mark_loc > startpoint)
6281 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6282 /* but we set sv_commit to latest mark_name if there
6283 is one so they can test to see how things lead to this
6284 cut */
6285 if (mark_state)
6286 sv_commit=mark_state->u.mark.mark_name;
6287 }
e2e6a0f1
YO
6288 no_final = 1;
6289 sayNO;
118e2215 6290 assert(0); /* NOTREACHED */
e2e6a0f1 6291#undef ST
3c0563b9
DM
6292
6293 case LNBREAK: /* \R */
7016d6eb 6294 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6295 locinput += n;
e1d1eefb
YO
6296 } else
6297 sayNO;
6298 break;
6299
6300#define CASE_CLASS(nAmE) \
6301 case nAmE: \
7016d6eb 6302 if (NEXTCHR_IS_EOS) \
1380b51d 6303 sayNO; \
f2ed9b32 6304 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb 6305 locinput += n; \
e1d1eefb
YO
6306 } else \
6307 sayNO; \
6308 break; \
6309 case N##nAmE: \
7016d6eb 6310 if (NEXTCHR_IS_EOS) \
1380b51d 6311 sayNO; \
f2ed9b32 6312 if ((n=is_##nAmE(locinput,utf8_target))) { \
e1d1eefb
YO
6313 sayNO; \
6314 } else { \
6315 locinput += UTF8SKIP(locinput); \
e1d1eefb
YO
6316 } \
6317 break
6318
3c0563b9
DM
6319 CASE_CLASS(VERTWS); /* \v \V */
6320 CASE_CLASS(HORIZWS); /* \h \H */
e1d1eefb
YO
6321#undef CASE_CLASS
6322
a0d0e21e 6323 default:
b900a521 6324 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6325 PTR2UV(scan), OP(scan));
cea2e8a9 6326 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6327
6328 /* this is a point to jump to in order to increment
6329 * locinput by one character */
6330 increment_locinput:
e6ca698c 6331 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6332 if (utf8_target) {
6333 locinput += PL_utf8skip[nextchr];
7016d6eb 6334 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6335 if (locinput > PL_regeol)
6336 sayNO;
28b98f76
DM
6337 }
6338 else
3640db6b 6339 locinput++;
28b98f76 6340 break;
5d458dd8
YO
6341
6342 } /* end switch */
95b24440 6343
5d458dd8
YO
6344 /* switch break jumps here */
6345 scan = next; /* prepare to execute the next op and ... */
6346 continue; /* ... jump back to the top, reusing st */
118e2215 6347 assert(0); /* NOTREACHED */
95b24440 6348
40a82448
DM
6349 push_yes_state:
6350 /* push a state that backtracks on success */
6351 st->u.yes.prev_yes_state = yes_state;
6352 yes_state = st;
6353 /* FALL THROUGH */
6354 push_state:
6355 /* push a new regex state, then continue at scan */
6356 {
6357 regmatch_state *newst;
6358
24b23f37
YO
6359 DEBUG_STACK_r({
6360 regmatch_state *cur = st;
6361 regmatch_state *curyes = yes_state;
6362 int curd = depth;
6363 regmatch_slab *slab = PL_regmatch_slab;
6364 for (;curd > -1;cur--,curd--) {
6365 if (cur < SLAB_FIRST(slab)) {
6366 slab = slab->prev;
6367 cur = SLAB_LAST(slab);
6368 }
6369 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6370 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6371 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6372 (curyes == cur) ? "yes" : ""
6373 );
6374 if (curyes == cur)
6375 curyes = cur->u.yes.prev_yes_state;
6376 }
6377 } else
6378 DEBUG_STATE_pp("push")
6379 );
40a82448 6380 depth++;
40a82448
DM
6381 st->locinput = locinput;
6382 newst = st+1;
6383 if (newst > SLAB_LAST(PL_regmatch_slab))
6384 newst = S_push_slab(aTHX);
6385 PL_regmatch_state = newst;
786e8c11 6386
4d5016e5 6387 locinput = pushinput;
40a82448
DM
6388 st = newst;
6389 continue;
118e2215 6390 assert(0); /* NOTREACHED */
40a82448 6391 }
a0d0e21e 6392 }
a687059c 6393
a0d0e21e
LW
6394 /*
6395 * We get here only if there's trouble -- normally "case END" is
6396 * the terminating point.
6397 */
cea2e8a9 6398 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6399 /*NOTREACHED*/
4633a7c4
LW
6400 sayNO;
6401
262b90c4 6402yes:
77cb431f
DM
6403 if (yes_state) {
6404 /* we have successfully completed a subexpression, but we must now
6405 * pop to the state marked by yes_state and continue from there */
77cb431f 6406 assert(st != yes_state);
5bc10b2c
DM
6407#ifdef DEBUGGING
6408 while (st != yes_state) {
6409 st--;
6410 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6411 PL_regmatch_slab = PL_regmatch_slab->prev;
6412 st = SLAB_LAST(PL_regmatch_slab);
6413 }
e2e6a0f1 6414 DEBUG_STATE_r({
54612592
YO
6415 if (no_final) {
6416 DEBUG_STATE_pp("pop (no final)");
6417 } else {
6418 DEBUG_STATE_pp("pop (yes)");
6419 }
e2e6a0f1 6420 });
5bc10b2c
DM
6421 depth--;
6422 }
6423#else
77cb431f
DM
6424 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6425 || yes_state > SLAB_LAST(PL_regmatch_slab))
6426 {
6427 /* not in this slab, pop slab */
6428 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6429 PL_regmatch_slab = PL_regmatch_slab->prev;
6430 st = SLAB_LAST(PL_regmatch_slab);
6431 }
6432 depth -= (st - yes_state);
5bc10b2c 6433#endif
77cb431f
DM
6434 st = yes_state;
6435 yes_state = st->u.yes.prev_yes_state;
6436 PL_regmatch_state = st;
24b23f37 6437
3640db6b 6438 if (no_final)
5d458dd8 6439 locinput= st->locinput;
54612592 6440 state_num = st->resume_state + no_final;
24d3c4a9 6441 goto reenter_switch;
77cb431f
DM
6442 }
6443
a3621e74 6444 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6445 PL_colors[4], PL_colors[5]));
02db2b7b 6446
ed301438 6447 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6448 /* each successfully executed (?{...}) block does the equivalent of
6449 * local $^R = do {...}
6450 * When popping the save stack, all these locals would be undone;
6451 * bypass this by setting the outermost saved $^R to the latest
6452 * value */
6453 if (oreplsv != GvSV(PL_replgv))
6454 sv_setsv(oreplsv, GvSV(PL_replgv));
6455 }
95b24440 6456 result = 1;
aa283a38 6457 goto final_exit;
4633a7c4
LW
6458
6459no:
a3621e74 6460 DEBUG_EXECUTE_r(
7821416a 6461 PerlIO_printf(Perl_debug_log,
786e8c11 6462 "%*s %sfailed...%s\n",
5bc10b2c 6463 REPORT_CODE_OFF+depth*2, "",
786e8c11 6464 PL_colors[4], PL_colors[5])
7821416a 6465 );
aa283a38 6466
262b90c4 6467no_silent:
54612592
YO
6468 if (no_final) {
6469 if (yes_state) {
6470 goto yes;
6471 } else {
6472 goto final_exit;
6473 }
6474 }
aa283a38
DM
6475 if (depth) {
6476 /* there's a previous state to backtrack to */
40a82448
DM
6477 st--;
6478 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6479 PL_regmatch_slab = PL_regmatch_slab->prev;
6480 st = SLAB_LAST(PL_regmatch_slab);
6481 }
6482 PL_regmatch_state = st;
40a82448 6483 locinput= st->locinput;
40a82448 6484
5bc10b2c
DM
6485 DEBUG_STATE_pp("pop");
6486 depth--;
262b90c4
DM
6487 if (yes_state == st)
6488 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6489
24d3c4a9
DM
6490 state_num = st->resume_state + 1; /* failure = success + 1 */
6491 goto reenter_switch;
95b24440 6492 }
24d3c4a9 6493 result = 0;
aa283a38 6494
262b90c4 6495 final_exit:
bbe252da 6496 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6497 SV *sv_err = get_sv("REGERROR", 1);
6498 SV *sv_mrk = get_sv("REGMARK", 1);
6499 if (result) {
e2e6a0f1 6500 sv_commit = &PL_sv_no;
5d458dd8
YO
6501 if (!sv_yes_mark)
6502 sv_yes_mark = &PL_sv_yes;
6503 } else {
6504 if (!sv_commit)
6505 sv_commit = &PL_sv_yes;
6506 sv_yes_mark = &PL_sv_no;
6507 }
6508 sv_setsv(sv_err, sv_commit);
6509 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6510 }
19b95bf0 6511
81ed78b2
DM
6512
6513 if (last_pushed_cv) {
6514 dSP;
6515 POP_MULTICALL;
4f8dbb2d 6516 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6517 }
6518
2f554ef7
DM
6519 /* clean up; in particular, free all slabs above current one */
6520 LEAVE_SCOPE(oldsave);
5d9a96ca 6521
730f4c74
DM
6522 assert(!result || locinput - PL_bostr >= 0);
6523 return result ? locinput - PL_bostr : -1;
a687059c
LW
6524}
6525
6526/*
6527 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6528 *
e64f369d
KW
6529 * What 'simple' means is a node which can be the operand of a quantifier like
6530 * '+', or {1,3}
6531 *
d60de1d1
DM
6532 * startposp - pointer a pointer to the start position. This is updated
6533 * to point to the byte following the highest successful
6534 * match.
6535 * p - the regnode to be repeatedly matched against.
4063ade8 6536 * max - maximum number of things to match.
d60de1d1 6537 * depth - (for debugging) backtracking depth.
a687059c 6538 */
76e3520e 6539STATIC I32
f73aaa43 6540S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
a687059c 6541{
27da23d5 6542 dVAR;
4063ade8 6543 char *scan; /* Pointer to current position in target string */
eb578fdb 6544 I32 c;
4063ade8
KW
6545 char *loceol = PL_regeol; /* local version */
6546 I32 hardcount = 0; /* How many matches so far */
eb578fdb 6547 bool utf8_target = PL_reg_match_utf8;
d513472c 6548 UV utf8_flags;
4f55667c
SP
6549#ifndef DEBUGGING
6550 PERL_UNUSED_ARG(depth);
6551#endif
a0d0e21e 6552
7918f24d
NC
6553 PERL_ARGS_ASSERT_REGREPEAT;
6554
f73aaa43 6555 scan = *startposp;
faf11cac
HS
6556 if (max == REG_INFTY)
6557 max = I32_MAX;
4063ade8 6558 else if (! utf8_target && scan + max < loceol)
7f596f4c 6559 loceol = scan + max;
4063ade8
KW
6560
6561 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6562 * to the maximum of how far we should go in it (leaving it set to the real
6563 * end, if the maximum permissible would take us beyond that). This allows
6564 * us to make the loop exit condition that we haven't gone past <loceol> to
6565 * also mean that we haven't exceeded the max permissible count, saving a
6566 * test each time through the loop. But it assumes that the OP matches a
6567 * single byte, which is true for most of the OPs below when applied to a
6568 * non-UTF-8 target. Those relatively few OPs that don't have this
6569 * characteristic will have to compensate.
6570 *
6571 * There is no adjustment for UTF-8 targets, as the number of bytes per
6572 * character varies. OPs will have to test both that the count is less
6573 * than the max permissible (using <hardcount> to keep track), and that we
6574 * are still within the bounds of the string (using <loceol>. A few OPs
6575 * match a single byte no matter what the encoding. They can omit the max
6576 * test if, for the UTF-8 case, they do the adjustment that was skipped
6577 * above.
6578 *
6579 * Thus, the code above sets things up for the common case; and exceptional
6580 * cases need extra work; the common case is to make sure <scan> doesn't
6581 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6582 * count doesn't exceed the maximum permissible */
6583
a0d0e21e 6584 switch (OP(p)) {
22c35a8c 6585 case REG_ANY:
f2ed9b32 6586 if (utf8_target) {
1aa99e6b 6587 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6588 scan += UTF8SKIP(scan);
6589 hardcount++;
6590 }
6591 } else {
6592 while (scan < loceol && *scan != '\n')
6593 scan++;
a0ed51b3
LW
6594 }
6595 break;
ffc61ed2 6596 case SANY:
f2ed9b32 6597 if (utf8_target) {
a0804c9e 6598 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6599 scan += UTF8SKIP(scan);
6600 hardcount++;
6601 }
6602 }
6603 else
6604 scan = loceol;
a0ed51b3 6605 break;
4063ade8
KW
6606 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6607 if (utf8_target && scan + max < loceol) {
6608
6609 /* <loceol> hadn't been adjusted in the UTF-8 case */
6610 scan += max;
6611 }
6612 else {
6613 scan = loceol;
6614 }
f33976b4 6615 break;
59d32103 6616 case EXACT:
613a425d
KW
6617 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6618
59d32103 6619 c = (U8)*STRING(p);
59d32103 6620
5e4a1da1
KW
6621 /* Can use a simple loop if the pattern char to match on is invariant
6622 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6623 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6624 * true iff it doesn't matter if the argument is in UTF-8 or not */
6625 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
4063ade8
KW
6626 if (utf8_target && scan + max < loceol) {
6627 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6628 * since here, to match at all, 1 char == 1 byte */
6629 loceol = scan + max;
6630 }
59d32103
KW
6631 while (scan < loceol && UCHARAT(scan) == c) {
6632 scan++;
6633 }
6634 }
b40a2c17 6635 else if (UTF_PATTERN) {
5e4a1da1
KW
6636 if (utf8_target) {
6637 STRLEN scan_char_len;
5e4a1da1 6638
4063ade8 6639 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6640 * string EQ */
6641 while (hardcount < max
6642 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6643 && scan_char_len <= STR_LEN(p)
6644 && memEQ(scan, STRING(p), scan_char_len))
6645 {
6646 scan += scan_char_len;
6647 hardcount++;
6648 }
6649 }
6650 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6651
5e4a1da1
KW
6652 /* Target isn't utf8; convert the character in the UTF-8
6653 * pattern to non-UTF8, and do a simple loop */
6654 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6655 while (scan < loceol && UCHARAT(scan) == c) {
6656 scan++;
6657 }
6658 } /* else pattern char is above Latin1, can't possibly match the
6659 non-UTF-8 target */
b40a2c17 6660 }
5e4a1da1 6661 else {
59d32103 6662
5e4a1da1
KW
6663 /* Here, the string must be utf8; pattern isn't, and <c> is
6664 * different in utf8 than not, so can't compare them directly.
6665 * Outside the loop, find the two utf8 bytes that represent c, and
6666 * then look for those in sequence in the utf8 string */
59d32103
KW
6667 U8 high = UTF8_TWO_BYTE_HI(c);
6668 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6669
6670 while (hardcount < max
6671 && scan + 1 < loceol
6672 && UCHARAT(scan) == high
6673 && UCHARAT(scan + 1) == low)
6674 {
6675 scan += 2;
6676 hardcount++;
6677 }
6678 }
6679 break;
5e4a1da1 6680
2f7f8cb1
KW
6681 case EXACTFA:
6682 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6683 goto do_exactf;
6684
d4e0b827
KW
6685 case EXACTFL:
6686 PL_reg_flags |= RF_tainted;
17580e7a
KW
6687 utf8_flags = FOLDEQ_UTF8_LOCALE;
6688 goto do_exactf;
6689
d4e0b827 6690 case EXACTF:
62bf7766
KW
6691 utf8_flags = 0;
6692 goto do_exactf;
6693
3c760661 6694 case EXACTFU_SS:
fab2782b 6695 case EXACTFU_TRICKYFOLD:
9a5a5549 6696 case EXACTFU:
05f861a2 6697 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6698
613a425d
KW
6699 do_exactf: {
6700 int c1, c2;
6701 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6702
613a425d
KW
6703 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6704
6705 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6706 if (c1 == CHRTEST_VOID) {
49b95fad 6707 /* Use full Unicode fold matching */
4063ade8 6708 char *tmpeol = PL_regeol;
49b95fad
KW
6709 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6710 while (hardcount < max
6711 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6712 STRING(p), NULL, pat_len,
6713 cBOOL(UTF_PATTERN), utf8_flags))
6714 {
6715 scan = tmpeol;
4063ade8 6716 tmpeol = PL_regeol;
49b95fad
KW
6717 hardcount++;
6718 }
613a425d
KW
6719 }
6720 else if (utf8_target) {
6721 if (c1 == c2) {
4063ade8
KW
6722 while (scan < loceol
6723 && hardcount < max
613a425d
KW
6724 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6725 {
6726 scan += UTF8SKIP(scan);
6727 hardcount++;
6728 }
6729 }
6730 else {
4063ade8
KW
6731 while (scan < loceol
6732 && hardcount < max
613a425d
KW
6733 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6734 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6735 {
6736 scan += UTF8SKIP(scan);
6737 hardcount++;
6738 }
6739 }
6740 }
6741 else if (c1 == c2) {
6742 while (scan < loceol && UCHARAT(scan) == c1) {
6743 scan++;
6744 }
6745 }
6746 else {
6747 while (scan < loceol &&
6748 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6749 {
6750 scan++;
6751 }
6752 }
634c83a2 6753 }
bbce6d69 6754 break;
613a425d 6755 }
a0d0e21e 6756 case ANYOF:
e0193e47 6757 if (utf8_target) {
4e8910e0 6758 STRLEN inclasslen;
4e8910e0 6759 while (hardcount < max
635cd5d4
KW
6760 && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6761 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0
KW
6762 {
6763 scan += inclasslen;
ffc61ed2
JH
6764 hardcount++;
6765 }
6766 } else {
32fc9b6a 6767 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6768 scan++;
6769 }
a0d0e21e 6770 break;
980866de 6771 case ALNUMU:
f2ed9b32 6772 if (utf8_target) {
980866de 6773 utf8_wordchar:
1a4fad37 6774 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6775 while (hardcount < max && scan < loceol &&
a12cf05f
KW
6776 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6777 {
ffc61ed2
JH
6778 scan += UTF8SKIP(scan);
6779 hardcount++;
6780 }
980866de 6781 } else {
a12cf05f
KW
6782 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6783 scan++;
6784 }
980866de
KW
6785 }
6786 break;
6787 case ALNUM:
6788 if (utf8_target)
6789 goto utf8_wordchar;
6790 while (scan < loceol && isALNUM((U8) *scan)) {
6791 scan++;
a0ed51b3
LW
6792 }
6793 break;
cfaf538b 6794 case ALNUMA:
4063ade8
KW
6795 if (utf8_target && scan + max < loceol) {
6796
6797 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6798 * since here, to match, 1 char == 1 byte */
6799 loceol = scan + max;
6800 }
cfaf538b
KW
6801 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6802 scan++;
6803 }
6804 break;
bbce6d69 6805 case ALNUML:
3280af22 6806 PL_reg_flags |= RF_tainted;
f2ed9b32 6807 if (utf8_target) {
1aa99e6b
IH
6808 while (hardcount < max && scan < loceol &&
6809 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6810 scan += UTF8SKIP(scan);
6811 hardcount++;
6812 }
6813 } else {
6814 while (scan < loceol && isALNUM_LC(*scan))
6815 scan++;
a0ed51b3
LW
6816 }
6817 break;
980866de 6818 case NALNUMU:
f2ed9b32 6819 if (utf8_target) {
980866de
KW
6820
6821 utf8_Nwordchar:
6822
1a4fad37 6823 LOAD_UTF8_CHARCLASS_ALNUM();
1aa99e6b 6824 while (hardcount < max && scan < loceol &&
980866de 6825 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
a12cf05f 6826 {
ffc61ed2
JH
6827 scan += UTF8SKIP(scan);
6828 hardcount++;
6829 }
980866de 6830 } else {
a12cf05f
KW
6831 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6832 scan++;
6833 }
980866de
KW
6834 }
6835 break;
6836 case NALNUM:
6837 if (utf8_target)
6838 goto utf8_Nwordchar;
6839 while (scan < loceol && ! isALNUM((U8) *scan)) {
6840 scan++;
a0ed51b3
LW
6841 }
6842 break;
0658cdde
KW
6843
6844 case POSIXA:
4063ade8
KW
6845 if (utf8_target && scan + max < loceol) {
6846
6847 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6848 * since here, to match, 1 char == 1 byte */
6849 loceol = scan + max;
6850 }
6851 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
6852 scan++;
6853 }
6854 break;
6855 case NPOSIXA:
6856 if (utf8_target) {
4063ade8
KW
6857 while (scan < loceol && hardcount < max
6858 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6859 {
0658cdde 6860 scan += UTF8SKIP(scan);
4063ade8 6861 hardcount++;
0658cdde
KW
6862 }
6863 }
6864 else {
6865 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6866 scan++;
6867 }
6868 }
6869 break;
cfaf538b
KW
6870 case NALNUMA:
6871 if (utf8_target) {
4063ade8
KW
6872 while (scan < loceol && hardcount < max
6873 && ! isWORDCHAR_A((U8) *scan))
6874 {
cfaf538b 6875 scan += UTF8SKIP(scan);
4063ade8 6876 hardcount++;
cfaf538b
KW
6877 }
6878 }
6879 else {
6880 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6881 scan++;
6882 }
6883 }
6884 break;
bbce6d69 6885 case NALNUML:
3280af22 6886 PL_reg_flags |= RF_tainted;
f2ed9b32 6887 if (utf8_target) {
1aa99e6b
IH
6888 while (hardcount < max && scan < loceol &&
6889 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6890 scan += UTF8SKIP(scan);
6891 hardcount++;
6892 }
6893 } else {
6894 while (scan < loceol && !isALNUM_LC(*scan))
6895 scan++;
a0ed51b3
LW
6896 }
6897 break;
980866de 6898 case SPACEU:
f2ed9b32 6899 if (utf8_target) {
980866de
KW
6900
6901 utf8_space:
6902
1a4fad37 6903 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6904 while (hardcount < max && scan < loceol &&
3568d838 6905 (*scan == ' ' ||
a12cf05f
KW
6906 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6907 {
ffc61ed2
JH
6908 scan += UTF8SKIP(scan);
6909 hardcount++;
6910 }
980866de
KW
6911 break;
6912 }
6913 else {
a12cf05f
KW
6914 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6915 scan++;
6916 }
980866de
KW
6917 break;
6918 }
6919 case SPACE:
6920 if (utf8_target)
6921 goto utf8_space;
6922
6923 while (scan < loceol && isSPACE((U8) *scan)) {
6924 scan++;
a0ed51b3
LW
6925 }
6926 break;
cfaf538b 6927 case SPACEA:
4063ade8
KW
6928 if (utf8_target && scan + max < loceol) {
6929
6930 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6931 * since here, to match, 1 char == 1 byte */
6932 loceol = scan + max;
6933 }
cfaf538b
KW
6934 while (scan < loceol && isSPACE_A((U8) *scan)) {
6935 scan++;
6936 }
6937 break;
bbce6d69 6938 case SPACEL:
3280af22 6939 PL_reg_flags |= RF_tainted;
f2ed9b32 6940 if (utf8_target) {
1aa99e6b 6941 while (hardcount < max && scan < loceol &&
6bbba904 6942 isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
6943 scan += UTF8SKIP(scan);
6944 hardcount++;
6945 }
6946 } else {
6947 while (scan < loceol && isSPACE_LC(*scan))
6948 scan++;
a0ed51b3
LW
6949 }
6950 break;
980866de 6951 case NSPACEU:
f2ed9b32 6952 if (utf8_target) {
980866de
KW
6953
6954 utf8_Nspace:
6955
1a4fad37 6956 LOAD_UTF8_CHARCLASS_SPACE();
1aa99e6b 6957 while (hardcount < max && scan < loceol &&
980866de
KW
6958 ! (*scan == ' ' ||
6959 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
a12cf05f 6960 {
ffc61ed2
JH
6961 scan += UTF8SKIP(scan);
6962 hardcount++;
6963 }
980866de
KW
6964 break;
6965 }
6966 else {
a12cf05f
KW
6967 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6968 scan++;
6969 }
980866de
KW
6970 }
6971 break;
6972 case NSPACE:
6973 if (utf8_target)
6974 goto utf8_Nspace;
6975
6976 while (scan < loceol && ! isSPACE((U8) *scan)) {
6977 scan++;
a0ed51b3 6978 }
0008a298 6979 break;
cfaf538b
KW
6980 case NSPACEA:
6981 if (utf8_target) {
4063ade8
KW
6982 while (hardcount < max && scan < loceol
6983 && ! isSPACE_A((U8) *scan))
6984 {
cfaf538b 6985 scan += UTF8SKIP(scan);
4063ade8 6986 hardcount++;
cfaf538b
KW
6987 }
6988 }
6989 else {
6990 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6991 scan++;
6992 }
6993 }
6994 break;
bbce6d69 6995 case NSPACEL:
3280af22 6996 PL_reg_flags |= RF_tainted;
f2ed9b32 6997 if (utf8_target) {
1aa99e6b 6998 while (hardcount < max && scan < loceol &&
6bbba904 6999 !isSPACE_LC_utf8((U8*)scan)) {
ffc61ed2
JH
7000 scan += UTF8SKIP(scan);
7001 hardcount++;
7002 }
7003 } else {
7004 while (scan < loceol && !isSPACE_LC(*scan))
7005 scan++;
a0ed51b3
LW
7006 }
7007 break;
a0d0e21e 7008 case DIGIT:
f2ed9b32 7009 if (utf8_target) {
1a4fad37 7010 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 7011 while (hardcount < max && scan < loceol &&
f2ed9b32 7012 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
7013 scan += UTF8SKIP(scan);
7014 hardcount++;
7015 }
7016 } else {
7017 while (scan < loceol && isDIGIT(*scan))
7018 scan++;
a0ed51b3
LW
7019 }
7020 break;
cfaf538b 7021 case DIGITA:
4063ade8
KW
7022 if (utf8_target && scan + max < loceol) {
7023
7024 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7025 * since here, to match, 1 char == 1 byte */
7026 loceol = scan + max;
7027 }
cfaf538b
KW
7028 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7029 scan++;
7030 }
7031 break;
b77393f6
KW
7032 case DIGITL:
7033 PL_reg_flags |= RF_tainted;
7034 if (utf8_target) {
b77393f6
KW
7035 while (hardcount < max && scan < loceol &&
7036 isDIGIT_LC_utf8((U8*)scan)) {
7037 scan += UTF8SKIP(scan);
7038 hardcount++;
7039 }
7040 } else {
7041 while (scan < loceol && isDIGIT_LC(*scan))
7042 scan++;
7043 }
7044 break;
a0d0e21e 7045 case NDIGIT:
f2ed9b32 7046 if (utf8_target) {
1a4fad37 7047 LOAD_UTF8_CHARCLASS_DIGIT();
1aa99e6b 7048 while (hardcount < max && scan < loceol &&
f2ed9b32 7049 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
ffc61ed2
JH
7050 scan += UTF8SKIP(scan);
7051 hardcount++;
7052 }
7053 } else {
7054 while (scan < loceol && !isDIGIT(*scan))
7055 scan++;
a0ed51b3 7056 }
cfaf538b
KW
7057 break;
7058 case NDIGITA:
7059 if (utf8_target) {
4063ade8
KW
7060 while (hardcount < max && scan < loceol
7061 && ! isDIGIT_A((U8) *scan)) {
cfaf538b 7062 scan += UTF8SKIP(scan);
4063ade8 7063 hardcount++;
cfaf538b
KW
7064 }
7065 }
7066 else {
7067 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7068 scan++;
7069 }
7070 }
7071 break;
b77393f6
KW
7072 case NDIGITL:
7073 PL_reg_flags |= RF_tainted;
7074 if (utf8_target) {
b77393f6
KW
7075 while (hardcount < max && scan < loceol &&
7076 !isDIGIT_LC_utf8((U8*)scan)) {
7077 scan += UTF8SKIP(scan);
7078 hardcount++;
7079 }
7080 } else {
7081 while (scan < loceol && !isDIGIT_LC(*scan))
7082 scan++;
7083 }
7084 break;
e1d1eefb 7085 case LNBREAK:
e64f369d
KW
7086 if (utf8_target) {
7087 while (hardcount < max && scan < loceol &&
7088 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7089 scan += c;
7090 hardcount++;
7091 }
7092 } else {
7093 /* LNBREAK can match one or two latin chars, which is ok, but we
7094 * have to use hardcount in this situation, and throw away the
7095 * adjustment to <loceol> done before the switch statement */
7096 loceol = PL_regeol;
7097 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7098 scan+=c;
7099 hardcount++;
7100 }
7101 }
7102 break;
e1d1eefb 7103 case HORIZWS:
f2ed9b32 7104 if (utf8_target) {
7016d6eb
DM
7105 while (hardcount < max && scan < loceol &&
7106 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7107 {
e1d1eefb
YO
7108 scan += c;
7109 hardcount++;
7110 }
7111 } else {
7016d6eb 7112 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7113 scan++;
7114 }
a0ed51b3 7115 break;
e1d1eefb 7116 case NHORIZWS:
f2ed9b32 7117 if (utf8_target) {
7016d6eb
DM
7118 while (hardcount < max && scan < loceol &&
7119 !is_HORIZWS_utf8_safe(scan, loceol))
7120 {
e1d1eefb
YO
7121 scan += UTF8SKIP(scan);
7122 hardcount++;
7123 }
7124 } else {
7016d6eb 7125 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7126 scan++;
7127
7128 }
7129 break;
7130 case VERTWS:
f2ed9b32 7131 if (utf8_target) {
7016d6eb
DM
7132 while (hardcount < max && scan < loceol &&
7133 (c=is_VERTWS_utf8_safe(scan, loceol)))
7134 {
e1d1eefb
YO
7135 scan += c;
7136 hardcount++;
7137 }
7138 } else {
7016d6eb 7139 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7140 scan++;
7141
7142 }
7143 break;
7144 case NVERTWS:
f2ed9b32 7145 if (utf8_target) {
7016d6eb
DM
7146 while (hardcount < max && scan < loceol &&
7147 !is_VERTWS_utf8_safe(scan, loceol))
7148 {
e1d1eefb
YO
7149 scan += UTF8SKIP(scan);
7150 hardcount++;
7151 }
7152 } else {
7016d6eb 7153 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
e1d1eefb
YO
7154 scan++;
7155
7156 }
7157 break;
7158
584b1f02
KW
7159 case BOUND:
7160 case BOUNDA:
7161 case BOUNDL:
7162 case BOUNDU:
7163 case EOS:
7164 case GPOS:
7165 case KEEPS:
7166 case NBOUND:
7167 case NBOUNDA:
7168 case NBOUNDL:
7169 case NBOUNDU:
7170 case OPFAIL:
7171 case SBOL:
7172 case SEOL:
7173 /* These are all 0 width, so match right here or not at all. */
7174 break;
7175
7176 default:
7177 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7178 assert(0); /* NOTREACHED */
7179
a0d0e21e 7180 }
a687059c 7181
a0ed51b3
LW
7182 if (hardcount)
7183 c = hardcount;
7184 else
f73aaa43
DM
7185 c = scan - *startposp;
7186 *startposp = scan;
a687059c 7187
a3621e74 7188 DEBUG_r({
e68ec53f 7189 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7190 DEBUG_EXECUTE_r({
e68ec53f
YO
7191 SV * const prop = sv_newmortal();
7192 regprop(prog, prop, p);
7193 PerlIO_printf(Perl_debug_log,
be8e71aa 7194 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7195 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7196 });
be8e71aa 7197 });
9041c2e3 7198
a0d0e21e 7199 return(c);
a687059c
LW
7200}
7201
c277df42 7202
be8e71aa 7203#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7204/*
6c6525b8 7205- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7206create a copy so that changes the caller makes won't change the shared one.
7207If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7208 */
ffc61ed2 7209SV *
32fc9b6a 7210Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7211{
6c6525b8 7212 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7213
7214 if (altsvp) {
7215 *altsvp = NULL;
7216 }
7217
7218 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7219}
7220#endif
7221
7222STATIC SV *
e0193e47 7223S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7224{
8c9eb58f
KW
7225 /* Returns the swash for the input 'node' in the regex 'prog'.
7226 * If <doinit> is true, will attempt to create the swash if not already
7227 * done.
7228 * If <listsvp> is non-null, will return the swash initialization string in
7229 * it.
8c9eb58f
KW
7230 * Tied intimately to how regcomp.c sets up the data structure */
7231
97aff369 7232 dVAR;
9e55ce06
JH
7233 SV *sw = NULL;
7234 SV *si = NULL;
7a6c6baa
KW
7235 SV* invlist = NULL;
7236
f8fc2ecf
YO
7237 RXi_GET_DECL(prog,progi);
7238 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7239
6c6525b8 7240 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7241
ccb2541c
KW
7242 assert(ANYOF_NONBITMAP(node));
7243
4f639d21 7244 if (data && data->count) {
a3b680e6 7245 const U32 n = ARG(node);
ffc61ed2 7246
4f639d21 7247 if (data->what[n] == 's') {
ad64d0ec
NC
7248 SV * const rv = MUTABLE_SV(data->data[n]);
7249 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7250 SV **const ary = AvARRAY(av);
87367d5f 7251 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7252
8c9eb58f 7253 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7254
88675427
KW
7255 /* Elements 2 and 3 are either both present or both absent. [2] is
7256 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7257 * that inversion list has any user-defined properties in it. */
88675427
KW
7258 if (av_len(av) >= 2) {
7259 invlist = ary[2];
7260 if (SvUV(ary[3])) {
83199d38
KW
7261 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7262 }
7a6c6baa
KW
7263 }
7264 else {
7265 invlist = NULL;
7a6c6baa
KW
7266 }
7267
8c9eb58f
KW
7268 /* Element [1] is reserved for the set-up swash. If already there,
7269 * return it; if not, create it and store it there */
f192cf32
KW
7270 if (SvROK(ary[1])) {
7271 sw = ary[1];
7272 }
ffc61ed2 7273 else if (si && doinit) {
7a6c6baa
KW
7274
7275 sw = _core_swash_init("utf8", /* the utf8 package */
7276 "", /* nameless */
7277 si,
7278 1, /* binary */
7279 0, /* not from tr/// */
7a6c6baa 7280 invlist,
83199d38 7281 &swash_init_flags);
ffc61ed2
JH
7282 (void)av_store(av, 1, sw);
7283 }
7284 }
7285 }
7286
7a6c6baa
KW
7287 if (listsvp) {
7288 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7289
7290 /* Use the swash, if any, which has to have incorporated into it all
7291 * possibilities */
872dd7e0
KW
7292 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7293 && (si && si != &PL_sv_undef))
7294 {
7a6c6baa 7295
872dd7e0 7296 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7297 sv_catsv(matches_string, si);
7298 }
7299
7300 /* Add the inversion list to whatever we have. This may have come from
7301 * the swash, or from an input parameter */
7302 if (invlist) {
7303 sv_catsv(matches_string, _invlist_contents(invlist));
7304 }
7305 *listsvp = matches_string;
7306 }
7307
ffc61ed2
JH
7308 return sw;
7309}
7310
7311/*
ba7b4546 7312 - reginclass - determine if a character falls into a character class
832705d4 7313
6698fab5
KW
7314 n is the ANYOF regnode
7315 p is the target string
6698fab5 7316 utf8_target tells whether p is in UTF-8.
832705d4 7317
635cd5d4 7318 Returns true if matched; false otherwise.
eba1359e 7319
d5788240
KW
7320 Note that this can be a synthetic start class, a combination of various
7321 nodes, so things you think might be mutually exclusive, such as locale,
7322 aren't. It can match both locale and non-locale
7323
bbce6d69 7324 */
7325
76e3520e 7326STATIC bool
635cd5d4 7327S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
bbce6d69 7328{
27da23d5 7329 dVAR;
a3b680e6 7330 const char flags = ANYOF_FLAGS(n);
bbce6d69 7331 bool match = FALSE;
cc07378b 7332 UV c = *p;
1aa99e6b 7333
7918f24d
NC
7334 PERL_ARGS_ASSERT_REGINCLASS;
7335
afd2eb18
KW
7336 /* If c is not already the code point, get it. Note that
7337 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7338 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7339 STRLEN c_len = 0;
f7ab54c6 7340 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7341 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7342 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7343 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7344 * UTF8_ALLOW_FFFF */
f7ab54c6 7345 if (c_len == (STRLEN)-1)
e8a70c6f 7346 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7347 }
4b3cda86 7348
7cdde544
KW
7349 /* If this character is potentially in the bitmap, check it */
7350 if (c < 256) {
ffc61ed2
JH
7351 if (ANYOF_BITMAP_TEST(n, c))
7352 match = TRUE;
11454c59
KW
7353 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7354 && ! utf8_target
7355 && ! isASCII(c))
7356 {
7357 match = TRUE;
7358 }
78969a98
KW
7359 else if (flags & ANYOF_LOCALE) {
7360 PL_reg_flags |= RF_tainted;
7361
538b546e 7362 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7363 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7364 {
ffc61ed2 7365 match = TRUE;
78969a98 7366 }
040aea3a
KW
7367 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7368 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7369 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7370 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7371 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7372 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7373 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7374 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7375 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7376 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7377 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
07315176
KW
7378 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7379 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
040aea3a
KW
7380 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7381 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7382 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7383 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7384 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7385 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7386 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7387 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7388 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7389 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7390 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7391 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7392 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7393 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7394 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7395 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
67addccf
KW
7396 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7397 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
040aea3a 7398 ) /* How's that for a conditional? */
78969a98 7399 ) {
ffc61ed2
JH
7400 match = TRUE;
7401 }
a0ed51b3 7402 }
a0ed51b3
LW
7403 }
7404
7cdde544 7405 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7406 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7407 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7408 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7409 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7410 * could be a combination of locale and non-locale nodes. So allow locale
7411 * to match for the synthetic start class, which will give a false
7412 * positive that will be resolved when the match is done again as not part
7413 * of the synthetic start class */
ef87b810 7414 if (!match) {
10ee90d2
KW
7415 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7416 match = TRUE; /* Everything above 255 matches */
e051a21d 7417 }
6f8d7d0d
KW
7418 else if (ANYOF_NONBITMAP(n)
7419 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7420 || (utf8_target
7421 && (c >=256
7422 || (! (flags & ANYOF_LOCALE))
7423 || (flags & ANYOF_IS_SYNTHETIC)))))
ef87b810 7424 {
e0193e47 7425 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7426 if (sw) {
7427 U8 * utf8_p;
7428 if (utf8_target) {
7429 utf8_p = (U8 *) p;
e0193e47
KW
7430 } else { /* Convert to utf8 */
7431 STRLEN len = 1;
7cdde544
KW
7432 utf8_p = bytes_to_utf8(p, &len);
7433 }
f56b6394 7434
e0193e47 7435 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7436 match = TRUE;
e0193e47 7437 }
7cdde544
KW
7438
7439 /* If we allocated a string above, free it */
7440 if (! utf8_target) Safefree(utf8_p);
7441 }
7442 }
5073ffbd
KW
7443
7444 if (UNICODE_IS_SUPER(c)
7445 && (flags & ANYOF_WARN_SUPER)
7446 && ckWARN_d(WARN_NON_UNICODE))
7447 {
7448 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7449 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7450 }
7cdde544
KW
7451 }
7452
f0fdc1c9
KW
7453 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7454 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7455}
161b471a 7456
dfe13c55 7457STATIC U8 *
0ce71af7 7458S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7459{
6af86488
KW
7460 /* return the position 'off' UTF-8 characters away from 's', forward if
7461 * 'off' >= 0, backwards if negative. But don't go outside of position
7462 * 'lim', which better be < s if off < 0 */
7463
97aff369 7464 dVAR;
7918f24d
NC
7465
7466 PERL_ARGS_ASSERT_REGHOP3;
7467
a0ed51b3 7468 if (off >= 0) {
1aa99e6b 7469 while (off-- && s < lim) {
ffc61ed2 7470 /* XXX could check well-formedness here */
a0ed51b3 7471 s += UTF8SKIP(s);
ffc61ed2 7472 }
a0ed51b3
LW
7473 }
7474 else {
1de06328
YO
7475 while (off++ && s > lim) {
7476 s--;
7477 if (UTF8_IS_CONTINUED(*s)) {
7478 while (s > lim && UTF8_IS_CONTINUATION(*s))
7479 s--;
a0ed51b3 7480 }
1de06328 7481 /* XXX could check well-formedness here */
a0ed51b3
LW
7482 }
7483 }
7484 return s;
7485}
161b471a 7486
f9f4320a
YO
7487#ifdef XXX_dmq
7488/* there are a bunch of places where we use two reghop3's that should
7489 be replaced with this routine. but since thats not done yet
7490 we ifdef it out - dmq
7491*/
dfe13c55 7492STATIC U8 *
1de06328
YO
7493S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7494{
7495 dVAR;
7918f24d
NC
7496
7497 PERL_ARGS_ASSERT_REGHOP4;
7498
1de06328
YO
7499 if (off >= 0) {
7500 while (off-- && s < rlim) {
7501 /* XXX could check well-formedness here */
7502 s += UTF8SKIP(s);
7503 }
7504 }
7505 else {
7506 while (off++ && s > llim) {
7507 s--;
7508 if (UTF8_IS_CONTINUED(*s)) {
7509 while (s > llim && UTF8_IS_CONTINUATION(*s))
7510 s--;
7511 }
7512 /* XXX could check well-formedness here */
7513 }
7514 }
7515 return s;
7516}
f9f4320a 7517#endif
1de06328
YO
7518
7519STATIC U8 *
0ce71af7 7520S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7521{
97aff369 7522 dVAR;
7918f24d
NC
7523
7524 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7525
a0ed51b3 7526 if (off >= 0) {
1aa99e6b 7527 while (off-- && s < lim) {
ffc61ed2 7528 /* XXX could check well-formedness here */
a0ed51b3 7529 s += UTF8SKIP(s);
ffc61ed2 7530 }
a0ed51b3 7531 if (off >= 0)
3dab1dad 7532 return NULL;
a0ed51b3
LW
7533 }
7534 else {
1de06328
YO
7535 while (off++ && s > lim) {
7536 s--;
7537 if (UTF8_IS_CONTINUED(*s)) {
7538 while (s > lim && UTF8_IS_CONTINUATION(*s))
7539 s--;
a0ed51b3 7540 }
1de06328 7541 /* XXX could check well-formedness here */
a0ed51b3
LW
7542 }
7543 if (off <= 0)
3dab1dad 7544 return NULL;
a0ed51b3
LW
7545 }
7546 return s;
7547}
51371543 7548
51371543 7549static void
acfe0abc 7550restore_pos(pTHX_ void *arg)
51371543 7551{
97aff369 7552 dVAR;
097eb12c 7553 regexp * const rex = (regexp *)arg;
ed301438 7554 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7555 if (PL_reg_oldsaved) {
4f639d21
DM
7556 rex->subbeg = PL_reg_oldsaved;
7557 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7558 rex->suboffset = PL_reg_oldsavedoffset;
7559 rex->subcoffset = PL_reg_oldsavedcoffset;
f8c7b90f 7560#ifdef PERL_OLD_COPY_ON_WRITE
4f639d21 7561 rex->saved_copy = PL_nrs;
ed252734 7562#endif
07bc277f 7563 RXp_MATCH_COPIED_on(rex);
51371543
GS
7564 }
7565 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7566 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7567 PL_curpm = PL_reg_oldcurpm;
7568 }
7569}
33b8afdf
JH
7570
7571STATIC void
7572S_to_utf8_substr(pTHX_ register regexp *prog)
7573{
7e0d5ad7
KW
7574 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7575 * on the converted value */
7576
a1cac82e 7577 int i = 1;
7918f24d
NC
7578
7579 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7580
a1cac82e
NC
7581 do {
7582 if (prog->substrs->data[i].substr
7583 && !prog->substrs->data[i].utf8_substr) {
7584 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7585 prog->substrs->data[i].utf8_substr = sv;
7586 sv_utf8_upgrade(sv);
610460f9 7587 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7588 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7589 /* Trim the trailing \n that fbm_compile added last
7590 time. */
7591 SvCUR_set(sv, SvCUR(sv) - 1);
7592 /* Whilst this makes the SV technically "invalid" (as its
7593 buffer is no longer followed by "\0") when fbm_compile()
7594 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7595 fbm_compile(sv, FBMcf_TAIL);
7596 } else
7597 fbm_compile(sv, 0);
610460f9 7598 }
a1cac82e
NC
7599 if (prog->substrs->data[i].substr == prog->check_substr)
7600 prog->check_utf8 = sv;
7601 }
7602 } while (i--);
33b8afdf
JH
7603}
7604
7e0d5ad7 7605STATIC bool
33b8afdf
JH
7606S_to_byte_substr(pTHX_ register regexp *prog)
7607{
7e0d5ad7
KW
7608 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7609 * on the converted value; returns FALSE if can't be converted. */
7610
97aff369 7611 dVAR;
a1cac82e 7612 int i = 1;
7918f24d
NC
7613
7614 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7615
a1cac82e
NC
7616 do {
7617 if (prog->substrs->data[i].utf8_substr
7618 && !prog->substrs->data[i].substr) {
7619 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7620 if (! sv_utf8_downgrade(sv, TRUE)) {
7621 return FALSE;
7622 }
5400f398
KW
7623 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7624 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7625 /* Trim the trailing \n that fbm_compile added last
7626 time. */
7627 SvCUR_set(sv, SvCUR(sv) - 1);
7628 fbm_compile(sv, FBMcf_TAIL);
7629 } else
7630 fbm_compile(sv, 0);
7631 }
a1cac82e
NC
7632 prog->substrs->data[i].substr = sv;
7633 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7634 prog->check_substr = sv;
33b8afdf 7635 }
a1cac82e 7636 } while (i--);
7e0d5ad7
KW
7637
7638 return TRUE;
33b8afdf 7639}
66610fdd 7640
a0e786e5
KW
7641/* These constants are for finding GCB=LV and GCB=LVT. These are for the
7642 * pre-composed Hangul syllables, which are all in a contiguous block and
7643 * arranged there in such a way so as to facilitate alorithmic determination of
7644 * their characteristics. As such, they don't need a swash, but can be
7645 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7646 * is a GCB=LV */
7647#define SBASE 0xAC00 /* Start of block */
7648#define SCount 11172 /* Length of block */
7649#define TCount 28
7650
7651#if 0 /* This routine is not currently used */
7652PERL_STATIC_INLINE bool
7653S_is_utf8_X_LV(pTHX_ const U8 *p)
7654{
7655 /* Unlike most other similarly named routines here, this does not create a
7656 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7657
7658 dVAR;
7659
7660 UV cp = valid_utf8_to_uvchr(p, NULL);
7661
7662 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7663
7664 /* The earliest Unicode releases did not have these precomposed Hangul
7665 * syllables. Set to point to undef in that case, so will return false on
7666 * every call */
7667 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7668 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7669 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7670 SvREFCNT_dec(PL_utf8_X_LV);
7671 PL_utf8_X_LV = &PL_sv_undef;
7672 }
7673 }
7674
7675 return (PL_utf8_X_LV != &PL_sv_undef
7676 && cp >= SBASE && cp < SBASE + SCount
7677 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7678}
7679#endif
7680
7681PERL_STATIC_INLINE bool
7682S_is_utf8_X_LVT(pTHX_ const U8 *p)
7683{
7684 /* Unlike most other similarly named routines here, this does not create a
7685 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7686
7687 dVAR;
7688
7689 UV cp = valid_utf8_to_uvchr(p, NULL);
7690
7691 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7692
7693 /* The earliest Unicode releases did not have these precomposed Hangul
7694 * syllables. Set to point to undef in that case, so will return false on
7695 * every call */
7696 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7697 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7698 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7699 SvREFCNT_dec(PL_utf8_X_LVT);
7700 PL_utf8_X_LVT = &PL_sv_undef;
7701 }
7702 }
7703
7704 return (PL_utf8_X_LVT != &PL_sv_undef
7705 && cp >= SBASE && cp < SBASE + SCount
7706 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7707}
7708
66610fdd
RGS
7709/*
7710 * Local variables:
7711 * c-indentation-style: bsd
7712 * c-basic-offset: 4
14d04a33 7713 * indent-tabs-mode: nil
66610fdd
RGS
7714 * End:
7715 *
14d04a33 7716 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7717 */