This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta for 486b1e7f0
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 113#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad 115#define HOPc(pos,off) \
ba44c216 116 (char *)(reginfo->is_utf8_target \
220db18a 117 ? reghop3((U8*)pos, off, \
9d9163fb 118 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
119 : (U8*)(pos + off))
120#define HOPBACKc(pos, off) \
ba44c216 121 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
122 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123 : (pos - off >= reginfo->strbeg) \
8e11feef 124 ? (U8*)pos - off \
3dab1dad 125 : NULL)
efb30f32 126
ba44c216 127#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 129
7016d6eb
DM
130
131#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132#define NEXTCHR_IS_EOS (nextchr < 0)
133
134#define SET_nextchr \
220db18a 135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
136
137#define SET_locinput(p) \
138 locinput = (p); \
139 SET_nextchr
140
141
c7304fe2
KW
142#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
143 if (!swash_ptr) { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2
KW
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
147 assert(swash_ptr); \
148 } \
149 } STMT_END
150
151/* If in debug mode, we test that a known character properly matches */
152#ifdef DEBUGGING
153# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
154 property_name, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
158#else
159# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
160 property_name, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
163#endif
d1eb3177 164
c7304fe2
KW
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
169
170#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171 STMT_START { \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
176 "_X_extend", \
177 COMBINING_GRAVE_ACCENT_UTF8); \
178 } STMT_END
d1eb3177 179
c7304fe2 180#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
181/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
182
5f80c4cf 183/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
184/* it would be nice to rework regcomp.sym to generate this stuff. sigh
185 *
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
192*/
3e901dc0
YO
193#define JUMPABLE(rn) ( \
194 OP(rn) == OPEN || \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
196 OP(rn) == EVAL || \
cca55fe3
JP
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 199 OP(rn) == KEEPS || \
3dab1dad 200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 201)
ee9b8eae 202#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 203
ee9b8eae
YO
204#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
205
206#if 0
207/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 210#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
211#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
212
213#else
214/* ... so we use this as its faster. */
215#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 216#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
217#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
219
220#endif
e2d8ce26 221
a84d97b6
HS
222/*
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
225*/
cca55fe3 226#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 230 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 231 else if (type == PLUS) \
cca55fe3 232 rn = NEXTOPER(rn); \
3dab1dad 233 else if (type == IFMATCH) \
a84d97b6 234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 235 else rn += NEXT_OFF(rn); \
3dab1dad 236 } \
5f80c4cf 237} STMT_END
74750237 238
22913b96
KW
239/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245#define SBASE 0xAC00 /* Start of block */
246#define SCount 11172 /* Length of block */
247#define TCount 28
c476f425 248
006f26b2
DM
249#define SLAB_FIRST(s) (&(s)->states[0])
250#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
251
a75351a1 252static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 253static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 254static regmatch_state * S_push_slab(pTHX);
51371543 255
87c0511b 256#define REGCP_PAREN_ELEMS 3
f067efbf 257#define REGCP_OTHER_ELEMS 3
e0fa7e2b 258#define REGCP_FRAME_ELEMS 1
620d5b66
NC
259/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
260 * are needed for the regexp context stack bookkeeping. */
261
76e3520e 262STATIC CHECKPOINT
92da3157 263S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 264{
97aff369 265 dVAR;
a3b680e6 266 const int retval = PL_savestack_ix;
92da3157
DM
267 const int paren_elems_to_push =
268 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
269 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
270 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 271 I32 p;
40a82448 272 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 273
b93070ed
DM
274 PERL_ARGS_ASSERT_REGCPPUSH;
275
e49a9654 276 if (paren_elems_to_push < 0)
5637ef5b
NC
277 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
278 paren_elems_to_push);
e49a9654 279
e0fa7e2b
NC
280 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
281 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 282 " out of range (%lu-%ld)",
92da3157
DM
283 total_elems,
284 (unsigned long)maxopenparen,
285 (long)parenfloor);
e0fa7e2b 286
620d5b66 287 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 288
495f47a5 289 DEBUG_BUFFERS_r(
92da3157 290 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
291 PerlIO_printf(Perl_debug_log,
292 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
293 PTR2UV(rex),
294 PTR2UV(rex->offs)
295 );
296 );
92da3157 297 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 298/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
299 SSPUSHINT(rex->offs[p].end);
300 SSPUSHINT(rex->offs[p].start);
1ca2007e 301 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 302 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
303 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
304 (UV)p,
305 (IV)rex->offs[p].start,
306 (IV)rex->offs[p].start_tmp,
307 (IV)rex->offs[p].end
40a82448 308 ));
a0d0e21e 309 }
b1ce53c5 310/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 311 SSPUSHINT(maxopenparen);
b93070ed
DM
312 SSPUSHINT(rex->lastparen);
313 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 314 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 315
a0d0e21e
LW
316 return retval;
317}
318
c277df42 319/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
320#define REGCP_SET(cp) \
321 DEBUG_STATE_r( \
ab3bbdeb 322 PerlIO_printf(Perl_debug_log, \
e4f74956 323 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
324 (IV)PL_savestack_ix)); \
325 cp = PL_savestack_ix
c3464db5 326
ab3bbdeb 327#define REGCP_UNWIND(cp) \
e4f74956 328 DEBUG_STATE_r( \
ab3bbdeb 329 if (cp != PL_savestack_ix) \
e4f74956
YO
330 PerlIO_printf(Perl_debug_log, \
331 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
332 (IV)(cp), (IV)PL_savestack_ix)); \
333 regcpblow(cp)
c277df42 334
a8d1f4b4
DM
335#define UNWIND_PAREN(lp, lcp) \
336 for (n = rex->lastparen; n > lp; n--) \
337 rex->offs[n].end = -1; \
338 rex->lastparen = n; \
339 rex->lastcloseparen = lcp;
340
341
f067efbf 342STATIC void
92da3157 343S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 344{
97aff369 345 dVAR;
e0fa7e2b 346 UV i;
87c0511b 347 U32 paren;
a3621e74
YO
348 GET_RE_DEBUG_FLAGS_DECL;
349
7918f24d
NC
350 PERL_ARGS_ASSERT_REGCPPOP;
351
b1ce53c5 352 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 353 i = SSPOPUV;
e0fa7e2b
NC
354 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
355 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
356 rex->lastcloseparen = SSPOPINT;
357 rex->lastparen = SSPOPINT;
92da3157 358 *maxopenparen_p = SSPOPINT;
b1ce53c5 359
620d5b66 360 i -= REGCP_OTHER_ELEMS;
b1ce53c5 361 /* Now restore the parentheses context. */
495f47a5
DM
362 DEBUG_BUFFERS_r(
363 if (i || rex->lastparen + 1 <= rex->nparens)
364 PerlIO_printf(Perl_debug_log,
365 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
366 PTR2UV(rex),
367 PTR2UV(rex->offs)
368 );
369 );
92da3157 370 paren = *maxopenparen_p;
620d5b66 371 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 372 I32 tmps;
1ca2007e 373 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 374 rex->offs[paren].start = SSPOPINT;
cf93c79d 375 tmps = SSPOPINT;
b93070ed
DM
376 if (paren <= rex->lastparen)
377 rex->offs[paren].end = tmps;
495f47a5
DM
378 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
379 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
380 (UV)paren,
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 385 );
87c0511b 386 paren--;
a0d0e21e 387 }
daf18116 388#if 1
dafc8851
JH
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
225593e1
DM
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
b93070ed 398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 399 if (i > *maxopenparen_p)
b93070ed
DM
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
495f47a5
DM
402 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
403 " \\%"UVuf": %s ..-1 undeffing\n",
404 (UV)i,
92da3157 405 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 406 ));
a0d0e21e 407 }
dafc8851 408#endif
a0d0e21e
LW
409}
410
74088413
DM
411/* restore the parens and associated vars at savestack position ix,
412 * but without popping the stack */
413
414STATIC void
92da3157 415S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
416{
417 I32 tmpix = PL_savestack_ix;
418 PL_savestack_ix = ix;
92da3157 419 regcppop(rex, maxopenparen_p);
74088413
DM
420 PL_savestack_ix = tmpix;
421}
422
02db2b7b 423#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 424
31c7f561
KW
425STATIC bool
426S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
427{
428 /* Returns a boolean as to whether or not 'character' is a member of the
429 * Posix character class given by 'classnum' that should be equivalent to a
430 * value in the typedef '_char_class_number'.
431 *
432 * Ideally this could be replaced by a just an array of function pointers
433 * to the C library functions that implement the macros this calls.
434 * However, to compile, the precise function signatures are required, and
435 * these may vary from platform to to platform. To avoid having to figure
436 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
437 * which adds an extra layer of function call overhead (unless the C
438 * optimizer strips it away). But we don't particularly care about
439 * performance with locales anyway. */
31c7f561
KW
440
441 switch ((_char_class_number) classnum) {
15861f94 442 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 443 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
444 case _CC_ENUM_ASCII: return isASCII_LC(character);
445 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
446 case _CC_ENUM_CASED: return isLOWER_LC(character)
447 || isUPPER_LC(character);
e8d596e0 448 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
449 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
450 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
451 case _CC_ENUM_LOWER: return isLOWER_LC(character);
452 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 453 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 454 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 455 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
456 case _CC_ENUM_UPPER: return isUPPER_LC(character);
457 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 458 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
459 default: /* VERTSPACE should never occur in locales */
460 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
461 }
462
463 assert(0); /* NOTREACHED */
464 return FALSE;
465}
466
3018b823
KW
467STATIC bool
468S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
469{
470 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
471 * 'character' is a member of the Posix character class given by 'classnum'
472 * that should be equivalent to a value in the typedef
473 * '_char_class_number'.
474 *
475 * This just calls isFOO_lc on the code point for the character if it is in
476 * the range 0-255. Outside that range, all characters avoid Unicode
477 * rules, ignoring any locale. So use the Unicode function if this class
478 * requires a swash, and use the Unicode macro otherwise. */
479
480 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
481
482 if (UTF8_IS_INVARIANT(*character)) {
483 return isFOO_lc(classnum, *character);
484 }
485 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
486 return isFOO_lc(classnum,
487 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
488 }
489
490 if (classnum < _FIRST_NON_SWASH_CC) {
491
492 /* Initialize the swash unless done already */
493 if (! PL_utf8_swash_ptrs[classnum]) {
494 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
495 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
496 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
497 }
498
92a2046b
KW
499 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
500 character,
501 TRUE /* is UTF */ ));
3018b823
KW
502 }
503
504 switch ((_char_class_number) classnum) {
505 case _CC_ENUM_SPACE:
506 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
507
508 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
509 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
510 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
511 default: return 0; /* Things like CNTRL are always
512 below 256 */
513 }
514
515 assert(0); /* NOTREACHED */
516 return FALSE;
517}
518
a687059c 519/*
e50aee73 520 * pregexec and friends
a687059c
LW
521 */
522
76234dfb 523#ifndef PERL_IN_XSUB_RE
a687059c 524/*
c277df42 525 - pregexec - match a regexp against a string
a687059c 526 */
c277df42 527I32
5aaab254 528Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 529 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
530/* stringarg: the point in the string at which to begin matching */
531/* strend: pointer to null at end of string */
532/* strbeg: real beginning of string */
533/* minend: end of match must be >= minend bytes after stringarg. */
534/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
535 * itself is accessed via the pointers above */
536/* nosave: For optimizations. */
c277df42 537{
7918f24d
NC
538 PERL_ARGS_ASSERT_PREGEXEC;
539
c277df42 540 return
9041c2e3 541 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
542 nosave ? 0 : REXEC_COPY_STR);
543}
76234dfb 544#endif
22e551b9 545
9041c2e3 546/*
cad2e5aa
JH
547 * Need to implement the following flags for reg_anch:
548 *
549 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
550 * USE_INTUIT_ML
551 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
552 * INTUIT_AUTORITATIVE_ML
553 * INTUIT_ONCE_NOML - Intuit can match in one location only.
554 * INTUIT_ONCE_ML
555 *
556 * Another flag for this function: SECOND_TIME (so that float substrs
557 * with giant delta may be not rechecked).
558 */
559
3f7c398e 560/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
561 Otherwise, only SvCUR(sv) is used to get strbeg. */
562
6eb5f6b9
JH
563/* XXXX Some places assume that there is a fixed substring.
564 An update may be needed if optimizer marks as "INTUITable"
565 RExen without fixed substrings. Similarly, it is assumed that
566 lengths of all the strings are no more than minlen, thus they
567 cannot come from lookahead.
40d049e4
YO
568 (Or minlen should take into account lookahead.)
569 NOTE: Some of this comment is not correct. minlen does now take account
570 of lookahead/behind. Further research is required. -- demerphq
571
572*/
6eb5f6b9 573
2c2d71f5
JH
574/* A failure to find a constant substring means that there is no need to make
575 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 576 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
577 regtry() should be needed.
578
579 REx compiler's optimizer found 4 possible hints:
580 a) Anchored substring;
581 b) Fixed substring;
582 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 583 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 584 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
585 string which does not contradict any of them.
586 */
2c2d71f5 587
6eb5f6b9
JH
588/* Most of decisions we do here should have been done at compile time.
589 The nodes of the REx which we used for the search should have been
590 deleted from the finite automaton. */
591
52a21eb3
DM
592/* args:
593 * rx: the regex to match against
594 * sv: the SV being matched: only used for utf8 flag; the string
595 * itself is accessed via the pointers below. Note that on
596 * something like an overloaded SV, SvPOK(sv) may be false
597 * and the string pointers may point to something unrelated to
598 * the SV itself.
599 * strbeg: real beginning of string
600 * strpos: the point in the string at which to begin matching
601 * strend: pointer to the byte following the last char of the string
602 * flags currently unused; set to 0
603 * data: currently unused; set to NULL
604 */
605
cad2e5aa 606char *
52a21eb3
DM
607Perl_re_intuit_start(pTHX_
608 REGEXP * const rx,
609 SV *sv,
610 const char * const strbeg,
611 char *strpos,
612 char *strend,
613 const U32 flags,
614 re_scream_pos_data *data)
cad2e5aa 615{
97aff369 616 dVAR;
8d919b0a 617 struct regexp *const prog = ReANY(rx);
eb578fdb 618 I32 start_shift = 0;
cad2e5aa 619 /* Should be nonnegative! */
eb578fdb
KW
620 I32 end_shift = 0;
621 char *s;
622 SV *check;
cad2e5aa 623 char *t;
f2ed9b32 624 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 625 I32 ml_anch;
eb578fdb 626 char *other_last = NULL; /* other substr checked before this */
bd61b366 627 char *check_at = NULL; /* check substr found at this pos */
d8080198 628 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 629 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 630 RXi_GET_DECL(prog,progi);
02d5137b
DM
631 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
632 regmatch_info *const reginfo = &reginfo_buf;
30944b6d 633#ifdef DEBUGGING
890ce7af 634 const char * const i_strpos = strpos;
30944b6d 635#endif
a3621e74
YO
636 GET_RE_DEBUG_FLAGS_DECL;
637
7918f24d 638 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
639 PERL_UNUSED_ARG(flags);
640 PERL_UNUSED_ARG(data);
7918f24d 641
c344f387
JH
642 /* CHR_DIST() would be more correct here but it makes things slow. */
643 if (prog->minlen > strend - strpos) {
a3621e74 644 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 645 "String too short... [re_intuit_start]\n"));
cad2e5aa 646 goto fail;
2c2d71f5 647 }
d8da0584 648
6c3fea77 649 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 650 reginfo->info_aux = NULL;
9d9163fb 651 reginfo->strbeg = strbeg;
220db18a 652 reginfo->strend = strend;
aed7b151 653 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 654 reginfo->intuit = 1;
1cb48e53
DM
655 /* not actually used within intuit, but zero for safety anyway */
656 reginfo->poscache_maxiter = 0;
02d5137b 657
f2ed9b32 658 if (utf8_target) {
33b8afdf
JH
659 if (!prog->check_utf8 && prog->check_substr)
660 to_utf8_substr(prog);
661 check = prog->check_utf8;
662 } else {
7e0d5ad7
KW
663 if (!prog->check_substr && prog->check_utf8) {
664 if (! to_byte_substr(prog)) {
6b54ddc5 665 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
666 }
667 }
33b8afdf
JH
668 check = prog->check_substr;
669 }
cf44e600
DM
670 if ((prog->extflags & RXf_ANCH) /* Match at beg-of-str or after \n */
671 && !(prog->extflags & RXf_ANCH_GPOS)) /* \G isn't a BOS or \n */
672 {
673 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
bbe252da 674 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 675 && !multiline ) ); /* Check after \n? */
cad2e5aa 676
7e25d62c 677 if (!ml_anch) {
cf44e600 678 if ( !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
7e25d62c 679 && (strpos != strbeg)) {
a3621e74 680 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
681 goto fail;
682 }
d46b78c6
KW
683 if (prog->check_offset_min == prog->check_offset_max
684 && !(prog->extflags & RXf_CANY_SEEN)
685 && ! multiline) /* /m can cause \n's to match that aren't
686 accounted for in the string max length.
687 See [perl #115242] */
688 {
2c2d71f5 689 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
690 I32 slen;
691
1aa99e6b 692 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 693
653099ff
GS
694 if (SvTAIL(check)) {
695 slen = SvCUR(check); /* >= 1 */
cad2e5aa 696
9041c2e3 697 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 698 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 699 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 700 goto fail_finish;
cad2e5aa
JH
701 }
702 /* Now should match s[0..slen-2] */
703 slen--;
3f7c398e 704 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 705 || (slen > 1
3f7c398e 706 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 707 report_neq:
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
709 goto fail_finish;
710 }
cad2e5aa 711 }
3f7c398e 712 else if (*SvPVX_const(check) != *s
653099ff 713 || ((slen = SvCUR(check)) > 1
3f7c398e 714 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 715 goto report_neq;
c315bfe8 716 check_at = s;
2c2d71f5 717 goto success_at_start;
7e25d62c 718 }
cad2e5aa 719 }
2c2d71f5 720 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 721 s = strpos;
2c2d71f5 722 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
723 end_shift = prog->check_end_shift;
724
2c2d71f5 725 if (!ml_anch) {
a3b680e6 726 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 727 - (SvTAIL(check) != 0);
a3b680e6 728 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
729
730 if (end_shift < eshift)
731 end_shift = eshift;
732 }
cad2e5aa 733 }
2c2d71f5 734 else { /* Can match at random position */
cad2e5aa
JH
735 ml_anch = 0;
736 s = strpos;
1de06328
YO
737 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
738 end_shift = prog->check_end_shift;
739
740 /* end shift should be non negative here */
cad2e5aa
JH
741 }
742
bcdf7404 743#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 744 if (end_shift < 0)
1de06328 745 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 746 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
747#endif
748
2c2d71f5
JH
749 restart:
750 /* Find a possible match in the region s..strend by looking for
751 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
752
753 {
754 I32 srch_start_shift = start_shift;
755 I32 srch_end_shift = end_shift;
c33e64f0
FC
756 U8* start_point;
757 U8* end_point;
1de06328
YO
758 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
759 srch_end_shift -= ((strbeg - s) - srch_start_shift);
760 srch_start_shift = strbeg - s;
761 }
6bda09f9 762 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
763 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
764 (IV)prog->check_offset_min,
765 (IV)srch_start_shift,
766 (IV)srch_end_shift,
767 (IV)prog->check_end_shift);
768 });
769
bbe252da 770 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
771 start_point= (U8*)(s + srch_start_shift);
772 end_point= (U8*)(strend - srch_end_shift);
773 } else {
774 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
775 end_point= HOP3(strend, -srch_end_shift, strbeg);
776 }
6bda09f9 777 DEBUG_OPTIMISE_MORE_r({
56570a2c 778 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 779 (int)(end_point - start_point),
fc8cd66c 780 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
781 start_point);
782 });
783
784 s = fbm_instr( start_point, end_point,
7fba1cd6 785 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 786 }
cad2e5aa
JH
787 /* Update the count-of-usability, remove useless subpatterns,
788 unshift s. */
2c2d71f5 789
ab3bbdeb 790 DEBUG_EXECUTE_r({
f2ed9b32 791 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
792 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
793 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 794 (s ? "Found" : "Did not find"),
f2ed9b32 795 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
796 ? "anchored" : "floating"),
797 quoted,
798 RE_SV_TAIL(check),
799 (s ? " at offset " : "...\n") );
800 });
2c2d71f5
JH
801
802 if (!s)
803 goto fail_finish;
2c2d71f5 804 /* Finish the diagnostic message */
a3621e74 805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 806
1de06328
YO
807 /* XXX dmq: first branch is for positive lookbehind...
808 Our check string is offset from the beginning of the pattern.
809 So we need to do any stclass tests offset forward from that
810 point. I think. :-(
811 */
812
813
814
815 check_at=s;
816
817
2c2d71f5
JH
818 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
819 Start with the other substr.
820 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 821 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
822 *always* match. Probably should be marked during compile...
823 Probably it is right to do no SCREAM here...
824 */
825
f2ed9b32 826 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
827 : (prog->float_substr && prog->anchored_substr))
828 {
30944b6d 829 /* Take into account the "other" substring. */
2c2d71f5
JH
830 /* XXXX May be hopelessly wrong for UTF... */
831 if (!other_last)
6eb5f6b9 832 other_last = strpos;
f2ed9b32 833 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
834 do_other_anchored:
835 {
890ce7af
AL
836 char * const last = HOP3c(s, -start_shift, strbeg);
837 char *last1, *last2;
be8e71aa 838 char * const saved_s = s;
33b8afdf 839 SV* must;
2c2d71f5 840
2c2d71f5
JH
841 t = s - prog->check_offset_max;
842 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 843 && (!utf8_target
0ce71af7 844 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 845 && t > strpos)))
6f207bd3 846 NOOP;
2c2d71f5
JH
847 else
848 t = strpos;
1aa99e6b 849 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
850 if (t < other_last) /* These positions already checked */
851 t = other_last;
1aa99e6b 852 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
853 if (last < last1)
854 last1 = last;
1de06328
YO
855 /* XXXX It is not documented what units *_offsets are in.
856 We assume bytes, but this is clearly wrong.
857 Meaning this code needs to be carefully reviewed for errors.
858 dmq.
859 */
860
2c2d71f5 861 /* On end-of-str: see comment below. */
f2ed9b32 862 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
863 if (must == &PL_sv_undef) {
864 s = (char*)NULL;
1de06328 865 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
866 }
867 else
868 s = fbm_instr(
869 (unsigned char*)t,
870 HOP3(HOP3(last1, prog->anchored_offset, strend)
871 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
872 must,
7fba1cd6 873 multiline ? FBMrf_MULTILINE : 0
33b8afdf 874 );
ab3bbdeb 875 DEBUG_EXECUTE_r({
f2ed9b32 876 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
877 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
878 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 879 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
880 quoted, RE_SV_TAIL(must));
881 });
882
883
2c2d71f5
JH
884 if (!s) {
885 if (last1 >= last2) {
a3621e74 886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
887 ", giving up...\n"));
888 goto fail_finish;
889 }
a3621e74 890 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 891 ", trying floating at offset %ld...\n",
be8e71aa 892 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
893 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
894 s = HOP3c(last, 1, strend);
2c2d71f5
JH
895 goto restart;
896 }
897 else {
a3621e74 898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 899 (long)(s - i_strpos)));
1aa99e6b
IH
900 t = HOP3c(s, -prog->anchored_offset, strbeg);
901 other_last = HOP3c(s, 1, strend);
be8e71aa 902 s = saved_s;
2c2d71f5
JH
903 if (t == strpos)
904 goto try_at_start;
2c2d71f5
JH
905 goto try_at_offset;
906 }
30944b6d 907 }
2c2d71f5
JH
908 }
909 else { /* Take into account the floating substring. */
33b8afdf 910 char *last, *last1;
be8e71aa 911 char * const saved_s = s;
33b8afdf
JH
912 SV* must;
913
914 t = HOP3c(s, -start_shift, strbeg);
915 last1 = last =
916 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
917 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
918 last = HOP3c(t, prog->float_max_offset, strend);
919 s = HOP3c(t, prog->float_min_offset, strend);
920 if (s < other_last)
921 s = other_last;
2c2d71f5 922 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 923 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
924 /* fbm_instr() takes into account exact value of end-of-str
925 if the check is SvTAIL(ed). Since false positives are OK,
926 and end-of-str is not later than strend we are OK. */
927 if (must == &PL_sv_undef) {
928 s = (char*)NULL;
1de06328 929 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
930 }
931 else
2c2d71f5 932 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
933 (unsigned char*)last + SvCUR(must)
934 - (SvTAIL(must)!=0),
7fba1cd6 935 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 936 DEBUG_EXECUTE_r({
f2ed9b32 937 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
938 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
939 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 940 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
941 quoted, RE_SV_TAIL(must));
942 });
33b8afdf
JH
943 if (!s) {
944 if (last1 == last) {
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
946 ", giving up...\n"));
947 goto fail_finish;
2c2d71f5 948 }
a3621e74 949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 950 ", trying anchored starting at offset %ld...\n",
be8e71aa 951 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
952 other_last = last;
953 s = HOP3c(t, 1, strend);
954 goto restart;
955 }
956 else {
a3621e74 957 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
958 (long)(s - i_strpos)));
959 other_last = s; /* Fix this later. --Hugo */
be8e71aa 960 s = saved_s;
33b8afdf
JH
961 if (t == strpos)
962 goto try_at_start;
963 goto try_at_offset;
964 }
2c2d71f5 965 }
cad2e5aa 966 }
2c2d71f5 967
1de06328 968
9ef43ace 969 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 970
6bda09f9 971 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
972 PerlIO_printf(Perl_debug_log,
973 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
974 (IV)prog->check_offset_min,
975 (IV)prog->check_offset_max,
976 (IV)(s-strpos),
977 (IV)(t-strpos),
978 (IV)(t-s),
979 (IV)(strend-strpos)
980 )
981 );
982
2c2d71f5 983 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 984 && (!utf8_target
9ef43ace 985 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
986 && t > strpos)))
987 {
2c2d71f5
JH
988 /* Fixed substring is found far enough so that the match
989 cannot start at strpos. */
990 try_at_offset:
cad2e5aa 991 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
992 /* Eventually fbm_*() should handle this, but often
993 anchored_offset is not 0, so this check will not be wasted. */
994 /* XXXX In the code below we prefer to look for "^" even in
995 presence of anchored substrings. And we search even
996 beyond the found float position. These pessimizations
997 are historical artefacts only. */
998 find_anchor:
2c2d71f5 999 while (t < strend - prog->minlen) {
cad2e5aa 1000 if (*t == '\n') {
4ee3650e 1001 if (t < check_at - prog->check_offset_min) {
f2ed9b32 1002 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
1003 /* Since we moved from the found position,
1004 we definitely contradict the found anchored
30944b6d
IZ
1005 substr. Due to the above check we do not
1006 contradict "check" substr.
1007 Thus we can arrive here only if check substr
1008 is float. Redo checking for "other"=="fixed".
1009 */
9041c2e3 1010 strpos = t + 1;
a3621e74 1011 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1012 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1013 goto do_other_anchored;
1014 }
4ee3650e
GS
1015 /* We don't contradict the found floating substring. */
1016 /* XXXX Why not check for STCLASS? */
cad2e5aa 1017 s = t + 1;
a3621e74 1018 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1019 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1020 goto set_useful;
1021 }
4ee3650e
GS
1022 /* Position contradicts check-string */
1023 /* XXXX probably better to look for check-string
1024 than for "\n", so one should lower the limit for t? */
a3621e74 1025 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1026 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1027 other_last = strpos = s = t + 1;
cad2e5aa
JH
1028 goto restart;
1029 }
1030 t++;
1031 }
a3621e74 1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1033 PL_colors[0], PL_colors[1]));
2c2d71f5 1034 goto fail_finish;
cad2e5aa 1035 }
f5952150 1036 else {
a3621e74 1037 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1038 PL_colors[0], PL_colors[1]));
f5952150 1039 }
cad2e5aa
JH
1040 s = t;
1041 set_useful:
f2ed9b32 1042 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1043 }
1044 else {
f5952150 1045 /* The found string does not prohibit matching at strpos,
2c2d71f5 1046 - no optimization of calling REx engine can be performed,
f5952150
GS
1047 unless it was an MBOL and we are not after MBOL,
1048 or a future STCLASS check will fail this. */
2c2d71f5
JH
1049 try_at_start:
1050 /* Even in this situation we may use MBOL flag if strpos is offset
1051 wrt the start of the string. */
52a21eb3 1052 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1053 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1054 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1055 {
cad2e5aa
JH
1056 t = strpos;
1057 goto find_anchor;
1058 }
a3621e74 1059 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1060 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1061 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1062 );
2c2d71f5 1063 success_at_start:
bbe252da 1064 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1065 && (utf8_target ? (
33b8afdf
JH
1066 prog->check_utf8 /* Could be deleted already */
1067 && --BmUSEFUL(prog->check_utf8) < 0
1068 && (prog->check_utf8 == prog->float_utf8)
1069 ) : (
1070 prog->check_substr /* Could be deleted already */
1071 && --BmUSEFUL(prog->check_substr) < 0
1072 && (prog->check_substr == prog->float_substr)
1073 )))
66e933ab 1074 {
cad2e5aa 1075 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1076 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1077 /* XXX Does the destruction order has to change with utf8_target? */
1078 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1079 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1080 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1081 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1082 check = NULL; /* abort */
cad2e5aa 1083 s = strpos;
486ec47a 1084 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1085 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1086 if (prog->intflags & PREGf_IMPLICIT)
1087 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1088 /* XXXX This is a remnant of the old implementation. It
1089 looks wasteful, since now INTUIT can use many
6eb5f6b9 1090 other heuristics. */
bbe252da 1091 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1092 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1093 }
1094 else
1095 s = strpos;
1096 }
1097
6eb5f6b9
JH
1098 /* Last resort... */
1099 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1100 /* trie stclasses are too expensive to use here, we are better off to
1101 leave it to regmatch itself */
f8fc2ecf 1102 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1103 /* minlen == 0 is possible if regstclass is \b or \B,
1104 and the fixed substr is ''$.
1105 Since minlen is already taken into account, s+1 is before strend;
1106 accidentally, minlen >= 1 guaranties no false positives at s + 1
1107 even for \b or \B. But (minlen? 1 : 0) below assumes that
1108 regstclass does not come from lookahead... */
1109 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1110 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf 1111 const U8* const str = (U8*)STRING(progi->regstclass);
2c75e362 1112 /* XXX this value could be pre-computed */
f8fc2ecf 1113 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1114 ? (reginfo->is_utf8_pat
1115 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1116 : STR_LEN(progi->regstclass))
66e933ab 1117 : 1);
1de06328
YO
1118 char * endpos;
1119 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1120 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1121 else if (prog->float_substr || prog->float_utf8)
1122 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1123 else
1124 endpos= strend;
1125
d8080198
YO
1126 if (checked_upto < s)
1127 checked_upto = s;
1128 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1129 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1130
6eb5f6b9 1131 t = s;
984e6dd1 1132 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1133 reginfo);
d8080198
YO
1134 if (s) {
1135 checked_upto = s;
1136 } else {
6eb5f6b9 1137#ifdef DEBUGGING
cbbf8932 1138 const char *what = NULL;
6eb5f6b9
JH
1139#endif
1140 if (endpos == strend) {
a3621e74 1141 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1142 "Could not match STCLASS...\n") );
1143 goto fail;
1144 }
a3621e74 1145 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1146 "This position contradicts STCLASS...\n") );
bbe252da 1147 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1148 goto fail;
d8080198
YO
1149 checked_upto = HOPBACKc(endpos, start_shift);
1150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1151 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1152 /* Contradict one of substrings */
33b8afdf 1153 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1154 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1155 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1156 hop_and_restart:
1aa99e6b 1157 s = HOP3c(t, 1, strend);
66e933ab
GS
1158 if (s + start_shift + end_shift > strend) {
1159 /* XXXX Should be taken into account earlier? */
a3621e74 1160 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1161 "Could not match STCLASS...\n") );
1162 goto fail;
1163 }
5e39e1e5
HS
1164 if (!check)
1165 goto giveup;
a3621e74 1166 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1167 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1168 what, (long)(s + start_shift - i_strpos)) );
1169 goto restart;
1170 }
66e933ab 1171 /* Have both, check_string is floating */
6eb5f6b9
JH
1172 if (t + start_shift >= check_at) /* Contradicts floating=check */
1173 goto retry_floating_check;
1174 /* Recheck anchored substring, but not floating... */
9041c2e3 1175 s = check_at;
5e39e1e5
HS
1176 if (!check)
1177 goto giveup;
a3621e74 1178 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1179 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1180 (long)(other_last - i_strpos)) );
1181 goto do_other_anchored;
1182 }
60e71179
GS
1183 /* Another way we could have checked stclass at the
1184 current position only: */
1185 if (ml_anch) {
1186 s = t = t + 1;
5e39e1e5
HS
1187 if (!check)
1188 goto giveup;
a3621e74 1189 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1190 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1191 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1192 goto try_at_offset;
66e933ab 1193 }
f2ed9b32 1194 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1195 goto fail;
486ec47a 1196 /* Check is floating substring. */
6eb5f6b9
JH
1197 retry_floating_check:
1198 t = check_at - start_shift;
a3621e74 1199 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1200 goto hop_and_restart;
1201 }
b7953727 1202 if (t != s) {
a3621e74 1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1204 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1205 (long)(t - i_strpos), (long)(s - i_strpos))
1206 );
1207 }
1208 else {
a3621e74 1209 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1210 "Does not contradict STCLASS...\n");
1211 );
1212 }
6eb5f6b9 1213 }
5e39e1e5 1214 giveup:
a3621e74 1215 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1216 PL_colors[4], (check ? "Guessed" : "Giving up"),
1217 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1218 return s;
2c2d71f5
JH
1219
1220 fail_finish: /* Substring not found */
33b8afdf 1221 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1222 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1223 fail:
a3621e74 1224 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1225 PL_colors[4], PL_colors[5]));
bd61b366 1226 return NULL;
cad2e5aa 1227}
9661b544 1228
a0a388a1
YO
1229#define DECL_TRIE_TYPE(scan) \
1230 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1231 trie_type = ((scan->flags == EXACT) \
1232 ? (utf8_target ? trie_utf8 : trie_plain) \
1233 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1234
fd3249ee
YO
1235#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1236STMT_START { \
fab2782b
YO
1237 STRLEN skiplen; \
1238 switch (trie_type) { \
1239 case trie_utf8_fold: \
1240 if ( foldlen>0 ) { \
1241 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1242 foldlen -= len; \
1243 uscan += len; \
1244 len=0; \
1245 } else { \
1246 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1247 len = UTF8SKIP(uc); \
1248 skiplen = UNISKIP( uvc ); \
1249 foldlen -= skiplen; \
1250 uscan = foldbuf + skiplen; \
1251 } \
1252 break; \
1253 case trie_latin_utf8_fold: \
1254 if ( foldlen>0 ) { \
1255 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1256 foldlen -= len; \
1257 uscan += len; \
1258 len=0; \
1259 } else { \
1260 len = 1; \
51910141 1261 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
fab2782b
YO
1262 skiplen = UNISKIP( uvc ); \
1263 foldlen -= skiplen; \
1264 uscan = foldbuf + skiplen; \
1265 } \
1266 break; \
1267 case trie_utf8: \
1268 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1269 break; \
1270 case trie_plain: \
1271 uvc = (UV)*uc; \
1272 len = 1; \
1273 } \
1274 if (uvc < 256) { \
1275 charid = trie->charmap[ uvc ]; \
1276 } \
1277 else { \
1278 charid = 0; \
1279 if (widecharmap) { \
1280 SV** const svpp = hv_fetch(widecharmap, \
1281 (char*)&uvc, sizeof(UV), 0); \
1282 if (svpp) \
1283 charid = (U16)SvIV(*svpp); \
1284 } \
1285 } \
4cadc6a9
YO
1286} STMT_END
1287
4cadc6a9
YO
1288#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1289STMT_START { \
1290 while (s <= e) { \
1291 if ( (CoNd) \
fac1af77 1292 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1293 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1294 goto got_it; \
1295 s++; \
1296 } \
1297} STMT_END
1298
1299#define REXEC_FBC_UTF8_SCAN(CoDe) \
1300STMT_START { \
9a902117 1301 while (s < strend) { \
4cadc6a9 1302 CoDe \
9a902117 1303 s += UTF8SKIP(s); \
4cadc6a9
YO
1304 } \
1305} STMT_END
1306
1307#define REXEC_FBC_SCAN(CoDe) \
1308STMT_START { \
1309 while (s < strend) { \
1310 CoDe \
1311 s++; \
1312 } \
1313} STMT_END
1314
1315#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1316REXEC_FBC_UTF8_SCAN( \
1317 if (CoNd) { \
02d5137b 1318 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1319 goto got_it; \
1320 else \
1321 tmp = doevery; \
1322 } \
1323 else \
1324 tmp = 1; \
1325)
1326
1327#define REXEC_FBC_CLASS_SCAN(CoNd) \
1328REXEC_FBC_SCAN( \
1329 if (CoNd) { \
02d5137b 1330 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1331 goto got_it; \
1332 else \
1333 tmp = doevery; \
1334 } \
1335 else \
1336 tmp = 1; \
1337)
1338
1339#define REXEC_FBC_TRYIT \
02d5137b 1340if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1341 goto got_it
1342
e1d1eefb 1343#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1344 if (utf8_target) { \
e1d1eefb
YO
1345 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1346 } \
1347 else { \
1348 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1349 }
e1d1eefb 1350
786e8c11 1351#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1352 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1353 startpos, doutf8)
786e8c11 1354
cfaf538b
KW
1355
1356#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1357 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1358 tmp = TEST_NON_UTF8(tmp); \
1359 REXEC_FBC_UTF8_SCAN( \
1360 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1361 tmp = !tmp; \
1362 IF_SUCCESS; \
1363 } \
1364 else { \
1365 IF_FAIL; \
1366 } \
1367 ); \
1368
1369#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1370 if (s == reginfo->strbeg) { \
cfaf538b
KW
1371 tmp = '\n'; \
1372 } \
1373 else { \
9d9163fb 1374 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
cfaf538b
KW
1375 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1376 } \
1377 tmp = TeSt1_UtF8; \
1378 LOAD_UTF8_CHARCLASS_ALNUM(); \
1379 REXEC_FBC_UTF8_SCAN( \
1380 if (tmp == ! (TeSt2_UtF8)) { \
1381 tmp = !tmp; \
1382 IF_SUCCESS; \
1383 } \
1384 else { \
1385 IF_FAIL; \
1386 } \
1387 ); \
1388
63ac0dad
KW
1389/* The only difference between the BOUND and NBOUND cases is that
1390 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1391 * NBOUND. This is accomplished by passing it in either the if or else clause,
1392 * with the other one being empty */
1393#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1394 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1395
1396#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1397 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1398
1399#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1400 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1401
1402#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1403 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1404
63ac0dad
KW
1405
1406/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1407 * be passed in completely with the variable name being tested, which isn't
1408 * such a clean interface, but this is easier to read than it was before. We
1409 * are looking for the boundary (or non-boundary between a word and non-word
1410 * character. The utf8 and non-utf8 cases have the same logic, but the details
1411 * must be different. Find the "wordness" of the character just prior to this
1412 * one, and compare it with the wordness of this one. If they differ, we have
1413 * a boundary. At the beginning of the string, pretend that the previous
1414 * character was a new-line */
cfaf538b 1415#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1416 if (utf8_target) { \
cfaf538b 1417 UTF8_CODE \
63ac0dad
KW
1418 } \
1419 else { /* Not utf8 */ \
9d9163fb 1420 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1421 tmp = TEST_NON_UTF8(tmp); \
1422 REXEC_FBC_SCAN( \
1423 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1424 tmp = !tmp; \
1425 IF_SUCCESS; \
1426 } \
1427 else { \
1428 IF_FAIL; \
1429 } \
1430 ); \
1431 } \
02d5137b 1432 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1433 goto got_it;
1434
786e8c11 1435/* We know what class REx starts with. Try to find this position... */
02d5137b 1436/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1437/* annoyingly all the vars in this routine have different names from their counterparts
1438 in regmatch. /grrr */
1439
3c3eec57 1440STATIC char *
07be1b83 1441S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1442 const char *strend, regmatch_info *reginfo)
a687059c 1443{
73104a1b
KW
1444 dVAR;
1445 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1446 char *pat_string; /* The pattern's exactish string */
1447 char *pat_end; /* ptr to end char of pat_string */
1448 re_fold_t folder; /* Function for computing non-utf8 folds */
1449 const U8 *fold_array; /* array for folding ords < 256 */
1450 STRLEN ln;
1451 STRLEN lnc;
73104a1b
KW
1452 U8 c1;
1453 U8 c2;
1454 char *e;
1455 I32 tmp = 1; /* Scratch variable? */
ba44c216 1456 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1457 UV utf8_fold_flags = 0;
f9176b44 1458 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1459 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1460 with a result inverts that result, as 0^1 =
1461 1 and 1^1 = 0 */
1462 _char_class_number classnum;
1463
73104a1b 1464 RXi_GET_DECL(prog,progi);
2f7f8cb1 1465
73104a1b 1466 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1467
73104a1b
KW
1468 /* We know what class it must start with. */
1469 switch (OP(c)) {
1470 case ANYOF:
9aa1e39f 1471 case ANYOF_SYNTHETIC:
954a2af6 1472 case ANYOF_WARN_SUPER:
73104a1b
KW
1473 if (utf8_target) {
1474 REXEC_FBC_UTF8_CLASS_SCAN(
1475 reginclass(prog, c, (U8*)s, utf8_target));
1476 }
1477 else {
1478 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1479 }
1480 break;
1481 case CANY:
1482 REXEC_FBC_SCAN(
02d5137b 1483 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1484 goto got_it;
1485 else
1486 tmp = doevery;
1487 );
1488 break;
1489
1490 case EXACTFA:
984e6dd1 1491 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1492 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1493 goto do_exactf_utf8;
1494 }
1495 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1496 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1497 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1498
73104a1b
KW
1499 case EXACTF:
1500 if (utf8_target) {
16d951b7 1501
73104a1b
KW
1502 /* regcomp.c already folded this if pattern is in UTF-8 */
1503 utf8_fold_flags = 0;
1504 goto do_exactf_utf8;
1505 }
1506 fold_array = PL_fold;
1507 folder = foldEQ;
1508 goto do_exactf_non_utf8;
1509
1510 case EXACTFL:
984e6dd1 1511 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1512 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1513 goto do_exactf_utf8;
1514 }
1515 fold_array = PL_fold_locale;
1516 folder = foldEQ_locale;
1517 goto do_exactf_non_utf8;
3c760661 1518
73104a1b 1519 case EXACTFU_SS:
984e6dd1 1520 if (is_utf8_pat) {
73104a1b
KW
1521 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1522 }
1523 goto do_exactf_utf8;
16d951b7 1524
73104a1b
KW
1525 case EXACTFU_TRICKYFOLD:
1526 case EXACTFU:
984e6dd1
DM
1527 if (is_utf8_pat || utf8_target) {
1528 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1529 goto do_exactf_utf8;
1530 }
fac1af77 1531
73104a1b
KW
1532 /* Any 'ss' in the pattern should have been replaced by regcomp,
1533 * so we don't have to worry here about this single special case
1534 * in the Latin1 range */
1535 fold_array = PL_fold_latin1;
1536 folder = foldEQ_latin1;
1537
1538 /* FALL THROUGH */
1539
1540 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1541 are no glitches with fold-length differences
1542 between the target string and pattern */
1543
1544 /* The idea in the non-utf8 EXACTF* cases is to first find the
1545 * first character of the EXACTF* node and then, if necessary,
1546 * case-insensitively compare the full text of the node. c1 is the
1547 * first character. c2 is its fold. This logic will not work for
1548 * Unicode semantics and the german sharp ss, which hence should
1549 * not be compiled into a node that gets here. */
1550 pat_string = STRING(c);
1551 ln = STR_LEN(c); /* length to match in octets/bytes */
1552
1553 /* We know that we have to match at least 'ln' bytes (which is the
1554 * same as characters, since not utf8). If we have to match 3
1555 * characters, and there are only 2 availabe, we know without
1556 * trying that it will fail; so don't start a match past the
1557 * required minimum number from the far end */
1558 e = HOP3c(strend, -((I32)ln), s);
1559
02d5137b 1560 if (reginfo->intuit && e < s) {
73104a1b
KW
1561 e = s; /* Due to minlen logic of intuit() */
1562 }
fac1af77 1563
73104a1b
KW
1564 c1 = *pat_string;
1565 c2 = fold_array[c1];
1566 if (c1 == c2) { /* If char and fold are the same */
1567 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1568 }
1569 else {
1570 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1571 }
1572 break;
fac1af77 1573
73104a1b
KW
1574 do_exactf_utf8:
1575 {
1576 unsigned expansion;
1577
1578 /* If one of the operands is in utf8, we can't use the simpler folding
1579 * above, due to the fact that many different characters can have the
1580 * same fold, or portion of a fold, or different- length fold */
1581 pat_string = STRING(c);
1582 ln = STR_LEN(c); /* length to match in octets/bytes */
1583 pat_end = pat_string + ln;
984e6dd1 1584 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1585 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1586 : ln;
1587
1588 /* We have 'lnc' characters to match in the pattern, but because of
1589 * multi-character folding, each character in the target can match
1590 * up to 3 characters (Unicode guarantees it will never exceed
1591 * this) if it is utf8-encoded; and up to 2 if not (based on the
1592 * fact that the Latin 1 folds are already determined, and the
1593 * only multi-char fold in that range is the sharp-s folding to
1594 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1595 * string character. Adjust lnc accordingly, rounding up, so that
1596 * if we need to match at least 4+1/3 chars, that really is 5. */
1597 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1598 lnc = (lnc + expansion - 1) / expansion;
1599
1600 /* As in the non-UTF8 case, if we have to match 3 characters, and
1601 * only 2 are left, it's guaranteed to fail, so don't start a
1602 * match that would require us to go beyond the end of the string
1603 */
1604 e = HOP3c(strend, -((I32)lnc), s);
1605
02d5137b 1606 if (reginfo->intuit && e < s) {
73104a1b
KW
1607 e = s; /* Due to minlen logic of intuit() */
1608 }
0658cdde 1609
73104a1b
KW
1610 /* XXX Note that we could recalculate e to stop the loop earlier,
1611 * as the worst case expansion above will rarely be met, and as we
1612 * go along we would usually find that e moves further to the left.
1613 * This would happen only after we reached the point in the loop
1614 * where if there were no expansion we should fail. Unclear if
1615 * worth the expense */
1616
1617 while (s <= e) {
1618 char *my_strend= (char *)strend;
1619 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1620 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1621 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1622 {
1623 goto got_it;
1624 }
1625 s += (utf8_target) ? UTF8SKIP(s) : 1;
1626 }
1627 break;
1628 }
1629 case BOUNDL:
272d35c9 1630 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1631 FBC_BOUND(isWORDCHAR_LC,
1632 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1633 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1634 break;
1635 case NBOUNDL:
272d35c9 1636 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1637 FBC_NBOUND(isWORDCHAR_LC,
1638 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1639 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1640 break;
1641 case BOUND:
1642 FBC_BOUND(isWORDCHAR,
0eb30aeb 1643 isWORDCHAR_uni(tmp),
03940dc2 1644 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1645 break;
1646 case BOUNDA:
1647 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1648 isWORDCHAR_A(tmp),
1649 isWORDCHAR_A((U8*)s));
1650 break;
1651 case NBOUND:
1652 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1653 isWORDCHAR_uni(tmp),
03940dc2 1654 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1655 break;
1656 case NBOUNDA:
1657 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1658 isWORDCHAR_A(tmp),
1659 isWORDCHAR_A((U8*)s));
1660 break;
1661 case BOUNDU:
1662 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1663 isWORDCHAR_uni(tmp),
03940dc2 1664 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1665 break;
1666 case NBOUNDU:
1667 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1668 isWORDCHAR_uni(tmp),
03940dc2 1669 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1670 break;
73104a1b
KW
1671 case LNBREAK:
1672 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1673 is_LNBREAK_latin1_safe(s, strend)
1674 );
1675 break;
3018b823
KW
1676
1677 /* The argument to all the POSIX node types is the class number to pass to
1678 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1679
1680 case NPOSIXL:
1681 to_complement = 1;
1682 /* FALLTHROUGH */
1683
1684 case POSIXL:
272d35c9 1685 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1686 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1687 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1688 break;
3018b823
KW
1689
1690 case NPOSIXD:
1691 to_complement = 1;
1692 /* FALLTHROUGH */
1693
1694 case POSIXD:
1695 if (utf8_target) {
1696 goto posix_utf8;
1697 }
1698 goto posixa;
1699
1700 case NPOSIXA:
1701 if (utf8_target) {
1702 /* The complement of something that matches only ASCII matches all
1703 * UTF-8 variant code points, plus everything in ASCII that isn't
1704 * in the class */
1705 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1706 || ! _generic_isCC_A(*s, FLAGS(c)));
1707 break;
1708 }
1709
1710 to_complement = 1;
1711 /* FALLTHROUGH */
1712
73104a1b 1713 case POSIXA:
3018b823 1714 posixa:
73104a1b 1715 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1716 * byte invariant character. */
1717 REXEC_FBC_CLASS_SCAN(
1718 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1719 break;
3018b823
KW
1720
1721 case NPOSIXU:
1722 to_complement = 1;
1723 /* FALLTHROUGH */
1724
1725 case POSIXU:
1726 if (! utf8_target) {
1727 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1728 FLAGS(c))));
1729 }
1730 else {
1731
1732 posix_utf8:
1733 classnum = (_char_class_number) FLAGS(c);
1734 if (classnum < _FIRST_NON_SWASH_CC) {
1735 while (s < strend) {
1736
1737 /* We avoid loading in the swash as long as possible, but
1738 * should we have to, we jump to a separate loop. This
1739 * extra 'if' statement is what keeps this code from being
1740 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1741 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1742 goto found_above_latin1;
1743 }
1744 if ((UTF8_IS_INVARIANT(*s)
1745 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1746 classnum)))
1747 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1748 && to_complement ^ cBOOL(
1749 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1750 classnum))))
1751 {
02d5137b 1752 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1753 goto got_it;
1754 else {
1755 tmp = doevery;
1756 }
1757 }
1758 else {
1759 tmp = 1;
1760 }
1761 s += UTF8SKIP(s);
1762 }
1763 }
1764 else switch (classnum) { /* These classes are implemented as
1765 macros */
1766 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1767 revert the change of \v matching this */
1768 /* FALL THROUGH */
1769
1770 case _CC_ENUM_PSXSPC:
1771 REXEC_FBC_UTF8_CLASS_SCAN(
1772 to_complement ^ cBOOL(isSPACE_utf8(s)));
1773 break;
1774
1775 case _CC_ENUM_BLANK:
1776 REXEC_FBC_UTF8_CLASS_SCAN(
1777 to_complement ^ cBOOL(isBLANK_utf8(s)));
1778 break;
1779
1780 case _CC_ENUM_XDIGIT:
1781 REXEC_FBC_UTF8_CLASS_SCAN(
1782 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1783 break;
1784
1785 case _CC_ENUM_VERTSPACE:
1786 REXEC_FBC_UTF8_CLASS_SCAN(
1787 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1788 break;
1789
1790 case _CC_ENUM_CNTRL:
1791 REXEC_FBC_UTF8_CLASS_SCAN(
1792 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1793 break;
1794
1795 default:
1796 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1797 assert(0); /* NOTREACHED */
1798 }
1799 }
1800 break;
1801
1802 found_above_latin1: /* Here we have to load a swash to get the result
1803 for the current code point */
1804 if (! PL_utf8_swash_ptrs[classnum]) {
1805 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1806 PL_utf8_swash_ptrs[classnum] =
1807 _core_swash_init("utf8", swash_property_names[classnum],
1808 &PL_sv_undef, 1, 0, NULL, &flags);
1809 }
1810
1811 /* This is a copy of the loop above for swash classes, though using the
1812 * FBC macro instead of being expanded out. Since we've loaded the
1813 * swash, we don't have to check for that each time through the loop */
1814 REXEC_FBC_UTF8_CLASS_SCAN(
1815 to_complement ^ cBOOL(_generic_utf8(
1816 classnum,
1817 s,
1818 swash_fetch(PL_utf8_swash_ptrs[classnum],
1819 (U8 *) s, TRUE))));
73104a1b
KW
1820 break;
1821
1822 case AHOCORASICKC:
1823 case AHOCORASICK:
1824 {
1825 DECL_TRIE_TYPE(c);
1826 /* what trie are we using right now */
1827 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1828 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1829 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1830
1831 const char *last_start = strend - trie->minlen;
6148ee25 1832#ifdef DEBUGGING
73104a1b 1833 const char *real_start = s;
6148ee25 1834#endif
73104a1b
KW
1835 STRLEN maxlen = trie->maxlen;
1836 SV *sv_points;
1837 U8 **points; /* map of where we were in the input string
1838 when reading a given char. For ASCII this
1839 is unnecessary overhead as the relationship
1840 is always 1:1, but for Unicode, especially
1841 case folded Unicode this is not true. */
1842 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1843 U8 *bitmap=NULL;
1844
1845
1846 GET_RE_DEBUG_FLAGS_DECL;
1847
1848 /* We can't just allocate points here. We need to wrap it in
1849 * an SV so it gets freed properly if there is a croak while
1850 * running the match */
1851 ENTER;
1852 SAVETMPS;
1853 sv_points=newSV(maxlen * sizeof(U8 *));
1854 SvCUR_set(sv_points,
1855 maxlen * sizeof(U8 *));
1856 SvPOK_on(sv_points);
1857 sv_2mortal(sv_points);
1858 points=(U8**)SvPV_nolen(sv_points );
1859 if ( trie_type != trie_utf8_fold
1860 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1861 {
1862 if (trie->bitmap)
1863 bitmap=(U8*)trie->bitmap;
1864 else
1865 bitmap=(U8*)ANYOF_BITMAP(c);
1866 }
1867 /* this is the Aho-Corasick algorithm modified a touch
1868 to include special handling for long "unknown char" sequences.
1869 The basic idea being that we use AC as long as we are dealing
1870 with a possible matching char, when we encounter an unknown char
1871 (and we have not encountered an accepting state) we scan forward
1872 until we find a legal starting char.
1873 AC matching is basically that of trie matching, except that when
1874 we encounter a failing transition, we fall back to the current
1875 states "fail state", and try the current char again, a process
1876 we repeat until we reach the root state, state 1, or a legal
1877 transition. If we fail on the root state then we can either
1878 terminate if we have reached an accepting state previously, or
1879 restart the entire process from the beginning if we have not.
1880
1881 */
1882 while (s <= last_start) {
1883 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1884 U8 *uc = (U8*)s;
1885 U16 charid = 0;
1886 U32 base = 1;
1887 U32 state = 1;
1888 UV uvc = 0;
1889 STRLEN len = 0;
1890 STRLEN foldlen = 0;
1891 U8 *uscan = (U8*)NULL;
1892 U8 *leftmost = NULL;
1893#ifdef DEBUGGING
1894 U32 accepted_word= 0;
786e8c11 1895#endif
73104a1b
KW
1896 U32 pointpos = 0;
1897
1898 while ( state && uc <= (U8*)strend ) {
1899 int failed=0;
1900 U32 word = aho->states[ state ].wordnum;
1901
1902 if( state==1 ) {
1903 if ( bitmap ) {
1904 DEBUG_TRIE_EXECUTE_r(
1905 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 dump_exec_pos( (char *)uc, c, strend, real_start,
1907 (char *)uc, utf8_target );
1908 PerlIO_printf( Perl_debug_log,
1909 " Scanning for legal start char...\n");
1910 }
1911 );
1912 if (utf8_target) {
1913 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1914 uc += UTF8SKIP(uc);
1915 }
1916 } else {
1917 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918 uc++;
1919 }
786e8c11 1920 }
73104a1b 1921 s= (char *)uc;
07be1b83 1922 }
73104a1b
KW
1923 if (uc >(U8*)last_start) break;
1924 }
1925
1926 if ( word ) {
1927 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1928 if (!leftmost || lpos < leftmost) {
1929 DEBUG_r(accepted_word=word);
1930 leftmost= lpos;
7016d6eb 1931 }
73104a1b 1932 if (base==0) break;
7016d6eb 1933
73104a1b
KW
1934 }
1935 points[pointpos++ % maxlen]= uc;
1936 if (foldlen || uc < (U8*)strend) {
1937 REXEC_TRIE_READ_CHAR(trie_type, trie,
1938 widecharmap, uc,
1939 uscan, len, uvc, charid, foldlen,
1940 foldbuf, uniflags);
1941 DEBUG_TRIE_EXECUTE_r({
1942 dump_exec_pos( (char *)uc, c, strend,
1943 real_start, s, utf8_target);
1944 PerlIO_printf(Perl_debug_log,
1945 " Charid:%3u CP:%4"UVxf" ",
1946 charid, uvc);
1947 });
1948 }
1949 else {
1950 len = 0;
1951 charid = 0;
1952 }
07be1b83 1953
73104a1b
KW
1954
1955 do {
6148ee25 1956#ifdef DEBUGGING
73104a1b 1957 word = aho->states[ state ].wordnum;
6148ee25 1958#endif
73104a1b
KW
1959 base = aho->states[ state ].trans.base;
1960
1961 DEBUG_TRIE_EXECUTE_r({
1962 if (failed)
1963 dump_exec_pos( (char *)uc, c, strend, real_start,
1964 s, utf8_target );
1965 PerlIO_printf( Perl_debug_log,
1966 "%sState: %4"UVxf", word=%"UVxf,
1967 failed ? " Fail transition to " : "",
1968 (UV)state, (UV)word);
1969 });
1970 if ( base ) {
1971 U32 tmp;
1972 I32 offset;
1973 if (charid &&
1974 ( ((offset = base + charid
1975 - 1 - trie->uniquecharcount)) >= 0)
1976 && ((U32)offset < trie->lasttrans)
1977 && trie->trans[offset].check == state
1978 && (tmp=trie->trans[offset].next))
1979 {
1980 DEBUG_TRIE_EXECUTE_r(
1981 PerlIO_printf( Perl_debug_log," - legal\n"));
1982 state = tmp;
1983 break;
07be1b83
YO
1984 }
1985 else {
786e8c11 1986 DEBUG_TRIE_EXECUTE_r(
73104a1b 1987 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1988 failed = 1;
73104a1b 1989 state = aho->fail[state];
07be1b83 1990 }
07be1b83 1991 }
73104a1b
KW
1992 else {
1993 /* we must be accepting here */
1994 DEBUG_TRIE_EXECUTE_r(
1995 PerlIO_printf( Perl_debug_log," - accepting\n"));
1996 failed = 1;
1997 break;
786e8c11 1998 }
73104a1b
KW
1999 } while(state);
2000 uc += len;
2001 if (failed) {
2002 if (leftmost)
2003 break;
2004 if (!state) state = 1;
07be1b83 2005 }
73104a1b
KW
2006 }
2007 if ( aho->states[ state ].wordnum ) {
2008 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2009 if (!leftmost || lpos < leftmost) {
2010 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2011 leftmost = lpos;
07be1b83
YO
2012 }
2013 }
73104a1b
KW
2014 if (leftmost) {
2015 s = (char*)leftmost;
2016 DEBUG_TRIE_EXECUTE_r({
2017 PerlIO_printf(
2018 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2019 (UV)accepted_word, (IV)(s - real_start)
2020 );
2021 });
02d5137b 2022 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2023 FREETMPS;
2024 LEAVE;
2025 goto got_it;
2026 }
2027 s = HOPc(s,1);
2028 DEBUG_TRIE_EXECUTE_r({
2029 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2030 });
2031 } else {
2032 DEBUG_TRIE_EXECUTE_r(
2033 PerlIO_printf( Perl_debug_log,"No match.\n"));
2034 break;
2035 }
2036 }
2037 FREETMPS;
2038 LEAVE;
2039 }
2040 break;
2041 default:
2042 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2043 break;
2044 }
2045 return 0;
2046 got_it:
2047 return s;
6eb5f6b9
JH
2048}
2049
60165aa4
DM
2050/* set RX_SAVED_COPY, RX_SUBBEG etc.
2051 * flags have same meanings as with regexec_flags() */
2052
749f4950
DM
2053static void
2054S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2055 char *strbeg,
2056 char *strend,
2057 SV *sv,
2058 U32 flags,
2059 bool utf8_target)
2060{
2061 struct regexp *const prog = ReANY(rx);
2062
60165aa4
DM
2063 if (flags & REXEC_COPY_STR) {
2064#ifdef PERL_ANY_COW
2065 if (SvCANCOW(sv)) {
2066 if (DEBUG_C_TEST) {
2067 PerlIO_printf(Perl_debug_log,
2068 "Copy on write: regexp capture, type %d\n",
2069 (int) SvTYPE(sv));
2070 }
5411a0e5
DM
2071 /* Create a new COW SV to share the match string and store
2072 * in saved_copy, unless the current COW SV in saved_copy
2073 * is valid and suitable for our purpose */
2074 if (( prog->saved_copy
2075 && SvIsCOW(prog->saved_copy)
2076 && SvPOKp(prog->saved_copy)
2077 && SvIsCOW(sv)
2078 && SvPOKp(sv)
2079 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2080 {
5411a0e5
DM
2081 /* just reuse saved_copy SV */
2082 if (RXp_MATCH_COPIED(prog)) {
2083 Safefree(prog->subbeg);
2084 RXp_MATCH_COPIED_off(prog);
2085 }
2086 }
2087 else {
2088 /* create new COW SV to share string */
a76b0e90
DM
2089 RX_MATCH_COPY_FREE(rx);
2090 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2091 }
5411a0e5
DM
2092 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2093 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2094 prog->sublen = strend - strbeg;
2095 prog->suboffset = 0;
2096 prog->subcoffset = 0;
2097 } else
2098#endif
2099 {
2100 I32 min = 0;
2101 I32 max = strend - strbeg;
2102 I32 sublen;
2103
2104 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2105 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2106 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2107 ) { /* don't copy $' part of string */
2108 U32 n = 0;
2109 max = -1;
2110 /* calculate the right-most part of the string covered
2111 * by a capture. Due to look-ahead, this may be to
2112 * the right of $&, so we have to scan all captures */
2113 while (n <= prog->lastparen) {
2114 if (prog->offs[n].end > max)
2115 max = prog->offs[n].end;
2116 n++;
2117 }
2118 if (max == -1)
2119 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2120 ? prog->offs[0].start
2121 : 0;
2122 assert(max >= 0 && max <= strend - strbeg);
2123 }
2124
2125 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2126 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2127 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2128 ) { /* don't copy $` part of string */
2129 U32 n = 0;
2130 min = max;
2131 /* calculate the left-most part of the string covered
2132 * by a capture. Due to look-behind, this may be to
2133 * the left of $&, so we have to scan all captures */
2134 while (min && n <= prog->lastparen) {
2135 if ( prog->offs[n].start != -1
2136 && prog->offs[n].start < min)
2137 {
2138 min = prog->offs[n].start;
2139 }
2140 n++;
2141 }
2142 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2143 && min > prog->offs[0].end
2144 )
2145 min = prog->offs[0].end;
2146
2147 }
2148
2149 assert(min >= 0 && min <= max && min <= strend - strbeg);
2150 sublen = max - min;
2151
2152 if (RX_MATCH_COPIED(rx)) {
2153 if (sublen > prog->sublen)
2154 prog->subbeg =
2155 (char*)saferealloc(prog->subbeg, sublen+1);
2156 }
2157 else
2158 prog->subbeg = (char*)safemalloc(sublen+1);
2159 Copy(strbeg + min, prog->subbeg, sublen, char);
2160 prog->subbeg[sublen] = '\0';
2161 prog->suboffset = min;
2162 prog->sublen = sublen;
2163 RX_MATCH_COPIED_on(rx);
2164 }
2165 prog->subcoffset = prog->suboffset;
2166 if (prog->suboffset && utf8_target) {
2167 /* Convert byte offset to chars.
2168 * XXX ideally should only compute this if @-/@+
2169 * has been seen, a la PL_sawampersand ??? */
2170
2171 /* If there's a direct correspondence between the
2172 * string which we're matching and the original SV,
2173 * then we can use the utf8 len cache associated with
2174 * the SV. In particular, it means that under //g,
2175 * sv_pos_b2u() will use the previously cached
2176 * position to speed up working out the new length of
2177 * subcoffset, rather than counting from the start of
2178 * the string each time. This stops
2179 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2180 * from going quadratic */
2181 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2182 sv_pos_b2u(sv, &(prog->subcoffset));
2183 else
2184 prog->subcoffset = utf8_length((U8*)strbeg,
2185 (U8*)(strbeg+prog->suboffset));
2186 }
2187 }
2188 else {
2189 RX_MATCH_COPY_FREE(rx);
2190 prog->subbeg = strbeg;
2191 prog->suboffset = 0;
2192 prog->subcoffset = 0;
2193 prog->sublen = strend - strbeg;
2194 }
2195}
2196
2197
2198
fae667d5 2199
6eb5f6b9
JH
2200/*
2201 - regexec_flags - match a regexp against a string
2202 */
2203I32
5aaab254 2204Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2205 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2206/* stringarg: the point in the string at which to begin matching */
2207/* strend: pointer to null at end of string */
2208/* strbeg: real beginning of string */
2209/* minend: end of match must be >= minend bytes after stringarg. */
2210/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2211 * itself is accessed via the pointers above */
2212/* data: May be used for some additional optimizations.
d058ec57 2213 Currently unused. */
8fd1a950
DM
2214/* nosave: For optimizations. */
2215
6eb5f6b9 2216{
97aff369 2217 dVAR;
8d919b0a 2218 struct regexp *const prog = ReANY(rx);
5aaab254 2219 char *s;
eb578fdb 2220 regnode *c;
03c83e26 2221 char *startpos;
6eb5f6b9
JH
2222 I32 minlen; /* must match at least this many chars */
2223 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9 2224 I32 end_shift = 0; /* Same for the end. */ /* CC */
f2ed9b32 2225 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2226 I32 multiline;
f8fc2ecf 2227 RXi_GET_DECL(prog,progi);
02d5137b
DM
2228 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2229 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2230 regexp_paren_pair *swap = NULL;
006f26b2 2231 I32 oldsave;
a3621e74
YO
2232 GET_RE_DEBUG_FLAGS_DECL;
2233
7918f24d 2234 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2235 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2236
2237 /* Be paranoid... */
03c83e26 2238 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2239 Perl_croak(aTHX_ "NULL regexp parameter");
2240 return 0;
2241 }
2242
6c3fea77 2243 DEBUG_EXECUTE_r(
03c83e26 2244 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2245 "Matching");
2246 );
8adc0f72 2247
b342a604
DM
2248 startpos = stringarg;
2249
03c83e26 2250 if (prog->extflags & RXf_GPOS_SEEN) {
d307c076
DM
2251 MAGIC *mg;
2252
fef7148b
DM
2253 /* set reginfo->ganch, the position where \G can match */
2254
2255 reginfo->ganch =
2256 (flags & REXEC_IGNOREPOS)
2257 ? stringarg /* use start pos rather than pos() */
2258 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2259 ? strbeg + mg->mg_len /* Defined pos() */
2260 : strbeg; /* pos() not defined; use start of string */
2261
2262 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2263 "GPOS ganch set to strbeg[%"IVdf"]\n", reginfo->ganch - strbeg));
2264
03c83e26
DM
2265 /* in the presence of \G, we may need to start looking earlier in
2266 * the string than the suggested start point of stringarg:
2267 * if gofs->prog is set, then that's a known, fixed minimum
2268 * offset, such as
2269 * /..\G/: gofs = 2
2270 * /ab|c\G/: gofs = 1
2271 * or if the minimum offset isn't known, then we have to go back
2272 * to the start of the string, e.g. /w+\G/
2273 */
2bfbe302
DM
2274
2275 if (prog->extflags & RXf_ANCH_GPOS) {
2276 startpos = reginfo->ganch - prog->gofs;
2277 if (startpos <
2278 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2279 {
2280 DEBUG_r(PerlIO_printf(Perl_debug_log,
2281 "fail: ganch-gofs before earliest possible start\n"));
2282 return 0;
2283 }
2284 }
2285 else if (prog->gofs) {
b342a604
DM
2286 if (startpos - prog->gofs < strbeg)
2287 startpos = strbeg;
2288 else
2289 startpos -= prog->gofs;
03c83e26 2290 }
b342a604
DM
2291 else if (prog->extflags & RXf_GPOS_FLOAT)
2292 startpos = strbeg;
03c83e26
DM
2293 }
2294
2295 minlen = prog->minlen;
b342a604 2296 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2297 DEBUG_r(PerlIO_printf(Perl_debug_log,
2298 "Regex match can't succeed, so not even tried\n"));
2299 return 0;
2300 }
2301
63a3746a
DM
2302 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2303 * which will call destuctors to reset PL_regmatch_state, free higher
2304 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2305 * regmatch_info_aux_eval */
2306
2307 oldsave = PL_savestack_ix;
2308
dfa77d06
DM
2309 s = startpos;
2310
e322109a 2311 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2312 && !(flags & REXEC_CHECKED))
2313 {
dfa77d06 2314 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2315 flags, NULL);
dfa77d06 2316 if (!s)
7fadf4a7
DM
2317 return 0;
2318
e322109a 2319 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2320 /* we can match based purely on the result of INTUIT.
2321 * Set up captures etc just for $& and $-[0]
2322 * (an intuit-only match wont have $1,$2,..) */
2323 assert(!prog->nparens);
d5e7783a
DM
2324
2325 /* s/// doesn't like it if $& is earlier than where we asked it to
2326 * start searching (which can happen on something like /.\G/) */
2327 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2328 && (s < stringarg))
2329 {
2330 /* this should only be possible under \G */
2331 assert(prog->extflags & RXf_GPOS_SEEN);
2332 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2333 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2334 goto phooey;
2335 }
2336
7fadf4a7
DM
2337 /* match via INTUIT shouldn't have any captures.
2338 * Let @-, @+, $^N know */
2339 prog->lastparen = prog->lastcloseparen = 0;
2340 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2341 prog->offs[0].start = s - strbeg;
2342 prog->offs[0].end = utf8_target
2343 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2344 : s - strbeg + prog->minlenret;
7fadf4a7 2345 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2346 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2347 strbeg, strend,
2348 sv, flags, utf8_target);
2349
7fadf4a7
DM
2350 return 1;
2351 }
2352 }
2353
6c3fea77 2354 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2355
dfa77d06 2356 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2357 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2358 "String too short [regexec_flags]...\n"));
2359 goto phooey;
1aa99e6b 2360 }
1de06328 2361
6eb5f6b9 2362 /* Check validity of program. */
f8fc2ecf 2363 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2364 Perl_croak(aTHX_ "corrupted regexp program");
2365 }
2366
272d35c9 2367 RX_MATCH_TAINTED_off(rx);
6eb5f6b9 2368
6c3fea77
DM
2369 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2370 reginfo->intuit = 0;
2371 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2372 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2373 reginfo->warned = FALSE;
9d9163fb 2374 reginfo->strbeg = strbeg;
02d5137b 2375 reginfo->sv = sv;
1cb48e53 2376 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2377 reginfo->strend = strend;
6eb5f6b9 2378 /* see how far we have to get to not match where we matched before */
fe3974be 2379 reginfo->till = stringarg + minend;
6eb5f6b9 2380
82c23608
FC
2381 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2382 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2383 S_cleanup_regmatch_info_aux has executed (registered by
2384 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2385 magic belonging to this SV.
2386 Not newSVsv, either, as it does not COW.
2387 */
2388 reginfo->sv = newSV(0);
2389 sv_setsv(reginfo->sv, sv);
2390 SAVEFREESV(reginfo->sv);
2391 }
2392
331b2dcc
DM
2393 /* reserve next 2 or 3 slots in PL_regmatch_state:
2394 * slot N+0: may currently be in use: skip it
2395 * slot N+1: use for regmatch_info_aux struct
2396 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2397 * slot N+3: ready for use by regmatch()
2398 */
bf2039a9 2399
331b2dcc
DM
2400 {
2401 regmatch_state *old_regmatch_state;
2402 regmatch_slab *old_regmatch_slab;
2403 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2404
2405 /* on first ever match, allocate first slab */
2406 if (!PL_regmatch_slab) {
2407 Newx(PL_regmatch_slab, 1, regmatch_slab);
2408 PL_regmatch_slab->prev = NULL;
2409 PL_regmatch_slab->next = NULL;
2410 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2411 }
bf2039a9 2412
331b2dcc
DM
2413 old_regmatch_state = PL_regmatch_state;
2414 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2415
331b2dcc
DM
2416 for (i=0; i <= max; i++) {
2417 if (i == 1)
2418 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2419 else if (i ==2)
2420 reginfo->info_aux_eval =
2421 reginfo->info_aux->info_aux_eval =
2422 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2423
331b2dcc
DM
2424 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2425 PL_regmatch_state = S_push_slab(aTHX);
2426 }
bf2039a9 2427
331b2dcc
DM
2428 /* note initial PL_regmatch_state position; at end of match we'll
2429 * pop back to there and free any higher slabs */
bf2039a9 2430
331b2dcc
DM
2431 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2432 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2433 reginfo->info_aux->poscache = NULL;
bf2039a9 2434
331b2dcc 2435 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2436
331b2dcc
DM
2437 if ((prog->extflags & RXf_EVAL_SEEN))
2438 S_setup_eval_state(aTHX_ reginfo);
2439 else
2440 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2441 }
d3aa529c 2442
6eb5f6b9 2443 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2444
288b8c02 2445 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2446 /* We have to be careful. If the previous successful match
2447 was from this regex we don't want a subsequent partially
2448 successful match to clobber the old results.
2449 So when we detect this possibility we add a swap buffer
d8da0584
KW
2450 to the re, and switch the buffer each match. If we fail,
2451 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2452 */
2453 swap = prog->offs;
2454 /* do we need a save destructor here for eval dies? */
2455 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2456 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2457 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2458 PTR2UV(prog),
2459 PTR2UV(swap),
2460 PTR2UV(prog->offs)
2461 ));
c74340f9 2462 }
6eb5f6b9
JH
2463
2464 /* Simplest case: anchored match need be tried only once. */
2465 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2466 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
3542935d 2467 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2468 goto got_it;
bbe252da
YO
2469 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2470 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2471 {
2472 char *end;
2473
2474 if (minlen)
2475 dontbother = minlen - 1;
1aa99e6b 2476 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2477 /* for multiline we only have to try after newlines */
33b8afdf 2478 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2479 /* because of the goto we can not easily reuse the macros for bifurcating the
2480 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2481 if (utf8_target) {
2482 if (s == startpos)
2483 goto after_try_utf8;
2484 while (1) {
02d5137b 2485 if (regtry(reginfo, &s)) {
92f3d482
YO
2486 goto got_it;
2487 }
2488 after_try_utf8:
2489 if (s > end) {
2490 goto phooey;
2491 }
2492 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2493 s = re_intuit_start(rx, sv, strbeg,
2494 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2495 if (!s) {
2496 goto phooey;
2497 }
2498 }
2499 else {
2500 s += UTF8SKIP(s);
2501 }
2502 }
2503 } /* end search for check string in unicode */
2504 else {
2505 if (s == startpos) {
2506 goto after_try_latin;
2507 }
2508 while (1) {
02d5137b 2509 if (regtry(reginfo, &s)) {
92f3d482
YO
2510 goto got_it;
2511 }
2512 after_try_latin:
2513 if (s > end) {
2514 goto phooey;
2515 }
2516 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2517 s = re_intuit_start(rx, sv, strbeg,
2518 s + 1, strend, flags, NULL);
92f3d482
YO
2519 if (!s) {
2520 goto phooey;
2521 }
2522 }
2523 else {
2524 s++;
2525 }
2526 }
2527 } /* end search for check string in latin*/
2528 } /* end search for check string */
2529 else { /* search for newline */
2530 if (s > startpos) {
2531 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2532 s--;
92f3d482 2533 }
21eede78
YO
2534 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2535 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2536 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2537 if (regtry(reginfo, &s))
6eb5f6b9
JH
2538 goto got_it;
2539 }
92f3d482
YO
2540 }
2541 } /* end search for newline */
2542 } /* end anchored/multiline check string search */
6eb5f6b9 2543 goto phooey;
bbe252da 2544 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2545 {
2bfbe302
DM
2546 /* For anchored \G, the only position it can match from is
2547 * (ganch-gofs); we already set startpos to this above; if intuit
2548 * moved us on from there, we can't possibly succeed */
2549 assert(startpos == reginfo->ganch - prog->gofs);
2550 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2551 goto got_it;
2552 goto phooey;
2553 }
2554
2555 /* Messy cases: unanchored match. */
bbe252da 2556 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2557 /* we have /x+whatever/ */
984e6dd1 2558 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2559 char ch;
bf93d4cc
GS
2560#ifdef DEBUGGING
2561 int did_match = 0;
2562#endif
f2ed9b32 2563 if (utf8_target) {
7e0d5ad7
KW
2564 if (! prog->anchored_utf8) {
2565 to_utf8_substr(prog);
2566 }
2567 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2568 REXEC_FBC_SCAN(
6eb5f6b9 2569 if (*s == ch) {
a3621e74 2570 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2571 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2572 s += UTF8SKIP(s);
2573 while (s < strend && *s == ch)
2574 s += UTF8SKIP(s);
2575 }
4cadc6a9 2576 );
7e0d5ad7 2577
6eb5f6b9
JH
2578 }
2579 else {
7e0d5ad7
KW
2580 if (! prog->anchored_substr) {
2581 if (! to_byte_substr(prog)) {
6b54ddc5 2582 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2583 }
2584 }
2585 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2586 REXEC_FBC_SCAN(
6eb5f6b9 2587 if (*s == ch) {
a3621e74 2588 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2589 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2590 s++;
2591 while (s < strend && *s == ch)
2592 s++;
2593 }
4cadc6a9 2594 );
6eb5f6b9 2595 }
a3621e74 2596 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2597 PerlIO_printf(Perl_debug_log,
b7953727
JH
2598 "Did not find anchored character...\n")
2599 );
6eb5f6b9 2600 }
a0714e2c
SS
2601 else if (prog->anchored_substr != NULL
2602 || prog->anchored_utf8 != NULL
2603 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2604 && prog->float_max_offset < strend - s)) {
2605 SV *must;
2606 I32 back_max;
2607 I32 back_min;
2608 char *last;
6eb5f6b9 2609 char *last1; /* Last position checked before */
bf93d4cc
GS
2610#ifdef DEBUGGING
2611 int did_match = 0;
2612#endif
33b8afdf 2613 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2614 if (utf8_target) {
2615 if (! prog->anchored_utf8) {
2616 to_utf8_substr(prog);
2617 }
2618 must = prog->anchored_utf8;
2619 }
2620 else {
2621 if (! prog->anchored_substr) {
2622 if (! to_byte_substr(prog)) {
6b54ddc5 2623 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2624 }
2625 }
2626 must = prog->anchored_substr;
2627 }
33b8afdf
JH
2628 back_max = back_min = prog->anchored_offset;
2629 } else {
7e0d5ad7
KW
2630 if (utf8_target) {
2631 if (! prog->float_utf8) {
2632 to_utf8_substr(prog);
2633 }
2634 must = prog->float_utf8;
2635 }
2636 else {
2637 if (! prog->float_substr) {
2638 if (! to_byte_substr(prog)) {
6b54ddc5 2639 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2640 }
2641 }
2642 must = prog->float_substr;
2643 }
33b8afdf
JH
2644 back_max = prog->float_max_offset;
2645 back_min = prog->float_min_offset;
2646 }
1de06328 2647
1de06328
YO
2648 if (back_min<0) {
2649 last = strend;
2650 } else {
2651 last = HOP3c(strend, /* Cannot start after this */
2652 -(I32)(CHR_SVLEN(must)
2653 - (SvTAIL(must) != 0) + back_min), strbeg);
2654 }
9d9163fb 2655 if (s > reginfo->strbeg)
6eb5f6b9
JH
2656 last1 = HOPc(s, -1);
2657 else
2658 last1 = s - 1; /* bogus */
2659
a0288114 2660 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2661 check_substr==must. */
6eb5f6b9
JH
2662 dontbother = end_shift;
2663 strend = HOPc(strend, -dontbother);
2664 while ( (s <= last) &&
c33e64f0 2665 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2666 (unsigned char*)strend, must,
c33e64f0 2667 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2668 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2669 if (HOPc(s, -back_max) > last1) {
2670 last1 = HOPc(s, -back_min);
2671 s = HOPc(s, -back_max);
2672 }
2673 else {
9d9163fb
DM
2674 char * const t = (last1 >= reginfo->strbeg)
2675 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2676
2677 last1 = HOPc(s, -back_min);
52657f30 2678 s = t;
6eb5f6b9 2679 }
f2ed9b32 2680 if (utf8_target) {
6eb5f6b9 2681 while (s <= last1) {
02d5137b 2682 if (regtry(reginfo, &s))
6eb5f6b9 2683 goto got_it;
7016d6eb
DM
2684 if (s >= last1) {
2685 s++; /* to break out of outer loop */
2686 break;
2687 }
2688 s += UTF8SKIP(s);
6eb5f6b9
JH
2689 }
2690 }
2691 else {
2692 while (s <= last1) {
02d5137b 2693 if (regtry(reginfo, &s))
6eb5f6b9
JH
2694 goto got_it;
2695 s++;
2696 }
2697 }
2698 }
ab3bbdeb 2699 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2700 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2701 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2702 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2703 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2704 ? "anchored" : "floating"),
ab3bbdeb
YO
2705 quoted, RE_SV_TAIL(must));
2706 });
6eb5f6b9
JH
2707 goto phooey;
2708 }
f8fc2ecf 2709 else if ( (c = progi->regstclass) ) {
f14c76ed 2710 if (minlen) {
f8fc2ecf 2711 const OPCODE op = OP(progi->regstclass);
66e933ab 2712 /* don't bother with what can't match */
786e8c11 2713 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2714 strend = HOPc(strend, -(minlen - 1));
2715 }
a3621e74 2716 DEBUG_EXECUTE_r({
be8e71aa 2717 SV * const prop = sv_newmortal();
32fc9b6a 2718 regprop(prog, prop, c);
0df25f3d 2719 {
f2ed9b32 2720 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2721 s,strend-s,60);
0df25f3d 2722 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2723 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2724 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2725 quoted, (int)(strend - s));
0df25f3d 2726 }
ffc61ed2 2727 });
f9176b44 2728 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2729 goto got_it;
07be1b83 2730 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2731 }
2732 else {
2733 dontbother = 0;
a0714e2c 2734 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2735 /* Trim the end. */
6af40bd7 2736 char *last= NULL;
33b8afdf 2737 SV* float_real;
c33e64f0
FC
2738 STRLEN len;
2739 const char *little;
33b8afdf 2740
7e0d5ad7
KW
2741 if (utf8_target) {
2742 if (! prog->float_utf8) {
2743 to_utf8_substr(prog);
2744 }
2745 float_real = prog->float_utf8;
2746 }
2747 else {
2748 if (! prog->float_substr) {
2749 if (! to_byte_substr(prog)) {
6b54ddc5 2750 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2751 }
2752 }
2753 float_real = prog->float_substr;
2754 }
d6a28714 2755
c33e64f0
FC
2756 little = SvPV_const(float_real, len);
2757 if (SvTAIL(float_real)) {
7f18ad16
KW
2758 /* This means that float_real contains an artificial \n on
2759 * the end due to the presence of something like this:
2760 * /foo$/ where we can match both "foo" and "foo\n" at the
2761 * end of the string. So we have to compare the end of the
2762 * string first against the float_real without the \n and
2763 * then against the full float_real with the string. We
2764 * have to watch out for cases where the string might be
2765 * smaller than the float_real or the float_real without
2766 * the \n. */
1a13b075
YO
2767 char *checkpos= strend - len;
2768 DEBUG_OPTIMISE_r(
2769 PerlIO_printf(Perl_debug_log,
2770 "%sChecking for float_real.%s\n",
2771 PL_colors[4], PL_colors[5]));
2772 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2773 /* can't match, even if we remove the trailing \n
2774 * string is too short to match */
1a13b075
YO
2775 DEBUG_EXECUTE_r(
2776 PerlIO_printf(Perl_debug_log,
2777 "%sString shorter than required trailing substring, cannot match.%s\n",
2778 PL_colors[4], PL_colors[5]));
2779 goto phooey;
2780 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2781 /* can match, the end of the string matches without the
2782 * "\n" */
1a13b075
YO
2783 last = checkpos + 1;
2784 } else if (checkpos < strbeg) {
7f18ad16
KW
2785 /* cant match, string is too short when the "\n" is
2786 * included */
1a13b075
YO
2787 DEBUG_EXECUTE_r(
2788 PerlIO_printf(Perl_debug_log,
2789 "%sString does not contain required trailing substring, cannot match.%s\n",
2790 PL_colors[4], PL_colors[5]));
2791 goto phooey;
2792 } else if (!multiline) {
7f18ad16
KW
2793 /* non multiline match, so compare with the "\n" at the
2794 * end of the string */
1a13b075
YO
2795 if (memEQ(checkpos, little, len)) {
2796 last= checkpos;
2797 } else {
2798 DEBUG_EXECUTE_r(
2799 PerlIO_printf(Perl_debug_log,
2800 "%sString does not contain required trailing substring, cannot match.%s\n",
2801 PL_colors[4], PL_colors[5]));
2802 goto phooey;
2803 }
2804 } else {
7f18ad16
KW
2805 /* multiline match, so we have to search for a place
2806 * where the full string is located */
d6a28714 2807 goto find_last;
1a13b075 2808 }
c33e64f0 2809 } else {
d6a28714 2810 find_last:
9041c2e3 2811 if (len)
d6a28714 2812 last = rninstr(s, strend, little, little + len);
b8c5462f 2813 else
a0288114 2814 last = strend; /* matching "$" */
b8c5462f 2815 }
6af40bd7 2816 if (!last) {
7f18ad16
KW
2817 /* at one point this block contained a comment which was
2818 * probably incorrect, which said that this was a "should not
2819 * happen" case. Even if it was true when it was written I am
2820 * pretty sure it is not anymore, so I have removed the comment
2821 * and replaced it with this one. Yves */
6bda09f9
YO
2822 DEBUG_EXECUTE_r(
2823 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2824 "String does not contain required substring, cannot match.\n"
2825 ));
2826 goto phooey;
bf93d4cc 2827 }
d6a28714
JH
2828 dontbother = strend - last + prog->float_min_offset;
2829 }
2830 if (minlen && (dontbother < minlen))
2831 dontbother = minlen - 1;
2832 strend -= dontbother; /* this one's always in bytes! */
2833 /* We don't know much -- general case. */
f2ed9b32 2834 if (utf8_target) {
d6a28714 2835 for (;;) {
02d5137b 2836 if (regtry(reginfo, &s))
d6a28714
JH
2837 goto got_it;
2838 if (s >= strend)
2839 break;
b8c5462f 2840 s += UTF8SKIP(s);
d6a28714
JH
2841 };
2842 }
2843 else {
2844 do {
02d5137b 2845 if (regtry(reginfo, &s))
d6a28714
JH
2846 goto got_it;
2847 } while (s++ < strend);
2848 }
2849 }
2850
2851 /* Failure. */
2852 goto phooey;
2853
2854got_it:
d5e7783a
DM
2855 /* s/// doesn't like it if $& is earlier than where we asked it to
2856 * start searching (which can happen on something like /.\G/) */
2857 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2858 && (prog->offs[0].start < stringarg - strbeg))
2859 {
2860 /* this should only be possible under \G */
2861 assert(prog->extflags & RXf_GPOS_SEEN);
2862 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2863 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2864 goto phooey;
2865 }
2866
495f47a5
DM
2867 DEBUG_BUFFERS_r(
2868 if (swap)
2869 PerlIO_printf(Perl_debug_log,
2870 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2871 PTR2UV(prog),
2872 PTR2UV(swap)
2873 );
2874 );
e9105d30 2875 Safefree(swap);
d6a28714 2876
bf2039a9
DM
2877 /* clean up; this will trigger destructors that will free all slabs
2878 * above the current one, and cleanup the regmatch_info_aux
2879 * and regmatch_info_aux_eval sructs */
8adc0f72 2880
006f26b2
DM
2881 LEAVE_SCOPE(oldsave);
2882
5daac39c
NC
2883 if (RXp_PAREN_NAMES(prog))
2884 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 2885
0254aed9
DM
2886 RX_MATCH_UTF8_set(rx, utf8_target);
2887
d6a28714 2888 /* make sure $`, $&, $', and $digit will work later */
60165aa4 2889 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2890 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
2891 strbeg, reginfo->strend,
2892 sv, flags, utf8_target);
9041c2e3 2893
d6a28714
JH
2894 return 1;
2895
2896phooey:
a3621e74 2897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2898 PL_colors[4], PL_colors[5]));
8adc0f72 2899
bf2039a9
DM
2900 /* clean up; this will trigger destructors that will free all slabs
2901 * above the current one, and cleanup the regmatch_info_aux
2902 * and regmatch_info_aux_eval sructs */
8adc0f72 2903
006f26b2
DM
2904 LEAVE_SCOPE(oldsave);
2905
e9105d30 2906 if (swap) {
c74340f9 2907 /* we failed :-( roll it back */
495f47a5
DM
2908 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2909 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2910 PTR2UV(prog),
2911 PTR2UV(prog->offs),
2912 PTR2UV(swap)
2913 ));
e9105d30
GG
2914 Safefree(prog->offs);
2915 prog->offs = swap;
2916 }
d6a28714
JH
2917 return 0;
2918}
2919
6bda09f9 2920
b3d298be 2921/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b
DM
2922 * Do inc before dec, in case old and new rex are the same */
2923#define SET_reg_curpm(Re2) \
bf2039a9 2924 if (reginfo->info_aux_eval) { \
ec43f78b
DM
2925 (void)ReREFCNT_inc(Re2); \
2926 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2927 PM_SETRE((PL_reg_curpm), (Re2)); \
2928 }
2929
2930
d6a28714
JH
2931/*
2932 - regtry - try match at specific point
2933 */
2934STATIC I32 /* 0 failure, 1 success */
f73aaa43 2935S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2936{
97aff369 2937 dVAR;
d6a28714 2938 CHECKPOINT lastcp;
288b8c02 2939 REGEXP *const rx = reginfo->prog;
8d919b0a 2940 regexp *const prog = ReANY(rx);
f73aaa43 2941 I32 result;
f8fc2ecf 2942 RXi_GET_DECL(prog,progi);
a3621e74 2943 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2944
2945 PERL_ARGS_ASSERT_REGTRY;
2946
24b23f37 2947 reginfo->cutpoint=NULL;
d6a28714 2948
9d9163fb 2949 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 2950 prog->lastparen = 0;
03994de8 2951 prog->lastcloseparen = 0;
d6a28714
JH
2952
2953 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2954 to do this again and again, prog->lastparen should take care of
3dd2943c 2955 this! --ilya*/
dafc8851
JH
2956
2957 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2958 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2959 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2960 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2961 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2962 * Meanwhile, this code *is* needed for the
daf18116
JH
2963 * above-mentioned test suite tests to succeed. The common theme
2964 * on those tests seems to be returning null fields from matches.
225593e1 2965 * --jhi updated by dapm */
dafc8851 2966#if 1
d6a28714 2967 if (prog->nparens) {
b93070ed 2968 regexp_paren_pair *pp = prog->offs;
eb578fdb 2969 I32 i;
b93070ed 2970 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2971 ++pp;
2972 pp->start = -1;
2973 pp->end = -1;
d6a28714
JH
2974 }
2975 }
dafc8851 2976#endif
02db2b7b 2977 REGCP_SET(lastcp);
f73aaa43
DM
2978 result = regmatch(reginfo, *startposp, progi->program + 1);
2979 if (result != -1) {
2980 prog->offs[0].end = result;
d6a28714
JH
2981 return 1;
2982 }
24b23f37 2983 if (reginfo->cutpoint)
f73aaa43 2984 *startposp= reginfo->cutpoint;
02db2b7b 2985 REGCP_UNWIND(lastcp);
d6a28714
JH
2986 return 0;
2987}
2988
02db2b7b 2989
8ba1375e
MJD
2990#define sayYES goto yes
2991#define sayNO goto no
262b90c4 2992#define sayNO_SILENT goto no_silent
8ba1375e 2993
f9f4320a
YO
2994/* we dont use STMT_START/END here because it leads to
2995 "unreachable code" warnings, which are bogus, but distracting. */
2996#define CACHEsayNO \
c476f425 2997 if (ST.cache_mask) \
2ac8ff4b 2998 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2999 sayNO
3298f257 3000
a3621e74 3001/* this is used to determine how far from the left messages like
265c4333
YO
3002 'failed...' are printed. It should be set such that messages
3003 are inline with the regop output that created them.
a3621e74 3004*/
265c4333 3005#define REPORT_CODE_OFF 32
a3621e74
YO
3006
3007
40a82448
DM
3008#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3009#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3010#define CHRTEST_NOT_A_CP_1 -999
3011#define CHRTEST_NOT_A_CP_2 -998
9e137952 3012
5d9a96ca
DM
3013/* grab a new slab and return the first slot in it */
3014
3015STATIC regmatch_state *
3016S_push_slab(pTHX)
3017{
a35a87e7 3018#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3019 dMY_CXT;
3020#endif
5d9a96ca
DM
3021 regmatch_slab *s = PL_regmatch_slab->next;
3022 if (!s) {
3023 Newx(s, 1, regmatch_slab);
3024 s->prev = PL_regmatch_slab;
3025 s->next = NULL;
3026 PL_regmatch_slab->next = s;
3027 }
3028 PL_regmatch_slab = s;
86545054 3029 return SLAB_FIRST(s);
5d9a96ca 3030}
5b47454d 3031
95b24440 3032
40a82448
DM
3033/* push a new state then goto it */
3034
4d5016e5
DM
3035#define PUSH_STATE_GOTO(state, node, input) \
3036 pushinput = input; \
40a82448
DM
3037 scan = node; \
3038 st->resume_state = state; \
3039 goto push_state;
3040
3041/* push a new state with success backtracking, then goto it */
3042
4d5016e5
DM
3043#define PUSH_YES_STATE_GOTO(state, node, input) \
3044 pushinput = input; \
40a82448
DM
3045 scan = node; \
3046 st->resume_state = state; \
3047 goto push_yes_state;
3048
aa283a38 3049
aa283a38 3050
4d5016e5 3051
d6a28714 3052/*
95b24440 3053
bf1f174e
DM
3054regmatch() - main matching routine
3055
3056This is basically one big switch statement in a loop. We execute an op,
3057set 'next' to point the next op, and continue. If we come to a point which
3058we may need to backtrack to on failure such as (A|B|C), we push a
3059backtrack state onto the backtrack stack. On failure, we pop the top
3060state, and re-enter the loop at the state indicated. If there are no more
3061states to pop, we return failure.
3062
3063Sometimes we also need to backtrack on success; for example /A+/, where
3064after successfully matching one A, we need to go back and try to
3065match another one; similarly for lookahead assertions: if the assertion
3066completes successfully, we backtrack to the state just before the assertion
3067and then carry on. In these cases, the pushed state is marked as
3068'backtrack on success too'. This marking is in fact done by a chain of
3069pointers, each pointing to the previous 'yes' state. On success, we pop to
3070the nearest yes state, discarding any intermediate failure-only states.
3071Sometimes a yes state is pushed just to force some cleanup code to be
3072called at the end of a successful match or submatch; e.g. (??{$re}) uses
3073it to free the inner regex.
3074
3075Note that failure backtracking rewinds the cursor position, while
3076success backtracking leaves it alone.
3077
3078A pattern is complete when the END op is executed, while a subpattern
3079such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3080ops trigger the "pop to last yes state if any, otherwise return true"
3081behaviour.
3082
3083A common convention in this function is to use A and B to refer to the two
3084subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3085the subpattern to be matched possibly multiple times, while B is the entire
3086rest of the pattern. Variable and state names reflect this convention.
3087
3088The states in the main switch are the union of ops and failure/success of
3089substates associated with with that op. For example, IFMATCH is the op
3090that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3091'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3092successfully matched A and IFMATCH_A_fail is a state saying that we have
3093just failed to match A. Resume states always come in pairs. The backtrack
3094state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3095at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3096on success or failure.
3097
3098The struct that holds a backtracking state is actually a big union, with
3099one variant for each major type of op. The variable st points to the
3100top-most backtrack struct. To make the code clearer, within each
3101block of code we #define ST to alias the relevant union.
3102
3103Here's a concrete example of a (vastly oversimplified) IFMATCH
3104implementation:
3105
3106 switch (state) {
3107 ....
3108
3109#define ST st->u.ifmatch
3110
3111 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3112 ST.foo = ...; // some state we wish to save
95b24440 3113 ...
bf1f174e
DM
3114 // push a yes backtrack state with a resume value of
3115 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3116 // first node of A:
4d5016e5 3117 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3118 // NOTREACHED
3119
3120 case IFMATCH_A: // we have successfully executed A; now continue with B
3121 next = B;
3122 bar = ST.foo; // do something with the preserved value
3123 break;
3124
3125 case IFMATCH_A_fail: // A failed, so the assertion failed
3126 ...; // do some housekeeping, then ...
3127 sayNO; // propagate the failure
3128
3129#undef ST
95b24440 3130
bf1f174e
DM
3131 ...
3132 }
95b24440 3133
bf1f174e
DM
3134For any old-timers reading this who are familiar with the old recursive
3135approach, the code above is equivalent to:
95b24440 3136
bf1f174e
DM
3137 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3138 {
3139 int foo = ...
95b24440 3140 ...
bf1f174e
DM
3141 if (regmatch(A)) {
3142 next = B;
3143 bar = foo;
3144 break;
95b24440 3145 }
bf1f174e
DM
3146 ...; // do some housekeeping, then ...
3147 sayNO; // propagate the failure
95b24440 3148 }
bf1f174e
DM
3149
3150The topmost backtrack state, pointed to by st, is usually free. If you
3151want to claim it, populate any ST.foo fields in it with values you wish to
3152save, then do one of
3153
4d5016e5
DM
3154 PUSH_STATE_GOTO(resume_state, node, newinput);
3155 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3156
3157which sets that backtrack state's resume value to 'resume_state', pushes a
3158new free entry to the top of the backtrack stack, then goes to 'node'.
3159On backtracking, the free slot is popped, and the saved state becomes the
3160new free state. An ST.foo field in this new top state can be temporarily
3161accessed to retrieve values, but once the main loop is re-entered, it
3162becomes available for reuse.
3163
3164Note that the depth of the backtrack stack constantly increases during the
3165left-to-right execution of the pattern, rather than going up and down with
3166the pattern nesting. For example the stack is at its maximum at Z at the
3167end of the pattern, rather than at X in the following:
3168
3169 /(((X)+)+)+....(Y)+....Z/
3170
3171The only exceptions to this are lookahead/behind assertions and the cut,
3172(?>A), which pop all the backtrack states associated with A before
3173continuing.
3174
486ec47a 3175Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3176PL_regmatch_state and st always point to the currently active state,
3177and PL_regmatch_slab points to the slab currently containing
3178PL_regmatch_state. The first time regmatch() is called, the first slab is
3179allocated, and is never freed until interpreter destruction. When the slab
3180is full, a new one is allocated and chained to the end. At exit from
3181regmatch(), slabs allocated since entry are freed.
3182
3183*/
95b24440 3184
40a82448 3185
5bc10b2c 3186#define DEBUG_STATE_pp(pp) \
265c4333 3187 DEBUG_STATE_r({ \
f2ed9b32 3188 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3189 PerlIO_printf(Perl_debug_log, \
5d458dd8 3190 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3191 depth*2, "", \
13d6edb4 3192 PL_reg_name[st->resume_state], \
5d458dd8
YO
3193 ((st==yes_state||st==mark_state) ? "[" : ""), \
3194 ((st==yes_state) ? "Y" : ""), \
3195 ((st==mark_state) ? "M" : ""), \
3196 ((st==yes_state||st==mark_state) ? "]" : "") \
3197 ); \
265c4333 3198 });
5bc10b2c 3199
40a82448 3200
3dab1dad 3201#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3202
3df15adc 3203#ifdef DEBUGGING
5bc10b2c 3204
ab3bbdeb 3205STATIC void
f2ed9b32 3206S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3207 const char *start, const char *end, const char *blurb)
3208{
efd26800 3209 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3210
3211 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3212
ab3bbdeb
YO
3213 if (!PL_colorset)
3214 reginitcolors();
3215 {
3216 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3217 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3218
f2ed9b32 3219 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3220 start, end - start, 60);
3221
3222 PerlIO_printf(Perl_debug_log,
3223 "%s%s REx%s %s against %s\n",
3224 PL_colors[4], blurb, PL_colors[5], s0, s1);
3225
f2ed9b32 3226 if (utf8_target||utf8_pat)
1de06328
YO
3227 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3228 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3229 utf8_pat && utf8_target ? " and " : "",
3230 utf8_target ? "string" : ""
ab3bbdeb
YO
3231 );
3232 }
3233}
3df15adc
YO
3234
3235STATIC void
786e8c11
YO
3236S_dump_exec_pos(pTHX_ const char *locinput,
3237 const regnode *scan,
3238 const char *loc_regeol,
3239 const char *loc_bostr,
3240 const char *loc_reg_starttry,
f2ed9b32 3241 const bool utf8_target)
07be1b83 3242{
786e8c11 3243 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3244 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3245 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3246 /* The part of the string before starttry has one color
3247 (pref0_len chars), between starttry and current
3248 position another one (pref_len - pref0_len chars),
3249 after the current position the third one.
3250 We assume that pref0_len <= pref_len, otherwise we
3251 decrease pref0_len. */
786e8c11
YO
3252 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3253 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3254 int pref0_len;
3255
7918f24d
NC
3256 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3257
f2ed9b32 3258 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3259 pref_len++;
786e8c11
YO
3260 pref0_len = pref_len - (locinput - loc_reg_starttry);
3261 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3262 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3263 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3264 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3265 l--;
3266 if (pref0_len < 0)
3267 pref0_len = 0;
3268 if (pref0_len > pref_len)
3269 pref0_len = pref_len;
3270 {
f2ed9b32 3271 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3272
ab3bbdeb 3273 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3274 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3275
ab3bbdeb 3276 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3277 (locinput - pref_len + pref0_len),
1de06328 3278 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3279
ab3bbdeb 3280 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3281 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3282
1de06328 3283 const STRLEN tlen=len0+len1+len2;
3df15adc 3284 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3285 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3286 (IV)(locinput - loc_bostr),
07be1b83 3287 len0, s0,
07be1b83 3288 len1, s1,
07be1b83 3289 (docolor ? "" : "> <"),
07be1b83 3290 len2, s2,
f9f4320a 3291 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3292 "");
3293 }
3294}
3df15adc 3295
07be1b83
YO
3296#endif
3297
0a4db386
YO
3298/* reg_check_named_buff_matched()
3299 * Checks to see if a named buffer has matched. The data array of
3300 * buffer numbers corresponding to the buffer is expected to reside
3301 * in the regexp->data->data array in the slot stored in the ARG() of
3302 * node involved. Note that this routine doesn't actually care about the
3303 * name, that information is not preserved from compilation to execution.
3304 * Returns the index of the leftmost defined buffer with the given name
3305 * or 0 if non of the buffers matched.
3306 */
3307STATIC I32
7918f24d
NC
3308S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3309{
0a4db386 3310 I32 n;
f8fc2ecf 3311 RXi_GET_DECL(rex,rexi);
ad64d0ec 3312 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3313 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3314
3315 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3316
0a4db386 3317 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3318 if ((I32)rex->lastparen >= nums[n] &&
3319 rex->offs[nums[n]].end != -1)
0a4db386
YO
3320 {
3321 return nums[n];
3322 }
3323 }
3324 return 0;
3325}
3326
2f554ef7 3327
c74f6de9 3328static bool
984e6dd1 3329S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3330 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3331{
79a2a0e8
KW
3332 /* This function determines if there are one or two characters that match
3333 * the first character of the passed-in EXACTish node <text_node>, and if
3334 * so, returns them in the passed-in pointers.
c74f6de9 3335 *
79a2a0e8
KW
3336 * If it determines that no possible character in the target string can
3337 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3338 * the first character in <text_node> requires UTF-8 to represent, and the
3339 * target string isn't in UTF-8.)
c74f6de9 3340 *
79a2a0e8
KW
3341 * If there are more than two characters that could match the beginning of
3342 * <text_node>, or if more context is required to determine a match or not,
3343 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3344 *
3345 * The motiviation behind this function is to allow the caller to set up
3346 * tight loops for matching. If <text_node> is of type EXACT, there is
3347 * only one possible character that can match its first character, and so
3348 * the situation is quite simple. But things get much more complicated if
3349 * folding is involved. It may be that the first character of an EXACTFish
3350 * node doesn't participate in any possible fold, e.g., punctuation, so it
3351 * can be matched only by itself. The vast majority of characters that are
3352 * in folds match just two things, their lower and upper-case equivalents.
3353 * But not all are like that; some have multiple possible matches, or match
3354 * sequences of more than one character. This function sorts all that out.
3355 *
3356 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3357 * loop of trying to match A*, we know we can't exit where the thing
3358 * following it isn't a B. And something can't be a B unless it is the
3359 * beginning of B. By putting a quick test for that beginning in a tight
3360 * loop, we can rule out things that can't possibly be B without having to
3361 * break out of the loop, thus avoiding work. Similarly, if A is a single
3362 * character, we can make a tight loop matching A*, using the outputs of
3363 * this function.
3364 *
3365 * If the target string to match isn't in UTF-8, and there aren't
3366 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3367 * the one or two possible octets (which are characters in this situation)
3368 * that can match. In all cases, if there is only one character that can
3369 * match, *<c1p> and *<c2p> will be identical.
3370 *
3371 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3372 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3373 * can match the beginning of <text_node>. They should be declared with at
3374 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3375 * undefined what these contain.) If one or both of the buffers are
3376 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3377 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3378 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3379 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3380 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3381
ba44c216 3382 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3383
ddb0d839
KW
3384 UV c1 = CHRTEST_NOT_A_CP_1;
3385 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8 3386 bool use_chrtest_void = FALSE;
aed7b151 3387 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3388
3389 /* Used when we have both utf8 input and utf8 output, to avoid converting
3390 * to/from code points */
3391 bool utf8_has_been_setup = FALSE;
3392
c74f6de9
KW
3393 dVAR;
3394
b4291290 3395 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3396
79a2a0e8
KW
3397 if (OP(text_node) == EXACT) {
3398
3399 /* In an exact node, only one thing can be matched, that first
3400 * character. If both the pat and the target are UTF-8, we can just
3401 * copy the input to the output, avoiding finding the code point of
3402 * that character */
984e6dd1 3403 if (!is_utf8_pat) {
79a2a0e8
KW
3404 c2 = c1 = *pat;
3405 }
3406 else if (utf8_target) {
3407 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3408 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3409 utf8_has_been_setup = TRUE;
3410 }
3411 else {
3412 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3413 }
79a2a0e8
KW
3414 }
3415 else /* an EXACTFish node */
984e6dd1 3416 if ((is_utf8_pat
79a2a0e8
KW
3417 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3418 pat + STR_LEN(text_node)))
984e6dd1 3419 || (!is_utf8_pat
79a2a0e8
KW
3420 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3421 pat + STR_LEN(text_node))))
3422 {
3423 /* Multi-character folds require more context to sort out. Also
3424 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3425 * handled outside this routine */
3426 use_chrtest_void = TRUE;
3427 }
3428 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3429 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3430 if (c1 > 256) {
3431 /* Load the folds hash, if not already done */
3432 SV** listp;
3433 if (! PL_utf8_foldclosures) {
3434 if (! PL_utf8_tofold) {
3435 U8 dummy[UTF8_MAXBYTES+1];
3436
3437 /* Force loading this by folding an above-Latin1 char */
3438 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3439 assert(PL_utf8_tofold); /* Verify that worked */
3440 }
3441 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3442 }
3443
3444 /* The fold closures data structure is a hash with the keys being
3445 * the UTF-8 of every character that is folded to, like 'k', and
3446 * the values each an array of all code points that fold to its
3447 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3448 * not included */
3449 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3450 (char *) pat,
3451 UTF8SKIP(pat),
3452 FALSE))))
3453 {
3454 /* Not found in the hash, therefore there are no folds
3455 * containing it, so there is only a single character that
3456 * could match */
3457 c2 = c1;
3458 }
3459 else { /* Does participate in folds */
3460 AV* list = (AV*) *listp;
3461 if (av_len(list) != 1) {
3462
3463 /* If there aren't exactly two folds to this, it is outside
3464 * the scope of this function */
3465 use_chrtest_void = TRUE;
3466 }
3467 else { /* There are two. Get them */
3468 SV** c_p = av_fetch(list, 0, FALSE);
3469 if (c_p == NULL) {
3470 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3471 }
3472 c1 = SvUV(*c_p);
3473
3474 c_p = av_fetch(list, 1, FALSE);
3475 if (c_p == NULL) {
3476 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3477 }
3478 c2 = SvUV(*c_p);
3479
3480 /* Folds that cross the 255/256 boundary are forbidden if
3481 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3482 * pattern character is above 256, and its only other match
3483 * is below 256, the only legal match will be to itself.
3484 * We have thrown away the original, so have to compute
3485 * which is the one above 255 */
3486 if ((c1 < 256) != (c2 < 256)) {
3487 if (OP(text_node) == EXACTFL
3488 || (OP(text_node) == EXACTFA
3489 && (isASCII(c1) || isASCII(c2))))
3490 {
3491 if (c1 < 256) {
3492 c1 = c2;
3493 }
3494 else {
3495 c2 = c1;
3496 }
3497 }
3498 }
3499 }
3500 }
3501 }
3502 else /* Here, c1 is < 255 */
3503 if (utf8_target
3504 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3505 && OP(text_node) != EXACTFL
3506 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3507 {
3508 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3509 * folds to this character in the pattern. All such cases except
3510 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3511 * involved in their folds, so are outside the scope of this
3512 * function */
3513 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3514 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3515 }
3516 else {
3517 use_chrtest_void = TRUE;
3518 }
c74f6de9
KW
3519 }
3520 else { /* Here nothing above Latin1 can fold to the pattern character */
3521 switch (OP(text_node)) {
3522
3523 case EXACTFL: /* /l rules */
79a2a0e8 3524 c2 = PL_fold_locale[c1];
c74f6de9
KW
3525 break;
3526
3527 case EXACTF:
3528 if (! utf8_target) { /* /d rules */
79a2a0e8 3529 c2 = PL_fold[c1];
c74f6de9
KW
3530 break;
3531 }
3532 /* FALLTHROUGH */
3533 /* /u rules for all these. This happens to work for
79a2a0e8 3534 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3535 case EXACTFA:
3536 case EXACTFU_TRICKYFOLD:
79a2a0e8 3537 case EXACTFU_SS:
c74f6de9 3538 case EXACTFU:
79a2a0e8 3539 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3540 break;
3541
878531d3
KW
3542 default:
3543 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3544 assert(0); /* NOTREACHED */
c74f6de9
KW
3545 }
3546 }
3547 }
79a2a0e8
KW
3548
3549 /* Here have figured things out. Set up the returns */
3550 if (use_chrtest_void) {
3551 *c2p = *c1p = CHRTEST_VOID;
3552 }
3553 else if (utf8_target) {
3554 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3555 uvchr_to_utf8(c1_utf8, c1);
3556 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3557 }
c74f6de9 3558
79a2a0e8
KW
3559 /* Invariants are stored in both the utf8 and byte outputs; Use
3560 * negative numbers otherwise for the byte ones. Make sure that the
3561 * byte ones are the same iff the utf8 ones are the same */
3562 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3563 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3564 ? *c2_utf8
3565 : (c1 == c2)
3566 ? CHRTEST_NOT_A_CP_1
3567 : CHRTEST_NOT_A_CP_2;
3568 }
3569 else if (c1 > 255) {
3570 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3571 can represent */
3572 return FALSE;
3573 }
c74f6de9 3574
79a2a0e8
KW
3575 *c1p = *c2p = c2; /* c2 is the only representable value */
3576 }
3577 else { /* c1 is representable; see about c2 */
3578 *c1p = c1;
3579 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3580 }
2f554ef7 3581
c74f6de9
KW
3582 return TRUE;
3583}
2f554ef7 3584
f73aaa43
DM
3585/* returns -1 on failure, $+[0] on success */
3586STATIC I32
3587S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3588{
a35a87e7 3589#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3590 dMY_CXT;
3591#endif
27da23d5 3592 dVAR;
ba44c216 3593 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 3594 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3595 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3596 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3597 RXi_GET_DECL(rex,rexi);
5d9a96ca 3598 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3599 regmatch_state *st;
5d9a96ca 3600 /* cache heavy used fields of st in registers */
eb578fdb
KW
3601 regnode *scan;
3602 regnode *next;
3603 U32 n = 0; /* general value; init to avoid compiler warning */
3604 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3605 char *locinput = startpos;
4d5016e5 3606 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3607 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3608
b69b0499 3609 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3610 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3611 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3612 const U32 max_nochange_depth =
3613 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3614 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3615 regmatch_state *yes_state = NULL; /* state to pop to on success of
3616 subpattern */
e2e6a0f1
YO
3617 /* mark_state piggy backs on the yes_state logic so that when we unwind
3618 the stack on success we can update the mark_state as we go */
3619 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3620 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3621 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3622 U32 state_num;
5d458dd8
YO
3623 bool no_final = 0; /* prevent failure from backtracking? */
3624 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3625 char *startpoint = locinput;
5d458dd8
YO
3626 SV *popmark = NULL; /* are we looking for a mark? */
3627 SV *sv_commit = NULL; /* last mark name seen in failure */
3628 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3629 during a successful match */
5d458dd8
YO
3630 U32 lastopen = 0; /* last open we saw */
3631 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3632 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3633 /* these three flags are set by various ops to signal information to
3634 * the very next op. They have a useful lifetime of exactly one loop
3635 * iteration, and are not preserved or restored by state pushes/pops
3636 */
3637 bool sw = 0; /* the condition value in (?(cond)a|b) */
3638 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3639 int logical = 0; /* the following EVAL is:
3640 0: (?{...})
3641 1: (?(?{...})X|Y)
3642 2: (??{...})
3643 or the following IFMATCH/UNLESSM is:
3644 false: plain (?=foo)
3645 true: used as a condition: (?(?=foo))
3646 */
81ed78b2
DM
3647 PAD* last_pad = NULL;
3648 dMULTICALL;
3649 I32 gimme = G_SCALAR;
3650 CV *caller_cv = NULL; /* who called us */
3651 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3652 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3653 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3654 int to_complement; /* Invert the result? */
3655 _char_class_number classnum;
984e6dd1 3656 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3657
95b24440 3658#ifdef DEBUGGING
e68ec53f 3659 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3660#endif
3661
81ed78b2
DM
3662 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3663 multicall_oldcatch = 0;
3664 multicall_cv = NULL;
3665 cx = NULL;
4f8dbb2d
JL
3666 PERL_UNUSED_VAR(multicall_cop);
3667 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3668
3669
7918f24d
NC
3670 PERL_ARGS_ASSERT_REGMATCH;
3671
3b57cd43 3672 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3673 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3674 }));
5d9a96ca 3675
331b2dcc 3676 st = PL_regmatch_state;
5d9a96ca 3677
d6a28714 3678 /* Note that nextchr is a byte even in UTF */
7016d6eb 3679 SET_nextchr;
d6a28714
JH
3680 scan = prog;
3681 while (scan != NULL) {
8ba1375e 3682
a3621e74 3683 DEBUG_EXECUTE_r( {
6136c704 3684 SV * const prop = sv_newmortal();
1de06328 3685 regnode *rnext=regnext(scan);
f2ed9b32 3686 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3687 regprop(rex, prop, scan);
07be1b83
YO
3688
3689 PerlIO_printf(Perl_debug_log,
3690 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3691 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3692 SvPVX_const(prop),
1de06328 3693 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3694 0 : (IV)(rnext - rexi->program));
2a782b5b 3695 });
d6a28714
JH
3696
3697 next = scan + NEXT_OFF(scan);
3698 if (next == scan)
3699 next = NULL;
40a82448 3700 state_num = OP(scan);
d6a28714 3701
40a82448 3702 reenter_switch:
3018b823 3703 to_complement = 0;
34a81e2b 3704
7016d6eb 3705 SET_nextchr;
e6ca698c 3706 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3707
40a82448 3708 switch (state_num) {
3c0563b9 3709 case BOL: /* /^../ */
9d9163fb 3710 if (locinput == reginfo->strbeg)
b8c5462f 3711 break;
d6a28714 3712 sayNO;
3c0563b9
DM
3713
3714 case MBOL: /* /^../m */
9d9163fb 3715 if (locinput == reginfo->strbeg ||
7016d6eb 3716 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3717 {
b8c5462f
JH
3718 break;
3719 }
d6a28714 3720 sayNO;
3c0563b9
DM
3721
3722 case SBOL: /* /^../s */
9d9163fb 3723 if (locinput == reginfo->strbeg)
b8c5462f 3724 break;
d6a28714 3725 sayNO;
3c0563b9
DM
3726
3727 case GPOS: /* \G */
3b0527fe 3728 if (locinput == reginfo->ganch)
d6a28714
JH
3729 break;
3730 sayNO;
ee9b8eae 3731
3c0563b9 3732 case KEEPS: /* \K */
ee9b8eae 3733 /* update the startpoint */
b93070ed 3734 st->u.keeper.val = rex->offs[0].start;
9d9163fb 3735 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 3736 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3737 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3738 case KEEPS_next_fail:
3739 /* rollback the start point change */
b93070ed 3740 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3741 sayNO_SILENT;
878531d3 3742 assert(0); /*NOTREACHED*/
3c0563b9
DM
3743
3744 case EOL: /* /..$/ */
d6a28714 3745 goto seol;
3c0563b9
DM
3746
3747 case MEOL: /* /..$/m */
7016d6eb 3748 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3749 sayNO;
b8c5462f 3750 break;
3c0563b9
DM
3751
3752 case SEOL: /* /..$/s */
d6a28714 3753 seol:
7016d6eb 3754 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3755 sayNO;
220db18a 3756 if (reginfo->strend - locinput > 1)
b8c5462f 3757 sayNO;
b8c5462f 3758 break;
3c0563b9
DM
3759
3760 case EOS: /* \z */
7016d6eb 3761 if (!NEXTCHR_IS_EOS)
b8c5462f 3762 sayNO;
d6a28714 3763 break;
3c0563b9
DM
3764
3765 case SANY: /* /./s */
7016d6eb 3766 if (NEXTCHR_IS_EOS)
4633a7c4 3767 sayNO;
28b98f76 3768 goto increment_locinput;
3c0563b9
DM
3769
3770 case CANY: /* \C */
7016d6eb 3771 if (NEXTCHR_IS_EOS)
f33976b4 3772 sayNO;
3640db6b 3773 locinput++;
a0d0e21e 3774 break;
3c0563b9
DM
3775
3776 case REG_ANY: /* /./ */
7016d6eb 3777 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3778 sayNO;
28b98f76
DM
3779 goto increment_locinput;
3780
166ba7cd
DM
3781
3782#undef ST
3783#define ST st->u.trie
3c0563b9 3784 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3785 /* In this case the charclass data is available inline so
3786 we can fail fast without a lot of extra overhead.
3787 */
7016d6eb 3788 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3789 DEBUG_EXECUTE_r(
3790 PerlIO_printf(Perl_debug_log,
3791 "%*s %sfailed to match trie start class...%s\n",
3792 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3793 );
3794 sayNO_SILENT;
118e2215 3795 assert(0); /* NOTREACHED */
786e8c11
YO
3796 }
3797 /* FALL THROUGH */
3c0563b9 3798 case TRIE: /* (ab|cd) */
2e64971a
DM
3799 /* the basic plan of execution of the trie is:
3800 * At the beginning, run though all the states, and
3801 * find the longest-matching word. Also remember the position
3802 * of the shortest matching word. For example, this pattern:
3803 * 1 2 3 4 5
3804 * ab|a|x|abcd|abc
3805 * when matched against the string "abcde", will generate
3806 * accept states for all words except 3, with the longest
895cc420 3807 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3808 * the position being after char 1 of the string).
3809 *
3810 * Then for each matching word, in word order (i.e. 1,2,4,5),
3811 * we run the remainder of the pattern; on each try setting
3812 * the current position to the character following the word,
3813 * returning to try the next word on failure.
3814 *
3815 * We avoid having to build a list of words at runtime by
3816 * using a compile-time structure, wordinfo[].prev, which
3817 * gives, for each word, the previous accepting word (if any).
3818 * In the case above it would contain the mappings 1->2, 2->0,
3819 * 3->0, 4->5, 5->1. We can use this table to generate, from
3820 * the longest word (4 above), a list of all words, by
3821 * following the list of prev pointers; this gives us the
3822 * unordered list 4,5,1,2. Then given the current word we have
3823 * just tried, we can go through the list and find the
3824 * next-biggest word to try (so if we just failed on word 2,
3825 * the next in the list is 4).
3826 *
3827 * Since at runtime we don't record the matching position in
3828 * the string for each word, we have to work that out for
3829 * each word we're about to process. The wordinfo table holds
3830 * the character length of each word; given that we recorded
3831 * at the start: the position of the shortest word and its
3832 * length in chars, we just need to move the pointer the
3833 * difference between the two char lengths. Depending on
3834 * Unicode status and folding, that's cheap or expensive.
3835 *
3836 * This algorithm is optimised for the case where are only a
3837 * small number of accept states, i.e. 0,1, or maybe 2.
3838 * With lots of accepts states, and having to try all of them,
3839 * it becomes quadratic on number of accept states to find all
3840 * the next words.
3841 */
3842
3dab1dad 3843 {
07be1b83 3844 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3845 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3846
3847 /* what trie are we using right now */
be8e71aa 3848 reg_trie_data * const trie
f8fc2ecf 3849 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3850 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3851 U32 state = trie->startstate;
166ba7cd 3852
7016d6eb
DM
3853 if ( trie->bitmap
3854 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3855 {
3dab1dad
YO
3856 if (trie->states[ state ].wordnum) {
3857 DEBUG_EXECUTE_r(
3858 PerlIO_printf(Perl_debug_log,
3859 "%*s %smatched empty string...%s\n",
5bc10b2c 3860 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3861 );
20dbff7c
YO
3862 if (!trie->jump)
3863 break;
3dab1dad
YO
3864 } else {
3865 DEBUG_EXECUTE_r(
3866 PerlIO_printf(Perl_debug_log,
786e8c11 3867 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3868 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3869 );
3870 sayNO_SILENT;
3871 }
3872 }
166ba7cd 3873
786e8c11
YO
3874 {
3875 U8 *uc = ( U8* )locinput;
3876
3877 STRLEN len = 0;
3878 STRLEN foldlen = 0;
3879 U8 *uscan = (U8*)NULL;
786e8c11 3880 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3881 U32 charcount = 0; /* how many input chars we have matched */
3882 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3883
786e8c11 3884 ST.jump = trie->jump;
786e8c11 3885 ST.me = scan;
2e64971a
DM
3886 ST.firstpos = NULL;
3887 ST.longfold = FALSE; /* char longer if folded => it's harder */
3888 ST.nextword = 0;
3889
3890 /* fully traverse the TRIE; note the position of the
3891 shortest accept state and the wordnum of the longest
3892 accept state */
07be1b83 3893
220db18a 3894 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 3895 U32 base = trie->states[ state ].trans.base;
f9f4320a 3896 UV uvc = 0;
acb909b4 3897 U16 charid = 0;
2e64971a
DM
3898 U16 wordnum;
3899 wordnum = trie->states[ state ].wordnum;
3900
3901 if (wordnum) { /* it's an accept state */
3902 if (!accepted) {
3903 accepted = 1;
3904 /* record first match position */
3905 if (ST.longfold) {
3906 ST.firstpos = (U8*)locinput;
3907 ST.firstchars = 0;
5b47454d 3908 }
2e64971a
DM
3909 else {
3910 ST.firstpos = uc;
3911 ST.firstchars = charcount;
3912 }
3913 }
3914 if (!ST.nextword || wordnum < ST.nextword)
3915 ST.nextword = wordnum;
3916 ST.topword = wordnum;
786e8c11 3917 }
a3621e74 3918
07be1b83 3919 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3920 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3921 PerlIO_printf( Perl_debug_log,
2e64971a 3922 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3923 2+depth * 2, "", PL_colors[4],
2e64971a 3924 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3925 });
a3621e74 3926
2e64971a 3927 /* read a char and goto next state */
220db18a 3928 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 3929 I32 offset;
55eed653
NC
3930 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3931 uscan, len, uvc, charid, foldlen,
3932 foldbuf, uniflags);
2e64971a
DM
3933 charcount++;
3934 if (foldlen>0)
3935 ST.longfold = TRUE;
5b47454d 3936 if (charid &&
6dd2be57
DM
3937 ( ((offset =
3938 base + charid - 1 - trie->uniquecharcount)) >= 0)
3939
3940 && ((U32)offset < trie->lasttrans)
3941 && trie->trans[offset].check == state)
5b47454d 3942 {
6dd2be57 3943 state = trie->trans[offset].next;
5b47454d
DM
3944 }
3945 else {
3946 state = 0;
3947 }
3948 uc += len;
3949
3950 }
3951 else {
a3621e74
YO
3952 state = 0;
3953 }
3954 DEBUG_TRIE_EXECUTE_r(
e4584336 3955 PerlIO_printf( Perl_debug_log,
786e8c11 3956 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3957 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3958 );
3959 }
2e64971a 3960 if (!accepted)
a3621e74 3961 sayNO;
a3621e74 3962
2e64971a
DM
3963 /* calculate total number of accept states */
3964 {
3965 U16 w = ST.topword;
3966 accepted = 0;
3967 while (w) {
3968 w = trie->wordinfo[w].prev;
3969 accepted++;
3970 }
3971 ST.accepted = accepted;
3972 }
3973
166ba7cd
DM
3974 DEBUG_EXECUTE_r(
3975 PerlIO_printf( Perl_debug_log,
3976 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3977 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3978 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3979 );
2e64971a 3980 goto trie_first_try; /* jump into the fail handler */
786e8c11 3981 }}
118e2215 3982 assert(0); /* NOTREACHED */
2e64971a
DM
3983
3984 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3985 {
3986 U8 *uc;
fae667d5
YO
3987 if ( ST.jump) {
3988 REGCP_UNWIND(ST.cp);
a8d1f4b4 3989 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3990 }
2e64971a
DM
3991 if (!--ST.accepted) {
3992 DEBUG_EXECUTE_r({
3993 PerlIO_printf( Perl_debug_log,
3994 "%*s %sTRIE failed...%s\n",
3995 REPORT_CODE_OFF+depth*2, "",
3996 PL_colors[4],
3997 PL_colors[5] );
3998 });
3999 sayNO_SILENT;
4000 }
4001 {
4002 /* Find next-highest word to process. Note that this code
4003 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
4004 U16 min = 0;
4005 U16 word;
4006 U16 const nextword = ST.nextword;
4007 reg_trie_wordinfo * const wordinfo
2e64971a
DM
4008 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4009 for (word=ST.topword; word; word=wordinfo[word].prev) {
4010 if (word > nextword && (!min || word < min))
4011 min = word;
4012 }
4013 ST.nextword = min;
4014 }
4015
fae667d5 4016 trie_first_try:
5d458dd8
YO
4017 if (do_cutgroup) {
4018 do_cutgroup = 0;
4019 no_final = 0;
4020 }
fae667d5
YO
4021
4022 if ( ST.jump) {
b93070ed 4023 ST.lastparen = rex->lastparen;
f6033a9d 4024 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 4025 REGCP_SET(ST.cp);
2e64971a 4026 }
a3621e74 4027
2e64971a 4028 /* find start char of end of current word */
166ba7cd 4029 {
2e64971a 4030 U32 chars; /* how many chars to skip */
2e64971a
DM
4031 reg_trie_data * const trie
4032 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4033
4034 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035 >= ST.firstchars);
4036 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4037 - ST.firstchars;
a059a757 4038 uc = ST.firstpos;
2e64971a
DM
4039
4040 if (ST.longfold) {
4041 /* the hard option - fold each char in turn and find
4042 * its folded length (which may be different */
4043 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4044 STRLEN foldlen;
4045 STRLEN len;
d9a396a3 4046 UV uvc;
2e64971a
DM
4047 U8 *uscan;
4048
4049 while (chars) {
f2ed9b32 4050 if (utf8_target) {
2e64971a
DM
4051 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4052 uniflags);
4053 uc += len;
4054 }
4055 else {
4056 uvc = *uc;
4057 uc++;
4058 }
4059 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4060 uscan = foldbuf;
4061 while (foldlen) {
4062 if (!--chars)
4063 break;
4064 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4065 uniflags);
4066 uscan += len;
4067 foldlen -= len;
4068 }
4069 }
a3621e74 4070 }
2e64971a 4071 else {
f2ed9b32 4072 if (utf8_target)
2e64971a
DM
4073 while (chars--)
4074 uc += UTF8SKIP(uc);
4075 else
4076 uc += chars;
4077 }
2e64971a 4078 }
166ba7cd 4079
6603fe3e
DM
4080 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4081 ? ST.jump[ST.nextword]
4082 : NEXT_OFF(ST.me));
166ba7cd 4083
2e64971a
DM
4084 DEBUG_EXECUTE_r({
4085 PerlIO_printf( Perl_debug_log,
4086 "%*s %sTRIE matched word #%d, continuing%s\n",
4087 REPORT_CODE_OFF+depth*2, "",
4088 PL_colors[4],
4089 ST.nextword,
4090 PL_colors[5]
4091 );
4092 });
4093
4094 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4095 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4096 assert(0); /* NOTREACHED */
166ba7cd 4097 }
2e64971a
DM
4098 /* only one choice left - just continue */
4099 DEBUG_EXECUTE_r({
4100 AV *const trie_words
4101 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4102 SV ** const tmp = av_fetch( trie_words,
4103 ST.nextword-1, 0 );
4104 SV *sv= tmp ? sv_newmortal() : NULL;
4105
4106 PerlIO_printf( Perl_debug_log,
4107 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4108 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4109 ST.nextword,
4110 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4111 PL_colors[0], PL_colors[1],
c89df6cf 4112 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4113 )
4114 : "not compiled under -Dr",
4115 PL_colors[5] );
4116 });
4117
a059a757 4118 locinput = (char*)uc;
2e64971a 4119 continue; /* execute rest of RE */
118e2215 4120 assert(0); /* NOTREACHED */
a059a757 4121 }
166ba7cd
DM
4122#undef ST
4123
3c0563b9 4124 case EXACT: { /* /abc/ */
95b24440 4125 char *s = STRING(scan);
24d3c4a9 4126 ln = STR_LEN(scan);
984e6dd1 4127 if (utf8_target != is_utf8_pat) {
bc517b45 4128 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4129 char *l = locinput;
24d3c4a9 4130 const char * const e = s + ln;
a72c7584 4131
f2ed9b32 4132 if (utf8_target) {
e6a3850e
KW
4133 /* The target is utf8, the pattern is not utf8.
4134 * Above-Latin1 code points can't match the pattern;
4135 * invariants match exactly, and the other Latin1 ones need
4136 * to be downgraded to a single byte in order to do the
4137 * comparison. (If we could be confident that the target
4138 * is not malformed, this could be refactored to have fewer
4139 * tests by just assuming that if the first bytes match, it
4140 * is an invariant, but there are tests in the test suite
4141 * dealing with (??{...}) which violate this) */
1aa99e6b 4142 while (s < e) {
220db18a
DM
4143 if (l >= reginfo->strend
4144 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4145 {
e6a3850e
KW
4146 sayNO;
4147 }
4148 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4149 if (*l != *s) {
4150 sayNO;
4151 }
4152 l++;
4153 }
4154 else {
4155 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4156 sayNO;
4157 }
4158 l += 2;
4159 }
4160 s++;
1aa99e6b 4161 }
5ff6fc6d
JH
4162 }
4163 else {
4164 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4165 while (s < e) {
220db18a
DM
4166 if (l >= reginfo->strend
4167 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
4168 {
4169 sayNO;
4170 }
4171 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4172 if (*s != *l) {
4173 sayNO;
4174 }
4175 s++;
4176 }
4177 else {
4178 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4179 sayNO;
4180 }
4181 s += 2;
4182 }
4183 l++;
1aa99e6b 4184 }
5ff6fc6d 4185 }
1aa99e6b 4186 locinput = l;
1aa99e6b 4187 }
5ac65bff
KW
4188 else {
4189 /* The target and the pattern have the same utf8ness. */
4190 /* Inline the first character, for speed. */
220db18a 4191 if (reginfo->strend - locinput < ln
5ac65bff
KW
4192 || UCHARAT(s) != nextchr
4193 || (ln > 1 && memNE(s, locinput, ln)))
4194 {
4195 sayNO;
4196 }
4197 locinput += ln;
4198 }
d6a28714 4199 break;
95b24440 4200 }
7016d6eb 4201
3c0563b9 4202 case EXACTFL: { /* /abc/il */
a932d541 4203 re_fold_t folder;
9a5a5549
KW
4204 const U8 * fold_array;
4205 const char * s;
d513472c 4206 U32 fold_utf8_flags;
9a5a5549 4207
272d35c9 4208 RX_MATCH_TAINTED_on(reginfo->prog);
f67f9e53
KW
4209 folder = foldEQ_locale;
4210 fold_array = PL_fold_locale;
17580e7a 4211 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4212 goto do_exactf;
4213
3c0563b9
DM
4214 case EXACTFU_SS: /* /\x{df}/iu */
4215 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4216 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4217 folder = foldEQ_latin1;
4218 fold_array = PL_fold_latin1;
984e6dd1 4219 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4220 goto do_exactf;
4221
3c0563b9 4222 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4223 folder = foldEQ_latin1;
4224 fold_array = PL_fold_latin1;
57014d77 4225 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4226 goto do_exactf;
4227
3c0563b9 4228 case EXACTF: /* /abc/i */
9a5a5549
KW
4229 folder = foldEQ;
4230 fold_array = PL_fold;
62bf7766 4231 fold_utf8_flags = 0;
9a5a5549
KW
4232
4233 do_exactf:
4234 s = STRING(scan);
24d3c4a9 4235 ln = STR_LEN(scan);
d6a28714 4236
984e6dd1 4237 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
3c760661
KW
4238 /* Either target or the pattern are utf8, or has the issue where
4239 * the fold lengths may differ. */
be8e71aa 4240 const char * const l = locinput;
220db18a 4241 char *e = reginfo->strend;
bc517b45 4242
984e6dd1 4243 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4244 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4245 {
4246 sayNO;
5486206c 4247 }
d07ddd77 4248 locinput = e;
d07ddd77 4249 break;
a0ed51b3 4250 }
d6a28714 4251
0a138b74 4252 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4253 if (UCHARAT(s) != nextchr
4254 && !NEXTCHR_IS_EOS
4255 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4256 {
a0ed51b3 4257 sayNO;
9a5a5549 4258 }
220db18a 4259 if (reginfo->strend - locinput < ln)
b8c5462f 4260 sayNO;
9a5a5549 4261 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4262 sayNO;
24d3c4a9 4263 locinput += ln;
a0d0e21e 4264 break;
9a5a5549 4265 }
63ac0dad
KW
4266
4267 /* XXX Could improve efficiency by separating these all out using a
4268 * macro or in-line function. At that point regcomp.c would no longer
4269 * have to set the FLAGS fields of these */
3c0563b9
DM
4270 case BOUNDL: /* /\b/l */
4271 case NBOUNDL: /* /\B/l */
272d35c9 4272 RX_MATCH_TAINTED_on(reginfo->prog);
b2680017 4273 /* FALL THROUGH */
3c0563b9
DM
4274 case BOUND: /* /\b/ */
4275 case BOUNDU: /* /\b/u */
4276 case BOUNDA: /* /\b/a */
4277 case NBOUND: /* /\B/ */
4278 case NBOUNDU: /* /\B/u */
4279 case NBOUNDA: /* /\B/a */
b2680017 4280 /* was last char in word? */
f2e96b5d
KW
4281 if (utf8_target
4282 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4283 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4284 {
9d9163fb 4285 if (locinput == reginfo->strbeg)
b2680017
YO
4286 ln = '\n';
4287 else {
9d9163fb
DM
4288 const U8 * const r =
4289 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
b2680017
YO
4290
4291 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4292 }
63ac0dad 4293 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4294 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4295 if (NEXTCHR_IS_EOS)
4296 n = 0;
4297 else {
4298 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4299 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4300 utf8_target);
4301 }
b2680017
YO
4302 }
4303 else {
0eb30aeb
KW
4304 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4305 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4306 }
4307 }
4308 else {
cfaf538b
KW
4309
4310 /* Here the string isn't utf8, or is utf8 and only ascii
4311 * characters are to match \w. In the latter case looking at
4312 * the byte just prior to the current one may be just the final
4313 * byte of a multi-byte character. This is ok. There are two
4314 * cases:
4315 * 1) it is a single byte character, and then the test is doing
4316 * just what it's supposed to.
4317 * 2) it is a multi-byte character, in which case the final
4318 * byte is never mistakable for ASCII, and so the test
4319 * will say it is not a word character, which is the
4320 * correct answer. */
9d9163fb 4321 ln = (locinput != reginfo->strbeg) ?
b2680017 4322 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4323 switch (FLAGS(scan)) {
4324 case REGEX_UNICODE_CHARSET:
4325 ln = isWORDCHAR_L1(ln);
7016d6eb 4326 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4327 break;
4328 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4329 ln = isWORDCHAR_LC(ln);
4330 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4331 break;
4332 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4333 ln = isWORDCHAR(ln);
4334 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4335 break;
cfaf538b 4336 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4337 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4338 ln = isWORDCHAR_A(ln);
7016d6eb 4339 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4340 break;
63ac0dad
KW
4341 default:
4342 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4343 break;
b2680017
YO
4344 }
4345 }
63ac0dad
KW
4346 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4347 * regcomp.sym */
4348 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4349 sayNO;
4350 break;
3c0563b9 4351
3c0563b9 4352 case ANYOF: /* /[abc]/ */
954a2af6 4353 case ANYOF_WARN_SUPER:
7016d6eb
DM
4354 if (NEXTCHR_IS_EOS)
4355 sayNO;
e0193e47 4356 if (utf8_target) {
635cd5d4 4357 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4358 sayNO;
635cd5d4 4359 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4360 }
4361 else {
20ed0b26 4362 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4363 sayNO;
3640db6b 4364 locinput++;
e0f9d4a8 4365 }
b8c5462f 4366 break;
3c0563b9 4367
3018b823
KW
4368 /* The argument (FLAGS) to all the POSIX node types is the class number
4369 * */
ee9a90b8 4370
3018b823
KW
4371 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4372 to_complement = 1;
4373 /* FALLTHROUGH */
4374
4375 case POSIXL: /* \w or [:punct:] etc. under /l */
4376 if (NEXTCHR_IS_EOS)
bedac28b 4377 sayNO;
bedac28b 4378
3018b823
KW
4379 /* The locale hasn't influenced the outcome before this, so defer
4380 * tainting until now */
272d35c9 4381 RX_MATCH_TAINTED_on(reginfo->prog);
3018b823
KW
4382
4383 /* Use isFOO_lc() for characters within Latin1. (Note that
4384 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4385 * wouldn't be invariant) */
4386 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4387 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4388 sayNO;
4389 }
4390 }
3018b823
KW
4391 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4392 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
eb4e9c04 4393 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
3018b823
KW
4394 *(locinput + 1))))))
4395 {
bedac28b 4396 sayNO;
3018b823 4397 }
bedac28b 4398 }
3018b823
KW
4399 else { /* Here, must be an above Latin-1 code point */
4400 goto utf8_posix_not_eos;
bedac28b 4401 }
3018b823
KW
4402
4403 /* Here, must be utf8 */
4404 locinput += UTF8SKIP(locinput);
bedac28b
KW
4405 break;
4406
3018b823
KW
4407 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4408 to_complement = 1;
4409 /* FALLTHROUGH */
4410
4411 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4412 if (utf8_target) {
3018b823 4413 goto utf8_posix;
bedac28b 4414 }
3018b823
KW
4415 goto posixa;
4416
4417 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4418
3018b823 4419 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4420 sayNO;
4421 }
bedac28b 4422
3018b823
KW
4423 /* All UTF-8 variants match */
4424 if (! UTF8_IS_INVARIANT(nextchr)) {
4425 goto increment_locinput;
bedac28b 4426 }
ee9a90b8 4427
3018b823
KW
4428 to_complement = 1;
4429 /* FALLTHROUGH */
4430
4431 case POSIXA: /* \w or [:punct:] etc. under /a */
4432
4433 posixa:
4434 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4435 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4436 * character is a single byte */
20d0b1e9 4437
3018b823
KW
4438 if (NEXTCHR_IS_EOS
4439 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4440 FLAGS(scan)))))
4441 {
0658cdde
KW
4442 sayNO;
4443 }
3018b823
KW
4444
4445 /* Here we are either not in utf8, or we matched a utf8-invariant,
4446 * so the next char is the next byte */
3640db6b 4447 locinput++;
0658cdde 4448 break;
3c0563b9 4449
3018b823
KW
4450 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4451 to_complement = 1;
4452 /* FALLTHROUGH */
4453
4454 case POSIXU: /* \w or [:punct:] etc. under /u */
4455 utf8_posix:
4456 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4457 sayNO;
4458 }
3018b823
KW
4459 utf8_posix_not_eos:
4460
4461 /* Use _generic_isCC() for characters within Latin1. (Note that
4462 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4463 * wouldn't be invariant) */
4464 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4465 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4466 FLAGS(scan)))))
4467 {
4468 sayNO;
4469 }
4470 locinput++;
4471 }
4472 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4473 if (! (to_complement
4474 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4475 *(locinput + 1)),
4476 FLAGS(scan)))))
4477 {
4478 sayNO;
4479 }
4480 locinput += 2;
4481 }
4482 else { /* Handle above Latin-1 code points */
4483 classnum = (_char_class_number) FLAGS(scan);
4484 if (classnum < _FIRST_NON_SWASH_CC) {
4485
4486 /* Here, uses a swash to find such code points. Load if if
4487 * not done already */
4488 if (! PL_utf8_swash_ptrs[classnum]) {
4489 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4490 PL_utf8_swash_ptrs[classnum]
4491 = _core_swash_init("utf8",
4492 swash_property_names[classnum],
4493 &PL_sv_undef, 1, 0, NULL, &flags);
4494 }
4495 if (! (to_complement
4496 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4497 (U8 *) locinput, TRUE))))
4498 {
4499 sayNO;
4500 }
4501 }
4502 else { /* Here, uses macros to find above Latin-1 code points */
4503 switch (classnum) {
4504 case _CC_ENUM_SPACE: /* XXX would require separate
4505 code if we revert the change
4506 of \v matching this */
4507 case _CC_ENUM_PSXSPC:
4508 if (! (to_complement
4509 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4510 {
4511 sayNO;
4512 }
4513 break;
4514 case _CC_ENUM_BLANK:
4515 if (! (to_complement
4516 ^ cBOOL(is_HORIZWS_high(locinput))))
4517 {
4518 sayNO;
4519 }
4520 break;
4521 case _CC_ENUM_XDIGIT:
4522 if (! (to_complement
4523 ^ cBOOL(is_XDIGIT_high(locinput))))
4524 {
4525 sayNO;
4526 }
4527 break;
4528 case _CC_ENUM_VERTSPACE:
4529 if (! (to_complement
4530 ^ cBOOL(is_VERTWS_high(locinput))))
4531 {
4532 sayNO;
4533 }
4534 break;
4535 default: /* The rest, e.g. [:cntrl:], can't match
4536 above Latin1 */
4537 if (! to_complement) {
4538 sayNO;
4539 }
4540 break;
4541 }
4542 }
4543 locinput += UTF8SKIP(locinput);
4544 }
4545 break;
0658cdde 4546
37e2e78e
KW
4547 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4548 a Unicode extended Grapheme Cluster */
4549 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4550 extended Grapheme Cluster is:
4551
7aee35ff
KW
4552 CR LF
4553 | Prepend* Begin Extend*
4554 | .
37e2e78e 4555
7aee35ff
KW
4556 Begin is: ( Special_Begin | ! Control )
4557 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4558 Extend is: ( Grapheme_Extend | Spacing_Mark )
4559 Control is: [ GCB_Control | CR | LF ]
4560 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4561
27d4fc33
KW
4562 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4563 we can rewrite
4564
4565 Begin is ( Regular_Begin + Special Begin )
4566
4567 It turns out that 98.4% of all Unicode code points match
4568 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4569 the previous implementation for almost all Unicode code points.
27d4fc33 4570
37e2e78e
KW
4571 There is a subtlety with Prepend* which showed up in testing.
4572 Note that the Begin, and only the Begin is required in:
4573 | Prepend* Begin Extend*
cc3b396d
KW
4574 Also, Begin contains '! Control'. A Prepend must be a
4575 '! Control', which means it must also be a Begin. What it
4576 comes down to is that if we match Prepend* and then find no
4577 suitable Begin afterwards, that if we backtrack the last
4578 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4579 */
4580
7016d6eb 4581 if (NEXTCHR_IS_EOS)
a0ed51b3 4582 sayNO;
f2ed9b32 4583 if (! utf8_target) {
37e2e78e
KW
4584
4585 /* Match either CR LF or '.', as all the other possibilities
4586 * require utf8 */
4587 locinput++; /* Match the . or CR */
cc3b396d
KW
4588 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4589 match the LF */
220db18a 4590 && locinput < reginfo->strend
e699a1d5
KW
4591 && UCHARAT(locinput) == '\n')
4592 {
4593 locinput++;
4594 }
37e2e78e
KW
4595 }
4596 else {
4597
4598 /* Utf8: See if is ( CR LF ); already know that locinput <
220db18a
DM
4599 * reginfo->strend, so locinput+1 is in bounds */
4600 if ( nextchr == '\r' && locinput+1 < reginfo->strend
f67f9e53 4601 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4602 {
37e2e78e
KW
4603 locinput += 2;
4604 }
4605 else {
45fdf108
KW
4606 STRLEN len;
4607
37e2e78e
KW
4608 /* In case have to backtrack to beginning, then match '.' */
4609 char *starting = locinput;
4610
4611 /* In case have to backtrack the last prepend */
e699a1d5 4612 char *previous_prepend = NULL;
37e2e78e
KW
4613
4614 LOAD_UTF8_CHARCLASS_GCB();
4615
45fdf108 4616 /* Match (prepend)* */
220db18a 4617 while (locinput < reginfo->strend
45fdf108
KW
4618 && (len = is_GCB_Prepend_utf8(locinput)))
4619 {
4620 previous_prepend = locinput;
4621 locinput += len;
a1853d78 4622 }
37e2e78e
KW
4623
4624 /* As noted above, if we matched a prepend character, but
4625 * the next thing won't match, back off the last prepend we
4626 * matched, as it is guaranteed to match the begin */
4627 if (previous_prepend
220db18a 4628 && (locinput >= reginfo->strend
c101f46d
KW
4629 || (! swash_fetch(PL_utf8_X_regular_begin,
4630 (U8*)locinput, utf8_target)
bff53399 4631 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4632 )
37e2e78e
KW
4633 {
4634 locinput = previous_prepend;
4635 }
4636
220db18a 4637 /* Note that here we know reginfo->strend > locinput, as we
37e2e78e
KW
4638 * tested that upon input to this switch case, and if we
4639 * moved locinput forward, we tested the result just above
4640 * and it either passed, or we backed off so that it will
4641 * now pass */
11dfcd49
KW
4642 if (swash_fetch(PL_utf8_X_regular_begin,
4643 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4644 locinput += UTF8SKIP(locinput);
4645 }
bff53399 4646 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4647
4648 /* Here did not match the required 'Begin' in the
4649 * second term. So just match the very first
4650 * character, the '.' of the final term of the regex */
4651 locinput = starting + UTF8SKIP(starting);
27d4fc33 4652 goto exit_utf8;
37e2e78e
KW
4653 } else {
4654
11dfcd49
KW
4655 /* Here is a special begin. It can be composed of
4656 * several individual characters. One possibility is
4657 * RI+ */
45fdf108
KW
4658 if ((len = is_GCB_RI_utf8(locinput))) {
4659 locinput += len;
220db18a 4660 while (locinput < reginfo->strend
45fdf108 4661 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4662 {
45fdf108 4663 locinput += len;
11dfcd49 4664 }
45fdf108
KW
4665 } else if ((len = is_GCB_T_utf8(locinput))) {
4666 /* Another possibility is T+ */
4667 locinput += len;
220db18a 4668 while (locinput < reginfo->strend
45fdf108 4669 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4670 {
45fdf108 4671 locinput += len;
11dfcd49
KW
4672 }
4673 } else {
4674
4675 /* Here, neither RI+ nor T+; must be some other
4676 * Hangul. That means it is one of the others: L,
4677 * LV, LVT or V, and matches:
4678 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4679
4680 /* Match L* */
220db18a 4681 while (locinput < reginfo->strend
45fdf108 4682 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4683 {
45fdf108 4684 locinput += len;
11dfcd49 4685 }
37e2e78e 4686
11dfcd49
KW
4687 /* Here, have exhausted L*. If the next character
4688 * is not an LV, LVT nor V, it means we had to have
4689 * at least one L, so matches L+ in the original
4690 * equation, we have a complete hangul syllable.
4691 * Are done. */
4692
220db18a 4693 if (locinput < reginfo->strend
45fdf108 4694 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4695 {
11dfcd49 4696 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4697 * See if LVT, by first ruling out V, then LV */
4698 if (! is_GCB_V_utf8(locinput)
4699 /* All but every TCount one is LV */
4700 && (valid_utf8_to_uvchr((U8 *) locinput,
4701 NULL)
4702 - SBASE)
4703 % TCount != 0)
4704 {
11dfcd49
KW
4705 locinput += UTF8SKIP(locinput);
4706 } else {
4707
4708 /* Must be V or LV. Take it, then match
4709 * V* */
4710 locinput += UTF8SKIP(locinput);
220db18a 4711 while (locinput < reginfo->strend
45fdf108 4712 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4713 {
45fdf108 4714 locinput += len;
11dfcd49
KW
4715 }
4716 }
37e2e78e 4717
11dfcd49 4718 /* And any of LV, LVT, or V can be followed
45fdf108 4719 * by T* */
220db18a 4720 while (locinput < reginfo->strend
45fdf108 4721 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4722 {
45fdf108 4723 locinput += len;
11dfcd49
KW
4724 }
4725 }
cd94d768 4726 }
11dfcd49 4727 }
37e2e78e 4728
11dfcd49 4729 /* Match any extender */
220db18a 4730 while (locinput < reginfo->strend
11dfcd49
KW
4731 && swash_fetch(PL_utf8_X_extend,
4732 (U8*)locinput, utf8_target))
4733 {
4734 locinput += UTF8SKIP(locinput);
4735 }
37e2e78e 4736 }
27d4fc33 4737 exit_utf8:
220db18a 4738 if (locinput > reginfo->strend) sayNO;
37e2e78e 4739 }
a0ed51b3 4740 break;
81714fb9 4741
3c0563b9 4742 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4743 { /* The capture buffer cases. The ones beginning with N for the
4744 named buffers just convert to the equivalent numbered and
4745 pretend they were called as the corresponding numbered buffer
4746 op. */
26ecd678
TC
4747 /* don't initialize these in the declaration, it makes C++
4748 unhappy */
9d9163fb 4749 const char *s;
ff1157ca 4750 char type;
8368298a
TC
4751 re_fold_t folder;
4752 const U8 *fold_array;
26ecd678 4753 UV utf8_fold_flags;
8368298a 4754
272d35c9 4755 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4756 folder = foldEQ_locale;
4757 fold_array = PL_fold_locale;
4758 type = REFFL;
17580e7a 4759 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4760 goto do_nref;
4761
3c0563b9 4762 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4763 folder = foldEQ_latin1;
4764 fold_array = PL_fold_latin1;
4765 type = REFFA;
4766 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4767 goto do_nref;
4768
3c0563b9 4769 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4770 folder = foldEQ_latin1;
4771 fold_array = PL_fold_latin1;
4772 type = REFFU;
d513472c 4773 utf8_fold_flags = 0;
d7ef4b73
KW
4774 goto do_nref;
4775
3c0563b9 4776 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4777 folder = foldEQ;
4778 fold_array = PL_fold;
4779 type = REFF;
d513472c 4780 utf8_fold_flags = 0;
d7ef4b73
KW
4781 goto do_nref;
4782
3c0563b9 4783 case NREF: /* /\g{name}/ */
d7ef4b73 4784 type = REF;
83d7b90b
KW
4785 folder = NULL;
4786 fold_array = NULL;
d513472c 4787 utf8_fold_flags = 0;
d7ef4b73
KW
4788 do_nref:
4789
4790 /* For the named back references, find the corresponding buffer
4791 * number */
0a4db386
YO
4792 n = reg_check_named_buff_matched(rex,scan);
4793
d7ef4b73 4794 if ( ! n ) {
81714fb9 4795 sayNO;
d7ef4b73
KW
4796 }
4797 goto do_nref_ref_common;
4798
3c0563b9 4799 case REFFL: /* /\1/il */
272d35c9 4800 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4801 folder = foldEQ_locale;
4802 fold_array = PL_fold_locale;
17580e7a 4803 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4804 goto do_ref;
4805
3c0563b9 4806 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4807 folder = foldEQ_latin1;
4808 fold_array = PL_fold_latin1;
4809 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4810 goto do_ref;
4811
3c0563b9 4812 case REFFU: /* /\1/iu */
d7ef4b73
KW
4813 folder = foldEQ_latin1;
4814 fold_array = PL_fold_latin1;
d513472c 4815 utf8_fold_flags = 0;
d7ef4b73
KW
4816 goto do_ref;
4817
3c0563b9 4818 case REFF: /* /\1/i */
d7ef4b73
KW
4819 folder = foldEQ;
4820 fold_array = PL_fold;
d513472c 4821 utf8_fold_flags = 0;
83d7b90b 4822 goto do_ref;
d7ef4b73 4823
3c0563b9 4824 case REF: /* /\1/ */
83d7b90b
KW
4825 folder = NULL;
4826 fold_array = NULL;
d513472c 4827 utf8_fold_flags = 0;
83d7b90b 4828
d7ef4b73 4829 do_ref:
81714fb9 4830 type = OP(scan);
d7ef4b73
KW
4831 n = ARG(scan); /* which paren pair */
4832
4833 do_nref_ref_common:
b93070ed 4834 ln = rex->offs[n].start;
1cb48e53 4835 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
b93070ed 4836 if (rex->lastparen < n || ln == -1)
af3f8c16 4837 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4838 if (ln == rex->offs[n].end)
a0d0e21e 4839 break;
a0ed51b3 4840
9d9163fb 4841 s = reginfo->strbeg + ln;
d7ef4b73 4842 if (type != REF /* REF can do byte comparison */
2f65c56d 4843 && (utf8_target || type == REFFU))
d7ef4b73 4844 { /* XXX handle REFFL better */
220db18a 4845 char * limit = reginfo->strend;
d7ef4b73
KW
4846
4847 /* This call case insensitively compares the entire buffer
4848 * at s, with the current input starting at locinput, but
220db18a
DM
4849 * not going off the end given by reginfo->strend, and
4850 * returns in <limit> upon success, how much of the
4851 * current input was matched */
b93070ed 4852 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4853 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4854 {
4855 sayNO;
a0ed51b3 4856 }
d7ef4b73 4857 locinput = limit;
a0ed51b3
LW
4858 break;
4859 }
4860
d7ef4b73 4861 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4862 if (!NEXTCHR_IS_EOS &&
4863 UCHARAT(s) != nextchr &&
81714fb9 4864 (type == REF ||
d7ef4b73 4865 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4866 sayNO;
b93070ed 4867 ln = rex->offs[n].end - ln;
220db18a 4868 if (locinput + ln > reginfo->strend)
4633a7c4 4869 sayNO;
81714fb9 4870 if (ln > 1 && (type == REF
24d3c4a9 4871 ? memNE(s, locinput, ln)
d7ef4b73 4872 : ! folder(s, locinput, ln)))
4633a7c4 4873 sayNO;
24d3c4a9 4874 locinput += ln;
a0d0e21e 4875 break;
81714fb9 4876 }
3c0563b9
DM
4877
4878 case NOTHING: /* null op; e.g. the 'nothing' following
4879 * the '*' in m{(a+|b)*}' */
4880 break;
4881 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4882 break;
3c0563b9
DM
4883
4884 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4885 break;
40a82448
DM
4886
4887#undef ST
4888#define ST st->u.eval
c277df42 4889 {
c277df42 4890 SV *ret;
d2f13c59 4891 REGEXP *re_sv;
6bda09f9 4892 regexp *re;
f8fc2ecf 4893 regexp_internal *rei;
1a147d38
YO
4894 regnode *startpoint;
4895
3c0563b9 4896 case GOSTART: /* (?R) */
e7707071
YO
4897 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4898 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4899 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4900 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4901 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4902 Perl_croak(aTHX_
4903 "Pattern subroutine nesting without pos change"
4904 " exceeded limit in regex");
6bda09f9
YO
4905 } else {
4906 nochange_depth = 0;
1a147d38 4907 }
288b8c02 4908 re_sv = rex_sv;
6bda09f9 4909 re = rex;
f8fc2ecf 4910 rei = rexi;
1a147d38 4911 if (OP(scan)==GOSUB) {
6bda09f9
YO
4912 startpoint = scan + ARG2L(scan);
4913 ST.close_paren = ARG(scan);
4914 } else {
f8fc2ecf 4915 startpoint = rei->program+1;
6bda09f9
YO
4916 ST.close_paren = 0;
4917 }
4918 goto eval_recurse_doit;
118e2215 4919 assert(0); /* NOTREACHED */
3c0563b9 4920
6bda09f9
YO
4921 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4922 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4923 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4924 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4925 } else {
4926 nochange_depth = 0;
4927 }
8e5e9ebe 4928 {
4aabdb9b 4929 /* execute the code in the {...} */
81ed78b2 4930
4aabdb9b 4931 dSP;
a6dc34f1 4932 IV before;
1f4d1a1e 4933 OP * const oop = PL_op;
4aabdb9b 4934 COP * const ocurcop = PL_curcop;
81ed78b2 4935 OP *nop;
81ed78b2 4936 CV *newcv;
91332126 4937
74088413 4938 /* save *all* paren positions */
92da3157 4939 regcppush(rex, 0, maxopenparen);
74088413
DM
4940 REGCP_SET(runops_cp);
4941
81ed78b2
DM
4942 if (!caller_cv)
4943 caller_cv = find_runcv(NULL);
4944
4aabdb9b 4945 n = ARG(scan);
81ed78b2 4946
b30fcab9 4947 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4948 newcv = (ReANY(
b30fcab9
DM
4949 (REGEXP*)(rexi->data->data[n])
4950 ))->qr_anoncv
81ed78b2
DM
4951 ;
4952 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4953 }
4954 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4955 newcv = caller_cv;
4956 nop = (OP*)rexi->data->data[n];
4957 assert(CvDEPTH(newcv));
68e2671b
DM
4958 }
4959 else {
d24ca0c5
DM
4960 /* literal with own CV */
4961 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4962 newcv = rex->qr_anoncv;
4963 nop = (OP*)rexi->data->data[n];
68e2671b 4964 }
81ed78b2 4965
0e458318
DM
4966 /* normally if we're about to execute code from the same
4967 * CV that we used previously, we just use the existing
4968 * CX stack entry. However, its possible that in the
4969 * meantime we may have backtracked, popped from the save
4970 * stack, and undone the SAVECOMPPAD(s) associated with
4971 * PUSH_MULTICALL; in which case PL_comppad no longer
4972 * points to newcv's pad. */
4973 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4974 {
b0065247
DM
4975 U8 flags = (CXp_SUB_RE |
4976 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 4977 if (last_pushed_cv) {
b0065247 4978 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
4979 }
4980 else {
b0065247 4981 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
4982 }
4983 last_pushed_cv = newcv;
4984 }
c31ee3bb
DM
4985 else {
4986 /* these assignments are just to silence compiler
4987 * warnings */
4988 multicall_cop = NULL;
4989 newsp = NULL;
4990 }
0e458318
DM
4991 last_pad = PL_comppad;
4992
2e2e3f36
DM
4993 /* the initial nextstate you would normally execute
4994 * at the start of an eval (which would cause error
4995 * messages to come from the eval), may be optimised
4996 * away from the execution path in the regex code blocks;
4997 * so manually set PL_curcop to it initially */
4998 {
81ed78b2 4999 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
5000 assert(o->op_type == OP_NULL);
5001 if (o->op_targ == OP_SCOPE) {
5002 o = cUNOPo->op_first;
5003 }
5004 else {
5005 assert(o->op_targ == OP_LEAVE);
5006 o = cUNOPo->op_first;
5007 assert(o->op_type == OP_ENTER);
5008 o = o->op_sibling;
5009 }
5010
5011 if (o->op_type != OP_STUB) {
5012 assert( o->op_type == OP_NEXTSTATE
5013 || o->op_type == OP_DBSTATE
5014 || (o->op_type == OP_NULL
5015 && ( o->op_targ == OP_NEXTSTATE
5016 || o->op_targ == OP_DBSTATE
5017 )
5018 )
5019 );
5020 PL_curcop = (COP*)o;
5021 }
5022 }
81ed78b2 5023 nop = nop->op_next;
2e2e3f36 5024
24b23f37 5025 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
5026 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5027
8adc0f72 5028 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9
DM
5029 if (reginfo->info_aux_eval->pos_magic)
5030 reginfo->info_aux_eval->pos_magic->mg_len
8adc0f72 5031 = locinput - reginfo->strbeg;
4aabdb9b 5032
2bf803e2
YO
5033 if (sv_yes_mark) {
5034 SV *sv_mrk = get_sv("REGMARK", 1);
5035 sv_setsv(sv_mrk, sv_yes_mark);
5036 }
5037
81ed78b2
DM
5038 /* we don't use MULTICALL here as we want to call the
5039 * first op of the block of interest, rather than the
5040 * first op of the sub */
a6dc34f1 5041 before = (IV)(SP-PL_stack_base);
81ed78b2 5042 PL_op = nop;
8e5e9ebe
RGS
5043 CALLRUNOPS(aTHX); /* Scalar context. */
5044 SPAGAIN;
a6dc34f1 5045 if ((IV)(SP-PL_stack_base) == before)
075aa684 5046 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
5047 else {
5048 ret = POPs;
5049 PUTBACK;
5050 }
4aabdb9b 5051
e4bfbed3
DM
5052 /* before restoring everything, evaluate the returned
5053 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
5054 * PL_op or pad. Also need to process any magic vars
5055 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
5056
5057 PL_op = NULL;
5058
5e98dac2 5059 re_sv = NULL;
e4bfbed3
DM
5060 if (logical == 0) /* (?{})/ */
5061 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5062 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5063 sw = cBOOL(SvTRUE(ret));
5064 logical = 0;
5065 }
5066 else { /* /(??{}) */
497d0a96
DM
5067 /* if its overloaded, let the regex compiler handle
5068 * it; otherwise extract regex, or stringify */
5069 if (!SvAMAGIC(ret)) {
5070 SV *sv = ret;
5071 if (SvROK(sv))
5072 sv = SvRV(sv);
5073 if (SvTYPE(sv) == SVt_REGEXP)
5074 re_sv = (REGEXP*) sv;
5075 else if (SvSMAGICAL(sv)) {
5076 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5077 if (mg)
5078 re_sv = (REGEXP *) mg->mg_obj;
5079 }
e4bfbed3 5080
497d0a96
DM
5081 /* force any magic, undef warnings here */
5082 if (!re_sv) {
5083 ret = sv_mortalcopy(ret);
5084 (void) SvPV_force_nolen(ret);
5085 }
e4bfbed3
DM
5086 }
5087
5088 }
5089
81ed78b2
DM
5090 /* *** Note that at this point we don't restore
5091 * PL_comppad, (or pop the CxSUB) on the assumption it may
5092 * be used again soon. This is safe as long as nothing
5093 * in the regexp code uses the pad ! */
4aabdb9b 5094 PL_op = oop;
4aabdb9b 5095 PL_curcop = ocurcop;
92da3157 5096 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
f5df269c 5097 PL_curpm = PL_reg_curpm;
e4bfbed3
DM
5098
5099 if (logical != 2)
4aabdb9b 5100 break;
8e5e9ebe 5101 }
e4bfbed3
DM
5102
5103 /* only /(??{})/ from now on */
24d3c4a9 5104 logical = 0;
4aabdb9b 5105 {
4f639d21
DM
5106 /* extract RE object from returned value; compiling if
5107 * necessary */
5c35adbb 5108
575c37f6
DM
5109 if (re_sv) {
5110 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5111 }
0f5d15d6 5112 else {
c737faaf 5113 U32 pm_flags = 0;
0f5d15d6 5114
9753d940
DM
5115 if (SvUTF8(ret) && IN_BYTES) {
5116 /* In use 'bytes': make a copy of the octet
5117 * sequence, but without the flag on */
b9ad30b4
NC
5118 STRLEN len;
5119 const char *const p = SvPV(ret, len);
5120 ret = newSVpvn_flags(p, len, SVs_TEMP);
5121 }
732caac7
DM
5122 if (rex->intflags & PREGf_USE_RE_EVAL)
5123 pm_flags |= PMf_USE_RE_EVAL;
5124
5125 /* if we got here, it should be an engine which
5126 * supports compiling code blocks and stuff */
5127 assert(rex->engine && rex->engine->op_comp);
ec841a27 5128 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5129 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5130 rex->engine, NULL, NULL,
5131 /* copy /msix etc to inner pattern */
5132 scan->flags,
5133 pm_flags);
732caac7 5134
9041c2e3 5135 if (!(SvFLAGS(ret)
faf82a0b 5136 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 5137 | SVs_GMG))) {
a2794585
NC
5138 /* This isn't a first class regexp. Instead, it's
5139 caching a regexp onto an existing, Perl visible
5140 scalar. */
575c37f6 5141 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5142 }
74088413
DM
5143 /* safe to do now that any $1 etc has been
5144 * interpolated into the new pattern string and
5145 * compiled */
92da3157 5146 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
0f5d15d6 5147 }
e1ff3a88 5148 SAVEFREESV(re_sv);
8d919b0a 5149 re = ReANY(re_sv);
4aabdb9b 5150 }
07bc277f 5151 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5152 re->subbeg = rex->subbeg;
5153 re->sublen = rex->sublen;
6502e081
DM
5154 re->suboffset = rex->suboffset;
5155 re->subcoffset = rex->subcoffset;
f8fc2ecf 5156 rei = RXi_GET(re);
6bda09f9 5157 DEBUG_EXECUTE_r(
220db18a
DM
5158 debug_start_match(re_sv, utf8_target, locinput,
5159 reginfo->strend, "Matching embedded");
6bda09f9 5160 );
f8fc2ecf 5161 startpoint = rei->program + 1;
1a147d38 5162 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5163
1a147d38 5164 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5165 /* run the pattern returned from (??{...}) */
92da3157
DM
5166
5167 /* Save *all* the positions. */
5168 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5169 REGCP_SET(ST.lastcp);
6bda09f9 5170
0357f1fd
ML
5171 re->lastparen = 0;
5172 re->lastcloseparen = 0;
5173
92da3157 5174 maxopenparen = 0;
4aabdb9b 5175
1cb95af7
DM
5176 /* invalidate the S-L poscache. We're now executing a
5177 * different set of WHILEM ops (and their associated
5178 * indexes) against the same string, so the bits in the
5179 * cache are meaningless. Setting maxiter to zero forces
5180 * the cache to be invalidated and zeroed before reuse.
5181 * XXX This is too dramatic a measure. Ideally we should
5182 * save the old cache and restore when running the outer
5183 * pattern again */
1cb48e53 5184 reginfo->poscache_maxiter = 0;
4aabdb9b 5185
aed7b151 5186 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5187
288b8c02 5188 ST.prev_rex = rex_sv;
faec1544 5189 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5190 rex_sv = re_sv;
5191 SET_reg_curpm(rex_sv);
288b8c02 5192 rex = re;
f8fc2ecf 5193 rexi = rei;
faec1544 5194 cur_curlyx = NULL;
40a82448 5195 ST.B = next;
faec1544
DM
5196 ST.prev_eval = cur_eval;
5197 cur_eval = st;
faec1544 5198 /* now continue from first node in postoned RE */
4d5016e5 5199 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5200 assert(0); /* NOTREACHED */
c277df42 5201 }
40a82448 5202
faec1544
DM
5203 case EVAL_AB: /* cleanup after a successful (??{A})B */
5204 /* note: this is called twice; first after popping B, then A */
ec43f78b 5205 rex_sv = ST.prev_rex;
aed7b151 5206 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5207 SET_reg_curpm(rex_sv);
8d919b0a 5208 rex = ReANY(rex_sv);
f8fc2ecf 5209 rexi = RXi_GET(rex);
faec1544
DM
5210 regcpblow(ST.cp);
5211 cur_eval = ST.prev_eval;
5212 cur_curlyx = ST.prev_curlyx;
34a81e2b 5213
1cb95af7 5214 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5215 reginfo->poscache_maxiter = 0;
e7707071 5216 if ( nochange_depth )
4b196cd4 5217 nochange_depth--;
262b90c4 5218 sayYES;
40a82448 5219
40a82448 5220
faec1544
DM
5221 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5222 /* note: this is called twice; first after popping B, then A */
ec43f78b 5223 rex_sv = ST.prev_rex;
aed7b151 5224 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5225 SET_reg_curpm(rex_sv);
8d919b0a 5226 rex = ReANY(rex_sv);
f8fc2ecf 5227 rexi = RXi_GET(rex);
0357f1fd 5228
40a82448 5229 REGCP_UNWIND(ST.lastcp);
92da3157 5230 regcppop(rex, &maxopenparen);
faec1544
DM
5231 cur_eval = ST.prev_eval;
5232 cur_curlyx = ST.prev_curlyx;
1cb95af7 5233 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5234 reginfo->poscache_maxiter = 0;
e7707071 5235 if ( nochange_depth )
4b196cd4 5236 nochange_depth--;
40a82448 5237 sayNO_SILENT;
40a82448
DM
5238#undef ST
5239
3c0563b9 5240 case OPEN: /* ( */
c277df42 5241 n = ARG(scan); /* which paren pair */
9d9163fb 5242 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
5243 if (n > maxopenparen)
5244 maxopenparen = n;
495f47a5 5245 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5246 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5247 PTR2UV(rex),
5248 PTR2UV(rex->offs),
5249 (UV)n,
5250 (IV)rex->offs[n].start_tmp,
92da3157 5251 (UV)maxopenparen
495f47a5 5252 ));
e2e6a0f1 5253 lastopen = n;
a0d0e21e 5254 break;
495f47a5
DM
5255
5256/* XXX really need to log other places start/end are set too */
5257#define CLOSE_CAPTURE \
5258 rex->offs[n].start = rex->offs[n].start_tmp; \
9d9163fb 5259 rex->offs[n].end = locinput - reginfo->strbeg; \
495f47a5
DM
5260 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5261 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5262 PTR2UV(rex), \
5263 PTR2UV(rex->offs), \
5264 (UV)n, \
5265 (IV)rex->offs[n].start, \
5266 (IV)rex->offs[n].end \
5267 ))
5268
3c0563b9 5269 case CLOSE: /* ) */
c277df42 5270 n = ARG(scan); /* which paren pair */
495f47a5 5271 CLOSE_CAPTURE;
b93070ed
DM
5272 if (n > rex->lastparen)
5273 rex->lastparen = n;
5274 rex->lastcloseparen = n;
3b6647e0 5275 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5276 goto fake_end;
5277 }
a0d0e21e 5278 break;
3c0563b9
DM
5279
5280 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5281 if (ARG(scan)){
5282 regnode *cursor;
5283 for (cursor=scan;
5284 cursor && OP(cursor)!=END;
5285 cursor=regnext(cursor))
5286 {
5287 if ( OP(cursor)==CLOSE ){
5288 n = ARG(cursor);
5289 if ( n <= lastopen ) {
495f47a5 5290 CLOSE_CAPTURE;
b93070ed
DM
5291 if (n > rex->lastparen)
5292 rex->lastparen = n;
5293 rex->lastcloseparen = n;
3b6647e0
RB
5294 if ( n == ARG(scan) || (cur_eval &&
5295 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5296 break;
5297 }
5298 }
5299 }
5300 }
5301 goto fake_end;
5302 /*NOTREACHED*/
3c0563b9
DM
5303
5304 case GROUPP: /* (?(1)) */
c277df42 5305 n = ARG(scan); /* which paren pair */
b93070ed 5306 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5307 break;
3c0563b9
DM
5308
5309 case NGROUPP: /* (?(<name>)) */
0a4db386 5310 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5311 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5312 break;
3c0563b9
DM
5313
5314 case INSUBP: /* (?(R)) */
0a4db386 5315 n = ARG(scan);
3b6647e0 5316 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5317 break;
3c0563b9
DM
5318
5319 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5320 sw = 0;
5321 break;
3c0563b9
DM
5322
5323 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 5324 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 5325 if (sw)
c277df42
IZ
5326 next = NEXTOPER(NEXTOPER(scan));
5327 else {
5328 next = scan + ARG(scan);
5329 if (OP(next) == IFTHEN) /* Fake one. */
5330 next = NEXTOPER(NEXTOPER(next));
5331 }
5332 break;
3c0563b9
DM
5333
5334 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5335 logical = scan->flags;
c277df42 5336 break;
c476f425 5337
2ab05381 5338/*******************************************************************
2ab05381 5339
c476f425
DM
5340The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5341pattern, where A and B are subpatterns. (For simple A, CURLYM or
5342STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5343
c476f425 5344A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5345
c476f425
DM
5346On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5347state, which contains the current count, initialised to -1. It also sets
5348cur_curlyx to point to this state, with any previous value saved in the
5349state block.
2ab05381 5350
c476f425
DM
5351CURLYX then jumps straight to the WHILEM op, rather than executing A,
5352since the pattern may possibly match zero times (i.e. it's a while {} loop
5353rather than a do {} while loop).
2ab05381 5354
c476f425
DM
5355Each entry to WHILEM represents a successful match of A. The count in the
5356CURLYX block is incremented, another WHILEM state is pushed, and execution
5357passes to A or B depending on greediness and the current count.
2ab05381 5358
c476f425
DM
5359For example, if matching against the string a1a2a3b (where the aN are
5360substrings that match /A/), then the match progresses as follows: (the
5361pushed states are interspersed with the bits of strings matched so far):
2ab05381 5362
c476f425
DM
5363 <CURLYX cnt=-1>
5364 <CURLYX cnt=0><WHILEM>
5365 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5366 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5367 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5368 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5369
c476f425
DM
5370(Contrast this with something like CURLYM, which maintains only a single
5371backtrack state:
2ab05381 5372
c476f425
DM
5373 <CURLYM cnt=0> a1
5374 a1 <CURLYM cnt=1> a2
5375 a1 a2 <CURLYM cnt=2> a3
5376 a1 a2 a3 <CURLYM cnt=3> b
5377)
2ab05381 5378
c476f425
DM
5379Each WHILEM state block marks a point to backtrack to upon partial failure
5380of A or B, and also contains some minor state data related to that
5381iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5382overall state, such as the count, and pointers to the A and B ops.
2ab05381 5383
c476f425
DM
5384This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5385must always point to the *current* CURLYX block, the rules are:
2ab05381 5386
c476f425
DM
5387When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5388and set cur_curlyx to point the new block.
2ab05381 5389
c476f425
DM
5390When popping the CURLYX block after a successful or unsuccessful match,
5391restore the previous cur_curlyx.
2ab05381 5392
c476f425
DM
5393When WHILEM is about to execute B, save the current cur_curlyx, and set it
5394to the outer one saved in the CURLYX block.
2ab05381 5395
c476f425
DM
5396When popping the WHILEM block after a successful or unsuccessful B match,
5397restore the previous cur_curlyx.
2ab05381 5398
c476f425
DM
5399Here's an example for the pattern (AI* BI)*BO
5400I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5401
c476f425
DM
5402cur_
5403curlyx backtrack stack
5404------ ---------------
5405NULL
5406CO <CO prev=NULL> <WO>
5407CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5408CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5409NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5410
c476f425
DM
5411At this point the pattern succeeds, and we work back down the stack to
5412clean up, restoring as we go:
95b24440 5413
c476f425
DM
5414CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5415CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5416CO <CO prev=NULL> <WO>
5417NULL
a0374537 5418
c476f425
DM
5419*******************************************************************/
5420
5421#define ST st->u.curlyx
5422
5423 case CURLYX: /* start of /A*B/ (for complex A) */
5424 {
5425 /* No need to save/restore up to this paren */
5426 I32 parenfloor = scan->flags;
5427
5428 assert(next); /* keep Coverity happy */
5429 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5430 next += ARG(next);
5431
5432 /* XXXX Probably it is better to teach regpush to support
92da3157 5433 parenfloor > maxopenparen ... */
b93070ed
DM
5434 if (parenfloor > (I32)rex->lastparen)
5435 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5436
5437 ST.prev_curlyx= cur_curlyx;
5438 cur_curlyx = st;
5439 ST.cp = PL_savestack_ix;
5440
5441 /* these fields contain the state of the current curly.
5442 * they are accessed by subsequent WHILEMs */
5443 ST.parenfloor = parenfloor;
d02d6d97 5444 ST.me = scan;
c476f425 5445 ST.B = next;
24d3c4a9
DM
5446 ST.minmod = minmod;
5447 minmod = 0;
c476f425
DM
5448 ST.count = -1; /* this will be updated by WHILEM */
5449 ST.lastloc = NULL; /* this will be updated by WHILEM */
5450
4d5016e5 5451 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5452 assert(0); /* NOTREACHED */
c476f425 5453 }
a0d0e21e 5454
c476f425 5455 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5456 cur_curlyx = ST.prev_curlyx;
5457 sayYES;
118e2215 5458 assert(0); /* NOTREACHED */
a0d0e21e 5459
c476f425
DM
5460 case CURLYX_end_fail: /* just failed to match all of A*B */
5461 regcpblow(ST.cp);
5462 cur_curlyx = ST.prev_curlyx;
5463 sayNO;
118e2215 5464 assert(0); /* NOTREACHED */
4633a7c4 5465
a0d0e21e 5466
c476f425
DM
5467#undef ST
5468#define ST st->u.whilem
5469
5470 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5471 {
5472 /* see the discussion above about CURLYX/WHILEM */
c476f425 5473 I32 n;
d02d6d97
DM
5474 int min = ARG1(cur_curlyx->u.curlyx.me);
5475 int max = ARG2(cur_curlyx->u.curlyx.me);
5476 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5477
c476f425
DM
5478 assert(cur_curlyx); /* keep Coverity happy */
5479 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5480 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5481 ST.cache_offset = 0;
5482 ST.cache_mask = 0;
5483
c476f425
DM
5484
5485 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5486 "%*s whilem: matched %ld out of %d..%d\n",
5487 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5488 );
a0d0e21e 5489
c476f425 5490 /* First just match a string of min A's. */
a0d0e21e 5491
d02d6d97 5492 if (n < min) {
92da3157
DM
5493 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5494 maxopenparen);
c476f425 5495 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5496 REGCP_SET(ST.lastcp);
5497
4d5016e5 5498 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5499 assert(0); /* NOTREACHED */
c476f425
DM
5500 }
5501
5502 /* If degenerate A matches "", assume A done. */
5503
5504 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5505 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5506 "%*s whilem: empty match detected, trying continuation...\n",
5507 REPORT_CODE_OFF+depth*2, "")
5508 );
5509 goto do_whilem_B_max;
5510 }
5511
1cb95af7
DM
5512 /* super-linear cache processing.
5513 *
5514 * The idea here is that for certain types of CURLYX/WHILEM -
5515 * principally those whose upper bound is infinity (and
5516 * excluding regexes that have things like \1 and other very
5517 * non-regular expresssiony things), then if a pattern like
5518 * /....A*.../ fails and we backtrack to the WHILEM, then we
5519 * make a note that this particular WHILEM op was at string
5520 * position 47 (say) when the rest of pattern failed. Then, if
5521 * we ever find ourselves back at that WHILEM, and at string
5522 * position 47 again, we can just fail immediately rather than
5523 * running the rest of the pattern again.
5524 *
5525 * This is very handy when patterns start to go
5526 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5527 * with a combinatorial explosion of backtracking.
5528 *
5529 * The cache is implemented as a bit array, with one bit per
5530 * string byte position per WHILEM op (up to 16) - so its
5531 * between 0.25 and 2x the string size.
5532 *
5533 * To avoid allocating a poscache buffer every time, we do an
5534 * initially countdown; only after we have executed a WHILEM
5535 * op (string-length x #WHILEMs) times do we allocate the
5536 * cache.
5537 *
5538 * The top 4 bits of scan->flags byte say how many different
5539 * relevant CURLLYX/WHILEM op pairs there are, while the
5540 * bottom 4-bits is the identifying index number of this
5541 * WHILEM.
5542 */
c476f425
DM
5543
5544 if (scan->flags) {
a0d0e21e 5545
1cb48e53 5546 if (!reginfo->poscache_maxiter) {
c476f425
DM
5547 /* start the countdown: Postpone detection until we
5548 * know the match is not *that* much linear. */
1cb48e53 5549 reginfo->poscache_maxiter
9d9163fb
DM
5550 = (reginfo->strend - reginfo->strbeg + 1)
5551 * (scan->flags>>4);
66bf836d 5552 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
5553 if (reginfo->poscache_maxiter < 0)
5554 reginfo->poscache_maxiter = I32_MAX;
5555 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 5556 }
c476f425 5557
1cb48e53 5558 if (reginfo->poscache_iter-- == 0) {
c476f425 5559 /* initialise cache */
1cb48e53 5560 const I32 size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
5561 regmatch_info_aux *const aux = reginfo->info_aux;
5562 if (aux->poscache) {
5563 if ((I32)reginfo->poscache_size < size) {
5564 Renew(aux->poscache, size, char);
5565 reginfo->poscache_size = size;
2c2d71f5 5566 }
2ac8ff4b 5567 Zero(aux->poscache, size, char);
2c2d71f5
JH
5568 }
5569 else {
2ac8ff4b
DM
5570 reginfo->poscache_size = size;
5571 Newxz(aux->poscache, size, char);
2c2d71f5 5572 }
c476f425
DM
5573 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5574 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5575 PL_colors[4], PL_colors[5])
5576 );
2c2d71f5 5577 }
c476f425 5578
1cb48e53 5579 if (reginfo->poscache_iter < 0) {
c476f425
DM
5580 /* have we already failed at this position? */
5581 I32 offset, mask;
338e600a
DM
5582
5583 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 5584 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
5585 + (locinput - reginfo->strbeg)
5586 * (scan->flags>>4);
c476f425
DM
5587 mask = 1 << (offset % 8);
5588 offset /= 8;
2ac8ff4b 5589 if (reginfo->info_aux->poscache[offset] & mask) {
c476f425
DM
5590 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5591 "%*s whilem: (cache) already tried at this position...\n",
5592 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5593 );
3298f257 5594 sayNO; /* cache records failure */
2c2d71f5 5595 }
c476f425
DM
5596 ST.cache_offset = offset;
5597 ST.cache_mask = mask;
2c2d71f5 5598 }
c476f425 5599 }
2c2d71f5 5600
c476f425 5601 /* Prefer B over A for minimal matching. */
a687059c 5602
c476f425
DM
5603 if (cur_curlyx->u.curlyx.minmod) {
5604 ST.save_curlyx = cur_curlyx;
5605 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5606 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5607 maxopenparen);
c476f425 5608 REGCP_SET(ST.lastcp);
4d5016e5
DM
5609 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5610 locinput);
118e2215 5611 assert(0); /* NOTREACHED */
c476f425 5612 }
a0d0e21e 5613
c476f425
DM
5614 /* Prefer A over B for maximal matching. */
5615
d02d6d97 5616 if (n < max) { /* More greed allowed? */
92da3157
DM
5617 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5618 maxopenparen);
c476f425
DM
5619 cur_curlyx->u.curlyx.lastloc = locinput;
5620 REGCP_SET(ST.lastcp);
4d5016e5 5621 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5622 assert(0); /* NOTREACHED */
c476f425
DM
5623 }
5624 goto do_whilem_B_max;
5625 }
118e2215 5626 assert(0); /* NOTREACHED */
c476f425
DM
5627
5628 case WHILEM_B_min: /* just matched B in a minimal match */
5629 case WHILEM_B_max: /* just matched B in a maximal match */
5630 cur_curlyx = ST.save_curlyx;
5631 sayYES;
118e2215 5632 assert(0); /* NOTREACHED */
c476f425
DM
5633
5634 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5635 cur_curlyx = ST.save_curlyx;
5636 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5637 cur_curlyx->u.curlyx.count--;
5638 CACHEsayNO;
118e2215 5639 assert(0); /* NOTREACHED */
c476f425
DM
5640
5641 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5642 /* FALL THROUGH */
5643 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5644 REGCP_UNWIND(ST.lastcp);
92da3157 5645 regcppop(rex, &maxopenparen);
c476f425
DM
5646 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5647 cur_curlyx->u.curlyx.count--;
5648 CACHEsayNO;
118e2215 5649 assert(0); /* NOTREACHED */
c476f425
DM
5650
5651 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5652 REGCP_UNWIND(ST.lastcp);
92da3157 5653 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5655 "%*s whilem: failed, trying continuation...\n",
5656 REPORT_CODE_OFF+depth*2, "")
5657 );
5658 do_whilem_B_max:
5659 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5660 && ckWARN(WARN_REGEXP)
39819bd9 5661 && !reginfo->warned)
c476f425 5662 {
39819bd9 5663 reginfo->warned = TRUE;
dcbac5bb
FC
5664 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5665 "Complex regular subexpression recursion limit (%d) "
5666 "exceeded",
c476f425
DM
5667 REG_INFTY - 1);
5668 }
5669
5670 /* now try B */
5671 ST.save_curlyx = cur_curlyx;
5672 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5673 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5674 locinput);
118e2215 5675 assert(0); /* NOTREACHED */
c476f425
DM
5676
5677 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5678 cur_curlyx = ST.save_curlyx;
5679 REGCP_UNWIND(ST.lastcp);
92da3157 5680 regcppop(rex, &maxopenparen);
c476f425 5681
d02d6d97 5682 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5683 /* Maximum greed exceeded */
5684 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5685 && ckWARN(WARN_REGEXP)
39819bd9 5686 && !reginfo->warned)
c476f425 5687 {
39819bd9 5688 reginfo->warned = TRUE;
c476f425 5689 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5690 "Complex regular subexpression recursion "
5691 "limit (%d) exceeded",
c476f425 5692 REG_INFTY - 1);
a0d0e21e 5693 }
c476f425 5694 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5695 CACHEsayNO;
a0d0e21e 5696 }
c476f425
DM
5697
5698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5699 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5700 );
5701 /* Try grabbing another A and see if it helps. */
c476f425 5702 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5703 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5704 maxopenparen);
c476f425 5705 REGCP_SET(ST.lastcp);
d02d6d97 5706 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5707 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5708 locinput);
118e2215 5709 assert(0); /* NOTREACHED */
40a82448
DM
5710
5711#undef ST
5712#define ST st->u.branch
5713
5714 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5715 next = scan + ARG(scan);
5716 if (next == scan)
5717 next = NULL;
40a82448
DM
5718 scan = NEXTOPER(scan);
5719 /* FALL THROUGH */
c277df42 5720
40a82448
DM
5721 case BRANCH: /* /(...|A|...)/ */
5722 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5723 ST.lastparen = rex->lastparen;
f6033a9d 5724 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5725 ST.next_branch = next;
5726 REGCP_SET(ST.cp);
02db2b7b 5727
40a82448 5728 /* Now go into the branch */
5d458dd8 5729 if (has_cutgroup) {
4d5016e5 5730 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5731 } else {
4d5016e5 5732 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5733 }
118e2215 5734 assert(0); /* NOTREACHED */
3c0563b9
DM
5735
5736 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5737 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5738 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5739 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5740 assert(0); /* NOTREACHED */
3c0563b9 5741
5d458dd8
YO
5742 case CUTGROUP_next_fail:
5743 do_cutgroup = 1;
5744 no_final = 1;
5745 if (st->u.mark.mark_name)
5746 sv_commit = st->u.mark.mark_name;
5747 sayNO;
118e2215 5748 assert(0); /* NOTREACHED */
3c0563b9 5749
5d458dd8
YO
5750 case BRANCH_next:
5751 sayYES;
118e2215 5752 assert(0); /* NOTREACHED */
3c0563b9 5753
40a82448 5754 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5755 if (do_cutgroup) {
5756 do_cutgroup = 0;
5757 no_final = 0;
5758 }
40a82448 5759 REGCP_UNWIND(ST.cp);
a8d1f4b4 5760 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5761 scan = ST.next_branch;
5762 /* no more branches? */
5d458dd8
YO
5763 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5764 DEBUG_EXECUTE_r({
5765 PerlIO_printf( Perl_debug_log,
5766 "%*s %sBRANCH failed...%s\n",
5767 REPORT_CODE_OFF+depth*2, "",
5768 PL_colors[4],
5769 PL_colors[5] );
5770 });
5771 sayNO_SILENT;
5772 }
40a82448 5773 continue; /* execute next BRANCH[J] op */
118e2215 5774 assert(0); /* NOTREACHED */
40a82448 5775
3c0563b9 5776 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5777 minmod = 1;
a0d0e21e 5778 break;
40a82448
DM
5779
5780#undef ST
5781#define ST st->u.curlym
5782
5783 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5784
5785 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5786 * only a single backtracking state, no matter how many matches
40a82448
DM
5787 * there are in {m,n}. It relies on the pattern being constant
5788 * length, with no parens to influence future backrefs
5789 */
5790
5791 ST.me = scan;
dc45a647 5792 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5793
f6033a9d
DM
5794 ST.lastparen = rex->lastparen;
5795 ST.lastcloseparen = rex->lastcloseparen;
5796
40a82448
DM
5797 /* if paren positive, emulate an OPEN/CLOSE around A */
5798 if (ST.me->flags) {
3b6647e0 5799 U32 paren = ST.me->flags;
92da3157
DM
5800 if (paren > maxopenparen)
5801 maxopenparen = paren;
c277df42 5802 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5803 }
40a82448
DM
5804 ST.A = scan;
5805 ST.B = next;
5806 ST.alen = 0;
5807 ST.count = 0;
24d3c4a9
DM
5808 ST.minmod = minmod;
5809 minmod = 0;
40a82448
DM
5810 ST.c1 = CHRTEST_UNINIT;
5811 REGCP_SET(ST.cp);
6407bf3b 5812
40a82448
DM
5813 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5814 goto curlym_do_B;
5815
5816 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5817 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5818 assert(0); /* NOTREACHED */
5f80c4cf 5819
40a82448 5820 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5821 ST.count++;
5822 /* after first match, determine A's length: u.curlym.alen */
5823 if (ST.count == 1) {
ba44c216 5824 if (reginfo->is_utf8_target) {
c07e9d7b
DM
5825 char *s = st->locinput;
5826 while (s < locinput) {
40a82448
DM
5827 ST.alen++;
5828 s += UTF8SKIP(s);
5829 }
5830 }
5831 else {
c07e9d7b 5832 ST.alen = locinput - st->locinput;
40a82448
DM
5833 }
5834 if (ST.alen == 0)
5835 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5836 }
0cadcf80
DM
5837 DEBUG_EXECUTE_r(
5838 PerlIO_printf(Perl_debug_log,
40a82448 5839 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5840 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5841 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5842 );
5843
0a4db386 5844 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5845 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5846 goto fake_end;
5847
c966426a
DM
5848 {
5849 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5850 if ( max == REG_INFTY || ST.count < max )
5851 goto curlym_do_A; /* try to match another A */
5852 }
40a82448 5853 goto curlym_do_B; /* try to match B */
5f80c4cf 5854
40a82448
DM
5855 case CURLYM_A_fail: /* just failed to match an A */
5856 REGCP_UNWIND(ST.cp);
0a4db386
YO
5857
5858 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5859 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5860 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5861 sayNO;
0cadcf80 5862
40a82448 5863 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5864 if (ST.c1 == CHRTEST_UNINIT) {
5865 /* calculate c1 and c2 for possible match of 1st char
5866 * following curly */
5867 ST.c1 = ST.c2 = CHRTEST_VOID;
5868 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5869 regnode *text_node = ST.B;
5870 if (! HAS_TEXT(text_node))
5871 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5872 /* this used to be
5873
5874 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5875
5876 But the former is redundant in light of the latter.
5877
5878 if this changes back then the macro for
5879 IS_TEXT and friends need to change.
5880 */
c74f6de9 5881 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 5882 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 5883 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 5884 reginfo))
c74f6de9
KW
5885 {
5886 sayNO;
5887 }
c277df42 5888 }
c277df42 5889 }
40a82448
DM
5890 }
5891
5892 DEBUG_EXECUTE_r(
5893 PerlIO_printf(Perl_debug_log,
5894 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5895 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5896 "", (IV)ST.count)
5897 );
c74f6de9 5898 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5899 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5900 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5901 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5902 {
5903 /* simulate B failing */
5904 DEBUG_OPTIMISE_r(
5905 PerlIO_printf(Perl_debug_log,
5906 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5907 (int)(REPORT_CODE_OFF+(depth*2)),"",
5908 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5909 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5910 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5911 );
5912 state_num = CURLYM_B_fail;
5913 goto reenter_switch;
5914 }
5915 }
5916 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5917 /* simulate B failing */
5918 DEBUG_OPTIMISE_r(
5919 PerlIO_printf(Perl_debug_log,
79a2a0e8 5920 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5921 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5922 (int) nextchr, ST.c1, ST.c2)
5923 );
5400f398
KW
5924 state_num = CURLYM_B_fail;
5925 goto reenter_switch;
5926 }
c74f6de9 5927 }
40a82448
DM
5928
5929 if (ST.me->flags) {
f6033a9d 5930 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5931 I32 paren = ST.me->flags;
5932 if (ST.count) {
b93070ed 5933 rex->offs[paren].start
9d9163fb
DM
5934 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5935 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
5936 if ((U32)paren > rex->lastparen)
5937 rex->lastparen = paren;
5938 rex->lastcloseparen = paren;
c277df42 5939 }
40a82448 5940 else
b93070ed 5941 rex->offs[paren].end = -1;
0a4db386 5942 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5943 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5944 {
5945 if (ST.count)
5946 goto fake_end;
5947 else
5948 sayNO;
5949 }
c277df42 5950 }
0a4db386 5951
4d5016e5 5952 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5953 assert(0); /* NOTREACHED */
40a82448
DM
5954
5955 case CURLYM_B_fail: /* just failed to match a B */
5956 REGCP_UNWIND(ST.cp);
a8d1f4b4 5957 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5958 if (ST.minmod) {
84d2fa14
HS
5959 I32 max = ARG2(ST.me);
5960 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5961 sayNO;
5962 goto curlym_do_A; /* try to match a further A */
5963 }
5964 /* backtrack one A */
5965 if (ST.count == ARG1(ST.me) /* min */)
5966 sayNO;
5967 ST.count--;
7016d6eb 5968 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5969 goto curlym_do_B; /* try to match B */
5970
c255a977
DM
5971#undef ST
5972#define ST st->u.curly
40a82448 5973
c255a977
DM
5974#define CURLY_SETPAREN(paren, success) \
5975 if (paren) { \
5976 if (success) { \
9d9163fb
DM
5977 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5978 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
5979 if (paren > rex->lastparen) \
5980 rex->lastparen = paren; \
b93070ed 5981 rex->lastcloseparen = paren; \
c255a977 5982 } \
f6033a9d 5983 else { \
b93070ed 5984 rex->offs[paren].end = -1; \
f6033a9d
DM
5985 rex->lastparen = ST.lastparen; \
5986 rex->lastcloseparen = ST.lastcloseparen; \
5987 } \
c255a977
DM
5988 }
5989
b40a2c17 5990 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5991 ST.paren = 0;
5992 ST.min = 0;
5993 ST.max = REG_INFTY;
a0d0e21e
LW
5994 scan = NEXTOPER(scan);
5995 goto repeat;
3c0563b9 5996
b40a2c17 5997 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5998 ST.paren = 0;
5999 ST.min = 1;
6000 ST.max = REG_INFTY;
c277df42 6001 scan = NEXTOPER(scan);
c255a977 6002 goto repeat;
3c0563b9 6003
b40a2c17 6004 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
6005 ST.paren = scan->flags; /* Which paren to set */
6006 ST.lastparen = rex->lastparen;
f6033a9d 6007 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
6008 if (ST.paren > maxopenparen)
6009 maxopenparen = ST.paren;
c255a977
DM
6010 ST.min = ARG1(scan); /* min to match */
6011 ST.max = ARG2(scan); /* max to match */
0a4db386 6012 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6013 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6014 ST.min=1;
6015 ST.max=1;
6016 }
c255a977
DM
6017 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6018 goto repeat;
3c0563b9 6019
b40a2c17 6020 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
6021 ST.paren = 0;
6022 ST.min = ARG1(scan); /* min to match */
6023 ST.max = ARG2(scan); /* max to match */
6024 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 6025 repeat:
a0d0e21e
LW
6026 /*
6027 * Lookahead to avoid useless match attempts
6028 * when we know what character comes next.
c255a977 6029 *
5f80c4cf
JP
6030 * Used to only do .*x and .*?x, but now it allows
6031 * for )'s, ('s and (?{ ... })'s to be in the way
6032 * of the quantifier and the EXACT-like node. -- japhy
6033 */
6034
eb5c1be8 6035 assert(ST.min <= ST.max);
3337dfe3
KW
6036 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6037 ST.c1 = ST.c2 = CHRTEST_VOID;
6038 }
6039 else {
5f80c4cf
JP
6040 regnode *text_node = next;
6041
3dab1dad
YO
6042 if (! HAS_TEXT(text_node))
6043 FIND_NEXT_IMPT(text_node);
5f80c4cf 6044
9e137952 6045 if (! HAS_TEXT(text_node))
c255a977 6046 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 6047 else {
ee9b8eae 6048 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 6049 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 6050 }
c74f6de9 6051 else {
ee9b8eae
YO
6052
6053 /* Currently we only get here when
6054
6055 PL_rekind[OP(text_node)] == EXACT
6056
6057 if this changes back then the macro for IS_TEXT and
6058 friends need to change. */
79a2a0e8 6059 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6060 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6061 reginfo))
c74f6de9
KW
6062 {
6063 sayNO;
6064 }
6065 }
1aa99e6b 6066 }
bbce6d69 6067 }
c255a977
DM
6068
6069 ST.A = scan;
6070 ST.B = next;
24d3c4a9 6071 if (minmod) {
eb72505d 6072 char *li = locinput;
24d3c4a9 6073 minmod = 0;
984e6dd1 6074 if (ST.min &&
f9176b44 6075 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
984e6dd1 6076 < ST.min)
4633a7c4 6077 sayNO;
7016d6eb 6078 SET_locinput(li);
c255a977 6079 ST.count = ST.min;
c255a977
DM
6080 REGCP_SET(ST.cp);
6081 if (ST.c1 == CHRTEST_VOID)
6082 goto curly_try_B_min;
6083
6084 ST.oldloc = locinput;
6085
6086 /* set ST.maxpos to the furthest point along the
6087 * string that could possibly match */
6088 if (ST.max == REG_INFTY) {
220db18a 6089 ST.maxpos = reginfo->strend - 1;
f2ed9b32 6090 if (utf8_target)
c255a977
DM
6091 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6092 ST.maxpos--;
6093 }
f2ed9b32 6094 else if (utf8_target) {
c255a977
DM
6095 int m = ST.max - ST.min;
6096 for (ST.maxpos = locinput;
220db18a 6097 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
6098 ST.maxpos += UTF8SKIP(ST.maxpos);
6099 }
6100 else {
6101 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
6102 if (ST.maxpos >= reginfo->strend)
6103 ST.maxpos = reginfo->strend - 1;
c255a977
DM
6104 }
6105 goto curly_try_B_min_known;
6106
6107 }
6108 else {
eb72505d
DM
6109 /* avoid taking address of locinput, so it can remain
6110 * a register var */
6111 char *li = locinput;
f9176b44 6112 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
c255a977
DM
6113 if (ST.count < ST.min)
6114 sayNO;
7016d6eb 6115 SET_locinput(li);
c255a977
DM
6116 if ((ST.count > ST.min)
6117 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6118 {
6119 /* A{m,n} must come at the end of the string, there's
6120 * no point in backing off ... */
6121 ST.min = ST.count;
6122 /* ...except that $ and \Z can match before *and* after
6123 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6124 We may back off by one in this case. */
eb72505d 6125 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6126 ST.min--;
6127 }
6128 REGCP_SET(ST.cp);
6129 goto curly_try_B_max;
6130 }
118e2215 6131 assert(0); /* NOTREACHED */
c255a977
DM
6132
6133
6134 case CURLY_B_min_known_fail:
6135 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6136
c255a977 6137 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6138 if (ST.paren) {
6139 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6140 }
c255a977
DM
6141 /* Couldn't or didn't -- move forward. */
6142 ST.oldloc = locinput;
f2ed9b32 6143 if (utf8_target)
c255a977
DM
6144 locinput += UTF8SKIP(locinput);
6145 else
6146 locinput++;
6147 ST.count++;
6148 curly_try_B_min_known:
6149 /* find the next place where 'B' could work, then call B */
6150 {
6151 int n;
f2ed9b32 6152 if (utf8_target) {
c255a977
DM
6153 n = (ST.oldloc == locinput) ? 0 : 1;
6154 if (ST.c1 == ST.c2) {
c255a977 6155 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6156 while (locinput <= ST.maxpos
6157 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6158 {
6159 locinput += UTF8SKIP(locinput);
c255a977
DM
6160 n++;
6161 }
1aa99e6b
IH
6162 }
6163 else {
c255a977 6164 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6165 while (locinput <= ST.maxpos
6166 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6167 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6168 {
6169 locinput += UTF8SKIP(locinput);
c255a977 6170 n++;
1aa99e6b 6171 }
0fe9bf95
IZ
6172 }
6173 }
5400f398 6174 else { /* Not utf8_target */
c255a977
DM
6175 if (ST.c1 == ST.c2) {
6176 while (locinput <= ST.maxpos &&
6177 UCHARAT(locinput) != ST.c1)
6178 locinput++;
bbce6d69 6179 }
c255a977
DM
6180 else {
6181 while (locinput <= ST.maxpos
6182 && UCHARAT(locinput) != ST.c1
6183 && UCHARAT(locinput) != ST.c2)
6184 locinput++;
a0ed51b3 6185 }
c255a977
DM
6186 n = locinput - ST.oldloc;
6187 }
6188 if (locinput > ST.maxpos)
6189 sayNO;
c255a977 6190 if (n) {
eb72505d
DM
6191 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6192 * at b; check that everything between oldloc and
6193 * locinput matches */
6194 char *li = ST.oldloc;
c255a977 6195 ST.count += n;
f9176b44 6196 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
4633a7c4 6197 sayNO;
eb72505d 6198 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6199 }
c255a977 6200 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6201 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6202 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6203 goto fake_end;
6204 }
4d5016e5 6205 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6206 }
118e2215 6207 assert(0); /* NOTREACHED */
c255a977
DM
6208
6209
6210 case CURLY_B_min_fail:
6211 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6212
6213 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6214 if (ST.paren) {
6215 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6216 }
c255a977 6217 /* failed -- move forward one */
f73aaa43 6218 {
eb72505d 6219 char *li = locinput;
f9176b44 6220 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
f73aaa43
DM
6221 sayNO;
6222 }
eb72505d 6223 locinput = li;
f73aaa43
DM
6224 }
6225 {
c255a977 6226 ST.count++;
c255a977
DM
6227 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6228 ST.count > 0)) /* count overflow ? */
15272685 6229 {
c255a977
DM
6230 curly_try_B_min:
6231 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6232 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6233 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6234 goto fake_end;
6235 }
4d5016e5 6236 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6237 }
6238 }
c74f6de9 6239 sayNO;
118e2215 6240 assert(0); /* NOTREACHED */
c255a977
DM
6241
6242
6243 curly_try_B_max:
6244 /* a successful greedy match: now try to match B */
40d049e4 6245 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6246 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6247 goto fake_end;
6248 }
c255a977 6249 {
220db18a 6250 bool could_match = locinput < reginfo->strend;
79a2a0e8 6251
c255a977 6252 /* If it could work, try it. */
79a2a0e8
KW
6253 if (ST.c1 != CHRTEST_VOID && could_match) {
6254 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6255 {
6256 could_match = memEQ(locinput,
6257 ST.c1_utf8,
6258 UTF8SKIP(locinput))
6259 || memEQ(locinput,
6260 ST.c2_utf8,
6261 UTF8SKIP(locinput));
6262 }
6263 else {
6264 could_match = UCHARAT(locinput) == ST.c1
6265 || UCHARAT(locinput) == ST.c2;
6266 }
6267 }
6268 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6269 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6270 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6271 assert(0); /* NOTREACHED */
c255a977
DM
6272 }
6273 }
6274 /* FALL THROUGH */
3c0563b9 6275
c255a977
DM
6276 case CURLY_B_max_fail:
6277 /* failed to find B in a greedy match */
c255a977
DM
6278
6279 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6280 if (ST.paren) {
6281 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6282 }
c255a977
DM
6283 /* back up. */
6284 if (--ST.count < ST.min)
6285 sayNO;
eb72505d 6286 locinput = HOPc(locinput, -1);
c255a977
DM
6287 goto curly_try_B_max;
6288
6289#undef ST
6290
3c0563b9 6291 case END: /* last op of main pattern */
6bda09f9 6292 fake_end:
faec1544
DM
6293 if (cur_eval) {
6294 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544 6295
288b8c02 6296 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6297
6298 /* Save *all* the positions. */
6299 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b 6300 rex_sv = cur_eval->u.eval.prev_rex;
aed7b151 6301 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6302 SET_reg_curpm(rex_sv);
8d919b0a 6303 rex = ReANY(rex_sv);
f8fc2ecf 6304 rexi = RXi_GET(rex);
faec1544 6305 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6306
faec1544 6307 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6308
6309 /* Restore parens of the outer rex without popping the
6310 * savestack */
92da3157
DM
6311 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6312 &maxopenparen);
faec1544
DM
6313
6314 st->u.eval.prev_eval = cur_eval;
6315 cur_eval = cur_eval->u.eval.prev_eval;
6316 DEBUG_EXECUTE_r(
2a49f0f5
JH
6317 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6318 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6319 if ( nochange_depth )
6320 nochange_depth--;
6321
4d5016e5
DM
6322 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6323 locinput); /* match B */
faec1544
DM
6324 }
6325
3b0527fe 6326 if (locinput < reginfo->till) {
a3621e74 6327 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6328 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6329 PL_colors[4],
6d59b646
DM
6330 (long)(locinput - startpos),
6331 (long)(reginfo->till - startpos),
7821416a 6332 PL_colors[5]));
58e23c8d 6333
262b90c4 6334 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6335 }
262b90c4 6336 sayYES; /* Success! */
dad79028
DM
6337
6338 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6339 DEBUG_EXECUTE_r(
6340 PerlIO_printf(Perl_debug_log,
6341 "%*s %ssubpattern success...%s\n",
5bc10b2c 6342 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6343 sayYES; /* Success! */
dad79028 6344
40a82448
DM
6345#undef ST
6346#define ST st->u.ifmatch
6347
37f53970
DM
6348 {
6349 char *newstart;
6350
40a82448
DM
6351 case SUSPEND: /* (?>A) */
6352 ST.wanted = 1;
37f53970 6353 newstart = locinput;
9041c2e3 6354 goto do_ifmatch;
dad79028 6355
40a82448
DM
6356 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6357 ST.wanted = 0;
dad79028
DM
6358 goto ifmatch_trivial_fail_test;
6359
40a82448
DM
6360 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6361 ST.wanted = 1;
dad79028 6362 ifmatch_trivial_fail_test:
a0ed51b3 6363 if (scan->flags) {
52657f30 6364 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6365 if (!s) {
6366 /* trivial fail */
24d3c4a9
DM
6367 if (logical) {
6368 logical = 0;
f2338a2e 6369 sw = 1 - cBOOL(ST.wanted);
dad79028 6370 }
40a82448 6371 else if (ST.wanted)
dad79028
DM
6372 sayNO;
6373 next = scan + ARG(scan);
6374 if (next == scan)
6375 next = NULL;
6376 break;
6377 }
37f53970 6378 newstart = s;
a0ed51b3
LW
6379 }
6380 else
37f53970 6381 newstart = locinput;
a0ed51b3 6382
c277df42 6383 do_ifmatch:
40a82448 6384 ST.me = scan;
24d3c4a9 6385 ST.logical = logical;
24d786f4
YO
6386 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6387
40a82448 6388 /* execute body of (?...A) */
37f53970 6389 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6390 assert(0); /* NOTREACHED */
37f53970 6391 }
40a82448
DM
6392
6393 case IFMATCH_A_fail: /* body of (?...A) failed */
6394 ST.wanted = !ST.wanted;
6395 /* FALL THROUGH */
6396
6397 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6398 if (ST.logical) {
f2338a2e 6399 sw = cBOOL(ST.wanted);
40a82448
DM
6400 }
6401 else if (!ST.wanted)
6402 sayNO;
6403
37f53970
DM
6404 if (OP(ST.me) != SUSPEND) {
6405 /* restore old position except for (?>...) */
6406 locinput = st->locinput;
40a82448
DM
6407 }
6408 scan = ST.me + ARG(ST.me);
6409 if (scan == ST.me)
6410 scan = NULL;
6411 continue; /* execute B */
6412
6413#undef ST
dad79028 6414
3c0563b9
DM
6415 case LONGJMP: /* alternative with many branches compiles to
6416 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6417 next = scan + ARG(scan);
6418 if (next == scan)
6419 next = NULL;
a0d0e21e 6420 break;
3c0563b9
DM
6421
6422 case COMMIT: /* (*COMMIT) */
220db18a 6423 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 6424 /* FALLTHROUGH */
3c0563b9
DM
6425
6426 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6427 if (!scan->flags)
ad64d0ec 6428 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6429 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6430 assert(0); /* NOTREACHED */
3c0563b9 6431
54612592
YO
6432 case COMMIT_next_fail:
6433 no_final = 1;
6434 /* FALLTHROUGH */
3c0563b9
DM
6435
6436 case OPFAIL: /* (*FAIL) */
7f69552c 6437 sayNO;
118e2215 6438 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6439
6440#define ST st->u.mark
3c0563b9 6441 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6442 ST.prev_mark = mark_state;
5d458dd8 6443 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6444 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6445 mark_state = st;
4d5016e5
DM
6446 ST.mark_loc = locinput;
6447 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6448 assert(0); /* NOTREACHED */
3c0563b9 6449
e2e6a0f1
YO
6450 case MARKPOINT_next:
6451 mark_state = ST.prev_mark;
6452 sayYES;
118e2215 6453 assert(0); /* NOTREACHED */
3c0563b9 6454
e2e6a0f1 6455 case MARKPOINT_next_fail:
5d458dd8 6456 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6457 {
6458 if (ST.mark_loc > startpoint)
6459 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6460 popmark = NULL; /* we found our mark */
6461 sv_commit = ST.mark_name;
6462
6463 DEBUG_EXECUTE_r({
5d458dd8 6464 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6465 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6466 REPORT_CODE_OFF+depth*2, "",
be2597df 6467 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6468 });
6469 }
6470 mark_state = ST.prev_mark;
5d458dd8
YO
6471 sv_yes_mark = mark_state ?
6472 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6473 sayNO;
118e2215 6474 assert(0); /* NOTREACHED */
3c0563b9
DM
6475
6476 case SKIP: /* (*SKIP) */
5d458dd8 6477 if (scan->flags) {
2bf803e2 6478 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6479 ST.mark_name = NULL;
e2e6a0f1 6480 ST.mark_loc = locinput;
4d5016e5 6481 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6482 } else {
2bf803e2 6483 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6484 otherwise do nothing. Meaning we need to scan
6485 */
6486 regmatch_state *cur = mark_state;
ad64d0ec 6487 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6488
6489 while (cur) {
6490 if ( sv_eq( cur->u.mark.mark_name,
6491 find ) )
6492 {
6493 ST.mark_name = find;
4d5016e5 6494 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6495 }
6496 cur = cur->u.mark.prev_mark;
6497 }
e2e6a0f1 6498 }
2bf803e2 6499 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6500 break;
3c0563b9 6501
5d458dd8
YO
6502 case SKIP_next_fail:
6503 if (ST.mark_name) {
6504 /* (*CUT:NAME) - Set up to search for the name as we
6505 collapse the stack*/
6506 popmark = ST.mark_name;
6507 } else {
6508 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6509 if (ST.mark_loc > startpoint)
6510 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6511 /* but we set sv_commit to latest mark_name if there
6512 is one so they can test to see how things lead to this
6513 cut */
6514 if (mark_state)
6515 sv_commit=mark_state->u.mark.mark_name;
6516 }
e2e6a0f1
YO
6517 no_final = 1;
6518 sayNO;
118e2215 6519 assert(0); /* NOTREACHED */
e2e6a0f1 6520#undef ST
3c0563b9
DM
6521
6522 case LNBREAK: /* \R */
220db18a 6523 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 6524 locinput += n;
e1d1eefb
YO
6525 } else
6526 sayNO;
6527 break;
6528
a0d0e21e 6529 default:
b900a521 6530 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6531 PTR2UV(scan), OP(scan));
cea2e8a9 6532 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6533
6534 /* this is a point to jump to in order to increment
6535 * locinput by one character */
6536 increment_locinput:
e6ca698c 6537 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6538 if (utf8_target) {
6539 locinput += PL_utf8skip[nextchr];
7016d6eb 6540 /* locinput is allowed to go 1 char off the end, but not 2+ */
220db18a 6541 if (locinput > reginfo->strend)
28b98f76 6542 sayNO;
28b98f76
DM
6543 }
6544 else
3640db6b 6545 locinput++;
28b98f76 6546 break;
5d458dd8
YO
6547
6548 } /* end switch */
95b24440 6549
5d458dd8
YO
6550 /* switch break jumps here */
6551 scan = next; /* prepare to execute the next op and ... */
6552 continue; /* ... jump back to the top, reusing st */
118e2215 6553 assert(0); /* NOTREACHED */
95b24440 6554
40a82448
DM
6555 push_yes_state:
6556 /* push a state that backtracks on success */
6557 st->u.yes.prev_yes_state = yes_state;
6558 yes_state = st;
6559 /* FALL THROUGH */
6560 push_state:
6561 /* push a new regex state, then continue at scan */
6562 {
6563 regmatch_state *newst;
6564
24b23f37
YO
6565 DEBUG_STACK_r({
6566 regmatch_state *cur = st;
6567 regmatch_state *curyes = yes_state;
6568 int curd = depth;
6569 regmatch_slab *slab = PL_regmatch_slab;
6570 for (;curd > -1;cur--,curd--) {
6571 if (cur < SLAB_FIRST(slab)) {
6572 slab = slab->prev;
6573 cur = SLAB_LAST(slab);
6574 }
6575 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6576 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6577 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6578 (curyes == cur) ? "yes" : ""
6579 );
6580 if (curyes == cur)
6581 curyes = cur->u.yes.prev_yes_state;
6582 }
6583 } else
6584 DEBUG_STATE_pp("push")
6585 );
40a82448 6586 depth++;
40a82448
DM
6587 st->locinput = locinput;
6588 newst = st+1;
6589 if (newst > SLAB_LAST(PL_regmatch_slab))
6590 newst = S_push_slab(aTHX);
6591 PL_regmatch_state = newst;
786e8c11 6592
4d5016e5 6593 locinput = pushinput;
40a82448
DM
6594 st = newst;
6595 continue;
118e2215 6596 assert(0); /* NOTREACHED */
40a82448 6597 }
a0d0e21e 6598 }
a687059c 6599
a0d0e21e
LW
6600 /*
6601 * We get here only if there's trouble -- normally "case END" is
6602 * the terminating point.
6603 */
cea2e8a9 6604 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6605 /*NOTREACHED*/
4633a7c4
LW
6606 sayNO;
6607
262b90c4 6608yes:
77cb431f
DM
6609 if (yes_state) {
6610 /* we have successfully completed a subexpression, but we must now
6611 * pop to the state marked by yes_state and continue from there */
77cb431f 6612 assert(st != yes_state);
5bc10b2c
DM
6613#ifdef DEBUGGING
6614 while (st != yes_state) {
6615 st--;
6616 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6617 PL_regmatch_slab = PL_regmatch_slab->prev;
6618 st = SLAB_LAST(PL_regmatch_slab);
6619 }
e2e6a0f1 6620 DEBUG_STATE_r({
54612592
YO
6621 if (no_final) {
6622 DEBUG_STATE_pp("pop (no final)");
6623 } else {
6624 DEBUG_STATE_pp("pop (yes)");
6625 }
e2e6a0f1 6626 });
5bc10b2c
DM
6627 depth--;
6628 }
6629#else
77cb431f
DM
6630 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6631 || yes_state > SLAB_LAST(PL_regmatch_slab))
6632 {
6633 /* not in this slab, pop slab */
6634 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6635 PL_regmatch_slab = PL_regmatch_slab->prev;
6636 st = SLAB_LAST(PL_regmatch_slab);
6637 }
6638 depth -= (st - yes_state);
5bc10b2c 6639#endif
77cb431f
DM
6640 st = yes_state;
6641 yes_state = st->u.yes.prev_yes_state;
6642 PL_regmatch_state = st;
24b23f37 6643
3640db6b 6644 if (no_final)
5d458dd8 6645 locinput= st->locinput;
54612592 6646 state_num = st->resume_state + no_final;
24d3c4a9 6647 goto reenter_switch;
77cb431f
DM
6648 }
6649
a3621e74 6650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6651 PL_colors[4], PL_colors[5]));
02db2b7b 6652
bf2039a9 6653 if (reginfo->info_aux_eval) {
19b95bf0
DM
6654 /* each successfully executed (?{...}) block does the equivalent of
6655 * local $^R = do {...}
6656 * When popping the save stack, all these locals would be undone;
6657 * bypass this by setting the outermost saved $^R to the latest
6658 * value */
6659 if (oreplsv != GvSV(PL_replgv))
6660 sv_setsv(oreplsv, GvSV(PL_replgv));
6661 }
95b24440 6662 result = 1;
aa283a38 6663 goto final_exit;
4633a7c4
LW
6664
6665no:
a3621e74 6666 DEBUG_EXECUTE_r(
7821416a 6667 PerlIO_printf(Perl_debug_log,
786e8c11 6668 "%*s %sfailed...%s\n",
5bc10b2c 6669 REPORT_CODE_OFF+depth*2, "",
786e8c11 6670 PL_colors[4], PL_colors[5])
7821416a 6671 );
aa283a38 6672
262b90c4 6673no_silent:
54612592
YO
6674 if (no_final) {
6675 if (yes_state) {
6676 goto yes;
6677 } else {
6678 goto final_exit;
6679 }
6680 }
aa283a38
DM
6681 if (depth) {
6682 /* there's a previous state to backtrack to */
40a82448
DM
6683 st--;
6684 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6685 PL_regmatch_slab = PL_regmatch_slab->prev;
6686 st = SLAB_LAST(PL_regmatch_slab);
6687 }
6688 PL_regmatch_state = st;
40a82448 6689 locinput= st->locinput;
40a82448 6690
5bc10b2c
DM
6691 DEBUG_STATE_pp("pop");
6692 depth--;
262b90c4
DM
6693 if (yes_state == st)
6694 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6695
24d3c4a9
DM
6696 state_num = st->resume_state + 1; /* failure = success + 1 */
6697 goto reenter_switch;
95b24440 6698 }
24d3c4a9 6699 result = 0;
aa283a38 6700
262b90c4 6701 final_exit:
bbe252da 6702 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6703 SV *sv_err = get_sv("REGERROR", 1);
6704 SV *sv_mrk = get_sv("REGMARK", 1);
6705 if (result) {
e2e6a0f1 6706 sv_commit = &PL_sv_no;
5d458dd8
YO
6707 if (!sv_yes_mark)
6708 sv_yes_mark = &PL_sv_yes;
6709 } else {
6710 if (!sv_commit)
6711 sv_commit = &PL_sv_yes;
6712 sv_yes_mark = &PL_sv_no;
6713 }
6714 sv_setsv(sv_err, sv_commit);
6715 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6716 }
19b95bf0 6717
81ed78b2
DM
6718
6719 if (last_pushed_cv) {
6720 dSP;
6721 POP_MULTICALL;
4f8dbb2d 6722 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6723 }
6724
9d9163fb
DM
6725 assert(!result || locinput - reginfo->strbeg >= 0);
6726 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
6727}
6728
6729/*
6730 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6731 *
e64f369d
KW
6732 * What 'simple' means is a node which can be the operand of a quantifier like
6733 * '+', or {1,3}
6734 *
d60de1d1
DM
6735 * startposp - pointer a pointer to the start position. This is updated
6736 * to point to the byte following the highest successful
6737 * match.
6738 * p - the regnode to be repeatedly matched against.
220db18a 6739 * reginfo - struct holding match state, such as strend
4063ade8 6740 * max - maximum number of things to match.
d60de1d1 6741 * depth - (for debugging) backtracking depth.
a687059c 6742 */
76e3520e 6743STATIC I32
272d35c9 6744S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
f9176b44 6745 regmatch_info *const reginfo, I32 max, int depth)
a687059c 6746{
27da23d5 6747 dVAR;
4063ade8 6748 char *scan; /* Pointer to current position in target string */
eb578fdb 6749 I32 c;
220db18a 6750 char *loceol = reginfo->strend; /* local version */
4063ade8 6751 I32 hardcount = 0; /* How many matches so far */
ba44c216 6752 bool utf8_target = reginfo->is_utf8_target;
3018b823 6753 int to_complement = 0; /* Invert the result? */
d513472c 6754 UV utf8_flags;
3018b823 6755 _char_class_number classnum;
4f55667c
SP
6756#ifndef DEBUGGING
6757 PERL_UNUSED_ARG(depth);
6758#endif
a0d0e21e 6759
7918f24d
NC
6760 PERL_ARGS_ASSERT_REGREPEAT;
6761
f73aaa43 6762 scan = *startposp;
faf11cac
HS
6763 if (max == REG_INFTY)
6764 max = I32_MAX;
dfb8f192 6765 else if (! utf8_target && loceol - scan > max)
7f596f4c 6766 loceol = scan + max;
4063ade8
KW
6767
6768 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6769 * to the maximum of how far we should go in it (leaving it set to the real
6770 * end, if the maximum permissible would take us beyond that). This allows
6771 * us to make the loop exit condition that we haven't gone past <loceol> to
6772 * also mean that we haven't exceeded the max permissible count, saving a
6773 * test each time through the loop. But it assumes that the OP matches a
6774 * single byte, which is true for most of the OPs below when applied to a
6775 * non-UTF-8 target. Those relatively few OPs that don't have this
6776 * characteristic will have to compensate.
6777 *
6778 * There is no adjustment for UTF-8 targets, as the number of bytes per
6779 * character varies. OPs will have to test both that the count is less
6780 * than the max permissible (using <hardcount> to keep track), and that we
6781 * are still within the bounds of the string (using <loceol>. A few OPs
6782 * match a single byte no matter what the encoding. They can omit the max
6783 * test if, for the UTF-8 case, they do the adjustment that was skipped
6784 * above.
6785 *
6786 * Thus, the code above sets things up for the common case; and exceptional
6787 * cases need extra work; the common case is to make sure <scan> doesn't
6788 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6789 * count doesn't exceed the maximum permissible */
6790
a0d0e21e 6791 switch (OP(p)) {
22c35a8c 6792 case REG_ANY:
f2ed9b32 6793 if (utf8_target) {
1aa99e6b 6794 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6795 scan += UTF8SKIP(scan);
6796 hardcount++;
6797 }
6798 } else {
6799 while (scan < loceol && *scan != '\n')
6800 scan++;
a0ed51b3
LW
6801 }
6802 break;
ffc61ed2 6803 case SANY:
f2ed9b32 6804 if (utf8_target) {
a0804c9e 6805 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6806 scan += UTF8SKIP(scan);
6807 hardcount++;
6808 }
6809 }
6810 else
6811 scan = loceol;
a0ed51b3 6812 break;
4063ade8 6813 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
9597860a 6814 if (utf8_target && loceol - scan > max) {
4063ade8
KW
6815
6816 /* <loceol> hadn't been adjusted in the UTF-8 case */
6817 scan += max;
6818 }
6819 else {
6820 scan = loceol;
6821 }
f33976b4 6822 break;
59d32103 6823 case EXACT:
f9176b44 6824 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6825
59d32103 6826 c = (U8)*STRING(p);
59d32103 6827
5e4a1da1
KW
6828 /* Can use a simple loop if the pattern char to match on is invariant
6829 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6830 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6831 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 6832 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 6833 if (utf8_target && loceol - scan > max) {
4063ade8
KW
6834 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6835 * since here, to match at all, 1 char == 1 byte */
6836 loceol = scan + max;
6837 }
59d32103
KW
6838 while (scan < loceol && UCHARAT(scan) == c) {
6839 scan++;
6840 }
6841 }
f9176b44 6842 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
6843 if (utf8_target) {
6844 STRLEN scan_char_len;
5e4a1da1 6845
4063ade8 6846 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6847 * string EQ */
6848 while (hardcount < max
9a902117
KW
6849 && scan < loceol
6850 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
6851 && memEQ(scan, STRING(p), scan_char_len))
6852 {
4200a00c 6853 scan += scan_char_len;
5e4a1da1
KW
6854 hardcount++;
6855 }
6856 }
6857 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6858
5e4a1da1
KW
6859 /* Target isn't utf8; convert the character in the UTF-8
6860 * pattern to non-UTF8, and do a simple loop */
6861 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6862 while (scan < loceol && UCHARAT(scan) == c) {
6863 scan++;
6864 }
6865 } /* else pattern char is above Latin1, can't possibly match the
6866 non-UTF-8 target */
b40a2c17 6867 }
5e4a1da1 6868 else {
59d32103 6869
5e4a1da1
KW
6870 /* Here, the string must be utf8; pattern isn't, and <c> is
6871 * different in utf8 than not, so can't compare them directly.
6872 * Outside the loop, find the two utf8 bytes that represent c, and
6873 * then look for those in sequence in the utf8 string */
59d32103
KW
6874 U8 high = UTF8_TWO_BYTE_HI(c);
6875 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6876
6877 while (hardcount < max
6878 && scan + 1 < loceol
6879 && UCHARAT(scan) == high
6880 && UCHARAT(scan + 1) == low)
6881 {
6882 scan += 2;
6883 hardcount++;
6884 }
6885 }
6886 break;
5e4a1da1 6887
2f7f8cb1
KW
6888 case EXACTFA:
6889 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6890 goto do_exactf;
6891
d4e0b827 6892 case EXACTFL:
272d35c9 6893 RXp_MATCH_TAINTED_on(prog);
17580e7a
KW
6894 utf8_flags = FOLDEQ_UTF8_LOCALE;
6895 goto do_exactf;
6896
d4e0b827 6897 case EXACTF:
62bf7766
KW
6898 utf8_flags = 0;
6899 goto do_exactf;
6900
3c760661 6901 case EXACTFU_SS:
fab2782b 6902 case EXACTFU_TRICKYFOLD:
9a5a5549 6903 case EXACTFU:
f9176b44 6904 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6905
613a425d
KW
6906 do_exactf: {
6907 int c1, c2;
6908 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6909
f9176b44 6910 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6911
984e6dd1 6912 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 6913 reginfo))
984e6dd1 6914 {
613a425d 6915 if (c1 == CHRTEST_VOID) {
49b95fad 6916 /* Use full Unicode fold matching */
220db18a 6917 char *tmpeol = reginfo->strend;
f9176b44 6918 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
6919 while (hardcount < max
6920 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6921 STRING(p), NULL, pat_len,
f9176b44 6922 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
6923 {
6924 scan = tmpeol;
220db18a 6925 tmpeol = reginfo->strend;
49b95fad
KW
6926 hardcount++;
6927 }
613a425d
KW
6928 }
6929 else if (utf8_target) {
6930 if (c1 == c2) {
4063ade8
KW
6931 while (scan < loceol
6932 && hardcount < max
613a425d
KW
6933 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6934 {
6935 scan += UTF8SKIP(scan);
6936 hardcount++;
6937 }
6938 }
6939 else {
4063ade8
KW
6940 while (scan < loceol
6941 && hardcount < max
613a425d
KW
6942 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6943 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6944 {
6945 scan += UTF8SKIP(scan);
6946 hardcount++;
6947 }
6948 }
6949 }
6950 else if (c1 == c2) {
6951 while (scan < loceol && UCHARAT(scan) == c1) {
6952 scan++;
6953 }
6954 }
6955 else {
6956 while (scan < loceol &&
6957 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6958 {
6959 scan++;
6960 }
6961 }
634c83a2 6962 }
bbce6d69 6963 break;
613a425d 6964 }
a0d0e21e 6965 case ANYOF:
954a2af6 6966 case ANYOF_WARN_SUPER:
e0193e47 6967 if (utf8_target) {
4e8910e0 6968 while (hardcount < max
9a902117 6969 && scan < loceol
635cd5d4 6970 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0 6971 {
9a902117 6972 scan += UTF8SKIP(scan);
ffc61ed2
JH
6973 hardcount++;
6974 }
6975 } else {
32fc9b6a 6976 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6977 scan++;
6978 }
a0d0e21e 6979 break;
4063ade8 6980
3018b823 6981 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 6982
3018b823
KW
6983 case NPOSIXL:
6984 to_complement = 1;
6985 /* FALLTHROUGH */
980866de 6986
3018b823 6987 case POSIXL:
272d35c9 6988 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
6989 if (! utf8_target) {
6990 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6991 *scan)))
a12cf05f 6992 {
3018b823
KW
6993 scan++;
6994 }
6995 } else {
6996 while (hardcount < max && scan < loceol
6997 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6998 (U8 *) scan)))
6999 {
7000 scan += UTF8SKIP(scan);
ffc61ed2
JH
7001 hardcount++;
7002 }
a0ed51b3
LW
7003 }
7004 break;
0658cdde 7005
3018b823
KW
7006 case POSIXD:
7007 if (utf8_target) {
7008 goto utf8_posix;
7009 }
7010 /* FALLTHROUGH */
7011
0658cdde 7012 case POSIXA:
0430522f 7013 if (utf8_target && loceol - scan > max) {
4063ade8 7014
7aee35ff
KW
7015 /* We didn't adjust <loceol> at the beginning of this routine
7016 * because is UTF-8, but it is actually ok to do so, since here, to
7017 * match, 1 char == 1 byte. */
4063ade8
KW
7018 loceol = scan + max;
7019 }
7020 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
7021 scan++;
7022 }
7023 break;
980866de 7024
3018b823
KW
7025 case NPOSIXD:
7026 if (utf8_target) {
7027 to_complement = 1;
7028 goto utf8_posix;
7029 }
7030 /* FALL THROUGH */
980866de 7031
3018b823
KW
7032 case NPOSIXA:
7033 if (! utf8_target) {
7034 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
7035 scan++;
7036 }
4063ade8 7037 }
3018b823 7038 else {
980866de 7039
3018b823
KW
7040 /* The complement of something that matches only ASCII matches all
7041 * UTF-8 variant code points, plus everything in ASCII that isn't
7042 * in the class. */
bedac28b 7043 while (hardcount < max && scan < loceol
3018b823
KW
7044 && (! UTF8_IS_INVARIANT(*scan)
7045 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 7046 {
3018b823 7047 scan += UTF8SKIP(scan);
ffc61ed2
JH
7048 hardcount++;
7049 }
3018b823
KW
7050 }
7051 break;
980866de 7052
3018b823
KW
7053 case NPOSIXU:
7054 to_complement = 1;
7055 /* FALLTHROUGH */
7056
7057 case POSIXU:
7058 if (! utf8_target) {
7059 while (scan < loceol && to_complement
7060 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 7061 {
3018b823
KW
7062 scan++;
7063 }
cfaf538b
KW
7064 }
7065 else {
3018b823
KW
7066 utf8_posix:
7067 classnum = (_char_class_number) FLAGS(p);
7068 if (classnum < _FIRST_NON_SWASH_CC) {
7069
7070 /* Here, a swash is needed for above-Latin1 code points.
7071 * Process as many Latin1 code points using the built-in rules.
7072 * Go to another loop to finish processing upon encountering
7073 * the first Latin1 code point. We could do that in this loop
7074 * as well, but the other way saves having to test if the swash
7075 * has been loaded every time through the loop: extra space to
7076 * save a test. */
7077 while (hardcount < max && scan < loceol) {
7078 if (UTF8_IS_INVARIANT(*scan)) {
7079 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7080 classnum))))
7081 {
7082 break;
7083 }
7084 scan++;
7085 }
7086 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7087 if (! (to_complement
7088 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
7089 *(scan + 1)),
7090 classnum))))
7091 {
7092 break;
7093 }
7094 scan += 2;
7095 }
7096 else {
7097 goto found_above_latin1;
7098 }
7099
7100 hardcount++;
7101 }
7102 }
7103 else {
7104 /* For these character classes, the knowledge of how to handle
7105 * every code point is compiled in to Perl via a macro. This
7106 * code is written for making the loops as tight as possible.
7107 * It could be refactored to save space instead */
7108 switch (classnum) {
7109 case _CC_ENUM_SPACE: /* XXX would require separate code
7110 if we revert the change of \v
7111 matching this */
7112 /* FALL THROUGH */
7113 case _CC_ENUM_PSXSPC:
7114 while (hardcount < max
7115 && scan < loceol
7116 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7117 {
7118 scan += UTF8SKIP(scan);
7119 hardcount++;
7120 }
7121 break;
7122 case _CC_ENUM_BLANK:
7123 while (hardcount < max
7124 && scan < loceol
7125 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7126 {
7127 scan += UTF8SKIP(scan);
7128 hardcount++;
7129 }
7130 break;
7131 case _CC_ENUM_XDIGIT:
7132 while (hardcount < max
7133 && scan < loceol
7134 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7135 {
7136 scan += UTF8SKIP(scan);
7137 hardcount++;
7138 }
7139 break;
7140 case _CC_ENUM_VERTSPACE:
7141 while (hardcount < max
7142 && scan < loceol
7143 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7144 {
7145 scan += UTF8SKIP(scan);
7146 hardcount++;
7147 }
7148 break;
7149 case _CC_ENUM_CNTRL:
7150 while (hardcount < max
7151 && scan < loceol
7152 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7153 {
7154 scan += UTF8SKIP(scan);
7155 hardcount++;
7156 }
7157 break;
7158 default:
7159 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7160 }
7161 }
a0ed51b3 7162 }
3018b823 7163 break;
4063ade8 7164
3018b823
KW
7165 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7166
7167 /* Load the swash if not already present */
7168 if (! PL_utf8_swash_ptrs[classnum]) {
7169 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7170 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7171 "utf8", swash_property_names[classnum],
7172 &PL_sv_undef, 1, 0, NULL, &flags);
4063ade8 7173 }
3018b823
KW
7174
7175 while (hardcount < max && scan < loceol
7176 && to_complement ^ cBOOL(_generic_utf8(
7177 classnum,
7178 scan,
7179 swash_fetch(PL_utf8_swash_ptrs[classnum],
7180 (U8 *) scan,
7181 TRUE))))
7182 {
7183 scan += UTF8SKIP(scan);
7184 hardcount++;
7185 }
7186 break;
7187
e1d1eefb 7188 case LNBREAK:
e64f369d
KW
7189 if (utf8_target) {
7190 while (hardcount < max && scan < loceol &&
7191 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7192 scan += c;
7193 hardcount++;
7194 }
7195 } else {
7196 /* LNBREAK can match one or two latin chars, which is ok, but we
7197 * have to use hardcount in this situation, and throw away the
7198 * adjustment to <loceol> done before the switch statement */
220db18a 7199 loceol = reginfo->strend;
e64f369d
KW
7200 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7201 scan+=c;
7202 hardcount++;
7203 }
7204 }
7205 break;
e1d1eefb 7206
584b1f02
KW
7207 case BOUND:
7208 case BOUNDA:
7209 case BOUNDL:
7210 case BOUNDU:
7211 case EOS:
7212 case GPOS:
7213 case KEEPS:
7214 case NBOUND:
7215 case NBOUNDA:
7216 case NBOUNDL:
7217 case NBOUNDU:
7218 case OPFAIL:
7219 case SBOL:
7220 case SEOL:
7221 /* These are all 0 width, so match right here or not at all. */
7222 break;
7223
7224 default:
7225 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7226 assert(0); /* NOTREACHED */
7227
a0d0e21e 7228 }
a687059c 7229
a0ed51b3
LW
7230 if (hardcount)
7231 c = hardcount;
7232 else
f73aaa43
DM
7233 c = scan - *startposp;
7234 *startposp = scan;
a687059c 7235
a3621e74 7236 DEBUG_r({
e68ec53f 7237 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7238 DEBUG_EXECUTE_r({
e68ec53f
YO
7239 SV * const prop = sv_newmortal();
7240 regprop(prog, prop, p);
7241 PerlIO_printf(Perl_debug_log,
be8e71aa 7242 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7243 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7244 });
be8e71aa 7245 });
9041c2e3 7246
a0d0e21e 7247 return(c);
a687059c
LW
7248}
7249
c277df42 7250
be8e71aa 7251#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7252/*
6c6525b8 7253- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7254create a copy so that changes the caller makes won't change the shared one.
7255If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7256 */
ffc61ed2 7257SV *
5aaab254 7258Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7259{
6c6525b8 7260 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7261
7262 if (altsvp) {
7263 *altsvp = NULL;
7264 }
7265
7266 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7267}
7268#endif
7269
7270STATIC SV *
5aaab254 7271S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7272{
8c9eb58f
KW
7273 /* Returns the swash for the input 'node' in the regex 'prog'.
7274 * If <doinit> is true, will attempt to create the swash if not already
7275 * done.
7276 * If <listsvp> is non-null, will return the swash initialization string in
7277 * it.
8c9eb58f
KW
7278 * Tied intimately to how regcomp.c sets up the data structure */
7279
97aff369 7280 dVAR;
9e55ce06
JH
7281 SV *sw = NULL;
7282 SV *si = NULL;
7a6c6baa
KW
7283 SV* invlist = NULL;
7284
f8fc2ecf
YO
7285 RXi_GET_DECL(prog,progi);
7286 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7287
6c6525b8 7288 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7289
ccb2541c
KW
7290 assert(ANYOF_NONBITMAP(node));
7291
4f639d21 7292 if (data && data->count) {
a3b680e6 7293 const U32 n = ARG(node);
ffc61ed2 7294
4f639d21 7295 if (data->what[n] == 's') {
ad64d0ec
NC
7296 SV * const rv = MUTABLE_SV(data->data[n]);
7297 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7298 SV **const ary = AvARRAY(av);
87367d5f 7299 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7300
8c9eb58f 7301 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7302
88675427
KW
7303 /* Elements 2 and 3 are either both present or both absent. [2] is
7304 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7305 * that inversion list has any user-defined properties in it. */
88675427
KW
7306 if (av_len(av) >= 2) {
7307 invlist = ary[2];
7308 if (SvUV(ary[3])) {
83199d38
KW
7309 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7310 }
7a6c6baa
KW
7311 }
7312 else {
7313 invlist = NULL;
7a6c6baa
KW
7314 }
7315
8c9eb58f
KW
7316 /* Element [1] is reserved for the set-up swash. If already there,
7317 * return it; if not, create it and store it there */
f192cf32
KW
7318 if (SvROK(ary[1])) {
7319 sw = ary[1];
7320 }
ffc61ed2 7321 else if (si && doinit) {
7a6c6baa
KW
7322
7323 sw = _core_swash_init("utf8", /* the utf8 package */
7324 "", /* nameless */
7325 si,
7326 1, /* binary */
7327 0, /* not from tr/// */
7a6c6baa 7328 invlist,
83199d38 7329 &swash_init_flags);
ffc61ed2
JH
7330 (void)av_store(av, 1, sw);
7331 }
7332 }
7333 }
7334
7a6c6baa
KW
7335 if (listsvp) {
7336 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7337
7338 /* Use the swash, if any, which has to have incorporated into it all
7339 * possibilities */
872dd7e0
KW
7340 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7341 && (si && si != &PL_sv_undef))
7342 {
7a6c6baa 7343
872dd7e0 7344 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7345 sv_catsv(matches_string, si);
7346 }
7347
7348 /* Add the inversion list to whatever we have. This may have come from
7349 * the swash, or from an input parameter */
7350 if (invlist) {
7351 sv_catsv(matches_string, _invlist_contents(invlist));
7352 }
7353 *listsvp = matches_string;
7354 }
7355
ffc61ed2
JH
7356 return sw;
7357}
7358
7359/*
ba7b4546 7360 - reginclass - determine if a character falls into a character class
832705d4 7361
6698fab5
KW
7362 n is the ANYOF regnode
7363 p is the target string
6698fab5 7364 utf8_target tells whether p is in UTF-8.
832705d4 7365
635cd5d4 7366 Returns true if matched; false otherwise.
eba1359e 7367
d5788240
KW
7368 Note that this can be a synthetic start class, a combination of various
7369 nodes, so things you think might be mutually exclusive, such as locale,
7370 aren't. It can match both locale and non-locale
7371
bbce6d69 7372 */
7373
76e3520e 7374STATIC bool
272d35c9 7375S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
bbce6d69 7376{
27da23d5 7377 dVAR;
a3b680e6 7378 const char flags = ANYOF_FLAGS(n);
bbce6d69 7379 bool match = FALSE;
cc07378b 7380 UV c = *p;
1aa99e6b 7381
7918f24d
NC
7382 PERL_ARGS_ASSERT_REGINCLASS;
7383
afd2eb18
KW
7384 /* If c is not already the code point, get it. Note that
7385 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7386 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7387 STRLEN c_len = 0;
f7ab54c6 7388 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7389 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7390 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7391 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7392 * UTF8_ALLOW_FFFF */
f7ab54c6 7393 if (c_len == (STRLEN)-1)
e8a70c6f 7394 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7395 }
4b3cda86 7396
7cdde544
KW
7397 /* If this character is potentially in the bitmap, check it */
7398 if (c < 256) {
ffc61ed2
JH
7399 if (ANYOF_BITMAP_TEST(n, c))
7400 match = TRUE;
11454c59
KW
7401 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7402 && ! utf8_target
7403 && ! isASCII(c))
7404 {
7405 match = TRUE;
7406 }
78969a98 7407 else if (flags & ANYOF_LOCALE) {
272d35c9 7408 RXp_MATCH_TAINTED_on(prog);
78969a98 7409
538b546e 7410 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7411 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7412 {
ffc61ed2 7413 match = TRUE;
78969a98 7414 }
31c7f561
KW
7415 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7416
7417 /* The data structure is arranged so bits 0, 2, 4, ... are set
7418 * if the class includes the Posix character class given by
7419 * bit/2; and 1, 3, 5, ... are set if the class includes the
7420 * complemented Posix class given by int(bit/2). So we loop
7421 * through the bits, each time changing whether we complement
7422 * the result or not. Suppose for the sake of illustration
7423 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7424 * is set, it means there is a match for this ANYOF node if the
7425 * character is in the class given by the expression (0 / 2 = 0
7426 * = \w). If it is in that class, isFOO_lc() will return 1,
7427 * and since 'to_complement' is 0, the result will stay TRUE,
7428 * and we exit the loop. Suppose instead that bit 0 is 0, but
7429 * bit 1 is 1. That means there is a match if the character
7430 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7431 * but will on bit 1. On the second iteration 'to_complement'
7432 * will be 1, so the exclusive or will reverse things, so we
7433 * are testing for \W. On the third iteration, 'to_complement'
7434 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7435 * iteration would test for \S, etc.
7436 *
7437 * Note that this code assumes that all the classes are closed
7438 * under folding. For example, if a character matches \w, then
7439 * its fold does too; and vice versa. This should be true for
7440 * any well-behaved locale for all the currently defined Posix
7441 * classes, except for :lower: and :upper:, which are handled
7442 * by the pseudo-class :cased: which matches if either of the
7443 * other two does. To get rid of this assumption, an outer
7444 * loop could be used below to iterate over both the source
7445 * character, and its fold (if different) */
31c7f561
KW
7446
7447 int count = 0;
7448 int to_complement = 0;
7449 while (count < ANYOF_MAX) {
7450 if (ANYOF_CLASS_TEST(n, count)
7451 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7452 {
7453 match = TRUE;
7454 break;
7455 }
7456 count++;
7457 to_complement ^= 1;
7458 }
ffc61ed2 7459 }
a0ed51b3 7460 }
a0ed51b3
LW
7461 }
7462
7cdde544 7463 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7464 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7465 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7466 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7467 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7468 * could be a combination of locale and non-locale nodes. So allow locale
7469 * to match for the synthetic start class, which will give a false
7470 * positive that will be resolved when the match is done again as not part
7471 * of the synthetic start class */
ef87b810 7472 if (!match) {
10ee90d2
KW
7473 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7474 match = TRUE; /* Everything above 255 matches */
e051a21d 7475 }
6f8d7d0d
KW
7476 else if (ANYOF_NONBITMAP(n)
7477 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7478 || (utf8_target
7479 && (c >=256
7480 || (! (flags & ANYOF_LOCALE))
9aa1e39f 7481 || OP(n) == ANYOF_SYNTHETIC))))
ef87b810 7482 {
e0193e47 7483 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7484 if (sw) {
7485 U8 * utf8_p;
7486 if (utf8_target) {
7487 utf8_p = (U8 *) p;
e0193e47
KW
7488 } else { /* Convert to utf8 */
7489 STRLEN len = 1;
7cdde544
KW
7490 utf8_p = bytes_to_utf8(p, &len);
7491 }
f56b6394 7492
e0193e47 7493 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7494 match = TRUE;
e0193e47 7495 }
7cdde544
KW
7496
7497 /* If we allocated a string above, free it */
7498 if (! utf8_target) Safefree(utf8_p);
7499 }
7500 }
5073ffbd
KW
7501
7502 if (UNICODE_IS_SUPER(c)
954a2af6 7503 && OP(n) == ANYOF_WARN_SUPER
5073ffbd
KW
7504 && ckWARN_d(WARN_NON_UNICODE))
7505 {
7506 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7507 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7508 }
7cdde544
KW
7509 }
7510
f0fdc1c9
KW
7511 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7512 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7513}
161b471a 7514
dfe13c55 7515STATIC U8 *
0ce71af7 7516S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7517{
6af86488
KW
7518 /* return the position 'off' UTF-8 characters away from 's', forward if
7519 * 'off' >= 0, backwards if negative. But don't go outside of position
7520 * 'lim', which better be < s if off < 0 */
7521
97aff369 7522 dVAR;
7918f24d
NC
7523
7524 PERL_ARGS_ASSERT_REGHOP3;
7525
a0ed51b3 7526 if (off >= 0) {
1aa99e6b 7527 while (off-- && s < lim) {
ffc61ed2 7528 /* XXX could check well-formedness here */
a0ed51b3 7529 s += UTF8SKIP(s);
ffc61ed2 7530 }
a0ed51b3
LW
7531 }
7532 else {
1de06328
YO
7533 while (off++ && s > lim) {
7534 s--;
7535 if (UTF8_IS_CONTINUED(*s)) {
7536 while (s > lim && UTF8_IS_CONTINUATION(*s))
7537 s--;
a0ed51b3 7538 }
1de06328 7539 /* XXX could check well-formedness here */
a0ed51b3
LW
7540 }
7541 }
7542 return s;
7543}
161b471a 7544
f9f4320a
YO
7545#ifdef XXX_dmq
7546/* there are a bunch of places where we use two reghop3's that should
7547 be replaced with this routine. but since thats not done yet
7548 we ifdef it out - dmq
7549*/
dfe13c55 7550STATIC U8 *
1de06328
YO
7551S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7552{
7553 dVAR;
7918f24d
NC
7554
7555 PERL_ARGS_ASSERT_REGHOP4;
7556
1de06328
YO
7557 if (off >= 0) {
7558 while (off-- && s < rlim) {
7559 /* XXX could check well-formedness here */
7560 s += UTF8SKIP(s);
7561 }
7562 }
7563 else {
7564 while (off++ && s > llim) {
7565 s--;
7566 if (UTF8_IS_CONTINUED(*s)) {
7567 while (s > llim && UTF8_IS_CONTINUATION(*s))
7568 s--;
7569 }
7570 /* XXX could check well-formedness here */
7571 }
7572 }
7573 return s;
7574}
f9f4320a 7575#endif
1de06328
YO
7576
7577STATIC U8 *
0ce71af7 7578S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7579{
97aff369 7580 dVAR;
7918f24d
NC
7581
7582 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7583
a0ed51b3 7584 if (off >= 0) {
1aa99e6b 7585 while (off-- && s < lim) {
ffc61ed2 7586 /* XXX could check well-formedness here */
a0ed51b3 7587 s += UTF8SKIP(s);
ffc61ed2 7588 }
a0ed51b3 7589 if (off >= 0)
3dab1dad 7590 return NULL;
a0ed51b3
LW
7591 }
7592 else {
1de06328
YO
7593 while (off++ && s > lim) {
7594 s--;
7595 if (UTF8_IS_CONTINUED(*s)) {
7596 while (s > lim && UTF8_IS_CONTINUATION(*s))
7597 s--;
a0ed51b3 7598 }
1de06328 7599 /* XXX could check well-formedness here */
a0ed51b3
LW
7600 }
7601 if (off <= 0)
3dab1dad 7602 return NULL;
a0ed51b3
LW
7603 }
7604 return s;
7605}
51371543 7606
a75351a1
DM
7607
7608/* when executing a regex that may have (?{}), extra stuff needs setting
7609 up that will be visible to the called code, even before the current
7610 match has finished. In particular:
7611
7612 * $_ is localised to the SV currently being matched;
7613 * pos($_) is created if necessary, ready to be updated on each call-out
7614 to code;
7615 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7616 isn't set until the current pattern is successfully finished), so that
7617 $1 etc of the match-so-far can be seen;
7618 * save the old values of subbeg etc of the current regex, and set then
7619 to the current string (again, this is normally only done at the end
7620 of execution)
a75351a1
DM
7621*/
7622
7623static void
7624S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7625{
7626 MAGIC *mg;
7627 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 7628 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 7629
8adc0f72 7630 eval_state->rex = rex;
a75351a1 7631
a75351a1
DM
7632 if (reginfo->sv) {
7633 /* Make $_ available to executed code. */
7634 if (reginfo->sv != DEFSV) {
7635 SAVE_DEFSV;
7636 DEFSV_set(reginfo->sv);
7637 }
7638
96c2a8ff 7639 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 7640 /* prepare for quick setting of pos */
96c2a8ff 7641 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
7642 mg->mg_len = -1;
7643 }
8adc0f72
DM
7644 eval_state->pos_magic = mg;
7645 eval_state->pos = mg->mg_len;
a75351a1 7646 }
8adc0f72
DM
7647 else
7648 eval_state->pos_magic = NULL;
7649
a75351a1 7650 if (!PL_reg_curpm) {
f65e70f5
DM
7651 /* PL_reg_curpm is a fake PMOP that we can attach the current
7652 * regex to and point PL_curpm at, so that $1 et al are visible
7653 * within a /(?{})/. It's just allocated once per interpreter the
7654 * first time its needed */
a75351a1
DM
7655 Newxz(PL_reg_curpm, 1, PMOP);
7656#ifdef USE_ITHREADS
7657 {
7658 SV* const repointer = &PL_sv_undef;
7659 /* this regexp is also owned by the new PL_reg_curpm, which
7660 will try to free it. */
7661 av_push(PL_regex_padav, repointer);
7662 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7663 PL_regex_pad = AvARRAY(PL_regex_padav);
7664 }
7665#endif
7666 }
7667 SET_reg_curpm(reginfo->prog);
8adc0f72 7668 eval_state->curpm = PL_curpm;
a75351a1
DM
7669 PL_curpm = PL_reg_curpm;
7670 if (RXp_MATCH_COPIED(rex)) {
7671 /* Here is a serious problem: we cannot rewrite subbeg,
7672 since it may be needed if this match fails. Thus
7673 $` inside (?{}) could fail... */
8adc0f72
DM
7674 eval_state->subbeg = rex->subbeg;
7675 eval_state->sublen = rex->sublen;
7676 eval_state->suboffset = rex->suboffset;
a8ee055f 7677 eval_state->subcoffset = rex->subcoffset;
a75351a1 7678#ifdef PERL_ANY_COW
8adc0f72 7679 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
7680#endif
7681 RXp_MATCH_COPIED_off(rex);
7682 }
7683 else
8adc0f72 7684 eval_state->subbeg = NULL;
a75351a1
DM
7685 rex->subbeg = (char *)reginfo->strbeg;
7686 rex->suboffset = 0;
7687 rex->subcoffset = 0;
7688 rex->sublen = reginfo->strend - reginfo->strbeg;
7689}
7690
bf2039a9
DM
7691
7692/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 7693
51371543 7694static void
bf2039a9 7695S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 7696{
97aff369 7697 dVAR;
bf2039a9
DM
7698 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7699 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 7700 regmatch_slab *s;
bf2039a9 7701
2ac8ff4b
DM
7702 Safefree(aux->poscache);
7703
331b2dcc 7704 if (eval_state) {
bf2039a9 7705
331b2dcc 7706 /* undo the effects of S_setup_eval_state() */
bf2039a9 7707
331b2dcc
DM
7708 if (eval_state->subbeg) {
7709 regexp * const rex = eval_state->rex;
7710 rex->subbeg = eval_state->subbeg;
7711 rex->sublen = eval_state->sublen;
7712 rex->suboffset = eval_state->suboffset;
7713 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 7714#ifdef PERL_ANY_COW
331b2dcc 7715 rex->saved_copy = eval_state->saved_copy;
ed252734 7716#endif
331b2dcc
DM
7717 RXp_MATCH_COPIED_on(rex);
7718 }
7719 if (eval_state->pos_magic)
7720 eval_state->pos_magic->mg_len = eval_state->pos;
7721
7722 PL_curpm = eval_state->curpm;
8adc0f72 7723 }
bf2039a9 7724
331b2dcc
DM
7725 PL_regmatch_state = aux->old_regmatch_state;
7726 PL_regmatch_slab = aux->old_regmatch_slab;
7727
7728 /* free all slabs above current one - this must be the last action
7729 * of this function, as aux and eval_state are allocated within
7730 * slabs and may be freed here */
7731
7732 s = PL_regmatch_slab->next;
7733 if (s) {
7734 PL_regmatch_slab->next = NULL;
7735 while (s) {
7736 regmatch_slab * const osl = s;
7737 s = s->next;
7738 Safefree(osl);
7739 }
7740 }
51371543 7741}
33b8afdf 7742
8adc0f72 7743
33b8afdf 7744STATIC void
5aaab254 7745S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7746{
7e0d5ad7
KW
7747 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7748 * on the converted value */
7749
a1cac82e 7750 int i = 1;
7918f24d
NC
7751
7752 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7753
a1cac82e
NC
7754 do {
7755 if (prog->substrs->data[i].substr
7756 && !prog->substrs->data[i].utf8_substr) {
7757 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7758 prog->substrs->data[i].utf8_substr = sv;
7759 sv_utf8_upgrade(sv);
610460f9 7760 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7761 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7762 /* Trim the trailing \n that fbm_compile added last
7763 time. */
7764 SvCUR_set(sv, SvCUR(sv) - 1);
7765 /* Whilst this makes the SV technically "invalid" (as its
7766 buffer is no longer followed by "\0") when fbm_compile()
7767 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7768 fbm_compile(sv, FBMcf_TAIL);
7769 } else
7770 fbm_compile(sv, 0);
610460f9 7771 }
a1cac82e
NC
7772 if (prog->substrs->data[i].substr == prog->check_substr)
7773 prog->check_utf8 = sv;
7774 }
7775 } while (i--);
33b8afdf
JH
7776}
7777
7e0d5ad7 7778STATIC bool
5aaab254 7779S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7780{
7e0d5ad7
KW
7781 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7782 * on the converted value; returns FALSE if can't be converted. */
7783
97aff369 7784 dVAR;
a1cac82e 7785 int i = 1;
7918f24d
NC
7786
7787 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7788
a1cac82e
NC
7789 do {
7790 if (prog->substrs->data[i].utf8_substr
7791 && !prog->substrs->data[i].substr) {
7792 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7793 if (! sv_utf8_downgrade(sv, TRUE)) {
7794 return FALSE;
7795 }
5400f398
KW
7796 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7797 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7798 /* Trim the trailing \n that fbm_compile added last
7799 time. */
7800 SvCUR_set(sv, SvCUR(sv) - 1);
7801 fbm_compile(sv, FBMcf_TAIL);
7802 } else
7803 fbm_compile(sv, 0);
7804 }
a1cac82e
NC
7805 prog->substrs->data[i].substr = sv;
7806 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7807 prog->check_substr = sv;
33b8afdf 7808 }
a1cac82e 7809 } while (i--);
7e0d5ad7
KW
7810
7811 return TRUE;
33b8afdf 7812}
66610fdd
RGS
7813
7814/*
7815 * Local variables:
7816 * c-indentation-style: bsd
7817 * c-basic-offset: 4
14d04a33 7818 * indent-tabs-mode: nil
66610fdd
RGS
7819 * End:
7820 *
14d04a33 7821 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7822 */