This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
write_buildcustomize.pl: Setup $^O to be the target os when building Core.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
121 : (U8*)(pos + off))
122#define HOPBACKc(pos, off) \
ba44c216 123 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
124 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
125 : (pos - off >= reginfo->strbeg) \
8e11feef 126 ? (U8*)pos - off \
3dab1dad 127 : NULL)
efb30f32 128
ba44c216 129#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 131
7016d6eb
DM
132
133#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
134#define NEXTCHR_IS_EOS (nextchr < 0)
135
136#define SET_nextchr \
220db18a 137 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
138
139#define SET_locinput(p) \
140 locinput = (p); \
141 SET_nextchr
142
143
2a16ac92 144#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
145 if (!swash_ptr) { \
146 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 147 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 148 1, 0, invlist, &flags); \
c7304fe2
KW
149 assert(swash_ptr); \
150 } \
151 } STMT_END
152
153/* If in debug mode, we test that a known character properly matches */
154#ifdef DEBUGGING
155# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
156 property_name, \
2a16ac92 157 invlist, \
c7304fe2 158 utf8_char_in_property) \
2a16ac92 159 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
160 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
161#else
162# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
163 property_name, \
2a16ac92 164 invlist, \
c7304fe2 165 utf8_char_in_property) \
2a16ac92 166 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 167#endif
d1eb3177 168
c7304fe2
KW
169#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
170 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
171 "", \
172 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 173 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
174
175#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
176 STMT_START { \
177 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
178 "_X_regular_begin", \
2a16ac92 179 NULL, \
df38da56 180 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
181 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
182 "_X_extend", \
2a16ac92 183 NULL, \
c7304fe2
KW
184 COMBINING_GRAVE_ACCENT_UTF8); \
185 } STMT_END
d1eb3177 186
c7304fe2 187#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
188/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
189
5f80c4cf 190/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
191/* it would be nice to rework regcomp.sym to generate this stuff. sigh
192 *
193 * NOTE that *nothing* that affects backtracking should be in here, specifically
194 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
195 * node that is in between two EXACT like nodes when ascertaining what the required
196 * "follow" character is. This should probably be moved to regex compile time
197 * although it may be done at run time beause of the REF possibility - more
198 * investigation required. -- demerphq
199*/
3e901dc0
YO
200#define JUMPABLE(rn) ( \
201 OP(rn) == OPEN || \
202 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
203 OP(rn) == EVAL || \
cca55fe3
JP
204 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
205 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 206 OP(rn) == KEEPS || \
3dab1dad 207 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 208)
ee9b8eae 209#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 210
ee9b8eae
YO
211#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
212
213#if 0
214/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
215 we don't need this definition. */
216#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 217#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
218#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
219
220#else
221/* ... so we use this as its faster. */
222#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 223#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
224#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
225#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
226
227#endif
e2d8ce26 228
a84d97b6
HS
229/*
230 Search for mandatory following text node; for lookahead, the text must
231 follow but for lookbehind (rn->flags != 0) we skip to the next step.
232*/
cca55fe3 233#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
234 while (JUMPABLE(rn)) { \
235 const OPCODE type = OP(rn); \
236 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 237 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 238 else if (type == PLUS) \
cca55fe3 239 rn = NEXTOPER(rn); \
3dab1dad 240 else if (type == IFMATCH) \
a84d97b6 241 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 242 else rn += NEXT_OFF(rn); \
3dab1dad 243 } \
5f80c4cf 244} STMT_END
74750237 245
22913b96
KW
246/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
247 * These are for the pre-composed Hangul syllables, which are all in a
248 * contiguous block and arranged there in such a way so as to facilitate
249 * alorithmic determination of their characteristics. As such, they don't need
250 * a swash, but can be determined by simple arithmetic. Almost all are
251 * GCB=LVT, but every 28th one is a GCB=LV */
252#define SBASE 0xAC00 /* Start of block */
253#define SCount 11172 /* Length of block */
254#define TCount 28
c476f425 255
006f26b2
DM
256#define SLAB_FIRST(s) (&(s)->states[0])
257#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
258
a75351a1 259static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 260static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 261static regmatch_state * S_push_slab(pTHX);
51371543 262
87c0511b 263#define REGCP_PAREN_ELEMS 3
f067efbf 264#define REGCP_OTHER_ELEMS 3
e0fa7e2b 265#define REGCP_FRAME_ELEMS 1
620d5b66
NC
266/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
267 * are needed for the regexp context stack bookkeeping. */
268
76e3520e 269STATIC CHECKPOINT
92da3157 270S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 271{
97aff369 272 dVAR;
a3b680e6 273 const int retval = PL_savestack_ix;
92da3157
DM
274 const int paren_elems_to_push =
275 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
276 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
277 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 278 I32 p;
40a82448 279 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 280
b93070ed
DM
281 PERL_ARGS_ASSERT_REGCPPUSH;
282
e49a9654 283 if (paren_elems_to_push < 0)
5637ef5b
NC
284 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
285 paren_elems_to_push);
e49a9654 286
e0fa7e2b
NC
287 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
288 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 289 " out of range (%lu-%ld)",
92da3157
DM
290 total_elems,
291 (unsigned long)maxopenparen,
292 (long)parenfloor);
e0fa7e2b 293
620d5b66 294 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 295
495f47a5 296 DEBUG_BUFFERS_r(
92da3157 297 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
298 PerlIO_printf(Perl_debug_log,
299 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
300 PTR2UV(rex),
301 PTR2UV(rex->offs)
302 );
303 );
92da3157 304 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 305/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
306 SSPUSHIV(rex->offs[p].end);
307 SSPUSHIV(rex->offs[p].start);
1ca2007e 308 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 309 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
310 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
311 (UV)p,
312 (IV)rex->offs[p].start,
313 (IV)rex->offs[p].start_tmp,
314 (IV)rex->offs[p].end
40a82448 315 ));
a0d0e21e 316 }
b1ce53c5 317/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 318 SSPUSHINT(maxopenparen);
b93070ed
DM
319 SSPUSHINT(rex->lastparen);
320 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 321 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 322
a0d0e21e
LW
323 return retval;
324}
325
c277df42 326/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
327#define REGCP_SET(cp) \
328 DEBUG_STATE_r( \
ab3bbdeb 329 PerlIO_printf(Perl_debug_log, \
e4f74956 330 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
331 (IV)PL_savestack_ix)); \
332 cp = PL_savestack_ix
c3464db5 333
ab3bbdeb 334#define REGCP_UNWIND(cp) \
e4f74956 335 DEBUG_STATE_r( \
ab3bbdeb 336 if (cp != PL_savestack_ix) \
e4f74956
YO
337 PerlIO_printf(Perl_debug_log, \
338 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
339 (IV)(cp), (IV)PL_savestack_ix)); \
340 regcpblow(cp)
c277df42 341
a8d1f4b4
DM
342#define UNWIND_PAREN(lp, lcp) \
343 for (n = rex->lastparen; n > lp; n--) \
344 rex->offs[n].end = -1; \
345 rex->lastparen = n; \
346 rex->lastcloseparen = lcp;
347
348
f067efbf 349STATIC void
92da3157 350S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 351{
97aff369 352 dVAR;
e0fa7e2b 353 UV i;
87c0511b 354 U32 paren;
a3621e74
YO
355 GET_RE_DEBUG_FLAGS_DECL;
356
7918f24d
NC
357 PERL_ARGS_ASSERT_REGCPPOP;
358
b1ce53c5 359 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 360 i = SSPOPUV;
e0fa7e2b
NC
361 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
362 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
363 rex->lastcloseparen = SSPOPINT;
364 rex->lastparen = SSPOPINT;
92da3157 365 *maxopenparen_p = SSPOPINT;
b1ce53c5 366
620d5b66 367 i -= REGCP_OTHER_ELEMS;
b1ce53c5 368 /* Now restore the parentheses context. */
495f47a5
DM
369 DEBUG_BUFFERS_r(
370 if (i || rex->lastparen + 1 <= rex->nparens)
371 PerlIO_printf(Perl_debug_log,
372 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
373 PTR2UV(rex),
374 PTR2UV(rex->offs)
375 );
376 );
92da3157 377 paren = *maxopenparen_p;
620d5b66 378 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 379 SSize_t tmps;
1ca2007e 380 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
381 rex->offs[paren].start = SSPOPIV;
382 tmps = SSPOPIV;
b93070ed
DM
383 if (paren <= rex->lastparen)
384 rex->offs[paren].end = tmps;
495f47a5
DM
385 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
386 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
387 (UV)paren,
388 (IV)rex->offs[paren].start,
389 (IV)rex->offs[paren].start_tmp,
390 (IV)rex->offs[paren].end,
391 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 392 );
87c0511b 393 paren--;
a0d0e21e 394 }
daf18116 395#if 1
dafc8851
JH
396 /* It would seem that the similar code in regtry()
397 * already takes care of this, and in fact it is in
398 * a better location to since this code can #if 0-ed out
399 * but the code in regtry() is needed or otherwise tests
400 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
401 * (as of patchlevel 7877) will fail. Then again,
402 * this code seems to be necessary or otherwise
225593e1
DM
403 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
404 * --jhi updated by dapm */
b93070ed 405 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 406 if (i > *maxopenparen_p)
b93070ed
DM
407 rex->offs[i].start = -1;
408 rex->offs[i].end = -1;
495f47a5
DM
409 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
410 " \\%"UVuf": %s ..-1 undeffing\n",
411 (UV)i,
92da3157 412 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 413 ));
a0d0e21e 414 }
dafc8851 415#endif
a0d0e21e
LW
416}
417
74088413
DM
418/* restore the parens and associated vars at savestack position ix,
419 * but without popping the stack */
420
421STATIC void
92da3157 422S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
423{
424 I32 tmpix = PL_savestack_ix;
425 PL_savestack_ix = ix;
92da3157 426 regcppop(rex, maxopenparen_p);
74088413
DM
427 PL_savestack_ix = tmpix;
428}
429
02db2b7b 430#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 431
31c7f561
KW
432STATIC bool
433S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
434{
435 /* Returns a boolean as to whether or not 'character' is a member of the
436 * Posix character class given by 'classnum' that should be equivalent to a
437 * value in the typedef '_char_class_number'.
438 *
439 * Ideally this could be replaced by a just an array of function pointers
440 * to the C library functions that implement the macros this calls.
441 * However, to compile, the precise function signatures are required, and
442 * these may vary from platform to to platform. To avoid having to figure
443 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
444 * which adds an extra layer of function call overhead (unless the C
445 * optimizer strips it away). But we don't particularly care about
446 * performance with locales anyway. */
31c7f561
KW
447
448 switch ((_char_class_number) classnum) {
15861f94 449 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 450 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
451 case _CC_ENUM_ASCII: return isASCII_LC(character);
452 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
453 case _CC_ENUM_CASED: return isLOWER_LC(character)
454 || isUPPER_LC(character);
e8d596e0 455 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
456 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
457 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
458 case _CC_ENUM_LOWER: return isLOWER_LC(character);
459 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 460 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 461 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 462 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
463 case _CC_ENUM_UPPER: return isUPPER_LC(character);
464 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 465 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
466 default: /* VERTSPACE should never occur in locales */
467 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
468 }
469
470 assert(0); /* NOTREACHED */
471 return FALSE;
472}
473
3018b823
KW
474STATIC bool
475S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
476{
477 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
478 * 'character' is a member of the Posix character class given by 'classnum'
479 * that should be equivalent to a value in the typedef
480 * '_char_class_number'.
481 *
482 * This just calls isFOO_lc on the code point for the character if it is in
483 * the range 0-255. Outside that range, all characters avoid Unicode
484 * rules, ignoring any locale. So use the Unicode function if this class
485 * requires a swash, and use the Unicode macro otherwise. */
486
487 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
488
489 if (UTF8_IS_INVARIANT(*character)) {
490 return isFOO_lc(classnum, *character);
491 }
492 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
493 return isFOO_lc(classnum,
94bb8c36 494 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
495 }
496
497 if (classnum < _FIRST_NON_SWASH_CC) {
498
499 /* Initialize the swash unless done already */
500 if (! PL_utf8_swash_ptrs[classnum]) {
501 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
502 PL_utf8_swash_ptrs[classnum] =
503 _core_swash_init("utf8",
504 "",
505 &PL_sv_undef, 1, 0,
506 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
507 }
508
92a2046b
KW
509 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
510 character,
511 TRUE /* is UTF */ ));
3018b823
KW
512 }
513
514 switch ((_char_class_number) classnum) {
515 case _CC_ENUM_SPACE:
516 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
517
518 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
519 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
520 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
521 default: return 0; /* Things like CNTRL are always
522 below 256 */
523 }
524
525 assert(0); /* NOTREACHED */
526 return FALSE;
527}
528
a687059c 529/*
e50aee73 530 * pregexec and friends
a687059c
LW
531 */
532
76234dfb 533#ifndef PERL_IN_XSUB_RE
a687059c 534/*
c277df42 535 - pregexec - match a regexp against a string
a687059c 536 */
c277df42 537I32
5aaab254 538Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 539 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
540/* stringarg: the point in the string at which to begin matching */
541/* strend: pointer to null at end of string */
542/* strbeg: real beginning of string */
543/* minend: end of match must be >= minend bytes after stringarg. */
544/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
545 * itself is accessed via the pointers above */
546/* nosave: For optimizations. */
c277df42 547{
7918f24d
NC
548 PERL_ARGS_ASSERT_PREGEXEC;
549
c277df42 550 return
9041c2e3 551 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
552 nosave ? 0 : REXEC_COPY_STR);
553}
76234dfb 554#endif
22e551b9 555
9041c2e3 556/*
cad2e5aa
JH
557 * Need to implement the following flags for reg_anch:
558 *
559 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
560 * USE_INTUIT_ML
561 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
562 * INTUIT_AUTORITATIVE_ML
563 * INTUIT_ONCE_NOML - Intuit can match in one location only.
564 * INTUIT_ONCE_ML
565 *
566 * Another flag for this function: SECOND_TIME (so that float substrs
567 * with giant delta may be not rechecked).
568 */
569
3f7c398e 570/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
571 Otherwise, only SvCUR(sv) is used to get strbeg. */
572
6eb5f6b9
JH
573/* XXXX Some places assume that there is a fixed substring.
574 An update may be needed if optimizer marks as "INTUITable"
575 RExen without fixed substrings. Similarly, it is assumed that
576 lengths of all the strings are no more than minlen, thus they
577 cannot come from lookahead.
40d049e4
YO
578 (Or minlen should take into account lookahead.)
579 NOTE: Some of this comment is not correct. minlen does now take account
580 of lookahead/behind. Further research is required. -- demerphq
581
582*/
6eb5f6b9 583
2c2d71f5
JH
584/* A failure to find a constant substring means that there is no need to make
585 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 586 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
587 regtry() should be needed.
588
589 REx compiler's optimizer found 4 possible hints:
590 a) Anchored substring;
591 b) Fixed substring;
592 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 593 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 594 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
595 string which does not contradict any of them.
596 */
2c2d71f5 597
6eb5f6b9
JH
598/* Most of decisions we do here should have been done at compile time.
599 The nodes of the REx which we used for the search should have been
600 deleted from the finite automaton. */
601
52a21eb3
DM
602/* args:
603 * rx: the regex to match against
604 * sv: the SV being matched: only used for utf8 flag; the string
605 * itself is accessed via the pointers below. Note that on
606 * something like an overloaded SV, SvPOK(sv) may be false
607 * and the string pointers may point to something unrelated to
608 * the SV itself.
609 * strbeg: real beginning of string
610 * strpos: the point in the string at which to begin matching
611 * strend: pointer to the byte following the last char of the string
612 * flags currently unused; set to 0
613 * data: currently unused; set to NULL
614 */
615
cad2e5aa 616char *
52a21eb3
DM
617Perl_re_intuit_start(pTHX_
618 REGEXP * const rx,
619 SV *sv,
620 const char * const strbeg,
621 char *strpos,
622 char *strend,
623 const U32 flags,
624 re_scream_pos_data *data)
cad2e5aa 625{
97aff369 626 dVAR;
8d919b0a 627 struct regexp *const prog = ReANY(rx);
49f55535 628 SSize_t start_shift = 0;
cad2e5aa 629 /* Should be nonnegative! */
ea3daa5d 630 SSize_t end_shift = 0;
eb578fdb
KW
631 char *s;
632 SV *check;
cad2e5aa 633 char *t;
f2ed9b32 634 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 635 I32 ml_anch;
eb578fdb 636 char *other_last = NULL; /* other substr checked before this */
bd61b366 637 char *check_at = NULL; /* check substr found at this pos */
d8080198 638 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 639 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 640 RXi_GET_DECL(prog,progi);
02d5137b
DM
641 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
642 regmatch_info *const reginfo = &reginfo_buf;
30944b6d 643#ifdef DEBUGGING
890ce7af 644 const char * const i_strpos = strpos;
30944b6d 645#endif
a3621e74
YO
646 GET_RE_DEBUG_FLAGS_DECL;
647
7918f24d 648 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
649 PERL_UNUSED_ARG(flags);
650 PERL_UNUSED_ARG(data);
7918f24d 651
c344f387
JH
652 /* CHR_DIST() would be more correct here but it makes things slow. */
653 if (prog->minlen > strend - strpos) {
a3621e74 654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 655 "String too short... [re_intuit_start]\n"));
cad2e5aa 656 goto fail;
2c2d71f5 657 }
d8da0584 658
6c3fea77 659 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 660 reginfo->info_aux = NULL;
9d9163fb 661 reginfo->strbeg = strbeg;
220db18a 662 reginfo->strend = strend;
aed7b151 663 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 664 reginfo->intuit = 1;
1cb48e53
DM
665 /* not actually used within intuit, but zero for safety anyway */
666 reginfo->poscache_maxiter = 0;
02d5137b 667
f2ed9b32 668 if (utf8_target) {
33b8afdf
JH
669 if (!prog->check_utf8 && prog->check_substr)
670 to_utf8_substr(prog);
671 check = prog->check_utf8;
672 } else {
7e0d5ad7
KW
673 if (!prog->check_substr && prog->check_utf8) {
674 if (! to_byte_substr(prog)) {
6b54ddc5 675 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
676 }
677 }
33b8afdf
JH
678 check = prog->check_substr;
679 }
0b2c2a84
DM
680 if (prog->extflags & RXf_ANCH) { /* Match at \G, beg-of-str or after \n */
681 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
bbe252da 682 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 683 && !multiline ) ); /* Check after \n? */
cad2e5aa 684
7e25d62c 685 if (!ml_anch) {
0b2c2a84
DM
686 /* we are only allowed to match at BOS or \G */
687
688 if (prog->extflags & RXf_ANCH_GPOS) {
689 /* in this case, we hope(!) that the caller has already
690 * set strpos to pos()-gofs, and will already have checked
691 * that this anchor position is legal
692 */
693 ;
694 }
695 else if (!(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
696 && (strpos != strbeg))
697 {
a3621e74 698 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
699 goto fail;
700 }
d46b78c6
KW
701 if (prog->check_offset_min == prog->check_offset_max
702 && !(prog->extflags & RXf_CANY_SEEN)
703 && ! multiline) /* /m can cause \n's to match that aren't
704 accounted for in the string max length.
705 See [perl #115242] */
706 {
2c2d71f5 707 /* Substring at constant offset from beg-of-str... */
ea3daa5d 708 SSize_t slen;
cad2e5aa 709
1aa99e6b 710 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 711
653099ff
GS
712 if (SvTAIL(check)) {
713 slen = SvCUR(check); /* >= 1 */
cad2e5aa 714
9041c2e3 715 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 716 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 717 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 718 goto fail_finish;
cad2e5aa
JH
719 }
720 /* Now should match s[0..slen-2] */
721 slen--;
3f7c398e 722 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 723 || (slen > 1
3f7c398e 724 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 725 report_neq:
a3621e74 726 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
727 goto fail_finish;
728 }
cad2e5aa 729 }
3f7c398e 730 else if (*SvPVX_const(check) != *s
653099ff 731 || ((slen = SvCUR(check)) > 1
3f7c398e 732 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 733 goto report_neq;
c315bfe8 734 check_at = s;
2c2d71f5 735 goto success_at_start;
7e25d62c 736 }
cad2e5aa 737 }
2c2d71f5 738 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 739 s = strpos;
2c2d71f5 740 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
741 end_shift = prog->check_end_shift;
742
2c2d71f5 743 if (!ml_anch) {
ea3daa5d 744 const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 745 - (SvTAIL(check) != 0);
ea3daa5d 746 const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
747
748 if (end_shift < eshift)
749 end_shift = eshift;
750 }
cad2e5aa 751 }
2c2d71f5 752 else { /* Can match at random position */
cad2e5aa
JH
753 ml_anch = 0;
754 s = strpos;
1de06328
YO
755 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
756 end_shift = prog->check_end_shift;
757
758 /* end shift should be non negative here */
cad2e5aa
JH
759 }
760
19188028 761#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 762 if (end_shift < 0)
1de06328 763 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 764 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
765#endif
766
2c2d71f5
JH
767 restart:
768 /* Find a possible match in the region s..strend by looking for
769 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
770
771 {
49f55535 772 SSize_t srch_start_shift = start_shift;
ea3daa5d 773 SSize_t srch_end_shift = end_shift;
c33e64f0
FC
774 U8* start_point;
775 U8* end_point;
1de06328
YO
776 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
777 srch_end_shift -= ((strbeg - s) - srch_start_shift);
778 srch_start_shift = strbeg - s;
779 }
6bda09f9 780 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
781 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
782 (IV)prog->check_offset_min,
783 (IV)srch_start_shift,
784 (IV)srch_end_shift,
785 (IV)prog->check_end_shift);
786 });
787
bbe252da 788 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
789 start_point= (U8*)(s + srch_start_shift);
790 end_point= (U8*)(strend - srch_end_shift);
791 } else {
792 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
793 end_point= HOP3(strend, -srch_end_shift, strbeg);
794 }
6bda09f9 795 DEBUG_OPTIMISE_MORE_r({
56570a2c 796 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 797 (int)(end_point - start_point),
fc8cd66c 798 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
799 start_point);
800 });
801
802 s = fbm_instr( start_point, end_point,
7fba1cd6 803 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 804 }
cad2e5aa
JH
805 /* Update the count-of-usability, remove useless subpatterns,
806 unshift s. */
2c2d71f5 807
ab3bbdeb 808 DEBUG_EXECUTE_r({
f2ed9b32 809 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
810 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
811 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 812 (s ? "Found" : "Did not find"),
f2ed9b32 813 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
814 ? "anchored" : "floating"),
815 quoted,
816 RE_SV_TAIL(check),
817 (s ? " at offset " : "...\n") );
818 });
2c2d71f5
JH
819
820 if (!s)
821 goto fail_finish;
2c2d71f5 822 /* Finish the diagnostic message */
a3621e74 823 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 824
1de06328
YO
825 /* XXX dmq: first branch is for positive lookbehind...
826 Our check string is offset from the beginning of the pattern.
827 So we need to do any stclass tests offset forward from that
828 point. I think. :-(
829 */
830
831
832
833 check_at=s;
834
835
2c2d71f5
JH
836 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
837 Start with the other substr.
838 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 839 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
840 *always* match. Probably should be marked during compile...
841 Probably it is right to do no SCREAM here...
842 */
843
f2ed9b32 844 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
845 : (prog->float_substr && prog->anchored_substr))
846 {
30944b6d 847 /* Take into account the "other" substring. */
2c2d71f5
JH
848 /* XXXX May be hopelessly wrong for UTF... */
849 if (!other_last)
6eb5f6b9 850 other_last = strpos;
f2ed9b32 851 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
852 do_other_anchored:
853 {
890ce7af
AL
854 char * const last = HOP3c(s, -start_shift, strbeg);
855 char *last1, *last2;
be8e71aa 856 char * const saved_s = s;
33b8afdf 857 SV* must;
2c2d71f5 858
2c2d71f5
JH
859 t = s - prog->check_offset_max;
860 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 861 && (!utf8_target
0ce71af7 862 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 863 && t > strpos)))
6f207bd3 864 NOOP;
2c2d71f5
JH
865 else
866 t = strpos;
1aa99e6b 867 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
868 if (t < other_last) /* These positions already checked */
869 t = other_last;
1aa99e6b 870 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
871 if (last < last1)
872 last1 = last;
1de06328
YO
873 /* XXXX It is not documented what units *_offsets are in.
874 We assume bytes, but this is clearly wrong.
875 Meaning this code needs to be carefully reviewed for errors.
876 dmq.
877 */
878
2c2d71f5 879 /* On end-of-str: see comment below. */
f2ed9b32 880 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
881 if (must == &PL_sv_undef) {
882 s = (char*)NULL;
1de06328 883 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
884 }
885 else
886 s = fbm_instr(
887 (unsigned char*)t,
888 HOP3(HOP3(last1, prog->anchored_offset, strend)
889 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
890 must,
7fba1cd6 891 multiline ? FBMrf_MULTILINE : 0
33b8afdf 892 );
ab3bbdeb 893 DEBUG_EXECUTE_r({
f2ed9b32 894 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
895 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
896 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 897 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
898 quoted, RE_SV_TAIL(must));
899 });
900
901
2c2d71f5
JH
902 if (!s) {
903 if (last1 >= last2) {
a3621e74 904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
905 ", giving up...\n"));
906 goto fail_finish;
907 }
a3621e74 908 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 909 ", trying floating at offset %ld...\n",
be8e71aa 910 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
911 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
912 s = HOP3c(last, 1, strend);
2c2d71f5
JH
913 goto restart;
914 }
915 else {
a3621e74 916 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 917 (long)(s - i_strpos)));
1aa99e6b
IH
918 t = HOP3c(s, -prog->anchored_offset, strbeg);
919 other_last = HOP3c(s, 1, strend);
be8e71aa 920 s = saved_s;
2c2d71f5
JH
921 if (t == strpos)
922 goto try_at_start;
2c2d71f5
JH
923 goto try_at_offset;
924 }
30944b6d 925 }
2c2d71f5
JH
926 }
927 else { /* Take into account the floating substring. */
33b8afdf 928 char *last, *last1;
be8e71aa 929 char * const saved_s = s;
33b8afdf
JH
930 SV* must;
931
932 t = HOP3c(s, -start_shift, strbeg);
933 last1 = last =
934 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
935 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
936 last = HOP3c(t, prog->float_max_offset, strend);
937 s = HOP3c(t, prog->float_min_offset, strend);
938 if (s < other_last)
939 s = other_last;
2c2d71f5 940 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 941 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
942 /* fbm_instr() takes into account exact value of end-of-str
943 if the check is SvTAIL(ed). Since false positives are OK,
944 and end-of-str is not later than strend we are OK. */
945 if (must == &PL_sv_undef) {
946 s = (char*)NULL;
1de06328 947 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
948 }
949 else
2c2d71f5 950 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
951 (unsigned char*)last + SvCUR(must)
952 - (SvTAIL(must)!=0),
7fba1cd6 953 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 954 DEBUG_EXECUTE_r({
f2ed9b32 955 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
956 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
957 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 958 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
959 quoted, RE_SV_TAIL(must));
960 });
33b8afdf
JH
961 if (!s) {
962 if (last1 == last) {
a3621e74 963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
964 ", giving up...\n"));
965 goto fail_finish;
2c2d71f5 966 }
a3621e74 967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 968 ", trying anchored starting at offset %ld...\n",
be8e71aa 969 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
970 other_last = last;
971 s = HOP3c(t, 1, strend);
972 goto restart;
973 }
974 else {
a3621e74 975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
976 (long)(s - i_strpos)));
977 other_last = s; /* Fix this later. --Hugo */
be8e71aa 978 s = saved_s;
33b8afdf
JH
979 if (t == strpos)
980 goto try_at_start;
981 goto try_at_offset;
982 }
2c2d71f5 983 }
cad2e5aa 984 }
2c2d71f5 985
1de06328 986
9ef43ace 987 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 988
6bda09f9 989 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
990 PerlIO_printf(Perl_debug_log,
991 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
992 (IV)prog->check_offset_min,
993 (IV)prog->check_offset_max,
994 (IV)(s-strpos),
995 (IV)(t-strpos),
996 (IV)(t-s),
997 (IV)(strend-strpos)
998 )
999 );
1000
2c2d71f5 1001 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 1002 && (!utf8_target
9ef43ace 1003 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
1004 && t > strpos)))
1005 {
2c2d71f5
JH
1006 /* Fixed substring is found far enough so that the match
1007 cannot start at strpos. */
1008 try_at_offset:
cad2e5aa 1009 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
1010 /* Eventually fbm_*() should handle this, but often
1011 anchored_offset is not 0, so this check will not be wasted. */
1012 /* XXXX In the code below we prefer to look for "^" even in
1013 presence of anchored substrings. And we search even
1014 beyond the found float position. These pessimizations
1015 are historical artefacts only. */
1016 find_anchor:
2c2d71f5 1017 while (t < strend - prog->minlen) {
cad2e5aa 1018 if (*t == '\n') {
4ee3650e 1019 if (t < check_at - prog->check_offset_min) {
f2ed9b32 1020 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
1021 /* Since we moved from the found position,
1022 we definitely contradict the found anchored
30944b6d
IZ
1023 substr. Due to the above check we do not
1024 contradict "check" substr.
1025 Thus we can arrive here only if check substr
1026 is float. Redo checking for "other"=="fixed".
1027 */
9041c2e3 1028 strpos = t + 1;
a3621e74 1029 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1030 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1031 goto do_other_anchored;
1032 }
4ee3650e
GS
1033 /* We don't contradict the found floating substring. */
1034 /* XXXX Why not check for STCLASS? */
cad2e5aa 1035 s = t + 1;
a3621e74 1036 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1037 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1038 goto set_useful;
1039 }
4ee3650e
GS
1040 /* Position contradicts check-string */
1041 /* XXXX probably better to look for check-string
1042 than for "\n", so one should lower the limit for t? */
a3621e74 1043 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1044 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1045 other_last = strpos = s = t + 1;
cad2e5aa
JH
1046 goto restart;
1047 }
1048 t++;
1049 }
a3621e74 1050 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1051 PL_colors[0], PL_colors[1]));
2c2d71f5 1052 goto fail_finish;
cad2e5aa 1053 }
f5952150 1054 else {
a3621e74 1055 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1056 PL_colors[0], PL_colors[1]));
f5952150 1057 }
cad2e5aa
JH
1058 s = t;
1059 set_useful:
f2ed9b32 1060 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1061 }
1062 else {
f5952150 1063 /* The found string does not prohibit matching at strpos,
2c2d71f5 1064 - no optimization of calling REx engine can be performed,
f5952150
GS
1065 unless it was an MBOL and we are not after MBOL,
1066 or a future STCLASS check will fail this. */
2c2d71f5
JH
1067 try_at_start:
1068 /* Even in this situation we may use MBOL flag if strpos is offset
1069 wrt the start of the string. */
52a21eb3 1070 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1071 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1072 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1073 {
cad2e5aa
JH
1074 t = strpos;
1075 goto find_anchor;
1076 }
a3621e74 1077 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1078 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1079 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1080 );
2c2d71f5 1081 success_at_start:
bbe252da 1082 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1083 && (utf8_target ? (
33b8afdf
JH
1084 prog->check_utf8 /* Could be deleted already */
1085 && --BmUSEFUL(prog->check_utf8) < 0
1086 && (prog->check_utf8 == prog->float_utf8)
1087 ) : (
1088 prog->check_substr /* Could be deleted already */
1089 && --BmUSEFUL(prog->check_substr) < 0
1090 && (prog->check_substr == prog->float_substr)
1091 )))
66e933ab 1092 {
cad2e5aa 1093 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1095 /* XXX Does the destruction order has to change with utf8_target? */
1096 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1097 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1098 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1099 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1100 check = NULL; /* abort */
cad2e5aa 1101 s = strpos;
486ec47a 1102 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1103 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1104 if (prog->intflags & PREGf_IMPLICIT)
1105 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1106 /* XXXX This is a remnant of the old implementation. It
1107 looks wasteful, since now INTUIT can use many
6eb5f6b9 1108 other heuristics. */
bbe252da 1109 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1110 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1111 }
1112 else
1113 s = strpos;
1114 }
1115
6eb5f6b9
JH
1116 /* Last resort... */
1117 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1118 /* trie stclasses are too expensive to use here, we are better off to
1119 leave it to regmatch itself */
f8fc2ecf 1120 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1121 /* minlen == 0 is possible if regstclass is \b or \B,
1122 and the fixed substr is ''$.
1123 Since minlen is already taken into account, s+1 is before strend;
1124 accidentally, minlen >= 1 guaranties no false positives at s + 1
1125 even for \b or \B. But (minlen? 1 : 0) below assumes that
1126 regstclass does not come from lookahead... */
1127 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1128 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf 1129 const U8* const str = (U8*)STRING(progi->regstclass);
2c75e362 1130 /* XXX this value could be pre-computed */
f8fc2ecf 1131 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1132 ? (reginfo->is_utf8_pat
1133 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1134 : STR_LEN(progi->regstclass))
66e933ab 1135 : 1);
1de06328
YO
1136 char * endpos;
1137 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1138 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1139 else if (prog->float_substr || prog->float_utf8)
1140 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1141 else
1142 endpos= strend;
1143
d8080198
YO
1144 if (checked_upto < s)
1145 checked_upto = s;
1146 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1147 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1148
6eb5f6b9 1149 t = s;
984e6dd1 1150 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1151 reginfo);
d8080198
YO
1152 if (s) {
1153 checked_upto = s;
1154 } else {
6eb5f6b9 1155#ifdef DEBUGGING
cbbf8932 1156 const char *what = NULL;
6eb5f6b9
JH
1157#endif
1158 if (endpos == strend) {
a3621e74 1159 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1160 "Could not match STCLASS...\n") );
1161 goto fail;
1162 }
a3621e74 1163 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1164 "This position contradicts STCLASS...\n") );
bbe252da 1165 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1166 goto fail;
d8080198
YO
1167 checked_upto = HOPBACKc(endpos, start_shift);
1168 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1169 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1170 /* Contradict one of substrings */
33b8afdf 1171 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1172 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1173 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1174 hop_and_restart:
1aa99e6b 1175 s = HOP3c(t, 1, strend);
66e933ab
GS
1176 if (s + start_shift + end_shift > strend) {
1177 /* XXXX Should be taken into account earlier? */
a3621e74 1178 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1179 "Could not match STCLASS...\n") );
1180 goto fail;
1181 }
5e39e1e5
HS
1182 if (!check)
1183 goto giveup;
a3621e74 1184 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1185 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1186 what, (long)(s + start_shift - i_strpos)) );
1187 goto restart;
1188 }
66e933ab 1189 /* Have both, check_string is floating */
6eb5f6b9
JH
1190 if (t + start_shift >= check_at) /* Contradicts floating=check */
1191 goto retry_floating_check;
1192 /* Recheck anchored substring, but not floating... */
9041c2e3 1193 s = check_at;
5e39e1e5
HS
1194 if (!check)
1195 goto giveup;
a3621e74 1196 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1197 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1198 (long)(other_last - i_strpos)) );
1199 goto do_other_anchored;
1200 }
60e71179
GS
1201 /* Another way we could have checked stclass at the
1202 current position only: */
1203 if (ml_anch) {
1204 s = t = t + 1;
5e39e1e5
HS
1205 if (!check)
1206 goto giveup;
a3621e74 1207 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1208 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1209 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1210 goto try_at_offset;
66e933ab 1211 }
f2ed9b32 1212 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1213 goto fail;
486ec47a 1214 /* Check is floating substring. */
6eb5f6b9
JH
1215 retry_floating_check:
1216 t = check_at - start_shift;
a3621e74 1217 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1218 goto hop_and_restart;
1219 }
b7953727 1220 if (t != s) {
a3621e74 1221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1222 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1223 (long)(t - i_strpos), (long)(s - i_strpos))
1224 );
1225 }
1226 else {
a3621e74 1227 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1228 "Does not contradict STCLASS...\n");
1229 );
1230 }
6eb5f6b9 1231 }
5e39e1e5 1232 giveup:
a3621e74 1233 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1234 PL_colors[4], (check ? "Guessed" : "Giving up"),
1235 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1236 return s;
2c2d71f5
JH
1237
1238 fail_finish: /* Substring not found */
33b8afdf 1239 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1240 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1241 fail:
a3621e74 1242 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1243 PL_colors[4], PL_colors[5]));
bd61b366 1244 return NULL;
cad2e5aa 1245}
9661b544 1246
a0a388a1 1247#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1248 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1249 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1250 trie_type = ((scan->flags == EXACT) \
1251 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1252 : (scan->flags == EXACTFA) \
1253 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1254 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1255
fd3249ee
YO
1256#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1257STMT_START { \
fab2782b 1258 STRLEN skiplen; \
098b07d5 1259 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1260 switch (trie_type) { \
098b07d5
KW
1261 case trie_utf8_exactfa_fold: \
1262 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1263 /* FALL THROUGH */ \
fab2782b
YO
1264 case trie_utf8_fold: \
1265 if ( foldlen>0 ) { \
c80e42f3 1266 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1267 foldlen -= len; \
1268 uscan += len; \
1269 len=0; \
1270 } else { \
098b07d5 1271 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \
fab2782b
YO
1272 len = UTF8SKIP(uc); \
1273 skiplen = UNISKIP( uvc ); \
1274 foldlen -= skiplen; \
1275 uscan = foldbuf + skiplen; \
1276 } \
1277 break; \
098b07d5
KW
1278 case trie_latin_utf8_exactfa_fold: \
1279 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1280 /* FALL THROUGH */ \
fab2782b
YO
1281 case trie_latin_utf8_fold: \
1282 if ( foldlen>0 ) { \
c80e42f3 1283 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1284 foldlen -= len; \
1285 uscan += len; \
1286 len=0; \
1287 } else { \
1288 len = 1; \
098b07d5 1289 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1290 skiplen = UNISKIP( uvc ); \
1291 foldlen -= skiplen; \
1292 uscan = foldbuf + skiplen; \
1293 } \
1294 break; \
1295 case trie_utf8: \
c80e42f3 1296 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1297 break; \
1298 case trie_plain: \
1299 uvc = (UV)*uc; \
1300 len = 1; \
1301 } \
1302 if (uvc < 256) { \
1303 charid = trie->charmap[ uvc ]; \
1304 } \
1305 else { \
1306 charid = 0; \
1307 if (widecharmap) { \
1308 SV** const svpp = hv_fetch(widecharmap, \
1309 (char*)&uvc, sizeof(UV), 0); \
1310 if (svpp) \
1311 charid = (U16)SvIV(*svpp); \
1312 } \
1313 } \
4cadc6a9
YO
1314} STMT_END
1315
4cadc6a9
YO
1316#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1317STMT_START { \
1318 while (s <= e) { \
1319 if ( (CoNd) \
fac1af77 1320 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1321 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1322 goto got_it; \
1323 s++; \
1324 } \
1325} STMT_END
1326
1327#define REXEC_FBC_UTF8_SCAN(CoDe) \
1328STMT_START { \
9a902117 1329 while (s < strend) { \
4cadc6a9 1330 CoDe \
9a902117 1331 s += UTF8SKIP(s); \
4cadc6a9
YO
1332 } \
1333} STMT_END
1334
1335#define REXEC_FBC_SCAN(CoDe) \
1336STMT_START { \
1337 while (s < strend) { \
1338 CoDe \
1339 s++; \
1340 } \
1341} STMT_END
1342
1343#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1344REXEC_FBC_UTF8_SCAN( \
1345 if (CoNd) { \
02d5137b 1346 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1347 goto got_it; \
1348 else \
1349 tmp = doevery; \
1350 } \
1351 else \
1352 tmp = 1; \
1353)
1354
1355#define REXEC_FBC_CLASS_SCAN(CoNd) \
1356REXEC_FBC_SCAN( \
1357 if (CoNd) { \
02d5137b 1358 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1359 goto got_it; \
1360 else \
1361 tmp = doevery; \
1362 } \
1363 else \
1364 tmp = 1; \
1365)
1366
1367#define REXEC_FBC_TRYIT \
02d5137b 1368if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1369 goto got_it
1370
e1d1eefb 1371#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1372 if (utf8_target) { \
e1d1eefb
YO
1373 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1374 } \
1375 else { \
1376 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1377 }
e1d1eefb 1378
786e8c11 1379#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1380 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1381 startpos, doutf8)
786e8c11 1382
cfaf538b
KW
1383
1384#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1385 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1386 tmp = TEST_NON_UTF8(tmp); \
1387 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1388 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1389 tmp = !tmp; \
1390 IF_SUCCESS; \
1391 } \
1392 else { \
1393 IF_FAIL; \
1394 } \
1395 ); \
1396
1397#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1398 if (s == reginfo->strbeg) { \
cfaf538b
KW
1399 tmp = '\n'; \
1400 } \
1401 else { \
9d9163fb 1402 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1403 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1404 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1405 } \
1406 tmp = TeSt1_UtF8; \
3db24e1e 1407 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1408 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1409 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1410 tmp = !tmp; \
1411 IF_SUCCESS; \
1412 } \
1413 else { \
1414 IF_FAIL; \
1415 } \
1416 ); \
1417
63ac0dad
KW
1418/* The only difference between the BOUND and NBOUND cases is that
1419 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1420 * NBOUND. This is accomplished by passing it in either the if or else clause,
1421 * with the other one being empty */
1422#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1423 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1424
1425#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1426 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1427
1428#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1429 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1430
1431#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1432 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1433
63ac0dad
KW
1434
1435/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1436 * be passed in completely with the variable name being tested, which isn't
1437 * such a clean interface, but this is easier to read than it was before. We
1438 * are looking for the boundary (or non-boundary between a word and non-word
1439 * character. The utf8 and non-utf8 cases have the same logic, but the details
1440 * must be different. Find the "wordness" of the character just prior to this
1441 * one, and compare it with the wordness of this one. If they differ, we have
1442 * a boundary. At the beginning of the string, pretend that the previous
1443 * character was a new-line */
cfaf538b 1444#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1445 if (utf8_target) { \
cfaf538b 1446 UTF8_CODE \
63ac0dad
KW
1447 } \
1448 else { /* Not utf8 */ \
9d9163fb 1449 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1450 tmp = TEST_NON_UTF8(tmp); \
1451 REXEC_FBC_SCAN( \
1452 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1453 tmp = !tmp; \
1454 IF_SUCCESS; \
1455 } \
1456 else { \
1457 IF_FAIL; \
1458 } \
1459 ); \
1460 } \
02d5137b 1461 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1462 goto got_it;
1463
786e8c11 1464/* We know what class REx starts with. Try to find this position... */
02d5137b 1465/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1466/* annoyingly all the vars in this routine have different names from their counterparts
1467 in regmatch. /grrr */
1468
3c3eec57 1469STATIC char *
07be1b83 1470S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1471 const char *strend, regmatch_info *reginfo)
a687059c 1472{
73104a1b
KW
1473 dVAR;
1474 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1475 char *pat_string; /* The pattern's exactish string */
1476 char *pat_end; /* ptr to end char of pat_string */
1477 re_fold_t folder; /* Function for computing non-utf8 folds */
1478 const U8 *fold_array; /* array for folding ords < 256 */
1479 STRLEN ln;
1480 STRLEN lnc;
73104a1b
KW
1481 U8 c1;
1482 U8 c2;
1483 char *e;
1484 I32 tmp = 1; /* Scratch variable? */
ba44c216 1485 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1486 UV utf8_fold_flags = 0;
f9176b44 1487 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1488 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1489 with a result inverts that result, as 0^1 =
1490 1 and 1^1 = 0 */
1491 _char_class_number classnum;
1492
73104a1b 1493 RXi_GET_DECL(prog,progi);
2f7f8cb1 1494
73104a1b 1495 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1496
73104a1b
KW
1497 /* We know what class it must start with. */
1498 switch (OP(c)) {
1499 case ANYOF:
9aa1e39f 1500 case ANYOF_SYNTHETIC:
73104a1b
KW
1501 if (utf8_target) {
1502 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1503 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1504 }
1505 else {
1506 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1507 }
1508 break;
1509 case CANY:
1510 REXEC_FBC_SCAN(
02d5137b 1511 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1512 goto got_it;
1513 else
1514 tmp = doevery;
1515 );
1516 break;
1517
098b07d5
KW
1518 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1519 assert(! is_utf8_pat);
1520 /* FALL THROUGH */
73104a1b 1521 case EXACTFA:
984e6dd1 1522 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1523 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1524 goto do_exactf_utf8;
1525 }
1526 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1527 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1528 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1529
2fdb7295
KW
1530 case EXACTF: /* This node only generated for non-utf8 patterns */
1531 assert(! is_utf8_pat);
73104a1b 1532 if (utf8_target) {
73104a1b
KW
1533 utf8_fold_flags = 0;
1534 goto do_exactf_utf8;
1535 }
1536 fold_array = PL_fold;
1537 folder = foldEQ;
1538 goto do_exactf_non_utf8;
1539
1540 case EXACTFL:
984e6dd1 1541 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1542 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1543 goto do_exactf_utf8;
1544 }
1545 fold_array = PL_fold_locale;
1546 folder = foldEQ_locale;
1547 goto do_exactf_non_utf8;
3c760661 1548
73104a1b 1549 case EXACTFU_SS:
984e6dd1 1550 if (is_utf8_pat) {
73104a1b
KW
1551 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1552 }
1553 goto do_exactf_utf8;
16d951b7 1554
73104a1b 1555 case EXACTFU:
984e6dd1
DM
1556 if (is_utf8_pat || utf8_target) {
1557 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1558 goto do_exactf_utf8;
1559 }
fac1af77 1560
73104a1b
KW
1561 /* Any 'ss' in the pattern should have been replaced by regcomp,
1562 * so we don't have to worry here about this single special case
1563 * in the Latin1 range */
1564 fold_array = PL_fold_latin1;
1565 folder = foldEQ_latin1;
1566
1567 /* FALL THROUGH */
1568
1569 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1570 are no glitches with fold-length differences
1571 between the target string and pattern */
1572
1573 /* The idea in the non-utf8 EXACTF* cases is to first find the
1574 * first character of the EXACTF* node and then, if necessary,
1575 * case-insensitively compare the full text of the node. c1 is the
1576 * first character. c2 is its fold. This logic will not work for
1577 * Unicode semantics and the german sharp ss, which hence should
1578 * not be compiled into a node that gets here. */
1579 pat_string = STRING(c);
1580 ln = STR_LEN(c); /* length to match in octets/bytes */
1581
1582 /* We know that we have to match at least 'ln' bytes (which is the
1583 * same as characters, since not utf8). If we have to match 3
1584 * characters, and there are only 2 availabe, we know without
1585 * trying that it will fail; so don't start a match past the
1586 * required minimum number from the far end */
ea3daa5d 1587 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1588
02d5137b 1589 if (reginfo->intuit && e < s) {
73104a1b
KW
1590 e = s; /* Due to minlen logic of intuit() */
1591 }
fac1af77 1592
73104a1b
KW
1593 c1 = *pat_string;
1594 c2 = fold_array[c1];
1595 if (c1 == c2) { /* If char and fold are the same */
1596 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1597 }
1598 else {
1599 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1600 }
1601 break;
fac1af77 1602
73104a1b
KW
1603 do_exactf_utf8:
1604 {
1605 unsigned expansion;
1606
1607 /* If one of the operands is in utf8, we can't use the simpler folding
1608 * above, due to the fact that many different characters can have the
1609 * same fold, or portion of a fold, or different- length fold */
1610 pat_string = STRING(c);
1611 ln = STR_LEN(c); /* length to match in octets/bytes */
1612 pat_end = pat_string + ln;
984e6dd1 1613 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1614 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1615 : ln;
1616
1617 /* We have 'lnc' characters to match in the pattern, but because of
1618 * multi-character folding, each character in the target can match
1619 * up to 3 characters (Unicode guarantees it will never exceed
1620 * this) if it is utf8-encoded; and up to 2 if not (based on the
1621 * fact that the Latin 1 folds are already determined, and the
1622 * only multi-char fold in that range is the sharp-s folding to
1623 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1624 * string character. Adjust lnc accordingly, rounding up, so that
1625 * if we need to match at least 4+1/3 chars, that really is 5. */
1626 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1627 lnc = (lnc + expansion - 1) / expansion;
1628
1629 /* As in the non-UTF8 case, if we have to match 3 characters, and
1630 * only 2 are left, it's guaranteed to fail, so don't start a
1631 * match that would require us to go beyond the end of the string
1632 */
ea3daa5d 1633 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1634
02d5137b 1635 if (reginfo->intuit && e < s) {
73104a1b
KW
1636 e = s; /* Due to minlen logic of intuit() */
1637 }
0658cdde 1638
73104a1b
KW
1639 /* XXX Note that we could recalculate e to stop the loop earlier,
1640 * as the worst case expansion above will rarely be met, and as we
1641 * go along we would usually find that e moves further to the left.
1642 * This would happen only after we reached the point in the loop
1643 * where if there were no expansion we should fail. Unclear if
1644 * worth the expense */
1645
1646 while (s <= e) {
1647 char *my_strend= (char *)strend;
1648 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1649 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1650 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1651 {
1652 goto got_it;
1653 }
1654 s += (utf8_target) ? UTF8SKIP(s) : 1;
1655 }
1656 break;
1657 }
1658 case BOUNDL:
272d35c9 1659 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1660 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1661 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1662 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1663 break;
1664 case NBOUNDL:
272d35c9 1665 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1666 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1667 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1668 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1669 break;
1670 case BOUND:
1671 FBC_BOUND(isWORDCHAR,
0eb30aeb 1672 isWORDCHAR_uni(tmp),
03940dc2 1673 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1674 break;
1675 case BOUNDA:
1676 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1677 isWORDCHAR_A(tmp),
1678 isWORDCHAR_A((U8*)s));
1679 break;
1680 case NBOUND:
1681 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1682 isWORDCHAR_uni(tmp),
03940dc2 1683 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1684 break;
1685 case NBOUNDA:
1686 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1687 isWORDCHAR_A(tmp),
1688 isWORDCHAR_A((U8*)s));
1689 break;
1690 case BOUNDU:
1691 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1692 isWORDCHAR_uni(tmp),
03940dc2 1693 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1694 break;
1695 case NBOUNDU:
1696 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1697 isWORDCHAR_uni(tmp),
03940dc2 1698 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1699 break;
73104a1b
KW
1700 case LNBREAK:
1701 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1702 is_LNBREAK_latin1_safe(s, strend)
1703 );
1704 break;
3018b823
KW
1705
1706 /* The argument to all the POSIX node types is the class number to pass to
1707 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1708
1709 case NPOSIXL:
1710 to_complement = 1;
1711 /* FALLTHROUGH */
1712
1713 case POSIXL:
272d35c9 1714 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1715 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1716 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1717 break;
3018b823
KW
1718
1719 case NPOSIXD:
1720 to_complement = 1;
1721 /* FALLTHROUGH */
1722
1723 case POSIXD:
1724 if (utf8_target) {
1725 goto posix_utf8;
1726 }
1727 goto posixa;
1728
1729 case NPOSIXA:
1730 if (utf8_target) {
1731 /* The complement of something that matches only ASCII matches all
1732 * UTF-8 variant code points, plus everything in ASCII that isn't
1733 * in the class */
1734 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1735 || ! _generic_isCC_A(*s, FLAGS(c)));
1736 break;
1737 }
1738
1739 to_complement = 1;
1740 /* FALLTHROUGH */
1741
73104a1b 1742 case POSIXA:
3018b823 1743 posixa:
73104a1b 1744 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1745 * byte invariant character. */
1746 REXEC_FBC_CLASS_SCAN(
1747 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1748 break;
3018b823
KW
1749
1750 case NPOSIXU:
1751 to_complement = 1;
1752 /* FALLTHROUGH */
1753
1754 case POSIXU:
1755 if (! utf8_target) {
1756 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1757 FLAGS(c))));
1758 }
1759 else {
1760
1761 posix_utf8:
1762 classnum = (_char_class_number) FLAGS(c);
1763 if (classnum < _FIRST_NON_SWASH_CC) {
1764 while (s < strend) {
1765
1766 /* We avoid loading in the swash as long as possible, but
1767 * should we have to, we jump to a separate loop. This
1768 * extra 'if' statement is what keeps this code from being
1769 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1770 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1771 goto found_above_latin1;
1772 }
1773 if ((UTF8_IS_INVARIANT(*s)
1774 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1775 classnum)))
1776 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1777 && to_complement ^ cBOOL(
94bb8c36
KW
1778 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1779 *(s + 1)),
3018b823
KW
1780 classnum))))
1781 {
02d5137b 1782 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1783 goto got_it;
1784 else {
1785 tmp = doevery;
1786 }
1787 }
1788 else {
1789 tmp = 1;
1790 }
1791 s += UTF8SKIP(s);
1792 }
1793 }
1794 else switch (classnum) { /* These classes are implemented as
1795 macros */
1796 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1797 revert the change of \v matching this */
1798 /* FALL THROUGH */
1799
1800 case _CC_ENUM_PSXSPC:
1801 REXEC_FBC_UTF8_CLASS_SCAN(
1802 to_complement ^ cBOOL(isSPACE_utf8(s)));
1803 break;
1804
1805 case _CC_ENUM_BLANK:
1806 REXEC_FBC_UTF8_CLASS_SCAN(
1807 to_complement ^ cBOOL(isBLANK_utf8(s)));
1808 break;
1809
1810 case _CC_ENUM_XDIGIT:
1811 REXEC_FBC_UTF8_CLASS_SCAN(
1812 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1813 break;
1814
1815 case _CC_ENUM_VERTSPACE:
1816 REXEC_FBC_UTF8_CLASS_SCAN(
1817 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1818 break;
1819
1820 case _CC_ENUM_CNTRL:
1821 REXEC_FBC_UTF8_CLASS_SCAN(
1822 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1823 break;
1824
1825 default:
1826 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1827 assert(0); /* NOTREACHED */
1828 }
1829 }
1830 break;
1831
1832 found_above_latin1: /* Here we have to load a swash to get the result
1833 for the current code point */
1834 if (! PL_utf8_swash_ptrs[classnum]) {
1835 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1836 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
1837 _core_swash_init("utf8",
1838 "",
1839 &PL_sv_undef, 1, 0,
1840 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
1841 }
1842
1843 /* This is a copy of the loop above for swash classes, though using the
1844 * FBC macro instead of being expanded out. Since we've loaded the
1845 * swash, we don't have to check for that each time through the loop */
1846 REXEC_FBC_UTF8_CLASS_SCAN(
1847 to_complement ^ cBOOL(_generic_utf8(
1848 classnum,
1849 s,
1850 swash_fetch(PL_utf8_swash_ptrs[classnum],
1851 (U8 *) s, TRUE))));
73104a1b
KW
1852 break;
1853
1854 case AHOCORASICKC:
1855 case AHOCORASICK:
1856 {
1857 DECL_TRIE_TYPE(c);
1858 /* what trie are we using right now */
1859 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1860 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1861 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1862
1863 const char *last_start = strend - trie->minlen;
6148ee25 1864#ifdef DEBUGGING
73104a1b 1865 const char *real_start = s;
6148ee25 1866#endif
73104a1b
KW
1867 STRLEN maxlen = trie->maxlen;
1868 SV *sv_points;
1869 U8 **points; /* map of where we were in the input string
1870 when reading a given char. For ASCII this
1871 is unnecessary overhead as the relationship
1872 is always 1:1, but for Unicode, especially
1873 case folded Unicode this is not true. */
1874 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1875 U8 *bitmap=NULL;
1876
1877
1878 GET_RE_DEBUG_FLAGS_DECL;
1879
1880 /* We can't just allocate points here. We need to wrap it in
1881 * an SV so it gets freed properly if there is a croak while
1882 * running the match */
1883 ENTER;
1884 SAVETMPS;
1885 sv_points=newSV(maxlen * sizeof(U8 *));
1886 SvCUR_set(sv_points,
1887 maxlen * sizeof(U8 *));
1888 SvPOK_on(sv_points);
1889 sv_2mortal(sv_points);
1890 points=(U8**)SvPV_nolen(sv_points );
1891 if ( trie_type != trie_utf8_fold
1892 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1893 {
1894 if (trie->bitmap)
1895 bitmap=(U8*)trie->bitmap;
1896 else
1897 bitmap=(U8*)ANYOF_BITMAP(c);
1898 }
1899 /* this is the Aho-Corasick algorithm modified a touch
1900 to include special handling for long "unknown char" sequences.
1901 The basic idea being that we use AC as long as we are dealing
1902 with a possible matching char, when we encounter an unknown char
1903 (and we have not encountered an accepting state) we scan forward
1904 until we find a legal starting char.
1905 AC matching is basically that of trie matching, except that when
1906 we encounter a failing transition, we fall back to the current
1907 states "fail state", and try the current char again, a process
1908 we repeat until we reach the root state, state 1, or a legal
1909 transition. If we fail on the root state then we can either
1910 terminate if we have reached an accepting state previously, or
1911 restart the entire process from the beginning if we have not.
1912
1913 */
1914 while (s <= last_start) {
1915 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1916 U8 *uc = (U8*)s;
1917 U16 charid = 0;
1918 U32 base = 1;
1919 U32 state = 1;
1920 UV uvc = 0;
1921 STRLEN len = 0;
1922 STRLEN foldlen = 0;
1923 U8 *uscan = (U8*)NULL;
1924 U8 *leftmost = NULL;
1925#ifdef DEBUGGING
1926 U32 accepted_word= 0;
786e8c11 1927#endif
73104a1b
KW
1928 U32 pointpos = 0;
1929
1930 while ( state && uc <= (U8*)strend ) {
1931 int failed=0;
1932 U32 word = aho->states[ state ].wordnum;
1933
1934 if( state==1 ) {
1935 if ( bitmap ) {
1936 DEBUG_TRIE_EXECUTE_r(
1937 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1938 dump_exec_pos( (char *)uc, c, strend, real_start,
1939 (char *)uc, utf8_target );
1940 PerlIO_printf( Perl_debug_log,
1941 " Scanning for legal start char...\n");
1942 }
1943 );
1944 if (utf8_target) {
1945 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1946 uc += UTF8SKIP(uc);
1947 }
1948 } else {
1949 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1950 uc++;
1951 }
786e8c11 1952 }
73104a1b 1953 s= (char *)uc;
07be1b83 1954 }
73104a1b
KW
1955 if (uc >(U8*)last_start) break;
1956 }
1957
1958 if ( word ) {
1959 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1960 if (!leftmost || lpos < leftmost) {
1961 DEBUG_r(accepted_word=word);
1962 leftmost= lpos;
7016d6eb 1963 }
73104a1b 1964 if (base==0) break;
7016d6eb 1965
73104a1b
KW
1966 }
1967 points[pointpos++ % maxlen]= uc;
1968 if (foldlen || uc < (U8*)strend) {
1969 REXEC_TRIE_READ_CHAR(trie_type, trie,
1970 widecharmap, uc,
1971 uscan, len, uvc, charid, foldlen,
1972 foldbuf, uniflags);
1973 DEBUG_TRIE_EXECUTE_r({
1974 dump_exec_pos( (char *)uc, c, strend,
1975 real_start, s, utf8_target);
1976 PerlIO_printf(Perl_debug_log,
1977 " Charid:%3u CP:%4"UVxf" ",
1978 charid, uvc);
1979 });
1980 }
1981 else {
1982 len = 0;
1983 charid = 0;
1984 }
07be1b83 1985
73104a1b
KW
1986
1987 do {
6148ee25 1988#ifdef DEBUGGING
73104a1b 1989 word = aho->states[ state ].wordnum;
6148ee25 1990#endif
73104a1b
KW
1991 base = aho->states[ state ].trans.base;
1992
1993 DEBUG_TRIE_EXECUTE_r({
1994 if (failed)
1995 dump_exec_pos( (char *)uc, c, strend, real_start,
1996 s, utf8_target );
1997 PerlIO_printf( Perl_debug_log,
1998 "%sState: %4"UVxf", word=%"UVxf,
1999 failed ? " Fail transition to " : "",
2000 (UV)state, (UV)word);
2001 });
2002 if ( base ) {
2003 U32 tmp;
2004 I32 offset;
2005 if (charid &&
2006 ( ((offset = base + charid
2007 - 1 - trie->uniquecharcount)) >= 0)
2008 && ((U32)offset < trie->lasttrans)
2009 && trie->trans[offset].check == state
2010 && (tmp=trie->trans[offset].next))
2011 {
2012 DEBUG_TRIE_EXECUTE_r(
2013 PerlIO_printf( Perl_debug_log," - legal\n"));
2014 state = tmp;
2015 break;
07be1b83
YO
2016 }
2017 else {
786e8c11 2018 DEBUG_TRIE_EXECUTE_r(
73104a1b 2019 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2020 failed = 1;
73104a1b 2021 state = aho->fail[state];
07be1b83 2022 }
07be1b83 2023 }
73104a1b
KW
2024 else {
2025 /* we must be accepting here */
2026 DEBUG_TRIE_EXECUTE_r(
2027 PerlIO_printf( Perl_debug_log," - accepting\n"));
2028 failed = 1;
2029 break;
786e8c11 2030 }
73104a1b
KW
2031 } while(state);
2032 uc += len;
2033 if (failed) {
2034 if (leftmost)
2035 break;
2036 if (!state) state = 1;
07be1b83 2037 }
73104a1b
KW
2038 }
2039 if ( aho->states[ state ].wordnum ) {
2040 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2041 if (!leftmost || lpos < leftmost) {
2042 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2043 leftmost = lpos;
07be1b83
YO
2044 }
2045 }
73104a1b
KW
2046 if (leftmost) {
2047 s = (char*)leftmost;
2048 DEBUG_TRIE_EXECUTE_r({
2049 PerlIO_printf(
2050 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2051 (UV)accepted_word, (IV)(s - real_start)
2052 );
2053 });
02d5137b 2054 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2055 FREETMPS;
2056 LEAVE;
2057 goto got_it;
2058 }
2059 s = HOPc(s,1);
2060 DEBUG_TRIE_EXECUTE_r({
2061 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2062 });
2063 } else {
2064 DEBUG_TRIE_EXECUTE_r(
2065 PerlIO_printf( Perl_debug_log,"No match.\n"));
2066 break;
2067 }
2068 }
2069 FREETMPS;
2070 LEAVE;
2071 }
2072 break;
2073 default:
2074 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2075 break;
2076 }
2077 return 0;
2078 got_it:
2079 return s;
6eb5f6b9
JH
2080}
2081
60165aa4
DM
2082/* set RX_SAVED_COPY, RX_SUBBEG etc.
2083 * flags have same meanings as with regexec_flags() */
2084
749f4950
DM
2085static void
2086S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2087 char *strbeg,
2088 char *strend,
2089 SV *sv,
2090 U32 flags,
2091 bool utf8_target)
2092{
2093 struct regexp *const prog = ReANY(rx);
2094
60165aa4
DM
2095 if (flags & REXEC_COPY_STR) {
2096#ifdef PERL_ANY_COW
2097 if (SvCANCOW(sv)) {
2098 if (DEBUG_C_TEST) {
2099 PerlIO_printf(Perl_debug_log,
2100 "Copy on write: regexp capture, type %d\n",
2101 (int) SvTYPE(sv));
2102 }
5411a0e5
DM
2103 /* Create a new COW SV to share the match string and store
2104 * in saved_copy, unless the current COW SV in saved_copy
2105 * is valid and suitable for our purpose */
2106 if (( prog->saved_copy
2107 && SvIsCOW(prog->saved_copy)
2108 && SvPOKp(prog->saved_copy)
2109 && SvIsCOW(sv)
2110 && SvPOKp(sv)
2111 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2112 {
5411a0e5
DM
2113 /* just reuse saved_copy SV */
2114 if (RXp_MATCH_COPIED(prog)) {
2115 Safefree(prog->subbeg);
2116 RXp_MATCH_COPIED_off(prog);
2117 }
2118 }
2119 else {
2120 /* create new COW SV to share string */
a76b0e90
DM
2121 RX_MATCH_COPY_FREE(rx);
2122 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2123 }
5411a0e5
DM
2124 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2125 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2126 prog->sublen = strend - strbeg;
2127 prog->suboffset = 0;
2128 prog->subcoffset = 0;
2129 } else
2130#endif
2131 {
99a90e59
FC
2132 SSize_t min = 0;
2133 SSize_t max = strend - strbeg;
ea3daa5d 2134 SSize_t sublen;
60165aa4
DM
2135
2136 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2137 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2138 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2139 ) { /* don't copy $' part of string */
2140 U32 n = 0;
2141 max = -1;
2142 /* calculate the right-most part of the string covered
2143 * by a capture. Due to look-ahead, this may be to
2144 * the right of $&, so we have to scan all captures */
2145 while (n <= prog->lastparen) {
2146 if (prog->offs[n].end > max)
2147 max = prog->offs[n].end;
2148 n++;
2149 }
2150 if (max == -1)
2151 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2152 ? prog->offs[0].start
2153 : 0;
2154 assert(max >= 0 && max <= strend - strbeg);
2155 }
2156
2157 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2158 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2159 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2160 ) { /* don't copy $` part of string */
2161 U32 n = 0;
2162 min = max;
2163 /* calculate the left-most part of the string covered
2164 * by a capture. Due to look-behind, this may be to
2165 * the left of $&, so we have to scan all captures */
2166 while (min && n <= prog->lastparen) {
2167 if ( prog->offs[n].start != -1
2168 && prog->offs[n].start < min)
2169 {
2170 min = prog->offs[n].start;
2171 }
2172 n++;
2173 }
2174 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2175 && min > prog->offs[0].end
2176 )
2177 min = prog->offs[0].end;
2178
2179 }
2180
2181 assert(min >= 0 && min <= max && min <= strend - strbeg);
2182 sublen = max - min;
2183
2184 if (RX_MATCH_COPIED(rx)) {
2185 if (sublen > prog->sublen)
2186 prog->subbeg =
2187 (char*)saferealloc(prog->subbeg, sublen+1);
2188 }
2189 else
2190 prog->subbeg = (char*)safemalloc(sublen+1);
2191 Copy(strbeg + min, prog->subbeg, sublen, char);
2192 prog->subbeg[sublen] = '\0';
2193 prog->suboffset = min;
2194 prog->sublen = sublen;
2195 RX_MATCH_COPIED_on(rx);
2196 }
2197 prog->subcoffset = prog->suboffset;
2198 if (prog->suboffset && utf8_target) {
2199 /* Convert byte offset to chars.
2200 * XXX ideally should only compute this if @-/@+
2201 * has been seen, a la PL_sawampersand ??? */
2202
2203 /* If there's a direct correspondence between the
2204 * string which we're matching and the original SV,
2205 * then we can use the utf8 len cache associated with
2206 * the SV. In particular, it means that under //g,
2207 * sv_pos_b2u() will use the previously cached
2208 * position to speed up working out the new length of
2209 * subcoffset, rather than counting from the start of
2210 * the string each time. This stops
2211 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2212 * from going quadratic */
2213 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2214 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2215 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2216 else
2217 prog->subcoffset = utf8_length((U8*)strbeg,
2218 (U8*)(strbeg+prog->suboffset));
2219 }
2220 }
2221 else {
2222 RX_MATCH_COPY_FREE(rx);
2223 prog->subbeg = strbeg;
2224 prog->suboffset = 0;
2225 prog->subcoffset = 0;
2226 prog->sublen = strend - strbeg;
2227 }
2228}
2229
2230
2231
fae667d5 2232
6eb5f6b9
JH
2233/*
2234 - regexec_flags - match a regexp against a string
2235 */
2236I32
5aaab254 2237Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2238 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2239/* stringarg: the point in the string at which to begin matching */
2240/* strend: pointer to null at end of string */
2241/* strbeg: real beginning of string */
2242/* minend: end of match must be >= minend bytes after stringarg. */
2243/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2244 * itself is accessed via the pointers above */
2245/* data: May be used for some additional optimizations.
d058ec57 2246 Currently unused. */
a340edde 2247/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2248
6eb5f6b9 2249{
97aff369 2250 dVAR;
8d919b0a 2251 struct regexp *const prog = ReANY(rx);
5aaab254 2252 char *s;
eb578fdb 2253 regnode *c;
03c83e26 2254 char *startpos;
ea3daa5d
FC
2255 SSize_t minlen; /* must match at least this many chars */
2256 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2257 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2258 I32 multiline;
f8fc2ecf 2259 RXi_GET_DECL(prog,progi);
02d5137b
DM
2260 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2261 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2262 regexp_paren_pair *swap = NULL;
006f26b2 2263 I32 oldsave;
a3621e74
YO
2264 GET_RE_DEBUG_FLAGS_DECL;
2265
7918f24d 2266 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2267 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2268
2269 /* Be paranoid... */
03c83e26 2270 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2271 Perl_croak(aTHX_ "NULL regexp parameter");
2272 return 0;
2273 }
2274
6c3fea77 2275 DEBUG_EXECUTE_r(
03c83e26 2276 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2277 "Matching");
2278 );
8adc0f72 2279
b342a604
DM
2280 startpos = stringarg;
2281
03c83e26 2282 if (prog->extflags & RXf_GPOS_SEEN) {
d307c076
DM
2283 MAGIC *mg;
2284
fef7148b
DM
2285 /* set reginfo->ganch, the position where \G can match */
2286
2287 reginfo->ganch =
2288 (flags & REXEC_IGNOREPOS)
2289 ? stringarg /* use start pos rather than pos() */
2290 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2291 /* Defined pos(): */
2292 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2293 : strbeg; /* pos() not defined; use start of string */
2294
2295 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2296 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2297
03c83e26
DM
2298 /* in the presence of \G, we may need to start looking earlier in
2299 * the string than the suggested start point of stringarg:
0b2c2a84 2300 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2301 * offset, such as
2302 * /..\G/: gofs = 2
2303 * /ab|c\G/: gofs = 1
2304 * or if the minimum offset isn't known, then we have to go back
2305 * to the start of the string, e.g. /w+\G/
2306 */
2bfbe302
DM
2307
2308 if (prog->extflags & RXf_ANCH_GPOS) {
2309 startpos = reginfo->ganch - prog->gofs;
2310 if (startpos <
2311 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2312 {
2313 DEBUG_r(PerlIO_printf(Perl_debug_log,
2314 "fail: ganch-gofs before earliest possible start\n"));
2315 return 0;
2316 }
2317 }
2318 else if (prog->gofs) {
b342a604
DM
2319 if (startpos - prog->gofs < strbeg)
2320 startpos = strbeg;
2321 else
2322 startpos -= prog->gofs;
03c83e26 2323 }
b342a604
DM
2324 else if (prog->extflags & RXf_GPOS_FLOAT)
2325 startpos = strbeg;
03c83e26
DM
2326 }
2327
2328 minlen = prog->minlen;
b342a604 2329 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2330 DEBUG_r(PerlIO_printf(Perl_debug_log,
2331 "Regex match can't succeed, so not even tried\n"));
2332 return 0;
2333 }
2334
63a3746a
DM
2335 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2336 * which will call destuctors to reset PL_regmatch_state, free higher
2337 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2338 * regmatch_info_aux_eval */
2339
2340 oldsave = PL_savestack_ix;
2341
dfa77d06
DM
2342 s = startpos;
2343
e322109a 2344 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2345 && !(flags & REXEC_CHECKED))
2346 {
dfa77d06 2347 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2348 flags, NULL);
dfa77d06 2349 if (!s)
7fadf4a7
DM
2350 return 0;
2351
e322109a 2352 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2353 /* we can match based purely on the result of INTUIT.
2354 * Set up captures etc just for $& and $-[0]
2355 * (an intuit-only match wont have $1,$2,..) */
2356 assert(!prog->nparens);
d5e7783a
DM
2357
2358 /* s/// doesn't like it if $& is earlier than where we asked it to
2359 * start searching (which can happen on something like /.\G/) */
2360 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2361 && (s < stringarg))
2362 {
2363 /* this should only be possible under \G */
2364 assert(prog->extflags & RXf_GPOS_SEEN);
2365 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2366 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2367 goto phooey;
2368 }
2369
7fadf4a7
DM
2370 /* match via INTUIT shouldn't have any captures.
2371 * Let @-, @+, $^N know */
2372 prog->lastparen = prog->lastcloseparen = 0;
2373 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2374 prog->offs[0].start = s - strbeg;
2375 prog->offs[0].end = utf8_target
2376 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2377 : s - strbeg + prog->minlenret;
7fadf4a7 2378 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2379 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2380 strbeg, strend,
2381 sv, flags, utf8_target);
2382
7fadf4a7
DM
2383 return 1;
2384 }
2385 }
2386
6c3fea77 2387 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2388
dfa77d06 2389 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2390 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2391 "String too short [regexec_flags]...\n"));
2392 goto phooey;
1aa99e6b 2393 }
1de06328 2394
6eb5f6b9 2395 /* Check validity of program. */
f8fc2ecf 2396 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2397 Perl_croak(aTHX_ "corrupted regexp program");
2398 }
2399
272d35c9 2400 RX_MATCH_TAINTED_off(rx);
6eb5f6b9 2401
6c3fea77
DM
2402 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2403 reginfo->intuit = 0;
2404 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2405 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2406 reginfo->warned = FALSE;
9d9163fb 2407 reginfo->strbeg = strbeg;
02d5137b 2408 reginfo->sv = sv;
1cb48e53 2409 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2410 reginfo->strend = strend;
6eb5f6b9 2411 /* see how far we have to get to not match where we matched before */
fe3974be 2412 reginfo->till = stringarg + minend;
6eb5f6b9 2413
82c23608
FC
2414 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2415 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2416 S_cleanup_regmatch_info_aux has executed (registered by
2417 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2418 magic belonging to this SV.
2419 Not newSVsv, either, as it does not COW.
2420 */
2421 reginfo->sv = newSV(0);
4cba5ac0 2422 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2423 SAVEFREESV(reginfo->sv);
2424 }
2425
331b2dcc
DM
2426 /* reserve next 2 or 3 slots in PL_regmatch_state:
2427 * slot N+0: may currently be in use: skip it
2428 * slot N+1: use for regmatch_info_aux struct
2429 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2430 * slot N+3: ready for use by regmatch()
2431 */
bf2039a9 2432
331b2dcc
DM
2433 {
2434 regmatch_state *old_regmatch_state;
2435 regmatch_slab *old_regmatch_slab;
2436 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2437
2438 /* on first ever match, allocate first slab */
2439 if (!PL_regmatch_slab) {
2440 Newx(PL_regmatch_slab, 1, regmatch_slab);
2441 PL_regmatch_slab->prev = NULL;
2442 PL_regmatch_slab->next = NULL;
2443 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2444 }
bf2039a9 2445
331b2dcc
DM
2446 old_regmatch_state = PL_regmatch_state;
2447 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2448
331b2dcc
DM
2449 for (i=0; i <= max; i++) {
2450 if (i == 1)
2451 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2452 else if (i ==2)
2453 reginfo->info_aux_eval =
2454 reginfo->info_aux->info_aux_eval =
2455 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2456
331b2dcc
DM
2457 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2458 PL_regmatch_state = S_push_slab(aTHX);
2459 }
bf2039a9 2460
331b2dcc
DM
2461 /* note initial PL_regmatch_state position; at end of match we'll
2462 * pop back to there and free any higher slabs */
bf2039a9 2463
331b2dcc
DM
2464 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2465 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2466 reginfo->info_aux->poscache = NULL;
bf2039a9 2467
331b2dcc 2468 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2469
331b2dcc
DM
2470 if ((prog->extflags & RXf_EVAL_SEEN))
2471 S_setup_eval_state(aTHX_ reginfo);
2472 else
2473 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2474 }
d3aa529c 2475
6eb5f6b9 2476 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2477
288b8c02 2478 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2479 /* We have to be careful. If the previous successful match
2480 was from this regex we don't want a subsequent partially
2481 successful match to clobber the old results.
2482 So when we detect this possibility we add a swap buffer
d8da0584
KW
2483 to the re, and switch the buffer each match. If we fail,
2484 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2485 */
2486 swap = prog->offs;
2487 /* do we need a save destructor here for eval dies? */
2488 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2489 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2490 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2491 PTR2UV(prog),
2492 PTR2UV(swap),
2493 PTR2UV(prog->offs)
2494 ));
c74340f9 2495 }
6eb5f6b9
JH
2496
2497 /* Simplest case: anchored match need be tried only once. */
2498 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2499 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
3542935d 2500 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2501 goto got_it;
bbe252da
YO
2502 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2503 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2504 {
2505 char *end;
2506
2507 if (minlen)
2508 dontbother = minlen - 1;
1aa99e6b 2509 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2510 /* for multiline we only have to try after newlines */
33b8afdf 2511 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2512 /* because of the goto we can not easily reuse the macros for bifurcating the
2513 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2514 if (utf8_target) {
2515 if (s == startpos)
2516 goto after_try_utf8;
2517 while (1) {
02d5137b 2518 if (regtry(reginfo, &s)) {
92f3d482
YO
2519 goto got_it;
2520 }
2521 after_try_utf8:
2522 if (s > end) {
2523 goto phooey;
2524 }
2525 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2526 s = re_intuit_start(rx, sv, strbeg,
2527 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2528 if (!s) {
2529 goto phooey;
2530 }
2531 }
2532 else {
2533 s += UTF8SKIP(s);
2534 }
2535 }
2536 } /* end search for check string in unicode */
2537 else {
2538 if (s == startpos) {
2539 goto after_try_latin;
2540 }
2541 while (1) {
02d5137b 2542 if (regtry(reginfo, &s)) {
92f3d482
YO
2543 goto got_it;
2544 }
2545 after_try_latin:
2546 if (s > end) {
2547 goto phooey;
2548 }
2549 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2550 s = re_intuit_start(rx, sv, strbeg,
2551 s + 1, strend, flags, NULL);
92f3d482
YO
2552 if (!s) {
2553 goto phooey;
2554 }
2555 }
2556 else {
2557 s++;
2558 }
2559 }
2560 } /* end search for check string in latin*/
2561 } /* end search for check string */
2562 else { /* search for newline */
2563 if (s > startpos) {
2564 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2565 s--;
92f3d482 2566 }
21eede78
YO
2567 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2568 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2569 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2570 if (regtry(reginfo, &s))
6eb5f6b9
JH
2571 goto got_it;
2572 }
92f3d482
YO
2573 }
2574 } /* end search for newline */
2575 } /* end anchored/multiline check string search */
6eb5f6b9 2576 goto phooey;
bbe252da 2577 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2578 {
2bfbe302
DM
2579 /* For anchored \G, the only position it can match from is
2580 * (ganch-gofs); we already set startpos to this above; if intuit
2581 * moved us on from there, we can't possibly succeed */
2582 assert(startpos == reginfo->ganch - prog->gofs);
2583 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2584 goto got_it;
2585 goto phooey;
2586 }
2587
2588 /* Messy cases: unanchored match. */
bbe252da 2589 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2590 /* we have /x+whatever/ */
984e6dd1 2591 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2592 char ch;
bf93d4cc
GS
2593#ifdef DEBUGGING
2594 int did_match = 0;
2595#endif
f2ed9b32 2596 if (utf8_target) {
7e0d5ad7
KW
2597 if (! prog->anchored_utf8) {
2598 to_utf8_substr(prog);
2599 }
2600 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2601 REXEC_FBC_SCAN(
6eb5f6b9 2602 if (*s == ch) {
a3621e74 2603 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2604 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2605 s += UTF8SKIP(s);
2606 while (s < strend && *s == ch)
2607 s += UTF8SKIP(s);
2608 }
4cadc6a9 2609 );
7e0d5ad7 2610
6eb5f6b9
JH
2611 }
2612 else {
7e0d5ad7
KW
2613 if (! prog->anchored_substr) {
2614 if (! to_byte_substr(prog)) {
6b54ddc5 2615 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2616 }
2617 }
2618 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2619 REXEC_FBC_SCAN(
6eb5f6b9 2620 if (*s == ch) {
a3621e74 2621 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2622 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2623 s++;
2624 while (s < strend && *s == ch)
2625 s++;
2626 }
4cadc6a9 2627 );
6eb5f6b9 2628 }
a3621e74 2629 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2630 PerlIO_printf(Perl_debug_log,
b7953727
JH
2631 "Did not find anchored character...\n")
2632 );
6eb5f6b9 2633 }
a0714e2c
SS
2634 else if (prog->anchored_substr != NULL
2635 || prog->anchored_utf8 != NULL
2636 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2637 && prog->float_max_offset < strend - s)) {
2638 SV *must;
ea3daa5d
FC
2639 SSize_t back_max;
2640 SSize_t back_min;
33b8afdf 2641 char *last;
6eb5f6b9 2642 char *last1; /* Last position checked before */
bf93d4cc
GS
2643#ifdef DEBUGGING
2644 int did_match = 0;
2645#endif
33b8afdf 2646 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2647 if (utf8_target) {
2648 if (! prog->anchored_utf8) {
2649 to_utf8_substr(prog);
2650 }
2651 must = prog->anchored_utf8;
2652 }
2653 else {
2654 if (! prog->anchored_substr) {
2655 if (! to_byte_substr(prog)) {
6b54ddc5 2656 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2657 }
2658 }
2659 must = prog->anchored_substr;
2660 }
33b8afdf
JH
2661 back_max = back_min = prog->anchored_offset;
2662 } else {
7e0d5ad7
KW
2663 if (utf8_target) {
2664 if (! prog->float_utf8) {
2665 to_utf8_substr(prog);
2666 }
2667 must = prog->float_utf8;
2668 }
2669 else {
2670 if (! prog->float_substr) {
2671 if (! to_byte_substr(prog)) {
6b54ddc5 2672 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2673 }
2674 }
2675 must = prog->float_substr;
2676 }
33b8afdf
JH
2677 back_max = prog->float_max_offset;
2678 back_min = prog->float_min_offset;
2679 }
1de06328 2680
1de06328
YO
2681 if (back_min<0) {
2682 last = strend;
2683 } else {
2684 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2685 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2686 - (SvTAIL(must) != 0) + back_min), strbeg);
2687 }
9d9163fb 2688 if (s > reginfo->strbeg)
6eb5f6b9
JH
2689 last1 = HOPc(s, -1);
2690 else
2691 last1 = s - 1; /* bogus */
2692
a0288114 2693 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2694 check_substr==must. */
bf05793b 2695 dontbother = 0;
6eb5f6b9
JH
2696 strend = HOPc(strend, -dontbother);
2697 while ( (s <= last) &&
c33e64f0 2698 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2699 (unsigned char*)strend, must,
c33e64f0 2700 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2701 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2702 if (HOPc(s, -back_max) > last1) {
2703 last1 = HOPc(s, -back_min);
2704 s = HOPc(s, -back_max);
2705 }
2706 else {
9d9163fb
DM
2707 char * const t = (last1 >= reginfo->strbeg)
2708 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2709
2710 last1 = HOPc(s, -back_min);
52657f30 2711 s = t;
6eb5f6b9 2712 }
f2ed9b32 2713 if (utf8_target) {
6eb5f6b9 2714 while (s <= last1) {
02d5137b 2715 if (regtry(reginfo, &s))
6eb5f6b9 2716 goto got_it;
7016d6eb
DM
2717 if (s >= last1) {
2718 s++; /* to break out of outer loop */
2719 break;
2720 }
2721 s += UTF8SKIP(s);
6eb5f6b9
JH
2722 }
2723 }
2724 else {
2725 while (s <= last1) {
02d5137b 2726 if (regtry(reginfo, &s))
6eb5f6b9
JH
2727 goto got_it;
2728 s++;
2729 }
2730 }
2731 }
ab3bbdeb 2732 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2733 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2734 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2735 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2736 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2737 ? "anchored" : "floating"),
ab3bbdeb
YO
2738 quoted, RE_SV_TAIL(must));
2739 });
6eb5f6b9
JH
2740 goto phooey;
2741 }
f8fc2ecf 2742 else if ( (c = progi->regstclass) ) {
f14c76ed 2743 if (minlen) {
f8fc2ecf 2744 const OPCODE op = OP(progi->regstclass);
66e933ab 2745 /* don't bother with what can't match */
786e8c11 2746 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2747 strend = HOPc(strend, -(minlen - 1));
2748 }
a3621e74 2749 DEBUG_EXECUTE_r({
be8e71aa 2750 SV * const prop = sv_newmortal();
32fc9b6a 2751 regprop(prog, prop, c);
0df25f3d 2752 {
f2ed9b32 2753 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2754 s,strend-s,60);
0df25f3d 2755 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2756 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2757 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2758 quoted, (int)(strend - s));
0df25f3d 2759 }
ffc61ed2 2760 });
f9176b44 2761 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2762 goto got_it;
07be1b83 2763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2764 }
2765 else {
2766 dontbother = 0;
a0714e2c 2767 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2768 /* Trim the end. */
6af40bd7 2769 char *last= NULL;
33b8afdf 2770 SV* float_real;
c33e64f0
FC
2771 STRLEN len;
2772 const char *little;
33b8afdf 2773
7e0d5ad7
KW
2774 if (utf8_target) {
2775 if (! prog->float_utf8) {
2776 to_utf8_substr(prog);
2777 }
2778 float_real = prog->float_utf8;
2779 }
2780 else {
2781 if (! prog->float_substr) {
2782 if (! to_byte_substr(prog)) {
6b54ddc5 2783 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2784 }
2785 }
2786 float_real = prog->float_substr;
2787 }
d6a28714 2788
c33e64f0
FC
2789 little = SvPV_const(float_real, len);
2790 if (SvTAIL(float_real)) {
7f18ad16
KW
2791 /* This means that float_real contains an artificial \n on
2792 * the end due to the presence of something like this:
2793 * /foo$/ where we can match both "foo" and "foo\n" at the
2794 * end of the string. So we have to compare the end of the
2795 * string first against the float_real without the \n and
2796 * then against the full float_real with the string. We
2797 * have to watch out for cases where the string might be
2798 * smaller than the float_real or the float_real without
2799 * the \n. */
1a13b075
YO
2800 char *checkpos= strend - len;
2801 DEBUG_OPTIMISE_r(
2802 PerlIO_printf(Perl_debug_log,
2803 "%sChecking for float_real.%s\n",
2804 PL_colors[4], PL_colors[5]));
2805 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2806 /* can't match, even if we remove the trailing \n
2807 * string is too short to match */
1a13b075
YO
2808 DEBUG_EXECUTE_r(
2809 PerlIO_printf(Perl_debug_log,
2810 "%sString shorter than required trailing substring, cannot match.%s\n",
2811 PL_colors[4], PL_colors[5]));
2812 goto phooey;
2813 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2814 /* can match, the end of the string matches without the
2815 * "\n" */
1a13b075
YO
2816 last = checkpos + 1;
2817 } else if (checkpos < strbeg) {
7f18ad16
KW
2818 /* cant match, string is too short when the "\n" is
2819 * included */
1a13b075
YO
2820 DEBUG_EXECUTE_r(
2821 PerlIO_printf(Perl_debug_log,
2822 "%sString does not contain required trailing substring, cannot match.%s\n",
2823 PL_colors[4], PL_colors[5]));
2824 goto phooey;
2825 } else if (!multiline) {
7f18ad16
KW
2826 /* non multiline match, so compare with the "\n" at the
2827 * end of the string */
1a13b075
YO
2828 if (memEQ(checkpos, little, len)) {
2829 last= checkpos;
2830 } else {
2831 DEBUG_EXECUTE_r(
2832 PerlIO_printf(Perl_debug_log,
2833 "%sString does not contain required trailing substring, cannot match.%s\n",
2834 PL_colors[4], PL_colors[5]));
2835 goto phooey;
2836 }
2837 } else {
7f18ad16
KW
2838 /* multiline match, so we have to search for a place
2839 * where the full string is located */
d6a28714 2840 goto find_last;
1a13b075 2841 }
c33e64f0 2842 } else {
d6a28714 2843 find_last:
9041c2e3 2844 if (len)
d6a28714 2845 last = rninstr(s, strend, little, little + len);
b8c5462f 2846 else
a0288114 2847 last = strend; /* matching "$" */
b8c5462f 2848 }
6af40bd7 2849 if (!last) {
7f18ad16
KW
2850 /* at one point this block contained a comment which was
2851 * probably incorrect, which said that this was a "should not
2852 * happen" case. Even if it was true when it was written I am
2853 * pretty sure it is not anymore, so I have removed the comment
2854 * and replaced it with this one. Yves */
6bda09f9
YO
2855 DEBUG_EXECUTE_r(
2856 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2857 "String does not contain required substring, cannot match.\n"
2858 ));
2859 goto phooey;
bf93d4cc 2860 }
d6a28714
JH
2861 dontbother = strend - last + prog->float_min_offset;
2862 }
2863 if (minlen && (dontbother < minlen))
2864 dontbother = minlen - 1;
2865 strend -= dontbother; /* this one's always in bytes! */
2866 /* We don't know much -- general case. */
f2ed9b32 2867 if (utf8_target) {
d6a28714 2868 for (;;) {
02d5137b 2869 if (regtry(reginfo, &s))
d6a28714
JH
2870 goto got_it;
2871 if (s >= strend)
2872 break;
b8c5462f 2873 s += UTF8SKIP(s);
d6a28714
JH
2874 };
2875 }
2876 else {
2877 do {
02d5137b 2878 if (regtry(reginfo, &s))
d6a28714
JH
2879 goto got_it;
2880 } while (s++ < strend);
2881 }
2882 }
2883
2884 /* Failure. */
2885 goto phooey;
2886
2887got_it:
d5e7783a
DM
2888 /* s/// doesn't like it if $& is earlier than where we asked it to
2889 * start searching (which can happen on something like /.\G/) */
2890 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2891 && (prog->offs[0].start < stringarg - strbeg))
2892 {
2893 /* this should only be possible under \G */
2894 assert(prog->extflags & RXf_GPOS_SEEN);
2895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2896 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2897 goto phooey;
2898 }
2899
495f47a5
DM
2900 DEBUG_BUFFERS_r(
2901 if (swap)
2902 PerlIO_printf(Perl_debug_log,
2903 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2904 PTR2UV(prog),
2905 PTR2UV(swap)
2906 );
2907 );
e9105d30 2908 Safefree(swap);
d6a28714 2909
bf2039a9
DM
2910 /* clean up; this will trigger destructors that will free all slabs
2911 * above the current one, and cleanup the regmatch_info_aux
2912 * and regmatch_info_aux_eval sructs */
8adc0f72 2913
006f26b2
DM
2914 LEAVE_SCOPE(oldsave);
2915
5daac39c
NC
2916 if (RXp_PAREN_NAMES(prog))
2917 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 2918
0254aed9
DM
2919 RX_MATCH_UTF8_set(rx, utf8_target);
2920
d6a28714 2921 /* make sure $`, $&, $', and $digit will work later */
60165aa4 2922 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2923 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
2924 strbeg, reginfo->strend,
2925 sv, flags, utf8_target);
9041c2e3 2926
d6a28714
JH
2927 return 1;
2928
2929phooey:
a3621e74 2930 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2931 PL_colors[4], PL_colors[5]));
8adc0f72 2932
bf2039a9
DM
2933 /* clean up; this will trigger destructors that will free all slabs
2934 * above the current one, and cleanup the regmatch_info_aux
2935 * and regmatch_info_aux_eval sructs */
8adc0f72 2936
006f26b2
DM
2937 LEAVE_SCOPE(oldsave);
2938
e9105d30 2939 if (swap) {
c74340f9 2940 /* we failed :-( roll it back */
495f47a5
DM
2941 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2942 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2943 PTR2UV(prog),
2944 PTR2UV(prog->offs),
2945 PTR2UV(swap)
2946 ));
e9105d30
GG
2947 Safefree(prog->offs);
2948 prog->offs = swap;
2949 }
d6a28714
JH
2950 return 0;
2951}
2952
6bda09f9 2953
b3d298be 2954/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b
DM
2955 * Do inc before dec, in case old and new rex are the same */
2956#define SET_reg_curpm(Re2) \
bf2039a9 2957 if (reginfo->info_aux_eval) { \
ec43f78b
DM
2958 (void)ReREFCNT_inc(Re2); \
2959 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2960 PM_SETRE((PL_reg_curpm), (Re2)); \
2961 }
2962
2963
d6a28714
JH
2964/*
2965 - regtry - try match at specific point
2966 */
2967STATIC I32 /* 0 failure, 1 success */
f73aaa43 2968S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2969{
97aff369 2970 dVAR;
d6a28714 2971 CHECKPOINT lastcp;
288b8c02 2972 REGEXP *const rx = reginfo->prog;
8d919b0a 2973 regexp *const prog = ReANY(rx);
99a90e59 2974 SSize_t result;
f8fc2ecf 2975 RXi_GET_DECL(prog,progi);
a3621e74 2976 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2977
2978 PERL_ARGS_ASSERT_REGTRY;
2979
24b23f37 2980 reginfo->cutpoint=NULL;
d6a28714 2981
9d9163fb 2982 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 2983 prog->lastparen = 0;
03994de8 2984 prog->lastcloseparen = 0;
d6a28714
JH
2985
2986 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2987 to do this again and again, prog->lastparen should take care of
3dd2943c 2988 this! --ilya*/
dafc8851
JH
2989
2990 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2991 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2992 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2993 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2994 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2995 * Meanwhile, this code *is* needed for the
daf18116
JH
2996 * above-mentioned test suite tests to succeed. The common theme
2997 * on those tests seems to be returning null fields from matches.
225593e1 2998 * --jhi updated by dapm */
dafc8851 2999#if 1
d6a28714 3000 if (prog->nparens) {
b93070ed 3001 regexp_paren_pair *pp = prog->offs;
eb578fdb 3002 I32 i;
b93070ed 3003 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3004 ++pp;
3005 pp->start = -1;
3006 pp->end = -1;
d6a28714
JH
3007 }
3008 }
dafc8851 3009#endif
02db2b7b 3010 REGCP_SET(lastcp);
f73aaa43
DM
3011 result = regmatch(reginfo, *startposp, progi->program + 1);
3012 if (result != -1) {
3013 prog->offs[0].end = result;
d6a28714
JH
3014 return 1;
3015 }
24b23f37 3016 if (reginfo->cutpoint)
f73aaa43 3017 *startposp= reginfo->cutpoint;
02db2b7b 3018 REGCP_UNWIND(lastcp);
d6a28714
JH
3019 return 0;
3020}
3021
02db2b7b 3022
8ba1375e
MJD
3023#define sayYES goto yes
3024#define sayNO goto no
262b90c4 3025#define sayNO_SILENT goto no_silent
8ba1375e 3026
f9f4320a
YO
3027/* we dont use STMT_START/END here because it leads to
3028 "unreachable code" warnings, which are bogus, but distracting. */
3029#define CACHEsayNO \
c476f425 3030 if (ST.cache_mask) \
2ac8ff4b 3031 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3032 sayNO
3298f257 3033
a3621e74 3034/* this is used to determine how far from the left messages like
265c4333
YO
3035 'failed...' are printed. It should be set such that messages
3036 are inline with the regop output that created them.
a3621e74 3037*/
265c4333 3038#define REPORT_CODE_OFF 32
a3621e74
YO
3039
3040
40a82448
DM
3041#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3042#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3043#define CHRTEST_NOT_A_CP_1 -999
3044#define CHRTEST_NOT_A_CP_2 -998
9e137952 3045
5d9a96ca
DM
3046/* grab a new slab and return the first slot in it */
3047
3048STATIC regmatch_state *
3049S_push_slab(pTHX)
3050{
a35a87e7 3051#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3052 dMY_CXT;
3053#endif
5d9a96ca
DM
3054 regmatch_slab *s = PL_regmatch_slab->next;
3055 if (!s) {
3056 Newx(s, 1, regmatch_slab);
3057 s->prev = PL_regmatch_slab;
3058 s->next = NULL;
3059 PL_regmatch_slab->next = s;
3060 }
3061 PL_regmatch_slab = s;
86545054 3062 return SLAB_FIRST(s);
5d9a96ca 3063}
5b47454d 3064
95b24440 3065
40a82448
DM
3066/* push a new state then goto it */
3067
4d5016e5
DM
3068#define PUSH_STATE_GOTO(state, node, input) \
3069 pushinput = input; \
40a82448
DM
3070 scan = node; \
3071 st->resume_state = state; \
3072 goto push_state;
3073
3074/* push a new state with success backtracking, then goto it */
3075
4d5016e5
DM
3076#define PUSH_YES_STATE_GOTO(state, node, input) \
3077 pushinput = input; \
40a82448
DM
3078 scan = node; \
3079 st->resume_state = state; \
3080 goto push_yes_state;
3081
aa283a38 3082
aa283a38 3083
4d5016e5 3084
d6a28714 3085/*
95b24440 3086
bf1f174e
DM
3087regmatch() - main matching routine
3088
3089This is basically one big switch statement in a loop. We execute an op,
3090set 'next' to point the next op, and continue. If we come to a point which
3091we may need to backtrack to on failure such as (A|B|C), we push a
3092backtrack state onto the backtrack stack. On failure, we pop the top
3093state, and re-enter the loop at the state indicated. If there are no more
3094states to pop, we return failure.
3095
3096Sometimes we also need to backtrack on success; for example /A+/, where
3097after successfully matching one A, we need to go back and try to
3098match another one; similarly for lookahead assertions: if the assertion
3099completes successfully, we backtrack to the state just before the assertion
3100and then carry on. In these cases, the pushed state is marked as
3101'backtrack on success too'. This marking is in fact done by a chain of
3102pointers, each pointing to the previous 'yes' state. On success, we pop to
3103the nearest yes state, discarding any intermediate failure-only states.
3104Sometimes a yes state is pushed just to force some cleanup code to be
3105called at the end of a successful match or submatch; e.g. (??{$re}) uses
3106it to free the inner regex.
3107
3108Note that failure backtracking rewinds the cursor position, while
3109success backtracking leaves it alone.
3110
3111A pattern is complete when the END op is executed, while a subpattern
3112such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3113ops trigger the "pop to last yes state if any, otherwise return true"
3114behaviour.
3115
3116A common convention in this function is to use A and B to refer to the two
3117subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3118the subpattern to be matched possibly multiple times, while B is the entire
3119rest of the pattern. Variable and state names reflect this convention.
3120
3121The states in the main switch are the union of ops and failure/success of
3122substates associated with with that op. For example, IFMATCH is the op
3123that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3124'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3125successfully matched A and IFMATCH_A_fail is a state saying that we have
3126just failed to match A. Resume states always come in pairs. The backtrack
3127state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3128at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3129on success or failure.
3130
3131The struct that holds a backtracking state is actually a big union, with
3132one variant for each major type of op. The variable st points to the
3133top-most backtrack struct. To make the code clearer, within each
3134block of code we #define ST to alias the relevant union.
3135
3136Here's a concrete example of a (vastly oversimplified) IFMATCH
3137implementation:
3138
3139 switch (state) {
3140 ....
3141
3142#define ST st->u.ifmatch
3143
3144 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3145 ST.foo = ...; // some state we wish to save
95b24440 3146 ...
bf1f174e
DM
3147 // push a yes backtrack state with a resume value of
3148 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3149 // first node of A:
4d5016e5 3150 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3151 // NOTREACHED
3152
3153 case IFMATCH_A: // we have successfully executed A; now continue with B
3154 next = B;
3155 bar = ST.foo; // do something with the preserved value
3156 break;
3157
3158 case IFMATCH_A_fail: // A failed, so the assertion failed
3159 ...; // do some housekeeping, then ...
3160 sayNO; // propagate the failure
3161
3162#undef ST
95b24440 3163
bf1f174e
DM
3164 ...
3165 }
95b24440 3166
bf1f174e
DM
3167For any old-timers reading this who are familiar with the old recursive
3168approach, the code above is equivalent to:
95b24440 3169
bf1f174e
DM
3170 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3171 {
3172 int foo = ...
95b24440 3173 ...
bf1f174e
DM
3174 if (regmatch(A)) {
3175 next = B;
3176 bar = foo;
3177 break;
95b24440 3178 }
bf1f174e
DM
3179 ...; // do some housekeeping, then ...
3180 sayNO; // propagate the failure
95b24440 3181 }
bf1f174e
DM
3182
3183The topmost backtrack state, pointed to by st, is usually free. If you
3184want to claim it, populate any ST.foo fields in it with values you wish to
3185save, then do one of
3186
4d5016e5
DM
3187 PUSH_STATE_GOTO(resume_state, node, newinput);
3188 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3189
3190which sets that backtrack state's resume value to 'resume_state', pushes a
3191new free entry to the top of the backtrack stack, then goes to 'node'.
3192On backtracking, the free slot is popped, and the saved state becomes the
3193new free state. An ST.foo field in this new top state can be temporarily
3194accessed to retrieve values, but once the main loop is re-entered, it
3195becomes available for reuse.
3196
3197Note that the depth of the backtrack stack constantly increases during the
3198left-to-right execution of the pattern, rather than going up and down with
3199the pattern nesting. For example the stack is at its maximum at Z at the
3200end of the pattern, rather than at X in the following:
3201
3202 /(((X)+)+)+....(Y)+....Z/
3203
3204The only exceptions to this are lookahead/behind assertions and the cut,
3205(?>A), which pop all the backtrack states associated with A before
3206continuing.
3207
486ec47a 3208Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3209PL_regmatch_state and st always point to the currently active state,
3210and PL_regmatch_slab points to the slab currently containing
3211PL_regmatch_state. The first time regmatch() is called, the first slab is
3212allocated, and is never freed until interpreter destruction. When the slab
3213is full, a new one is allocated and chained to the end. At exit from
3214regmatch(), slabs allocated since entry are freed.
3215
3216*/
95b24440 3217
40a82448 3218
5bc10b2c 3219#define DEBUG_STATE_pp(pp) \
265c4333 3220 DEBUG_STATE_r({ \
f2ed9b32 3221 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3222 PerlIO_printf(Perl_debug_log, \
5d458dd8 3223 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3224 depth*2, "", \
13d6edb4 3225 PL_reg_name[st->resume_state], \
5d458dd8
YO
3226 ((st==yes_state||st==mark_state) ? "[" : ""), \
3227 ((st==yes_state) ? "Y" : ""), \
3228 ((st==mark_state) ? "M" : ""), \
3229 ((st==yes_state||st==mark_state) ? "]" : "") \
3230 ); \
265c4333 3231 });
5bc10b2c 3232
40a82448 3233
3dab1dad 3234#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3235
3df15adc 3236#ifdef DEBUGGING
5bc10b2c 3237
ab3bbdeb 3238STATIC void
f2ed9b32 3239S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3240 const char *start, const char *end, const char *blurb)
3241{
efd26800 3242 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3243
3244 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3245
ab3bbdeb
YO
3246 if (!PL_colorset)
3247 reginitcolors();
3248 {
3249 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3250 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3251
f2ed9b32 3252 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3253 start, end - start, 60);
3254
3255 PerlIO_printf(Perl_debug_log,
3256 "%s%s REx%s %s against %s\n",
3257 PL_colors[4], blurb, PL_colors[5], s0, s1);
3258
f2ed9b32 3259 if (utf8_target||utf8_pat)
1de06328
YO
3260 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3261 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3262 utf8_pat && utf8_target ? " and " : "",
3263 utf8_target ? "string" : ""
ab3bbdeb
YO
3264 );
3265 }
3266}
3df15adc
YO
3267
3268STATIC void
786e8c11
YO
3269S_dump_exec_pos(pTHX_ const char *locinput,
3270 const regnode *scan,
3271 const char *loc_regeol,
3272 const char *loc_bostr,
3273 const char *loc_reg_starttry,
f2ed9b32 3274 const bool utf8_target)
07be1b83 3275{
786e8c11 3276 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3277 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3278 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3279 /* The part of the string before starttry has one color
3280 (pref0_len chars), between starttry and current
3281 position another one (pref_len - pref0_len chars),
3282 after the current position the third one.
3283 We assume that pref0_len <= pref_len, otherwise we
3284 decrease pref0_len. */
786e8c11
YO
3285 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3286 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3287 int pref0_len;
3288
7918f24d
NC
3289 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3290
f2ed9b32 3291 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3292 pref_len++;
786e8c11
YO
3293 pref0_len = pref_len - (locinput - loc_reg_starttry);
3294 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3295 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3296 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3297 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3298 l--;
3299 if (pref0_len < 0)
3300 pref0_len = 0;
3301 if (pref0_len > pref_len)
3302 pref0_len = pref_len;
3303 {
f2ed9b32 3304 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3305
ab3bbdeb 3306 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3307 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3308
ab3bbdeb 3309 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3310 (locinput - pref_len + pref0_len),
1de06328 3311 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3312
ab3bbdeb 3313 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3314 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3315
1de06328 3316 const STRLEN tlen=len0+len1+len2;
3df15adc 3317 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3318 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3319 (IV)(locinput - loc_bostr),
07be1b83 3320 len0, s0,
07be1b83 3321 len1, s1,
07be1b83 3322 (docolor ? "" : "> <"),
07be1b83 3323 len2, s2,
f9f4320a 3324 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3325 "");
3326 }
3327}
3df15adc 3328
07be1b83
YO
3329#endif
3330
0a4db386
YO
3331/* reg_check_named_buff_matched()
3332 * Checks to see if a named buffer has matched. The data array of
3333 * buffer numbers corresponding to the buffer is expected to reside
3334 * in the regexp->data->data array in the slot stored in the ARG() of
3335 * node involved. Note that this routine doesn't actually care about the
3336 * name, that information is not preserved from compilation to execution.
3337 * Returns the index of the leftmost defined buffer with the given name
3338 * or 0 if non of the buffers matched.
3339 */
3340STATIC I32
7918f24d
NC
3341S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3342{
0a4db386 3343 I32 n;
f8fc2ecf 3344 RXi_GET_DECL(rex,rexi);
ad64d0ec 3345 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3346 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3347
3348 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3349
0a4db386 3350 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3351 if ((I32)rex->lastparen >= nums[n] &&
3352 rex->offs[nums[n]].end != -1)
0a4db386
YO
3353 {
3354 return nums[n];
3355 }
3356 }
3357 return 0;
3358}
3359
2f554ef7 3360
c74f6de9 3361static bool
984e6dd1 3362S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3363 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3364{
79a2a0e8
KW
3365 /* This function determines if there are one or two characters that match
3366 * the first character of the passed-in EXACTish node <text_node>, and if
3367 * so, returns them in the passed-in pointers.
c74f6de9 3368 *
79a2a0e8
KW
3369 * If it determines that no possible character in the target string can
3370 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3371 * the first character in <text_node> requires UTF-8 to represent, and the
3372 * target string isn't in UTF-8.)
c74f6de9 3373 *
79a2a0e8
KW
3374 * If there are more than two characters that could match the beginning of
3375 * <text_node>, or if more context is required to determine a match or not,
3376 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3377 *
3378 * The motiviation behind this function is to allow the caller to set up
3379 * tight loops for matching. If <text_node> is of type EXACT, there is
3380 * only one possible character that can match its first character, and so
3381 * the situation is quite simple. But things get much more complicated if
3382 * folding is involved. It may be that the first character of an EXACTFish
3383 * node doesn't participate in any possible fold, e.g., punctuation, so it
3384 * can be matched only by itself. The vast majority of characters that are
3385 * in folds match just two things, their lower and upper-case equivalents.
3386 * But not all are like that; some have multiple possible matches, or match
3387 * sequences of more than one character. This function sorts all that out.
3388 *
3389 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3390 * loop of trying to match A*, we know we can't exit where the thing
3391 * following it isn't a B. And something can't be a B unless it is the
3392 * beginning of B. By putting a quick test for that beginning in a tight
3393 * loop, we can rule out things that can't possibly be B without having to
3394 * break out of the loop, thus avoiding work. Similarly, if A is a single
3395 * character, we can make a tight loop matching A*, using the outputs of
3396 * this function.
3397 *
3398 * If the target string to match isn't in UTF-8, and there aren't
3399 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3400 * the one or two possible octets (which are characters in this situation)
3401 * that can match. In all cases, if there is only one character that can
3402 * match, *<c1p> and *<c2p> will be identical.
3403 *
3404 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3405 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3406 * can match the beginning of <text_node>. They should be declared with at
3407 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3408 * undefined what these contain.) If one or both of the buffers are
3409 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3410 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3411 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3412 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3413 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3414
ba44c216 3415 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3416
ddb0d839
KW
3417 UV c1 = CHRTEST_NOT_A_CP_1;
3418 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8 3419 bool use_chrtest_void = FALSE;
aed7b151 3420 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3421
3422 /* Used when we have both utf8 input and utf8 output, to avoid converting
3423 * to/from code points */
3424 bool utf8_has_been_setup = FALSE;
3425
c74f6de9
KW
3426 dVAR;
3427
b4291290 3428 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3429
79a2a0e8
KW
3430 if (OP(text_node) == EXACT) {
3431
3432 /* In an exact node, only one thing can be matched, that first
3433 * character. If both the pat and the target are UTF-8, we can just
3434 * copy the input to the output, avoiding finding the code point of
3435 * that character */
984e6dd1 3436 if (!is_utf8_pat) {
79a2a0e8
KW
3437 c2 = c1 = *pat;
3438 }
3439 else if (utf8_target) {
3440 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3441 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3442 utf8_has_been_setup = TRUE;
3443 }
3444 else {
3445 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3446 }
79a2a0e8
KW
3447 }
3448 else /* an EXACTFish node */
984e6dd1 3449 if ((is_utf8_pat
79a2a0e8
KW
3450 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3451 pat + STR_LEN(text_node)))
984e6dd1 3452 || (!is_utf8_pat
79a2a0e8
KW
3453 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3454 pat + STR_LEN(text_node))))
3455 {
3456 /* Multi-character folds require more context to sort out. Also
3457 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3458 * handled outside this routine */
3459 use_chrtest_void = TRUE;
3460 }
3461 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3462 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3463 if (c1 > 256) {
3464 /* Load the folds hash, if not already done */
3465 SV** listp;
3466 if (! PL_utf8_foldclosures) {
3467 if (! PL_utf8_tofold) {
632ccf21 3468 U8 dummy[UTF8_MAXBYTES_CASE+1];
79a2a0e8
KW
3469
3470 /* Force loading this by folding an above-Latin1 char */
3471 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3472 assert(PL_utf8_tofold); /* Verify that worked */
3473 }
3474 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3475 }
3476
3477 /* The fold closures data structure is a hash with the keys being
3478 * the UTF-8 of every character that is folded to, like 'k', and
3479 * the values each an array of all code points that fold to its
3480 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3481 * not included */
3482 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3483 (char *) pat,
3484 UTF8SKIP(pat),
3485 FALSE))))
3486 {
3487 /* Not found in the hash, therefore there are no folds
3488 * containing it, so there is only a single character that
3489 * could match */
3490 c2 = c1;
3491 }
3492 else { /* Does participate in folds */
3493 AV* list = (AV*) *listp;
3494 if (av_len(list) != 1) {
3495
3496 /* If there aren't exactly two folds to this, it is outside
3497 * the scope of this function */
3498 use_chrtest_void = TRUE;
3499 }
3500 else { /* There are two. Get them */
3501 SV** c_p = av_fetch(list, 0, FALSE);
3502 if (c_p == NULL) {
3503 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3504 }
3505 c1 = SvUV(*c_p);
3506
3507 c_p = av_fetch(list, 1, FALSE);
3508 if (c_p == NULL) {
3509 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3510 }
3511 c2 = SvUV(*c_p);
3512
3513 /* Folds that cross the 255/256 boundary are forbidden if
3514 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3515 * pattern character is above 256, and its only other match
3516 * is below 256, the only legal match will be to itself.
3517 * We have thrown away the original, so have to compute
3518 * which is the one above 255 */
3519 if ((c1 < 256) != (c2 < 256)) {
3520 if (OP(text_node) == EXACTFL
098b07d5
KW
3521 || ((OP(text_node) == EXACTFA
3522 || OP(text_node) == EXACTFA_NO_TRIE)
79a2a0e8
KW
3523 && (isASCII(c1) || isASCII(c2))))
3524 {
3525 if (c1 < 256) {
3526 c1 = c2;
3527 }
3528 else {
3529 c2 = c1;
3530 }
3531 }
3532 }
3533 }
3534 }
3535 }
3536 else /* Here, c1 is < 255 */
3537 if (utf8_target
3538 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3539 && OP(text_node) != EXACTFL
098b07d5
KW
3540 && ((OP(text_node) != EXACTFA
3541 && OP(text_node) != EXACTFA_NO_TRIE)
3542 || ! isASCII(c1)))
c74f6de9
KW
3543 {
3544 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3545 * folds to this character in the pattern. All such cases except
3546 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3547 * involved in their folds, so are outside the scope of this
3548 * function */
3549 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3550 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3551 }
3552 else {
3553 use_chrtest_void = TRUE;
3554 }
c74f6de9
KW
3555 }
3556 else { /* Here nothing above Latin1 can fold to the pattern character */
3557 switch (OP(text_node)) {
3558
3559 case EXACTFL: /* /l rules */
79a2a0e8 3560 c2 = PL_fold_locale[c1];
c74f6de9
KW
3561 break;
3562
2fdb7295
KW
3563 case EXACTF: /* This node only generated for non-utf8
3564 patterns */
3565 assert(! is_utf8_pat);
c74f6de9 3566 if (! utf8_target) { /* /d rules */
79a2a0e8 3567 c2 = PL_fold[c1];
c74f6de9
KW
3568 break;
3569 }
3570 /* FALLTHROUGH */
3571 /* /u rules for all these. This happens to work for
79a2a0e8 3572 * EXACTFA as nothing in Latin1 folds to ASCII */
098b07d5
KW
3573 case EXACTFA_NO_TRIE: /* This node only generated for
3574 non-utf8 patterns */
3575 assert(! is_utf8_pat);
3576 /* FALL THROUGH */
c74f6de9 3577 case EXACTFA:
79a2a0e8 3578 case EXACTFU_SS:
c74f6de9 3579 case EXACTFU:
79a2a0e8 3580 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3581 break;
3582
878531d3
KW
3583 default:
3584 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3585 assert(0); /* NOTREACHED */
c74f6de9
KW
3586 }
3587 }
3588 }
79a2a0e8
KW
3589
3590 /* Here have figured things out. Set up the returns */
3591 if (use_chrtest_void) {
3592 *c2p = *c1p = CHRTEST_VOID;
3593 }
3594 else if (utf8_target) {
3595 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3596 uvchr_to_utf8(c1_utf8, c1);
3597 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3598 }
c74f6de9 3599
79a2a0e8
KW
3600 /* Invariants are stored in both the utf8 and byte outputs; Use
3601 * negative numbers otherwise for the byte ones. Make sure that the
3602 * byte ones are the same iff the utf8 ones are the same */
3603 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3604 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3605 ? *c2_utf8
3606 : (c1 == c2)
3607 ? CHRTEST_NOT_A_CP_1
3608 : CHRTEST_NOT_A_CP_2;
3609 }
3610 else if (c1 > 255) {
3611 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3612 can represent */
3613 return FALSE;
3614 }
c74f6de9 3615
79a2a0e8
KW
3616 *c1p = *c2p = c2; /* c2 is the only representable value */
3617 }
3618 else { /* c1 is representable; see about c2 */
3619 *c1p = c1;
3620 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3621 }
2f554ef7 3622
c74f6de9
KW
3623 return TRUE;
3624}
2f554ef7 3625
f73aaa43 3626/* returns -1 on failure, $+[0] on success */
99a90e59 3627STATIC SSize_t
f73aaa43 3628S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3629{
a35a87e7 3630#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3631 dMY_CXT;
3632#endif
27da23d5 3633 dVAR;
ba44c216 3634 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 3635 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3636 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3637 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3638 RXi_GET_DECL(rex,rexi);
5d9a96ca 3639 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3640 regmatch_state *st;
5d9a96ca 3641 /* cache heavy used fields of st in registers */
eb578fdb
KW
3642 regnode *scan;
3643 regnode *next;
3644 U32 n = 0; /* general value; init to avoid compiler warning */
ea3daa5d 3645 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3646 char *locinput = startpos;
4d5016e5 3647 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3648 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3649
b69b0499 3650 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3651 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3652 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3653 const U32 max_nochange_depth =
3654 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3655 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3656 regmatch_state *yes_state = NULL; /* state to pop to on success of
3657 subpattern */
e2e6a0f1
YO
3658 /* mark_state piggy backs on the yes_state logic so that when we unwind
3659 the stack on success we can update the mark_state as we go */
3660 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3661 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3662 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3663 U32 state_num;
5d458dd8
YO
3664 bool no_final = 0; /* prevent failure from backtracking? */
3665 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3666 char *startpoint = locinput;
5d458dd8
YO
3667 SV *popmark = NULL; /* are we looking for a mark? */
3668 SV *sv_commit = NULL; /* last mark name seen in failure */
3669 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3670 during a successful match */
5d458dd8
YO
3671 U32 lastopen = 0; /* last open we saw */
3672 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4a2b275c 3673 SV* const oreplsv = GvSVn(PL_replgv);
24d3c4a9
DM
3674 /* these three flags are set by various ops to signal information to
3675 * the very next op. They have a useful lifetime of exactly one loop
3676 * iteration, and are not preserved or restored by state pushes/pops
3677 */
3678 bool sw = 0; /* the condition value in (?(cond)a|b) */
3679 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3680 int logical = 0; /* the following EVAL is:
3681 0: (?{...})
3682 1: (?(?{...})X|Y)
3683 2: (??{...})
3684 or the following IFMATCH/UNLESSM is:
3685 false: plain (?=foo)
3686 true: used as a condition: (?(?=foo))
3687 */
81ed78b2
DM
3688 PAD* last_pad = NULL;
3689 dMULTICALL;
3690 I32 gimme = G_SCALAR;
3691 CV *caller_cv = NULL; /* who called us */
3692 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3693 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3694 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3695 int to_complement; /* Invert the result? */
3696 _char_class_number classnum;
984e6dd1 3697 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3698
95b24440 3699#ifdef DEBUGGING
e68ec53f 3700 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3701#endif
3702
81ed78b2
DM
3703 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3704 multicall_oldcatch = 0;
3705 multicall_cv = NULL;
3706 cx = NULL;
4f8dbb2d
JL
3707 PERL_UNUSED_VAR(multicall_cop);
3708 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3709
3710
7918f24d
NC
3711 PERL_ARGS_ASSERT_REGMATCH;
3712
3b57cd43 3713 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3714 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3715 }));
5d9a96ca 3716
331b2dcc 3717 st = PL_regmatch_state;
5d9a96ca 3718
d6a28714 3719 /* Note that nextchr is a byte even in UTF */
7016d6eb 3720 SET_nextchr;
d6a28714
JH
3721 scan = prog;
3722 while (scan != NULL) {
8ba1375e 3723
a3621e74 3724 DEBUG_EXECUTE_r( {
6136c704 3725 SV * const prop = sv_newmortal();
1de06328 3726 regnode *rnext=regnext(scan);
f2ed9b32 3727 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3728 regprop(rex, prop, scan);
07be1b83
YO
3729
3730 PerlIO_printf(Perl_debug_log,
3731 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3732 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3733 SvPVX_const(prop),
1de06328 3734 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3735 0 : (IV)(rnext - rexi->program));
2a782b5b 3736 });
d6a28714
JH
3737
3738 next = scan + NEXT_OFF(scan);
3739 if (next == scan)
3740 next = NULL;
40a82448 3741 state_num = OP(scan);
d6a28714 3742
40a82448 3743 reenter_switch:
3018b823 3744 to_complement = 0;
34a81e2b 3745
7016d6eb 3746 SET_nextchr;
e6ca698c 3747 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3748
40a82448 3749 switch (state_num) {
3c0563b9 3750 case BOL: /* /^../ */
9d9163fb 3751 if (locinput == reginfo->strbeg)
b8c5462f 3752 break;
d6a28714 3753 sayNO;
3c0563b9
DM
3754
3755 case MBOL: /* /^../m */
9d9163fb 3756 if (locinput == reginfo->strbeg ||
7016d6eb 3757 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3758 {
b8c5462f
JH
3759 break;
3760 }
d6a28714 3761 sayNO;
3c0563b9
DM
3762
3763 case SBOL: /* /^../s */
9d9163fb 3764 if (locinput == reginfo->strbeg)
b8c5462f 3765 break;
d6a28714 3766 sayNO;
3c0563b9
DM
3767
3768 case GPOS: /* \G */
3b0527fe 3769 if (locinput == reginfo->ganch)
d6a28714
JH
3770 break;
3771 sayNO;
ee9b8eae 3772
3c0563b9 3773 case KEEPS: /* \K */
ee9b8eae 3774 /* update the startpoint */
b93070ed 3775 st->u.keeper.val = rex->offs[0].start;
9d9163fb 3776 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 3777 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3778 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3779 case KEEPS_next_fail:
3780 /* rollback the start point change */
b93070ed 3781 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3782 sayNO_SILENT;
878531d3 3783 assert(0); /*NOTREACHED*/
3c0563b9 3784
3c0563b9 3785 case MEOL: /* /..$/m */
7016d6eb 3786 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3787 sayNO;
b8c5462f 3788 break;
3c0563b9 3789
62e16618
KW
3790 case EOL: /* /..$/ */
3791 /* FALL THROUGH */
3c0563b9 3792 case SEOL: /* /..$/s */
7016d6eb 3793 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3794 sayNO;
220db18a 3795 if (reginfo->strend - locinput > 1)
b8c5462f 3796 sayNO;
b8c5462f 3797 break;
3c0563b9
DM
3798
3799 case EOS: /* \z */
7016d6eb 3800 if (!NEXTCHR_IS_EOS)
b8c5462f 3801 sayNO;
d6a28714 3802 break;
3c0563b9
DM
3803
3804 case SANY: /* /./s */
7016d6eb 3805 if (NEXTCHR_IS_EOS)
4633a7c4 3806 sayNO;
28b98f76 3807 goto increment_locinput;
3c0563b9
DM
3808
3809 case CANY: /* \C */
7016d6eb 3810 if (NEXTCHR_IS_EOS)
f33976b4 3811 sayNO;
3640db6b 3812 locinput++;
a0d0e21e 3813 break;
3c0563b9
DM
3814
3815 case REG_ANY: /* /./ */
7016d6eb 3816 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3817 sayNO;
28b98f76
DM
3818 goto increment_locinput;
3819
166ba7cd
DM
3820
3821#undef ST
3822#define ST st->u.trie
3c0563b9 3823 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3824 /* In this case the charclass data is available inline so
3825 we can fail fast without a lot of extra overhead.
3826 */
7016d6eb 3827 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3828 DEBUG_EXECUTE_r(
3829 PerlIO_printf(Perl_debug_log,
3830 "%*s %sfailed to match trie start class...%s\n",
3831 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3832 );
3833 sayNO_SILENT;
118e2215 3834 assert(0); /* NOTREACHED */
786e8c11
YO
3835 }
3836 /* FALL THROUGH */
3c0563b9 3837 case TRIE: /* (ab|cd) */
2e64971a
DM
3838 /* the basic plan of execution of the trie is:
3839 * At the beginning, run though all the states, and
3840 * find the longest-matching word. Also remember the position
3841 * of the shortest matching word. For example, this pattern:
3842 * 1 2 3 4 5
3843 * ab|a|x|abcd|abc
3844 * when matched against the string "abcde", will generate
3845 * accept states for all words except 3, with the longest
895cc420 3846 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3847 * the position being after char 1 of the string).
3848 *
3849 * Then for each matching word, in word order (i.e. 1,2,4,5),
3850 * we run the remainder of the pattern; on each try setting
3851 * the current position to the character following the word,
3852 * returning to try the next word on failure.
3853 *
3854 * We avoid having to build a list of words at runtime by
3855 * using a compile-time structure, wordinfo[].prev, which
3856 * gives, for each word, the previous accepting word (if any).
3857 * In the case above it would contain the mappings 1->2, 2->0,
3858 * 3->0, 4->5, 5->1. We can use this table to generate, from
3859 * the longest word (4 above), a list of all words, by
3860 * following the list of prev pointers; this gives us the
3861 * unordered list 4,5,1,2. Then given the current word we have
3862 * just tried, we can go through the list and find the
3863 * next-biggest word to try (so if we just failed on word 2,
3864 * the next in the list is 4).
3865 *
3866 * Since at runtime we don't record the matching position in
3867 * the string for each word, we have to work that out for
3868 * each word we're about to process. The wordinfo table holds
3869 * the character length of each word; given that we recorded
3870 * at the start: the position of the shortest word and its
3871 * length in chars, we just need to move the pointer the
3872 * difference between the two char lengths. Depending on
3873 * Unicode status and folding, that's cheap or expensive.
3874 *
3875 * This algorithm is optimised for the case where are only a
3876 * small number of accept states, i.e. 0,1, or maybe 2.
3877 * With lots of accepts states, and having to try all of them,
3878 * it becomes quadratic on number of accept states to find all
3879 * the next words.
3880 */
3881
3dab1dad 3882 {
07be1b83 3883 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3884 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3885
3886 /* what trie are we using right now */
be8e71aa 3887 reg_trie_data * const trie
f8fc2ecf 3888 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3889 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3890 U32 state = trie->startstate;
166ba7cd 3891
7016d6eb
DM
3892 if ( trie->bitmap
3893 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3894 {
3dab1dad
YO
3895 if (trie->states[ state ].wordnum) {
3896 DEBUG_EXECUTE_r(
3897 PerlIO_printf(Perl_debug_log,
3898 "%*s %smatched empty string...%s\n",
5bc10b2c 3899 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3900 );
20dbff7c
YO
3901 if (!trie->jump)
3902 break;
3dab1dad
YO
3903 } else {
3904 DEBUG_EXECUTE_r(
3905 PerlIO_printf(Perl_debug_log,
786e8c11 3906 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3907 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3908 );
3909 sayNO_SILENT;
3910 }
3911 }
166ba7cd 3912
786e8c11
YO
3913 {
3914 U8 *uc = ( U8* )locinput;
3915
3916 STRLEN len = 0;
3917 STRLEN foldlen = 0;
3918 U8 *uscan = (U8*)NULL;
786e8c11 3919 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3920 U32 charcount = 0; /* how many input chars we have matched */
3921 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3922
786e8c11 3923 ST.jump = trie->jump;
786e8c11 3924 ST.me = scan;
2e64971a
DM
3925 ST.firstpos = NULL;
3926 ST.longfold = FALSE; /* char longer if folded => it's harder */
3927 ST.nextword = 0;
3928
3929 /* fully traverse the TRIE; note the position of the
3930 shortest accept state and the wordnum of the longest
3931 accept state */
07be1b83 3932
220db18a 3933 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 3934 U32 base = trie->states[ state ].trans.base;
f9f4320a 3935 UV uvc = 0;
acb909b4 3936 U16 charid = 0;
2e64971a
DM
3937 U16 wordnum;
3938 wordnum = trie->states[ state ].wordnum;
3939
3940 if (wordnum) { /* it's an accept state */
3941 if (!accepted) {
3942 accepted = 1;
3943 /* record first match position */
3944 if (ST.longfold) {
3945 ST.firstpos = (U8*)locinput;
3946 ST.firstchars = 0;
5b47454d 3947 }
2e64971a
DM
3948 else {
3949 ST.firstpos = uc;
3950 ST.firstchars = charcount;
3951 }
3952 }
3953 if (!ST.nextword || wordnum < ST.nextword)
3954 ST.nextword = wordnum;
3955 ST.topword = wordnum;
786e8c11 3956 }
a3621e74 3957
07be1b83 3958 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3959 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3960 PerlIO_printf( Perl_debug_log,
2e64971a 3961 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3962 2+depth * 2, "", PL_colors[4],
2e64971a 3963 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3964 });
a3621e74 3965
2e64971a 3966 /* read a char and goto next state */
220db18a 3967 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 3968 I32 offset;
55eed653
NC
3969 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3970 uscan, len, uvc, charid, foldlen,
3971 foldbuf, uniflags);
2e64971a
DM
3972 charcount++;
3973 if (foldlen>0)
3974 ST.longfold = TRUE;
5b47454d 3975 if (charid &&
6dd2be57
DM
3976 ( ((offset =
3977 base + charid - 1 - trie->uniquecharcount)) >= 0)
3978
3979 && ((U32)offset < trie->lasttrans)
3980 && trie->trans[offset].check == state)
5b47454d 3981 {
6dd2be57 3982 state = trie->trans[offset].next;
5b47454d
DM
3983 }
3984 else {
3985 state = 0;
3986 }
3987 uc += len;
3988
3989 }
3990 else {
a3621e74
YO
3991 state = 0;
3992 }
3993 DEBUG_TRIE_EXECUTE_r(
e4584336 3994 PerlIO_printf( Perl_debug_log,
786e8c11 3995 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3996 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3997 );
3998 }
2e64971a 3999 if (!accepted)
a3621e74 4000 sayNO;
a3621e74 4001
2e64971a
DM
4002 /* calculate total number of accept states */
4003 {
4004 U16 w = ST.topword;
4005 accepted = 0;
4006 while (w) {
4007 w = trie->wordinfo[w].prev;
4008 accepted++;
4009 }
4010 ST.accepted = accepted;
4011 }
4012
166ba7cd
DM
4013 DEBUG_EXECUTE_r(
4014 PerlIO_printf( Perl_debug_log,
4015 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 4016 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
4017 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4018 );
2e64971a 4019 goto trie_first_try; /* jump into the fail handler */
786e8c11 4020 }}
118e2215 4021 assert(0); /* NOTREACHED */
2e64971a
DM
4022
4023 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
4024 {
4025 U8 *uc;
fae667d5
YO
4026 if ( ST.jump) {
4027 REGCP_UNWIND(ST.cp);
a8d1f4b4 4028 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 4029 }
2e64971a
DM
4030 if (!--ST.accepted) {
4031 DEBUG_EXECUTE_r({
4032 PerlIO_printf( Perl_debug_log,
4033 "%*s %sTRIE failed...%s\n",
4034 REPORT_CODE_OFF+depth*2, "",
4035 PL_colors[4],
4036 PL_colors[5] );
4037 });
4038 sayNO_SILENT;
4039 }
4040 {
4041 /* Find next-highest word to process. Note that this code
4042 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
4043 U16 min = 0;
4044 U16 word;
4045 U16 const nextword = ST.nextword;
4046 reg_trie_wordinfo * const wordinfo
2e64971a
DM
4047 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4048 for (word=ST.topword; word; word=wordinfo[word].prev) {
4049 if (word > nextword && (!min || word < min))
4050 min = word;
4051 }
4052 ST.nextword = min;
4053 }
4054
fae667d5 4055 trie_first_try:
5d458dd8
YO
4056 if (do_cutgroup) {
4057 do_cutgroup = 0;
4058 no_final = 0;
4059 }
fae667d5
YO
4060
4061 if ( ST.jump) {
b93070ed 4062 ST.lastparen = rex->lastparen;
f6033a9d 4063 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 4064 REGCP_SET(ST.cp);
2e64971a 4065 }
a3621e74 4066
2e64971a 4067 /* find start char of end of current word */
166ba7cd 4068 {
2e64971a 4069 U32 chars; /* how many chars to skip */
2e64971a
DM
4070 reg_trie_data * const trie
4071 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4072
4073 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4074 >= ST.firstchars);
4075 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4076 - ST.firstchars;
a059a757 4077 uc = ST.firstpos;
2e64971a
DM
4078
4079 if (ST.longfold) {
4080 /* the hard option - fold each char in turn and find
4081 * its folded length (which may be different */
4082 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4083 STRLEN foldlen;
4084 STRLEN len;
d9a396a3 4085 UV uvc;
2e64971a
DM
4086 U8 *uscan;
4087
4088 while (chars) {
f2ed9b32 4089 if (utf8_target) {
c80e42f3 4090 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
2e64971a
DM
4091 uniflags);
4092 uc += len;
4093 }
4094 else {
4095 uvc = *uc;
4096 uc++;
4097 }
4098 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4099 uscan = foldbuf;
4100 while (foldlen) {
4101 if (!--chars)
4102 break;
c80e42f3 4103 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
2e64971a
DM
4104 uniflags);
4105 uscan += len;
4106 foldlen -= len;
4107 }
4108 }
a3621e74 4109 }
2e64971a 4110 else {
f2ed9b32 4111 if (utf8_target)
2e64971a
DM
4112 while (chars--)
4113 uc += UTF8SKIP(uc);
4114 else
4115 uc += chars;
4116 }
2e64971a 4117 }
166ba7cd 4118
6603fe3e
DM
4119 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4120 ? ST.jump[ST.nextword]
4121 : NEXT_OFF(ST.me));
166ba7cd 4122
2e64971a
DM
4123 DEBUG_EXECUTE_r({
4124 PerlIO_printf( Perl_debug_log,
4125 "%*s %sTRIE matched word #%d, continuing%s\n",
4126 REPORT_CODE_OFF+depth*2, "",
4127 PL_colors[4],
4128 ST.nextword,
4129 PL_colors[5]
4130 );
4131 });
4132
4133 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4134 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4135 assert(0); /* NOTREACHED */
166ba7cd 4136 }
2e64971a
DM
4137 /* only one choice left - just continue */
4138 DEBUG_EXECUTE_r({
4139 AV *const trie_words
4140 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4141 SV ** const tmp = av_fetch( trie_words,
4142 ST.nextword-1, 0 );
4143 SV *sv= tmp ? sv_newmortal() : NULL;
4144
4145 PerlIO_printf( Perl_debug_log,
4146 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4147 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4148 ST.nextword,
4149 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4150 PL_colors[0], PL_colors[1],
c89df6cf 4151 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4152 )
4153 : "not compiled under -Dr",
4154 PL_colors[5] );
4155 });
4156
a059a757 4157 locinput = (char*)uc;
2e64971a 4158 continue; /* execute rest of RE */
118e2215 4159 assert(0); /* NOTREACHED */
a059a757 4160 }
166ba7cd
DM
4161#undef ST
4162
3c0563b9 4163 case EXACT: { /* /abc/ */
95b24440 4164 char *s = STRING(scan);
24d3c4a9 4165 ln = STR_LEN(scan);
984e6dd1 4166 if (utf8_target != is_utf8_pat) {
bc517b45 4167 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4168 char *l = locinput;
24d3c4a9 4169 const char * const e = s + ln;
a72c7584 4170
f2ed9b32 4171 if (utf8_target) {
e6a3850e
KW
4172 /* The target is utf8, the pattern is not utf8.
4173 * Above-Latin1 code points can't match the pattern;
4174 * invariants match exactly, and the other Latin1 ones need
4175 * to be downgraded to a single byte in order to do the
4176 * comparison. (If we could be confident that the target
4177 * is not malformed, this could be refactored to have fewer
4178 * tests by just assuming that if the first bytes match, it
4179 * is an invariant, but there are tests in the test suite
4180 * dealing with (??{...}) which violate this) */
1aa99e6b 4181 while (s < e) {
220db18a
DM
4182 if (l >= reginfo->strend
4183 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4184 {
e6a3850e
KW
4185 sayNO;
4186 }
4187 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4188 if (*l != *s) {
4189 sayNO;
4190 }
4191 l++;
4192 }
4193 else {
94bb8c36
KW
4194 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4195 {
e6a3850e
KW
4196 sayNO;
4197 }
4198 l += 2;
4199 }
4200 s++;
1aa99e6b 4201 }
5ff6fc6d
JH
4202 }
4203 else {
4204 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4205 while (s < e) {
220db18a
DM
4206 if (l >= reginfo->strend
4207 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
4208 {
4209 sayNO;
4210 }
4211 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4212 if (*s != *l) {
4213 sayNO;
4214 }
4215 s++;
4216 }
4217 else {
94bb8c36
KW
4218 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4219 {
e6a3850e
KW
4220 sayNO;
4221 }
4222 s += 2;
4223 }
4224 l++;
1aa99e6b 4225 }
5ff6fc6d 4226 }
1aa99e6b 4227 locinput = l;
1aa99e6b 4228 }
5ac65bff
KW
4229 else {
4230 /* The target and the pattern have the same utf8ness. */
4231 /* Inline the first character, for speed. */
220db18a 4232 if (reginfo->strend - locinput < ln
5ac65bff
KW
4233 || UCHARAT(s) != nextchr
4234 || (ln > 1 && memNE(s, locinput, ln)))
4235 {
4236 sayNO;
4237 }
4238 locinput += ln;
4239 }
d6a28714 4240 break;
95b24440 4241 }
7016d6eb 4242
3c0563b9 4243 case EXACTFL: { /* /abc/il */
a932d541 4244 re_fold_t folder;
9a5a5549
KW
4245 const U8 * fold_array;
4246 const char * s;
d513472c 4247 U32 fold_utf8_flags;
9a5a5549 4248
272d35c9 4249 RX_MATCH_TAINTED_on(reginfo->prog);
f67f9e53
KW
4250 folder = foldEQ_locale;
4251 fold_array = PL_fold_locale;
17580e7a 4252 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4253 goto do_exactf;
4254
3c0563b9 4255 case EXACTFU_SS: /* /\x{df}/iu */
3c0563b9 4256 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4257 folder = foldEQ_latin1;
4258 fold_array = PL_fold_latin1;
984e6dd1 4259 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4260 goto do_exactf;
4261
098b07d5
KW
4262 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4263 patterns */
4264 assert(! is_utf8_pat);
4265 /* FALL THROUGH */
3c0563b9 4266 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4267 folder = foldEQ_latin1;
4268 fold_array = PL_fold_latin1;
57014d77 4269 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4270 goto do_exactf;
4271
2fdb7295
KW
4272 case EXACTF: /* /abc/i This node only generated for
4273 non-utf8 patterns */
4274 assert(! is_utf8_pat);
9a5a5549
KW
4275 folder = foldEQ;
4276 fold_array = PL_fold;
62bf7766 4277 fold_utf8_flags = 0;
9a5a5549
KW
4278
4279 do_exactf:
4280 s = STRING(scan);
24d3c4a9 4281 ln = STR_LEN(scan);
d6a28714 4282
984e6dd1 4283 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
3c760661
KW
4284 /* Either target or the pattern are utf8, or has the issue where
4285 * the fold lengths may differ. */
be8e71aa 4286 const char * const l = locinput;
220db18a 4287 char *e = reginfo->strend;
bc517b45 4288
984e6dd1 4289 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4290 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4291 {
4292 sayNO;
5486206c 4293 }
d07ddd77 4294 locinput = e;
d07ddd77 4295 break;
a0ed51b3 4296 }
d6a28714 4297
0a138b74 4298 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4299 if (UCHARAT(s) != nextchr
4300 && !NEXTCHR_IS_EOS
4301 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4302 {
a0ed51b3 4303 sayNO;
9a5a5549 4304 }
220db18a 4305 if (reginfo->strend - locinput < ln)
b8c5462f 4306 sayNO;
9a5a5549 4307 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4308 sayNO;
24d3c4a9 4309 locinput += ln;
a0d0e21e 4310 break;
9a5a5549 4311 }
63ac0dad
KW
4312
4313 /* XXX Could improve efficiency by separating these all out using a
4314 * macro or in-line function. At that point regcomp.c would no longer
4315 * have to set the FLAGS fields of these */
3c0563b9
DM
4316 case BOUNDL: /* /\b/l */
4317 case NBOUNDL: /* /\B/l */
272d35c9 4318 RX_MATCH_TAINTED_on(reginfo->prog);
b2680017 4319 /* FALL THROUGH */
3c0563b9
DM
4320 case BOUND: /* /\b/ */
4321 case BOUNDU: /* /\b/u */
4322 case BOUNDA: /* /\b/a */
4323 case NBOUND: /* /\B/ */
4324 case NBOUNDU: /* /\B/u */
4325 case NBOUNDA: /* /\B/a */
b2680017 4326 /* was last char in word? */
f2e96b5d
KW
4327 if (utf8_target
4328 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4329 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4330 {
9d9163fb 4331 if (locinput == reginfo->strbeg)
b2680017
YO
4332 ln = '\n';
4333 else {
9d9163fb
DM
4334 const U8 * const r =
4335 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
b2680017 4336
3db24e1e
KW
4337 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4338 0, uniflags);
b2680017 4339 }
63ac0dad 4340 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4341 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4342 if (NEXTCHR_IS_EOS)
4343 n = 0;
4344 else {
4345 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4346 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4347 utf8_target);
4348 }
b2680017
YO
4349 }
4350 else {
f4cd282c 4351 ln = isWORDCHAR_LC_uvchr(ln);
0eb30aeb 4352 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4353 }
4354 }
4355 else {
cfaf538b
KW
4356
4357 /* Here the string isn't utf8, or is utf8 and only ascii
4358 * characters are to match \w. In the latter case looking at
4359 * the byte just prior to the current one may be just the final
4360 * byte of a multi-byte character. This is ok. There are two
4361 * cases:
4362 * 1) it is a single byte character, and then the test is doing
4363 * just what it's supposed to.
4364 * 2) it is a multi-byte character, in which case the final
4365 * byte is never mistakable for ASCII, and so the test
4366 * will say it is not a word character, which is the
4367 * correct answer. */
9d9163fb 4368 ln = (locinput != reginfo->strbeg) ?
b2680017 4369 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4370 switch (FLAGS(scan)) {
4371 case REGEX_UNICODE_CHARSET:
4372 ln = isWORDCHAR_L1(ln);
7016d6eb 4373 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4374 break;
4375 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4376 ln = isWORDCHAR_LC(ln);
4377 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4378 break;
4379 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4380 ln = isWORDCHAR(ln);
4381 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4382 break;
cfaf538b 4383 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4384 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4385 ln = isWORDCHAR_A(ln);
7016d6eb 4386 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4387 break;
63ac0dad
KW
4388 default:
4389 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4390 break;
b2680017
YO
4391 }
4392 }
63ac0dad
KW
4393 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4394 * regcomp.sym */
4395 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4396 sayNO;
4397 break;
3c0563b9 4398
3c0563b9 4399 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4400 if (NEXTCHR_IS_EOS)
4401 sayNO;
e0193e47 4402 if (utf8_target) {
3db24e1e
KW
4403 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4404 utf8_target))
09b08e9b 4405 sayNO;
635cd5d4 4406 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4407 }
4408 else {
20ed0b26 4409 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4410 sayNO;
3640db6b 4411 locinput++;
e0f9d4a8 4412 }
b8c5462f 4413 break;
3c0563b9 4414
3018b823
KW
4415 /* The argument (FLAGS) to all the POSIX node types is the class number
4416 * */
ee9a90b8 4417
3018b823
KW
4418 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4419 to_complement = 1;
4420 /* FALLTHROUGH */
4421
4422 case POSIXL: /* \w or [:punct:] etc. under /l */
4423 if (NEXTCHR_IS_EOS)
bedac28b 4424 sayNO;
bedac28b 4425
3018b823
KW
4426 /* The locale hasn't influenced the outcome before this, so defer
4427 * tainting until now */
272d35c9 4428 RX_MATCH_TAINTED_on(reginfo->prog);
3018b823
KW
4429
4430 /* Use isFOO_lc() for characters within Latin1. (Note that
4431 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4432 * wouldn't be invariant) */
4433 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4434 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4435 sayNO;
4436 }
4437 }
3018b823
KW
4438 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4439 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
94bb8c36 4440 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823
KW
4441 *(locinput + 1))))))
4442 {
bedac28b 4443 sayNO;
3018b823 4444 }
bedac28b 4445 }
3018b823
KW
4446 else { /* Here, must be an above Latin-1 code point */
4447 goto utf8_posix_not_eos;
bedac28b 4448 }
3018b823
KW
4449
4450 /* Here, must be utf8 */
4451 locinput += UTF8SKIP(locinput);
bedac28b
KW
4452 break;
4453
3018b823
KW
4454 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4455 to_complement = 1;
4456 /* FALLTHROUGH */
4457
4458 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4459 if (utf8_target) {
3018b823 4460 goto utf8_posix;
bedac28b 4461 }
3018b823
KW
4462 goto posixa;
4463
4464 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4465
3018b823 4466 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4467 sayNO;
4468 }
bedac28b 4469
3018b823
KW
4470 /* All UTF-8 variants match */
4471 if (! UTF8_IS_INVARIANT(nextchr)) {
4472 goto increment_locinput;
bedac28b 4473 }
ee9a90b8 4474
3018b823
KW
4475 to_complement = 1;
4476 /* FALLTHROUGH */
4477
4478 case POSIXA: /* \w or [:punct:] etc. under /a */
4479
4480 posixa:
4481 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4482 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4483 * character is a single byte */
20d0b1e9 4484
3018b823
KW
4485 if (NEXTCHR_IS_EOS
4486 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4487 FLAGS(scan)))))
4488 {
0658cdde
KW
4489 sayNO;
4490 }
3018b823
KW
4491
4492 /* Here we are either not in utf8, or we matched a utf8-invariant,
4493 * so the next char is the next byte */
3640db6b 4494 locinput++;
0658cdde 4495 break;
3c0563b9 4496
3018b823
KW
4497 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4498 to_complement = 1;
4499 /* FALLTHROUGH */
4500
4501 case POSIXU: /* \w or [:punct:] etc. under /u */
4502 utf8_posix:
4503 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4504 sayNO;
4505 }
3018b823
KW
4506 utf8_posix_not_eos:
4507
4508 /* Use _generic_isCC() for characters within Latin1. (Note that
4509 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4510 * wouldn't be invariant) */
4511 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4512 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4513 FLAGS(scan)))))
4514 {
4515 sayNO;
4516 }
4517 locinput++;
4518 }
4519 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4520 if (! (to_complement
94bb8c36 4521 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823 4522 *(locinput + 1)),
94bb8c36 4523 FLAGS(scan)))))
3018b823
KW
4524 {
4525 sayNO;
4526 }
4527 locinput += 2;
4528 }
4529 else { /* Handle above Latin-1 code points */
4530 classnum = (_char_class_number) FLAGS(scan);
4531 if (classnum < _FIRST_NON_SWASH_CC) {
4532
4533 /* Here, uses a swash to find such code points. Load if if
4534 * not done already */
4535 if (! PL_utf8_swash_ptrs[classnum]) {
4536 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4537 PL_utf8_swash_ptrs[classnum]
4538 = _core_swash_init("utf8",
2a16ac92
KW
4539 "",
4540 &PL_sv_undef, 1, 0,
4541 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
4542 }
4543 if (! (to_complement
4544 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4545 (U8 *) locinput, TRUE))))
4546 {
4547 sayNO;
4548 }
4549 }
4550 else { /* Here, uses macros to find above Latin-1 code points */
4551 switch (classnum) {
4552 case _CC_ENUM_SPACE: /* XXX would require separate
4553 code if we revert the change
4554 of \v matching this */
4555 case _CC_ENUM_PSXSPC:
4556 if (! (to_complement
4557 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4558 {
4559 sayNO;
4560 }
4561 break;
4562 case _CC_ENUM_BLANK:
4563 if (! (to_complement
4564 ^ cBOOL(is_HORIZWS_high(locinput))))
4565 {
4566 sayNO;
4567 }
4568 break;
4569 case _CC_ENUM_XDIGIT:
4570 if (! (to_complement
4571 ^ cBOOL(is_XDIGIT_high(locinput))))
4572 {
4573 sayNO;
4574 }
4575 break;
4576 case _CC_ENUM_VERTSPACE:
4577 if (! (to_complement
4578 ^ cBOOL(is_VERTWS_high(locinput))))
4579 {
4580 sayNO;
4581 }
4582 break;
4583 default: /* The rest, e.g. [:cntrl:], can't match
4584 above Latin1 */
4585 if (! to_complement) {
4586 sayNO;
4587 }
4588 break;
4589 }
4590 }
4591 locinput += UTF8SKIP(locinput);
4592 }
4593 break;
0658cdde 4594
37e2e78e
KW
4595 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4596 a Unicode extended Grapheme Cluster */
4597 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4598 extended Grapheme Cluster is:
4599
7aee35ff
KW
4600 CR LF
4601 | Prepend* Begin Extend*
4602 | .
37e2e78e 4603
7aee35ff
KW
4604 Begin is: ( Special_Begin | ! Control )
4605 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4606 Extend is: ( Grapheme_Extend | Spacing_Mark )
4607 Control is: [ GCB_Control | CR | LF ]
4608 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4609
27d4fc33
KW
4610 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4611 we can rewrite
4612
4613 Begin is ( Regular_Begin + Special Begin )
4614
4615 It turns out that 98.4% of all Unicode code points match
4616 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4617 the previous implementation for almost all Unicode code points.
27d4fc33 4618
37e2e78e
KW
4619 There is a subtlety with Prepend* which showed up in testing.
4620 Note that the Begin, and only the Begin is required in:
4621 | Prepend* Begin Extend*
cc3b396d
KW
4622 Also, Begin contains '! Control'. A Prepend must be a
4623 '! Control', which means it must also be a Begin. What it
4624 comes down to is that if we match Prepend* and then find no
4625 suitable Begin afterwards, that if we backtrack the last
4626 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4627 */
4628
7016d6eb 4629 if (NEXTCHR_IS_EOS)
a0ed51b3 4630 sayNO;
f2ed9b32 4631 if (! utf8_target) {
37e2e78e
KW
4632
4633 /* Match either CR LF or '.', as all the other possibilities
4634 * require utf8 */
4635 locinput++; /* Match the . or CR */
cc3b396d
KW
4636 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4637 match the LF */
220db18a 4638 && locinput < reginfo->strend
e699a1d5
KW
4639 && UCHARAT(locinput) == '\n')
4640 {
4641 locinput++;
4642 }
37e2e78e
KW
4643 }
4644 else {
4645
4646 /* Utf8: See if is ( CR LF ); already know that locinput <
220db18a
DM
4647 * reginfo->strend, so locinput+1 is in bounds */
4648 if ( nextchr == '\r' && locinput+1 < reginfo->strend
f67f9e53 4649 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4650 {
37e2e78e
KW
4651 locinput += 2;
4652 }
4653 else {
45fdf108
KW
4654 STRLEN len;
4655
37e2e78e
KW
4656 /* In case have to backtrack to beginning, then match '.' */
4657 char *starting = locinput;
4658
4659 /* In case have to backtrack the last prepend */
e699a1d5 4660 char *previous_prepend = NULL;
37e2e78e
KW
4661
4662 LOAD_UTF8_CHARCLASS_GCB();
4663
45fdf108 4664 /* Match (prepend)* */
220db18a 4665 while (locinput < reginfo->strend
45fdf108
KW
4666 && (len = is_GCB_Prepend_utf8(locinput)))
4667 {
4668 previous_prepend = locinput;
4669 locinput += len;
a1853d78 4670 }
37e2e78e
KW
4671
4672 /* As noted above, if we matched a prepend character, but
4673 * the next thing won't match, back off the last prepend we
4674 * matched, as it is guaranteed to match the begin */
4675 if (previous_prepend
220db18a 4676 && (locinput >= reginfo->strend
c101f46d
KW
4677 || (! swash_fetch(PL_utf8_X_regular_begin,
4678 (U8*)locinput, utf8_target)
bff53399 4679 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4680 )
37e2e78e
KW
4681 {
4682 locinput = previous_prepend;
4683 }
4684
220db18a 4685 /* Note that here we know reginfo->strend > locinput, as we
37e2e78e
KW
4686 * tested that upon input to this switch case, and if we
4687 * moved locinput forward, we tested the result just above
4688 * and it either passed, or we backed off so that it will
4689 * now pass */
11dfcd49
KW
4690 if (swash_fetch(PL_utf8_X_regular_begin,
4691 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4692 locinput += UTF8SKIP(locinput);
4693 }
bff53399 4694 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4695
4696 /* Here did not match the required 'Begin' in the
4697 * second term. So just match the very first
4698 * character, the '.' of the final term of the regex */
4699 locinput = starting + UTF8SKIP(starting);
27d4fc33 4700 goto exit_utf8;
37e2e78e
KW
4701 } else {
4702
11dfcd49
KW
4703 /* Here is a special begin. It can be composed of
4704 * several individual characters. One possibility is
4705 * RI+ */
45fdf108
KW
4706 if ((len = is_GCB_RI_utf8(locinput))) {
4707 locinput += len;
220db18a 4708 while (locinput < reginfo->strend
45fdf108 4709 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4710 {
45fdf108 4711 locinput += len;
11dfcd49 4712 }
45fdf108
KW
4713 } else if ((len = is_GCB_T_utf8(locinput))) {
4714 /* Another possibility is T+ */
4715 locinput += len;
220db18a 4716 while (locinput < reginfo->strend
45fdf108 4717 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4718 {
45fdf108 4719 locinput += len;
11dfcd49
KW
4720 }
4721 } else {
4722
4723 /* Here, neither RI+ nor T+; must be some other
4724 * Hangul. That means it is one of the others: L,
4725 * LV, LVT or V, and matches:
4726 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4727
4728 /* Match L* */
220db18a 4729 while (locinput < reginfo->strend
45fdf108 4730 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4731 {
45fdf108 4732 locinput += len;
11dfcd49 4733 }
37e2e78e 4734
11dfcd49
KW
4735 /* Here, have exhausted L*. If the next character
4736 * is not an LV, LVT nor V, it means we had to have
4737 * at least one L, so matches L+ in the original
4738 * equation, we have a complete hangul syllable.
4739 * Are done. */
4740
220db18a 4741 if (locinput < reginfo->strend
45fdf108 4742 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4743 {
11dfcd49 4744 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4745 * See if LVT, by first ruling out V, then LV */
4746 if (! is_GCB_V_utf8(locinput)
4747 /* All but every TCount one is LV */
4748 && (valid_utf8_to_uvchr((U8 *) locinput,
4749 NULL)
4750 - SBASE)
4751 % TCount != 0)
4752 {
11dfcd49
KW
4753 locinput += UTF8SKIP(locinput);
4754 } else {
4755
4756 /* Must be V or LV. Take it, then match
4757 * V* */
4758 locinput += UTF8SKIP(locinput);
220db18a 4759 while (locinput < reginfo->strend
45fdf108 4760 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4761 {
45fdf108 4762 locinput += len;
11dfcd49
KW
4763 }
4764 }
37e2e78e 4765
11dfcd49 4766 /* And any of LV, LVT, or V can be followed
45fdf108 4767 * by T* */
220db18a 4768 while (locinput < reginfo->strend
45fdf108 4769 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4770 {
45fdf108 4771 locinput += len;
11dfcd49
KW
4772 }
4773 }
cd94d768 4774 }
11dfcd49 4775 }
37e2e78e 4776
11dfcd49 4777 /* Match any extender */
220db18a 4778 while (locinput < reginfo->strend
11dfcd49
KW
4779 && swash_fetch(PL_utf8_X_extend,
4780 (U8*)locinput, utf8_target))
4781 {
4782 locinput += UTF8SKIP(locinput);
4783 }
37e2e78e 4784 }
27d4fc33 4785 exit_utf8:
220db18a 4786 if (locinput > reginfo->strend) sayNO;
37e2e78e 4787 }
a0ed51b3 4788 break;
81714fb9 4789
3c0563b9 4790 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4791 { /* The capture buffer cases. The ones beginning with N for the
4792 named buffers just convert to the equivalent numbered and
4793 pretend they were called as the corresponding numbered buffer
4794 op. */
26ecd678
TC
4795 /* don't initialize these in the declaration, it makes C++
4796 unhappy */
9d9163fb 4797 const char *s;
ff1157ca 4798 char type;
8368298a
TC
4799 re_fold_t folder;
4800 const U8 *fold_array;
26ecd678 4801 UV utf8_fold_flags;
8368298a 4802
272d35c9 4803 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4804 folder = foldEQ_locale;
4805 fold_array = PL_fold_locale;
4806 type = REFFL;
17580e7a 4807 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4808 goto do_nref;
4809
3c0563b9 4810 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4811 folder = foldEQ_latin1;
4812 fold_array = PL_fold_latin1;
4813 type = REFFA;
4814 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4815 goto do_nref;
4816
3c0563b9 4817 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4818 folder = foldEQ_latin1;
4819 fold_array = PL_fold_latin1;
4820 type = REFFU;
d513472c 4821 utf8_fold_flags = 0;
d7ef4b73
KW
4822 goto do_nref;
4823
3c0563b9 4824 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4825 folder = foldEQ;
4826 fold_array = PL_fold;
4827 type = REFF;
d513472c 4828 utf8_fold_flags = 0;
d7ef4b73
KW
4829 goto do_nref;
4830
3c0563b9 4831 case NREF: /* /\g{name}/ */
d7ef4b73 4832 type = REF;
83d7b90b
KW
4833 folder = NULL;
4834 fold_array = NULL;
d513472c 4835 utf8_fold_flags = 0;
d7ef4b73
KW
4836 do_nref:
4837
4838 /* For the named back references, find the corresponding buffer
4839 * number */
0a4db386
YO
4840 n = reg_check_named_buff_matched(rex,scan);
4841
d7ef4b73 4842 if ( ! n ) {
81714fb9 4843 sayNO;
d7ef4b73
KW
4844 }
4845 goto do_nref_ref_common;
4846
3c0563b9 4847 case REFFL: /* /\1/il */
272d35c9 4848 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4849 folder = foldEQ_locale;
4850 fold_array = PL_fold_locale;
17580e7a 4851 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4852 goto do_ref;
4853
3c0563b9 4854 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4855 folder = foldEQ_latin1;
4856 fold_array = PL_fold_latin1;
4857 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4858 goto do_ref;
4859
3c0563b9 4860 case REFFU: /* /\1/iu */
d7ef4b73
KW
4861 folder = foldEQ_latin1;
4862 fold_array = PL_fold_latin1;
d513472c 4863 utf8_fold_flags = 0;
d7ef4b73
KW
4864 goto do_ref;
4865
3c0563b9 4866 case REFF: /* /\1/i */
d7ef4b73
KW
4867 folder = foldEQ;
4868 fold_array = PL_fold;
d513472c 4869 utf8_fold_flags = 0;
83d7b90b 4870 goto do_ref;
d7ef4b73 4871
3c0563b9 4872 case REF: /* /\1/ */
83d7b90b
KW
4873 folder = NULL;
4874 fold_array = NULL;
d513472c 4875 utf8_fold_flags = 0;
83d7b90b 4876
d7ef4b73 4877 do_ref:
81714fb9 4878 type = OP(scan);
d7ef4b73
KW
4879 n = ARG(scan); /* which paren pair */
4880
4881 do_nref_ref_common:
b93070ed 4882 ln = rex->offs[n].start;
1cb48e53 4883 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
b93070ed 4884 if (rex->lastparen < n || ln == -1)
af3f8c16 4885 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4886 if (ln == rex->offs[n].end)
a0d0e21e 4887 break;
a0ed51b3 4888
9d9163fb 4889 s = reginfo->strbeg + ln;
d7ef4b73 4890 if (type != REF /* REF can do byte comparison */
2f65c56d 4891 && (utf8_target || type == REFFU))
d7ef4b73 4892 { /* XXX handle REFFL better */
220db18a 4893 char * limit = reginfo->strend;
d7ef4b73
KW
4894
4895 /* This call case insensitively compares the entire buffer
4896 * at s, with the current input starting at locinput, but
220db18a
DM
4897 * not going off the end given by reginfo->strend, and
4898 * returns in <limit> upon success, how much of the
4899 * current input was matched */
b93070ed 4900 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4901 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4902 {
4903 sayNO;
a0ed51b3 4904 }
d7ef4b73 4905 locinput = limit;
a0ed51b3
LW
4906 break;
4907 }
4908
d7ef4b73 4909 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4910 if (!NEXTCHR_IS_EOS &&
4911 UCHARAT(s) != nextchr &&
81714fb9 4912 (type == REF ||
d7ef4b73 4913 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4914 sayNO;
b93070ed 4915 ln = rex->offs[n].end - ln;
220db18a 4916 if (locinput + ln > reginfo->strend)
4633a7c4 4917 sayNO;
81714fb9 4918 if (ln > 1 && (type == REF
24d3c4a9 4919 ? memNE(s, locinput, ln)
d7ef4b73 4920 : ! folder(s, locinput, ln)))
4633a7c4 4921 sayNO;
24d3c4a9 4922 locinput += ln;
a0d0e21e 4923 break;
81714fb9 4924 }
3c0563b9
DM
4925
4926 case NOTHING: /* null op; e.g. the 'nothing' following
4927 * the '*' in m{(a+|b)*}' */
4928 break;
4929 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4930 break;
3c0563b9
DM
4931
4932 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4933 break;
40a82448
DM
4934
4935#undef ST
4936#define ST st->u.eval
c277df42 4937 {
c277df42 4938 SV *ret;
d2f13c59 4939 REGEXP *re_sv;
6bda09f9 4940 regexp *re;
f8fc2ecf 4941 regexp_internal *rei;
1a147d38
YO
4942 regnode *startpoint;
4943
3c0563b9 4944 case GOSTART: /* (?R) */
e7707071
YO
4945 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4946 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4947 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4948 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4949 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4950 Perl_croak(aTHX_
4951 "Pattern subroutine nesting without pos change"
4952 " exceeded limit in regex");
6bda09f9
YO
4953 } else {
4954 nochange_depth = 0;
1a147d38 4955 }
288b8c02 4956 re_sv = rex_sv;
6bda09f9 4957 re = rex;
f8fc2ecf 4958 rei = rexi;
1a147d38 4959 if (OP(scan)==GOSUB) {
6bda09f9
YO
4960 startpoint = scan + ARG2L(scan);
4961 ST.close_paren = ARG(scan);
4962 } else {
f8fc2ecf 4963 startpoint = rei->program+1;
6bda09f9
YO
4964 ST.close_paren = 0;
4965 }
4966 goto eval_recurse_doit;
118e2215 4967 assert(0); /* NOTREACHED */
3c0563b9 4968
6bda09f9
YO
4969 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4970 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4971 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4972 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4973 } else {
4974 nochange_depth = 0;
4975 }
8e5e9ebe 4976 {
4aabdb9b 4977 /* execute the code in the {...} */
81ed78b2 4978
4aabdb9b 4979 dSP;
a6dc34f1 4980 IV before;
1f4d1a1e 4981 OP * const oop = PL_op;
4aabdb9b 4982 COP * const ocurcop = PL_curcop;
81ed78b2 4983 OP *nop;
81ed78b2 4984 CV *newcv;
91332126 4985
74088413 4986 /* save *all* paren positions */
92da3157 4987 regcppush(rex, 0, maxopenparen);
74088413
DM
4988 REGCP_SET(runops_cp);
4989
81ed78b2
DM
4990 if (!caller_cv)
4991 caller_cv = find_runcv(NULL);
4992
4aabdb9b 4993 n = ARG(scan);
81ed78b2 4994
b30fcab9 4995 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4996 newcv = (ReANY(
b30fcab9
DM
4997 (REGEXP*)(rexi->data->data[n])
4998 ))->qr_anoncv
81ed78b2
DM
4999 ;
5000 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
5001 }
5002 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
5003 newcv = caller_cv;
5004 nop = (OP*)rexi->data->data[n];
5005 assert(CvDEPTH(newcv));
68e2671b
DM
5006 }
5007 else {
d24ca0c5
DM
5008 /* literal with own CV */
5009 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
5010 newcv = rex->qr_anoncv;
5011 nop = (OP*)rexi->data->data[n];
68e2671b 5012 }
81ed78b2 5013
0e458318
DM
5014 /* normally if we're about to execute code from the same
5015 * CV that we used previously, we just use the existing
5016 * CX stack entry. However, its possible that in the
5017 * meantime we may have backtracked, popped from the save
5018 * stack, and undone the SAVECOMPPAD(s) associated with
5019 * PUSH_MULTICALL; in which case PL_comppad no longer
5020 * points to newcv's pad. */
5021 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5022 {
b0065247
DM
5023 U8 flags = (CXp_SUB_RE |
5024 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 5025 if (last_pushed_cv) {
b0065247 5026 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5027 }
5028 else {
b0065247 5029 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5030 }
5031 last_pushed_cv = newcv;
5032 }
c31ee3bb
DM
5033 else {
5034 /* these assignments are just to silence compiler
5035 * warnings */
5036 multicall_cop = NULL;
5037 newsp = NULL;
5038 }
0e458318
DM
5039 last_pad = PL_comppad;
5040
2e2e3f36
DM
5041 /* the initial nextstate you would normally execute
5042 * at the start of an eval (which would cause error
5043 * messages to come from the eval), may be optimised
5044 * away from the execution path in the regex code blocks;
5045 * so manually set PL_curcop to it initially */
5046 {
81ed78b2 5047 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
5048 assert(o->op_type == OP_NULL);
5049 if (o->op_targ == OP_SCOPE) {
5050 o = cUNOPo->op_first;
5051 }
5052 else {
5053 assert(o->op_targ == OP_LEAVE);
5054 o = cUNOPo->op_first;
5055 assert(o->op_type == OP_ENTER);
5056 o = o->op_sibling;
5057 }
5058
5059 if (o->op_type != OP_STUB) {
5060 assert( o->op_type == OP_NEXTSTATE
5061 || o->op_type == OP_DBSTATE
5062 || (o->op_type == OP_NULL
5063 && ( o->op_targ == OP_NEXTSTATE
5064 || o->op_targ == OP_DBSTATE
5065 )
5066 )
5067 );
5068 PL_curcop = (COP*)o;
5069 }
5070 }
81ed78b2 5071 nop = nop->op_next;
2e2e3f36 5072
24b23f37 5073 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
5074 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5075
8adc0f72 5076 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9 5077 if (reginfo->info_aux_eval->pos_magic)
25fdce4a
FC
5078 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5079 reginfo->sv, reginfo->strbeg,
5080 locinput - reginfo->strbeg);
4aabdb9b 5081
2bf803e2
YO
5082 if (sv_yes_mark) {
5083 SV *sv_mrk = get_sv("REGMARK", 1);
5084 sv_setsv(sv_mrk, sv_yes_mark);
5085 }
5086
81ed78b2
DM
5087 /* we don't use MULTICALL here as we want to call the
5088 * first op of the block of interest, rather than the
5089 * first op of the sub */
a6dc34f1 5090 before = (IV)(SP-PL_stack_base);
81ed78b2 5091 PL_op = nop;
8e5e9ebe
RGS
5092 CALLRUNOPS(aTHX); /* Scalar context. */
5093 SPAGAIN;
a6dc34f1 5094 if ((IV)(SP-PL_stack_base) == before)
075aa684 5095 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
5096 else {
5097 ret = POPs;
5098 PUTBACK;
5099 }
4aabdb9b 5100
e4bfbed3
DM
5101 /* before restoring everything, evaluate the returned
5102 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
5103 * PL_op or pad. Also need to process any magic vars
5104 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
5105
5106 PL_op = NULL;
5107
5e98dac2 5108 re_sv = NULL;
e4bfbed3
DM
5109 if (logical == 0) /* (?{})/ */
5110 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5111 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5112 sw = cBOOL(SvTRUE(ret));
5113 logical = 0;
5114 }
5115 else { /* /(??{}) */
497d0a96
DM
5116 /* if its overloaded, let the regex compiler handle
5117 * it; otherwise extract regex, or stringify */
237da807 5118 if (SvGMAGICAL(ret))
2685dc2d 5119 ret = sv_mortalcopy(ret);
497d0a96
DM
5120 if (!SvAMAGIC(ret)) {
5121 SV *sv = ret;
5122 if (SvROK(sv))
5123 sv = SvRV(sv);
5124 if (SvTYPE(sv) == SVt_REGEXP)
5125 re_sv = (REGEXP*) sv;
63620942
FC
5126 else if (SvSMAGICAL(ret)) {
5127 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
497d0a96
DM
5128 if (mg)
5129 re_sv = (REGEXP *) mg->mg_obj;
5130 }
e4bfbed3 5131
2685dc2d 5132 /* force any undef warnings here */
237da807
FC
5133 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5134 ret = sv_mortalcopy(ret);
497d0a96
DM
5135 (void) SvPV_force_nolen(ret);
5136 }
e4bfbed3
DM
5137 }
5138
5139 }
5140
81ed78b2
DM
5141 /* *** Note that at this point we don't restore
5142 * PL_comppad, (or pop the CxSUB) on the assumption it may
5143 * be used again soon. This is safe as long as nothing
5144 * in the regexp code uses the pad ! */
4aabdb9b 5145 PL_op = oop;
4aabdb9b 5146 PL_curcop = ocurcop;
92da3157 5147 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
f5df269c 5148 PL_curpm = PL_reg_curpm;
e4bfbed3
DM
5149
5150 if (logical != 2)
4aabdb9b 5151 break;
8e5e9ebe 5152 }
e4bfbed3
DM
5153
5154 /* only /(??{})/ from now on */
24d3c4a9 5155 logical = 0;
4aabdb9b 5156 {
4f639d21
DM
5157 /* extract RE object from returned value; compiling if
5158 * necessary */
5c35adbb 5159
575c37f6
DM
5160 if (re_sv) {
5161 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5162 }
0f5d15d6 5163 else {
c737faaf 5164 U32 pm_flags = 0;
0f5d15d6 5165
9753d940
DM
5166 if (SvUTF8(ret) && IN_BYTES) {
5167 /* In use 'bytes': make a copy of the octet
5168 * sequence, but without the flag on */
b9ad30b4
NC
5169 STRLEN len;
5170 const char *const p = SvPV(ret, len);
5171 ret = newSVpvn_flags(p, len, SVs_TEMP);
5172 }
732caac7
DM
5173 if (rex->intflags & PREGf_USE_RE_EVAL)
5174 pm_flags |= PMf_USE_RE_EVAL;
5175
5176 /* if we got here, it should be an engine which
5177 * supports compiling code blocks and stuff */
5178 assert(rex->engine && rex->engine->op_comp);
ec841a27 5179 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5180 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5181 rex->engine, NULL, NULL,
5182 /* copy /msix etc to inner pattern */
5183 scan->flags,
5184 pm_flags);
732caac7 5185
9041c2e3 5186 if (!(SvFLAGS(ret)
237da807
FC
5187 & (SVs_TEMP | SVs_GMG | SVf_ROK))
5188 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
a2794585
NC
5189 /* This isn't a first class regexp. Instead, it's
5190 caching a regexp onto an existing, Perl visible
5191 scalar. */
575c37f6 5192 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5193 }
0f5d15d6 5194 }
e1ff3a88 5195 SAVEFREESV(re_sv);
8d919b0a 5196 re = ReANY(re_sv);
4aabdb9b 5197 }
07bc277f 5198 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5199 re->subbeg = rex->subbeg;
5200 re->sublen = rex->sublen;
6502e081
DM
5201 re->suboffset = rex->suboffset;
5202 re->subcoffset = rex->subcoffset;
f8fc2ecf 5203 rei = RXi_GET(re);
6bda09f9 5204 DEBUG_EXECUTE_r(
220db18a
DM
5205 debug_start_match(re_sv, utf8_target, locinput,
5206 reginfo->strend, "Matching embedded");
6bda09f9 5207 );
f8fc2ecf 5208 startpoint = rei->program + 1;
1a147d38 5209 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5210
1a147d38 5211 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5212 /* run the pattern returned from (??{...}) */
92da3157
DM
5213
5214 /* Save *all* the positions. */
5215 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5216 REGCP_SET(ST.lastcp);
6bda09f9 5217
0357f1fd
ML
5218 re->lastparen = 0;
5219 re->lastcloseparen = 0;
5220
92da3157 5221 maxopenparen = 0;
4aabdb9b 5222
1cb95af7
DM
5223 /* invalidate the S-L poscache. We're now executing a
5224 * different set of WHILEM ops (and their associated
5225 * indexes) against the same string, so the bits in the
5226 * cache are meaningless. Setting maxiter to zero forces
5227 * the cache to be invalidated and zeroed before reuse.
5228 * XXX This is too dramatic a measure. Ideally we should
5229 * save the old cache and restore when running the outer
5230 * pattern again */
1cb48e53 5231 reginfo->poscache_maxiter = 0;
4aabdb9b 5232
aed7b151 5233 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5234
288b8c02 5235 ST.prev_rex = rex_sv;
faec1544 5236 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5237 rex_sv = re_sv;
5238 SET_reg_curpm(rex_sv);
288b8c02 5239 rex = re;
f8fc2ecf 5240 rexi = rei;
faec1544 5241 cur_curlyx = NULL;
40a82448 5242 ST.B = next;
faec1544
DM
5243 ST.prev_eval = cur_eval;
5244 cur_eval = st;
faec1544 5245 /* now continue from first node in postoned RE */
4d5016e5 5246 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5247 assert(0); /* NOTREACHED */
c277df42 5248 }
40a82448 5249
faec1544
DM
5250 case EVAL_AB: /* cleanup after a successful (??{A})B */
5251 /* note: this is called twice; first after popping B, then A */
ec43f78b 5252 rex_sv = ST.prev_rex;
aed7b151 5253 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5254 SET_reg_curpm(rex_sv);
8d919b0a 5255 rex = ReANY(rex_sv);
f8fc2ecf 5256 rexi = RXi_GET(rex);
faec1544
DM
5257 regcpblow(ST.cp);
5258 cur_eval = ST.prev_eval;
5259 cur_curlyx = ST.prev_curlyx;
34a81e2b 5260
1cb95af7 5261 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5262 reginfo->poscache_maxiter = 0;
e7707071 5263 if ( nochange_depth )
4b196cd4 5264 nochange_depth--;
262b90c4 5265 sayYES;
40a82448 5266
40a82448 5267
faec1544
DM
5268 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5269 /* note: this is called twice; first after popping B, then A */
ec43f78b 5270 rex_sv = ST.prev_rex;
aed7b151 5271 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5272 SET_reg_curpm(rex_sv);
8d919b0a 5273 rex = ReANY(rex_sv);
f8fc2ecf 5274 rexi = RXi_GET(rex);
0357f1fd 5275
40a82448 5276 REGCP_UNWIND(ST.lastcp);
92da3157 5277 regcppop(rex, &maxopenparen);
faec1544
DM
5278 cur_eval = ST.prev_eval;
5279 cur_curlyx = ST.prev_curlyx;
1cb95af7 5280 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5281 reginfo->poscache_maxiter = 0;
e7707071 5282 if ( nochange_depth )
4b196cd4 5283 nochange_depth--;
40a82448 5284 sayNO_SILENT;
40a82448
DM
5285#undef ST
5286
3c0563b9 5287 case OPEN: /* ( */
c277df42 5288 n = ARG(scan); /* which paren pair */
9d9163fb 5289 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
5290 if (n > maxopenparen)
5291 maxopenparen = n;
495f47a5 5292 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5293 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5294 PTR2UV(rex),
5295 PTR2UV(rex->offs),
5296 (UV)n,
5297 (IV)rex->offs[n].start_tmp,
92da3157 5298 (UV)maxopenparen
495f47a5 5299 ));
e2e6a0f1 5300 lastopen = n;
a0d0e21e 5301 break;
495f47a5
DM
5302
5303/* XXX really need to log other places start/end are set too */
5304#define CLOSE_CAPTURE \
5305 rex->offs[n].start = rex->offs[n].start_tmp; \
9d9163fb 5306 rex->offs[n].end = locinput - reginfo->strbeg; \
495f47a5
DM
5307 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5308 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5309 PTR2UV(rex), \
5310 PTR2UV(rex->offs), \
5311 (UV)n, \
5312 (IV)rex->offs[n].start, \
5313 (IV)rex->offs[n].end \
5314 ))
5315
3c0563b9 5316 case CLOSE: /* ) */
c277df42 5317 n = ARG(scan); /* which paren pair */
495f47a5 5318 CLOSE_CAPTURE;
b93070ed
DM
5319 if (n > rex->lastparen)
5320 rex->lastparen = n;
5321 rex->lastcloseparen = n;
3b6647e0 5322 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5323 goto fake_end;
5324 }
a0d0e21e 5325 break;
3c0563b9
DM
5326
5327 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5328 if (ARG(scan)){
5329 regnode *cursor;
5330 for (cursor=scan;
5331 cursor && OP(cursor)!=END;
5332 cursor=regnext(cursor))
5333 {
5334 if ( OP(cursor)==CLOSE ){
5335 n = ARG(cursor);
5336 if ( n <= lastopen ) {
495f47a5 5337 CLOSE_CAPTURE;
b93070ed
DM
5338 if (n > rex->lastparen)
5339 rex->lastparen = n;
5340 rex->lastcloseparen = n;
3b6647e0
RB
5341 if ( n == ARG(scan) || (cur_eval &&
5342 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5343 break;
5344 }
5345 }
5346 }
5347 }
5348 goto fake_end;
5349 /*NOTREACHED*/
3c0563b9
DM
5350
5351 case GROUPP: /* (?(1)) */
c277df42 5352 n = ARG(scan); /* which paren pair */
b93070ed 5353 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5354 break;
3c0563b9
DM
5355
5356 case NGROUPP: /* (?(<name>)) */
0a4db386 5357 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5358 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5359 break;
3c0563b9
DM
5360
5361 case INSUBP: /* (?(R)) */
0a4db386 5362 n = ARG(scan);
3b6647e0 5363 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5364 break;
3c0563b9
DM
5365
5366 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5367 sw = 0;
5368 break;
3c0563b9
DM
5369
5370 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 5371 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 5372 if (sw)
c277df42
IZ
5373 next = NEXTOPER(NEXTOPER(scan));
5374 else {
5375 next = scan + ARG(scan);
5376 if (OP(next) == IFTHEN) /* Fake one. */
5377 next = NEXTOPER(NEXTOPER(next));
5378 }
5379 break;
3c0563b9
DM
5380
5381 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5382 logical = scan->flags;
c277df42 5383 break;
c476f425 5384
2ab05381 5385/*******************************************************************
2ab05381 5386
c476f425
DM
5387The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5388pattern, where A and B are subpatterns. (For simple A, CURLYM or
5389STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5390
c476f425 5391A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5392
c476f425
DM
5393On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5394state, which contains the current count, initialised to -1. It also sets
5395cur_curlyx to point to this state, with any previous value saved in the
5396state block.
2ab05381 5397
c476f425
DM
5398CURLYX then jumps straight to the WHILEM op, rather than executing A,
5399since the pattern may possibly match zero times (i.e. it's a while {} loop
5400rather than a do {} while loop).
2ab05381 5401
c476f425
DM
5402Each entry to WHILEM represents a successful match of A. The count in the
5403CURLYX block is incremented, another WHILEM state is pushed, and execution
5404passes to A or B depending on greediness and the current count.
2ab05381 5405
c476f425
DM
5406For example, if matching against the string a1a2a3b (where the aN are
5407substrings that match /A/), then the match progresses as follows: (the
5408pushed states are interspersed with the bits of strings matched so far):
2ab05381 5409
c476f425
DM
5410 <CURLYX cnt=-1>
5411 <CURLYX cnt=0><WHILEM>
5412 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5413 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5414 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5415 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5416
c476f425
DM
5417(Contrast this with something like CURLYM, which maintains only a single
5418backtrack state:
2ab05381 5419
c476f425
DM
5420 <CURLYM cnt=0> a1
5421 a1 <CURLYM cnt=1> a2
5422 a1 a2 <CURLYM cnt=2> a3
5423 a1 a2 a3 <CURLYM cnt=3> b
5424)
2ab05381 5425
c476f425
DM
5426Each WHILEM state block marks a point to backtrack to upon partial failure
5427of A or B, and also contains some minor state data related to that
5428iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5429overall state, such as the count, and pointers to the A and B ops.
2ab05381 5430
c476f425
DM
5431This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5432must always point to the *current* CURLYX block, the rules are:
2ab05381 5433
c476f425
DM
5434When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5435and set cur_curlyx to point the new block.
2ab05381 5436
c476f425
DM
5437When popping the CURLYX block after a successful or unsuccessful match,
5438restore the previous cur_curlyx.
2ab05381 5439
c476f425
DM
5440When WHILEM is about to execute B, save the current cur_curlyx, and set it
5441to the outer one saved in the CURLYX block.
2ab05381 5442
c476f425
DM
5443When popping the WHILEM block after a successful or unsuccessful B match,
5444restore the previous cur_curlyx.
2ab05381 5445
c476f425
DM
5446Here's an example for the pattern (AI* BI)*BO
5447I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5448
c476f425
DM
5449cur_
5450curlyx backtrack stack
5451------ ---------------
5452NULL
5453CO <CO prev=NULL> <WO>
5454CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5455CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5456NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5457
c476f425
DM
5458At this point the pattern succeeds, and we work back down the stack to
5459clean up, restoring as we go:
95b24440 5460
c476f425
DM
5461CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5462CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5463CO <CO prev=NULL> <WO>
5464NULL
a0374537 5465
c476f425
DM
5466*******************************************************************/
5467
5468#define ST st->u.curlyx
5469
5470 case CURLYX: /* start of /A*B/ (for complex A) */
5471 {
5472 /* No need to save/restore up to this paren */
5473 I32 parenfloor = scan->flags;
5474
5475 assert(next); /* keep Coverity happy */
5476 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5477 next += ARG(next);
5478
5479 /* XXXX Probably it is better to teach regpush to support
92da3157 5480 parenfloor > maxopenparen ... */
b93070ed
DM
5481 if (parenfloor > (I32)rex->lastparen)
5482 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5483
5484 ST.prev_curlyx= cur_curlyx;
5485 cur_curlyx = st;
5486 ST.cp = PL_savestack_ix;
5487
5488 /* these fields contain the state of the current curly.
5489 * they are accessed by subsequent WHILEMs */
5490 ST.parenfloor = parenfloor;
d02d6d97 5491 ST.me = scan;
c476f425 5492 ST.B = next;
24d3c4a9
DM
5493 ST.minmod = minmod;
5494 minmod = 0;
c476f425
DM
5495 ST.count = -1; /* this will be updated by WHILEM */
5496 ST.lastloc = NULL; /* this will be updated by WHILEM */
5497
4d5016e5 5498 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5499 assert(0); /* NOTREACHED */
c476f425 5500 }
a0d0e21e 5501
c476f425 5502 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5503 cur_curlyx = ST.prev_curlyx;
5504 sayYES;
118e2215 5505 assert(0); /* NOTREACHED */
a0d0e21e 5506
c476f425
DM
5507 case CURLYX_end_fail: /* just failed to match all of A*B */
5508 regcpblow(ST.cp);
5509 cur_curlyx = ST.prev_curlyx;
5510 sayNO;
118e2215 5511 assert(0); /* NOTREACHED */
4633a7c4 5512
a0d0e21e 5513
c476f425
DM
5514#undef ST
5515#define ST st->u.whilem
5516
5517 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5518 {
5519 /* see the discussion above about CURLYX/WHILEM */
c476f425 5520 I32 n;
d02d6d97
DM
5521 int min = ARG1(cur_curlyx->u.curlyx.me);
5522 int max = ARG2(cur_curlyx->u.curlyx.me);
5523 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5524
c476f425
DM
5525 assert(cur_curlyx); /* keep Coverity happy */
5526 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5527 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5528 ST.cache_offset = 0;
5529 ST.cache_mask = 0;
5530
c476f425
DM
5531
5532 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5533 "%*s whilem: matched %ld out of %d..%d\n",
5534 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5535 );
a0d0e21e 5536
c476f425 5537 /* First just match a string of min A's. */
a0d0e21e 5538
d02d6d97 5539 if (n < min) {
92da3157
DM
5540 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5541 maxopenparen);
c476f425 5542 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5543 REGCP_SET(ST.lastcp);
5544
4d5016e5 5545 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5546 assert(0); /* NOTREACHED */
c476f425
DM
5547 }
5548
5549 /* If degenerate A matches "", assume A done. */
5550
5551 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5552 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5553 "%*s whilem: empty match detected, trying continuation...\n",
5554 REPORT_CODE_OFF+depth*2, "")
5555 );
5556 goto do_whilem_B_max;
5557 }
5558
1cb95af7
DM
5559 /* super-linear cache processing.
5560 *
5561 * The idea here is that for certain types of CURLYX/WHILEM -
5562 * principally those whose upper bound is infinity (and
5563 * excluding regexes that have things like \1 and other very
5564 * non-regular expresssiony things), then if a pattern like
5565 * /....A*.../ fails and we backtrack to the WHILEM, then we
5566 * make a note that this particular WHILEM op was at string
5567 * position 47 (say) when the rest of pattern failed. Then, if
5568 * we ever find ourselves back at that WHILEM, and at string
5569 * position 47 again, we can just fail immediately rather than
5570 * running the rest of the pattern again.
5571 *
5572 * This is very handy when patterns start to go
5573 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5574 * with a combinatorial explosion of backtracking.
5575 *
5576 * The cache is implemented as a bit array, with one bit per
5577 * string byte position per WHILEM op (up to 16) - so its
5578 * between 0.25 and 2x the string size.
5579 *
5580 * To avoid allocating a poscache buffer every time, we do an
5581 * initially countdown; only after we have executed a WHILEM
5582 * op (string-length x #WHILEMs) times do we allocate the
5583 * cache.
5584 *
5585 * The top 4 bits of scan->flags byte say how many different
5586 * relevant CURLLYX/WHILEM op pairs there are, while the
5587 * bottom 4-bits is the identifying index number of this
5588 * WHILEM.
5589 */
c476f425
DM
5590
5591 if (scan->flags) {
a0d0e21e 5592
1cb48e53 5593 if (!reginfo->poscache_maxiter) {
c476f425
DM
5594 /* start the countdown: Postpone detection until we
5595 * know the match is not *that* much linear. */
1cb48e53 5596 reginfo->poscache_maxiter
9d9163fb
DM
5597 = (reginfo->strend - reginfo->strbeg + 1)
5598 * (scan->flags>>4);
66bf836d 5599 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
5600 if (reginfo->poscache_maxiter < 0)
5601 reginfo->poscache_maxiter = I32_MAX;
5602 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 5603 }
c476f425 5604
1cb48e53 5605 if (reginfo->poscache_iter-- == 0) {
c476f425 5606 /* initialise cache */
ea3daa5d 5607 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
5608 regmatch_info_aux *const aux = reginfo->info_aux;
5609 if (aux->poscache) {
ea3daa5d 5610 if ((SSize_t)reginfo->poscache_size < size) {
2ac8ff4b
DM
5611 Renew(aux->poscache, size, char);
5612 reginfo->poscache_size = size;
2c2d71f5 5613 }
2ac8ff4b 5614 Zero(aux->poscache, size, char);
2c2d71f5
JH
5615 }
5616 else {
2ac8ff4b
DM
5617 reginfo->poscache_size = size;
5618 Newxz(aux->poscache, size, char);
2c2d71f5 5619 }
c476f425
DM
5620 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5621 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5622 PL_colors[4], PL_colors[5])
5623 );
2c2d71f5 5624 }
c476f425 5625
1cb48e53 5626 if (reginfo->poscache_iter < 0) {
c476f425 5627 /* have we already failed at this position? */
ea3daa5d 5628 SSize_t offset, mask;
338e600a
DM
5629
5630 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 5631 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
5632 + (locinput - reginfo->strbeg)
5633 * (scan->flags>>4);
c476f425
DM
5634 mask = 1 << (offset % 8);
5635 offset /= 8;
2ac8ff4b 5636 if (reginfo->info_aux->poscache[offset] & mask) {
c476f425
DM
5637 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5638 "%*s whilem: (cache) already tried at this position...\n",
5639 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5640 );
3298f257 5641 sayNO; /* cache records failure */
2c2d71f5 5642 }
c476f425
DM
5643 ST.cache_offset = offset;
5644 ST.cache_mask = mask;
2c2d71f5 5645 }
c476f425 5646 }
2c2d71f5 5647
c476f425 5648 /* Prefer B over A for minimal matching. */
a687059c 5649
c476f425
DM
5650 if (cur_curlyx->u.curlyx.minmod) {
5651 ST.save_curlyx = cur_curlyx;
5652 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5653 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5654 maxopenparen);
c476f425 5655 REGCP_SET(ST.lastcp);
4d5016e5
DM
5656 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5657 locinput);
118e2215 5658 assert(0); /* NOTREACHED */
c476f425 5659 }
a0d0e21e 5660
c476f425
DM
5661 /* Prefer A over B for maximal matching. */
5662
d02d6d97 5663 if (n < max) { /* More greed allowed? */
92da3157
DM
5664 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5665 maxopenparen);
c476f425
DM
5666 cur_curlyx->u.curlyx.lastloc = locinput;
5667 REGCP_SET(ST.lastcp);
4d5016e5 5668 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5669 assert(0); /* NOTREACHED */
c476f425
DM
5670 }
5671 goto do_whilem_B_max;
5672 }
118e2215 5673 assert(0); /* NOTREACHED */
c476f425
DM
5674
5675 case WHILEM_B_min: /* just matched B in a minimal match */
5676 case WHILEM_B_max: /* just matched B in a maximal match */
5677 cur_curlyx = ST.save_curlyx;
5678 sayYES;
118e2215 5679 assert(0); /* NOTREACHED */
c476f425
DM
5680
5681 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5682 cur_curlyx = ST.save_curlyx;
5683 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5684 cur_curlyx->u.curlyx.count--;
5685 CACHEsayNO;
118e2215 5686 assert(0); /* NOTREACHED */
c476f425
DM
5687
5688 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5689 /* FALL THROUGH */
5690 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5691 REGCP_UNWIND(ST.lastcp);
92da3157 5692 regcppop(rex, &maxopenparen);
c476f425
DM
5693 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5694 cur_curlyx->u.curlyx.count--;
5695 CACHEsayNO;
118e2215 5696 assert(0); /* NOTREACHED */
c476f425
DM
5697
5698 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5699 REGCP_UNWIND(ST.lastcp);
92da3157 5700 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5702 "%*s whilem: failed, trying continuation...\n",
5703 REPORT_CODE_OFF+depth*2, "")
5704 );
5705 do_whilem_B_max:
5706 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5707 && ckWARN(WARN_REGEXP)
39819bd9 5708 && !reginfo->warned)
c476f425 5709 {
39819bd9 5710 reginfo->warned = TRUE;
dcbac5bb
FC
5711 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5712 "Complex regular subexpression recursion limit (%d) "
5713 "exceeded",
c476f425
DM
5714 REG_INFTY - 1);
5715 }
5716
5717 /* now try B */
5718 ST.save_curlyx = cur_curlyx;
5719 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5720 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5721 locinput);
118e2215 5722 assert(0); /* NOTREACHED */
c476f425
DM
5723
5724 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5725 cur_curlyx = ST.save_curlyx;
5726 REGCP_UNWIND(ST.lastcp);
92da3157 5727 regcppop(rex, &maxopenparen);
c476f425 5728
d02d6d97 5729 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5730 /* Maximum greed exceeded */
5731 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5732 && ckWARN(WARN_REGEXP)
39819bd9 5733 && !reginfo->warned)
c476f425 5734 {
39819bd9 5735 reginfo->warned = TRUE;
c476f425 5736 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5737 "Complex regular subexpression recursion "
5738 "limit (%d) exceeded",
c476f425 5739 REG_INFTY - 1);
a0d0e21e 5740 }
c476f425 5741 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5742 CACHEsayNO;
a0d0e21e 5743 }
c476f425
DM
5744
5745 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5746 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5747 );
5748 /* Try grabbing another A and see if it helps. */
c476f425 5749 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5750 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5751 maxopenparen);
c476f425 5752 REGCP_SET(ST.lastcp);
d02d6d97 5753 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5754 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5755 locinput);
118e2215 5756 assert(0); /* NOTREACHED */
40a82448
DM
5757
5758#undef ST
5759#define ST st->u.branch
5760
5761 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5762 next = scan + ARG(scan);
5763 if (next == scan)
5764 next = NULL;
40a82448
DM
5765 scan = NEXTOPER(scan);
5766 /* FALL THROUGH */
c277df42 5767
40a82448
DM
5768 case BRANCH: /* /(...|A|...)/ */
5769 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5770 ST.lastparen = rex->lastparen;
f6033a9d 5771 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5772 ST.next_branch = next;
5773 REGCP_SET(ST.cp);
02db2b7b 5774
40a82448 5775 /* Now go into the branch */
5d458dd8 5776 if (has_cutgroup) {
4d5016e5 5777 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5778 } else {
4d5016e5 5779 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5780 }
118e2215 5781 assert(0); /* NOTREACHED */
3c0563b9
DM
5782
5783 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5784 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5785 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5786 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5787 assert(0); /* NOTREACHED */
3c0563b9 5788
5d458dd8
YO
5789 case CUTGROUP_next_fail:
5790 do_cutgroup = 1;
5791 no_final = 1;
5792 if (st->u.mark.mark_name)
5793 sv_commit = st->u.mark.mark_name;
5794 sayNO;
118e2215 5795 assert(0); /* NOTREACHED */
3c0563b9 5796
5d458dd8
YO
5797 case BRANCH_next:
5798 sayYES;
118e2215 5799 assert(0); /* NOTREACHED */
3c0563b9 5800
40a82448 5801 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5802 if (do_cutgroup) {
5803 do_cutgroup = 0;
5804 no_final = 0;
5805 }
40a82448 5806 REGCP_UNWIND(ST.cp);
a8d1f4b4 5807 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5808 scan = ST.next_branch;
5809 /* no more branches? */
5d458dd8
YO
5810 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5811 DEBUG_EXECUTE_r({
5812 PerlIO_printf( Perl_debug_log,
5813 "%*s %sBRANCH failed...%s\n",
5814 REPORT_CODE_OFF+depth*2, "",
5815 PL_colors[4],
5816 PL_colors[5] );
5817 });
5818 sayNO_SILENT;
5819 }
40a82448 5820 continue; /* execute next BRANCH[J] op */
118e2215 5821 assert(0); /* NOTREACHED */
40a82448 5822
3c0563b9 5823 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5824 minmod = 1;
a0d0e21e 5825 break;
40a82448
DM
5826
5827#undef ST
5828#define ST st->u.curlym
5829
5830 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5831
5832 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5833 * only a single backtracking state, no matter how many matches
40a82448
DM
5834 * there are in {m,n}. It relies on the pattern being constant
5835 * length, with no parens to influence future backrefs
5836 */
5837
5838 ST.me = scan;
dc45a647 5839 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5840
f6033a9d
DM
5841 ST.lastparen = rex->lastparen;
5842 ST.lastcloseparen = rex->lastcloseparen;
5843
40a82448
DM
5844 /* if paren positive, emulate an OPEN/CLOSE around A */
5845 if (ST.me->flags) {
3b6647e0 5846 U32 paren = ST.me->flags;
92da3157
DM
5847 if (paren > maxopenparen)
5848 maxopenparen = paren;
c277df42 5849 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5850 }
40a82448
DM
5851 ST.A = scan;
5852 ST.B = next;
5853 ST.alen = 0;
5854 ST.count = 0;
24d3c4a9
DM
5855 ST.minmod = minmod;
5856 minmod = 0;
40a82448
DM
5857 ST.c1 = CHRTEST_UNINIT;
5858 REGCP_SET(ST.cp);
6407bf3b 5859
40a82448
DM
5860 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5861 goto curlym_do_B;
5862
5863 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5864 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5865 assert(0); /* NOTREACHED */
5f80c4cf 5866
40a82448 5867 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5868 ST.count++;
5869 /* after first match, determine A's length: u.curlym.alen */
5870 if (ST.count == 1) {
ba44c216 5871 if (reginfo->is_utf8_target) {
c07e9d7b
DM
5872 char *s = st->locinput;
5873 while (s < locinput) {
40a82448
DM
5874 ST.alen++;
5875 s += UTF8SKIP(s);
5876 }
5877 }
5878 else {
c07e9d7b 5879 ST.alen = locinput - st->locinput;
40a82448
DM
5880 }
5881 if (ST.alen == 0)
5882 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5883 }
0cadcf80
DM
5884 DEBUG_EXECUTE_r(
5885 PerlIO_printf(Perl_debug_log,
40a82448 5886 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5887 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5888 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5889 );
5890
0a4db386 5891 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5892 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5893 goto fake_end;
5894
c966426a
DM
5895 {
5896 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5897 if ( max == REG_INFTY || ST.count < max )
5898 goto curlym_do_A; /* try to match another A */
5899 }
40a82448 5900 goto curlym_do_B; /* try to match B */
5f80c4cf 5901
40a82448
DM
5902 case CURLYM_A_fail: /* just failed to match an A */
5903 REGCP_UNWIND(ST.cp);
0a4db386
YO
5904
5905 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5906 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5907 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5908 sayNO;
0cadcf80 5909
40a82448 5910 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5911 if (ST.c1 == CHRTEST_UNINIT) {
5912 /* calculate c1 and c2 for possible match of 1st char
5913 * following curly */
5914 ST.c1 = ST.c2 = CHRTEST_VOID;
5915 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5916 regnode *text_node = ST.B;
5917 if (! HAS_TEXT(text_node))
5918 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5919 /* this used to be
5920
5921 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5922
5923 But the former is redundant in light of the latter.
5924
5925 if this changes back then the macro for
5926 IS_TEXT and friends need to change.
5927 */
c74f6de9 5928 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 5929 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 5930 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 5931 reginfo))
c74f6de9
KW
5932 {
5933 sayNO;
5934 }
c277df42 5935 }
c277df42 5936 }
40a82448
DM
5937 }
5938
5939 DEBUG_EXECUTE_r(
5940 PerlIO_printf(Perl_debug_log,
5941 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5942 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5943 "", (IV)ST.count)
5944 );
c74f6de9 5945 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5946 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5947 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5948 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5949 {
5950 /* simulate B failing */
5951 DEBUG_OPTIMISE_r(
5952 PerlIO_printf(Perl_debug_log,
33daa3a5 5953 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
79a2a0e8
KW
5954 (int)(REPORT_CODE_OFF+(depth*2)),"",
5955 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5956 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5957 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5958 );
5959 state_num = CURLYM_B_fail;
5960 goto reenter_switch;
5961 }
5962 }
5963 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5964 /* simulate B failing */
5965 DEBUG_OPTIMISE_r(
5966 PerlIO_printf(Perl_debug_log,
33daa3a5 5967 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5400f398 5968 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5969 (int) nextchr, ST.c1, ST.c2)
5970 );
5400f398
KW
5971 state_num = CURLYM_B_fail;
5972 goto reenter_switch;
5973 }
c74f6de9 5974 }
40a82448
DM
5975
5976 if (ST.me->flags) {
f6033a9d 5977 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5978 I32 paren = ST.me->flags;
5979 if (ST.count) {
b93070ed 5980 rex->offs[paren].start
9d9163fb
DM
5981 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5982 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
5983 if ((U32)paren > rex->lastparen)
5984 rex->lastparen = paren;
5985 rex->lastcloseparen = paren;
c277df42 5986 }
40a82448 5987 else
b93070ed 5988 rex->offs[paren].end = -1;
0a4db386 5989 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5990 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5991 {
5992 if (ST.count)
5993 goto fake_end;
5994 else
5995 sayNO;
5996 }
c277df42 5997 }
0a4db386 5998
4d5016e5 5999 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 6000 assert(0); /* NOTREACHED */
40a82448
DM
6001
6002 case CURLYM_B_fail: /* just failed to match a B */
6003 REGCP_UNWIND(ST.cp);
a8d1f4b4 6004 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 6005 if (ST.minmod) {
84d2fa14
HS
6006 I32 max = ARG2(ST.me);
6007 if (max != REG_INFTY && ST.count == max)
40a82448
DM
6008 sayNO;
6009 goto curlym_do_A; /* try to match a further A */
6010 }
6011 /* backtrack one A */
6012 if (ST.count == ARG1(ST.me) /* min */)
6013 sayNO;
6014 ST.count--;
7016d6eb 6015 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
6016 goto curlym_do_B; /* try to match B */
6017
c255a977
DM
6018#undef ST
6019#define ST st->u.curly
40a82448 6020
c255a977
DM
6021#define CURLY_SETPAREN(paren, success) \
6022 if (paren) { \
6023 if (success) { \
9d9163fb
DM
6024 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6025 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
6026 if (paren > rex->lastparen) \
6027 rex->lastparen = paren; \
b93070ed 6028 rex->lastcloseparen = paren; \
c255a977 6029 } \
f6033a9d 6030 else { \
b93070ed 6031 rex->offs[paren].end = -1; \
f6033a9d
DM
6032 rex->lastparen = ST.lastparen; \
6033 rex->lastcloseparen = ST.lastcloseparen; \
6034 } \
c255a977
DM
6035 }
6036
b40a2c17 6037 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
6038 ST.paren = 0;
6039 ST.min = 0;
6040 ST.max = REG_INFTY;
a0d0e21e
LW
6041 scan = NEXTOPER(scan);
6042 goto repeat;
3c0563b9 6043
b40a2c17 6044 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
6045 ST.paren = 0;
6046 ST.min = 1;
6047 ST.max = REG_INFTY;
c277df42 6048 scan = NEXTOPER(scan);
c255a977 6049 goto repeat;
3c0563b9 6050
b40a2c17 6051 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
6052 ST.paren = scan->flags; /* Which paren to set */
6053 ST.lastparen = rex->lastparen;
f6033a9d 6054 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
6055 if (ST.paren > maxopenparen)
6056 maxopenparen = ST.paren;
c255a977
DM
6057 ST.min = ARG1(scan); /* min to match */
6058 ST.max = ARG2(scan); /* max to match */
0a4db386 6059 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6060 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6061 ST.min=1;
6062 ST.max=1;
6063 }
c255a977
DM
6064 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6065 goto repeat;
3c0563b9 6066
b40a2c17 6067 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
6068 ST.paren = 0;
6069 ST.min = ARG1(scan); /* min to match */
6070 ST.max = ARG2(scan); /* max to match */
6071 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 6072 repeat:
a0d0e21e
LW
6073 /*
6074 * Lookahead to avoid useless match attempts
6075 * when we know what character comes next.
c255a977 6076 *
5f80c4cf
JP
6077 * Used to only do .*x and .*?x, but now it allows
6078 * for )'s, ('s and (?{ ... })'s to be in the way
6079 * of the quantifier and the EXACT-like node. -- japhy
6080 */
6081
eb5c1be8 6082 assert(ST.min <= ST.max);
3337dfe3
KW
6083 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6084 ST.c1 = ST.c2 = CHRTEST_VOID;
6085 }
6086 else {
5f80c4cf
JP
6087 regnode *text_node = next;
6088
3dab1dad
YO
6089 if (! HAS_TEXT(text_node))
6090 FIND_NEXT_IMPT(text_node);
5f80c4cf 6091
9e137952 6092 if (! HAS_TEXT(text_node))
c255a977 6093 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 6094 else {
ee9b8eae 6095 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 6096 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 6097 }
c74f6de9 6098 else {
ee9b8eae
YO
6099
6100 /* Currently we only get here when
6101
6102 PL_rekind[OP(text_node)] == EXACT
6103
6104 if this changes back then the macro for IS_TEXT and
6105 friends need to change. */
79a2a0e8 6106 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6107 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6108 reginfo))
c74f6de9
KW
6109 {
6110 sayNO;
6111 }
6112 }
1aa99e6b 6113 }
bbce6d69 6114 }
c255a977
DM
6115
6116 ST.A = scan;
6117 ST.B = next;
24d3c4a9 6118 if (minmod) {
eb72505d 6119 char *li = locinput;
24d3c4a9 6120 minmod = 0;
984e6dd1 6121 if (ST.min &&
f9176b44 6122 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
984e6dd1 6123 < ST.min)
4633a7c4 6124 sayNO;
7016d6eb 6125 SET_locinput(li);
c255a977 6126 ST.count = ST.min;
c255a977
DM
6127 REGCP_SET(ST.cp);
6128 if (ST.c1 == CHRTEST_VOID)
6129 goto curly_try_B_min;
6130
6131 ST.oldloc = locinput;
6132
6133 /* set ST.maxpos to the furthest point along the
6134 * string that could possibly match */
6135 if (ST.max == REG_INFTY) {
220db18a 6136 ST.maxpos = reginfo->strend - 1;
f2ed9b32 6137 if (utf8_target)
c255a977
DM
6138 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6139 ST.maxpos--;
6140 }
f2ed9b32 6141 else if (utf8_target) {
c255a977
DM
6142 int m = ST.max - ST.min;
6143 for (ST.maxpos = locinput;
220db18a 6144 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
6145 ST.maxpos += UTF8SKIP(ST.maxpos);
6146 }
6147 else {
6148 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
6149 if (ST.maxpos >= reginfo->strend)
6150 ST.maxpos = reginfo->strend - 1;
c255a977
DM
6151 }
6152 goto curly_try_B_min_known;
6153
6154 }
6155 else {
eb72505d
DM
6156 /* avoid taking address of locinput, so it can remain
6157 * a register var */
6158 char *li = locinput;
f9176b44 6159 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
c255a977
DM
6160 if (ST.count < ST.min)
6161 sayNO;
7016d6eb 6162 SET_locinput(li);
c255a977
DM
6163 if ((ST.count > ST.min)
6164 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6165 {
6166 /* A{m,n} must come at the end of the string, there's
6167 * no point in backing off ... */
6168 ST.min = ST.count;
6169 /* ...except that $ and \Z can match before *and* after
6170 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6171 We may back off by one in this case. */
eb72505d 6172 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6173 ST.min--;
6174 }
6175 REGCP_SET(ST.cp);
6176 goto curly_try_B_max;
6177 }
118e2215 6178 assert(0); /* NOTREACHED */
c255a977
DM
6179
6180
6181 case CURLY_B_min_known_fail:
6182 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6183
c255a977 6184 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6185 if (ST.paren) {
6186 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6187 }
c255a977
DM
6188 /* Couldn't or didn't -- move forward. */
6189 ST.oldloc = locinput;
f2ed9b32 6190 if (utf8_target)
c255a977
DM
6191 locinput += UTF8SKIP(locinput);
6192 else
6193 locinput++;
6194 ST.count++;
6195 curly_try_B_min_known:
6196 /* find the next place where 'B' could work, then call B */
6197 {
6198 int n;
f2ed9b32 6199 if (utf8_target) {
c255a977
DM
6200 n = (ST.oldloc == locinput) ? 0 : 1;
6201 if (ST.c1 == ST.c2) {
c255a977 6202 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6203 while (locinput <= ST.maxpos
6204 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6205 {
6206 locinput += UTF8SKIP(locinput);
c255a977
DM
6207 n++;
6208 }
1aa99e6b
IH
6209 }
6210 else {
c255a977 6211 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6212 while (locinput <= ST.maxpos
6213 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6214 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6215 {
6216 locinput += UTF8SKIP(locinput);
c255a977 6217 n++;
1aa99e6b 6218 }
0fe9bf95
IZ
6219 }
6220 }
5400f398 6221 else { /* Not utf8_target */
c255a977
DM
6222 if (ST.c1 == ST.c2) {
6223 while (locinput <= ST.maxpos &&
6224 UCHARAT(locinput) != ST.c1)
6225 locinput++;
bbce6d69 6226 }
c255a977
DM
6227 else {
6228 while (locinput <= ST.maxpos
6229 && UCHARAT(locinput) != ST.c1
6230 && UCHARAT(locinput) != ST.c2)
6231 locinput++;
a0ed51b3 6232 }
c255a977
DM
6233 n = locinput - ST.oldloc;
6234 }
6235 if (locinput > ST.maxpos)
6236 sayNO;
c255a977 6237 if (n) {
eb72505d
DM
6238 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6239 * at b; check that everything between oldloc and
6240 * locinput matches */
6241 char *li = ST.oldloc;
c255a977 6242 ST.count += n;
f9176b44 6243 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
4633a7c4 6244 sayNO;
eb72505d 6245 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6246 }
c255a977 6247 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6248 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6249 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6250 goto fake_end;
6251 }
4d5016e5 6252 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6253 }
118e2215 6254 assert(0); /* NOTREACHED */
c255a977
DM
6255
6256
6257 case CURLY_B_min_fail:
6258 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6259
6260 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6261 if (ST.paren) {
6262 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6263 }
c255a977 6264 /* failed -- move forward one */
f73aaa43 6265 {
eb72505d 6266 char *li = locinput;
f9176b44 6267 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
f73aaa43
DM
6268 sayNO;
6269 }
eb72505d 6270 locinput = li;
f73aaa43
DM
6271 }
6272 {
c255a977 6273 ST.count++;
c255a977
DM
6274 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6275 ST.count > 0)) /* count overflow ? */
15272685 6276 {
c255a977
DM
6277 curly_try_B_min:
6278 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6279 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6280 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6281 goto fake_end;
6282 }
4d5016e5 6283 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6284 }
6285 }
c74f6de9 6286 sayNO;
118e2215 6287 assert(0); /* NOTREACHED */
c255a977
DM
6288
6289
6290 curly_try_B_max:
6291 /* a successful greedy match: now try to match B */
40d049e4 6292 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6293 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6294 goto fake_end;
6295 }
c255a977 6296 {
220db18a 6297 bool could_match = locinput < reginfo->strend;
79a2a0e8 6298
c255a977 6299 /* If it could work, try it. */
79a2a0e8
KW
6300 if (ST.c1 != CHRTEST_VOID && could_match) {
6301 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6302 {
6303 could_match = memEQ(locinput,
6304 ST.c1_utf8,
6305 UTF8SKIP(locinput))
6306 || memEQ(locinput,
6307 ST.c2_utf8,
6308 UTF8SKIP(locinput));
6309 }
6310 else {
6311 could_match = UCHARAT(locinput) == ST.c1
6312 || UCHARAT(locinput) == ST.c2;
6313 }
6314 }
6315 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6316 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6317 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6318 assert(0); /* NOTREACHED */
c255a977
DM
6319 }
6320 }
6321 /* FALL THROUGH */
3c0563b9 6322
c255a977
DM
6323 case CURLY_B_max_fail:
6324 /* failed to find B in a greedy match */
c255a977
DM
6325
6326 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6327 if (ST.paren) {
6328 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6329 }
c255a977
DM
6330 /* back up. */
6331 if (--ST.count < ST.min)
6332 sayNO;
eb72505d 6333 locinput = HOPc(locinput, -1);
c255a977
DM
6334 goto curly_try_B_max;
6335
6336#undef ST
6337
3c0563b9 6338 case END: /* last op of main pattern */
6bda09f9 6339 fake_end:
faec1544
DM
6340 if (cur_eval) {
6341 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544 6342
288b8c02 6343 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6344
6345 /* Save *all* the positions. */
6346 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b 6347 rex_sv = cur_eval->u.eval.prev_rex;
aed7b151 6348 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6349 SET_reg_curpm(rex_sv);
8d919b0a 6350 rex = ReANY(rex_sv);
f8fc2ecf 6351 rexi = RXi_GET(rex);
faec1544 6352 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6353
faec1544 6354 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6355
6356 /* Restore parens of the outer rex without popping the
6357 * savestack */
92da3157
DM
6358 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6359 &maxopenparen);
faec1544
DM
6360
6361 st->u.eval.prev_eval = cur_eval;
6362 cur_eval = cur_eval->u.eval.prev_eval;
6363 DEBUG_EXECUTE_r(
2a49f0f5
JH
6364 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6365 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6366 if ( nochange_depth )
6367 nochange_depth--;
6368
4d5016e5
DM
6369 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6370 locinput); /* match B */
faec1544
DM
6371 }
6372
3b0527fe 6373 if (locinput < reginfo->till) {
a3621e74 6374 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6375 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6376 PL_colors[4],
6d59b646
DM
6377 (long)(locinput - startpos),
6378 (long)(reginfo->till - startpos),
7821416a 6379 PL_colors[5]));
58e23c8d 6380
262b90c4 6381 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6382 }
262b90c4 6383 sayYES; /* Success! */
dad79028
DM
6384
6385 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6386 DEBUG_EXECUTE_r(
6387 PerlIO_printf(Perl_debug_log,
6388 "%*s %ssubpattern success...%s\n",
5bc10b2c 6389 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6390 sayYES; /* Success! */
dad79028 6391
40a82448
DM
6392#undef ST
6393#define ST st->u.ifmatch
6394
37f53970
DM
6395 {
6396 char *newstart;
6397
40a82448
DM
6398 case SUSPEND: /* (?>A) */
6399 ST.wanted = 1;
37f53970 6400 newstart = locinput;
9041c2e3 6401 goto do_ifmatch;
dad79028 6402
40a82448
DM
6403 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6404 ST.wanted = 0;
dad79028
DM
6405 goto ifmatch_trivial_fail_test;
6406
40a82448
DM
6407 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6408 ST.wanted = 1;
dad79028 6409 ifmatch_trivial_fail_test:
a0ed51b3 6410 if (scan->flags) {
52657f30 6411 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6412 if (!s) {
6413 /* trivial fail */
24d3c4a9
DM
6414 if (logical) {
6415 logical = 0;
f2338a2e 6416 sw = 1 - cBOOL(ST.wanted);
dad79028 6417 }
40a82448 6418 else if (ST.wanted)
dad79028
DM
6419 sayNO;
6420 next = scan + ARG(scan);
6421 if (next == scan)
6422 next = NULL;
6423 break;
6424 }
37f53970 6425 newstart = s;
a0ed51b3
LW
6426 }
6427 else
37f53970 6428 newstart = locinput;
a0ed51b3 6429
c277df42 6430 do_ifmatch:
40a82448 6431 ST.me = scan;
24d3c4a9 6432 ST.logical = logical;
24d786f4
YO
6433 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6434
40a82448 6435 /* execute body of (?...A) */
37f53970 6436 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6437 assert(0); /* NOTREACHED */
37f53970 6438 }
40a82448
DM
6439
6440 case IFMATCH_A_fail: /* body of (?...A) failed */
6441 ST.wanted = !ST.wanted;
6442 /* FALL THROUGH */
6443
6444 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6445 if (ST.logical) {
f2338a2e 6446 sw = cBOOL(ST.wanted);
40a82448
DM
6447 }
6448 else if (!ST.wanted)
6449 sayNO;
6450
37f53970
DM
6451 if (OP(ST.me) != SUSPEND) {
6452 /* restore old position except for (?>...) */
6453 locinput = st->locinput;
40a82448
DM
6454 }
6455 scan = ST.me + ARG(ST.me);
6456 if (scan == ST.me)
6457 scan = NULL;
6458 continue; /* execute B */
6459
6460#undef ST
dad79028 6461
3c0563b9
DM
6462 case LONGJMP: /* alternative with many branches compiles to
6463 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6464 next = scan + ARG(scan);
6465 if (next == scan)
6466 next = NULL;
a0d0e21e 6467 break;
3c0563b9
DM
6468
6469 case COMMIT: /* (*COMMIT) */
220db18a 6470 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 6471 /* FALLTHROUGH */
3c0563b9
DM
6472
6473 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6474 if (!scan->flags)
ad64d0ec 6475 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6476 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6477 assert(0); /* NOTREACHED */
3c0563b9 6478
54612592
YO
6479 case COMMIT_next_fail:
6480 no_final = 1;
6481 /* FALLTHROUGH */
3c0563b9
DM
6482
6483 case OPFAIL: /* (*FAIL) */
7f69552c 6484 sayNO;
118e2215 6485 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6486
6487#define ST st->u.mark
3c0563b9 6488 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6489 ST.prev_mark = mark_state;
5d458dd8 6490 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6491 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6492 mark_state = st;
4d5016e5
DM
6493 ST.mark_loc = locinput;
6494 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6495 assert(0); /* NOTREACHED */
3c0563b9 6496
e2e6a0f1
YO
6497 case MARKPOINT_next:
6498 mark_state = ST.prev_mark;
6499 sayYES;
118e2215 6500 assert(0); /* NOTREACHED */
3c0563b9 6501
e2e6a0f1 6502 case MARKPOINT_next_fail:
5d458dd8 6503 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6504 {
6505 if (ST.mark_loc > startpoint)
6506 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6507 popmark = NULL; /* we found our mark */
6508 sv_commit = ST.mark_name;
6509
6510 DEBUG_EXECUTE_r({
5d458dd8 6511 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6512 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6513 REPORT_CODE_OFF+depth*2, "",
be2597df 6514 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6515 });
6516 }
6517 mark_state = ST.prev_mark;
5d458dd8
YO
6518 sv_yes_mark = mark_state ?
6519 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6520 sayNO;
118e2215 6521 assert(0); /* NOTREACHED */
3c0563b9
DM
6522
6523 case SKIP: /* (*SKIP) */
5d458dd8 6524 if (scan->flags) {
2bf803e2 6525 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6526 ST.mark_name = NULL;
e2e6a0f1 6527 ST.mark_loc = locinput;
4d5016e5 6528 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6529 } else {
2bf803e2 6530 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6531 otherwise do nothing. Meaning we need to scan
6532 */
6533 regmatch_state *cur = mark_state;
ad64d0ec 6534 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6535
6536 while (cur) {
6537 if ( sv_eq( cur->u.mark.mark_name,
6538 find ) )
6539 {
6540 ST.mark_name = find;
4d5016e5 6541 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6542 }
6543 cur = cur->u.mark.prev_mark;
6544 }
e2e6a0f1 6545 }
2bf803e2 6546 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6547 break;
3c0563b9 6548
5d458dd8
YO
6549 case SKIP_next_fail:
6550 if (ST.mark_name) {
6551 /* (*CUT:NAME) - Set up to search for the name as we
6552 collapse the stack*/
6553 popmark = ST.mark_name;
6554 } else {
6555 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6556 if (ST.mark_loc > startpoint)
6557 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6558 /* but we set sv_commit to latest mark_name if there
6559 is one so they can test to see how things lead to this
6560 cut */
6561 if (mark_state)
6562 sv_commit=mark_state->u.mark.mark_name;
6563 }
e2e6a0f1
YO
6564 no_final = 1;
6565 sayNO;
118e2215 6566 assert(0); /* NOTREACHED */
e2e6a0f1 6567#undef ST
3c0563b9
DM
6568
6569 case LNBREAK: /* \R */
220db18a 6570 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 6571 locinput += n;
e1d1eefb
YO
6572 } else
6573 sayNO;
6574 break;
6575
a0d0e21e 6576 default:
b900a521 6577 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6578 PTR2UV(scan), OP(scan));
cea2e8a9 6579 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6580
6581 /* this is a point to jump to in order to increment
6582 * locinput by one character */
6583 increment_locinput:
e6ca698c 6584 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6585 if (utf8_target) {
6586 locinput += PL_utf8skip[nextchr];
7016d6eb 6587 /* locinput is allowed to go 1 char off the end, but not 2+ */
220db18a 6588 if (locinput > reginfo->strend)
28b98f76 6589 sayNO;
28b98f76
DM
6590 }
6591 else
3640db6b 6592 locinput++;
28b98f76 6593 break;
5d458dd8
YO
6594
6595 } /* end switch */
95b24440 6596
5d458dd8
YO
6597 /* switch break jumps here */
6598 scan = next; /* prepare to execute the next op and ... */
6599 continue; /* ... jump back to the top, reusing st */
118e2215 6600 assert(0); /* NOTREACHED */
95b24440 6601
40a82448
DM
6602 push_yes_state:
6603 /* push a state that backtracks on success */
6604 st->u.yes.prev_yes_state = yes_state;
6605 yes_state = st;
6606 /* FALL THROUGH */
6607 push_state:
6608 /* push a new regex state, then continue at scan */
6609 {
6610 regmatch_state *newst;
6611
24b23f37
YO
6612 DEBUG_STACK_r({
6613 regmatch_state *cur = st;
6614 regmatch_state *curyes = yes_state;
6615 int curd = depth;
6616 regmatch_slab *slab = PL_regmatch_slab;
6617 for (;curd > -1;cur--,curd--) {
6618 if (cur < SLAB_FIRST(slab)) {
6619 slab = slab->prev;
6620 cur = SLAB_LAST(slab);
6621 }
6622 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6623 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6624 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6625 (curyes == cur) ? "yes" : ""
6626 );
6627 if (curyes == cur)
6628 curyes = cur->u.yes.prev_yes_state;
6629 }
6630 } else
6631 DEBUG_STATE_pp("push")
6632 );
40a82448 6633 depth++;
40a82448
DM
6634 st->locinput = locinput;
6635 newst = st+1;
6636 if (newst > SLAB_LAST(PL_regmatch_slab))
6637 newst = S_push_slab(aTHX);
6638 PL_regmatch_state = newst;
786e8c11 6639
4d5016e5 6640 locinput = pushinput;
40a82448
DM
6641 st = newst;
6642 continue;
118e2215 6643 assert(0); /* NOTREACHED */
40a82448 6644 }
a0d0e21e 6645 }
a687059c 6646
a0d0e21e
LW
6647 /*
6648 * We get here only if there's trouble -- normally "case END" is
6649 * the terminating point.
6650 */
cea2e8a9 6651 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6652 /*NOTREACHED*/
4633a7c4
LW
6653 sayNO;
6654
262b90c4 6655yes:
77cb431f
DM
6656 if (yes_state) {
6657 /* we have successfully completed a subexpression, but we must now
6658 * pop to the state marked by yes_state and continue from there */
77cb431f 6659 assert(st != yes_state);
5bc10b2c
DM
6660#ifdef DEBUGGING
6661 while (st != yes_state) {
6662 st--;
6663 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6664 PL_regmatch_slab = PL_regmatch_slab->prev;
6665 st = SLAB_LAST(PL_regmatch_slab);
6666 }
e2e6a0f1 6667 DEBUG_STATE_r({
54612592
YO
6668 if (no_final) {
6669 DEBUG_STATE_pp("pop (no final)");
6670 } else {
6671 DEBUG_STATE_pp("pop (yes)");
6672 }
e2e6a0f1 6673 });
5bc10b2c
DM
6674 depth--;
6675 }
6676#else
77cb431f
DM
6677 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6678 || yes_state > SLAB_LAST(PL_regmatch_slab))
6679 {
6680 /* not in this slab, pop slab */
6681 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6682 PL_regmatch_slab = PL_regmatch_slab->prev;
6683 st = SLAB_LAST(PL_regmatch_slab);
6684 }
6685 depth -= (st - yes_state);
5bc10b2c 6686#endif
77cb431f
DM
6687 st = yes_state;
6688 yes_state = st->u.yes.prev_yes_state;
6689 PL_regmatch_state = st;
24b23f37 6690
3640db6b 6691 if (no_final)
5d458dd8 6692 locinput= st->locinput;
54612592 6693 state_num = st->resume_state + no_final;
24d3c4a9 6694 goto reenter_switch;
77cb431f
DM
6695 }
6696
a3621e74 6697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6698 PL_colors[4], PL_colors[5]));
02db2b7b 6699
bf2039a9 6700 if (reginfo->info_aux_eval) {
19b95bf0
DM
6701 /* each successfully executed (?{...}) block does the equivalent of
6702 * local $^R = do {...}
6703 * When popping the save stack, all these locals would be undone;
6704 * bypass this by setting the outermost saved $^R to the latest
6705 * value */
6706 if (oreplsv != GvSV(PL_replgv))
6707 sv_setsv(oreplsv, GvSV(PL_replgv));
6708 }
95b24440 6709 result = 1;
aa283a38 6710 goto final_exit;
4633a7c4
LW
6711
6712no:
a3621e74 6713 DEBUG_EXECUTE_r(
7821416a 6714 PerlIO_printf(Perl_debug_log,
786e8c11 6715 "%*s %sfailed...%s\n",
5bc10b2c 6716 REPORT_CODE_OFF+depth*2, "",
786e8c11 6717 PL_colors[4], PL_colors[5])
7821416a 6718 );
aa283a38 6719
262b90c4 6720no_silent:
54612592
YO
6721 if (no_final) {
6722 if (yes_state) {
6723 goto yes;
6724 } else {
6725 goto final_exit;
6726 }
6727 }
aa283a38
DM
6728 if (depth) {
6729 /* there's a previous state to backtrack to */
40a82448
DM
6730 st--;
6731 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6732 PL_regmatch_slab = PL_regmatch_slab->prev;
6733 st = SLAB_LAST(PL_regmatch_slab);
6734 }
6735 PL_regmatch_state = st;
40a82448 6736 locinput= st->locinput;
40a82448 6737
5bc10b2c
DM
6738 DEBUG_STATE_pp("pop");
6739 depth--;
262b90c4
DM
6740 if (yes_state == st)
6741 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6742
24d3c4a9
DM
6743 state_num = st->resume_state + 1; /* failure = success + 1 */
6744 goto reenter_switch;
95b24440 6745 }
24d3c4a9 6746 result = 0;
aa283a38 6747
262b90c4 6748 final_exit:
bbe252da 6749 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6750 SV *sv_err = get_sv("REGERROR", 1);
6751 SV *sv_mrk = get_sv("REGMARK", 1);
6752 if (result) {
e2e6a0f1 6753 sv_commit = &PL_sv_no;
5d458dd8
YO
6754 if (!sv_yes_mark)
6755 sv_yes_mark = &PL_sv_yes;
6756 } else {
6757 if (!sv_commit)
6758 sv_commit = &PL_sv_yes;
6759 sv_yes_mark = &PL_sv_no;
6760 }
6761 sv_setsv(sv_err, sv_commit);
6762 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6763 }
19b95bf0 6764
81ed78b2
DM
6765
6766 if (last_pushed_cv) {
6767 dSP;
6768 POP_MULTICALL;
4f8dbb2d 6769 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6770 }
6771
9d9163fb
DM
6772 assert(!result || locinput - reginfo->strbeg >= 0);
6773 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
6774}
6775
6776/*
6777 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6778 *
e64f369d
KW
6779 * What 'simple' means is a node which can be the operand of a quantifier like
6780 * '+', or {1,3}
6781 *
d60de1d1
DM
6782 * startposp - pointer a pointer to the start position. This is updated
6783 * to point to the byte following the highest successful
6784 * match.
6785 * p - the regnode to be repeatedly matched against.
220db18a 6786 * reginfo - struct holding match state, such as strend
4063ade8 6787 * max - maximum number of things to match.
d60de1d1 6788 * depth - (for debugging) backtracking depth.
a687059c 6789 */
76e3520e 6790STATIC I32
272d35c9 6791S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
f9176b44 6792 regmatch_info *const reginfo, I32 max, int depth)
a687059c 6793{
27da23d5 6794 dVAR;
4063ade8 6795 char *scan; /* Pointer to current position in target string */
eb578fdb 6796 I32 c;
220db18a 6797 char *loceol = reginfo->strend; /* local version */
4063ade8 6798 I32 hardcount = 0; /* How many matches so far */
ba44c216 6799 bool utf8_target = reginfo->is_utf8_target;
3018b823 6800 int to_complement = 0; /* Invert the result? */
d513472c 6801 UV utf8_flags;
3018b823 6802 _char_class_number classnum;
4f55667c
SP
6803#ifndef DEBUGGING
6804 PERL_UNUSED_ARG(depth);
6805#endif
a0d0e21e 6806
7918f24d
NC
6807 PERL_ARGS_ASSERT_REGREPEAT;
6808
f73aaa43 6809 scan = *startposp;
faf11cac
HS
6810 if (max == REG_INFTY)
6811 max = I32_MAX;
dfb8f192 6812 else if (! utf8_target && loceol - scan > max)
7f596f4c 6813 loceol = scan + max;
4063ade8
KW
6814
6815 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6816 * to the maximum of how far we should go in it (leaving it set to the real
6817 * end, if the maximum permissible would take us beyond that). This allows
6818 * us to make the loop exit condition that we haven't gone past <loceol> to
6819 * also mean that we haven't exceeded the max permissible count, saving a
6820 * test each time through the loop. But it assumes that the OP matches a
6821 * single byte, which is true for most of the OPs below when applied to a
6822 * non-UTF-8 target. Those relatively few OPs that don't have this
6823 * characteristic will have to compensate.
6824 *
6825 * There is no adjustment for UTF-8 targets, as the number of bytes per
6826 * character varies. OPs will have to test both that the count is less
6827 * than the max permissible (using <hardcount> to keep track), and that we
6828 * are still within the bounds of the string (using <loceol>. A few OPs
6829 * match a single byte no matter what the encoding. They can omit the max
6830 * test if, for the UTF-8 case, they do the adjustment that was skipped
6831 * above.
6832 *
6833 * Thus, the code above sets things up for the common case; and exceptional
6834 * cases need extra work; the common case is to make sure <scan> doesn't
6835 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6836 * count doesn't exceed the maximum permissible */
6837
a0d0e21e 6838 switch (OP(p)) {
22c35a8c 6839 case REG_ANY:
f2ed9b32 6840 if (utf8_target) {
1aa99e6b 6841 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6842 scan += UTF8SKIP(scan);
6843 hardcount++;
6844 }
6845 } else {
6846 while (scan < loceol && *scan != '\n')
6847 scan++;
a0ed51b3
LW
6848 }
6849 break;
ffc61ed2 6850 case SANY:
f2ed9b32 6851 if (utf8_target) {
a0804c9e 6852 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6853 scan += UTF8SKIP(scan);
6854 hardcount++;
6855 }
6856 }
6857 else
6858 scan = loceol;
a0ed51b3 6859 break;
4063ade8 6860 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
9597860a 6861 if (utf8_target && loceol - scan > max) {
4063ade8
KW
6862
6863 /* <loceol> hadn't been adjusted in the UTF-8 case */
6864 scan += max;
6865 }
6866 else {
6867 scan = loceol;
6868 }
f33976b4 6869 break;
59d32103 6870 case EXACT:
f9176b44 6871 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6872
59d32103 6873 c = (U8)*STRING(p);
59d32103 6874
5e4a1da1
KW
6875 /* Can use a simple loop if the pattern char to match on is invariant
6876 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6877 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6878 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 6879 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 6880 if (utf8_target && loceol - scan > max) {
4063ade8
KW
6881 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6882 * since here, to match at all, 1 char == 1 byte */
6883 loceol = scan + max;
6884 }
59d32103
KW
6885 while (scan < loceol && UCHARAT(scan) == c) {
6886 scan++;
6887 }
6888 }
f9176b44 6889 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
6890 if (utf8_target) {
6891 STRLEN scan_char_len;
5e4a1da1 6892
4063ade8 6893 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6894 * string EQ */
6895 while (hardcount < max
9a902117
KW
6896 && scan < loceol
6897 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
6898 && memEQ(scan, STRING(p), scan_char_len))
6899 {
4200a00c 6900 scan += scan_char_len;
5e4a1da1
KW
6901 hardcount++;
6902 }
6903 }
6904 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6905
5e4a1da1
KW
6906 /* Target isn't utf8; convert the character in the UTF-8
6907 * pattern to non-UTF8, and do a simple loop */
94bb8c36 6908 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
5e4a1da1
KW
6909 while (scan < loceol && UCHARAT(scan) == c) {
6910 scan++;
6911 }
6912 } /* else pattern char is above Latin1, can't possibly match the
6913 non-UTF-8 target */
b40a2c17 6914 }
5e4a1da1 6915 else {
59d32103 6916
5e4a1da1
KW
6917 /* Here, the string must be utf8; pattern isn't, and <c> is
6918 * different in utf8 than not, so can't compare them directly.
6919 * Outside the loop, find the two utf8 bytes that represent c, and
6920 * then look for those in sequence in the utf8 string */
59d32103
KW
6921 U8 high = UTF8_TWO_BYTE_HI(c);
6922 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6923
6924 while (hardcount < max
6925 && scan + 1 < loceol
6926 && UCHARAT(scan) == high
6927 && UCHARAT(scan + 1) == low)
6928 {
6929 scan += 2;
6930 hardcount++;
6931 }
6932 }
6933 break;
5e4a1da1 6934
098b07d5
KW
6935 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
6936 assert(! reginfo->is_utf8_pat);
6937 /* FALL THROUGH */
2f7f8cb1 6938 case EXACTFA:
098b07d5 6939 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
6940 goto do_exactf;
6941
d4e0b827 6942 case EXACTFL:
272d35c9 6943 RXp_MATCH_TAINTED_on(prog);
17580e7a
KW
6944 utf8_flags = FOLDEQ_UTF8_LOCALE;
6945 goto do_exactf;
6946
2fdb7295
KW
6947 case EXACTF: /* This node only generated for non-utf8 patterns */
6948 assert(! reginfo->is_utf8_pat);
098b07d5
KW
6949 utf8_flags = 0;
6950 goto do_exactf;
62bf7766 6951
3c760661 6952 case EXACTFU_SS:
9a5a5549 6953 case EXACTFU:
f9176b44 6954 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6955
613a425d
KW
6956 do_exactf: {
6957 int c1, c2;
6958 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6959
f9176b44 6960 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6961
984e6dd1 6962 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 6963 reginfo))
984e6dd1 6964 {
613a425d 6965 if (c1 == CHRTEST_VOID) {
49b95fad 6966 /* Use full Unicode fold matching */
220db18a 6967 char *tmpeol = reginfo->strend;
f9176b44 6968 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
6969 while (hardcount < max
6970 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6971 STRING(p), NULL, pat_len,
f9176b44 6972 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
6973 {
6974 scan = tmpeol;
220db18a 6975 tmpeol = reginfo->strend;
49b95fad
KW
6976 hardcount++;
6977 }
613a425d
KW
6978 }
6979 else if (utf8_target) {
6980 if (c1 == c2) {
4063ade8
KW
6981 while (scan < loceol
6982 && hardcount < max
613a425d
KW
6983 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6984 {
6985 scan += UTF8SKIP(scan);
6986 hardcount++;
6987 }
6988 }
6989 else {
4063ade8
KW
6990 while (scan < loceol
6991 && hardcount < max
613a425d
KW
6992 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6993 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6994 {
6995 scan += UTF8SKIP(scan);
6996 hardcount++;
6997 }
6998 }
6999 }
7000 else if (c1 == c2) {
7001 while (scan < loceol && UCHARAT(scan) == c1) {
7002 scan++;
7003 }
7004 }
7005 else {
7006 while (scan < loceol &&
7007 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7008 {
7009 scan++;
7010 }
7011 }
634c83a2 7012 }
bbce6d69 7013 break;
613a425d 7014 }
a0d0e21e 7015 case ANYOF:
e0193e47 7016 if (utf8_target) {
4e8910e0 7017 while (hardcount < max
9a902117 7018 && scan < loceol
3db24e1e 7019 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
4e8910e0 7020 {
9a902117 7021 scan += UTF8SKIP(scan);
ffc61ed2
JH
7022 hardcount++;
7023 }
7024 } else {
32fc9b6a 7025 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
7026 scan++;
7027 }
a0d0e21e 7028 break;
4063ade8 7029
3018b823 7030 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 7031
3018b823
KW
7032 case NPOSIXL:
7033 to_complement = 1;
7034 /* FALLTHROUGH */
980866de 7035
3018b823 7036 case POSIXL:
272d35c9 7037 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
7038 if (! utf8_target) {
7039 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7040 *scan)))
a12cf05f 7041 {
3018b823
KW
7042 scan++;
7043 }
7044 } else {
7045 while (hardcount < max && scan < loceol
7046 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7047 (U8 *) scan)))
7048 {
7049 scan += UTF8SKIP(scan);
ffc61ed2
JH
7050 hardcount++;
7051 }
a0ed51b3
LW
7052 }
7053 break;
0658cdde 7054
3018b823
KW
7055 case POSIXD:
7056 if (utf8_target) {
7057 goto utf8_posix;
7058 }
7059 /* FALLTHROUGH */
7060
0658cdde 7061 case POSIXA:
0430522f 7062 if (utf8_target && loceol - scan > max) {
4063ade8 7063
7aee35ff
KW
7064 /* We didn't adjust <loceol> at the beginning of this routine
7065 * because is UTF-8, but it is actually ok to do so, since here, to
7066 * match, 1 char == 1 byte. */
4063ade8
KW
7067 loceol = scan + max;
7068 }
7069 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
7070 scan++;
7071 }
7072 break;
980866de 7073
3018b823
KW
7074 case NPOSIXD:
7075 if (utf8_target) {
7076 to_complement = 1;
7077 goto utf8_posix;
7078 }
7079 /* FALL THROUGH */
980866de 7080
3018b823
KW
7081 case NPOSIXA:
7082 if (! utf8_target) {
7083 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
7084 scan++;
7085 }
4063ade8 7086 }
3018b823 7087 else {
980866de 7088
3018b823
KW
7089 /* The complement of something that matches only ASCII matches all
7090 * UTF-8 variant code points, plus everything in ASCII that isn't
7091 * in the class. */
bedac28b 7092 while (hardcount < max && scan < loceol
3018b823
KW
7093 && (! UTF8_IS_INVARIANT(*scan)
7094 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 7095 {
3018b823 7096 scan += UTF8SKIP(scan);
ffc61ed2
JH
7097 hardcount++;
7098 }
3018b823
KW
7099 }
7100 break;
980866de 7101
3018b823
KW
7102 case NPOSIXU:
7103 to_complement = 1;
7104 /* FALLTHROUGH */
7105
7106 case POSIXU:
7107 if (! utf8_target) {
7108 while (scan < loceol && to_complement
7109 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 7110 {
3018b823
KW
7111 scan++;
7112 }
cfaf538b
KW
7113 }
7114 else {
3018b823
KW
7115 utf8_posix:
7116 classnum = (_char_class_number) FLAGS(p);
7117 if (classnum < _FIRST_NON_SWASH_CC) {
7118
7119 /* Here, a swash is needed for above-Latin1 code points.
7120 * Process as many Latin1 code points using the built-in rules.
7121 * Go to another loop to finish processing upon encountering
7122 * the first Latin1 code point. We could do that in this loop
7123 * as well, but the other way saves having to test if the swash
7124 * has been loaded every time through the loop: extra space to
7125 * save a test. */
7126 while (hardcount < max && scan < loceol) {
7127 if (UTF8_IS_INVARIANT(*scan)) {
7128 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7129 classnum))))
7130 {
7131 break;
7132 }
7133 scan++;
7134 }
7135 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7136 if (! (to_complement
94bb8c36
KW
7137 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7138 *(scan + 1)),
3018b823
KW
7139 classnum))))
7140 {
7141 break;
7142 }
7143 scan += 2;
7144 }
7145 else {
7146 goto found_above_latin1;
7147 }
7148
7149 hardcount++;
7150 }
7151 }
7152 else {
7153 /* For these character classes, the knowledge of how to handle
7154 * every code point is compiled in to Perl via a macro. This
7155 * code is written for making the loops as tight as possible.
7156 * It could be refactored to save space instead */
7157 switch (classnum) {
7158 case _CC_ENUM_SPACE: /* XXX would require separate code
7159 if we revert the change of \v
7160 matching this */
7161 /* FALL THROUGH */
7162 case _CC_ENUM_PSXSPC:
7163 while (hardcount < max
7164 && scan < loceol
7165 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7166 {
7167 scan += UTF8SKIP(scan);
7168 hardcount++;
7169 }
7170 break;
7171 case _CC_ENUM_BLANK:
7172 while (hardcount < max
7173 && scan < loceol
7174 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7175 {
7176 scan += UTF8SKIP(scan);
7177 hardcount++;
7178 }
7179 break;
7180 case _CC_ENUM_XDIGIT:
7181 while (hardcount < max
7182 && scan < loceol
7183 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7184 {
7185 scan += UTF8SKIP(scan);
7186 hardcount++;
7187 }
7188 break;
7189 case _CC_ENUM_VERTSPACE:
7190 while (hardcount < max
7191 && scan < loceol
7192 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7193 {
7194 scan += UTF8SKIP(scan);
7195 hardcount++;
7196 }
7197 break;
7198 case _CC_ENUM_CNTRL:
7199 while (hardcount < max
7200 && scan < loceol
7201 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7202 {
7203 scan += UTF8SKIP(scan);
7204 hardcount++;
7205 }
7206 break;
7207 default:
7208 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7209 }
7210 }
a0ed51b3 7211 }
3018b823 7212 break;
4063ade8 7213
3018b823
KW
7214 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7215
7216 /* Load the swash if not already present */
7217 if (! PL_utf8_swash_ptrs[classnum]) {
7218 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7219 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
2a16ac92
KW
7220 "utf8",
7221 "",
7222 &PL_sv_undef, 1, 0,
7223 PL_XPosix_ptrs[classnum], &flags);
4063ade8 7224 }
3018b823
KW
7225
7226 while (hardcount < max && scan < loceol
7227 && to_complement ^ cBOOL(_generic_utf8(
7228 classnum,
7229 scan,
7230 swash_fetch(PL_utf8_swash_ptrs[classnum],
7231 (U8 *) scan,
7232 TRUE))))
7233 {
7234 scan += UTF8SKIP(scan);
7235 hardcount++;
7236 }
7237 break;
7238
e1d1eefb 7239 case LNBREAK:
e64f369d
KW
7240 if (utf8_target) {
7241 while (hardcount < max && scan < loceol &&
7242 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7243 scan += c;
7244 hardcount++;
7245 }
7246 } else {
7247 /* LNBREAK can match one or two latin chars, which is ok, but we
7248 * have to use hardcount in this situation, and throw away the
7249 * adjustment to <loceol> done before the switch statement */
220db18a 7250 loceol = reginfo->strend;
e64f369d
KW
7251 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7252 scan+=c;
7253 hardcount++;
7254 }
7255 }
7256 break;
e1d1eefb 7257
584b1f02
KW
7258 case BOUND:
7259 case BOUNDA:
7260 case BOUNDL:
7261 case BOUNDU:
7262 case EOS:
7263 case GPOS:
7264 case KEEPS:
7265 case NBOUND:
7266 case NBOUNDA:
7267 case NBOUNDL:
7268 case NBOUNDU:
7269 case OPFAIL:
7270 case SBOL:
7271 case SEOL:
7272 /* These are all 0 width, so match right here or not at all. */
7273 break;
7274
7275 default:
7276 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7277 assert(0); /* NOTREACHED */
7278
a0d0e21e 7279 }
a687059c 7280
a0ed51b3
LW
7281 if (hardcount)
7282 c = hardcount;
7283 else
f73aaa43
DM
7284 c = scan - *startposp;
7285 *startposp = scan;
a687059c 7286
a3621e74 7287 DEBUG_r({
e68ec53f 7288 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7289 DEBUG_EXECUTE_r({
e68ec53f
YO
7290 SV * const prop = sv_newmortal();
7291 regprop(prog, prop, p);
7292 PerlIO_printf(Perl_debug_log,
be8e71aa 7293 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7294 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7295 });
be8e71aa 7296 });
9041c2e3 7297
a0d0e21e 7298 return(c);
a687059c
LW
7299}
7300
c277df42 7301
be8e71aa 7302#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7303/*
6c6525b8 7304- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7305create a copy so that changes the caller makes won't change the shared one.
7306If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7307 */
ffc61ed2 7308SV *
5aaab254 7309Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7310{
6c6525b8 7311 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7312
7313 if (altsvp) {
7314 *altsvp = NULL;
7315 }
7316
7317 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7318}
7319#endif
7320
7321STATIC SV *
5aaab254 7322S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7323{
8c9eb58f 7324 /* Returns the swash for the input 'node' in the regex 'prog'.
a498e2f8 7325 * If <doinit> is 'true', will attempt to create the swash if not already
8c9eb58f 7326 * done.
a498e2f8
KW
7327 * If <listsvp> is non-null, will return the printable contents of the
7328 * swash. This can be used to get debugging information even before the
7329 * swash exists, by calling this function with 'doinit' set to false, in
7330 * which case the components that will be used to eventually create the
7331 * swash are returned (in a printable form).
8c9eb58f
KW
7332 * Tied intimately to how regcomp.c sets up the data structure */
7333
97aff369 7334 dVAR;
9e55ce06 7335 SV *sw = NULL;
a498e2f8 7336 SV *si = NULL; /* Input swash initialization string */
7a6c6baa
KW
7337 SV* invlist = NULL;
7338
f8fc2ecf
YO
7339 RXi_GET_DECL(prog,progi);
7340 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7341
6c6525b8 7342 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7343
ccb2541c
KW
7344 assert(ANYOF_NONBITMAP(node));
7345
4f639d21 7346 if (data && data->count) {
a3b680e6 7347 const U32 n = ARG(node);
ffc61ed2 7348
4f639d21 7349 if (data->what[n] == 's') {
ad64d0ec
NC
7350 SV * const rv = MUTABLE_SV(data->data[n]);
7351 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7352 SV **const ary = AvARRAY(av);
87367d5f 7353 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7354
8c9eb58f 7355 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7356
88675427
KW
7357 /* Elements 2 and 3 are either both present or both absent. [2] is
7358 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7359 * that inversion list has any user-defined properties in it. */
88675427
KW
7360 if (av_len(av) >= 2) {
7361 invlist = ary[2];
7362 if (SvUV(ary[3])) {
83199d38
KW
7363 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7364 }
7a6c6baa
KW
7365 }
7366 else {
7367 invlist = NULL;
7a6c6baa
KW
7368 }
7369
8c9eb58f
KW
7370 /* Element [1] is reserved for the set-up swash. If already there,
7371 * return it; if not, create it and store it there */
ce0d59fd 7372 if (ary[1] && SvROK(ary[1])) {
f192cf32
KW
7373 sw = ary[1];
7374 }
ffc61ed2 7375 else if (si && doinit) {
7a6c6baa
KW
7376
7377 sw = _core_swash_init("utf8", /* the utf8 package */
7378 "", /* nameless */
7379 si,
7380 1, /* binary */
7381 0, /* not from tr/// */
7a6c6baa 7382 invlist,
83199d38 7383 &swash_init_flags);
ffc61ed2
JH
7384 (void)av_store(av, 1, sw);
7385 }
7386 }
7387 }
7388
a498e2f8 7389 /* If requested, return a printable version of what this swash matches */
7a6c6baa
KW
7390 if (listsvp) {
7391 SV* matches_string = newSVpvn("", 0);
7a6c6baa 7392
a498e2f8
KW
7393 /* The swash should be used, if possible, to get the data, as it
7394 * contains the resolved data. But this function can be called at
7395 * compile-time, before everything gets resolved, in which case we
7396 * return the currently best available information, which is the string
7397 * that will eventually be used to do that resolving, 'si' */
872dd7e0
KW
7398 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7399 && (si && si != &PL_sv_undef))
7400 {
7a6c6baa
KW
7401 sv_catsv(matches_string, si);
7402 }
7403
7404 /* Add the inversion list to whatever we have. This may have come from
7405 * the swash, or from an input parameter */
7406 if (invlist) {
7407 sv_catsv(matches_string, _invlist_contents(invlist));
7408 }
7409 *listsvp = matches_string;
7410 }
7411
ffc61ed2
JH
7412 return sw;
7413}
7414
7415/*
ba7b4546 7416 - reginclass - determine if a character falls into a character class
832705d4 7417
6698fab5
KW
7418 n is the ANYOF regnode
7419 p is the target string
3db24e1e 7420 p_end points to one byte beyond the end of the target string
6698fab5 7421 utf8_target tells whether p is in UTF-8.
832705d4 7422
635cd5d4 7423 Returns true if matched; false otherwise.
eba1359e 7424
d5788240
KW
7425 Note that this can be a synthetic start class, a combination of various
7426 nodes, so things you think might be mutually exclusive, such as locale,
7427 aren't. It can match both locale and non-locale
7428
bbce6d69 7429 */
7430
76e3520e 7431STATIC bool
3db24e1e 7432S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
bbce6d69 7433{
27da23d5 7434 dVAR;
a3b680e6 7435 const char flags = ANYOF_FLAGS(n);
bbce6d69 7436 bool match = FALSE;
cc07378b 7437 UV c = *p;
1aa99e6b 7438
7918f24d
NC
7439 PERL_ARGS_ASSERT_REGINCLASS;
7440
afd2eb18
KW
7441 /* If c is not already the code point, get it. Note that
7442 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7443 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7444 STRLEN c_len = 0;
3db24e1e 7445 c = utf8n_to_uvchr(p, p_end - p, &c_len,
6182169b
KW
7446 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7447 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7448 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7449 * UTF8_ALLOW_FFFF */
f7ab54c6 7450 if (c_len == (STRLEN)-1)
e8a70c6f 7451 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7452 }
4b3cda86 7453
7cdde544
KW
7454 /* If this character is potentially in the bitmap, check it */
7455 if (c < 256) {
ffc61ed2
JH
7456 if (ANYOF_BITMAP_TEST(n, c))
7457 match = TRUE;
11454c59
KW
7458 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7459 && ! utf8_target
7460 && ! isASCII(c))
7461 {
7462 match = TRUE;
7463 }
78969a98 7464 else if (flags & ANYOF_LOCALE) {
b99851e1
KW
7465 if (flags & ANYOF_LOC_FOLD) {
7466 RXp_MATCH_TAINTED_on(prog);
7467 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7468 match = TRUE;
7469 }
7470 }
ee48a02a 7471 if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
31c7f561
KW
7472
7473 /* The data structure is arranged so bits 0, 2, 4, ... are set
7474 * if the class includes the Posix character class given by
7475 * bit/2; and 1, 3, 5, ... are set if the class includes the
7476 * complemented Posix class given by int(bit/2). So we loop
7477 * through the bits, each time changing whether we complement
7478 * the result or not. Suppose for the sake of illustration
7479 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7480 * is set, it means there is a match for this ANYOF node if the
7481 * character is in the class given by the expression (0 / 2 = 0
7482 * = \w). If it is in that class, isFOO_lc() will return 1,
7483 * and since 'to_complement' is 0, the result will stay TRUE,
7484 * and we exit the loop. Suppose instead that bit 0 is 0, but
7485 * bit 1 is 1. That means there is a match if the character
7486 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7487 * but will on bit 1. On the second iteration 'to_complement'
7488 * will be 1, so the exclusive or will reverse things, so we
7489 * are testing for \W. On the third iteration, 'to_complement'
7490 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7491 * iteration would test for \S, etc.
7492 *
7493 * Note that this code assumes that all the classes are closed
7494 * under folding. For example, if a character matches \w, then
7495 * its fold does too; and vice versa. This should be true for
7496 * any well-behaved locale for all the currently defined Posix
7497 * classes, except for :lower: and :upper:, which are handled
7498 * by the pseudo-class :cased: which matches if either of the
7499 * other two does. To get rid of this assumption, an outer
7500 * loop could be used below to iterate over both the source
7501 * character, and its fold (if different) */
31c7f561
KW
7502
7503 int count = 0;
7504 int to_complement = 0;
522b3c1e
KW
7505
7506 RXp_MATCH_TAINTED_on(prog);
31c7f561 7507 while (count < ANYOF_MAX) {
8efd3f97 7508 if (ANYOF_POSIXL_TEST(n, count)
31c7f561
KW
7509 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7510 {
7511 match = TRUE;
7512 break;
7513 }
7514 count++;
7515 to_complement ^= 1;
7516 }
ffc61ed2 7517 }
a0ed51b3 7518 }
a0ed51b3
LW
7519 }
7520
7cdde544 7521 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7522 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7523 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7524 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7525 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7526 * could be a combination of locale and non-locale nodes. So allow locale
7527 * to match for the synthetic start class, which will give a false
7528 * positive that will be resolved when the match is done again as not part
7529 * of the synthetic start class */
ef87b810 7530 if (!match) {
5c2c7b4d 7531 if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
10ee90d2 7532 match = TRUE; /* Everything above 255 matches */
e051a21d 7533 }
6f8d7d0d
KW
7534 else if (ANYOF_NONBITMAP(n)
7535 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7536 || (utf8_target
7537 && (c >=256
7538 || (! (flags & ANYOF_LOCALE))
9aa1e39f 7539 || OP(n) == ANYOF_SYNTHETIC))))
ef87b810 7540 {
e0193e47 7541 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7542 if (sw) {
7543 U8 * utf8_p;
7544 if (utf8_target) {
7545 utf8_p = (U8 *) p;
e0193e47
KW
7546 } else { /* Convert to utf8 */
7547 STRLEN len = 1;
7cdde544
KW
7548 utf8_p = bytes_to_utf8(p, &len);
7549 }
f56b6394 7550
e0193e47 7551 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7552 match = TRUE;
e0193e47 7553 }
7cdde544
KW
7554
7555 /* If we allocated a string above, free it */
7556 if (! utf8_target) Safefree(utf8_p);
7557 }
7558 }
5073ffbd
KW
7559
7560 if (UNICODE_IS_SUPER(c)
ae986089 7561 && (flags & ANYOF_WARN_SUPER)
5073ffbd
KW
7562 && ckWARN_d(WARN_NON_UNICODE))
7563 {
7564 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2d88a86a 7565 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
5073ffbd 7566 }
7cdde544
KW
7567 }
7568
5dbb0c08
KW
7569#if ANYOF_INVERT != 1
7570 /* Depending on compiler optimization cBOOL takes time, so if don't have to
7571 * use it, don't */
7572# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7573#endif
7574
f0fdc1c9 7575 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
5dbb0c08 7576 return (flags & ANYOF_INVERT) ^ match;
a0ed51b3 7577}
161b471a 7578
dfe13c55 7579STATIC U8 *
ea3daa5d 7580S_reghop3(U8 *s, SSize_t off, const U8* lim)
9041c2e3 7581{
6af86488
KW
7582 /* return the position 'off' UTF-8 characters away from 's', forward if
7583 * 'off' >= 0, backwards if negative. But don't go outside of position
7584 * 'lim', which better be < s if off < 0 */
7585
97aff369 7586 dVAR;
7918f24d
NC
7587
7588 PERL_ARGS_ASSERT_REGHOP3;
7589
a0ed51b3 7590 if (off >= 0) {
1aa99e6b 7591 while (off-- && s < lim) {
ffc61ed2 7592 /* XXX could check well-formedness here */
a0ed51b3 7593 s += UTF8SKIP(s);
ffc61ed2 7594 }
a0ed51b3
LW
7595 }
7596 else {
1de06328
YO
7597 while (off++ && s > lim) {
7598 s--;
7599 if (UTF8_IS_CONTINUED(*s)) {
7600 while (s > lim && UTF8_IS_CONTINUATION(*s))
7601 s--;
a0ed51b3 7602 }
1de06328 7603 /* XXX could check well-formedness here */
a0ed51b3
LW
7604 }
7605 }
7606 return s;
7607}
161b471a 7608
f9f4320a
YO
7609#ifdef XXX_dmq
7610/* there are a bunch of places where we use two reghop3's that should
7611 be replaced with this routine. but since thats not done yet
7612 we ifdef it out - dmq
7613*/
dfe13c55 7614STATIC U8 *
ea3daa5d 7615S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
1de06328
YO
7616{
7617 dVAR;
7918f24d
NC
7618
7619 PERL_ARGS_ASSERT_REGHOP4;
7620
1de06328
YO
7621 if (off >= 0) {
7622 while (off-- && s < rlim) {
7623 /* XXX could check well-formedness here */
7624 s += UTF8SKIP(s);
7625 }
7626 }
7627 else {
7628 while (off++ && s > llim) {
7629 s--;
7630 if (UTF8_IS_CONTINUED(*s)) {
7631 while (s > llim && UTF8_IS_CONTINUATION(*s))
7632 s--;
7633 }
7634 /* XXX could check well-formedness here */
7635 }
7636 }
7637 return s;
7638}
f9f4320a 7639#endif
1de06328
YO
7640
7641STATIC U8 *
ea3daa5d 7642S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
a0ed51b3 7643{
97aff369 7644 dVAR;
7918f24d
NC
7645
7646 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7647
a0ed51b3 7648 if (off >= 0) {
1aa99e6b 7649 while (off-- && s < lim) {
ffc61ed2 7650 /* XXX could check well-formedness here */
a0ed51b3 7651 s += UTF8SKIP(s);
ffc61ed2 7652 }
a0ed51b3 7653 if (off >= 0)
3dab1dad 7654 return NULL;
a0ed51b3
LW
7655 }
7656 else {
1de06328
YO
7657 while (off++ && s > lim) {
7658 s--;
7659 if (UTF8_IS_CONTINUED(*s)) {
7660 while (s > lim && UTF8_IS_CONTINUATION(*s))
7661 s--;
a0ed51b3 7662 }
1de06328 7663 /* XXX could check well-formedness here */
a0ed51b3
LW
7664 }
7665 if (off <= 0)
3dab1dad 7666 return NULL;
a0ed51b3
LW
7667 }
7668 return s;
7669}
51371543 7670
a75351a1
DM
7671
7672/* when executing a regex that may have (?{}), extra stuff needs setting
7673 up that will be visible to the called code, even before the current
7674 match has finished. In particular:
7675
7676 * $_ is localised to the SV currently being matched;
7677 * pos($_) is created if necessary, ready to be updated on each call-out
7678 to code;
7679 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7680 isn't set until the current pattern is successfully finished), so that
7681 $1 etc of the match-so-far can be seen;
7682 * save the old values of subbeg etc of the current regex, and set then
7683 to the current string (again, this is normally only done at the end
7684 of execution)
a75351a1
DM
7685*/
7686
7687static void
7688S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7689{
7690 MAGIC *mg;
7691 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 7692 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 7693
8adc0f72 7694 eval_state->rex = rex;
a75351a1 7695
a75351a1
DM
7696 if (reginfo->sv) {
7697 /* Make $_ available to executed code. */
7698 if (reginfo->sv != DEFSV) {
7699 SAVE_DEFSV;
7700 DEFSV_set(reginfo->sv);
7701 }
7702
96c2a8ff 7703 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 7704 /* prepare for quick setting of pos */
96c2a8ff 7705 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
7706 mg->mg_len = -1;
7707 }
8adc0f72
DM
7708 eval_state->pos_magic = mg;
7709 eval_state->pos = mg->mg_len;
25fdce4a 7710 eval_state->pos_flags = mg->mg_flags;
a75351a1 7711 }
8adc0f72
DM
7712 else
7713 eval_state->pos_magic = NULL;
7714
a75351a1 7715 if (!PL_reg_curpm) {
f65e70f5
DM
7716 /* PL_reg_curpm is a fake PMOP that we can attach the current
7717 * regex to and point PL_curpm at, so that $1 et al are visible
7718 * within a /(?{})/. It's just allocated once per interpreter the
7719 * first time its needed */
a75351a1
DM
7720 Newxz(PL_reg_curpm, 1, PMOP);
7721#ifdef USE_ITHREADS
7722 {
7723 SV* const repointer = &PL_sv_undef;
7724 /* this regexp is also owned by the new PL_reg_curpm, which
7725 will try to free it. */
7726 av_push(PL_regex_padav, repointer);
7727 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7728 PL_regex_pad = AvARRAY(PL_regex_padav);
7729 }
7730#endif
7731 }
7732 SET_reg_curpm(reginfo->prog);
8adc0f72 7733 eval_state->curpm = PL_curpm;
a75351a1
DM
7734 PL_curpm = PL_reg_curpm;
7735 if (RXp_MATCH_COPIED(rex)) {
7736 /* Here is a serious problem: we cannot rewrite subbeg,
7737 since it may be needed if this match fails. Thus
7738 $` inside (?{}) could fail... */
8adc0f72
DM
7739 eval_state->subbeg = rex->subbeg;
7740 eval_state->sublen = rex->sublen;
7741 eval_state->suboffset = rex->suboffset;
a8ee055f 7742 eval_state->subcoffset = rex->subcoffset;
a75351a1 7743#ifdef PERL_ANY_COW
8adc0f72 7744 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
7745#endif
7746 RXp_MATCH_COPIED_off(rex);
7747 }
7748 else
8adc0f72 7749 eval_state->subbeg = NULL;
a75351a1
DM
7750 rex->subbeg = (char *)reginfo->strbeg;
7751 rex->suboffset = 0;
7752 rex->subcoffset = 0;
7753 rex->sublen = reginfo->strend - reginfo->strbeg;
7754}
7755
bf2039a9
DM
7756
7757/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 7758
51371543 7759static void
bf2039a9 7760S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 7761{
97aff369 7762 dVAR;
bf2039a9
DM
7763 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7764 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 7765 regmatch_slab *s;
bf2039a9 7766
2ac8ff4b
DM
7767 Safefree(aux->poscache);
7768
331b2dcc 7769 if (eval_state) {
bf2039a9 7770
331b2dcc 7771 /* undo the effects of S_setup_eval_state() */
bf2039a9 7772
331b2dcc
DM
7773 if (eval_state->subbeg) {
7774 regexp * const rex = eval_state->rex;
7775 rex->subbeg = eval_state->subbeg;
7776 rex->sublen = eval_state->sublen;
7777 rex->suboffset = eval_state->suboffset;
7778 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 7779#ifdef PERL_ANY_COW
331b2dcc 7780 rex->saved_copy = eval_state->saved_copy;
ed252734 7781#endif
331b2dcc
DM
7782 RXp_MATCH_COPIED_on(rex);
7783 }
7784 if (eval_state->pos_magic)
25fdce4a 7785 {
331b2dcc 7786 eval_state->pos_magic->mg_len = eval_state->pos;
25fdce4a
FC
7787 eval_state->pos_magic->mg_flags =
7788 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7789 | (eval_state->pos_flags & MGf_BYTES);
7790 }
331b2dcc
DM
7791
7792 PL_curpm = eval_state->curpm;
8adc0f72 7793 }
bf2039a9 7794
331b2dcc
DM
7795 PL_regmatch_state = aux->old_regmatch_state;
7796 PL_regmatch_slab = aux->old_regmatch_slab;
7797
7798 /* free all slabs above current one - this must be the last action
7799 * of this function, as aux and eval_state are allocated within
7800 * slabs and may be freed here */
7801
7802 s = PL_regmatch_slab->next;
7803 if (s) {
7804 PL_regmatch_slab->next = NULL;
7805 while (s) {
7806 regmatch_slab * const osl = s;
7807 s = s->next;
7808 Safefree(osl);
7809 }
7810 }
51371543 7811}
33b8afdf 7812
8adc0f72 7813
33b8afdf 7814STATIC void
5aaab254 7815S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7816{
7e0d5ad7
KW
7817 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7818 * on the converted value */
7819
a1cac82e 7820 int i = 1;
7918f24d
NC
7821
7822 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7823
a1cac82e
NC
7824 do {
7825 if (prog->substrs->data[i].substr
7826 && !prog->substrs->data[i].utf8_substr) {
7827 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7828 prog->substrs->data[i].utf8_substr = sv;
7829 sv_utf8_upgrade(sv);
610460f9 7830 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7831 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7832 /* Trim the trailing \n that fbm_compile added last
7833 time. */
7834 SvCUR_set(sv, SvCUR(sv) - 1);
7835 /* Whilst this makes the SV technically "invalid" (as its
7836 buffer is no longer followed by "\0") when fbm_compile()
7837 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7838 fbm_compile(sv, FBMcf_TAIL);
7839 } else
7840 fbm_compile(sv, 0);
610460f9 7841 }
a1cac82e
NC
7842 if (prog->substrs->data[i].substr == prog->check_substr)
7843 prog->check_utf8 = sv;
7844 }
7845 } while (i--);
33b8afdf
JH
7846}
7847
7e0d5ad7 7848STATIC bool
5aaab254 7849S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7850{
7e0d5ad7
KW
7851 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7852 * on the converted value; returns FALSE if can't be converted. */
7853
97aff369 7854 dVAR;
a1cac82e 7855 int i = 1;
7918f24d
NC
7856
7857 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7858
a1cac82e
NC
7859 do {
7860 if (prog->substrs->data[i].utf8_substr
7861 && !prog->substrs->data[i].substr) {
7862 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7863 if (! sv_utf8_downgrade(sv, TRUE)) {
7864 return FALSE;
7865 }
5400f398
KW
7866 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7867 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7868 /* Trim the trailing \n that fbm_compile added last
7869 time. */
7870 SvCUR_set(sv, SvCUR(sv) - 1);
7871 fbm_compile(sv, FBMcf_TAIL);
7872 } else
7873 fbm_compile(sv, 0);
7874 }
a1cac82e
NC
7875 prog->substrs->data[i].substr = sv;
7876 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7877 prog->check_substr = sv;
33b8afdf 7878 }
a1cac82e 7879 } while (i--);
7e0d5ad7
KW
7880
7881 return TRUE;
33b8afdf 7882}
66610fdd
RGS
7883
7884/*
7885 * Local variables:
7886 * c-indentation-style: bsd
7887 * c-basic-offset: 4
14d04a33 7888 * indent-tabs-mode: nil
66610fdd
RGS
7889 * End:
7890 *
14d04a33 7891 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7892 */