This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
B.xs: fix syntax err in PERL_VERSION < 17
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 121 : (U8*)(pos + off))
557f47af 122
3dab1dad 123#define HOPBACKc(pos, off) \
ba44c216 124 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
126 : (pos - off >= reginfo->strbeg) \
8e11feef 127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
ba44c216 130#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 131#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 132
557f47af
DM
133/* lim must be +ve. Returns NULL on overshoot */
134#define HOPMAYBE3(pos,off,lim) \
135 (reginfo->is_utf8_target \
136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
137 : ((U8*)pos + off <= lim) \
138 ? (U8*)pos + off \
139 : NULL)
140
8e9f2289
DM
141/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
142 * off must be >=0; args should be vars rather than expressions */
143#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
145 : (U8*)((pos + off) > lim ? lim : (pos + off)))
146
2974eaec
DM
147#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
149 : (U8*)(pos + off))
150#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
151
152#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
153#define NEXTCHR_IS_EOS (nextchr < 0)
154
155#define SET_nextchr \
220db18a 156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
157
158#define SET_locinput(p) \
159 locinput = (p); \
160 SET_nextchr
161
162
2a16ac92 163#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
164 if (!swash_ptr) { \
165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 167 1, 0, invlist, &flags); \
c7304fe2
KW
168 assert(swash_ptr); \
169 } \
170 } STMT_END
171
172/* If in debug mode, we test that a known character properly matches */
173#ifdef DEBUGGING
174# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
175 property_name, \
2a16ac92 176 invlist, \
c7304fe2 177 utf8_char_in_property) \
2a16ac92 178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
180#else
181# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
182 property_name, \
2a16ac92 183 invlist, \
c7304fe2 184 utf8_char_in_property) \
2a16ac92 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 186#endif
d1eb3177 187
c7304fe2
KW
188#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
190 "", \
191 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
193
194#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
195 STMT_START { \
196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
197 "_X_regular_begin", \
2a16ac92 198 NULL, \
df38da56 199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
201 "_X_extend", \
2a16ac92 202 NULL, \
c7304fe2
KW
203 COMBINING_GRAVE_ACCENT_UTF8); \
204 } STMT_END
d1eb3177 205
c7304fe2 206#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
207/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
208
5f80c4cf 209/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
210/* it would be nice to rework regcomp.sym to generate this stuff. sigh
211 *
212 * NOTE that *nothing* that affects backtracking should be in here, specifically
213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
214 * node that is in between two EXACT like nodes when ascertaining what the required
215 * "follow" character is. This should probably be moved to regex compile time
216 * although it may be done at run time beause of the REF possibility - more
217 * investigation required. -- demerphq
218*/
baa60164
KW
219#define JUMPABLE(rn) ( \
220 OP(rn) == OPEN || \
3e901dc0 221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
222 OP(rn) == EVAL || \
223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
224 OP(rn) == PLUS || OP(rn) == MINMOD || \
225 OP(rn) == KEEPS || \
226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 227)
ee9b8eae 228#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 229
ee9b8eae
YO
230#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
231
232#if 0
233/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
234 we don't need this definition. */
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 236#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
237#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
238
239#else
240/* ... so we use this as its faster. */
241#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 242#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
243#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
244#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245
246#endif
e2d8ce26 247
a84d97b6
HS
248/*
249 Search for mandatory following text node; for lookahead, the text must
250 follow but for lookbehind (rn->flags != 0) we skip to the next step.
251*/
baa60164 252#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
253 while (JUMPABLE(rn)) { \
254 const OPCODE type = OP(rn); \
255 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 256 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 257 else if (type == PLUS) \
cca55fe3 258 rn = NEXTOPER(rn); \
3dab1dad 259 else if (type == IFMATCH) \
a84d97b6 260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 261 else rn += NEXT_OFF(rn); \
3dab1dad 262 } \
5f80c4cf 263} STMT_END
74750237 264
22913b96
KW
265/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
266 * These are for the pre-composed Hangul syllables, which are all in a
267 * contiguous block and arranged there in such a way so as to facilitate
268 * alorithmic determination of their characteristics. As such, they don't need
269 * a swash, but can be determined by simple arithmetic. Almost all are
270 * GCB=LVT, but every 28th one is a GCB=LV */
271#define SBASE 0xAC00 /* Start of block */
272#define SCount 11172 /* Length of block */
273#define TCount 28
c476f425 274
006f26b2
DM
275#define SLAB_FIRST(s) (&(s)->states[0])
276#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
277
a75351a1 278static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 279static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 280static regmatch_state * S_push_slab(pTHX);
51371543 281
87c0511b 282#define REGCP_PAREN_ELEMS 3
f067efbf 283#define REGCP_OTHER_ELEMS 3
e0fa7e2b 284#define REGCP_FRAME_ELEMS 1
620d5b66
NC
285/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
286 * are needed for the regexp context stack bookkeeping. */
287
76e3520e 288STATIC CHECKPOINT
92da3157 289S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 290{
a3b680e6 291 const int retval = PL_savestack_ix;
92da3157
DM
292 const int paren_elems_to_push =
293 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
294 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
295 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 296 I32 p;
40a82448 297 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 298
b93070ed
DM
299 PERL_ARGS_ASSERT_REGCPPUSH;
300
e49a9654 301 if (paren_elems_to_push < 0)
e8a85d26
JH
302 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
303 (int)paren_elems_to_push, (int)maxopenparen,
304 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 305
e0fa7e2b
NC
306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 308 " out of range (%lu-%ld)",
92da3157
DM
309 total_elems,
310 (unsigned long)maxopenparen,
311 (long)parenfloor);
e0fa7e2b 312
620d5b66 313 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 314
495f47a5 315 DEBUG_BUFFERS_r(
92da3157 316 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
317 PerlIO_printf(Perl_debug_log,
318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
319 PTR2UV(rex),
320 PTR2UV(rex->offs)
321 );
322 );
92da3157 323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 324/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
325 SSPUSHIV(rex->offs[p].end);
326 SSPUSHIV(rex->offs[p].start);
1ca2007e 327 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
330 (UV)p,
331 (IV)rex->offs[p].start,
332 (IV)rex->offs[p].start_tmp,
333 (IV)rex->offs[p].end
40a82448 334 ));
a0d0e21e 335 }
b1ce53c5 336/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 337 SSPUSHINT(maxopenparen);
b93070ed
DM
338 SSPUSHINT(rex->lastparen);
339 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 341
a0d0e21e
LW
342 return retval;
343}
344
c277df42 345/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
346#define REGCP_SET(cp) \
347 DEBUG_STATE_r( \
ab3bbdeb 348 PerlIO_printf(Perl_debug_log, \
e4f74956 349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
350 (IV)PL_savestack_ix)); \
351 cp = PL_savestack_ix
c3464db5 352
ab3bbdeb 353#define REGCP_UNWIND(cp) \
e4f74956 354 DEBUG_STATE_r( \
ab3bbdeb 355 if (cp != PL_savestack_ix) \
e4f74956
YO
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
358 (IV)(cp), (IV)PL_savestack_ix)); \
359 regcpblow(cp)
c277df42 360
a8d1f4b4
DM
361#define UNWIND_PAREN(lp, lcp) \
362 for (n = rex->lastparen; n > lp; n--) \
363 rex->offs[n].end = -1; \
364 rex->lastparen = n; \
365 rex->lastcloseparen = lcp;
366
367
f067efbf 368STATIC void
92da3157 369S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 370{
e0fa7e2b 371 UV i;
87c0511b 372 U32 paren;
a3621e74
YO
373 GET_RE_DEBUG_FLAGS_DECL;
374
7918f24d
NC
375 PERL_ARGS_ASSERT_REGCPPOP;
376
b1ce53c5 377 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 378 i = SSPOPUV;
e0fa7e2b
NC
379 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
380 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
381 rex->lastcloseparen = SSPOPINT;
382 rex->lastparen = SSPOPINT;
92da3157 383 *maxopenparen_p = SSPOPINT;
b1ce53c5 384
620d5b66 385 i -= REGCP_OTHER_ELEMS;
b1ce53c5 386 /* Now restore the parentheses context. */
495f47a5
DM
387 DEBUG_BUFFERS_r(
388 if (i || rex->lastparen + 1 <= rex->nparens)
389 PerlIO_printf(Perl_debug_log,
390 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
391 PTR2UV(rex),
392 PTR2UV(rex->offs)
393 );
394 );
92da3157 395 paren = *maxopenparen_p;
620d5b66 396 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 397 SSize_t tmps;
1ca2007e 398 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
399 rex->offs[paren].start = SSPOPIV;
400 tmps = SSPOPIV;
b93070ed
DM
401 if (paren <= rex->lastparen)
402 rex->offs[paren].end = tmps;
495f47a5
DM
403 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
404 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
405 (UV)paren,
406 (IV)rex->offs[paren].start,
407 (IV)rex->offs[paren].start_tmp,
408 (IV)rex->offs[paren].end,
409 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 410 );
87c0511b 411 paren--;
a0d0e21e 412 }
daf18116 413#if 1
dafc8851
JH
414 /* It would seem that the similar code in regtry()
415 * already takes care of this, and in fact it is in
416 * a better location to since this code can #if 0-ed out
417 * but the code in regtry() is needed or otherwise tests
418 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
419 * (as of patchlevel 7877) will fail. Then again,
420 * this code seems to be necessary or otherwise
225593e1
DM
421 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
422 * --jhi updated by dapm */
b93070ed 423 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 424 if (i > *maxopenparen_p)
b93070ed
DM
425 rex->offs[i].start = -1;
426 rex->offs[i].end = -1;
495f47a5
DM
427 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
428 " \\%"UVuf": %s ..-1 undeffing\n",
429 (UV)i,
92da3157 430 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 431 ));
a0d0e21e 432 }
dafc8851 433#endif
a0d0e21e
LW
434}
435
74088413
DM
436/* restore the parens and associated vars at savestack position ix,
437 * but without popping the stack */
438
439STATIC void
92da3157 440S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
441{
442 I32 tmpix = PL_savestack_ix;
443 PL_savestack_ix = ix;
92da3157 444 regcppop(rex, maxopenparen_p);
74088413
DM
445 PL_savestack_ix = tmpix;
446}
447
02db2b7b 448#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 449
31c7f561
KW
450STATIC bool
451S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
452{
453 /* Returns a boolean as to whether or not 'character' is a member of the
454 * Posix character class given by 'classnum' that should be equivalent to a
455 * value in the typedef '_char_class_number'.
456 *
457 * Ideally this could be replaced by a just an array of function pointers
458 * to the C library functions that implement the macros this calls.
459 * However, to compile, the precise function signatures are required, and
460 * these may vary from platform to to platform. To avoid having to figure
461 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
462 * which adds an extra layer of function call overhead (unless the C
463 * optimizer strips it away). But we don't particularly care about
464 * performance with locales anyway. */
31c7f561
KW
465
466 switch ((_char_class_number) classnum) {
15861f94 467 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 468 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
469 case _CC_ENUM_ASCII: return isASCII_LC(character);
470 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
471 case _CC_ENUM_CASED: return isLOWER_LC(character)
472 || isUPPER_LC(character);
e8d596e0 473 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
474 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
475 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
476 case _CC_ENUM_LOWER: return isLOWER_LC(character);
477 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 478 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 479 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 480 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
481 case _CC_ENUM_UPPER: return isUPPER_LC(character);
482 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 483 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
484 default: /* VERTSPACE should never occur in locales */
485 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
486 }
487
a25b5927 488 assert(0); /* NOTREACHED */
31c7f561
KW
489 return FALSE;
490}
491
3018b823
KW
492STATIC bool
493S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
494{
495 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
496 * 'character' is a member of the Posix character class given by 'classnum'
497 * that should be equivalent to a value in the typedef
498 * '_char_class_number'.
499 *
500 * This just calls isFOO_lc on the code point for the character if it is in
501 * the range 0-255. Outside that range, all characters avoid Unicode
502 * rules, ignoring any locale. So use the Unicode function if this class
503 * requires a swash, and use the Unicode macro otherwise. */
504
505 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
506
507 if (UTF8_IS_INVARIANT(*character)) {
508 return isFOO_lc(classnum, *character);
509 }
510 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
511 return isFOO_lc(classnum,
94bb8c36 512 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
513 }
514
515 if (classnum < _FIRST_NON_SWASH_CC) {
516
517 /* Initialize the swash unless done already */
518 if (! PL_utf8_swash_ptrs[classnum]) {
519 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
520 PL_utf8_swash_ptrs[classnum] =
521 _core_swash_init("utf8",
522 "",
523 &PL_sv_undef, 1, 0,
524 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
525 }
526
92a2046b
KW
527 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
528 character,
529 TRUE /* is UTF */ ));
3018b823
KW
530 }
531
532 switch ((_char_class_number) classnum) {
533 case _CC_ENUM_SPACE:
534 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
535
536 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
537 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
538 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 539 default: break;
3018b823
KW
540 }
541
e1ee3960 542 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
543}
544
a687059c 545/*
e50aee73 546 * pregexec and friends
a687059c
LW
547 */
548
76234dfb 549#ifndef PERL_IN_XSUB_RE
a687059c 550/*
c277df42 551 - pregexec - match a regexp against a string
a687059c 552 */
c277df42 553I32
5aaab254 554Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 555 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
556/* stringarg: the point in the string at which to begin matching */
557/* strend: pointer to null at end of string */
558/* strbeg: real beginning of string */
559/* minend: end of match must be >= minend bytes after stringarg. */
560/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
561 * itself is accessed via the pointers above */
562/* nosave: For optimizations. */
c277df42 563{
7918f24d
NC
564 PERL_ARGS_ASSERT_PREGEXEC;
565
c277df42 566 return
9041c2e3 567 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
568 nosave ? 0 : REXEC_COPY_STR);
569}
76234dfb 570#endif
22e551b9 571
cad2e5aa 572
6eb5f6b9 573
1a4edc3c
DM
574/* re_intuit_start():
575 *
576 * Based on some optimiser hints, try to find the earliest position in the
577 * string where the regex could match.
578 *
579 * rx: the regex to match against
580 * sv: the SV being matched: only used for utf8 flag; the string
581 * itself is accessed via the pointers below. Note that on
582 * something like an overloaded SV, SvPOK(sv) may be false
583 * and the string pointers may point to something unrelated to
584 * the SV itself.
585 * strbeg: real beginning of string
586 * strpos: the point in the string at which to begin matching
587 * strend: pointer to the byte following the last char of the string
588 * flags currently unused; set to 0
589 * data: currently unused; set to NULL
590 *
591 * The basic idea of re_intuit_start() is to use some known information
592 * about the pattern, namely:
593 *
594 * a) the longest known anchored substring (i.e. one that's at a
595 * constant offset from the beginning of the pattern; but not
596 * necessarily at a fixed offset from the beginning of the
597 * string);
598 * b) the longest floating substring (i.e. one that's not at a constant
599 * offset from the beginning of the pattern);
600 * c) Whether the pattern is anchored to the string; either
601 * an absolute anchor: /^../, or anchored to \n: /^.../m,
602 * or anchored to pos(): /\G/;
603 * d) A start class: a real or synthetic character class which
604 * represents which characters are legal at the start of the pattern;
605 *
606 * to either quickly reject the match, or to find the earliest position
607 * within the string at which the pattern might match, thus avoiding
608 * running the full NFA engine at those earlier locations, only to
609 * eventually fail and retry further along.
610 *
611 * Returns NULL if the pattern can't match, or returns the address within
612 * the string which is the earliest place the match could occur.
613 *
614 * The longest of the anchored and floating substrings is called 'check'
615 * and is checked first. The other is called 'other' and is checked
616 * second. The 'other' substring may not be present. For example,
617 *
618 * /(abc|xyz)ABC\d{0,3}DEFG/
619 *
620 * will have
621 *
622 * check substr (float) = "DEFG", offset 6..9 chars
623 * other substr (anchored) = "ABC", offset 3..3 chars
624 * stclass = [ax]
625 *
626 * Be aware that during the course of this function, sometimes 'anchored'
627 * refers to a substring being anchored relative to the start of the
628 * pattern, and sometimes to the pattern itself being anchored relative to
629 * the string. For example:
630 *
631 * /\dabc/: "abc" is anchored to the pattern;
632 * /^\dabc/: "abc" is anchored to the pattern and the string;
633 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
634 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
635 * but the pattern is anchored to the string.
52a21eb3
DM
636 */
637
cad2e5aa 638char *
52a21eb3
DM
639Perl_re_intuit_start(pTHX_
640 REGEXP * const rx,
641 SV *sv,
642 const char * const strbeg,
643 char *strpos,
644 char *strend,
645 const U32 flags,
646 re_scream_pos_data *data)
cad2e5aa 647{
8d919b0a 648 struct regexp *const prog = ReANY(rx);
6b071d16 649 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 650 /* Should be nonnegative! */
ea3daa5d 651 SSize_t end_shift = 0;
0fc004dd
DM
652 /* current lowest pos in string where the regex can start matching */
653 char *rx_origin = strpos;
eb578fdb 654 SV *check;
f2ed9b32 655 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 656 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 657 bool ml_anch = 0;
8f4bf5fc 658 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 659 char *check_at = NULL; /* check substr found at this pos */
bbe252da 660 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 661 RXi_GET_DECL(prog,progi);
02d5137b
DM
662 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
663 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
664 GET_RE_DEBUG_FLAGS_DECL;
665
7918f24d 666 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
667 PERL_UNUSED_ARG(flags);
668 PERL_UNUSED_ARG(data);
7918f24d 669
1dc475d0
DM
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
671 "Intuit: trying to determine minimum start position...\n"));
672
fb9bbddb
DM
673 /* for now, assume that all substr offsets are positive. If at some point
674 * in the future someone wants to do clever things with look-behind and
675 * -ve offsets, they'll need to fix up any code in this function
676 * which uses these offsets. See the thread beginning
677 * <20140113145929.GF27210@iabyn.com>
678 */
679 assert(prog->substrs->data[0].min_offset >= 0);
680 assert(prog->substrs->data[0].max_offset >= 0);
681 assert(prog->substrs->data[1].min_offset >= 0);
682 assert(prog->substrs->data[1].max_offset >= 0);
683 assert(prog->substrs->data[2].min_offset >= 0);
684 assert(prog->substrs->data[2].max_offset >= 0);
685
f7022b5a 686 /* for now, assume that if both present, that the floating substring
83f2232d 687 * doesn't start before the anchored substring.
f7022b5a
DM
688 * If you break this assumption (e.g. doing better optimisations
689 * with lookahead/behind), then you'll need to audit the code in this
690 * function carefully first
691 */
692 assert(
693 ! ( (prog->anchored_utf8 || prog->anchored_substr)
694 && (prog->float_utf8 || prog->float_substr))
695 || (prog->float_min_offset >= prog->anchored_offset));
696
1a4edc3c
DM
697 /* byte rather than char calculation for efficiency. It fails
698 * to quickly reject some cases that can't match, but will reject
699 * them later after doing full char arithmetic */
c344f387 700 if (prog->minlen > strend - strpos) {
a3621e74 701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 702 " String too short...\n"));
cad2e5aa 703 goto fail;
2c2d71f5 704 }
d8da0584 705
6c3fea77 706 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 707 reginfo->info_aux = NULL;
9d9163fb 708 reginfo->strbeg = strbeg;
220db18a 709 reginfo->strend = strend;
aed7b151 710 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 711 reginfo->intuit = 1;
1cb48e53
DM
712 /* not actually used within intuit, but zero for safety anyway */
713 reginfo->poscache_maxiter = 0;
02d5137b 714
f2ed9b32 715 if (utf8_target) {
33b8afdf
JH
716 if (!prog->check_utf8 && prog->check_substr)
717 to_utf8_substr(prog);
718 check = prog->check_utf8;
719 } else {
7e0d5ad7
KW
720 if (!prog->check_substr && prog->check_utf8) {
721 if (! to_byte_substr(prog)) {
6b54ddc5 722 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
723 }
724 }
33b8afdf
JH
725 check = prog->check_substr;
726 }
274cd312 727
1dc475d0
DM
728 /* dump the various substring data */
729 DEBUG_OPTIMISE_MORE_r({
730 int i;
731 for (i=0; i<=2; i++) {
732 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
733 : prog->substrs->data[i].substr);
734 if (!sv)
735 continue;
736
737 PerlIO_printf(Perl_debug_log,
738 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
739 " useful=%"IVdf" utf8=%d [%s]\n",
740 i,
741 (IV)prog->substrs->data[i].min_offset,
742 (IV)prog->substrs->data[i].max_offset,
743 (IV)prog->substrs->data[i].end_shift,
744 BmUSEFUL(sv),
745 utf8_target ? 1 : 0,
746 SvPEEK(sv));
747 }
748 });
749
8e1490ee 750 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
751
752 /* ml_anch: check after \n?
753 *
754 * A note about IMPLICIT: on an un-anchored pattern beginning
755 * with /.*.../, these flags will have been added by the
756 * compiler:
757 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
758 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
759 */
7d2d37f5
DM
760 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
761 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 762
343c8a29 763 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
764 /* we are only allowed to match at BOS or \G */
765
57fcbfa7 766 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
767 *
768 * Note that we don't try to do a similar quick reject for
769 * \G, since generally the caller will have calculated strpos
770 * based on pos() and gofs, so the string is already correctly
771 * anchored by definition; and handling the exceptions would
772 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 773 */
7bb3b9eb 774 if ( strpos != strbeg
d3d47aac 775 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8
DM
776 {
777 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 778 " Not at start...\n"));
c889ccc8
DM
779 goto fail;
780 }
781
a5d12a4b
DM
782 /* in the presence of an anchor, the anchored (relative to the
783 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
784 * to strpos. So quickly reject if substr isn't found there.
785 * This works for \G too, because the caller will already have
786 * subtracted gofs from pos, and gofs is the offset from the
787 * \G to the start of the regex. For example, in /.abc\Gdef/,
788 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
789 * caller will have set strpos=pos()-4; we look for the substr
790 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 791
c889ccc8 792 if (prog->check_offset_min == prog->check_offset_max
d0d44648 793 && !(prog->intflags & PREGf_CANY_SEEN))
c889ccc8
DM
794 {
795 /* Substring at constant offset from beg-of-str... */
d307bf57 796 SSize_t slen = SvCUR(check);
343c8a29 797 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 798
1dc475d0
DM
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
800 " Looking for check substr at fixed offset %"IVdf"...\n",
801 (IV)prog->check_offset_min));
802
7742aa66
DM
803 if (SvTAIL(check)) {
804 /* In this case, the regex is anchored at the end too.
805 * Unless it's a multiline match, the lengths must match
806 * exactly, give or take a \n. NB: slen >= 1 since
807 * the last char of check is \n */
808 if (!multiline
809 && ( strend - s > slen
810 || strend - s < slen - 1
811 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
812 {
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 814 " String too long...\n"));
c889ccc8
DM
815 goto fail_finish;
816 }
817 /* Now should match s[0..slen-2] */
818 slen--;
c889ccc8 819 }
d307bf57
DM
820 if (slen && (*SvPVX_const(check) != *s
821 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
822 {
823 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 824 " String not equal...\n"));
d307bf57
DM
825 goto fail_finish;
826 }
c889ccc8
DM
827
828 check_at = s;
829 goto success_at_start;
cad2e5aa 830 }
cad2e5aa 831 }
cad2e5aa 832 }
0fc004dd 833
c0e0ec46 834 end_shift = prog->check_end_shift;
cad2e5aa 835
19188028 836#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 837 if (end_shift < 0)
1de06328 838 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 839 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
840#endif
841
2c2d71f5 842 restart:
1de06328 843
66b7ec5c
DM
844 /* This is the (re)entry point of the main loop in this function.
845 * The goal of this loop is to:
846 * 1) find the "check" substring in the region rx_origin..strend
847 * (adjusted by start_shift / end_shift). If not found, reject
848 * immediately.
849 * 2) If it exists, look for the "other" substr too if defined; for
850 * example, if the check substr maps to the anchored substr, then
851 * check the floating substr, and vice-versa. If not found, go
852 * back to (1) with rx_origin suitably incremented.
853 * 3) If we find an rx_origin position that doesn't contradict
854 * either of the substrings, then check the possible additional
855 * constraints on rx_origin of /^.../m or a known start class.
856 * If these fail, then depending on which constraints fail, jump
857 * back to here, or to various other re-entry points further along
858 * that skip some of the first steps.
859 * 4) If we pass all those tests, update the BmUSEFUL() count on the
860 * substring. If the start position was determined to be at the
861 * beginning of the string - so, not rejected, but not optimised,
862 * since we have to run regmatch from position 0 - decrement the
863 * BmUSEFUL() count. Otherwise increment it.
864 */
865
1a4edc3c
DM
866
867 /* first, look for the 'check' substring */
868
1de06328 869 {
c33e64f0
FC
870 U8* start_point;
871 U8* end_point;
c889ccc8 872
c889ccc8 873 DEBUG_OPTIMISE_MORE_r({
1dc475d0 874 PerlIO_printf(Perl_debug_log,
ae5d4331 875 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 876 " Start shift: %"IVdf" End shift %"IVdf
4d281893 877 " Real end Shift: %"IVdf"\n",
197641b6 878 (IV)(rx_origin - strpos),
c889ccc8 879 (IV)prog->check_offset_min,
12fbc530
DM
880 (IV)start_shift,
881 (IV)end_shift,
c889ccc8
DM
882 (IV)prog->check_end_shift);
883 });
1de06328 884
0d331aaf 885 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 886 start_point= (U8*)(rx_origin + start_shift);
12fbc530 887 end_point= (U8*)(strend - end_shift);
557f47af
DM
888 if (start_point > end_point)
889 goto fail_finish;
1de06328 890 } else {
557f47af
DM
891 end_point = HOP3(strend, -end_shift, strbeg);
892 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
893 if (!start_point)
894 goto fail_finish;
1de06328 895 }
c889ccc8 896
557f47af 897
e0362b86 898 /* If the regex is absolutely anchored to either the start of the
d3d47aac 899 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
900 * check_offset_max represents an upper bound on the string where
901 * the substr could start. For the ANCH_GPOS case, we assume that
902 * the caller of intuit will have already set strpos to
903 * pos()-gofs, so in this case strpos + offset_max will still be
904 * an upper bound on the substr.
905 */
c19c836a
DM
906 if (!ml_anch
907 && prog->intflags & PREGf_ANCH
e0362b86 908 && prog->check_offset_max != SSize_t_MAX)
c19c836a 909 {
1a08ba3a 910 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
911 const char * const anchor =
912 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
913
914 /* do a bytes rather than chars comparison. It's conservative;
915 * so it skips doing the HOP if the result can't possibly end
916 * up earlier than the old value of end_point.
917 */
918 if ((char*)end_point - anchor > prog->check_offset_max) {
919 end_point = HOP3lim((U8*)anchor,
920 prog->check_offset_max,
921 end_point -len)
922 + len;
923 }
d6ef1678
DM
924 }
925
6bda09f9 926 DEBUG_OPTIMISE_MORE_r({
1dc475d0 927 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 928 (int)(end_point - start_point),
fc8cd66c 929 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
930 start_point);
931 });
932
ae5d4331 933 check_at = fbm_instr( start_point, end_point,
7fba1cd6 934 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 935
8fd34720
DM
936 /* Update the count-of-usability, remove useless subpatterns,
937 unshift s. */
938
939 DEBUG_EXECUTE_r({
940 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
941 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
942 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
943 (check_at ? "Found" : "Did not find"),
944 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
945 ? "anchored" : "floating"),
946 quoted,
947 RE_SV_TAIL(check),
948 (check_at ? " at offset " : "...\n") );
949 });
2c2d71f5 950
8fd34720
DM
951 if (!check_at)
952 goto fail_finish;
953 /* Finish the diagnostic message */
954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 955
8fd34720
DM
956 /* set rx_origin to the minimum position where the regex could start
957 * matching, given the constraint of the just-matched check substring.
958 * But don't set it lower than previously.
959 */
fdc003fd 960
8fd34720
DM
961 if (check_at - rx_origin > prog->check_offset_max)
962 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
963 }
fdc003fd
DM
964
965
1a4edc3c 966 /* now look for the 'other' substring if defined */
2c2d71f5 967
6480a6c4
DM
968 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
969 : prog->substrs->data[other_ix].substr)
1de06328 970 {
30944b6d 971 /* Take into account the "other" substring. */
6c3343a6
DM
972 char *last, *last1;
973 char *s;
974 SV* must;
975 struct reg_substr_datum *other;
976
977 do_other_substr:
978 other = &prog->substrs->data[other_ix];
979
980 /* if "other" is anchored:
981 * we've previously found a floating substr starting at check_at.
982 * This means that the regex origin must lie somewhere
983 * between min (rx_origin): HOP3(check_at, -check_offset_max)
984 * and max: HOP3(check_at, -check_offset_min)
985 * (except that min will be >= strpos)
986 * So the fixed substr must lie somewhere between
987 * HOP3(min, anchored_offset)
988 * HOP3(max, anchored_offset) + SvCUR(substr)
989 */
990
991 /* if "other" is floating
992 * Calculate last1, the absolute latest point where the
993 * floating substr could start in the string, ignoring any
994 * constraints from the earlier fixed match. It is calculated
995 * as follows:
996 *
997 * strend - prog->minlen (in chars) is the absolute latest
998 * position within the string where the origin of the regex
999 * could appear. The latest start point for the floating
1000 * substr is float_min_offset(*) on from the start of the
1001 * regex. last1 simply combines thee two offsets.
1002 *
1003 * (*) You might think the latest start point should be
1004 * float_max_offset from the regex origin, and technically
1005 * you'd be correct. However, consider
1006 * /a\d{2,4}bcd\w/
1007 * Here, float min, max are 3,5 and minlen is 7.
1008 * This can match either
1009 * /a\d\dbcd\w/
1010 * /a\d\d\dbcd\w/
1011 * /a\d\d\d\dbcd\w/
1012 * In the first case, the regex matches minlen chars; in the
1013 * second, minlen+1, in the third, minlen+2.
1014 * In the first case, the floating offset is 3 (which equals
1015 * float_min), in the second, 4, and in the third, 5 (which
1016 * equals float_max). In all cases, the floating string bcd
1017 * can never start more than 4 chars from the end of the
1018 * string, which equals minlen - float_min. As the substring
1019 * starts to match more than float_min from the start of the
1020 * regex, it makes the regex match more than minlen chars,
1021 * and the two cancel each other out. So we can always use
1022 * float_min - minlen, rather than float_max - minlen for the
1023 * latest position in the string.
1024 *
1025 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1026 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1027 */
1028
e7a14a9c 1029 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1030 last1 = HOP3c(strend,
1031 other->min_offset - prog->minlen, strbeg);
1032
4d006249 1033 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1034 /* last is the latest point where the floating substr could
1035 * start, *given* any constraints from the earlier fixed
1036 * match. This constraint is that the floating string starts
1037 * <= float_max_offset chars from the regex origin (rx_origin).
1038 * If this value is less than last1, use it instead.
eb3831ce 1039 */
6c3343a6
DM
1040 assert(rx_origin <= last1);
1041 last =
1042 /* this condition handles the offset==infinity case, and
1043 * is a short-cut otherwise. Although it's comparing a
1044 * byte offset to a char length, it does so in a safe way,
1045 * since 1 char always occupies 1 or more bytes,
1046 * so if a string range is (last1 - rx_origin) bytes,
1047 * it will be less than or equal to (last1 - rx_origin)
1048 * chars; meaning it errs towards doing the accurate HOP3
1049 * rather than just using last1 as a short-cut */
1050 (last1 - rx_origin) < other->max_offset
1051 ? last1
1052 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1053 }
1054 else {
1055 assert(strpos + start_shift <= check_at);
1056 last = HOP4c(check_at, other->min_offset - start_shift,
1057 strbeg, strend);
1058 }
ead917d0 1059
6c3343a6
DM
1060 s = HOP3c(rx_origin, other->min_offset, strend);
1061 if (s < other_last) /* These positions already checked */
1062 s = other_last;
1063
1064 must = utf8_target ? other->utf8_substr : other->substr;
1065 assert(SvPOK(must));
1066 s = fbm_instr(
1067 (unsigned char*)s,
1068 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1069 must,
1070 multiline ? FBMrf_MULTILINE : 0
1071 );
1072 DEBUG_EXECUTE_r({
1073 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1076 s ? "Found" : "Contradicts",
1077 other_ix ? "floating" : "anchored",
1078 quoted, RE_SV_TAIL(must));
1079 });
ead917d0 1080
ead917d0 1081
6c3343a6
DM
1082 if (!s) {
1083 /* last1 is latest possible substr location. If we didn't
1084 * find it before there, we never will */
1085 if (last >= last1) {
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1087 ", giving up...\n"));
1088 goto fail_finish;
ead917d0
DM
1089 }
1090
6c3343a6
DM
1091 /* try to find the check substr again at a later
1092 * position. Maybe next time we'll find the "other" substr
1093 * in range too */
1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095 ", trying %s at offset %ld...\n",
1096 (other_ix ? "floating" : "anchored"),
197641b6 1097 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1098
1099 other_last = HOP3c(last, 1, strend) /* highest failure */;
1100 rx_origin =
4d006249 1101 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1102 ? HOP3c(rx_origin, 1, strend)
1103 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1104 goto restart;
1105 }
1106 else {
1107 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1108 (long)(s - strpos)));
ead917d0 1109
4d006249 1110 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1111 /* other_last is set to s, not s+1, since its possible for
1112 * a floating substr to fail first time, then succeed
1113 * second time at the same floating position; e.g.:
1114 * "-AB--AABZ" =~ /\wAB\d*Z/
1115 * The first time round, anchored and float match at
1116 * "-(AB)--AAB(Z)" then fail on the initial \w character
1117 * class. Second time round, they match at "-AB--A(AB)(Z)".
1118 */
1119 other_last = s;
ead917d0
DM
1120 }
1121 else {
6c3343a6
DM
1122 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1123 other_last = HOP3c(s, 1, strend);
ead917d0 1124 }
6c3343a6 1125 }
cad2e5aa 1126 }
acba93e8
DM
1127 else {
1128 DEBUG_OPTIMISE_MORE_r(
1129 PerlIO_printf(Perl_debug_log,
1130 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1131 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1132 " strend-strpos:%"IVdf"\n",
1133 (IV)prog->check_offset_min,
1134 (IV)prog->check_offset_max,
1c1c599d 1135 (IV)(check_at-strpos),
acba93e8 1136 (IV)(rx_origin-strpos),
1c1c599d 1137 (IV)(rx_origin-check_at),
acba93e8
DM
1138 (IV)(strend-strpos)
1139 )
1140 );
1141 }
2c2d71f5 1142
acba93e8 1143 postprocess_substr_matches:
0991020e 1144
1a4edc3c 1145 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1146
7d2d37f5 1147 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1148 char *s;
1149
a62659bd
DM
1150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1151 " looking for /^/m anchor"));
d0880ea7
DM
1152
1153 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1154 * Find the next \n, if any, even if it's beyond the current
1155 * anchored and/or floating substrings. Whether we should be
1156 * scanning ahead for the next \n or the next substr is debatable.
1157 * On the one hand you'd expect rare substrings to appear less
1158 * often than \n's. On the other hand, searching for \n means
1159 * we're effectively flipping been check_substr and "\n" on each
1160 * iteration as the current "rarest" string candidate, which
1161 * means for example that we'll quickly reject the whole string if
1162 * hasn't got a \n, rather than trying every substr position
1163 * first
1164 */
d0880ea7 1165
4620cb61
DM
1166 s = HOP3c(strend, - prog->minlen, strpos);
1167 if (s <= rx_origin ||
1168 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1169 {
d0880ea7
DM
1170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1171 " Did not find /%s^%s/m...\n",
1172 PL_colors[0], PL_colors[1]));
a62659bd
DM
1173 goto fail_finish;
1174 }
d0880ea7 1175
4ada1233
DM
1176 /* earliest possible origin is 1 char after the \n.
1177 * (since *rx_origin == '\n', it's safe to ++ here rather than
1178 * HOP(rx_origin, 1)) */
1179 rx_origin++;
d0880ea7 1180
f4f115de 1181 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1182 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1183 {
d0880ea7
DM
1184 /* Position contradicts check-string; either because
1185 * check was anchored (and thus has no wiggle room),
4ada1233 1186 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1187 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1188 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1189 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1190 goto restart;
1191 }
1192
1193 /* if we get here, the check substr must have been float,
2e759faa 1194 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1195 * "other" substr which still contradicts */
1196 assert(prog->substrs->check_ix); /* check is float */
1197
1198 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1199 /* whoops, the anchored "other" substr exists, so we still
1200 * contradict. On the other hand, the float "check" substr
1201 * didn't contradict, so just retry the anchored "other"
1202 * substr */
1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1205 PL_colors[0], PL_colors[1],
1206 (long)(rx_origin - strpos),
1207 (long)(rx_origin - strpos + prog->anchored_offset)));
1208 goto do_other_substr;
1209 }
1210
1211 /* success: we don't contradict the found floating substring
1212 * (and there's no anchored substr). */
d0880ea7
DM
1213 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1214 " Found /%s^%s/m at offset %ld...\n",
1215 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1216 }
1217 else {
2e759faa 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1219 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1220 }
1221
ffad1e6a 1222 success_at_start:
e3c6feb0 1223
cad2e5aa 1224
dd170ff5
DM
1225 /* if we have a starting character class, then test that extra constraint.
1226 * (trie stclasses are too expensive to use here, we are better off to
1227 * leave it to regmatch itself) */
1228
f8fc2ecf 1229 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1230 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1231
2c75e362 1232 /* XXX this value could be pre-computed */
f8fc2ecf 1233 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1234 ? (reginfo->is_utf8_pat
1235 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1236 : STR_LEN(progi->regstclass))
66e933ab 1237 : 1);
1de06328 1238 char * endpos;
fa3bb21d 1239 char *s;
000dfd2d
DM
1240 /* latest pos that a matching float substr constrains rx start to */
1241 char *rx_max_float = NULL;
1242
c75a3985
DM
1243 /* if the current rx_origin is anchored, either by satisfying an
1244 * anchored substring constraint, or a /^.../m constraint, then we
1245 * can reject the current origin if the start class isn't found
1246 * at the current position. If we have a float-only match, then
1247 * rx_origin is constrained to a range; so look for the start class
1248 * in that range. if neither, then look for the start class in the
1249 * whole rest of the string */
1250
dd170ff5
DM
1251 /* XXX DAPM it's not clear what the minlen test is for, and why
1252 * it's not used in the floating case. Nothing in the test suite
1253 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1254 * Here are some old comments, which may or may not be correct:
1255 *
1256 * minlen == 0 is possible if regstclass is \b or \B,
1257 * and the fixed substr is ''$.
1258 * Since minlen is already taken into account, rx_origin+1 is
1259 * before strend; accidentally, minlen >= 1 guaranties no false
1260 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1261 * 0) below assumes that regstclass does not come from lookahead...
1262 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1263 * This leaves EXACTF-ish only, which are dealt with in
1264 * find_byclass().
1265 */
1266
7d2d37f5 1267 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1268 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1269 else if (prog->float_substr || prog->float_utf8) {
1270 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1271 endpos= HOP3c(rx_max_float, cl_l, strend);
1272 }
1de06328
YO
1273 else
1274 endpos= strend;
1275
1dc475d0
DM
1276 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1277 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1278 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1279 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1280 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1281
c43b5520 1282 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1283 reginfo);
be778b1a 1284 if (!s) {
6eb5f6b9 1285 if (endpos == strend) {
a3621e74 1286 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1287 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1288 goto fail;
1289 }
a3621e74 1290 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1291 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1292 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1293 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1294 goto fail;
9fed8d02 1295
6eb5f6b9 1296 /* Contradict one of substrings */
97136c8a
DM
1297 if (prog->anchored_substr || prog->anchored_utf8) {
1298 if (prog->substrs->check_ix == 1) { /* check is float */
1299 /* Have both, check_string is floating */
1300 assert(rx_origin + start_shift <= check_at);
1301 if (rx_origin + start_shift != check_at) {
1302 /* not at latest position float substr could match:
c75a3985
DM
1303 * Recheck anchored substring, but not floating.
1304 * The condition above is in bytes rather than
1305 * chars for efficiency. It's conservative, in
1306 * that it errs on the side of doing 'goto
1307 * do_other_substr', where a more accurate
1308 * char-based calculation will be done */
97136c8a
DM
1309 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1310 " Looking for anchored substr starting at offset %ld...\n",
1311 (long)(other_last - strpos)) );
1312 goto do_other_substr;
3369914b 1313 }
3369914b
DM
1314 }
1315 }
97136c8a 1316 else {
9fed8d02
DM
1317 /* float-only */
1318
7d2d37f5 1319 if (ml_anch) {
c75a3985
DM
1320 /* In the presence of ml_anch, we might be able to
1321 * find another \n without breaking the current float
1322 * constraint. */
1323
1324 /* strictly speaking this should be HOP3c(..., 1, ...),
1325 * but since we goto a block of code that's going to
1326 * search for the next \n if any, its safe here */
9fed8d02 1327 rx_origin++;
9fed8d02
DM
1328 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1329 " Looking for /%s^%s/m starting at offset %ld...\n",
1330 PL_colors[0], PL_colors[1],
1331 (long)(rx_origin - strpos)) );
9fed8d02 1332 goto postprocess_substr_matches;
ab60c45a 1333 }
c75a3985
DM
1334
1335 /* strictly speaking this can never be true; but might
1336 * be if we ever allow intuit without substrings */
1337 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1338 goto fail;
c75a3985 1339
000dfd2d 1340 rx_origin = rx_max_float;
9fed8d02
DM
1341 }
1342
c75a3985
DM
1343 /* at this point, any matching substrings have been
1344 * contradicted. Start again... */
1345
9fed8d02 1346 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1347
1348 /* uses bytes rather than char calculations for efficiency.
1349 * It's conservative: it errs on the side of doing 'goto restart',
1350 * where there is code that does a proper char-based test */
9fed8d02 1351 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1352 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1353 " Could not match STCLASS...\n") );
1354 goto fail;
1355 }
1356 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1357 " Looking for %s substr starting at offset %ld...\n",
1358 (prog->substrs->check_ix ? "floating" : "anchored"),
1359 (long)(rx_origin + start_shift - strpos)) );
1360 goto restart;
6eb5f6b9 1361 }
9fed8d02 1362
c75a3985
DM
1363 /* Success !!! */
1364
5f9c6575 1365 if (rx_origin != s) {
a3621e74 1366 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1367 " By STCLASS: moving %ld --> %ld\n",
5f9c6575 1368 (long)(rx_origin - strpos), (long)(s - strpos))
b7953727
JH
1369 );
1370 }
1371 else {
a3621e74 1372 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1373 " Does not contradict STCLASS...\n");
b7953727
JH
1374 );
1375 }
6eb5f6b9 1376 }
ffad1e6a
DM
1377
1378 /* Decide whether using the substrings helped */
1379
1380 if (rx_origin != strpos) {
1381 /* Fixed substring is found far enough so that the match
1382 cannot start at strpos. */
1383
1384 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1385 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1386 }
1387 else {
70563e16
DM
1388 /* The found rx_origin position does not prohibit matching at
1389 * strpos, so calling intuit didn't gain us anything. Decrement
1390 * the BmUSEFUL() count on the check substring, and if we reach
1391 * zero, free it. */
1392 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1393 && (utf8_target ? (
1394 prog->check_utf8 /* Could be deleted already */
1395 && --BmUSEFUL(prog->check_utf8) < 0
1396 && (prog->check_utf8 == prog->float_utf8)
1397 ) : (
1398 prog->check_substr /* Could be deleted already */
1399 && --BmUSEFUL(prog->check_substr) < 0
1400 && (prog->check_substr == prog->float_substr)
1401 )))
1402 {
1403 /* If flags & SOMETHING - do not do it many times on the same match */
1404 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1405 /* XXX Does the destruction order has to change with utf8_target? */
1406 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1407 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1408 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1409 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1410 check = NULL; /* abort */
ffad1e6a
DM
1411 /* XXXX This is a remnant of the old implementation. It
1412 looks wasteful, since now INTUIT can use many
1413 other heuristics. */
1414 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1415 }
1416 }
1417
1418 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1419 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1420 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1421
c765d6e0 1422 return rx_origin;
2c2d71f5
JH
1423
1424 fail_finish: /* Substring not found */
33b8afdf 1425 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1426 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1427 fail:
a3621e74 1428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1429 PL_colors[4], PL_colors[5]));
bd61b366 1430 return NULL;
cad2e5aa 1431}
9661b544 1432
70563e16 1433
a0a388a1 1434#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1435 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1436 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1437 trie_type = ((scan->flags == EXACT) \
1438 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1439 : (scan->flags == EXACTFA) \
1440 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1441 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1442
fd3249ee 1443#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1444STMT_START { \
fab2782b 1445 STRLEN skiplen; \
baa60164 1446 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1447 switch (trie_type) { \
31f05a37 1448 case trie_utf8_exactfa_fold: \
baa60164 1449 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1450 /* FALLTHROUGH */ \
fab2782b
YO
1451 case trie_utf8_fold: \
1452 if ( foldlen>0 ) { \
c80e42f3 1453 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1454 foldlen -= len; \
1455 uscan += len; \
1456 len=0; \
1457 } else { \
445bf929 1458 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1459 len = UTF8SKIP(uc); \
1460 skiplen = UNISKIP( uvc ); \
1461 foldlen -= skiplen; \
1462 uscan = foldbuf + skiplen; \
1463 } \
1464 break; \
baa60164
KW
1465 case trie_latin_utf8_exactfa_fold: \
1466 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1467 /* FALLTHROUGH */ \
fab2782b
YO
1468 case trie_latin_utf8_fold: \
1469 if ( foldlen>0 ) { \
c80e42f3 1470 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1471 foldlen -= len; \
1472 uscan += len; \
1473 len=0; \
1474 } else { \
1475 len = 1; \
31f05a37 1476 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1477 skiplen = UNISKIP( uvc ); \
1478 foldlen -= skiplen; \
1479 uscan = foldbuf + skiplen; \
1480 } \
1481 break; \
1482 case trie_utf8: \
c80e42f3 1483 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1484 break; \
1485 case trie_plain: \
1486 uvc = (UV)*uc; \
1487 len = 1; \
1488 } \
1489 if (uvc < 256) { \
1490 charid = trie->charmap[ uvc ]; \
1491 } \
1492 else { \
1493 charid = 0; \
1494 if (widecharmap) { \
1495 SV** const svpp = hv_fetch(widecharmap, \
1496 (char*)&uvc, sizeof(UV), 0); \
1497 if (svpp) \
1498 charid = (U16)SvIV(*svpp); \
1499 } \
1500 } \
4cadc6a9
YO
1501} STMT_END
1502
ae7c5b9b
KW
1503#define DUMP_EXEC_POS(li,s,doutf8) \
1504 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1505 startpos, doutf8)
1506
c84a03c5 1507#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1508STMT_START { \
1509 while (s <= e) { \
c84a03c5 1510 if ( (COND) \
fac1af77 1511 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1512 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1513 goto got_it; \
1514 s++; \
1515 } \
1516} STMT_END
1517
c84a03c5 1518#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1519STMT_START { \
9a902117 1520 while (s < strend) { \
c84a03c5 1521 CODE \
9a902117 1522 s += UTF8SKIP(s); \
4cadc6a9
YO
1523 } \
1524} STMT_END
1525
c84a03c5 1526#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1527STMT_START { \
1528 while (s < strend) { \
c84a03c5 1529 CODE \
4cadc6a9
YO
1530 s++; \
1531 } \
1532} STMT_END
1533
05bd126c
KW
1534#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1535REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1536 if (COND) { \
1537 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1538 goto got_it; \
1539 else \
1540 tmp = doevery; \
1541 } \
1542 else \
1543 tmp = 1; \
4cadc6a9
YO
1544)
1545
05bd126c
KW
1546#define REXEC_FBC_CLASS_SCAN(COND) \
1547REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1548 if (COND) { \
1549 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1550 goto got_it; \
1551 else \
1552 tmp = doevery; \
1553 } \
1554 else \
1555 tmp = 1; \
4cadc6a9
YO
1556)
1557
c84a03c5 1558#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1559 if (utf8_target) { \
c84a03c5 1560 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1561 } \
1562 else { \
c84a03c5 1563 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1564 }
05bd126c 1565
05bd126c
KW
1566/* The three macros below are slightly different versions of the same logic.
1567 *
1568 * The first is for /a and /aa when the target string is UTF-8. This can only
1569 * match ascii, but it must advance based on UTF-8. The other two handle the
1570 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1571 * for the boundary (or non-boundary) between a word and non-word character.
1572 * The utf8 and non-utf8 cases have the same logic, but the details must be
1573 * different. Find the "wordness" of the character just prior to this one, and
1574 * compare it with the wordness of this one. If they differ, we have a
1575 * boundary. At the beginning of the string, pretend that the previous
1576 * character was a new-line.
1577 *
1578 * All these macros uncleanly have side-effects with each other and outside
1579 * variables. So far it's been too much trouble to clean-up
1580 *
1581 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1582 * a word character or not.
1583 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1584 * word/non-word
1585 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1586 *
1587 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1588 * are looking for a boundary or for a non-boundary. If we are looking for a
1589 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1590 * see if this tentative match actually works, and if so, to quit the loop
1591 * here. And vice-versa if we are looking for a non-boundary.
1592 *
1593 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1594 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1595 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1596 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1597 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1598 * complement. But in that branch we complement tmp, meaning that at the
1599 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1600 * which means at the top of the loop in the next iteration, it is
1601 * TEST_NON_UTF8(s-1) */
b2f4e957 1602#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1603 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1604 tmp = TEST_NON_UTF8(tmp); \
1605 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1606 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1607 tmp = !tmp; \
1608 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1609 } \
1610 else { \
1611 IF_FAIL; \
1612 } \
1613 ); \
1614
1615/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1616 * TEST_UTF8 is a macro that for the same input code points returns identically
1617 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1618#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1619 if (s == reginfo->strbeg) { \
1620 tmp = '\n'; \
1621 } \
1622 else { /* Back-up to the start of the previous character */ \
1623 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1624 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1625 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1626 } \
1627 tmp = TEST_UV(tmp); \
1628 LOAD_UTF8_CHARCLASS_ALNUM(); \
1629 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1630 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1631 tmp = !tmp; \
1632 IF_SUCCESS; \
1633 } \
1634 else { \
1635 IF_FAIL; \
1636 } \
1637 );
cfaf538b 1638
05bd126c
KW
1639/* Like the above two macros. UTF8_CODE is the complete code for handling
1640 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1641 * macros below */
baa60164 1642#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1643 if (utf8_target) { \
05bd126c 1644 UTF8_CODE \
63ac0dad
KW
1645 } \
1646 else { /* Not utf8 */ \
9d9163fb 1647 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1648 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1649 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1650 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1651 IF_SUCCESS; \
760cfa8e 1652 tmp = !tmp; \
63ac0dad
KW
1653 } \
1654 else { \
1655 IF_FAIL; \
1656 } \
1657 ); \
1658 } \
c8519dc7
KW
1659 /* Here, things have been set up by the previous code so that tmp is the \
1660 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1661 * utf8ness of the target). We also have to check if this matches against \
1662 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1663 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1664 * string */ \
1665 if (tmp == ! TEST_NON_UTF8('\n')) { \
1666 IF_SUCCESS; \
1667 } \
1668 else { \
1669 IF_FAIL; \
1670 }
63ac0dad 1671
ae7c5b9b
KW
1672/* This is the macro to use when we want to see if something that looks like it
1673 * could match, actually does, and if so exits the loop */
1674#define REXEC_FBC_TRYIT \
1675 if ((reginfo->intuit || regtry(reginfo, &s))) \
1676 goto got_it
1677
1678/* The only difference between the BOUND and NBOUND cases is that
1679 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1680 * NBOUND. This is accomplished by passing it as either the if or else clause,
1681 * with the other one being empty (PLACEHOLDER is defined as empty).
1682 *
1683 * The TEST_FOO parameters are for operating on different forms of input, but
1684 * all should be ones that return identically for the same underlying code
1685 * points */
1686#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1687 FBC_BOUND_COMMON( \
1688 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1689 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1690
1691#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1692 FBC_BOUND_COMMON( \
1693 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1694 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1695
1696#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1697 FBC_BOUND_COMMON( \
1698 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1699 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1700
1701#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1702 FBC_BOUND_COMMON( \
1703 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1704 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1705
05bd126c 1706
786e8c11 1707/* We know what class REx starts with. Try to find this position... */
02d5137b 1708/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1709/* annoyingly all the vars in this routine have different names from their counterparts
1710 in regmatch. /grrr */
3c3eec57 1711STATIC char *
07be1b83 1712S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1713 const char *strend, regmatch_info *reginfo)
a687059c 1714{
73104a1b
KW
1715 dVAR;
1716 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1717 char *pat_string; /* The pattern's exactish string */
1718 char *pat_end; /* ptr to end char of pat_string */
1719 re_fold_t folder; /* Function for computing non-utf8 folds */
1720 const U8 *fold_array; /* array for folding ords < 256 */
1721 STRLEN ln;
1722 STRLEN lnc;
73104a1b
KW
1723 U8 c1;
1724 U8 c2;
1725 char *e;
1726 I32 tmp = 1; /* Scratch variable? */
ba44c216 1727 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1728 UV utf8_fold_flags = 0;
f9176b44 1729 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1730 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1731 with a result inverts that result, as 0^1 =
1732 1 and 1^1 = 0 */
1733 _char_class_number classnum;
1734
73104a1b 1735 RXi_GET_DECL(prog,progi);
2f7f8cb1 1736
73104a1b 1737 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1738
73104a1b
KW
1739 /* We know what class it must start with. */
1740 switch (OP(c)) {
1741 case ANYOF:
1742 if (utf8_target) {
1743 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1744 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1745 }
1746 else {
1747 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1748 }
1749 break;
1750 case CANY:
1751 REXEC_FBC_SCAN(
02d5137b 1752 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1753 goto got_it;
1754 else
1755 tmp = doevery;
1756 );
1757 break;
1758
098b07d5
KW
1759 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1760 assert(! is_utf8_pat);
924ba076 1761 /* FALLTHROUGH */
73104a1b 1762 case EXACTFA:
984e6dd1 1763 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1764 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1765 goto do_exactf_utf8;
1766 }
1767 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1768 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1769 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1770
2fdb7295
KW
1771 case EXACTF: /* This node only generated for non-utf8 patterns */
1772 assert(! is_utf8_pat);
73104a1b 1773 if (utf8_target) {
73104a1b
KW
1774 utf8_fold_flags = 0;
1775 goto do_exactf_utf8;
1776 }
1777 fold_array = PL_fold;
1778 folder = foldEQ;
1779 goto do_exactf_non_utf8;
1780
1781 case EXACTFL:
31f05a37 1782 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1783 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1784 goto do_exactf_utf8;
1785 }
1786 fold_array = PL_fold_locale;
1787 folder = foldEQ_locale;
1788 goto do_exactf_non_utf8;
3c760661 1789
73104a1b 1790 case EXACTFU_SS:
984e6dd1 1791 if (is_utf8_pat) {
73104a1b
KW
1792 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1793 }
1794 goto do_exactf_utf8;
16d951b7 1795
73104a1b 1796 case EXACTFU:
984e6dd1
DM
1797 if (is_utf8_pat || utf8_target) {
1798 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1799 goto do_exactf_utf8;
1800 }
fac1af77 1801
73104a1b
KW
1802 /* Any 'ss' in the pattern should have been replaced by regcomp,
1803 * so we don't have to worry here about this single special case
1804 * in the Latin1 range */
1805 fold_array = PL_fold_latin1;
1806 folder = foldEQ_latin1;
1807
924ba076 1808 /* FALLTHROUGH */
73104a1b
KW
1809
1810 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1811 are no glitches with fold-length differences
1812 between the target string and pattern */
1813
1814 /* The idea in the non-utf8 EXACTF* cases is to first find the
1815 * first character of the EXACTF* node and then, if necessary,
1816 * case-insensitively compare the full text of the node. c1 is the
1817 * first character. c2 is its fold. This logic will not work for
1818 * Unicode semantics and the german sharp ss, which hence should
1819 * not be compiled into a node that gets here. */
1820 pat_string = STRING(c);
1821 ln = STR_LEN(c); /* length to match in octets/bytes */
1822
1823 /* We know that we have to match at least 'ln' bytes (which is the
1824 * same as characters, since not utf8). If we have to match 3
1825 * characters, and there are only 2 availabe, we know without
1826 * trying that it will fail; so don't start a match past the
1827 * required minimum number from the far end */
ea3daa5d 1828 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1829
02d5137b 1830 if (reginfo->intuit && e < s) {
73104a1b
KW
1831 e = s; /* Due to minlen logic of intuit() */
1832 }
fac1af77 1833
73104a1b
KW
1834 c1 = *pat_string;
1835 c2 = fold_array[c1];
1836 if (c1 == c2) { /* If char and fold are the same */
1837 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1838 }
1839 else {
1840 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1841 }
1842 break;
fac1af77 1843
73104a1b
KW
1844 do_exactf_utf8:
1845 {
1846 unsigned expansion;
1847
1848 /* If one of the operands is in utf8, we can't use the simpler folding
1849 * above, due to the fact that many different characters can have the
1850 * same fold, or portion of a fold, or different- length fold */
1851 pat_string = STRING(c);
1852 ln = STR_LEN(c); /* length to match in octets/bytes */
1853 pat_end = pat_string + ln;
984e6dd1 1854 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1855 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1856 : ln;
1857
1858 /* We have 'lnc' characters to match in the pattern, but because of
1859 * multi-character folding, each character in the target can match
1860 * up to 3 characters (Unicode guarantees it will never exceed
1861 * this) if it is utf8-encoded; and up to 2 if not (based on the
1862 * fact that the Latin 1 folds are already determined, and the
1863 * only multi-char fold in that range is the sharp-s folding to
1864 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1865 * string character. Adjust lnc accordingly, rounding up, so that
1866 * if we need to match at least 4+1/3 chars, that really is 5. */
1867 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1868 lnc = (lnc + expansion - 1) / expansion;
1869
1870 /* As in the non-UTF8 case, if we have to match 3 characters, and
1871 * only 2 are left, it's guaranteed to fail, so don't start a
1872 * match that would require us to go beyond the end of the string
1873 */
ea3daa5d 1874 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1875
02d5137b 1876 if (reginfo->intuit && e < s) {
73104a1b
KW
1877 e = s; /* Due to minlen logic of intuit() */
1878 }
0658cdde 1879
73104a1b
KW
1880 /* XXX Note that we could recalculate e to stop the loop earlier,
1881 * as the worst case expansion above will rarely be met, and as we
1882 * go along we would usually find that e moves further to the left.
1883 * This would happen only after we reached the point in the loop
1884 * where if there were no expansion we should fail. Unclear if
1885 * worth the expense */
1886
1887 while (s <= e) {
1888 char *my_strend= (char *)strend;
1889 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1890 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1891 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1892 {
1893 goto got_it;
1894 }
1895 s += (utf8_target) ? UTF8SKIP(s) : 1;
1896 }
1897 break;
1898 }
236d82fd 1899
73104a1b 1900 case BOUNDL:
236d82fd 1901 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b
KW
1902 break;
1903 case NBOUNDL:
236d82fd 1904 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b
KW
1905 break;
1906 case BOUND:
236d82fd 1907 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1908 break;
1909 case BOUNDA:
236d82fd 1910 FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
73104a1b
KW
1911 break;
1912 case NBOUND:
236d82fd 1913 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1914 break;
1915 case NBOUNDA:
236d82fd 1916 FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
73104a1b
KW
1917 break;
1918 case BOUNDU:
236d82fd 1919 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1920 break;
1921 case NBOUNDU:
236d82fd 1922 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 1923 break;
73104a1b
KW
1924 case LNBREAK:
1925 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1926 is_LNBREAK_latin1_safe(s, strend)
1927 );
1928 break;
3018b823
KW
1929
1930 /* The argument to all the POSIX node types is the class number to pass to
1931 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1932
1933 case NPOSIXL:
1934 to_complement = 1;
1935 /* FALLTHROUGH */
1936
1937 case POSIXL:
3018b823
KW
1938 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1939 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1940 break;
3018b823
KW
1941
1942 case NPOSIXD:
1943 to_complement = 1;
1944 /* FALLTHROUGH */
1945
1946 case POSIXD:
1947 if (utf8_target) {
1948 goto posix_utf8;
1949 }
1950 goto posixa;
1951
1952 case NPOSIXA:
1953 if (utf8_target) {
1954 /* The complement of something that matches only ASCII matches all
837226c8
KW
1955 * non-ASCII, plus everything in ASCII that isn't in the class. */
1956 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
1957 || ! _generic_isCC_A(*s, FLAGS(c)));
1958 break;
1959 }
1960
1961 to_complement = 1;
1962 /* FALLTHROUGH */
1963
73104a1b 1964 case POSIXA:
3018b823 1965 posixa:
73104a1b 1966 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1967 * byte invariant character. */
1968 REXEC_FBC_CLASS_SCAN(
1969 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1970 break;
3018b823
KW
1971
1972 case NPOSIXU:
1973 to_complement = 1;
1974 /* FALLTHROUGH */
1975
1976 case POSIXU:
1977 if (! utf8_target) {
1978 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1979 FLAGS(c))));
1980 }
1981 else {
1982
1983 posix_utf8:
1984 classnum = (_char_class_number) FLAGS(c);
1985 if (classnum < _FIRST_NON_SWASH_CC) {
1986 while (s < strend) {
1987
1988 /* We avoid loading in the swash as long as possible, but
1989 * should we have to, we jump to a separate loop. This
1990 * extra 'if' statement is what keeps this code from being
1991 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1992 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1993 goto found_above_latin1;
1994 }
1995 if ((UTF8_IS_INVARIANT(*s)
1996 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1997 classnum)))
1998 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1999 && to_complement ^ cBOOL(
94bb8c36
KW
2000 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2001 *(s + 1)),
3018b823
KW
2002 classnum))))
2003 {
02d5137b 2004 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2005 goto got_it;
2006 else {
2007 tmp = doevery;
2008 }
2009 }
2010 else {
2011 tmp = 1;
2012 }
2013 s += UTF8SKIP(s);
2014 }
2015 }
2016 else switch (classnum) { /* These classes are implemented as
2017 macros */
2018 case _CC_ENUM_SPACE: /* XXX would require separate code if we
2019 revert the change of \v matching this */
924ba076 2020 /* FALLTHROUGH */
3018b823
KW
2021
2022 case _CC_ENUM_PSXSPC:
2023 REXEC_FBC_UTF8_CLASS_SCAN(
2024 to_complement ^ cBOOL(isSPACE_utf8(s)));
2025 break;
2026
2027 case _CC_ENUM_BLANK:
2028 REXEC_FBC_UTF8_CLASS_SCAN(
2029 to_complement ^ cBOOL(isBLANK_utf8(s)));
2030 break;
2031
2032 case _CC_ENUM_XDIGIT:
2033 REXEC_FBC_UTF8_CLASS_SCAN(
2034 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2035 break;
2036
2037 case _CC_ENUM_VERTSPACE:
2038 REXEC_FBC_UTF8_CLASS_SCAN(
2039 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2040 break;
2041
2042 case _CC_ENUM_CNTRL:
2043 REXEC_FBC_UTF8_CLASS_SCAN(
2044 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2045 break;
2046
2047 default:
2048 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
a25b5927 2049 assert(0); /* NOTREACHED */
3018b823
KW
2050 }
2051 }
2052 break;
2053
2054 found_above_latin1: /* Here we have to load a swash to get the result
2055 for the current code point */
2056 if (! PL_utf8_swash_ptrs[classnum]) {
2057 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2058 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2059 _core_swash_init("utf8",
2060 "",
2061 &PL_sv_undef, 1, 0,
2062 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2063 }
2064
2065 /* This is a copy of the loop above for swash classes, though using the
2066 * FBC macro instead of being expanded out. Since we've loaded the
2067 * swash, we don't have to check for that each time through the loop */
2068 REXEC_FBC_UTF8_CLASS_SCAN(
2069 to_complement ^ cBOOL(_generic_utf8(
2070 classnum,
2071 s,
2072 swash_fetch(PL_utf8_swash_ptrs[classnum],
2073 (U8 *) s, TRUE))));
73104a1b
KW
2074 break;
2075
2076 case AHOCORASICKC:
2077 case AHOCORASICK:
2078 {
2079 DECL_TRIE_TYPE(c);
2080 /* what trie are we using right now */
2081 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2082 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2083 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2084
2085 const char *last_start = strend - trie->minlen;
6148ee25 2086#ifdef DEBUGGING
73104a1b 2087 const char *real_start = s;
6148ee25 2088#endif
73104a1b
KW
2089 STRLEN maxlen = trie->maxlen;
2090 SV *sv_points;
2091 U8 **points; /* map of where we were in the input string
2092 when reading a given char. For ASCII this
2093 is unnecessary overhead as the relationship
2094 is always 1:1, but for Unicode, especially
2095 case folded Unicode this is not true. */
2096 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2097 U8 *bitmap=NULL;
2098
2099
2100 GET_RE_DEBUG_FLAGS_DECL;
2101
2102 /* We can't just allocate points here. We need to wrap it in
2103 * an SV so it gets freed properly if there is a croak while
2104 * running the match */
2105 ENTER;
2106 SAVETMPS;
2107 sv_points=newSV(maxlen * sizeof(U8 *));
2108 SvCUR_set(sv_points,
2109 maxlen * sizeof(U8 *));
2110 SvPOK_on(sv_points);
2111 sv_2mortal(sv_points);
2112 points=(U8**)SvPV_nolen(sv_points );
2113 if ( trie_type != trie_utf8_fold
2114 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2115 {
2116 if (trie->bitmap)
2117 bitmap=(U8*)trie->bitmap;
2118 else
2119 bitmap=(U8*)ANYOF_BITMAP(c);
2120 }
2121 /* this is the Aho-Corasick algorithm modified a touch
2122 to include special handling for long "unknown char" sequences.
2123 The basic idea being that we use AC as long as we are dealing
2124 with a possible matching char, when we encounter an unknown char
2125 (and we have not encountered an accepting state) we scan forward
2126 until we find a legal starting char.
2127 AC matching is basically that of trie matching, except that when
2128 we encounter a failing transition, we fall back to the current
2129 states "fail state", and try the current char again, a process
2130 we repeat until we reach the root state, state 1, or a legal
2131 transition. If we fail on the root state then we can either
2132 terminate if we have reached an accepting state previously, or
2133 restart the entire process from the beginning if we have not.
2134
2135 */
2136 while (s <= last_start) {
2137 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2138 U8 *uc = (U8*)s;
2139 U16 charid = 0;
2140 U32 base = 1;
2141 U32 state = 1;
2142 UV uvc = 0;
2143 STRLEN len = 0;
2144 STRLEN foldlen = 0;
2145 U8 *uscan = (U8*)NULL;
2146 U8 *leftmost = NULL;
2147#ifdef DEBUGGING
2148 U32 accepted_word= 0;
786e8c11 2149#endif
73104a1b
KW
2150 U32 pointpos = 0;
2151
2152 while ( state && uc <= (U8*)strend ) {
2153 int failed=0;
2154 U32 word = aho->states[ state ].wordnum;
2155
2156 if( state==1 ) {
2157 if ( bitmap ) {
2158 DEBUG_TRIE_EXECUTE_r(
2159 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2160 dump_exec_pos( (char *)uc, c, strend, real_start,
2161 (char *)uc, utf8_target );
2162 PerlIO_printf( Perl_debug_log,
2163 " Scanning for legal start char...\n");
2164 }
2165 );
2166 if (utf8_target) {
2167 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2168 uc += UTF8SKIP(uc);
2169 }
2170 } else {
2171 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2172 uc++;
2173 }
786e8c11 2174 }
73104a1b 2175 s= (char *)uc;
07be1b83 2176 }
73104a1b
KW
2177 if (uc >(U8*)last_start) break;
2178 }
2179
2180 if ( word ) {
2181 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2182 if (!leftmost || lpos < leftmost) {
2183 DEBUG_r(accepted_word=word);
2184 leftmost= lpos;
7016d6eb 2185 }
73104a1b 2186 if (base==0) break;
7016d6eb 2187
73104a1b
KW
2188 }
2189 points[pointpos++ % maxlen]= uc;
2190 if (foldlen || uc < (U8*)strend) {
2191 REXEC_TRIE_READ_CHAR(trie_type, trie,
2192 widecharmap, uc,
2193 uscan, len, uvc, charid, foldlen,
2194 foldbuf, uniflags);
2195 DEBUG_TRIE_EXECUTE_r({
2196 dump_exec_pos( (char *)uc, c, strend,
2197 real_start, s, utf8_target);
2198 PerlIO_printf(Perl_debug_log,
2199 " Charid:%3u CP:%4"UVxf" ",
2200 charid, uvc);
2201 });
2202 }
2203 else {
2204 len = 0;
2205 charid = 0;
2206 }
07be1b83 2207
73104a1b
KW
2208
2209 do {
6148ee25 2210#ifdef DEBUGGING
73104a1b 2211 word = aho->states[ state ].wordnum;
6148ee25 2212#endif
73104a1b
KW
2213 base = aho->states[ state ].trans.base;
2214
2215 DEBUG_TRIE_EXECUTE_r({
2216 if (failed)
2217 dump_exec_pos( (char *)uc, c, strend, real_start,
2218 s, utf8_target );
2219 PerlIO_printf( Perl_debug_log,
2220 "%sState: %4"UVxf", word=%"UVxf,
2221 failed ? " Fail transition to " : "",
2222 (UV)state, (UV)word);
2223 });
2224 if ( base ) {
2225 U32 tmp;
2226 I32 offset;
2227 if (charid &&
2228 ( ((offset = base + charid
2229 - 1 - trie->uniquecharcount)) >= 0)
2230 && ((U32)offset < trie->lasttrans)
2231 && trie->trans[offset].check == state
2232 && (tmp=trie->trans[offset].next))
2233 {
2234 DEBUG_TRIE_EXECUTE_r(
2235 PerlIO_printf( Perl_debug_log," - legal\n"));
2236 state = tmp;
2237 break;
07be1b83
YO
2238 }
2239 else {
786e8c11 2240 DEBUG_TRIE_EXECUTE_r(
73104a1b 2241 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2242 failed = 1;
73104a1b 2243 state = aho->fail[state];
07be1b83 2244 }
07be1b83 2245 }
73104a1b
KW
2246 else {
2247 /* we must be accepting here */
2248 DEBUG_TRIE_EXECUTE_r(
2249 PerlIO_printf( Perl_debug_log," - accepting\n"));
2250 failed = 1;
2251 break;
786e8c11 2252 }
73104a1b
KW
2253 } while(state);
2254 uc += len;
2255 if (failed) {
2256 if (leftmost)
2257 break;
2258 if (!state) state = 1;
07be1b83 2259 }
73104a1b
KW
2260 }
2261 if ( aho->states[ state ].wordnum ) {
2262 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2263 if (!leftmost || lpos < leftmost) {
2264 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2265 leftmost = lpos;
07be1b83
YO
2266 }
2267 }
73104a1b
KW
2268 if (leftmost) {
2269 s = (char*)leftmost;
2270 DEBUG_TRIE_EXECUTE_r({
2271 PerlIO_printf(
2272 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2273 (UV)accepted_word, (IV)(s - real_start)
2274 );
2275 });
02d5137b 2276 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2277 FREETMPS;
2278 LEAVE;
2279 goto got_it;
2280 }
2281 s = HOPc(s,1);
2282 DEBUG_TRIE_EXECUTE_r({
2283 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2284 });
2285 } else {
2286 DEBUG_TRIE_EXECUTE_r(
2287 PerlIO_printf( Perl_debug_log,"No match.\n"));
2288 break;
2289 }
2290 }
2291 FREETMPS;
2292 LEAVE;
2293 }
2294 break;
2295 default:
2296 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2297 }
2298 return 0;
2299 got_it:
2300 return s;
6eb5f6b9
JH
2301}
2302
60165aa4
DM
2303/* set RX_SAVED_COPY, RX_SUBBEG etc.
2304 * flags have same meanings as with regexec_flags() */
2305
749f4950
DM
2306static void
2307S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2308 char *strbeg,
2309 char *strend,
2310 SV *sv,
2311 U32 flags,
2312 bool utf8_target)
2313{
2314 struct regexp *const prog = ReANY(rx);
2315
60165aa4
DM
2316 if (flags & REXEC_COPY_STR) {
2317#ifdef PERL_ANY_COW
2318 if (SvCANCOW(sv)) {
2319 if (DEBUG_C_TEST) {
2320 PerlIO_printf(Perl_debug_log,
2321 "Copy on write: regexp capture, type %d\n",
2322 (int) SvTYPE(sv));
2323 }
5411a0e5
DM
2324 /* Create a new COW SV to share the match string and store
2325 * in saved_copy, unless the current COW SV in saved_copy
2326 * is valid and suitable for our purpose */
2327 if (( prog->saved_copy
2328 && SvIsCOW(prog->saved_copy)
2329 && SvPOKp(prog->saved_copy)
2330 && SvIsCOW(sv)
2331 && SvPOKp(sv)
2332 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2333 {
5411a0e5
DM
2334 /* just reuse saved_copy SV */
2335 if (RXp_MATCH_COPIED(prog)) {
2336 Safefree(prog->subbeg);
2337 RXp_MATCH_COPIED_off(prog);
2338 }
2339 }
2340 else {
2341 /* create new COW SV to share string */
a76b0e90
DM
2342 RX_MATCH_COPY_FREE(rx);
2343 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2344 }
5411a0e5
DM
2345 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2346 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2347 prog->sublen = strend - strbeg;
2348 prog->suboffset = 0;
2349 prog->subcoffset = 0;
2350 } else
2351#endif
2352 {
99a90e59
FC
2353 SSize_t min = 0;
2354 SSize_t max = strend - strbeg;
ea3daa5d 2355 SSize_t sublen;
60165aa4
DM
2356
2357 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2358 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2359 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2360 ) { /* don't copy $' part of string */
2361 U32 n = 0;
2362 max = -1;
2363 /* calculate the right-most part of the string covered
2364 * by a capture. Due to look-ahead, this may be to
2365 * the right of $&, so we have to scan all captures */
2366 while (n <= prog->lastparen) {
2367 if (prog->offs[n].end > max)
2368 max = prog->offs[n].end;
2369 n++;
2370 }
2371 if (max == -1)
2372 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2373 ? prog->offs[0].start
2374 : 0;
2375 assert(max >= 0 && max <= strend - strbeg);
2376 }
2377
2378 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2379 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2380 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2381 ) { /* don't copy $` part of string */
2382 U32 n = 0;
2383 min = max;
2384 /* calculate the left-most part of the string covered
2385 * by a capture. Due to look-behind, this may be to
2386 * the left of $&, so we have to scan all captures */
2387 while (min && n <= prog->lastparen) {
2388 if ( prog->offs[n].start != -1
2389 && prog->offs[n].start < min)
2390 {
2391 min = prog->offs[n].start;
2392 }
2393 n++;
2394 }
2395 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2396 && min > prog->offs[0].end
2397 )
2398 min = prog->offs[0].end;
2399
2400 }
2401
2402 assert(min >= 0 && min <= max && min <= strend - strbeg);
2403 sublen = max - min;
2404
2405 if (RX_MATCH_COPIED(rx)) {
2406 if (sublen > prog->sublen)
2407 prog->subbeg =
2408 (char*)saferealloc(prog->subbeg, sublen+1);
2409 }
2410 else
2411 prog->subbeg = (char*)safemalloc(sublen+1);
2412 Copy(strbeg + min, prog->subbeg, sublen, char);
2413 prog->subbeg[sublen] = '\0';
2414 prog->suboffset = min;
2415 prog->sublen = sublen;
2416 RX_MATCH_COPIED_on(rx);
2417 }
2418 prog->subcoffset = prog->suboffset;
2419 if (prog->suboffset && utf8_target) {
2420 /* Convert byte offset to chars.
2421 * XXX ideally should only compute this if @-/@+
2422 * has been seen, a la PL_sawampersand ??? */
2423
2424 /* If there's a direct correspondence between the
2425 * string which we're matching and the original SV,
2426 * then we can use the utf8 len cache associated with
2427 * the SV. In particular, it means that under //g,
2428 * sv_pos_b2u() will use the previously cached
2429 * position to speed up working out the new length of
2430 * subcoffset, rather than counting from the start of
2431 * the string each time. This stops
2432 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2433 * from going quadratic */
2434 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2435 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2436 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2437 else
2438 prog->subcoffset = utf8_length((U8*)strbeg,
2439 (U8*)(strbeg+prog->suboffset));
2440 }
2441 }
2442 else {
2443 RX_MATCH_COPY_FREE(rx);
2444 prog->subbeg = strbeg;
2445 prog->suboffset = 0;
2446 prog->subcoffset = 0;
2447 prog->sublen = strend - strbeg;
2448 }
2449}
2450
2451
2452
fae667d5 2453
6eb5f6b9
JH
2454/*
2455 - regexec_flags - match a regexp against a string
2456 */
2457I32
5aaab254 2458Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2459 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2460/* stringarg: the point in the string at which to begin matching */
2461/* strend: pointer to null at end of string */
2462/* strbeg: real beginning of string */
2463/* minend: end of match must be >= minend bytes after stringarg. */
2464/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2465 * itself is accessed via the pointers above */
2466/* data: May be used for some additional optimizations.
d058ec57 2467 Currently unused. */
a340edde 2468/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2469
6eb5f6b9 2470{
8d919b0a 2471 struct regexp *const prog = ReANY(rx);
5aaab254 2472 char *s;
eb578fdb 2473 regnode *c;
03c83e26 2474 char *startpos;
ea3daa5d
FC
2475 SSize_t minlen; /* must match at least this many chars */
2476 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2477 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2478 I32 multiline;
f8fc2ecf 2479 RXi_GET_DECL(prog,progi);
02d5137b
DM
2480 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2481 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2482 regexp_paren_pair *swap = NULL;
006f26b2 2483 I32 oldsave;
a3621e74
YO
2484 GET_RE_DEBUG_FLAGS_DECL;
2485
7918f24d 2486 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2487 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2488
2489 /* Be paranoid... */
03c83e26 2490 if (prog == NULL || stringarg == NULL) {
6eb5f6b9 2491 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2492 }
2493
6c3fea77 2494 DEBUG_EXECUTE_r(
03c83e26 2495 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2496 "Matching");
2497 );
8adc0f72 2498
b342a604
DM
2499 startpos = stringarg;
2500
58430ea8 2501 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2502 MAGIC *mg;
2503
fef7148b
DM
2504 /* set reginfo->ganch, the position where \G can match */
2505
2506 reginfo->ganch =
2507 (flags & REXEC_IGNOREPOS)
2508 ? stringarg /* use start pos rather than pos() */
2509 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2510 /* Defined pos(): */
2511 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2512 : strbeg; /* pos() not defined; use start of string */
2513
2514 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2515 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2516
03c83e26
DM
2517 /* in the presence of \G, we may need to start looking earlier in
2518 * the string than the suggested start point of stringarg:
0b2c2a84 2519 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2520 * offset, such as
2521 * /..\G/: gofs = 2
2522 * /ab|c\G/: gofs = 1
2523 * or if the minimum offset isn't known, then we have to go back
2524 * to the start of the string, e.g. /w+\G/
2525 */
2bfbe302 2526
8e1490ee 2527 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2528 startpos = reginfo->ganch - prog->gofs;
2529 if (startpos <
2530 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2531 {
2532 DEBUG_r(PerlIO_printf(Perl_debug_log,
2533 "fail: ganch-gofs before earliest possible start\n"));
2534 return 0;
2535 }
2536 }
2537 else if (prog->gofs) {
b342a604
DM
2538 if (startpos - prog->gofs < strbeg)
2539 startpos = strbeg;
2540 else
2541 startpos -= prog->gofs;
03c83e26 2542 }
58430ea8 2543 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2544 startpos = strbeg;
03c83e26
DM
2545 }
2546
2547 minlen = prog->minlen;
b342a604 2548 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2549 DEBUG_r(PerlIO_printf(Perl_debug_log,
2550 "Regex match can't succeed, so not even tried\n"));
2551 return 0;
2552 }
2553
63a3746a
DM
2554 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2555 * which will call destuctors to reset PL_regmatch_state, free higher
2556 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2557 * regmatch_info_aux_eval */
2558
2559 oldsave = PL_savestack_ix;
2560
dfa77d06
DM
2561 s = startpos;
2562
e322109a 2563 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2564 && !(flags & REXEC_CHECKED))
2565 {
dfa77d06 2566 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2567 flags, NULL);
dfa77d06 2568 if (!s)
7fadf4a7
DM
2569 return 0;
2570
e322109a 2571 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2572 /* we can match based purely on the result of INTUIT.
2573 * Set up captures etc just for $& and $-[0]
2574 * (an intuit-only match wont have $1,$2,..) */
2575 assert(!prog->nparens);
d5e7783a
DM
2576
2577 /* s/// doesn't like it if $& is earlier than where we asked it to
2578 * start searching (which can happen on something like /.\G/) */
2579 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2580 && (s < stringarg))
2581 {
2582 /* this should only be possible under \G */
58430ea8 2583 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2585 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2586 goto phooey;
2587 }
2588
7fadf4a7
DM
2589 /* match via INTUIT shouldn't have any captures.
2590 * Let @-, @+, $^N know */
2591 prog->lastparen = prog->lastcloseparen = 0;
2592 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2593 prog->offs[0].start = s - strbeg;
2594 prog->offs[0].end = utf8_target
2595 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2596 : s - strbeg + prog->minlenret;
7fadf4a7 2597 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2598 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2599 strbeg, strend,
2600 sv, flags, utf8_target);
2601
7fadf4a7
DM
2602 return 1;
2603 }
2604 }
2605
6c3fea77 2606 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2607
dfa77d06 2608 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2609 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2610 "String too short [regexec_flags]...\n"));
2611 goto phooey;
1aa99e6b 2612 }
1de06328 2613
6eb5f6b9 2614 /* Check validity of program. */
f8fc2ecf 2615 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2616 Perl_croak(aTHX_ "corrupted regexp program");
2617 }
2618
1738e041
DM
2619 RX_MATCH_TAINTED_off(rx);
2620
6c3fea77
DM
2621 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2622 reginfo->intuit = 0;
2623 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2624 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2625 reginfo->warned = FALSE;
9d9163fb 2626 reginfo->strbeg = strbeg;
02d5137b 2627 reginfo->sv = sv;
1cb48e53 2628 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2629 reginfo->strend = strend;
6eb5f6b9 2630 /* see how far we have to get to not match where we matched before */
fe3974be 2631 reginfo->till = stringarg + minend;
6eb5f6b9 2632
60779a30 2633 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2634 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2635 S_cleanup_regmatch_info_aux has executed (registered by
2636 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2637 magic belonging to this SV.
2638 Not newSVsv, either, as it does not COW.
2639 */
2640 reginfo->sv = newSV(0);
4cba5ac0 2641 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2642 SAVEFREESV(reginfo->sv);
2643 }
2644
331b2dcc
DM
2645 /* reserve next 2 or 3 slots in PL_regmatch_state:
2646 * slot N+0: may currently be in use: skip it
2647 * slot N+1: use for regmatch_info_aux struct
2648 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2649 * slot N+3: ready for use by regmatch()
2650 */
bf2039a9 2651
331b2dcc
DM
2652 {
2653 regmatch_state *old_regmatch_state;
2654 regmatch_slab *old_regmatch_slab;
2655 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2656
2657 /* on first ever match, allocate first slab */
2658 if (!PL_regmatch_slab) {
2659 Newx(PL_regmatch_slab, 1, regmatch_slab);
2660 PL_regmatch_slab->prev = NULL;
2661 PL_regmatch_slab->next = NULL;
2662 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2663 }
bf2039a9 2664
331b2dcc
DM
2665 old_regmatch_state = PL_regmatch_state;
2666 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2667
331b2dcc
DM
2668 for (i=0; i <= max; i++) {
2669 if (i == 1)
2670 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2671 else if (i ==2)
2672 reginfo->info_aux_eval =
2673 reginfo->info_aux->info_aux_eval =
2674 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2675
331b2dcc
DM
2676 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2677 PL_regmatch_state = S_push_slab(aTHX);
2678 }
bf2039a9 2679
331b2dcc
DM
2680 /* note initial PL_regmatch_state position; at end of match we'll
2681 * pop back to there and free any higher slabs */
bf2039a9 2682
331b2dcc
DM
2683 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2684 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2685 reginfo->info_aux->poscache = NULL;
bf2039a9 2686
331b2dcc 2687 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2688
331b2dcc
DM
2689 if ((prog->extflags & RXf_EVAL_SEEN))
2690 S_setup_eval_state(aTHX_ reginfo);
2691 else
2692 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2693 }
d3aa529c 2694
6eb5f6b9 2695 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2696
288b8c02 2697 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2698 /* We have to be careful. If the previous successful match
2699 was from this regex we don't want a subsequent partially
2700 successful match to clobber the old results.
2701 So when we detect this possibility we add a swap buffer
d8da0584
KW
2702 to the re, and switch the buffer each match. If we fail,
2703 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2704 */
2705 swap = prog->offs;
2706 /* do we need a save destructor here for eval dies? */
2707 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2708 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2709 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2710 PTR2UV(prog),
2711 PTR2UV(swap),
2712 PTR2UV(prog->offs)
2713 ));
c74340f9 2714 }
6eb5f6b9
JH
2715
2716 /* Simplest case: anchored match need be tried only once. */
d3d47aac 2717 /* [unless only anchor is MBOL - implying multiline is set] */
8e1490ee 2718 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2719 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2720 goto got_it;
8e1490ee 2721 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2722 {
2723 char *end;
2724
2725 if (minlen)
2726 dontbother = minlen - 1;
1aa99e6b 2727 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2728 /* for multiline we only have to try after newlines */
33b8afdf 2729 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2730 /* because of the goto we can not easily reuse the macros for bifurcating the
2731 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2732 if (utf8_target) {
2733 if (s == startpos)
2734 goto after_try_utf8;
2735 while (1) {
02d5137b 2736 if (regtry(reginfo, &s)) {
92f3d482
YO
2737 goto got_it;
2738 }
2739 after_try_utf8:
2740 if (s > end) {
2741 goto phooey;
2742 }
2743 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2744 s = re_intuit_start(rx, sv, strbeg,
2745 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2746 if (!s) {
2747 goto phooey;
2748 }
2749 }
2750 else {
2751 s += UTF8SKIP(s);
2752 }
2753 }
2754 } /* end search for check string in unicode */
2755 else {
2756 if (s == startpos) {
2757 goto after_try_latin;
2758 }
2759 while (1) {
02d5137b 2760 if (regtry(reginfo, &s)) {
92f3d482
YO
2761 goto got_it;
2762 }
2763 after_try_latin:
2764 if (s > end) {
2765 goto phooey;
2766 }
2767 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2768 s = re_intuit_start(rx, sv, strbeg,
2769 s + 1, strend, flags, NULL);
92f3d482
YO
2770 if (!s) {
2771 goto phooey;
2772 }
2773 }
2774 else {
2775 s++;
2776 }
2777 }
2778 } /* end search for check string in latin*/
2779 } /* end search for check string */
2780 else { /* search for newline */
2781 if (s > startpos) {
2782 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2783 s--;
92f3d482 2784 }
21eede78
YO
2785 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2786 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2787 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2788 if (regtry(reginfo, &s))
6eb5f6b9
JH
2789 goto got_it;
2790 }
92f3d482
YO
2791 }
2792 } /* end search for newline */
2793 } /* end anchored/multiline check string search */
6eb5f6b9 2794 goto phooey;
a8430a8b 2795 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2796 {
a8430a8b
YO
2797 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2798 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2799 /* For anchored \G, the only position it can match from is
2800 * (ganch-gofs); we already set startpos to this above; if intuit
2801 * moved us on from there, we can't possibly succeed */
2802 assert(startpos == reginfo->ganch - prog->gofs);
2803 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2804 goto got_it;
2805 goto phooey;
2806 }
2807
2808 /* Messy cases: unanchored match. */
bbe252da 2809 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2810 /* we have /x+whatever/ */
984e6dd1 2811 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2812 char ch;
bf93d4cc
GS
2813#ifdef DEBUGGING
2814 int did_match = 0;
2815#endif
f2ed9b32 2816 if (utf8_target) {
7e0d5ad7
KW
2817 if (! prog->anchored_utf8) {
2818 to_utf8_substr(prog);
2819 }
2820 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2821 REXEC_FBC_SCAN(
6eb5f6b9 2822 if (*s == ch) {
a3621e74 2823 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2824 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2825 s += UTF8SKIP(s);
2826 while (s < strend && *s == ch)
2827 s += UTF8SKIP(s);
2828 }
4cadc6a9 2829 );
7e0d5ad7 2830
6eb5f6b9
JH
2831 }
2832 else {
7e0d5ad7
KW
2833 if (! prog->anchored_substr) {
2834 if (! to_byte_substr(prog)) {
6b54ddc5 2835 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2836 }
2837 }
2838 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2839 REXEC_FBC_SCAN(
6eb5f6b9 2840 if (*s == ch) {
a3621e74 2841 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2842 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2843 s++;
2844 while (s < strend && *s == ch)
2845 s++;
2846 }
4cadc6a9 2847 );
6eb5f6b9 2848 }
a3621e74 2849 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2850 PerlIO_printf(Perl_debug_log,
b7953727
JH
2851 "Did not find anchored character...\n")
2852 );
6eb5f6b9 2853 }
a0714e2c
SS
2854 else if (prog->anchored_substr != NULL
2855 || prog->anchored_utf8 != NULL
2856 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2857 && prog->float_max_offset < strend - s)) {
2858 SV *must;
ea3daa5d
FC
2859 SSize_t back_max;
2860 SSize_t back_min;
33b8afdf 2861 char *last;
6eb5f6b9 2862 char *last1; /* Last position checked before */
bf93d4cc
GS
2863#ifdef DEBUGGING
2864 int did_match = 0;
2865#endif
33b8afdf 2866 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2867 if (utf8_target) {
2868 if (! prog->anchored_utf8) {
2869 to_utf8_substr(prog);
2870 }
2871 must = prog->anchored_utf8;
2872 }
2873 else {
2874 if (! prog->anchored_substr) {
2875 if (! to_byte_substr(prog)) {
6b54ddc5 2876 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2877 }
2878 }
2879 must = prog->anchored_substr;
2880 }
33b8afdf
JH
2881 back_max = back_min = prog->anchored_offset;
2882 } else {
7e0d5ad7
KW
2883 if (utf8_target) {
2884 if (! prog->float_utf8) {
2885 to_utf8_substr(prog);
2886 }
2887 must = prog->float_utf8;
2888 }
2889 else {
2890 if (! prog->float_substr) {
2891 if (! to_byte_substr(prog)) {
6b54ddc5 2892 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2893 }
2894 }
2895 must = prog->float_substr;
2896 }
33b8afdf
JH
2897 back_max = prog->float_max_offset;
2898 back_min = prog->float_min_offset;
2899 }
1de06328 2900
1de06328
YO
2901 if (back_min<0) {
2902 last = strend;
2903 } else {
2904 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2905 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2906 - (SvTAIL(must) != 0) + back_min), strbeg);
2907 }
9d9163fb 2908 if (s > reginfo->strbeg)
6eb5f6b9
JH
2909 last1 = HOPc(s, -1);
2910 else
2911 last1 = s - 1; /* bogus */
2912
a0288114 2913 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2914 check_substr==must. */
bf05793b 2915 dontbother = 0;
6eb5f6b9
JH
2916 strend = HOPc(strend, -dontbother);
2917 while ( (s <= last) &&
e50d57d4 2918 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2919 (unsigned char*)strend, must,
c33e64f0 2920 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2921 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2922 if (HOPc(s, -back_max) > last1) {
2923 last1 = HOPc(s, -back_min);
2924 s = HOPc(s, -back_max);
2925 }
2926 else {
9d9163fb
DM
2927 char * const t = (last1 >= reginfo->strbeg)
2928 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2929
2930 last1 = HOPc(s, -back_min);
52657f30 2931 s = t;
6eb5f6b9 2932 }
f2ed9b32 2933 if (utf8_target) {
6eb5f6b9 2934 while (s <= last1) {
02d5137b 2935 if (regtry(reginfo, &s))
6eb5f6b9 2936 goto got_it;
7016d6eb
DM
2937 if (s >= last1) {
2938 s++; /* to break out of outer loop */
2939 break;
2940 }
2941 s += UTF8SKIP(s);
6eb5f6b9
JH
2942 }
2943 }
2944 else {
2945 while (s <= last1) {
02d5137b 2946 if (regtry(reginfo, &s))
6eb5f6b9
JH
2947 goto got_it;
2948 s++;
2949 }
2950 }
2951 }
ab3bbdeb 2952 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2953 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2954 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2955 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2956 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2957 ? "anchored" : "floating"),
ab3bbdeb
YO
2958 quoted, RE_SV_TAIL(must));
2959 });
6eb5f6b9
JH
2960 goto phooey;
2961 }
f8fc2ecf 2962 else if ( (c = progi->regstclass) ) {
f14c76ed 2963 if (minlen) {
f8fc2ecf 2964 const OPCODE op = OP(progi->regstclass);
66e933ab 2965 /* don't bother with what can't match */
786e8c11 2966 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2967 strend = HOPc(strend, -(minlen - 1));
2968 }
a3621e74 2969 DEBUG_EXECUTE_r({
be8e71aa 2970 SV * const prop = sv_newmortal();
2395827c 2971 regprop(prog, prop, c, reginfo);
0df25f3d 2972 {
f2ed9b32 2973 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2974 s,strend-s,60);
0df25f3d 2975 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2976 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2977 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2978 quoted, (int)(strend - s));
0df25f3d 2979 }
ffc61ed2 2980 });
f9176b44 2981 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2982 goto got_it;
07be1b83 2983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2984 }
2985 else {
2986 dontbother = 0;
a0714e2c 2987 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2988 /* Trim the end. */
6af40bd7 2989 char *last= NULL;
33b8afdf 2990 SV* float_real;
c33e64f0
FC
2991 STRLEN len;
2992 const char *little;
33b8afdf 2993
7e0d5ad7
KW
2994 if (utf8_target) {
2995 if (! prog->float_utf8) {
2996 to_utf8_substr(prog);
2997 }
2998 float_real = prog->float_utf8;
2999 }
3000 else {
3001 if (! prog->float_substr) {
3002 if (! to_byte_substr(prog)) {
6b54ddc5 3003 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3004 }
3005 }
3006 float_real = prog->float_substr;
3007 }
d6a28714 3008
c33e64f0
FC
3009 little = SvPV_const(float_real, len);
3010 if (SvTAIL(float_real)) {
7f18ad16
KW
3011 /* This means that float_real contains an artificial \n on
3012 * the end due to the presence of something like this:
3013 * /foo$/ where we can match both "foo" and "foo\n" at the
3014 * end of the string. So we have to compare the end of the
3015 * string first against the float_real without the \n and
3016 * then against the full float_real with the string. We
3017 * have to watch out for cases where the string might be
3018 * smaller than the float_real or the float_real without
3019 * the \n. */
1a13b075
YO
3020 char *checkpos= strend - len;
3021 DEBUG_OPTIMISE_r(
3022 PerlIO_printf(Perl_debug_log,
3023 "%sChecking for float_real.%s\n",
3024 PL_colors[4], PL_colors[5]));
3025 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3026 /* can't match, even if we remove the trailing \n
3027 * string is too short to match */
1a13b075
YO
3028 DEBUG_EXECUTE_r(
3029 PerlIO_printf(Perl_debug_log,
3030 "%sString shorter than required trailing substring, cannot match.%s\n",
3031 PL_colors[4], PL_colors[5]));
3032 goto phooey;
3033 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3034 /* can match, the end of the string matches without the
3035 * "\n" */
1a13b075
YO
3036 last = checkpos + 1;
3037 } else if (checkpos < strbeg) {
7f18ad16
KW
3038 /* cant match, string is too short when the "\n" is
3039 * included */
1a13b075
YO
3040 DEBUG_EXECUTE_r(
3041 PerlIO_printf(Perl_debug_log,
3042 "%sString does not contain required trailing substring, cannot match.%s\n",
3043 PL_colors[4], PL_colors[5]));
3044 goto phooey;
3045 } else if (!multiline) {
7f18ad16
KW
3046 /* non multiline match, so compare with the "\n" at the
3047 * end of the string */
1a13b075
YO
3048 if (memEQ(checkpos, little, len)) {
3049 last= checkpos;
3050 } else {
3051 DEBUG_EXECUTE_r(
3052 PerlIO_printf(Perl_debug_log,
3053 "%sString does not contain required trailing substring, cannot match.%s\n",
3054 PL_colors[4], PL_colors[5]));
3055 goto phooey;
3056 }
3057 } else {
7f18ad16
KW
3058 /* multiline match, so we have to search for a place
3059 * where the full string is located */
d6a28714 3060 goto find_last;
1a13b075 3061 }
c33e64f0 3062 } else {
d6a28714 3063 find_last:
9041c2e3 3064 if (len)
d6a28714 3065 last = rninstr(s, strend, little, little + len);
b8c5462f 3066 else
a0288114 3067 last = strend; /* matching "$" */
b8c5462f 3068 }
6af40bd7 3069 if (!last) {
7f18ad16
KW
3070 /* at one point this block contained a comment which was
3071 * probably incorrect, which said that this was a "should not
3072 * happen" case. Even if it was true when it was written I am
3073 * pretty sure it is not anymore, so I have removed the comment
3074 * and replaced it with this one. Yves */
6bda09f9
YO
3075 DEBUG_EXECUTE_r(
3076 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
3077 "String does not contain required substring, cannot match.\n"
3078 ));
3079 goto phooey;
bf93d4cc 3080 }
d6a28714
JH
3081 dontbother = strend - last + prog->float_min_offset;
3082 }
3083 if (minlen && (dontbother < minlen))
3084 dontbother = minlen - 1;
3085 strend -= dontbother; /* this one's always in bytes! */
3086 /* We don't know much -- general case. */
f2ed9b32 3087 if (utf8_target) {
d6a28714 3088 for (;;) {
02d5137b 3089 if (regtry(reginfo, &s))
d6a28714
JH
3090 goto got_it;
3091 if (s >= strend)
3092 break;
b8c5462f 3093 s += UTF8SKIP(s);
d6a28714
JH
3094 };
3095 }
3096 else {
3097 do {
02d5137b 3098 if (regtry(reginfo, &s))
d6a28714
JH
3099 goto got_it;
3100 } while (s++ < strend);
3101 }
3102 }
3103
3104 /* Failure. */
3105 goto phooey;
3106
3107got_it:
d5e7783a
DM
3108 /* s/// doesn't like it if $& is earlier than where we asked it to
3109 * start searching (which can happen on something like /.\G/) */
3110 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3111 && (prog->offs[0].start < stringarg - strbeg))
3112 {
3113 /* this should only be possible under \G */
58430ea8 3114 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3115 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3116 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3117 goto phooey;
3118 }
3119
495f47a5
DM
3120 DEBUG_BUFFERS_r(
3121 if (swap)
3122 PerlIO_printf(Perl_debug_log,
3123 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3124 PTR2UV(prog),
3125 PTR2UV(swap)
3126 );
3127 );
e9105d30 3128 Safefree(swap);
d6a28714 3129
bf2039a9
DM
3130 /* clean up; this will trigger destructors that will free all slabs
3131 * above the current one, and cleanup the regmatch_info_aux
3132 * and regmatch_info_aux_eval sructs */
8adc0f72 3133
006f26b2
DM
3134 LEAVE_SCOPE(oldsave);
3135
5daac39c
NC
3136 if (RXp_PAREN_NAMES(prog))
3137 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3138
0254aed9
DM
3139 RX_MATCH_UTF8_set(rx, utf8_target);
3140
d6a28714 3141 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3142 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3143 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3144 strbeg, reginfo->strend,
3145 sv, flags, utf8_target);
9041c2e3 3146
d6a28714
JH
3147 return 1;
3148
3149phooey:
a3621e74 3150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3151 PL_colors[4], PL_colors[5]));
8adc0f72 3152
bf2039a9
DM
3153 /* clean up; this will trigger destructors that will free all slabs
3154 * above the current one, and cleanup the regmatch_info_aux
3155 * and regmatch_info_aux_eval sructs */
8adc0f72 3156
006f26b2
DM
3157 LEAVE_SCOPE(oldsave);
3158
e9105d30 3159 if (swap) {
c74340f9 3160 /* we failed :-( roll it back */
495f47a5
DM
3161 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3162 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3163 PTR2UV(prog),
3164 PTR2UV(prog->offs),
3165 PTR2UV(swap)
3166 ));
e9105d30
GG
3167 Safefree(prog->offs);
3168 prog->offs = swap;
3169 }
d6a28714
JH
3170 return 0;
3171}
3172
6bda09f9 3173
b3d298be 3174/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3175 * Do inc before dec, in case old and new rex are the same */
baa60164 3176#define SET_reg_curpm(Re2) \
bf2039a9 3177 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3178 (void)ReREFCNT_inc(Re2); \
3179 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3180 PM_SETRE((PL_reg_curpm), (Re2)); \
3181 }
3182
3183
d6a28714
JH
3184/*
3185 - regtry - try match at specific point
3186 */
3187STATIC I32 /* 0 failure, 1 success */
f73aaa43 3188S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3189{
d6a28714 3190 CHECKPOINT lastcp;
288b8c02 3191 REGEXP *const rx = reginfo->prog;
8d919b0a 3192 regexp *const prog = ReANY(rx);
99a90e59 3193 SSize_t result;
f8fc2ecf 3194 RXi_GET_DECL(prog,progi);
a3621e74 3195 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3196
3197 PERL_ARGS_ASSERT_REGTRY;
3198
24b23f37 3199 reginfo->cutpoint=NULL;
d6a28714 3200
9d9163fb 3201 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3202 prog->lastparen = 0;
03994de8 3203 prog->lastcloseparen = 0;
d6a28714
JH
3204
3205 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3206 to do this again and again, prog->lastparen should take care of
3dd2943c 3207 this! --ilya*/
dafc8851
JH
3208
3209 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3210 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3211 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3212 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3213 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3214 * Meanwhile, this code *is* needed for the
daf18116
JH
3215 * above-mentioned test suite tests to succeed. The common theme
3216 * on those tests seems to be returning null fields from matches.
225593e1 3217 * --jhi updated by dapm */
dafc8851 3218#if 1
d6a28714 3219 if (prog->nparens) {
b93070ed 3220 regexp_paren_pair *pp = prog->offs;
eb578fdb 3221 I32 i;
b93070ed 3222 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3223 ++pp;
3224 pp->start = -1;
3225 pp->end = -1;
d6a28714
JH
3226 }
3227 }
dafc8851 3228#endif
02db2b7b 3229 REGCP_SET(lastcp);
f73aaa43
DM
3230 result = regmatch(reginfo, *startposp, progi->program + 1);
3231 if (result != -1) {
3232 prog->offs[0].end = result;
d6a28714
JH
3233 return 1;
3234 }
24b23f37 3235 if (reginfo->cutpoint)
f73aaa43 3236 *startposp= reginfo->cutpoint;
02db2b7b 3237 REGCP_UNWIND(lastcp);
d6a28714
JH
3238 return 0;
3239}
3240
02db2b7b 3241
8ba1375e
MJD
3242#define sayYES goto yes
3243#define sayNO goto no
262b90c4 3244#define sayNO_SILENT goto no_silent
8ba1375e 3245
f9f4320a
YO
3246/* we dont use STMT_START/END here because it leads to
3247 "unreachable code" warnings, which are bogus, but distracting. */
3248#define CACHEsayNO \
c476f425 3249 if (ST.cache_mask) \
2ac8ff4b 3250 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3251 sayNO
3298f257 3252
a3621e74 3253/* this is used to determine how far from the left messages like
265c4333
YO
3254 'failed...' are printed. It should be set such that messages
3255 are inline with the regop output that created them.
a3621e74 3256*/
265c4333 3257#define REPORT_CODE_OFF 32
a3621e74
YO
3258
3259
40a82448
DM
3260#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3261#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3262#define CHRTEST_NOT_A_CP_1 -999
3263#define CHRTEST_NOT_A_CP_2 -998
9e137952 3264
5d9a96ca
DM
3265/* grab a new slab and return the first slot in it */
3266
3267STATIC regmatch_state *
3268S_push_slab(pTHX)
3269{
a35a87e7 3270#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3271 dMY_CXT;
3272#endif
5d9a96ca
DM
3273 regmatch_slab *s = PL_regmatch_slab->next;
3274 if (!s) {
3275 Newx(s, 1, regmatch_slab);
3276 s->prev = PL_regmatch_slab;
3277 s->next = NULL;
3278 PL_regmatch_slab->next = s;
3279 }
3280 PL_regmatch_slab = s;
86545054 3281 return SLAB_FIRST(s);
5d9a96ca 3282}
5b47454d 3283
95b24440 3284
40a82448
DM
3285/* push a new state then goto it */
3286
4d5016e5
DM
3287#define PUSH_STATE_GOTO(state, node, input) \
3288 pushinput = input; \
40a82448
DM
3289 scan = node; \
3290 st->resume_state = state; \
3291 goto push_state;
3292
3293/* push a new state with success backtracking, then goto it */
3294
4d5016e5
DM
3295#define PUSH_YES_STATE_GOTO(state, node, input) \
3296 pushinput = input; \
40a82448
DM
3297 scan = node; \
3298 st->resume_state = state; \
3299 goto push_yes_state;
3300
aa283a38 3301
aa283a38 3302
4d5016e5 3303
d6a28714 3304/*
95b24440 3305
bf1f174e
DM
3306regmatch() - main matching routine
3307
3308This is basically one big switch statement in a loop. We execute an op,
3309set 'next' to point the next op, and continue. If we come to a point which
3310we may need to backtrack to on failure such as (A|B|C), we push a
3311backtrack state onto the backtrack stack. On failure, we pop the top
3312state, and re-enter the loop at the state indicated. If there are no more
3313states to pop, we return failure.
3314
3315Sometimes we also need to backtrack on success; for example /A+/, where
3316after successfully matching one A, we need to go back and try to
3317match another one; similarly for lookahead assertions: if the assertion
3318completes successfully, we backtrack to the state just before the assertion
3319and then carry on. In these cases, the pushed state is marked as
3320'backtrack on success too'. This marking is in fact done by a chain of
3321pointers, each pointing to the previous 'yes' state. On success, we pop to
3322the nearest yes state, discarding any intermediate failure-only states.
3323Sometimes a yes state is pushed just to force some cleanup code to be
3324called at the end of a successful match or submatch; e.g. (??{$re}) uses
3325it to free the inner regex.
3326
3327Note that failure backtracking rewinds the cursor position, while
3328success backtracking leaves it alone.
3329
3330A pattern is complete when the END op is executed, while a subpattern
3331such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3332ops trigger the "pop to last yes state if any, otherwise return true"
3333behaviour.
3334
3335A common convention in this function is to use A and B to refer to the two
3336subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3337the subpattern to be matched possibly multiple times, while B is the entire
3338rest of the pattern. Variable and state names reflect this convention.
3339
3340The states in the main switch are the union of ops and failure/success of
3341substates associated with with that op. For example, IFMATCH is the op
3342that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3343'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3344successfully matched A and IFMATCH_A_fail is a state saying that we have
3345just failed to match A. Resume states always come in pairs. The backtrack
3346state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3347at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3348on success or failure.
3349
3350The struct that holds a backtracking state is actually a big union, with
3351one variant for each major type of op. The variable st points to the
3352top-most backtrack struct. To make the code clearer, within each
3353block of code we #define ST to alias the relevant union.
3354
3355Here's a concrete example of a (vastly oversimplified) IFMATCH
3356implementation:
3357
3358 switch (state) {
3359 ....
3360
3361#define ST st->u.ifmatch
3362
3363 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3364 ST.foo = ...; // some state we wish to save
95b24440 3365 ...
bf1f174e
DM
3366 // push a yes backtrack state with a resume value of
3367 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3368 // first node of A:
4d5016e5 3369 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3370 // NOTREACHED
3371
3372 case IFMATCH_A: // we have successfully executed A; now continue with B
3373 next = B;
3374 bar = ST.foo; // do something with the preserved value
3375 break;
3376
3377 case IFMATCH_A_fail: // A failed, so the assertion failed
3378 ...; // do some housekeeping, then ...
3379 sayNO; // propagate the failure
3380
3381#undef ST
95b24440 3382
bf1f174e
DM
3383 ...
3384 }
95b24440 3385
bf1f174e
DM
3386For any old-timers reading this who are familiar with the old recursive
3387approach, the code above is equivalent to:
95b24440 3388
bf1f174e
DM
3389 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3390 {
3391 int foo = ...
95b24440 3392 ...
bf1f174e
DM
3393 if (regmatch(A)) {
3394 next = B;
3395 bar = foo;
3396 break;
95b24440 3397 }
bf1f174e
DM
3398 ...; // do some housekeeping, then ...
3399 sayNO; // propagate the failure
95b24440 3400 }
bf1f174e
DM
3401
3402The topmost backtrack state, pointed to by st, is usually free. If you
3403want to claim it, populate any ST.foo fields in it with values you wish to
3404save, then do one of
3405
4d5016e5
DM
3406 PUSH_STATE_GOTO(resume_state, node, newinput);
3407 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3408
3409which sets that backtrack state's resume value to 'resume_state', pushes a
3410new free entry to the top of the backtrack stack, then goes to 'node'.
3411On backtracking, the free slot is popped, and the saved state becomes the
3412new free state. An ST.foo field in this new top state can be temporarily
3413accessed to retrieve values, but once the main loop is re-entered, it
3414becomes available for reuse.
3415
3416Note that the depth of the backtrack stack constantly increases during the
3417left-to-right execution of the pattern, rather than going up and down with
3418the pattern nesting. For example the stack is at its maximum at Z at the
3419end of the pattern, rather than at X in the following:
3420
3421 /(((X)+)+)+....(Y)+....Z/
3422
3423The only exceptions to this are lookahead/behind assertions and the cut,
3424(?>A), which pop all the backtrack states associated with A before
3425continuing.
3426
486ec47a 3427Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3428PL_regmatch_state and st always point to the currently active state,
3429and PL_regmatch_slab points to the slab currently containing
3430PL_regmatch_state. The first time regmatch() is called, the first slab is
3431allocated, and is never freed until interpreter destruction. When the slab
3432is full, a new one is allocated and chained to the end. At exit from
3433regmatch(), slabs allocated since entry are freed.
3434
3435*/
95b24440 3436
40a82448 3437
5bc10b2c 3438#define DEBUG_STATE_pp(pp) \
265c4333 3439 DEBUG_STATE_r({ \
baa60164 3440 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3441 PerlIO_printf(Perl_debug_log, \
5d458dd8 3442 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3443 depth*2, "", \
baa60164 3444 PL_reg_name[st->resume_state], \
5d458dd8
YO
3445 ((st==yes_state||st==mark_state) ? "[" : ""), \
3446 ((st==yes_state) ? "Y" : ""), \
3447 ((st==mark_state) ? "M" : ""), \
3448 ((st==yes_state||st==mark_state) ? "]" : "") \
3449 ); \
265c4333 3450 });
5bc10b2c 3451
40a82448 3452
3dab1dad 3453#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3454
3df15adc 3455#ifdef DEBUGGING
5bc10b2c 3456
ab3bbdeb 3457STATIC void
f2ed9b32 3458S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3459 const char *start, const char *end, const char *blurb)
3460{
efd26800 3461 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3462
3463 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3464
ab3bbdeb
YO
3465 if (!PL_colorset)
3466 reginitcolors();
3467 {
3468 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3469 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3470
f2ed9b32 3471 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3472 start, end - start, 60);
3473
3474 PerlIO_printf(Perl_debug_log,
3475 "%s%s REx%s %s against %s\n",
3476 PL_colors[4], blurb, PL_colors[5], s0, s1);
3477
f2ed9b32 3478 if (utf8_target||utf8_pat)
1de06328
YO
3479 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3480 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3481 utf8_pat && utf8_target ? " and " : "",
3482 utf8_target ? "string" : ""
ab3bbdeb
YO
3483 );
3484 }
3485}
3df15adc
YO
3486
3487STATIC void
786e8c11
YO
3488S_dump_exec_pos(pTHX_ const char *locinput,
3489 const regnode *scan,
3490 const char *loc_regeol,
3491 const char *loc_bostr,
3492 const char *loc_reg_starttry,
f2ed9b32 3493 const bool utf8_target)
07be1b83 3494{
786e8c11 3495 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3496 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3497 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3498 /* The part of the string before starttry has one color
3499 (pref0_len chars), between starttry and current
3500 position another one (pref_len - pref0_len chars),
3501 after the current position the third one.
3502 We assume that pref0_len <= pref_len, otherwise we
3503 decrease pref0_len. */
786e8c11
YO
3504 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3505 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3506 int pref0_len;
3507
7918f24d
NC
3508 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3509
f2ed9b32 3510 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3511 pref_len++;
786e8c11
YO
3512 pref0_len = pref_len - (locinput - loc_reg_starttry);
3513 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3514 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3515 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3516 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3517 l--;
3518 if (pref0_len < 0)
3519 pref0_len = 0;
3520 if (pref0_len > pref_len)
3521 pref0_len = pref_len;
3522 {
f2ed9b32 3523 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3524
ab3bbdeb 3525 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3526 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3527
ab3bbdeb 3528 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3529 (locinput - pref_len + pref0_len),
1de06328 3530 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3531
ab3bbdeb 3532 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3533 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3534
1de06328 3535 const STRLEN tlen=len0+len1+len2;
3df15adc 3536 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3537 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3538 (IV)(locinput - loc_bostr),
07be1b83 3539 len0, s0,
07be1b83 3540 len1, s1,
07be1b83 3541 (docolor ? "" : "> <"),
07be1b83 3542 len2, s2,
f9f4320a 3543 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3544 "");
3545 }
3546}
3df15adc 3547
07be1b83
YO
3548#endif
3549
0a4db386
YO
3550/* reg_check_named_buff_matched()
3551 * Checks to see if a named buffer has matched. The data array of
3552 * buffer numbers corresponding to the buffer is expected to reside
3553 * in the regexp->data->data array in the slot stored in the ARG() of
3554 * node involved. Note that this routine doesn't actually care about the
3555 * name, that information is not preserved from compilation to execution.
3556 * Returns the index of the leftmost defined buffer with the given name
3557 * or 0 if non of the buffers matched.
3558 */
3559STATIC I32
dc3bf405 3560S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
7918f24d 3561{
0a4db386 3562 I32 n;
f8fc2ecf 3563 RXi_GET_DECL(rex,rexi);
ad64d0ec 3564 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3565 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3566
3567 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3568
0a4db386 3569 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3570 if ((I32)rex->lastparen >= nums[n] &&
3571 rex->offs[nums[n]].end != -1)
0a4db386
YO
3572 {
3573 return nums[n];
3574 }
3575 }
3576 return 0;
3577}
3578
2f554ef7 3579
c74f6de9 3580static bool
984e6dd1 3581S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3582 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3583{
79a2a0e8
KW
3584 /* This function determines if there are one or two characters that match
3585 * the first character of the passed-in EXACTish node <text_node>, and if
3586 * so, returns them in the passed-in pointers.
c74f6de9 3587 *
79a2a0e8
KW
3588 * If it determines that no possible character in the target string can
3589 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3590 * the first character in <text_node> requires UTF-8 to represent, and the
3591 * target string isn't in UTF-8.)
c74f6de9 3592 *
79a2a0e8
KW
3593 * If there are more than two characters that could match the beginning of
3594 * <text_node>, or if more context is required to determine a match or not,
3595 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3596 *
3597 * The motiviation behind this function is to allow the caller to set up
3598 * tight loops for matching. If <text_node> is of type EXACT, there is
3599 * only one possible character that can match its first character, and so
3600 * the situation is quite simple. But things get much more complicated if
3601 * folding is involved. It may be that the first character of an EXACTFish
3602 * node doesn't participate in any possible fold, e.g., punctuation, so it
3603 * can be matched only by itself. The vast majority of characters that are
3604 * in folds match just two things, their lower and upper-case equivalents.
3605 * But not all are like that; some have multiple possible matches, or match
3606 * sequences of more than one character. This function sorts all that out.
3607 *
3608 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3609 * loop of trying to match A*, we know we can't exit where the thing
3610 * following it isn't a B. And something can't be a B unless it is the
3611 * beginning of B. By putting a quick test for that beginning in a tight
3612 * loop, we can rule out things that can't possibly be B without having to
3613 * break out of the loop, thus avoiding work. Similarly, if A is a single
3614 * character, we can make a tight loop matching A*, using the outputs of
3615 * this function.
3616 *
3617 * If the target string to match isn't in UTF-8, and there aren't
3618 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3619 * the one or two possible octets (which are characters in this situation)
3620 * that can match. In all cases, if there is only one character that can
3621 * match, *<c1p> and *<c2p> will be identical.
3622 *
3623 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3624 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3625 * can match the beginning of <text_node>. They should be declared with at
3626 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3627 * undefined what these contain.) If one or both of the buffers are
3628 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3629 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3630 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3631 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3632 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3633
ba44c216 3634 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3635
9cd990bf
JH
3636 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3637 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
79a2a0e8 3638 bool use_chrtest_void = FALSE;
aed7b151 3639 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3640
3641 /* Used when we have both utf8 input and utf8 output, to avoid converting
3642 * to/from code points */
3643 bool utf8_has_been_setup = FALSE;
3644
c74f6de9
KW
3645 dVAR;
3646
b4291290 3647 U8 *pat = (U8*)STRING(text_node);
a6715020 3648 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
c74f6de9 3649
79a2a0e8
KW
3650 if (OP(text_node) == EXACT) {
3651
3652 /* In an exact node, only one thing can be matched, that first
3653 * character. If both the pat and the target are UTF-8, we can just
3654 * copy the input to the output, avoiding finding the code point of
3655 * that character */
984e6dd1 3656 if (!is_utf8_pat) {
79a2a0e8
KW
3657 c2 = c1 = *pat;
3658 }
3659 else if (utf8_target) {
3660 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3661 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3662 utf8_has_been_setup = TRUE;
3663 }
3664 else {
3665 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3666 }
79a2a0e8 3667 }
31f05a37
KW
3668 else { /* an EXACTFish node */
3669 U8 *pat_end = pat + STR_LEN(text_node);
3670
3671 /* An EXACTFL node has at least some characters unfolded, because what
3672 * they match is not known until now. So, now is the time to fold
3673 * the first few of them, as many as are needed to determine 'c1' and
3674 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3675 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3676 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3677 * need to fold as many characters as a single character can fold to,
3678 * so that later we can check if the first ones are such a multi-char
3679 * fold. But, in such a pattern only locale-problematic characters
3680 * aren't folded, so we can skip this completely if the first character
3681 * in the node isn't one of the tricky ones */
3682 if (OP(text_node) == EXACTFL) {
3683
3684 if (! is_utf8_pat) {
3685 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3686 {
3687 folded[0] = folded[1] = 's';
3688 pat = folded;
3689 pat_end = folded + 2;
3690 }
3691 }
3692 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3693 U8 *s = pat;
3694 U8 *d = folded;
3695 int i;
3696
3697 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3698 if (isASCII(*s)) {
3699 *(d++) = (U8) toFOLD_LC(*s);
3700 s++;
3701 }
3702 else {
3703 STRLEN len;
3704 _to_utf8_fold_flags(s,
3705 d,
3706 &len,
3707 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3708 d += len;
3709 s += UTF8SKIP(s);
3710 }
3711 }
3712
3713 pat = folded;
3714 pat_end = d;
3715 }
3716 }
3717
251b239f
KW
3718 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3719 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
baa60164
KW
3720 {
3721 /* Multi-character folds require more context to sort out. Also
3722 * PL_utf8_foldclosures used below doesn't handle them, so have to
3723 * be handled outside this routine */
3724 use_chrtest_void = TRUE;
3725 }
3726 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3727 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
e8ea8356 3728 if (c1 > 255) {
baa60164
KW
3729 /* Load the folds hash, if not already done */
3730 SV** listp;
3731 if (! PL_utf8_foldclosures) {
31667e6b 3732 _load_PL_utf8_foldclosures();
79a2a0e8 3733 }
79a2a0e8 3734
baa60164
KW
3735 /* The fold closures data structure is a hash with the keys
3736 * being the UTF-8 of every character that is folded to, like
3737 * 'k', and the values each an array of all code points that
3738 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
3739 * Multi-character folds are not included */
3740 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3741 (char *) pat,
3742 UTF8SKIP(pat),
3743 FALSE))))
3744 {
3745 /* Not found in the hash, therefore there are no folds
3746 * containing it, so there is only a single character that
3747 * could match */
3748 c2 = c1;
79a2a0e8 3749 }
baa60164
KW
3750 else { /* Does participate in folds */
3751 AV* list = (AV*) *listp;
b9f2b683 3752 if (av_tindex(list) != 1) {
79a2a0e8 3753
baa60164
KW
3754 /* If there aren't exactly two folds to this, it is
3755 * outside the scope of this function */
3756 use_chrtest_void = TRUE;
79a2a0e8 3757 }
baa60164
KW
3758 else { /* There are two. Get them */
3759 SV** c_p = av_fetch(list, 0, FALSE);
3760 if (c_p == NULL) {
3761 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3762 }
3763 c1 = SvUV(*c_p);
3764
3765 c_p = av_fetch(list, 1, FALSE);
3766 if (c_p == NULL) {
3767 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3768 }
3769 c2 = SvUV(*c_p);
3770
3771 /* Folds that cross the 255/256 boundary are forbidden
3772 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3773 * one is ASCIII. Since the pattern character is above
e8ea8356 3774 * 255, and its only other match is below 256, the only
baa60164
KW
3775 * legal match will be to itself. We have thrown away
3776 * the original, so have to compute which is the one
e8ea8356 3777 * above 255. */
baa60164
KW
3778 if ((c1 < 256) != (c2 < 256)) {
3779 if ((OP(text_node) == EXACTFL
3780 && ! IN_UTF8_CTYPE_LOCALE)
3781 || ((OP(text_node) == EXACTFA
3782 || OP(text_node) == EXACTFA_NO_TRIE)
3783 && (isASCII(c1) || isASCII(c2))))
3784 {
3785 if (c1 < 256) {
3786 c1 = c2;
3787 }
3788 else {
3789 c2 = c1;
3790 }
79a2a0e8
KW
3791 }
3792 }
3793 }
3794 }
3795 }
e8ea8356 3796 else /* Here, c1 is <= 255 */
baa60164
KW
3797 if (utf8_target
3798 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3799 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3800 && ((OP(text_node) != EXACTFA
3801 && OP(text_node) != EXACTFA_NO_TRIE)
3802 || ! isASCII(c1)))
3803 {
3804 /* Here, there could be something above Latin1 in the target
3805 * which folds to this character in the pattern. All such
3806 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3807 * than two characters involved in their folds, so are outside
3808 * the scope of this function */
3809 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3810 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3811 }
3812 else {
3813 use_chrtest_void = TRUE;
3814 }
79a2a0e8 3815 }
baa60164
KW
3816 else { /* Here nothing above Latin1 can fold to the pattern
3817 character */
3818 switch (OP(text_node)) {
c74f6de9 3819
baa60164
KW
3820 case EXACTFL: /* /l rules */
3821 c2 = PL_fold_locale[c1];
3822 break;
c74f6de9 3823
baa60164
KW
3824 case EXACTF: /* This node only generated for non-utf8
3825 patterns */
3826 assert(! is_utf8_pat);
3827 if (! utf8_target) { /* /d rules */
3828 c2 = PL_fold[c1];
3829 break;
3830 }
3831 /* FALLTHROUGH */
3832 /* /u rules for all these. This happens to work for
3833 * EXACTFA as nothing in Latin1 folds to ASCII */
3834 case EXACTFA_NO_TRIE: /* This node only generated for
3835 non-utf8 patterns */
3836 assert(! is_utf8_pat);
924ba076 3837 /* FALLTHROUGH */
baa60164
KW
3838 case EXACTFA:
3839 case EXACTFU_SS:
3840 case EXACTFU:
3841 c2 = PL_fold_latin1[c1];
c74f6de9 3842 break;
c74f6de9 3843
baa60164
KW
3844 default:
3845 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
a25b5927 3846 assert(0); /* NOTREACHED */
baa60164 3847 }
c74f6de9
KW
3848 }
3849 }
3850 }
79a2a0e8
KW
3851
3852 /* Here have figured things out. Set up the returns */
3853 if (use_chrtest_void) {
3854 *c2p = *c1p = CHRTEST_VOID;
3855 }
3856 else if (utf8_target) {
3857 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3858 uvchr_to_utf8(c1_utf8, c1);
3859 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3860 }
c74f6de9 3861
79a2a0e8
KW
3862 /* Invariants are stored in both the utf8 and byte outputs; Use
3863 * negative numbers otherwise for the byte ones. Make sure that the
3864 * byte ones are the same iff the utf8 ones are the same */
3865 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3866 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3867 ? *c2_utf8
3868 : (c1 == c2)
3869 ? CHRTEST_NOT_A_CP_1
3870 : CHRTEST_NOT_A_CP_2;
3871 }
3872 else if (c1 > 255) {
3873 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3874 can represent */
3875 return FALSE;
3876 }
c74f6de9 3877
79a2a0e8
KW
3878 *c1p = *c2p = c2; /* c2 is the only representable value */
3879 }
3880 else { /* c1 is representable; see about c2 */
3881 *c1p = c1;
3882 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3883 }
2f554ef7 3884
c74f6de9
KW
3885 return TRUE;
3886}
2f554ef7 3887
f73aaa43 3888/* returns -1 on failure, $+[0] on success */
99a90e59 3889STATIC SSize_t
f73aaa43 3890S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3891{
a35a87e7 3892#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3893 dMY_CXT;
3894#endif
27da23d5 3895 dVAR;
ba44c216 3896 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 3897 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3898 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3899 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3900 RXi_GET_DECL(rex,rexi);
5d9a96ca 3901 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3902 regmatch_state *st;
5d9a96ca 3903 /* cache heavy used fields of st in registers */
eb578fdb
KW
3904 regnode *scan;
3905 regnode *next;
3906 U32 n = 0; /* general value; init to avoid compiler warning */
ea3daa5d 3907 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3908 char *locinput = startpos;
4d5016e5 3909 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3910 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3911
b69b0499 3912 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3913 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3914 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3915 const U32 max_nochange_depth =
3916 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3917 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3918 regmatch_state *yes_state = NULL; /* state to pop to on success of
3919 subpattern */
e2e6a0f1
YO
3920 /* mark_state piggy backs on the yes_state logic so that when we unwind
3921 the stack on success we can update the mark_state as we go */
3922 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3923 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3924 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3925 U32 state_num;
5d458dd8
YO
3926 bool no_final = 0; /* prevent failure from backtracking? */
3927 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3928 char *startpoint = locinput;
5d458dd8
YO
3929 SV *popmark = NULL; /* are we looking for a mark? */
3930 SV *sv_commit = NULL; /* last mark name seen in failure */
3931 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3932 during a successful match */
5d458dd8
YO
3933 U32 lastopen = 0; /* last open we saw */
3934 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4a2b275c 3935 SV* const oreplsv = GvSVn(PL_replgv);
24d3c4a9
DM
3936 /* these three flags are set by various ops to signal information to
3937 * the very next op. They have a useful lifetime of exactly one loop
3938 * iteration, and are not preserved or restored by state pushes/pops
3939 */
3940 bool sw = 0; /* the condition value in (?(cond)a|b) */
3941 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3942 int logical = 0; /* the following EVAL is:
3943 0: (?{...})
3944 1: (?(?{...})X|Y)
3945 2: (??{...})
3946 or the following IFMATCH/UNLESSM is:
3947 false: plain (?=foo)
3948 true: used as a condition: (?(?=foo))
3949 */
81ed78b2
DM
3950 PAD* last_pad = NULL;
3951 dMULTICALL;
3952 I32 gimme = G_SCALAR;
3953 CV *caller_cv = NULL; /* who called us */
3954 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3955 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3956 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3957 int to_complement; /* Invert the result? */
3958 _char_class_number classnum;
984e6dd1 3959 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3960
95b24440 3961#ifdef DEBUGGING
e68ec53f 3962 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3963#endif
3964
7e68f152
FC
3965 /* protect against undef(*^R) */
3966 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3967
81ed78b2
DM
3968 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3969 multicall_oldcatch = 0;
3970 multicall_cv = NULL;
3971 cx = NULL;
4f8dbb2d
JL
3972 PERL_UNUSED_VAR(multicall_cop);
3973 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3974
3975
7918f24d
NC
3976 PERL_ARGS_ASSERT_REGMATCH;
3977
3b57cd43 3978 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3979 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3980 }));
5d9a96ca 3981
331b2dcc 3982 st = PL_regmatch_state;
5d9a96ca 3983
d6a28714 3984 /* Note that nextchr is a byte even in UTF */
7016d6eb 3985 SET_nextchr;
d6a28714
JH
3986 scan = prog;
3987 while (scan != NULL) {
8ba1375e 3988
a3621e74 3989 DEBUG_EXECUTE_r( {
6136c704 3990 SV * const prop = sv_newmortal();
1de06328 3991 regnode *rnext=regnext(scan);
f2ed9b32 3992 DUMP_EXEC_POS( locinput, scan, utf8_target );
2395827c 3993 regprop(rex, prop, scan, reginfo);
07be1b83
YO
3994
3995 PerlIO_printf(Perl_debug_log,
3996 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3997 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3998 SvPVX_const(prop),
1de06328 3999 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 4000 0 : (IV)(rnext - rexi->program));
2a782b5b 4001 });
d6a28714
JH
4002
4003 next = scan + NEXT_OFF(scan);
4004 if (next == scan)
4005 next = NULL;
40a82448 4006 state_num = OP(scan);
d6a28714 4007
40a82448 4008 reenter_switch:
3018b823 4009 to_complement = 0;
34a81e2b 4010
7016d6eb 4011 SET_nextchr;
e6ca698c 4012 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 4013
40a82448 4014 switch (state_num) {
d3d47aac 4015 case SBOL: /* /^../ and /\A../ */
9d9163fb 4016 if (locinput == reginfo->strbeg)
b8c5462f 4017 break;
d6a28714 4018 sayNO;
3c0563b9
DM
4019
4020 case MBOL: /* /^../m */
9d9163fb 4021 if (locinput == reginfo->strbeg ||
7016d6eb 4022 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 4023 {
b8c5462f
JH
4024 break;
4025 }
d6a28714 4026 sayNO;
3c0563b9 4027
3c0563b9 4028 case GPOS: /* \G */
3b0527fe 4029 if (locinput == reginfo->ganch)
d6a28714
JH
4030 break;
4031 sayNO;
ee9b8eae 4032
3c0563b9 4033 case KEEPS: /* \K */
ee9b8eae 4034 /* update the startpoint */
b93070ed 4035 st->u.keeper.val = rex->offs[0].start;
9d9163fb 4036 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 4037 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
a74ff37d
JH
4038 /* NOTREACHED */
4039 assert(0);
4040
ee9b8eae
YO
4041 case KEEPS_next_fail:
4042 /* rollback the start point change */
b93070ed 4043 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 4044 sayNO_SILENT;
a74ff37d
JH
4045 /* NOTREACHED */
4046 assert(0);
3c0563b9 4047
3c0563b9 4048 case MEOL: /* /..$/m */
7016d6eb 4049 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4050 sayNO;
b8c5462f 4051 break;
3c0563b9 4052
d3d47aac 4053 case SEOL: /* /..$/ */
7016d6eb 4054 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4055 sayNO;
220db18a 4056 if (reginfo->strend - locinput > 1)
b8c5462f 4057 sayNO;
b8c5462f 4058 break;
3c0563b9
DM
4059
4060 case EOS: /* \z */
7016d6eb 4061 if (!NEXTCHR_IS_EOS)
b8c5462f 4062 sayNO;
d6a28714 4063 break;
3c0563b9
DM
4064
4065 case SANY: /* /./s */
7016d6eb 4066 if (NEXTCHR_IS_EOS)
4633a7c4 4067 sayNO;
28b98f76 4068 goto increment_locinput;
3c0563b9
DM
4069
4070 case CANY: /* \C */
7016d6eb 4071 if (NEXTCHR_IS_EOS)
f33976b4 4072 sayNO;
3640db6b 4073 locinput++;
a0d0e21e 4074 break;
3c0563b9
DM
4075
4076 case REG_ANY: /* /./ */
7016d6eb 4077 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 4078 sayNO;
28b98f76
DM
4079 goto increment_locinput;
4080
166ba7cd
DM
4081
4082#undef ST
4083#define ST st->u.trie
3c0563b9 4084 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
4085 /* In this case the charclass data is available inline so
4086 we can fail fast without a lot of extra overhead.
4087 */
7016d6eb 4088 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
4089 DEBUG_EXECUTE_r(
4090 PerlIO_printf(Perl_debug_log,
4091 "%*s %sfailed to match trie start class...%s\n",
4092 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4093 );
4094 sayNO_SILENT;
a74ff37d
JH
4095 /* NOTREACHED */
4096 assert(0);
786e8c11 4097 }
924ba076 4098 /* FALLTHROUGH */
3c0563b9 4099 case TRIE: /* (ab|cd) */
2e64971a
DM
4100 /* the basic plan of execution of the trie is:
4101 * At the beginning, run though all the states, and
4102 * find the longest-matching word. Also remember the position
4103 * of the shortest matching word. For example, this pattern:
4104 * 1 2 3 4 5
4105 * ab|a|x|abcd|abc
4106 * when matched against the string "abcde", will generate
4107 * accept states for all words except 3, with the longest
895cc420 4108 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
4109 * the position being after char 1 of the string).
4110 *
4111 * Then for each matching word, in word order (i.e. 1,2,4,5),
4112 * we run the remainder of the pattern; on each try setting
4113 * the current position to the character following the word,
4114 * returning to try the next word on failure.
4115 *
4116 * We avoid having to build a list of words at runtime by
4117 * using a compile-time structure, wordinfo[].prev, which
4118 * gives, for each word, the previous accepting word (if any).
4119 * In the case above it would contain the mappings 1->2, 2->0,
4120 * 3->0, 4->5, 5->1. We can use this table to generate, from
4121 * the longest word (4 above), a list of all words, by
4122 * following the list of prev pointers; this gives us the
4123 * unordered list 4,5,1,2. Then given the current word we have
4124 * just tried, we can go through the list and find the
4125 * next-biggest word to try (so if we just failed on word 2,
4126 * the next in the list is 4).
4127 *
4128 * Since at runtime we don't record the matching position in
4129 * the string for each word, we have to work that out for
4130 * each word we're about to process. The wordinfo table holds
4131 * the character length of each word; given that we recorded
4132 * at the start: the position of the shortest word and its
4133 * length in chars, we just need to move the pointer the
4134 * difference between the two char lengths. Depending on
4135 * Unicode status and folding, that's cheap or expensive.
4136 *
4137 * This algorithm is optimised for the case where are only a
4138 * small number of accept states, i.e. 0,1, or maybe 2.
4139 * With lots of accepts states, and having to try all of them,
4140 * it becomes quadratic on number of accept states to find all
4141 * the next words.
4142 */
4143
3dab1dad 4144 {
07be1b83 4145 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 4146 DECL_TRIE_TYPE(scan);
3dab1dad
YO
4147
4148 /* what trie are we using right now */
be8e71aa 4149 reg_trie_data * const trie
f8fc2ecf 4150 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 4151 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 4152 U32 state = trie->startstate;
166ba7cd 4153
7016d6eb
DM
4154 if ( trie->bitmap
4155 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4156 {
3dab1dad
YO
4157 if (trie->states[ state ].wordnum) {
4158 DEBUG_EXECUTE_r(
4159 PerlIO_printf(Perl_debug_log,
4160 "%*s %smatched empty string...%s\n",
5bc10b2c 4161 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 4162 );
20dbff7c
YO
4163 if (!trie->jump)
4164 break;
3dab1dad
YO
4165 } else {
4166 DEBUG_EXECUTE_r(
4167 PerlIO_printf(Perl_debug_log,
786e8c11 4168 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 4169 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
4170 );
4171 sayNO_SILENT;
4172 }
4173 }
166ba7cd 4174
786e8c11
YO
4175 {
4176 U8 *uc = ( U8* )locinput;
4177
4178 STRLEN len = 0;
4179 STRLEN foldlen = 0;
4180 U8 *uscan = (U8*)NULL;
786e8c11 4181 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
4182 U32 charcount = 0; /* how many input chars we have matched */
4183 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 4184
786e8c11 4185 ST.jump = trie->jump;
786e8c11 4186 ST.me = scan;
2e64971a
DM
4187 ST.firstpos = NULL;
4188 ST.longfold = FALSE; /* char longer if folded => it's harder */
4189 ST.nextword = 0;
4190
4191 /* fully traverse the TRIE; note the position of the
4192 shortest accept state and the wordnum of the longest
4193 accept state */
07be1b83 4194
220db18a 4195 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 4196 U32 base = trie->states[ state ].trans.base;
f9f4320a 4197 UV uvc = 0;
acb909b4 4198 U16 charid = 0;
2e64971a
DM
4199 U16 wordnum;
4200 wordnum = trie->states[ state ].wordnum;
4201
4202 if (wordnum) { /* it's an accept state */
4203 if (!accepted) {
4204 accepted = 1;
4205 /* record first match position */
4206 if (ST.longfold) {
4207 ST.firstpos = (U8*)locinput;
4208 ST.firstchars = 0;
5b47454d 4209 }
2e64971a
DM
4210 else {
4211 ST.firstpos = uc;
4212 ST.firstchars = charcount;
4213 }
4214 }
4215 if (!ST.nextword || wordnum < ST.nextword)
4216 ST.nextword = wordnum;
4217 ST.topword = wordnum;
786e8c11 4218 }
a3621e74 4219
07be1b83 4220 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 4221 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 4222 PerlIO_printf( Perl_debug_log,
2e64971a 4223 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 4224 2+depth * 2, "", PL_colors[4],
2e64971a 4225 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 4226 });
a3621e74 4227
2e64971a 4228 /* read a char and goto next state */
220db18a 4229 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 4230 I32 offset;
55eed653
NC
4231 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4232 uscan, len, uvc, charid, foldlen,
4233 foldbuf, uniflags);
2e64971a
DM
4234 charcount++;
4235 if (foldlen>0)
4236 ST.longfold = TRUE;
5b47454d 4237 if (charid &&
6dd2be57
DM
4238 ( ((offset =
4239 base + charid - 1 - trie->uniquecharcount)) >= 0)
4240
4241 && ((U32)offset < trie->lasttrans)
4242 && trie->trans[offset].check == state)
5b47454d 4243 {
6dd2be57 4244 state = trie->trans[offset].next;
5b47454d
DM
4245 }
4246 else {
4247 state = 0;
4248 }
4249 uc += len;
4250
4251 }
4252 else {
a3621e74
YO
4253 state = 0;
4254 }
4255 DEBUG_TRIE_EXECUTE_r(
e4584336 4256 PerlIO_printf( Perl_debug_log,
786e8c11 4257 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 4258 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
4259 );
4260 }
2e64971a 4261 if (!accepted)
a3621e74 4262 sayNO;
a3621e74 4263
2e64971a
DM
4264 /* calculate total number of accept states */
4265 {
4266 U16 w = ST.topword;
4267 accepted = 0;
4268 while (w) {
4269 w = trie->wordinfo[w].prev;
4270 accepted++;
4271 }
4272 ST.accepted = accepted;
4273 }
4274
166ba7cd
DM
4275 DEBUG_EXECUTE_r(
4276 PerlIO_printf( Perl_debug_log,
4277 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 4278 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
4279 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4280 );
2e64971a 4281 goto trie_first_try; /* jump into the fail handler */
786e8c11 4282 }}
a74ff37d
JH
4283 /* NOTREACHED */
4284 assert(0);
2e64971a
DM
4285
4286 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
4287 {
4288 U8 *uc;
fae667d5
YO
4289 if ( ST.jump) {
4290 REGCP_UNWIND(ST.cp);
a8d1f4b4 4291 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 4292 }
2e64971a
DM
4293 if (!--ST.accepted) {
4294 DEBUG_EXECUTE_r({
4295 PerlIO_printf( Perl_debug_log,
4296 "%*s %sTRIE failed...%s\n",
4297 REPORT_CODE_OFF+depth*2, "",
4298 PL_colors[4],
4299 PL_colors[5] );
4300 });
4301 sayNO_SILENT;
4302 }
4303 {
4304 /* Find next-highest word to process. Note that this code
4305 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
4306 U16 min = 0;
4307 U16 word;
4308 U16 const nextword = ST.nextword;
4309 reg_trie_wordinfo * const wordinfo
2e64971a
DM
4310 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4311 for (word=ST.topword; word; word=wordinfo[word].prev) {
4312 if (word > nextword && (!min || word < min))
4313 min = word;
4314 }
4315 ST.nextword = min;
4316 }
4317
fae667d5 4318 trie_first_try:
5d458dd8
YO
4319 if (do_cutgroup) {
4320 do_cutgroup = 0;
4321 no_final = 0;
4322 }
fae667d5
YO
4323
4324 if ( ST.jump) {
b93070ed 4325 ST.lastparen = rex->lastparen;
f6033a9d 4326 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 4327 REGCP_SET(ST.cp);
2e64971a 4328 }
a3621e74 4329
2e64971a 4330 /* find start char of end of current word */
166ba7cd 4331 {
2e64971a 4332 U32 chars; /* how many chars to skip */
2e64971a
DM
4333 reg_trie_data * const trie
4334 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4335
4336 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4337 >= ST.firstchars);
4338 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4339 - ST.firstchars;
a059a757 4340 uc = ST.firstpos;
2e64971a
DM
4341
4342 if (ST.longfold) {
4343 /* the hard option - fold each char in turn and find
4344 * its folded length (which may be different */
4345 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4346 STRLEN foldlen;
4347 STRLEN len;
d9a396a3 4348 UV uvc;
2e64971a
DM
4349 U8 *uscan;
4350
4351 while (chars) {
f2ed9b32 4352 if (utf8_target) {
c80e42f3 4353 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
2e64971a
DM
4354 uniflags);
4355 uc += len;
4356 }
4357 else {
4358 uvc = *uc;
4359 uc++;
4360 }
4361 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4362 uscan = foldbuf;
4363 while (foldlen) {
4364 if (!--chars)
4365 break;
c80e42f3 4366 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
2e64971a
DM
4367 uniflags);
4368 uscan += len;
4369 foldlen -= len;
4370 }
4371 }
a3621e74 4372 }
2e64971a 4373 else {
f2ed9b32 4374 if (utf8_target)
2e64971a
DM
4375 while (chars--)
4376 uc += UTF8SKIP(uc);
4377 else
4378 uc += chars;
4379 }
2e64971a 4380 }
166ba7cd 4381
6603fe3e
DM
4382 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4383 ? ST.jump[ST.nextword]
4384 : NEXT_OFF(ST.me));
166ba7cd 4385
2e64971a
DM
4386 DEBUG_EXECUTE_r({
4387 PerlIO_printf( Perl_debug_log,
4388 "%*s %sTRIE matched word #%d, continuing%s\n",
4389 REPORT_CODE_OFF+depth*2, "",
4390 PL_colors[4],
4391 ST.nextword,
4392 PL_colors[5]
4393 );
4394 });
4395
4396 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4397 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
a74ff37d
JH
4398 /* NOTREACHED */
4399 assert(0);
166ba7cd 4400 }
2e64971a
DM
4401 /* only one choice left - just continue */
4402 DEBUG_EXECUTE_r({
4403 AV *const trie_words
4404 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4405 SV ** const tmp = av_fetch( trie_words,
4406 ST.nextword-1, 0 );
4407 SV *sv= tmp ? sv_newmortal() : NULL;
4408
4409 PerlIO_printf( Perl_debug_log,
4410 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4411 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4412 ST.nextword,
4413 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4414 PL_colors[0], PL_colors[1],
c89df6cf 4415 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4416 )
4417 : "not compiled under -Dr",
4418 PL_colors[5] );
4419 });
4420
a059a757 4421 locinput = (char*)uc;
2e64971a 4422 continue; /* execute rest of RE */
a74ff37d
JH
4423 /* NOTREACHED */
4424 assert(0);
a059a757 4425 }
166ba7cd
DM
4426#undef ST
4427
3c0563b9 4428 case EXACT: { /* /abc/ */
95b24440 4429 char *s = STRING(scan);
24d3c4a9 4430 ln = STR_LEN(scan);
984e6dd1 4431 if (utf8_target != is_utf8_pat) {
bc517b45 4432 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4433 char *l = locinput;
24d3c4a9 4434 const char * const e = s + ln;
a72c7584 4435
f2ed9b32 4436 if (utf8_target) {
e6a3850e
KW
4437 /* The target is utf8, the pattern is not utf8.
4438 * Above-Latin1 code points can't match the pattern;
4439 * invariants match exactly, and the other Latin1 ones need
4440 * to be downgraded to a single byte in order to do the
4441 * comparison. (If we could be confident that the target
4442 * is not malformed, this could be refactored to have fewer
4443 * tests by just assuming that if the first bytes match, it
4444 * is an invariant, but there are tests in the test suite
4445 * dealing with (??{...}) which violate this) */
1aa99e6b 4446 while (s < e) {
220db18a
DM
4447 if (l >= reginfo->strend
4448 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4449 {
e6a3850e
KW
4450 sayNO;
4451 }
4452 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4453 if (*l != *s) {
4454 sayNO;
4455 }
4456 l++;
4457 }
4458 else {
94bb8c36
KW
4459 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4460 {
e6a3850e
KW
4461 sayNO;
4462 }
4463 l += 2;
4464 }
4465 s++;
1aa99e6b 4466 }
5ff6fc6d
JH
4467 }
4468 else {
4469 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4470 while (s < e) {
220db18a
DM
4471 if (l >= reginfo->strend
4472 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
4473 {
4474 sayNO;
4475 }
4476 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4477 if (*s != *l) {
4478 sayNO;
4479 }
4480 s++;
4481 }
4482 else {
94bb8c36
KW
4483 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4484 {
e6a3850e
KW
4485 sayNO;
4486 }
4487 s += 2;
4488 }
4489 l++;
1aa99e6b 4490 }
5ff6fc6d 4491 }
1aa99e6b 4492 locinput = l;
1aa99e6b 4493 }
5ac65bff
KW
4494 else {
4495 /* The target and the pattern have the same utf8ness. */
4496 /* Inline the first character, for speed. */
220db18a 4497 if (reginfo->strend - locinput < ln
5ac65bff
KW
4498 || UCHARAT(s) != nextchr
4499 || (ln > 1 && memNE(s, locinput, ln)))
4500 {
4501 sayNO;
4502 }
4503 locinput += ln;
4504 }
d6a28714 4505 break;
95b24440 4506 }
7016d6eb 4507
3c0563b9 4508 case EXACTFL: { /* /abc/il */
a932d541 4509 re_fold_t folder;
9a5a5549
KW
4510 const U8 * fold_array;
4511 const char * s;
d513472c 4512 U32 fold_utf8_flags;
9a5a5549 4513
f67f9e53
KW
4514 folder = foldEQ_locale;
4515 fold_array = PL_fold_locale;
cea315b6 4516 fold_utf8_flags = FOLDEQ_LOCALE;
9a5a5549
KW
4517 goto do_exactf;
4518
3c0563b9 4519 case EXACTFU_SS: /* /\x{df}/iu */
3c0563b9 4520 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4521 folder = foldEQ_latin1;
4522 fold_array = PL_fold_latin1;
984e6dd1 4523 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4524 goto do_exactf;
4525
098b07d5
KW
4526 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4527 patterns */
4528 assert(! is_utf8_pat);
924ba076 4529 /* FALLTHROUGH */
3c0563b9 4530 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4531 folder = foldEQ_latin1;
4532 fold_array = PL_fold_latin1;
57014d77 4533 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4534 goto do_exactf;
4535
2fdb7295
KW
4536 case EXACTF: /* /abc/i This node only generated for
4537 non-utf8 patterns */
4538 assert(! is_utf8_pat);
9a5a5549
KW
4539 folder = foldEQ;
4540 fold_array = PL_fold;
62bf7766 4541 fold_utf8_flags = 0;
9a5a5549
KW
4542
4543 do_exactf:
4544 s = STRING(scan);
24d3c4a9 4545 ln = STR_LEN(scan);
d6a28714 4546
31f05a37
KW
4547 if (utf8_target
4548 || is_utf8_pat
4549 || state_num == EXACTFU_SS
4550 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4551 {
3c760661
KW
4552 /* Either target or the pattern are utf8, or has the issue where
4553 * the fold lengths may differ. */
be8e71aa 4554 const char * const l = locinput;
220db18a 4555 char *e = reginfo->strend;
bc517b45 4556
984e6dd1 4557 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4558 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4559 {
4560 sayNO;
5486206c 4561 }
d07ddd77 4562 locinput = e;
d07ddd77 4563 break;
a0ed51b3 4564 }
d6a28714 4565
0a138b74 4566 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4567 if (UCHARAT(s) != nextchr
4568 && !NEXTCHR_IS_EOS
4569 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4570 {
a0ed51b3 4571 sayNO;
9a5a5549 4572 }
220db18a 4573 if (reginfo->strend - locinput < ln)
b8c5462f 4574 sayNO;
9a5a5549 4575 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4576 sayNO;
24d3c4a9 4577 locinput += ln;
a0d0e21e 4578 break;
9a5a5549 4579 }
63ac0dad
KW
4580
4581 /* XXX Could improve efficiency by separating these all out using a
4582 * macro or in-line function. At that point regcomp.c would no longer
4583 * have to set the FLAGS fields of these */
3c0563b9
DM
4584 case BOUNDL: /* /\b/l */
4585 case NBOUNDL: /* /\B/l */
3c0563b9
DM
4586 case BOUND: /* /\b/ */
4587 case BOUNDU: /* /\b/u */
4588 case BOUNDA: /* /\b/a */
4589 case NBOUND: /* /\B/ */
4590 case NBOUNDU: /* /\B/u */
4591 case NBOUNDA: /* /\B/a */
b2680017 4592 /* was last char in word? */
f2e96b5d
KW
4593 if (utf8_target
4594 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4595 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4596 {
9d9163fb 4597 if (locinput == reginfo->strbeg)
b2680017
YO
4598 ln = '\n';
4599 else {
9d9163fb
DM
4600 const U8 * const r =
4601 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
b2680017 4602
3db24e1e
KW
4603 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4604 0, uniflags);
b2680017 4605 }
63ac0dad 4606 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4607 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4608 if (NEXTCHR_IS_EOS)
4609 n = 0;
4610 else {
4611 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4612 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4613 utf8_target);
4614 }
b2680017
YO
4615 }
4616 else {
f4cd282c 4617 ln = isWORDCHAR_LC_uvchr(ln);
0eb30aeb 4618 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4619 }
4620 }
4621 else {
cfaf538b
KW
4622
4623 /* Here the string isn't utf8, or is utf8 and only ascii
4624 * characters are to match \w. In the latter case looking at
4625 * the byte just prior to the current one may be just the final
4626 * byte of a multi-byte character. This is ok. There are two
4627 * cases:
4628 * 1) it is a single byte character, and then the test is doing
4629 * just what it's supposed to.
4630 * 2) it is a multi-byte character, in which case the final
4631 * byte is never mistakable for ASCII, and so the test
4632 * will say it is not a word character, which is the
4633 * correct answer. */
9d9163fb 4634 ln = (locinput != reginfo->strbeg) ?
b2680017 4635 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4636 switch (FLAGS(scan)) {
4637 case REGEX_UNICODE_CHARSET:
4638 ln = isWORDCHAR_L1(ln);
7016d6eb 4639 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4640 break;
4641 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4642 ln = isWORDCHAR_LC(ln);
4643 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4644 break;
4645 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4646 ln = isWORDCHAR(ln);
4647 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4648 break;
cfaf538b 4649 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4650 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4651 ln = isWORDCHAR_A(ln);
7016d6eb 4652 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4653 break;
63ac0dad
KW
4654 default:
4655 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
b2680017
YO
4656 }
4657 }
63ac0dad
KW
4658 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4659 * regcomp.sym */
4660 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4661 sayNO;
4662 break;
3c0563b9 4663
3c0563b9 4664 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4665 if (NEXTCHR_IS_EOS)
4666 sayNO;
e0193e47 4667 if (utf8_target) {
3db24e1e
KW
4668 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4669 utf8_target))
09b08e9b 4670 sayNO;
635cd5d4 4671 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4672 }
4673 else {
20ed0b26 4674 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4675 sayNO;
3640db6b 4676 locinput++;
e0f9d4a8 4677 }
b8c5462f 4678 break;
3c0563b9 4679
3018b823
KW
4680 /* The argument (FLAGS) to all the POSIX node types is the class number
4681 * */
ee9a90b8 4682
3018b823
KW
4683 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4684 to_complement = 1;
4685 /* FALLTHROUGH */
4686
4687 case POSIXL: /* \w or [:punct:] etc. under /l */
4688 if (NEXTCHR_IS_EOS)
bedac28b 4689 sayNO;
bedac28b 4690
3018b823
KW
4691 /* Use isFOO_lc() for characters within Latin1. (Note that
4692 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4693 * wouldn't be invariant) */
4694 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4695 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4696 sayNO;
4697 }
4698 }
3018b823
KW
4699 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4700 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
94bb8c36 4701 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823
KW
4702 *(locinput + 1))))))
4703 {
bedac28b 4704 sayNO;
3018b823 4705 }
bedac28b 4706 }
3018b823
KW
4707 else { /* Here, must be an above Latin-1 code point */
4708 goto utf8_posix_not_eos;
bedac28b 4709 }
3018b823
KW
4710
4711 /* Here, must be utf8 */
4712 locinput += UTF8SKIP(locinput);
bedac28b
KW
4713 break;
4714
3018b823
KW
4715 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4716 to_complement = 1;
4717 /* FALLTHROUGH */
4718
4719 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4720 if (utf8_target) {
3018b823 4721 goto utf8_posix;
bedac28b 4722 }
3018b823
KW
4723 goto posixa;
4724
4725 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4726
3018b823 4727 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4728 sayNO;
4729 }
bedac28b 4730
3018b823
KW
4731 /* All UTF-8 variants match */
4732 if (! UTF8_IS_INVARIANT(nextchr)) {
4733 goto increment_locinput;
bedac28b 4734 }
ee9a90b8 4735
3018b823
KW
4736 to_complement = 1;
4737 /* FALLTHROUGH */
4738
4739 case POSIXA: /* \w or [:punct:] etc. under /a */
4740
4741 posixa:
4742 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4743 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4744 * character is a single byte */
20d0b1e9 4745
3018b823
KW
4746 if (NEXTCHR_IS_EOS
4747 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4748 FLAGS(scan)))))
4749 {
0658cdde
KW
4750 sayNO;
4751 }
3018b823
KW
4752
4753 /* Here we are either not in utf8, or we matched a utf8-invariant,
4754 * so the next char is the next byte */
3640db6b 4755 locinput++;
0658cdde 4756 break;
3c0563b9 4757
3018b823
KW
4758 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4759 to_complement = 1;
4760 /* FALLTHROUGH */
4761
4762 case POSIXU: /* \w or [:punct:] etc. under /u */
4763 utf8_posix:
4764 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4765 sayNO;
4766 }
3018b823
KW
4767 utf8_posix_not_eos:
4768
4769 /* Use _generic_isCC() for characters within Latin1. (Note that
4770 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4771 * wouldn't be invariant) */
4772 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4773 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4774 FLAGS(scan)))))
4775 {
4776 sayNO;
4777 }
4778 locinput++;
4779 }
4780 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4781 if (! (to_complement
94bb8c36 4782 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823 4783 *(locinput + 1)),
94bb8c36 4784 FLAGS(scan)))))
3018b823
KW
4785 {
4786 sayNO;
4787 }
4788 locinput += 2;
4789 }
4790 else { /* Handle above Latin-1 code points */
4791 classnum = (_char_class_number) FLAGS(scan);
4792 if (classnum < _FIRST_NON_SWASH_CC) {
4793
4794 /* Here, uses a swash to find such code points. Load if if
4795 * not done already */
4796 if (! PL_utf8_swash_ptrs[classnum]) {
4797 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4798 PL_utf8_swash_ptrs[classnum]
4799 = _core_swash_init("utf8",
2a16ac92
KW
4800 "",
4801 &PL_sv_undef, 1, 0,
4802 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
4803 }
4804 if (! (to_complement
4805 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4806 (U8 *) locinput, TRUE))))
4807 {
4808 sayNO;
4809 }
4810 }
4811 else { /* Here, uses macros to find above Latin-1 code points */
4812 switch (classnum) {
4813 case _CC_ENUM_SPACE: /* XXX would require separate
4814 code if we revert the change
4815 of \v matching this */
4816 case _CC_ENUM_PSXSPC:
4817 if (! (to_complement
4818 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4819 {
4820 sayNO;
4821 }
4822 break;
4823 case _CC_ENUM_BLANK:
4824 if (! (to_complement
4825 ^ cBOOL(is_HORIZWS_high(locinput))))
4826 {
4827 sayNO;
4828 }
4829 break;
4830 case _CC_ENUM_XDIGIT:
4831 if (! (to_complement
4832 ^ cBOOL(is_XDIGIT_high(locinput))))
4833 {
4834 sayNO;
4835 }
4836 break;
4837 case _CC_ENUM_VERTSPACE:
4838 if (! (to_complement
4839 ^ cBOOL(is_VERTWS_high(locinput))))
4840 {
4841 sayNO;
4842 }
4843 break;
4844 default: /* The rest, e.g. [:cntrl:], can't match
4845 above Latin1 */
4846 if (! to_complement) {
4847 sayNO;
4848 }
4849 break;
4850 }
4851 }
4852 locinput += UTF8SKIP(locinput);
4853 }
4854 break;
0658cdde 4855
37e2e78e
KW
4856 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4857 a Unicode extended Grapheme Cluster */
4858 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4859 extended Grapheme Cluster is:
4860
7aee35ff
KW
4861 CR LF
4862 | Prepend* Begin Extend*
4863 | .
37e2e78e 4864
7aee35ff
KW
4865 Begin is: ( Special_Begin | ! Control )
4866 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4867 Extend is: ( Grapheme_Extend | Spacing_Mark )
4868 Control is: [ GCB_Control | CR | LF ]
4869 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4870
27d4fc33
KW
4871 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4872 we can rewrite
4873
4874 Begin is ( Regular_Begin + Special Begin )
4875
4876 It turns out that 98.4% of all Unicode code points match
4877 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4878 the previous implementation for almost all Unicode code points.
27d4fc33 4879
37e2e78e
KW
4880 There is a subtlety with Prepend* which showed up in testing.
4881 Note that the Begin, and only the Begin is required in:
4882 | Prepend* Begin Extend*
cc3b396d
KW
4883 Also, Begin contains '! Control'. A Prepend must be a
4884 '! Control', which means it must also be a Begin. What it
4885 comes down to is that if we match Prepend* and then find no
4886 suitable Begin afterwards, that if we backtrack the last
4887 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4888 */
4889
7016d6eb 4890 if (NEXTCHR_IS_EOS)
a0ed51b3 4891 sayNO;
f2ed9b32 4892 if (! utf8_target) {
37e2e78e
KW
4893
4894 /* Match either CR LF or '.', as all the other possibilities
4895 * require utf8 */
4896 locinput++; /* Match the . or CR */
cc3b396d
KW
4897 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4898 match the LF */
220db18a 4899 && locinput < reginfo->strend
e699a1d5
KW
4900 && UCHARAT(locinput) == '\n')
4901 {
4902 locinput++;
4903 }
37e2e78e
KW
4904 }
4905 else {
4906
4907 /* Utf8: See if is ( CR LF ); already know that locinput <
220db18a
DM
4908 * reginfo->strend, so locinput+1 is in bounds */
4909 if ( nextchr == '\r' && locinput+1 < reginfo->strend
f67f9e53 4910 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4911 {
37e2e78e
KW
4912 locinput += 2;
4913 }
4914 else {
45fdf108
KW
4915 STRLEN len;
4916
37e2e78e
KW
4917 /* In case have to backtrack to beginning, then match '.' */
4918 char *starting = locinput;
4919
4920 /* In case have to backtrack the last prepend */
e699a1d5 4921 char *previous_prepend = NULL;
37e2e78e
KW
4922
4923 LOAD_UTF8_CHARCLASS_GCB();
4924
45fdf108 4925 /* Match (prepend)* */
220db18a 4926 while (locinput < reginfo->strend
45fdf108
KW
4927 && (len = is_GCB_Prepend_utf8(locinput)))
4928 {
4929 previous_prepend = locinput;
4930 locinput += len;
a1853d78 4931 }
37e2e78e
KW
4932
4933 /* As noted above, if we matched a prepend character, but
4934 * the next thing won't match, back off the last prepend we
4935 * matched, as it is guaranteed to match the begin */
4936 if (previous_prepend
220db18a 4937 && (locinput >= reginfo->strend
c101f46d
KW
4938 || (! swash_fetch(PL_utf8_X_regular_begin,
4939 (U8*)locinput, utf8_target)
bff53399 4940 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4941 )
37e2e78e
KW
4942 {
4943 locinput = previous_prepend;
4944 }
4945
220db18a 4946 /* Note that here we know reginfo->strend > locinput, as we
37e2e78e
KW
4947 * tested that upon input to this switch case, and if we
4948 * moved locinput forward, we tested the result just above
4949 * and it either passed, or we backed off so that it will
4950 * now pass */
11dfcd49
KW
4951 if (swash_fetch(PL_utf8_X_regular_begin,
4952 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4953 locinput += UTF8SKIP(locinput);
4954 }
bff53399 4955 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4956
4957 /* Here did not match the required 'Begin' in the
4958 * second term. So just match the very first
4959 * character, the '.' of the final term of the regex */
4960 locinput = starting + UTF8SKIP(starting);
27d4fc33 4961 goto exit_utf8;
37e2e78e
KW
4962 } else {
4963
11dfcd49
KW
4964 /* Here is a special begin. It can be composed of
4965 * several individual characters. One possibility is
4966 * RI+ */
45fdf108
KW
4967 if ((len = is_GCB_RI_utf8(locinput))) {
4968 locinput += len;
220db18a 4969 while (locinput < reginfo->strend
45fdf108 4970 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4971 {
45fdf108 4972 locinput += len;
11dfcd49 4973 }
45fdf108
KW
4974 } else if ((len = is_GCB_T_utf8(locinput))) {
4975 /* Another possibility is T+ */
4976 locinput += len;
220db18a 4977 while (locinput < reginfo->strend
45fdf108 4978 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4979 {
45fdf108 4980 locinput += len;
11dfcd49
KW
4981 }
4982 } else {
4983
4984 /* Here, neither RI+ nor T+; must be some other
4985 * Hangul. That means it is one of the others: L,
4986 * LV, LVT or V, and matches:
4987 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4988
4989 /* Match L* */
220db18a 4990 while (locinput < reginfo->strend
45fdf108 4991 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4992 {
45fdf108 4993 locinput += len;
11dfcd49 4994 }
37e2e78e 4995
11dfcd49
KW
4996 /* Here, have exhausted L*. If the next character
4997 * is not an LV, LVT nor V, it means we had to have
4998 * at least one L, so matches L+ in the original
4999 * equation, we have a complete hangul syllable.
5000 * Are done. */
5001
220db18a 5002 if (locinput < reginfo->strend
45fdf108 5003 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 5004 {
11dfcd49 5005 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
5006 * See if LVT, by first ruling out V, then LV */
5007 if (! is_GCB_V_utf8(locinput)
5008 /* All but every TCount one is LV */
5009 && (valid_utf8_to_uvchr((U8 *) locinput,
5010 NULL)
5011 - SBASE)
5012 % TCount != 0)
5013 {
11dfcd49
KW
5014 locinput += UTF8SKIP(locinput);
5015 } else {
5016
5017 /* Must be V or LV. Take it, then match
5018 * V* */
5019 locinput += UTF8SKIP(locinput);
220db18a 5020 while (locinput < reginfo->strend
45fdf108 5021 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 5022 {
45fdf108 5023 locinput += len;
11dfcd49
KW
5024 }
5025 }
37e2e78e 5026
11dfcd49 5027 /* And any of LV, LVT, or V can be followed
45fdf108 5028 * by T* */
220db18a 5029 while (locinput < reginfo->strend
45fdf108 5030 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 5031 {
45fdf108 5032 locinput += len;
11dfcd49
KW
5033 }
5034 }
cd94d768 5035 }
11dfcd49 5036 }
37e2e78e 5037
11dfcd49 5038 /* Match any extender */
220db18a 5039 while (locinput < reginfo->strend
11dfcd49
KW
5040 && swash_fetch(PL_utf8_X_extend,
5041 (U8*)locinput, utf8_target))
5042 {
5043 locinput += UTF8SKIP(locinput);
5044 }
37e2e78e 5045 }
27d4fc33 5046 exit_utf8:
220db18a 5047 if (locinput > reginfo->strend) sayNO;
37e2e78e 5048 }
a0ed51b3 5049 break;
81714fb9 5050
3c0563b9 5051 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
5052 { /* The capture buffer cases. The ones beginning with N for the
5053 named buffers just convert to the equivalent numbered and
5054 pretend they were called as the corresponding numbered buffer
5055 op. */
26ecd678
TC
5056 /* don't initialize these in the declaration, it makes C++
5057 unhappy */
9d9163fb 5058 const char *s;
ff1157ca 5059 char type;
8368298a
TC
5060 re_fold_t folder;
5061 const U8 *fold_array;
26ecd678 5062 UV utf8_fold_flags;
8368298a 5063
d7ef4b73
KW
5064 folder = foldEQ_locale;
5065 fold_array = PL_fold_locale;
5066 type = REFFL;
cea315b6 5067 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
5068 goto do_nref;
5069
3c0563b9 5070 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
5071 folder = foldEQ_latin1;
5072 fold_array = PL_fold_latin1;
5073 type = REFFA;
5074 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5075 goto do_nref;
5076
3c0563b9 5077 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
5078 folder = foldEQ_latin1;
5079 fold_array = PL_fold_latin1;
5080 type = REFFU;
d513472c 5081 utf8_fold_flags = 0;
d7ef4b73
KW
5082 goto do_nref;
5083
3c0563b9 5084 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
5085 folder = foldEQ;
5086 fold_array = PL_fold;
5087 type = REFF;
d513472c 5088 utf8_fold_flags = 0;
d7ef4b73
KW
5089 goto do_nref;
5090
3c0563b9 5091 case NREF: /* /\g{name}/ */
d7ef4b73 5092 type = REF;
83d7b90b
KW
5093 folder = NULL;
5094 fold_array = NULL;
d513472c 5095 utf8_fold_flags = 0;
d7ef4b73
KW
5096 do_nref:
5097
5098 /* For the named back references, find the corresponding buffer
5099 * number */
0a4db386
YO
5100 n = reg_check_named_buff_matched(rex,scan);
5101
d7ef4b73 5102 if ( ! n ) {
81714fb9 5103 sayNO;
d7ef4b73
KW
5104 }
5105 goto do_nref_ref_common;
5106
3c0563b9 5107 case REFFL: /* /\1/il */
d7ef4b73
KW
5108 folder = foldEQ_locale;
5109 fold_array = PL_fold_locale;
cea315b6 5110 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
5111 goto do_ref;
5112
3c0563b9 5113 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
5114 folder = foldEQ_latin1;
5115 fold_array = PL_fold_latin1;
5116 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5117 goto do_ref;
5118
3c0563b9 5119 case REFFU: /* /\1/iu */
d7ef4b73
KW
5120 folder = foldEQ_latin1;
5121 fold_array = PL_fold_latin1;
d513472c 5122 utf8_fold_flags = 0;
d7ef4b73
KW
5123 goto do_ref;
5124
3c0563b9 5125 case REFF: /* /\1/i */
d7ef4b73
KW
5126 folder = foldEQ;
5127 fold_array = PL_fold;
d513472c 5128 utf8_fold_flags = 0;
83d7b90b 5129 goto do_ref;
d7ef4b73 5130
3c0563b9 5131 case REF: /* /\1/ */
83d7b90b
KW
5132 folder = NULL;
5133 fold_array = NULL;
d513472c 5134 utf8_fold_flags = 0;
83d7b90b 5135
d7ef4b73 5136 do_ref:
81714fb9 5137 type = OP(scan);
d7ef4b73
KW
5138 n = ARG(scan); /* which paren pair */
5139
5140 do_nref_ref_common:
b93070ed 5141 ln = rex->offs[n].start;
1cb48e53 5142 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
b93070ed 5143 if (rex->lastparen < n || ln == -1)
af3f8c16 5144 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 5145 if (ln == rex->offs[n].end)
a0d0e21e 5146 break;
a0ed51b3 5147
9d9163fb 5148 s = reginfo->strbeg + ln;
d7ef4b73 5149 if (type != REF /* REF can do byte comparison */
31f05a37
KW
5150 && (utf8_target || type == REFFU || type == REFFL))
5151 {
220db18a 5152 char * limit = reginfo->strend;
d7ef4b73
KW
5153
5154 /* This call case insensitively compares the entire buffer
5155 * at s, with the current input starting at locinput, but
220db18a
DM
5156 * not going off the end given by reginfo->strend, and
5157 * returns in <limit> upon success, how much of the
5158 * current input was matched */
b93070ed 5159 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 5160 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
5161 {
5162 sayNO;
a0ed51b3 5163 }
d7ef4b73 5164 locinput = limit;
a0ed51b3
LW
5165 break;
5166 }
5167
d7ef4b73 5168 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
5169 if (!NEXTCHR_IS_EOS &&
5170 UCHARAT(s) != nextchr &&
81714fb9 5171 (type == REF ||
d7ef4b73 5172 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 5173 sayNO;
b93070ed 5174 ln = rex->offs[n].end - ln;
220db18a 5175 if (locinput + ln > reginfo->strend)
4633a7c4 5176 sayNO;
81714fb9 5177 if (ln > 1 && (type == REF
24d3c4a9 5178 ? memNE(s, locinput, ln)
d7ef4b73 5179 : ! folder(s, locinput, ln)))
4633a7c4 5180 sayNO;
24d3c4a9 5181 locinput += ln;
a0d0e21e 5182 break;
81714fb9 5183 }
3c0563b9
DM
5184
5185 case NOTHING: /* null op; e.g. the 'nothing' following
5186 * the '*' in m{(a+|b)*}' */
5187 break;
5188 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 5189 break;
3c0563b9
DM
5190
5191 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 5192 break;
40a82448
DM
5193
5194#undef ST
5195#define ST st->u.eval
c277df42 5196 {
c277df42 5197 SV *ret;
d2f13c59 5198 REGEXP *re_sv;
6bda09f9 5199 regexp *re;
f8fc2ecf 5200 regexp_internal *rei;
1a147d38
YO
5201 regnode *startpoint;
5202
3c0563b9 5203 case GOSTART: /* (?R) */
e7707071
YO
5204 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
5205 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 5206 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 5207 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 5208 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
5209 Perl_croak(aTHX_
5210 "Pattern subroutine nesting without pos change"
5211 " exceeded limit in regex");
6bda09f9
YO
5212 } else {
5213 nochange_depth = 0;
1a147d38 5214 }
288b8c02 5215 re_sv = rex_sv;
6bda09f9 5216 re = rex;
f8fc2ecf 5217 rei = rexi;
1a147d38 5218 if (OP(scan)==GOSUB) {
6bda09f9
YO
5219 startpoint = scan + ARG2L(scan);
5220 ST.close_paren = ARG(scan);
5221 } else {
f8fc2ecf 5222 startpoint = rei->program+1;
6bda09f9
YO
5223 ST.close_paren = 0;
5224 }
d1b2014a
YO
5225
5226 /* Save all the positions seen so far. */
5227 ST.cp = regcppush(rex, 0, maxopenparen);
5228 REGCP_SET(ST.lastcp);
5229
5230 /* and then jump to the code we share with EVAL */
6bda09f9 5231 goto eval_recurse_doit;
d1b2014a 5232
a74ff37d
JH
5233 /* NOTREACHED */
5234 assert(0);
3c0563b9 5235
6bda09f9
YO
5236 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
5237 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 5238 if ( ++nochange_depth > max_nochange_depth )
1a147d38 5239 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
5240 } else {
5241 nochange_depth = 0;
5242 }
8e5e9ebe 5243 {
4aabdb9b 5244 /* execute the code in the {...} */
81ed78b2 5245
4aabdb9b 5246 dSP;
a6dc34f1 5247 IV before;
1f4d1a1e 5248 OP * const oop = PL_op;
4aabdb9b 5249 COP * const ocurcop = PL_curcop;
81ed78b2 5250 OP *nop;
81ed78b2 5251 CV *newcv;
91332126 5252
74088413 5253 /* save *all* paren positions */
92da3157 5254 regcppush(rex, 0, maxopenparen);
74088413
DM
5255 REGCP_SET(runops_cp);
5256
81ed78b2
DM
5257 if (!caller_cv)
5258 caller_cv = find_runcv(NULL);
5259
4aabdb9b 5260 n = ARG(scan);
81ed78b2 5261
b30fcab9 5262 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 5263 newcv = (ReANY(
b30fcab9
DM
5264 (REGEXP*)(rexi->data->data[n])
5265 ))->qr_anoncv
81ed78b2
DM
5266 ;
5267 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
5268 }
5269 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
5270 newcv = caller_cv;
5271 nop = (OP*)rexi->data->data[n];
5272 assert(CvDEPTH(newcv));
68e2671b
DM
5273 }
5274 else {
d24ca0c5
DM
5275 /* literal with own CV */
5276 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
5277 newcv = rex->qr_anoncv;
5278 nop = (OP*)rexi->data->data[n];
68e2671b 5279 }
81ed78b2 5280
0e458318
DM
5281 /* normally if we're about to execute code from the same
5282 * CV that we used previously, we just use the existing
5283 * CX stack entry. However, its possible that in the
5284 * meantime we may have backtracked, popped from the save
5285 * stack, and undone the SAVECOMPPAD(s) associated with
5286 * PUSH_MULTICALL; in which case PL_comppad no longer
5287 * points to newcv's pad. */
5288 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5289 {
b0065247
DM
5290 U8 flags = (CXp_SUB_RE |
5291 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 5292 if (last_pushed_cv) {
b0065247 5293 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5294 }
5295 else {
b0065247 5296 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5297 }
5298 last_pushed_cv = newcv;
5299 }
c31ee3bb
DM
5300 else {
5301 /* these assignments are just to silence compiler
5302 * warnings */
5303 multicall_cop = NULL;
5304 newsp = NULL;
5305 }
0e458318
DM
5306 last_pad = PL_comppad;
5307
2e2e3f36
DM
5308 /* the initial nextstate you would normally execute
5309 * at the start of an eval (which would cause error
5310 * messages to come from the eval), may be optimised
5311 * away from the execution path in the regex code blocks;
5312 * so manually set PL_curcop to it initially */
5313 {
81ed78b2 5314 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
5315 assert(o->op_type == OP_NULL);
5316 if (o->op_targ == OP_SCOPE) {
5317 o = cUNOPo->op_first;
5318 }
5319 else {
5320 assert(o->op_targ == OP_LEAVE);
5321 o = cUNOPo->op_first;
5322 assert(o->op_type == OP_ENTER);
1ed44841 5323 o = OP_SIBLING(o);
2e2e3f36
DM
5324 }
5325
5326 if (o->op_type != OP_STUB) {
5327 assert( o->op_type == OP_NEXTSTATE
5328 || o->op_type == OP_DBSTATE
5329 || (o->op_type == OP_NULL
5330 && ( o->op_targ == OP_NEXTSTATE
5331 || o->op_targ == OP_DBSTATE
5332 )
5333 )
5334 );
5335 PL_curcop = (COP*)o;
5336 }
5337 }
81ed78b2 5338 nop = nop->op_next;
2e2e3f36 5339
24b23f37 5340 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
5341 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5342
8adc0f72 5343 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9 5344 if (reginfo->info_aux_eval->pos_magic)
25fdce4a
FC
5345 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5346 reginfo->sv, reginfo->strbeg,
5347 locinput - reginfo->strbeg);
4aabdb9b 5348
2bf803e2
YO
5349 if (sv_yes_mark) {
5350 SV *sv_mrk = get_sv("REGMARK", 1);
5351 sv_setsv(sv_mrk, sv_yes_mark);
5352 }
5353
81ed78b2
DM
5354 /* we don't use MULTICALL here as we want to call the
5355 * first op of the block of interest, rather than the
5356 * first op of the sub */
a6dc34f1 5357 before = (IV)(SP-PL_stack_base);
81ed78b2 5358 PL_op = nop;
8e5e9ebe
RGS
5359 CALLRUNOPS(aTHX); /* Scalar context. */
5360 SPAGAIN;
a6dc34f1 5361 if ((IV)(SP-PL_stack_base) == before)
075aa684 5362 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
5363 else {
5364 ret = POPs;
5365 PUTBACK;
5366 }
4aabdb9b 5367
e4bfbed3
DM
5368 /* before restoring everything, evaluate the returned
5369 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
5370 * PL_op or pad. Also need to process any magic vars
5371 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
5372
5373 PL_op = NULL;
5374
5e98dac2 5375 re_sv = NULL;
e4bfbed3
DM
5376 if (logical == 0) /* (?{})/ */
5377 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5378 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5379 sw = cBOOL(SvTRUE(ret));
5380 logical = 0;
5381 }
5382 else { /* /(??{}) */
497d0a96
DM
5383 /* if its overloaded, let the regex compiler handle
5384 * it; otherwise extract regex, or stringify */
237da807 5385 if (SvGMAGICAL(ret))
2685dc2d 5386 ret = sv_mortalcopy(ret);
497d0a96
DM
5387 if (!SvAMAGIC(ret)) {
5388 SV *sv = ret;
5389 if (SvROK(sv))
5390 sv = SvRV(sv);
5391 if (SvTYPE(sv) == SVt_REGEXP)
5392 re_sv = (REGEXP*) sv;
63620942
FC
5393 else if (SvSMAGICAL(ret)) {
5394 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
497d0a96
DM
5395 if (mg)
5396 re_sv = (REGEXP *) mg->mg_obj;
5397 }
e4bfbed3 5398
2685dc2d 5399 /* force any undef warnings here */
237da807
FC
5400 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5401 ret = sv_mortalcopy(ret);
497d0a96
DM
5402 (void) SvPV_force_nolen(ret);
5403 }
e4bfbed3
DM
5404 }
5405
5406 }
5407
81ed78b2
DM
5408 /* *** Note that at this point we don't restore
5409 * PL_comppad, (or pop the CxSUB) on the assumption it may
5410 * be used again soon. This is safe as long as nothing
5411 * in the regexp code uses the pad ! */
4aabdb9b 5412 PL_op = oop;
4aabdb9b 5413 PL_curcop = ocurcop;
92da3157 5414 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
f5df269c 5415 PL_curpm = PL_reg_curpm;
e4bfbed3
DM
5416
5417 if (logical != 2)
4aabdb9b 5418 break;
8e5e9ebe 5419 }
e4bfbed3
DM
5420
5421 /* only /(??{})/ from now on */
24d3c4a9 5422 logical = 0;
4aabdb9b 5423 {
4f639d21
DM
5424 /* extract RE object from returned value; compiling if
5425 * necessary */
5c35adbb 5426
575c37f6
DM
5427 if (re_sv) {
5428 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5429 }
0f5d15d6 5430 else {
c737faaf 5431 U32 pm_flags = 0;
0f5d15d6 5432
9753d940
DM
5433 if (SvUTF8(ret) && IN_BYTES) {
5434 /* In use 'bytes': make a copy of the octet
5435 * sequence, but without the flag on */
b9ad30b4
NC
5436 STRLEN len;
5437 const char *const p = SvPV(ret, len);
5438 ret = newSVpvn_flags(p, len, SVs_TEMP);
5439 }
732caac7
DM
5440 if (rex->intflags & PREGf_USE_RE_EVAL)
5441 pm_flags |= PMf_USE_RE_EVAL;
5442
5443 /* if we got here, it should be an engine which
5444 * supports compiling code blocks and stuff */
5445 assert(rex->engine && rex->engine->op_comp);
ec841a27 5446 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5447 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5448 rex->engine, NULL, NULL,
5449 /* copy /msix etc to inner pattern */
5450 scan->flags,
5451 pm_flags);
732caac7 5452
9041c2e3 5453 if (!(SvFLAGS(ret)
237da807
FC
5454 & (SVs_TEMP | SVs_GMG | SVf_ROK))
5455 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
a2794585
NC
5456 /* This isn't a first class regexp. Instead, it's
5457 caching a regexp onto an existing, Perl visible
5458 scalar. */
575c37f6 5459 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5460 }
0f5d15d6 5461 }
e1ff3a88 5462 SAVEFREESV(re_sv);
8d919b0a 5463 re = ReANY(re_sv);
4aabdb9b 5464 }
07bc277f 5465 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5466 re->subbeg = rex->subbeg;
5467 re->sublen = rex->sublen;
6502e081
DM
5468 re->suboffset = rex->suboffset;
5469 re->subcoffset = rex->subcoffset;
d1b2014a
YO
5470 re->lastparen = 0;
5471 re->lastcloseparen = 0;
f8fc2ecf 5472 rei = RXi_GET(re);
6bda09f9 5473 DEBUG_EXECUTE_r(
220db18a
DM
5474 debug_start_match(re_sv, utf8_target, locinput,
5475 reginfo->strend, "Matching embedded");
6bda09f9 5476 );
f8fc2ecf 5477 startpoint = rei->program + 1;
1a147d38 5478 ST.close_paren = 0; /* only used for GOSUB */
d1b2014a
YO
5479 /* Save all the seen positions so far. */
5480 ST.cp = regcppush(rex, 0, maxopenparen);
5481 REGCP_SET(ST.lastcp);
5482 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5483 maxopenparen = 0;
5484 /* run the pattern returned from (??{...}) */
aa283a38 5485
d1b2014a
YO
5486 eval_recurse_doit: /* Share code with GOSUB below this line
5487 * At this point we expect the stack context to be
5488 * set up correctly */
4aabdb9b 5489
1cb95af7
DM
5490 /* invalidate the S-L poscache. We're now executing a
5491 * different set of WHILEM ops (and their associated
5492 * indexes) against the same string, so the bits in the
5493 * cache are meaningless. Setting maxiter to zero forces
5494 * the cache to be invalidated and zeroed before reuse.
5495 * XXX This is too dramatic a measure. Ideally we should
5496 * save the old cache and restore when running the outer
5497 * pattern again */
1cb48e53 5498 reginfo->poscache_maxiter = 0;
4aabdb9b 5499
d1b2014a 5500 /* the new regexp might have a different is_utf8_pat than we do */
aed7b151 5501 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5502
288b8c02 5503 ST.prev_rex = rex_sv;
faec1544 5504 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5505 rex_sv = re_sv;
5506 SET_reg_curpm(rex_sv);
288b8c02 5507 rex = re;
f8fc2ecf 5508 rexi = rei;
faec1544 5509 cur_curlyx = NULL;
40a82448 5510 ST.B = next;
faec1544
DM
5511 ST.prev_eval = cur_eval;
5512 cur_eval = st;
faec1544 5513 /* now continue from first node in postoned RE */
4d5016e5 5514 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
a74ff37d
JH
5515 /* NOTREACHED */
5516 assert(0);
c277df42 5517 }
40a82448 5518
faec1544
DM
5519 case EVAL_AB: /* cleanup after a successful (??{A})B */
5520 /* note: this is called twice; first after popping B, then A */
ec43f78b 5521 rex_sv = ST.prev_rex;
aed7b151 5522 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5523 SET_reg_curpm(rex_sv);
8d919b0a 5524 rex = ReANY(rex_sv);
f8fc2ecf 5525 rexi = RXi_GET(rex);
4b22688e
YO
5526 {
5527 /* preserve $^R across LEAVE's. See Bug 121070. */
5528 SV *save_sv= GvSV(PL_replgv);
5529 SvREFCNT_inc(save_sv);
5530 regcpblow(ST.cp); /* LEAVE in disguise */
5531 sv_setsv(GvSV(PL_replgv), save_sv);
5532 SvREFCNT_dec(save_sv);
5533 }
faec1544
DM
5534 cur_eval = ST.prev_eval;
5535 cur_curlyx = ST.prev_curlyx;
34a81e2b 5536
1cb95af7 5537 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5538 reginfo->poscache_maxiter = 0;
e7707071 5539 if ( nochange_depth )
4b196cd4 5540 nochange_depth--;
262b90c4 5541 sayYES;
40a82448 5542
40a82448 5543
faec1544
DM
5544 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5545 /* note: this is called twice; first after popping B, then A */
ec43f78b 5546 rex_sv = ST.prev_rex;
aed7b151 5547 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5548 SET_reg_curpm(rex_sv);
8d919b0a 5549 rex = ReANY(rex_sv);
f8fc2ecf 5550 rexi = RXi_GET(rex);
0357f1fd 5551
40a82448 5552 REGCP_UNWIND(ST.lastcp);
92da3157 5553 regcppop(rex, &maxopenparen);
faec1544
DM
5554 cur_eval = ST.prev_eval;
5555 cur_curlyx = ST.prev_curlyx;
1cb95af7 5556 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5557 reginfo->poscache_maxiter = 0;
e7707071 5558 if ( nochange_depth )
4b196cd4 5559 nochange_depth--;
40a82448 5560 sayNO_SILENT;
40a82448
DM
5561#undef ST
5562
3c0563b9 5563 case OPEN: /* ( */
c277df42 5564 n = ARG(scan); /* which paren pair */
9d9163fb 5565 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
5566 if (n > maxopenparen)
5567 maxopenparen = n;
495f47a5 5568 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5569 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5570 PTR2UV(rex),
5571 PTR2UV(rex->offs),
5572 (UV)n,
5573 (IV)rex->offs[n].start_tmp,
92da3157 5574 (UV)maxopenparen
495f47a5 5575 ));
e2e6a0f1 5576 lastopen = n;
a0d0e21e 5577 break;
495f47a5
DM
5578
5579/* XXX really need to log other places start/end are set too */
5580#define CLOSE_CAPTURE \
5581 rex->offs[n].start = rex->offs[n].start_tmp; \
9d9163fb 5582 rex->offs[n].end = locinput - reginfo->strbeg; \
495f47a5
DM
5583 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5584 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5585 PTR2UV(rex), \
5586 PTR2UV(rex->offs), \
5587 (UV)n, \
5588 (IV)rex->offs[n].start, \
5589 (IV)rex->offs[n].end \
5590 ))
5591
3c0563b9 5592 case CLOSE: /* ) */
c277df42 5593 n = ARG(scan); /* which paren pair */
495f47a5 5594 CLOSE_CAPTURE;
b93070ed
DM
5595 if (n > rex->lastparen)
5596 rex->lastparen = n;
5597 rex->lastcloseparen = n;
3b6647e0 5598 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5599 goto fake_end;
5600 }
a0d0e21e 5601 break;
3c0563b9
DM
5602
5603 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5604 if (ARG(scan)){
5605 regnode *cursor;
5606 for (cursor=scan;
5607 cursor && OP(cursor)!=END;
5608 cursor=regnext(cursor))
5609 {
5610 if ( OP(cursor)==CLOSE ){
5611 n = ARG(cursor);
5612 if ( n <= lastopen ) {
495f47a5 5613 CLOSE_CAPTURE;
b93070ed
DM
5614 if (n > rex->lastparen)
5615 rex->lastparen = n;
5616 rex->lastcloseparen = n;
3b6647e0
RB
5617 if ( n == ARG(scan) || (cur_eval &&
5618 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5619 break;
5620 }
5621 }
5622 }
5623 }
5624 goto fake_end;
a74ff37d 5625 /* NOTREACHED */
3c0563b9
DM
5626
5627 case GROUPP: /* (?(1)) */
c277df42 5628 n = ARG(scan); /* which paren pair */
b93070ed 5629 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5630 break;
3c0563b9
DM
5631
5632 case NGROUPP: /* (?(<name>)) */
0a4db386 5633 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5634 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5635 break;
3c0563b9
DM
5636
5637 case INSUBP: /* (?(R)) */
0a4db386 5638 n = ARG(scan);
3b6647e0 5639 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5640 break;
3c0563b9
DM
5641
5642 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5643 sw = 0;
5644 break;
3c0563b9
DM
5645
5646 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 5647 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 5648 if (sw)
c277df42
IZ
5649 next = NEXTOPER(NEXTOPER(scan));
5650 else {
5651 next = scan + ARG(scan);
5652 if (OP(next) == IFTHEN) /* Fake one. */
5653 next = NEXTOPER(NEXTOPER(next));
5654 }
5655 break;
3c0563b9
DM
5656
5657 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5658 logical = scan->flags;
c277df42 5659 break;
c476f425 5660
2ab05381 5661/*******************************************************************
2ab05381 5662
c476f425
DM
5663The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5664pattern, where A and B are subpatterns. (For simple A, CURLYM or
5665STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5666
c476f425 5667A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5668
c476f425
DM
5669On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5670state, which contains the current count, initialised to -1. It also sets
5671cur_curlyx to point to this state, with any previous value saved in the
5672state block.
2ab05381 5673
c476f425
DM
5674CURLYX then jumps straight to the WHILEM op, rather than executing A,
5675since the pattern may possibly match zero times (i.e. it's a while {} loop
5676rather than a do {} while loop).
2ab05381 5677
c476f425
DM
5678Each entry to WHILEM represents a successful match of A. The count in the
5679CURLYX block is incremented, another WHILEM state is pushed, and execution
5680passes to A or B depending on greediness and the current count.
2ab05381 5681
c476f425
DM
5682For example, if matching against the string a1a2a3b (where the aN are
5683substrings that match /A/), then the match progresses as follows: (the
5684pushed states are interspersed with the bits of strings matched so far):
2ab05381 5685
c476f425
DM
5686 <CURLYX cnt=-1>
5687 <CURLYX cnt=0><WHILEM>
5688 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5689 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5690 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5691 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5692
c476f425
DM
5693(Contrast this with something like CURLYM, which maintains only a single
5694backtrack state:
2ab05381 5695
c476f425
DM
5696 <CURLYM cnt=0> a1
5697 a1 <CURLYM cnt=1> a2
5698 a1 a2 <CURLYM cnt=2> a3
5699 a1 a2 a3 <CURLYM cnt=3> b
5700)
2ab05381 5701
c476f425
DM
5702Each WHILEM state block marks a point to backtrack to upon partial failure
5703of A or B, and also contains some minor state data related to that
5704iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5705overall state, such as the count, and pointers to the A and B ops.
2ab05381 5706
c476f425
DM
5707This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5708must always point to the *current* CURLYX block, the rules are:
2ab05381 5709
c476f425
DM
5710When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5711and set cur_curlyx to point the new block.
2ab05381 5712
c476f425
DM
5713When popping the CURLYX block after a successful or unsuccessful match,
5714restore the previous cur_curlyx.
2ab05381 5715
c476f425
DM
5716When WHILEM is about to execute B, save the current cur_curlyx, and set it
5717to the outer one saved in the CURLYX block.
2ab05381 5718
c476f425
DM
5719When popping the WHILEM block after a successful or unsuccessful B match,
5720restore the previous cur_curlyx.
2ab05381 5721
c476f425
DM
5722Here's an example for the pattern (AI* BI)*BO
5723I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5724
c476f425
DM
5725cur_
5726curlyx backtrack stack
5727------ ---------------
5728NULL
5729CO <CO prev=NULL> <WO>
5730CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5731CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5732NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5733
c476f425
DM
5734At this point the pattern succeeds, and we work back down the stack to
5735clean up, restoring as we go:
95b24440 5736
c476f425
DM
5737CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5738CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5739CO <CO prev=NULL> <WO>
5740NULL
a0374537 5741
c476f425
DM
5742*******************************************************************/
5743
5744#define ST st->u.curlyx
5745
5746 case CURLYX: /* start of /A*B/ (for complex A) */
5747 {
5748 /* No need to save/restore up to this paren */
5749 I32 parenfloor = scan->flags;
5750
5751 assert(next); /* keep Coverity happy */
5752 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5753 next += ARG(next);
5754
5755 /* XXXX Probably it is better to teach regpush to support
92da3157 5756 parenfloor > maxopenparen ... */
b93070ed
DM
5757 if (parenfloor > (I32)rex->lastparen)
5758 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5759
5760 ST.prev_curlyx= cur_curlyx;
5761 cur_curlyx = st;
5762 ST.cp = PL_savestack_ix;
5763
5764 /* these fields contain the state of the current curly.
5765 * they are accessed by subsequent WHILEMs */
5766 ST.parenfloor = parenfloor;
d02d6d97 5767 ST.me = scan;
c476f425 5768 ST.B = next;
24d3c4a9
DM
5769 ST.minmod = minmod;
5770 minmod = 0;
c476f425
DM
5771 ST.count = -1; /* this will be updated by WHILEM */
5772 ST.lastloc = NULL; /* this will be updated by WHILEM */
5773
4d5016e5 5774 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
a74ff37d
JH
5775 /* NOTREACHED */
5776 assert(0);
c476f425 5777 }
a0d0e21e 5778
c476f425 5779 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5780 cur_curlyx = ST.prev_curlyx;
5781 sayYES;
a74ff37d
JH
5782 /* NOTREACHED */
5783 assert(0);
a0d0e21e 5784
c476f425
DM
5785 case CURLYX_end_fail: /* just failed to match all of A*B */
5786 regcpblow(ST.cp);
5787 cur_curlyx = ST.prev_curlyx;
5788 sayNO;
a74ff37d
JH
5789 /* NOTREACHED */
5790 assert(0);
4633a7c4 5791
a0d0e21e 5792
c476f425
DM
5793#undef ST
5794#define ST st->u.whilem
5795
5796 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5797 {
5798 /* see the discussion above about CURLYX/WHILEM */
c476f425 5799 I32 n;
1a522d5d
JH
5800 int min, max;
5801 regnode *A;
d02d6d97 5802
c476f425 5803 assert(cur_curlyx); /* keep Coverity happy */
1a522d5d
JH
5804
5805 min = ARG1(cur_curlyx->u.curlyx.me);
5806 max = ARG2(cur_curlyx->u.curlyx.me);
5807 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
c476f425
DM
5808 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5809 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5810 ST.cache_offset = 0;
5811 ST.cache_mask = 0;
5812
c476f425
DM
5813
5814 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5815 "%*s whilem: matched %ld out of %d..%d\n",
5816 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5817 );
a0d0e21e 5818
c476f425 5819 /* First just match a string of min A's. */
a0d0e21e 5820
d02d6d97 5821 if (n < min) {
92da3157
DM
5822 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5823 maxopenparen);
c476f425 5824 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5825 REGCP_SET(ST.lastcp);
5826
4d5016e5 5827 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
a74ff37d
JH
5828 /* NOTREACHED */
5829 assert(0);
c476f425
DM
5830 }
5831
5832 /* If degenerate A matches "", assume A done. */
5833
5834 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5835 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5836 "%*s whilem: empty match detected, trying continuation...\n",
5837 REPORT_CODE_OFF+depth*2, "")
5838 );
5839 goto do_whilem_B_max;
5840 }
5841
1cb95af7
DM
5842 /* super-linear cache processing.
5843 *
5844 * The idea here is that for certain types of CURLYX/WHILEM -
5845 * principally those whose upper bound is infinity (and
5846 * excluding regexes that have things like \1 and other very
5847 * non-regular expresssiony things), then if a pattern like
5848 * /....A*.../ fails and we backtrack to the WHILEM, then we
5849 * make a note that this particular WHILEM op was at string
5850 * position 47 (say) when the rest of pattern failed. Then, if
5851 * we ever find ourselves back at that WHILEM, and at string
5852 * position 47 again, we can just fail immediately rather than
5853 * running the rest of the pattern again.
5854 *
5855 * This is very handy when patterns start to go
5856 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5857 * with a combinatorial explosion of backtracking.
5858 *
5859 * The cache is implemented as a bit array, with one bit per
5860 * string byte position per WHILEM op (up to 16) - so its
5861 * between 0.25 and 2x the string size.
5862 *
5863 * To avoid allocating a poscache buffer every time, we do an
5864 * initially countdown; only after we have executed a WHILEM
5865 * op (string-length x #WHILEMs) times do we allocate the
5866 * cache.
5867 *
5868 * The top 4 bits of scan->flags byte say how many different
5869 * relevant CURLLYX/WHILEM op pairs there are, while the
5870 * bottom 4-bits is the identifying index number of this
5871 * WHILEM.
5872 */
c476f425
DM
5873
5874 if (scan->flags) {
a0d0e21e 5875
1cb48e53 5876 if (!reginfo->poscache_maxiter) {
c476f425
DM
5877 /* start the countdown: Postpone detection until we
5878 * know the match is not *that* much linear. */
1cb48e53 5879 reginfo->poscache_maxiter
9d9163fb
DM
5880 = (reginfo->strend - reginfo->strbeg + 1)
5881 * (scan->flags>>4);
66bf836d 5882 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
5883 if (reginfo->poscache_maxiter < 0)
5884 reginfo->poscache_maxiter = I32_MAX;
5885 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 5886 }
c476f425 5887
1cb48e53 5888 if (reginfo->poscache_iter-- == 0) {
c476f425 5889 /* initialise cache */
ea3daa5d 5890 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
5891 regmatch_info_aux *const aux = reginfo->info_aux;
5892 if (aux->poscache) {
ea3daa5d 5893 if ((SSize_t)reginfo->poscache_size < size) {
2ac8ff4b
DM
5894 Renew(aux->poscache, size, char);
5895 reginfo->poscache_size = size;
2c2d71f5 5896 }
2ac8ff4b 5897 Zero(aux->poscache, size, char);
2c2d71f5
JH
5898 }
5899 else {
2ac8ff4b
DM
5900 reginfo->poscache_size = size;
5901 Newxz(aux->poscache, size, char);
2c2d71f5 5902 }
c476f425
DM
5903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5904 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5905 PL_colors[4], PL_colors[5])
5906 );
2c2d71f5 5907 }
c476f425 5908
1cb48e53 5909 if (reginfo->poscache_iter < 0) {
c476f425 5910 /* have we already failed at this position? */
ea3daa5d 5911 SSize_t offset, mask;
338e600a
DM
5912
5913 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 5914 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
5915 + (locinput - reginfo->strbeg)
5916 * (scan->flags>>4);
c476f425
DM
5917 mask = 1 << (offset % 8);
5918 offset /= 8;
2ac8ff4b 5919 if (reginfo->info_aux->poscache[offset] & mask) {
c476f425
DM
5920 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5921 "%*s whilem: (cache) already tried at this position...\n",
5922 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5923 );
3298f257 5924 sayNO; /* cache records failure */
2c2d71f5 5925 }
c476f425
DM
5926 ST.cache_offset = offset;
5927 ST.cache_mask = mask;
2c2d71f5 5928 }
c476f425 5929 }
2c2d71f5 5930
c476f425 5931 /* Prefer B over A for minimal matching. */
a687059c 5932
c476f425
DM
5933 if (cur_curlyx->u.curlyx.minmod) {
5934 ST.save_curlyx = cur_curlyx;
5935 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5936 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5937 maxopenparen);
c476f425 5938 REGCP_SET(ST.lastcp);
4d5016e5
DM
5939 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5940 locinput);
a74ff37d
JH
5941 /* NOTREACHED */
5942 assert(0);
c476f425 5943 }
a0d0e21e 5944
c476f425
DM
5945 /* Prefer A over B for maximal matching. */
5946
d02d6d97 5947 if (n < max) { /* More greed allowed? */
92da3157
DM
5948 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5949 maxopenparen);
c476f425
DM
5950 cur_curlyx->u.curlyx.lastloc = locinput;
5951 REGCP_SET(ST.lastcp);
4d5016e5 5952 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
a74ff37d
JH
5953 /* NOTREACHED */
5954 assert(0);
c476f425
DM
5955 }
5956 goto do_whilem_B_max;
5957 }
a74ff37d
JH
5958 /* NOTREACHED */
5959 assert(0);
c476f425
DM
5960
5961 case WHILEM_B_min: /* just matched B in a minimal match */
5962 case WHILEM_B_max: /* just matched B in a maximal match */
5963 cur_curlyx = ST.save_curlyx;
5964 sayYES;
a74ff37d
JH
5965 /* NOTREACHED */
5966 assert(0);
c476f425
DM
5967
5968 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5969 cur_curlyx = ST.save_curlyx;
5970 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5971 cur_curlyx->u.curlyx.count--;
5972 CACHEsayNO;
a74ff37d
JH
5973 /* NOTREACHED */
5974 assert(0);
c476f425
DM
5975
5976 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
924ba076 5977 /* FALLTHROUGH */
c476f425 5978 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5979 REGCP_UNWIND(ST.lastcp);
92da3157 5980 regcppop(rex, &maxopenparen);
c476f425
DM
5981 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5982 cur_curlyx->u.curlyx.count--;
5983 CACHEsayNO;
a74ff37d
JH
5984 /* NOTREACHED */
5985 assert(0);
c476f425
DM
5986
5987 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5988 REGCP_UNWIND(ST.lastcp);
92da3157 5989 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5990 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5991 "%*s whilem: failed, trying continuation...\n",
5992 REPORT_CODE_OFF+depth*2, "")
5993 );
5994 do_whilem_B_max:
5995 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5996 && ckWARN(WARN_REGEXP)
39819bd9 5997 && !reginfo->warned)
c476f425 5998 {
39819bd9 5999 reginfo->warned = TRUE;
dcbac5bb
FC
6000 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6001 "Complex regular subexpression recursion limit (%d) "
6002 "exceeded",
c476f425
DM
6003 REG_INFTY - 1);
6004 }
6005
6006 /* now try B */
6007 ST.save_curlyx = cur_curlyx;
6008 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
6009 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6010 locinput);
a74ff37d
JH
6011 /* NOTREACHED */
6012 assert(0);
c476f425
DM
6013
6014 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6015 cur_curlyx = ST.save_curlyx;
6016 REGCP_UNWIND(ST.lastcp);
92da3157 6017 regcppop(rex, &maxopenparen);
c476f425 6018
d02d6d97 6019 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
6020 /* Maximum greed exceeded */
6021 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6022 && ckWARN(WARN_REGEXP)
39819bd9 6023 && !reginfo->warned)
c476f425 6024 {
39819bd9 6025 reginfo->warned = TRUE;
c476f425 6026 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
6027 "Complex regular subexpression recursion "
6028 "limit (%d) exceeded",
c476f425 6029 REG_INFTY - 1);
a0d0e21e 6030 }
c476f425 6031 cur_curlyx->u.curlyx.count--;
3ab3c9b4 6032 CACHEsayNO;
a0d0e21e 6033 }
c476f425
DM
6034
6035 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6036 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6037 );
6038 /* Try grabbing another A and see if it helps. */
c476f425 6039 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
6040 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6041 maxopenparen);
c476f425 6042 REGCP_SET(ST.lastcp);
d02d6d97 6043 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
6044 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6045 locinput);
a74ff37d
JH
6046 /* NOTREACHED */
6047 assert(0);
40a82448
DM
6048
6049#undef ST
6050#define ST st->u.branch
6051
6052 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
6053 next = scan + ARG(scan);
6054 if (next == scan)
6055 next = NULL;
40a82448 6056 scan = NEXTOPER(scan);
924ba076 6057 /* FALLTHROUGH */
c277df42 6058
40a82448
DM
6059 case BRANCH: /* /(...|A|...)/ */
6060 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 6061 ST.lastparen = rex->lastparen;
f6033a9d 6062 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
6063 ST.next_branch = next;
6064 REGCP_SET(ST.cp);
02db2b7b 6065
40a82448 6066 /* Now go into the branch */
5d458dd8 6067 if (has_cutgroup) {
4d5016e5 6068 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6069 } else {
4d5016e5 6070 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6071 }
a74ff37d
JH
6072 /* NOTREACHED */
6073 assert(0);
3c0563b9
DM
6074
6075 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 6076 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 6077 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6078 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
a74ff37d
JH
6079 /* NOTREACHED */
6080 assert(0);
3c0563b9 6081
5d458dd8
YO
6082 case CUTGROUP_next_fail:
6083 do_cutgroup = 1;
6084 no_final = 1;
6085 if (st->u.mark.mark_name)
6086 sv_commit = st->u.mark.mark_name;
6087 sayNO;
a74ff37d
JH
6088 /* NOTREACHED */
6089 assert(0);
3c0563b9 6090
5d458dd8
YO
6091 case BRANCH_next:
6092 sayYES;
a74ff37d
JH
6093 /* NOTREACHED */
6094 assert(0);
3c0563b9 6095
40a82448 6096 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
6097 if (do_cutgroup) {
6098 do_cutgroup = 0;
6099 no_final = 0;
6100 }
40a82448 6101 REGCP_UNWIND(ST.cp);
a8d1f4b4 6102 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
6103 scan = ST.next_branch;
6104 /* no more branches? */
5d458dd8
YO
6105 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6106 DEBUG_EXECUTE_r({
6107 PerlIO_printf( Perl_debug_log,
6108 "%*s %sBRANCH failed...%s\n",
6109 REPORT_CODE_OFF+depth*2, "",
6110 PL_colors[4],
6111 PL_colors[5] );
6112 });
6113 sayNO_SILENT;
6114 }
40a82448 6115 continue; /* execute next BRANCH[J] op */
a74ff37d
JH
6116 /* NOTREACHED */
6117 assert(0);
40a82448 6118
3c0563b9 6119 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 6120 minmod = 1;
a0d0e21e 6121 break;
40a82448
DM
6122
6123#undef ST
6124#define ST st->u.curlym
6125
6126 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6127
6128 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 6129 * only a single backtracking state, no matter how many matches
40a82448
DM
6130 * there are in {m,n}. It relies on the pattern being constant
6131 * length, with no parens to influence future backrefs
6132 */
6133
6134 ST.me = scan;
dc45a647 6135 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 6136
f6033a9d
DM
6137 ST.lastparen = rex->lastparen;
6138 ST.lastcloseparen = rex->lastcloseparen;
6139
40a82448
DM
6140 /* if paren positive, emulate an OPEN/CLOSE around A */
6141 if (ST.me->flags) {
3b6647e0 6142 U32 paren = ST.me->flags;
92da3157
DM
6143 if (paren > maxopenparen)
6144 maxopenparen = paren;
c277df42 6145 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 6146 }
40a82448
DM
6147 ST.A = scan;
6148 ST.B = next;
6149 ST.alen = 0;
6150 ST.count = 0;
24d3c4a9
DM
6151 ST.minmod = minmod;
6152 minmod = 0;
40a82448
DM
6153 ST.c1 = CHRTEST_UNINIT;
6154 REGCP_SET(ST.cp);
6407bf3b 6155
40a82448
DM
6156 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6157 goto curlym_do_B;
6158
6159 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 6160 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
a74ff37d
JH
6161 /* NOTREACHED */
6162 assert(0);
5f80c4cf 6163
40a82448 6164 case CURLYM_A: /* we've just matched an A */
40a82448
DM
6165 ST.count++;
6166 /* after first match, determine A's length: u.curlym.alen */
6167 if (ST.count == 1) {
ba44c216 6168 if (reginfo->is_utf8_target) {
c07e9d7b
DM
6169 char *s = st->locinput;
6170 while (s < locinput) {
40a82448
DM
6171 ST.alen++;
6172 s += UTF8SKIP(s);
6173 }
6174 }
6175 else {
c07e9d7b 6176 ST.alen = locinput - st->locinput;
40a82448
DM
6177 }
6178 if (ST.alen == 0)
6179 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6180 }
0cadcf80
DM
6181 DEBUG_EXECUTE_r(
6182 PerlIO_printf(Perl_debug_log,
40a82448 6183 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 6184 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 6185 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
6186 );
6187
0a4db386 6188 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6189 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
6190 goto fake_end;
6191
c966426a
DM
6192 {
6193 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6194 if ( max == REG_INFTY || ST.count < max )
6195 goto curlym_do_A; /* try to match another A */
6196 }
40a82448 6197 goto curlym_do_B; /* try to match B */
5f80c4cf 6198
40a82448
DM
6199 case CURLYM_A_fail: /* just failed to match an A */
6200 REGCP_UNWIND(ST.cp);
0a4db386
YO
6201
6202 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6203 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6204 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 6205 sayNO;
0cadcf80 6206
40a82448 6207 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
6208 if (ST.c1 == CHRTEST_UNINIT) {
6209 /* calculate c1 and c2 for possible match of 1st char
6210 * following curly */
6211 ST.c1 = ST.c2 = CHRTEST_VOID;
d20a21f4 6212 assert(ST.B);
40a82448
DM
6213 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6214 regnode *text_node = ST.B;
6215 if (! HAS_TEXT(text_node))
6216 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
6217 /* this used to be
6218
6219 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6220
6221 But the former is redundant in light of the latter.
6222
6223 if this changes back then the macro for
6224 IS_TEXT and friends need to change.
6225 */
c74f6de9 6226 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 6227 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6228 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6229 reginfo))
c74f6de9
KW
6230 {
6231 sayNO;
6232 }
c277df42 6233 }
c277df42 6234 }
40a82448
DM
6235 }
6236
6237 DEBUG_EXECUTE_r(
6238 PerlIO_printf(Perl_debug_log,
6239 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 6240 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
6241 "", (IV)ST.count)
6242 );
c74f6de9 6243 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
6244 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6245 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6246 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6247 {
6248 /* simulate B failing */
6249 DEBUG_OPTIMISE_r(
6250 PerlIO_printf(Perl_debug_log,
33daa3a5 6251 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
79a2a0e8
KW
6252 (int)(REPORT_CODE_OFF+(depth*2)),"",
6253 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6254 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6255 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6256 );
6257 state_num = CURLYM_B_fail;
6258 goto reenter_switch;
6259 }
6260 }
6261 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
6262 /* simulate B failing */
6263 DEBUG_OPTIMISE_r(
6264 PerlIO_printf(Perl_debug_log,
33daa3a5 6265 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5400f398 6266 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
6267 (int) nextchr, ST.c1, ST.c2)
6268 );
5400f398
KW
6269 state_num = CURLYM_B_fail;
6270 goto reenter_switch;
6271 }
c74f6de9 6272 }
40a82448
DM
6273
6274 if (ST.me->flags) {
f6033a9d 6275 /* emulate CLOSE: mark current A as captured */
40a82448
DM
6276 I32 paren = ST.me->flags;
6277 if (ST.count) {
b93070ed 6278 rex->offs[paren].start
9d9163fb
DM
6279 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6280 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
6281 if ((U32)paren > rex->lastparen)
6282 rex->lastparen = paren;
6283 rex->lastcloseparen = paren;
c277df42 6284 }
40a82448 6285 else
b93070ed 6286 rex->offs[paren].end = -1;
0a4db386 6287 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6288 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
6289 {
6290 if (ST.count)
6291 goto fake_end;
6292 else
6293 sayNO;
6294 }
c277df42 6295 }
0a4db386 6296
4d5016e5 6297 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
a74ff37d
JH
6298 /* NOTREACHED */
6299 assert(0);
40a82448
DM
6300
6301 case CURLYM_B_fail: /* just failed to match a B */
6302 REGCP_UNWIND(ST.cp);
a8d1f4b4 6303 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 6304 if (ST.minmod) {
84d2fa14
HS
6305 I32 max = ARG2(ST.me);
6306 if (max != REG_INFTY && ST.count == max)
40a82448
DM
6307 sayNO;
6308 goto curlym_do_A; /* try to match a further A */
6309 }
6310 /* backtrack one A */
6311 if (ST.count == ARG1(ST.me) /* min */)
6312 sayNO;
6313 ST.count--;
7016d6eb 6314 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
6315 goto curlym_do_B; /* try to match B */
6316
c255a977
DM
6317#undef ST
6318#define ST st->u.curly
40a82448 6319
c255a977
DM
6320#define CURLY_SETPAREN(paren, success) \
6321 if (paren) { \
6322 if (success) { \
9d9163fb
DM
6323 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6324 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
6325 if (paren > rex->lastparen) \
6326 rex->lastparen = paren; \
b93070ed 6327 rex->lastcloseparen = paren; \
c255a977 6328 } \
f6033a9d 6329 else { \
b93070ed 6330 rex->offs[paren].end = -1; \
f6033a9d
DM
6331 rex->lastparen = ST.lastparen; \
6332 rex->lastcloseparen = ST.lastcloseparen; \
6333 } \
c255a977
DM
6334 }
6335
b40a2c17 6336 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
6337 ST.paren = 0;
6338 ST.min = 0;
6339 ST.max = REG_INFTY;
a0d0e21e
LW
6340 scan = NEXTOPER(scan);
6341 goto repeat;
3c0563b9 6342
b40a2c17 6343 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
6344 ST.paren = 0;
6345 ST.min = 1;
6346 ST.max = REG_INFTY;
c277df42 6347 scan = NEXTOPER(scan);
c255a977 6348 goto repeat;
3c0563b9 6349
b40a2c17 6350 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
6351 ST.paren = scan->flags; /* Which paren to set */
6352 ST.lastparen = rex->lastparen;
f6033a9d 6353 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
6354 if (ST.paren > maxopenparen)
6355 maxopenparen = ST.paren;
c255a977
DM
6356 ST.min = ARG1(scan); /* min to match */
6357 ST.max = ARG2(scan); /* max to match */
0a4db386 6358 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6359 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6360 ST.min=1;
6361 ST.max=1;
6362 }
c255a977
DM
6363 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6364 goto repeat;
3c0563b9 6365
b40a2c17 6366 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
6367 ST.paren = 0;
6368 ST.min = ARG1(scan); /* min to match */
6369 ST.max = ARG2(scan); /* max to match */
6370 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 6371 repeat:
a0d0e21e
LW
6372 /*
6373 * Lookahead to avoid useless match attempts
6374 * when we know what character comes next.
c255a977 6375 *
5f80c4cf
JP
6376 * Used to only do .*x and .*?x, but now it allows
6377 * for )'s, ('s and (?{ ... })'s to be in the way
6378 * of the quantifier and the EXACT-like node. -- japhy
6379 */
6380
eb5c1be8 6381 assert(ST.min <= ST.max);
3337dfe3
KW
6382 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6383 ST.c1 = ST.c2 = CHRTEST_VOID;
6384 }
6385 else {
5f80c4cf
JP
6386 regnode *text_node = next;
6387
3dab1dad
YO
6388 if (! HAS_TEXT(text_node))
6389 FIND_NEXT_IMPT(text_node);
5f80c4cf 6390
9e137952 6391 if (! HAS_TEXT(text_node))
c255a977 6392 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 6393 else {
ee9b8eae 6394 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 6395 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 6396 }
c74f6de9 6397 else {
ee9b8eae
YO
6398
6399 /* Currently we only get here when
6400
6401 PL_rekind[OP(text_node)] == EXACT
6402
6403 if this changes back then the macro for IS_TEXT and
6404 friends need to change. */
79a2a0e8 6405 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6406 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6407 reginfo))
c74f6de9
KW
6408 {
6409 sayNO;
6410 }
6411 }
1aa99e6b 6412 }
bbce6d69 6413 }
c255a977
DM
6414
6415 ST.A = scan;
6416 ST.B = next;
24d3c4a9 6417 if (minmod) {
eb72505d 6418 char *li = locinput;
24d3c4a9 6419 minmod = 0;
984e6dd1 6420 if (ST.min &&
f9176b44 6421 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
984e6dd1 6422 < ST.min)
4633a7c4 6423 sayNO;
7016d6eb 6424 SET_locinput(li);
c255a977 6425 ST.count = ST.min;
c255a977
DM
6426 REGCP_SET(ST.cp);
6427 if (ST.c1 == CHRTEST_VOID)
6428 goto curly_try_B_min;
6429
6430 ST.oldloc = locinput;
6431
6432 /* set ST.maxpos to the furthest point along the
6433 * string that could possibly match */
6434 if (ST.max == REG_INFTY) {
220db18a 6435 ST.maxpos = reginfo->strend - 1;
f2ed9b32 6436 if (utf8_target)
c255a977
DM
6437 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6438 ST.maxpos--;
6439 }
f2ed9b32 6440 else if (utf8_target) {
c255a977
DM
6441 int m = ST.max - ST.min;
6442 for (ST.maxpos = locinput;
220db18a 6443 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
6444 ST.maxpos += UTF8SKIP(ST.maxpos);
6445 }
6446 else {
6447 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
6448 if (ST.maxpos >= reginfo->strend)
6449 ST.maxpos = reginfo->strend - 1;
c255a977
DM
6450 }
6451 goto curly_try_B_min_known;
6452
6453 }
6454 else {
eb72505d
DM
6455 /* avoid taking address of locinput, so it can remain
6456 * a register var */
6457 char *li = locinput;
f9176b44 6458 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
c255a977
DM
6459 if (ST.count < ST.min)
6460 sayNO;
7016d6eb 6461 SET_locinput(li);
c255a977
DM
6462 if ((ST.count > ST.min)
6463 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6464 {
6465 /* A{m,n} must come at the end of the string, there's
6466 * no point in backing off ... */
6467 ST.min = ST.count;
6468 /* ...except that $ and \Z can match before *and* after
6469 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6470 We may back off by one in this case. */
eb72505d 6471 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6472 ST.min--;
6473 }
6474 REGCP_SET(ST.cp);
6475 goto curly_try_B_max;
6476 }
a74ff37d
JH
6477 /* NOTREACHED */
6478 assert(0);
c255a977
DM
6479
6480 case CURLY_B_min_known_fail:
6481 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6482
c255a977 6483 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6484 if (ST.paren) {
6485 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6486 }
c255a977
DM
6487 /* Couldn't or didn't -- move forward. */
6488 ST.oldloc = locinput;
f2ed9b32 6489 if (utf8_target)
c255a977
DM
6490 locinput += UTF8SKIP(locinput);
6491 else
6492 locinput++;
6493 ST.count++;
6494 curly_try_B_min_known:
6495 /* find the next place where 'B' could work, then call B */
6496 {
6497 int n;
f2ed9b32 6498 if (utf8_target) {
c255a977
DM
6499 n = (ST.oldloc == locinput) ? 0 : 1;
6500 if (ST.c1 == ST.c2) {
c255a977 6501 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6502 while (locinput <= ST.maxpos
6503 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6504 {
6505 locinput += UTF8SKIP(locinput);
c255a977
DM
6506 n++;
6507 }
1aa99e6b
IH
6508 }
6509 else {
c255a977 6510 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6511 while (locinput <= ST.maxpos
6512 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6513 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6514 {
6515 locinput += UTF8SKIP(locinput);
c255a977 6516 n++;
1aa99e6b 6517 }
0fe9bf95
IZ
6518 }
6519 }
5400f398 6520 else { /* Not utf8_target */
c255a977
DM
6521 if (ST.c1 == ST.c2) {
6522 while (locinput <= ST.maxpos &&
6523 UCHARAT(locinput) != ST.c1)
6524 locinput++;
bbce6d69 6525 }
c255a977
DM
6526 else {
6527 while (locinput <= ST.maxpos
6528 && UCHARAT(locinput) != ST.c1
6529 && UCHARAT(locinput) != ST.c2)
6530 locinput++;
a0ed51b3 6531 }
c255a977
DM
6532 n = locinput - ST.oldloc;
6533 }
6534 if (locinput > ST.maxpos)
6535 sayNO;
c255a977 6536 if (n) {
eb72505d
DM
6537 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6538 * at b; check that everything between oldloc and
6539 * locinput matches */
6540 char *li = ST.oldloc;
c255a977 6541 ST.count += n;
f9176b44 6542 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
4633a7c4 6543 sayNO;
eb72505d 6544 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6545 }
c255a977 6546 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6547 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6548 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6549 goto fake_end;
6550 }
4d5016e5 6551 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6552 }
a74ff37d
JH
6553 /* NOTREACHED */
6554 assert(0);
c255a977
DM
6555
6556 case CURLY_B_min_fail:
6557 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6558
6559 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6560 if (ST.paren) {
6561 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6562 }
c255a977 6563 /* failed -- move forward one */
f73aaa43 6564 {
eb72505d 6565 char *li = locinput;
f9176b44 6566 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
f73aaa43
DM
6567 sayNO;
6568 }
eb72505d 6569 locinput = li;
f73aaa43
DM
6570 }
6571 {
c255a977 6572 ST.count++;
c255a977
DM
6573 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6574 ST.count > 0)) /* count overflow ? */
15272685 6575 {
c255a977
DM
6576 curly_try_B_min:
6577 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6578 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6579 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6580 goto fake_end;
6581 }
4d5016e5 6582 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6583 }
6584 }
c74f6de9 6585 sayNO;
a74ff37d
JH
6586 /* NOTREACHED */
6587 assert(0);
c255a977
DM
6588
6589 curly_try_B_max:
6590 /* a successful greedy match: now try to match B */
40d049e4 6591 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6592 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6593 goto fake_end;
6594 }
c255a977 6595 {
220db18a 6596 bool could_match = locinput < reginfo->strend;
79a2a0e8 6597
c255a977 6598 /* If it could work, try it. */
79a2a0e8
KW
6599 if (ST.c1 != CHRTEST_VOID && could_match) {
6600 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6601 {
6602 could_match = memEQ(locinput,
6603 ST.c1_utf8,
6604 UTF8SKIP(locinput))
6605 || memEQ(locinput,
6606 ST.c2_utf8,
6607 UTF8SKIP(locinput));
6608 }
6609 else {
6610 could_match = UCHARAT(locinput) == ST.c1
6611 || UCHARAT(locinput) == ST.c2;
6612 }
6613 }
6614 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6615 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6616 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
a74ff37d
JH
6617 /* NOTREACHED */
6618 assert(0);
c255a977
DM
6619 }
6620 }
924ba076 6621 /* FALLTHROUGH */
3c0563b9 6622
c255a977
DM
6623 case CURLY_B_max_fail:
6624 /* failed to find B in a greedy match */
c255a977
DM
6625
6626 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6627 if (ST.paren) {
6628 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6629 }
c255a977
DM
6630 /* back up. */
6631 if (--ST.count < ST.min)
6632 sayNO;
eb72505d 6633 locinput = HOPc(locinput, -1);
c255a977
DM
6634 goto curly_try_B_max;
6635
6636#undef ST
6637
3c0563b9 6638 case END: /* last op of main pattern */
6bda09f9 6639 fake_end:
faec1544
DM
6640 if (cur_eval) {
6641 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544 6642
288b8c02 6643 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6644
6645 /* Save *all* the positions. */
6646 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b 6647 rex_sv = cur_eval->u.eval.prev_rex;
aed7b151 6648 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6649 SET_reg_curpm(rex_sv);
8d919b0a 6650 rex = ReANY(rex_sv);
f8fc2ecf 6651 rexi = RXi_GET(rex);
faec1544 6652 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6653
faec1544 6654 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6655
6656 /* Restore parens of the outer rex without popping the
6657 * savestack */
92da3157
DM
6658 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6659 &maxopenparen);
faec1544
DM
6660
6661 st->u.eval.prev_eval = cur_eval;
6662 cur_eval = cur_eval->u.eval.prev_eval;
6663 DEBUG_EXECUTE_r(
2a49f0f5
JH
6664 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6665 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6666 if ( nochange_depth )
6667 nochange_depth--;
6668
4d5016e5
DM
6669 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6670 locinput); /* match B */
faec1544
DM
6671 }
6672
3b0527fe 6673 if (locinput < reginfo->till) {
a3621e74 6674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6675 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6676 PL_colors[4],
6d59b646
DM
6677 (long)(locinput - startpos),
6678 (long)(reginfo->till - startpos),
7821416a 6679 PL_colors[5]));
58e23c8d 6680
262b90c4 6681 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6682 }
262b90c4 6683 sayYES; /* Success! */
dad79028
DM
6684
6685 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6686 DEBUG_EXECUTE_r(
6687 PerlIO_printf(Perl_debug_log,
6688 "%*s %ssubpattern success...%s\n",
5bc10b2c 6689 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6690 sayYES; /* Success! */
dad79028 6691
40a82448
DM
6692#undef ST
6693#define ST st->u.ifmatch
6694
37f53970
DM
6695 {
6696 char *newstart;
6697
40a82448
DM
6698 case SUSPEND: /* (?>A) */
6699 ST.wanted = 1;
37f53970 6700 newstart = locinput;
9041c2e3 6701 goto do_ifmatch;
dad79028 6702
40a82448
DM
6703 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6704 ST.wanted = 0;
dad79028
DM
6705 goto ifmatch_trivial_fail_test;
6706
40a82448
DM
6707 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6708 ST.wanted = 1;
dad79028 6709 ifmatch_trivial_fail_test:
a0ed51b3 6710 if (scan->flags) {
52657f30 6711 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6712 if (!s) {
6713 /* trivial fail */
24d3c4a9
DM
6714 if (logical) {
6715 logical = 0;
f2338a2e 6716 sw = 1 - cBOOL(ST.wanted);
dad79028 6717 }
40a82448 6718 else if (ST.wanted)
dad79028
DM
6719 sayNO;
6720 next = scan + ARG(scan);
6721 if (next == scan)
6722 next = NULL;
6723 break;
6724 }
37f53970 6725 newstart = s;
a0ed51b3
LW
6726 }
6727 else
37f53970 6728 newstart = locinput;
a0ed51b3 6729
c277df42 6730 do_ifmatch:
40a82448 6731 ST.me = scan;
24d3c4a9 6732 ST.logical = logical;
24d786f4
YO
6733 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6734
40a82448 6735 /* execute body of (?...A) */
37f53970 6736 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
a74ff37d
JH
6737 /* NOTREACHED */
6738 assert(0);
37f53970 6739 }
40a82448
DM
6740
6741 case IFMATCH_A_fail: /* body of (?...A) failed */
6742 ST.wanted = !ST.wanted;
924ba076 6743 /* FALLTHROUGH */
40a82448
DM
6744
6745 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6746 if (ST.logical) {
f2338a2e 6747 sw = cBOOL(ST.wanted);
40a82448
DM
6748 }
6749 else if (!ST.wanted)
6750 sayNO;
6751
37f53970
DM
6752 if (OP(ST.me) != SUSPEND) {
6753 /* restore old position except for (?>...) */
6754 locinput = st->locinput;
40a82448
DM
6755 }
6756 scan = ST.me + ARG(ST.me);
6757 if (scan == ST.me)
6758 scan = NULL;
6759 continue; /* execute B */
6760
6761#undef ST
dad79028 6762
3c0563b9
DM
6763 case LONGJMP: /* alternative with many branches compiles to
6764 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6765 next = scan + ARG(scan);
6766 if (next == scan)
6767 next = NULL;
a0d0e21e 6768 break;
3c0563b9
DM
6769
6770 case COMMIT: /* (*COMMIT) */
220db18a 6771 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 6772 /* FALLTHROUGH */
3c0563b9
DM
6773
6774 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6775 if (!scan->flags)
ad64d0ec 6776 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6777 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
a74ff37d
JH
6778 /* NOTREACHED */
6779 assert(0);
3c0563b9 6780
54612592
YO
6781 case COMMIT_next_fail:
6782 no_final = 1;
6783 /* FALLTHROUGH */
3c0563b9
DM
6784
6785 case OPFAIL: /* (*FAIL) */
7f69552c 6786 sayNO;
a74ff37d
JH
6787 /* NOTREACHED */
6788 assert(0);
e2e6a0f1
YO
6789
6790#define ST st->u.mark
3c0563b9 6791 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6792 ST.prev_mark = mark_state;
5d458dd8 6793 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6794 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6795 mark_state = st;
4d5016e5
DM
6796 ST.mark_loc = locinput;
6797 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
a74ff37d
JH
6798 /* NOTREACHED */
6799 assert(0);
3c0563b9 6800
e2e6a0f1
YO
6801 case MARKPOINT_next:
6802 mark_state = ST.prev_mark;
6803 sayYES;
a74ff37d
JH
6804 /* NOTREACHED */
6805 assert(0);
3c0563b9 6806
e2e6a0f1 6807 case MARKPOINT_next_fail:
5d458dd8 6808 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6809 {
6810 if (ST.mark_loc > startpoint)
6811 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6812 popmark = NULL; /* we found our mark */
6813 sv_commit = ST.mark_name;
6814
6815 DEBUG_EXECUTE_r({
5d458dd8 6816 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6817 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6818 REPORT_CODE_OFF+depth*2, "",
be2597df 6819 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6820 });
6821 }
6822 mark_state = ST.prev_mark;
5d458dd8
YO
6823 sv_yes_mark = mark_state ?
6824 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6825 sayNO;
a74ff37d
JH
6826 /* NOTREACHED */
6827 assert(0);
3c0563b9
DM
6828
6829 case SKIP: /* (*SKIP) */
5d458dd8 6830 if (scan->flags) {
2bf803e2 6831 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6832 ST.mark_name = NULL;
e2e6a0f1 6833 ST.mark_loc = locinput;
4d5016e5 6834 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6835 } else {
2bf803e2 6836 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6837 otherwise do nothing. Meaning we need to scan
6838 */
6839 regmatch_state *cur = mark_state;
ad64d0ec 6840 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6841
6842 while (cur) {
6843 if ( sv_eq( cur->u.mark.mark_name,
6844 find ) )
6845 {
6846 ST.mark_name = find;
4d5016e5 6847 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6848 }
6849 cur = cur->u.mark.prev_mark;
6850 }
e2e6a0f1 6851 }
2bf803e2 6852 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6853 break;
3c0563b9 6854
5d458dd8
YO
6855 case SKIP_next_fail:
6856 if (ST.mark_name) {
6857 /* (*CUT:NAME) - Set up to search for the name as we
6858 collapse the stack*/
6859 popmark = ST.mark_name;
6860 } else {
6861 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6862 if (ST.mark_loc > startpoint)
6863 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6864 /* but we set sv_commit to latest mark_name if there
6865 is one so they can test to see how things lead to this
6866 cut */
6867 if (mark_state)
6868 sv_commit=mark_state->u.mark.mark_name;
6869 }
e2e6a0f1
YO
6870 no_final = 1;
6871 sayNO;
a74ff37d
JH
6872 /* NOTREACHED */
6873 assert(0);
e2e6a0f1 6874#undef ST
3c0563b9
DM
6875
6876 case LNBREAK: /* \R */
220db18a 6877 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 6878 locinput += n;
e1d1eefb
YO
6879 } else
6880 sayNO;
6881 break;
6882
a0d0e21e 6883 default:
b900a521 6884 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6885 PTR2UV(scan), OP(scan));
cea2e8a9 6886 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6887
6888 /* this is a point to jump to in order to increment
6889 * locinput by one character */
6890 increment_locinput:
e6ca698c 6891 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6892 if (utf8_target) {
6893 locinput += PL_utf8skip[nextchr];
7016d6eb 6894 /* locinput is allowed to go 1 char off the end, but not 2+ */
220db18a 6895 if (locinput > reginfo->strend)
28b98f76 6896 sayNO;
28b98f76
DM
6897 }
6898 else
3640db6b 6899 locinput++;
28b98f76 6900 break;
5d458dd8
YO
6901
6902 } /* end switch */
95b24440 6903
5d458dd8
YO
6904 /* switch break jumps here */
6905 scan = next; /* prepare to execute the next op and ... */
6906 continue; /* ... jump back to the top, reusing st */
a74ff37d
JH
6907 /* NOTREACHED */
6908 assert(0);
95b24440 6909
40a82448
DM
6910 push_yes_state:
6911 /* push a state that backtracks on success */
6912 st->u.yes.prev_yes_state = yes_state;
6913 yes_state = st;
924ba076 6914 /* FALLTHROUGH */
40a82448
DM
6915 push_state:
6916 /* push a new regex state, then continue at scan */
6917 {
6918 regmatch_state *newst;
6919
24b23f37
YO
6920 DEBUG_STACK_r({
6921 regmatch_state *cur = st;
6922 regmatch_state *curyes = yes_state;
6923 int curd = depth;
6924 regmatch_slab *slab = PL_regmatch_slab;
6925 for (;curd > -1;cur--,curd--) {
6926 if (cur < SLAB_FIRST(slab)) {
6927 slab = slab->prev;
6928 cur = SLAB_LAST(slab);
6929 }
6930 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6931 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6932 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6933 (curyes == cur) ? "yes" : ""
6934 );
6935 if (curyes == cur)
6936 curyes = cur->u.yes.prev_yes_state;
6937 }
6938 } else
6939 DEBUG_STATE_pp("push")
6940 );
40a82448 6941 depth++;
40a82448
DM
6942 st->locinput = locinput;
6943 newst = st+1;
6944 if (newst > SLAB_LAST(PL_regmatch_slab))
6945 newst = S_push_slab(aTHX);
6946 PL_regmatch_state = newst;
786e8c11 6947
4d5016e5 6948 locinput = pushinput;
40a82448
DM
6949 st = newst;
6950 continue;
a74ff37d
JH
6951 /* NOTREACHED */
6952 assert(0);
40a82448 6953 }
a0d0e21e 6954 }
a687059c 6955
a0d0e21e
LW
6956 /*
6957 * We get here only if there's trouble -- normally "case END" is
6958 * the terminating point.
6959 */
cea2e8a9 6960 Perl_croak(aTHX_ "corrupted regexp pointers");
a74ff37d 6961 /* NOTREACHED */
4633a7c4
LW
6962 sayNO;
6963
262b90c4 6964yes:
77cb431f
DM
6965 if (yes_state) {
6966 /* we have successfully completed a subexpression, but we must now
6967 * pop to the state marked by yes_state and continue from there */
77cb431f 6968 assert(st != yes_state);
5bc10b2c
DM
6969#ifdef DEBUGGING
6970 while (st != yes_state) {
6971 st--;
6972 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6973 PL_regmatch_slab = PL_regmatch_slab->prev;
6974 st = SLAB_LAST(PL_regmatch_slab);
6975 }
e2e6a0f1 6976 DEBUG_STATE_r({
54612592
YO
6977 if (no_final) {
6978 DEBUG_STATE_pp("pop (no final)");
6979 } else {
6980 DEBUG_STATE_pp("pop (yes)");
6981 }
e2e6a0f1 6982 });
5bc10b2c
DM
6983 depth--;
6984 }
6985#else
77cb431f
DM
6986 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6987 || yes_state > SLAB_LAST(PL_regmatch_slab))
6988 {
6989 /* not in this slab, pop slab */
6990 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6991 PL_regmatch_slab = PL_regmatch_slab->prev;
6992 st = SLAB_LAST(PL_regmatch_slab);
6993 }
6994 depth -= (st - yes_state);
5bc10b2c 6995#endif
77cb431f
DM
6996 st = yes_state;
6997 yes_state = st->u.yes.prev_yes_state;
6998 PL_regmatch_state = st;
24b23f37 6999
3640db6b 7000 if (no_final)
5d458dd8 7001 locinput= st->locinput;
54612592 7002 state_num = st->resume_state + no_final;
24d3c4a9 7003 goto reenter_switch;
77cb431f
DM
7004 }
7005
a3621e74 7006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 7007 PL_colors[4], PL_colors[5]));
02db2b7b 7008
bf2039a9 7009 if (reginfo->info_aux_eval) {
19b95bf0
DM
7010 /* each successfully executed (?{...}) block does the equivalent of
7011 * local $^R = do {...}
7012 * When popping the save stack, all these locals would be undone;
7013 * bypass this by setting the outermost saved $^R to the latest
7014 * value */
4b22688e
YO
7015 /* I dont know if this is needed or works properly now.
7016 * see code related to PL_replgv elsewhere in this file.
7017 * Yves
7018 */
19b95bf0
DM
7019 if (oreplsv != GvSV(PL_replgv))
7020 sv_setsv(oreplsv, GvSV(PL_replgv));
7021 }
95b24440 7022 result = 1;
aa283a38 7023 goto final_exit;
4633a7c4
LW
7024
7025no:
a3621e74 7026 DEBUG_EXECUTE_r(
7821416a 7027 PerlIO_printf(Perl_debug_log,
786e8c11 7028 "%*s %sfailed...%s\n",
5bc10b2c 7029 REPORT_CODE_OFF+depth*2, "",
786e8c11 7030 PL_colors[4], PL_colors[5])
7821416a 7031 );
aa283a38 7032
262b90c4 7033no_silent:
54612592
YO
7034 if (no_final) {
7035 if (yes_state) {
7036 goto yes;
7037 } else {
7038 goto final_exit;
7039 }
7040 }
aa283a38
DM
7041 if (depth) {
7042 /* there's a previous state to backtrack to */
40a82448
DM
7043 st--;
7044 if (st < SLAB_FIRST(PL_regmatch_slab)) {
7045 PL_regmatch_slab = PL_regmatch_slab->prev;
7046 st = SLAB_LAST(PL_regmatch_slab);
7047 }
7048 PL_regmatch_state = st;
40a82448 7049 locinput= st->locinput;
40a82448 7050
5bc10b2c
DM
7051 DEBUG_STATE_pp("pop");
7052 depth--;
262b90c4
DM
7053 if (yes_state == st)
7054 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 7055
24d3c4a9
DM
7056 state_num = st->resume_state + 1; /* failure = success + 1 */
7057 goto reenter_switch;
95b24440 7058 }
24d3c4a9 7059 result = 0;
aa283a38 7060
262b90c4 7061 final_exit:
bbe252da 7062 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
7063 SV *sv_err = get_sv("REGERROR", 1);
7064 SV *sv_mrk = get_sv("REGMARK", 1);
7065 if (result) {
e2e6a0f1 7066 sv_commit = &PL_sv_no;
5d458dd8
YO
7067 if (!sv_yes_mark)
7068 sv_yes_mark = &PL_sv_yes;
7069 } else {
7070 if (!sv_commit)
7071 sv_commit = &PL_sv_yes;
7072 sv_yes_mark = &PL_sv_no;
7073 }
316ebaf2
JH
7074 assert(sv_err);
7075 assert(sv_mrk);
5d458dd8
YO
7076 sv_setsv(sv_err, sv_commit);
7077 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 7078 }
19b95bf0 7079
81ed78b2
DM
7080
7081 if (last_pushed_cv) {
7082 dSP;
7083 POP_MULTICALL;
4f8dbb2d 7084 PERL_UNUSED_VAR(SP);
81ed78b2
DM
7085 }
7086
9d9163fb
DM
7087 assert(!result || locinput - reginfo->strbeg >= 0);
7088 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
7089}
7090
7091/*
7092 - regrepeat - repeatedly match something simple, report how many
d60de1d1 7093 *
e64f369d
KW
7094 * What 'simple' means is a node which can be the operand of a quantifier like
7095 * '+', or {1,3}
7096 *
d60de1d1
DM
7097 * startposp - pointer a pointer to the start position. This is updated
7098 * to point to the byte following the highest successful
7099 * match.
7100 * p - the regnode to be repeatedly matched against.
220db18a 7101 * reginfo - struct holding match state, such as strend
4063ade8 7102 * max - maximum number of things to match.
d60de1d1 7103 * depth - (for debugging) backtracking depth.
a687059c 7104 */
76e3520e 7105STATIC I32
272d35c9 7106S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
f9176b44 7107 regmatch_info *const reginfo, I32 max, int depth)
a687059c 7108{
4063ade8 7109 char *scan; /* Pointer to current position in target string */
eb578fdb 7110 I32 c;
220db18a 7111 char *loceol = reginfo->strend; /* local version */
4063ade8 7112 I32 hardcount = 0; /* How many matches so far */
ba44c216 7113 bool utf8_target = reginfo->is_utf8_target;
3018b823 7114 int to_complement = 0; /* Invert the result? */
d513472c 7115 UV utf8_flags;
3018b823 7116 _char_class_number classnum;
4f55667c
SP
7117#ifndef DEBUGGING
7118 PERL_UNUSED_ARG(depth);
7119#endif
a0d0e21e 7120
7918f24d
NC
7121 PERL_ARGS_ASSERT_REGREPEAT;
7122
f73aaa43 7123 scan = *startposp;
faf11cac
HS
7124 if (max == REG_INFTY)
7125 max = I32_MAX;
dfb8f192 7126 else if (! utf8_target && loceol - scan > max)
7f596f4c 7127 loceol = scan + max;
4063ade8
KW
7128
7129 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7130 * to the maximum of how far we should go in it (leaving it set to the real
7131 * end, if the maximum permissible would take us beyond that). This allows
7132 * us to make the loop exit condition that we haven't gone past <loceol> to
7133 * also mean that we haven't exceeded the max permissible count, saving a
7134 * test each time through the loop. But it assumes that the OP matches a
7135 * single byte, which is true for most of the OPs below when applied to a
7136 * non-UTF-8 target. Those relatively few OPs that don't have this
7137 * characteristic will have to compensate.
7138 *
7139 * There is no adjustment for UTF-8 targets, as the number of bytes per
7140 * character varies. OPs will have to test both that the count is less
7141 * than the max permissible (using <hardcount> to keep track), and that we
7142 * are still within the bounds of the string (using <loceol>. A few OPs
7143 * match a single byte no matter what the encoding. They can omit the max
7144 * test if, for the UTF-8 case, they do the adjustment that was skipped
7145 * above.
7146 *
7147 * Thus, the code above sets things up for the common case; and exceptional
7148 * cases need extra work; the common case is to make sure <scan> doesn't
7149 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7150 * count doesn't exceed the maximum permissible */
7151
a0d0e21e 7152 switch (OP(p)) {
22c35a8c 7153 case REG_ANY:
f2ed9b32 7154 if (utf8_target) {
1aa99e6b 7155 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
7156 scan += UTF8SKIP(scan);
7157 hardcount++;
7158 }
7159 } else {
7160 while (scan < loceol && *scan != '\n')
7161 scan++;
a0ed51b3
LW
7162 }
7163 break;
ffc61ed2 7164 case SANY:
f2ed9b32 7165 if (utf8_target) {
a0804c9e 7166 while (scan < loceol && hardcount < max) {
def8e4ea
JH
7167 scan += UTF8SKIP(scan);
7168 hardcount++;
7169 }
7170 }
7171 else
7172 scan = loceol;
a0ed51b3 7173 break;
4063ade8 7174 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
9597860a 7175 if (utf8_target && loceol - scan > max) {
4063ade8
KW
7176
7177 /* <loceol> hadn't been adjusted in the UTF-8 case */
7178 scan += max;
7179 }
7180 else {
7181 scan = loceol;
7182 }
f33976b4 7183 break;
59d32103 7184 case EXACT:
f9176b44 7185 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 7186
59d32103 7187 c = (U8)*STRING(p);
59d32103 7188
5e4a1da1
KW
7189 /* Can use a simple loop if the pattern char to match on is invariant
7190 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
7191 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7192 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 7193 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 7194 if (utf8_target && loceol - scan > max) {
4063ade8
KW
7195 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7196 * since here, to match at all, 1 char == 1 byte */
7197 loceol = scan + max;
7198 }
59d32103
KW
7199 while (scan < loceol && UCHARAT(scan) == c) {
7200 scan++;
7201 }
7202 }
f9176b44 7203 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
7204 if (utf8_target) {
7205 STRLEN scan_char_len;
5e4a1da1 7206
4063ade8 7207 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
7208 * string EQ */
7209 while (hardcount < max
9a902117
KW
7210 && scan < loceol
7211 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
7212 && memEQ(scan, STRING(p), scan_char_len))
7213 {
4200a00c 7214 scan += scan_char_len;
5e4a1da1
KW
7215 hardcount++;
7216 }
7217 }
7218 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 7219
5e4a1da1
KW
7220 /* Target isn't utf8; convert the character in the UTF-8
7221 * pattern to non-UTF8, and do a simple loop */
94bb8c36 7222 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
5e4a1da1
KW
7223 while (scan < loceol && UCHARAT(scan) == c) {
7224 scan++;
7225 }
7226 } /* else pattern char is above Latin1, can't possibly match the
7227 non-UTF-8 target */
b40a2c17 7228 }
5e4a1da1 7229 else {
59d32103 7230
5e4a1da1
KW
7231 /* Here, the string must be utf8; pattern isn't, and <c> is
7232 * different in utf8 than not, so can't compare them directly.
7233 * Outside the loop, find the two utf8 bytes that represent c, and
7234 * then look for those in sequence in the utf8 string */
59d32103
KW
7235 U8 high = UTF8_TWO_BYTE_HI(c);
7236 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
7237
7238 while (hardcount < max
7239 && scan + 1 < loceol
7240 && UCHARAT(scan) == high
7241 && UCHARAT(scan + 1) == low)
7242 {
7243 scan += 2;
7244 hardcount++;
7245 }
7246 }
7247 break;
5e4a1da1 7248
098b07d5
KW
7249 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
7250 assert(! reginfo->is_utf8_pat);
924ba076 7251 /* FALLTHROUGH */
2f7f8cb1 7252 case EXACTFA:
098b07d5 7253 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
7254 goto do_exactf;
7255
d4e0b827 7256 case EXACTFL:
cea315b6 7257 utf8_flags = FOLDEQ_LOCALE;
17580e7a
KW
7258 goto do_exactf;
7259
2fdb7295
KW
7260 case EXACTF: /* This node only generated for non-utf8 patterns */
7261 assert(! reginfo->is_utf8_pat);
098b07d5
KW
7262 utf8_flags = 0;
7263 goto do_exactf;
62bf7766 7264
3c760661 7265 case EXACTFU_SS:
9a5a5549 7266 case EXACTFU:
f9176b44 7267 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 7268
613a425d
KW
7269 do_exactf: {
7270 int c1, c2;
7271 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 7272
f9176b44 7273 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 7274
984e6dd1 7275 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 7276 reginfo))
984e6dd1 7277 {
613a425d 7278 if (c1 == CHRTEST_VOID) {
49b95fad 7279 /* Use full Unicode fold matching */
220db18a 7280 char *tmpeol = reginfo->strend;
f9176b44 7281 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
7282 while (hardcount < max
7283 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7284 STRING(p), NULL, pat_len,
f9176b44 7285 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
7286 {
7287 scan = tmpeol;
220db18a 7288 tmpeol = reginfo->strend;
49b95fad
KW
7289 hardcount++;
7290 }
613a425d
KW
7291 }
7292 else if (utf8_target) {
7293 if (c1 == c2) {
4063ade8
KW
7294 while (scan < loceol
7295 && hardcount < max
613a425d
KW
7296 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7297 {
7298 scan += UTF8SKIP(scan);
7299 hardcount++;
7300 }
7301 }
7302 else {
4063ade8
KW
7303 while (scan < loceol
7304 && hardcount < max
613a425d
KW
7305 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7306 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7307 {
7308 scan += UTF8SKIP(scan);
7309 hardcount++;
7310 }
7311 }
7312 }
7313 else if (c1 == c2) {
7314 while (scan < loceol && UCHARAT(scan) == c1) {
7315 scan++;
7316 }
7317 }
7318 else {
7319 while (scan < loceol &&
7320 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7321 {
7322 scan++;
7323 }
7324 }
634c83a2 7325 }
bbce6d69 7326 break;
613a425d 7327 }
a0d0e21e 7328 case ANYOF:
e0193e47 7329 if (utf8_target) {
4e8910e0 7330 while (hardcount < max
9a902117 7331 && scan < loceol
3db24e1e 7332 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
4e8910e0 7333 {
9a902117 7334 scan += UTF8SKIP(scan);
ffc61ed2
JH
7335 hardcount++;
7336 }
7337 } else {
32fc9b6a 7338 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
7339 scan++;
7340 }
a0d0e21e 7341 break;
4063ade8 7342
3018b823 7343 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 7344
3018b823
KW
7345 case NPOSIXL:
7346 to_complement = 1;
7347 /* FALLTHROUGH */
980866de 7348
3018b823 7349 case POSIXL:
3018b823
KW
7350 if (! utf8_target) {
7351 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7352 *scan)))
a12cf05f 7353 {
3018b823
KW
7354 scan++;
7355 }
7356 } else {
7357 while (hardcount < max && scan < loceol
7358 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7359 (U8 *) scan)))
7360 {
7361 scan += UTF8SKIP(scan);
ffc61ed2
JH
7362 hardcount++;
7363 }
a0ed51b3
LW
7364 }
7365 break;
0658cdde 7366
3018b823
KW
7367 case POSIXD:
7368 if (utf8_target) {
7369 goto utf8_posix;
7370 }
7371 /* FALLTHROUGH */
7372
0658cdde 7373 case POSIXA:
0430522f 7374 if (utf8_target && loceol - scan > max) {
4063ade8 7375
7aee35ff
KW
7376 /* We didn't adjust <loceol> at the beginning of this routine
7377 * because is UTF-8, but it is actually ok to do so, since here, to
7378 * match, 1 char == 1 byte. */
4063ade8
KW
7379 loceol = scan + max;
7380 }
7381 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
7382 scan++;
7383 }
7384 break;
980866de 7385
3018b823
KW
7386 case NPOSIXD:
7387 if (utf8_target) {
7388 to_complement = 1;
7389 goto utf8_posix;
7390 }
924ba076 7391 /* FALLTHROUGH */
980866de 7392
3018b823
KW
7393 case NPOSIXA:
7394 if (! utf8_target) {
7395 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
7396 scan++;
7397 }
4063ade8 7398 }
3018b823 7399 else {
980866de 7400
3018b823 7401 /* The complement of something that matches only ASCII matches all
837226c8 7402 * non-ASCII, plus everything in ASCII that isn't in the class. */
bedac28b 7403 while (hardcount < max && scan < loceol
837226c8 7404 && (! isASCII_utf8(scan)
3018b823 7405 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 7406 {
3018b823 7407 scan += UTF8SKIP(scan);
ffc61ed2
JH
7408 hardcount++;
7409 }
3018b823
KW
7410 }
7411 break;
980866de 7412
3018b823
KW
7413 case NPOSIXU:
7414 to_complement = 1;
7415 /* FALLTHROUGH */
7416
7417 case POSIXU:
7418 if (! utf8_target) {
7419 while (scan < loceol && to_complement
7420 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 7421 {
3018b823
KW
7422 scan++;
7423 }
cfaf538b
KW
7424 }
7425 else {
3018b823
KW
7426 utf8_posix:
7427 classnum = (_char_class_number) FLAGS(p);
7428 if (classnum < _FIRST_NON_SWASH_CC) {
7429
7430 /* Here, a swash is needed for above-Latin1 code points.
7431 * Process as many Latin1 code points using the built-in rules.
7432 * Go to another loop to finish processing upon encountering
7433 * the first Latin1 code point. We could do that in this loop
7434 * as well, but the other way saves having to test if the swash
7435 * has been loaded every time through the loop: extra space to
7436 * save a test. */
7437 while (hardcount < max && scan < loceol) {
7438 if (UTF8_IS_INVARIANT(*scan)) {
7439 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7440 classnum))))
7441 {
7442 break;
7443 }
7444 scan++;
7445 }
7446 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7447 if (! (to_complement
94bb8c36
KW
7448 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7449 *(scan + 1)),
3018b823
KW
7450 classnum))))
7451 {
7452 break;
7453 }
7454 scan += 2;
7455 }
7456 else {
7457 goto found_above_latin1;
7458 }
7459
7460 hardcount++;
7461 }
7462 }
7463 else {
7464 /* For these character classes, the knowledge of how to handle
7465 * every code point is compiled in to Perl via a macro. This
7466 * code is written for making the loops as tight as possible.
7467 * It could be refactored to save space instead */
7468 switch (classnum) {
7469 case _CC_ENUM_SPACE: /* XXX would require separate code
7470 if we revert the change of \v
7471 matching this */
924ba076 7472 /* FALLTHROUGH */
3018b823
KW
7473 case _CC_ENUM_PSXSPC:
7474 while (hardcount < max
7475 && scan < loceol
7476 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7477 {
7478 scan += UTF8SKIP(scan);
7479 hardcount++;
7480 }
7481 break;
7482 case _CC_ENUM_BLANK:
7483 while (hardcount < max
7484 && scan < loceol
7485 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7486 {
7487 scan += UTF8SKIP(scan);
7488 hardcount++;
7489 }
7490 break;
7491 case _CC_ENUM_XDIGIT:
7492 while (hardcount < max
7493 && scan < loceol
7494 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7495 {
7496 scan += UTF8SKIP(scan);
7497 hardcount++;
7498 }
7499 break;
7500 case _CC_ENUM_VERTSPACE:
7501 while (hardcount < max
7502 && scan < loceol
7503 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7504 {
7505 scan += UTF8SKIP(scan);
7506 hardcount++;
7507 }
7508 break;
7509 case _CC_ENUM_CNTRL:
7510 while (hardcount < max
7511 && scan < loceol
7512 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7513 {
7514 scan += UTF8SKIP(scan);
7515 hardcount++;
7516 }
7517 break;
7518 default:
7519 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7520 }
7521 }
a0ed51b3 7522 }
3018b823 7523 break;
4063ade8 7524
3018b823
KW
7525 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7526
7527 /* Load the swash if not already present */
7528 if (! PL_utf8_swash_ptrs[classnum]) {
7529 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7530 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
2a16ac92
KW
7531 "utf8",
7532 "",
7533 &PL_sv_undef, 1, 0,
7534 PL_XPosix_ptrs[classnum], &flags);
4063ade8 7535 }
3018b823
KW
7536
7537 while (hardcount < max && scan < loceol
7538 && to_complement ^ cBOOL(_generic_utf8(
7539 classnum,
7540 scan,
7541 swash_fetch(PL_utf8_swash_ptrs[classnum],
7542 (U8 *) scan,
7543 TRUE))))
7544 {
7545 scan += UTF8SKIP(scan);
7546 hardcount++;
7547 }
7548 break;
7549
e1d1eefb 7550 case LNBREAK:
e64f369d
KW
7551 if (utf8_target) {
7552 while (hardcount < max && scan < loceol &&
7553 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7554 scan += c;
7555 hardcount++;
7556 }
7557 } else {
7558 /* LNBREAK can match one or two latin chars, which is ok, but we
7559 * have to use hardcount in this situation, and throw away the
7560 * adjustment to <loceol> done before the switch statement */
220db18a 7561 loceol = reginfo->strend;
e64f369d
KW
7562 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7563 scan+=c;
7564 hardcount++;
7565 }
7566 }
7567 break;
e1d1eefb 7568
584b1f02
KW
7569 case BOUND:
7570 case BOUNDA:
7571 case BOUNDL:
7572 case BOUNDU:
7573 case EOS:
7574 case GPOS:
7575 case KEEPS:
7576 case NBOUND:
7577 case NBOUNDA:
7578 case NBOUNDL:
7579 case NBOUNDU:
7580 case OPFAIL:
7581 case SBOL:
7582 case SEOL:
7583 /* These are all 0 width, so match right here or not at all. */
7584 break;
7585
7586 default:
7587 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
a74ff37d
JH
7588 /* NOTREACHED */
7589 assert(0);
584b1f02 7590
a0d0e21e 7591 }
a687059c 7592
a0ed51b3
LW
7593 if (hardcount)
7594 c = hardcount;
7595 else
f73aaa43
DM
7596 c = scan - *startposp;
7597 *startposp = scan;
a687059c 7598
a3621e74 7599 DEBUG_r({
e68ec53f 7600 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7601 DEBUG_EXECUTE_r({
e68ec53f 7602 SV * const prop = sv_newmortal();
2395827c 7603 regprop(prog, prop, p, reginfo);
e68ec53f 7604 PerlIO_printf(Perl_debug_log,
be8e71aa 7605 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7606 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7607 });
be8e71aa 7608 });
9041c2e3 7609
a0d0e21e 7610 return(c);
a687059c
LW
7611}
7612
c277df42 7613
be8e71aa 7614#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7615/*
6c6525b8 7616- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7617create a copy so that changes the caller makes won't change the shared one.
7618If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7619 */
ffc61ed2 7620SV *
5aaab254 7621Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7622{
6c6525b8 7623 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7624
7625 if (altsvp) {
7626 *altsvp = NULL;
7627 }
7628
ef9bc832 7629 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
6c6525b8 7630}
6c6525b8 7631
3e63bed3 7632#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
ffc61ed2
JH
7633
7634/*
ba7b4546 7635 - reginclass - determine if a character falls into a character class
832705d4 7636
6698fab5
KW
7637 n is the ANYOF regnode
7638 p is the target string
3db24e1e 7639 p_end points to one byte beyond the end of the target string
6698fab5 7640 utf8_target tells whether p is in UTF-8.
832705d4 7641
635cd5d4 7642 Returns true if matched; false otherwise.
eba1359e 7643
d5788240
KW
7644 Note that this can be a synthetic start class, a combination of various
7645 nodes, so things you think might be mutually exclusive, such as locale,
7646 aren't. It can match both locale and non-locale
7647
bbce6d69 7648 */
7649
76e3520e 7650STATIC bool
3db24e1e 7651S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
bbce6d69 7652{
27da23d5 7653 dVAR;
a3b680e6 7654 const char flags = ANYOF_FLAGS(n);
bbce6d69 7655 bool match = FALSE;
cc07378b 7656 UV c = *p;
1aa99e6b 7657
7918f24d
NC
7658 PERL_ARGS_ASSERT_REGINCLASS;
7659
afd2eb18
KW
7660 /* If c is not already the code point, get it. Note that
7661 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7662 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7663 STRLEN c_len = 0;
3db24e1e 7664 c = utf8n_to_uvchr(p, p_end - p, &c_len,
6182169b
KW
7665 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7666 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7667 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7668 * UTF8_ALLOW_FFFF */
f7ab54c6 7669 if (c_len == (STRLEN)-1)
e8a70c6f 7670 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7671 }
4b3cda86 7672
7cdde544 7673 /* If this character is potentially in the bitmap, check it */
dcb20b36 7674 if (c < NUM_ANYOF_CODE_POINTS) {
ffc61ed2
JH
7675 if (ANYOF_BITMAP_TEST(n, c))
7676 match = TRUE;
93e92956
KW
7677 else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
7678 && ! utf8_target
7679 && ! isASCII(c))
11454c59
KW
7680 {
7681 match = TRUE;
7682 }
1462525b 7683 else if (flags & ANYOF_LOCALE_FLAGS) {
6942fd9a 7684 if ((flags & ANYOF_LOC_FOLD)
e0a1ff7a 7685 && c < 256
6942fd9a
KW
7686 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7687 {
7688 match = TRUE;
b99851e1 7689 }
e0a1ff7a
KW
7690 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
7691 && c < 256
7692 ) {
31c7f561
KW
7693
7694 /* The data structure is arranged so bits 0, 2, 4, ... are set
7695 * if the class includes the Posix character class given by
7696 * bit/2; and 1, 3, 5, ... are set if the class includes the
7697 * complemented Posix class given by int(bit/2). So we loop
7698 * through the bits, each time changing whether we complement
7699 * the result or not. Suppose for the sake of illustration
7700 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7701 * is set, it means there is a match for this ANYOF node if the
7702 * character is in the class given by the expression (0 / 2 = 0
7703 * = \w). If it is in that class, isFOO_lc() will return 1,
7704 * and since 'to_complement' is 0, the result will stay TRUE,
7705 * and we exit the loop. Suppose instead that bit 0 is 0, but
7706 * bit 1 is 1. That means there is a match if the character
7707 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7708 * but will on bit 1. On the second iteration 'to_complement'
7709 * will be 1, so the exclusive or will reverse things, so we
7710 * are testing for \W. On the third iteration, 'to_complement'
7711 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7712 * iteration would test for \S, etc.
7713 *
7714 * Note that this code assumes that all the classes are closed
7715 * under folding. For example, if a character matches \w, then
7716 * its fold does too; and vice versa. This should be true for
7717 * any well-behaved locale for all the currently defined Posix
7718 * classes, except for :lower: and :upper:, which are handled
7719 * by the pseudo-class :cased: which matches if either of the
7720 * other two does. To get rid of this assumption, an outer
7721 * loop could be used below to iterate over both the source
7722 * character, and its fold (if different) */
31c7f561
KW
7723
7724 int count = 0;
7725 int to_complement = 0;
522b3c1e 7726
31c7f561 7727 while (count < ANYOF_MAX) {
8efd3f97 7728 if (ANYOF_POSIXL_TEST(n, count)
31c7f561
KW
7729 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7730 {
7731 match = TRUE;
7732 break;
7733 }
7734 count++;
7735 to_complement ^= 1;
7736 }
ffc61ed2 7737 }
a0ed51b3 7738 }
a0ed51b3
LW
7739 }
7740
31f05a37 7741
7cdde544 7742 /* If the bitmap didn't (or couldn't) match, and something outside the
3b04b210 7743 * bitmap could match, try that. */
ef87b810 7744 if (!match) {
93e92956
KW
7745 if (c >= NUM_ANYOF_CODE_POINTS
7746 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7747 {
7748 match = TRUE; /* Everything above the bitmap matches */
e051a21d 7749 }
93e92956
KW
7750 else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
7751 || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
1ee208c4
KW
7752 || ((flags & ANYOF_LOC_FOLD)
7753 && IN_UTF8_CTYPE_LOCALE
93e92956 7754 && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
3b04b210 7755 {
1ee208c4
KW
7756 SV* only_utf8_locale = NULL;
7757 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
ef9bc832 7758 &only_utf8_locale, NULL);
7cdde544 7759 if (sw) {
893ef8be 7760 U8 utf8_buffer[2];
7cdde544
KW
7761 U8 * utf8_p;
7762 if (utf8_target) {
7763 utf8_p = (U8 *) p;
e0193e47 7764 } else { /* Convert to utf8 */
893ef8be
KW
7765 utf8_p = utf8_buffer;
7766 append_utf8_from_native_byte(*p, &utf8_p);
7767 utf8_p = utf8_buffer;
7cdde544 7768 }
f56b6394 7769
e0193e47 7770 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7771 match = TRUE;
e0193e47 7772 }
7cdde544 7773 }
1ee208c4
KW
7774 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7775 match = _invlist_contains_cp(only_utf8_locale, c);
7776 }
7cdde544 7777 }
5073ffbd
KW
7778
7779 if (UNICODE_IS_SUPER(c)
ae986089 7780 && (flags & ANYOF_WARN_SUPER)
5073ffbd
KW
7781 && ckWARN_d(WARN_NON_UNICODE))
7782 {
7783 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2d88a86a 7784 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
5073ffbd 7785 }
7cdde544
KW
7786 }
7787
5dbb0c08
KW
7788#if ANYOF_INVERT != 1
7789 /* Depending on compiler optimization cBOOL takes time, so if don't have to
7790 * use it, don't */
7791# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7792#endif
7793
f0fdc1c9 7794 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
5dbb0c08 7795 return (flags & ANYOF_INVERT) ^ match;
a0ed51b3 7796}
161b471a 7797
dfe13c55 7798STATIC U8 *
ea3daa5d 7799S_reghop3(U8 *s, SSize_t off, const U8* lim)
9041c2e3 7800{
6af86488
KW
7801 /* return the position 'off' UTF-8 characters away from 's', forward if
7802 * 'off' >= 0, backwards if negative. But don't go outside of position
7803 * 'lim', which better be < s if off < 0 */
7804
7918f24d
NC
7805 PERL_ARGS_ASSERT_REGHOP3;
7806
a0ed51b3 7807 if (off >= 0) {
1aa99e6b 7808 while (off-- && s < lim) {
ffc61ed2 7809 /* XXX could check well-formedness here */
a0ed51b3 7810 s += UTF8SKIP(s);
ffc61ed2 7811 }
a0ed51b3
LW
7812 }
7813 else {
1de06328
YO
7814 while (off++ && s > lim) {
7815 s--;
7816 if (UTF8_IS_CONTINUED(*s)) {
7817 while (s > lim && UTF8_IS_CONTINUATION(*s))
7818 s--;
a0ed51b3 7819 }
1de06328 7820 /* XXX could check well-formedness here */
a0ed51b3
LW
7821 }
7822 }
7823 return s;
7824}
161b471a 7825
dfe13c55 7826STATIC U8 *
ea3daa5d 7827S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
1de06328 7828{
7918f24d
NC
7829 PERL_ARGS_ASSERT_REGHOP4;
7830
1de06328
YO
7831 if (off >= 0) {
7832 while (off-- && s < rlim) {
7833 /* XXX could check well-formedness here */
7834 s += UTF8SKIP(s);
7835 }
7836 }
7837 else {
7838 while (off++ && s > llim) {
7839 s--;
7840 if (UTF8_IS_CONTINUED(*s)) {
7841 while (s > llim && UTF8_IS_CONTINUATION(*s))
7842 s--;
7843 }
7844 /* XXX could check well-formedness here */
7845 }
7846 }
7847 return s;
7848}
1de06328 7849
557f47af
DM
7850/* like reghop3, but returns NULL on overrun, rather than returning last
7851 * char pos */
7852
1de06328 7853STATIC U8 *
ea3daa5d 7854S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
a0ed51b3 7855{
7918f24d
NC
7856 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7857
a0ed51b3 7858 if (off >= 0) {
1aa99e6b 7859 while (off-- && s < lim) {
ffc61ed2 7860 /* XXX could check well-formedness here */
a0ed51b3 7861 s += UTF8SKIP(s);
ffc61ed2 7862 }
a0ed51b3 7863 if (off >= 0)
3dab1dad 7864 return NULL;
a0ed51b3
LW
7865 }
7866 else {
1de06328
YO
7867 while (off++ && s > lim) {
7868 s--;
7869 if (UTF8_IS_CONTINUED(*s)) {
7870 while (s > lim && UTF8_IS_CONTINUATION(*s))
7871 s--;
a0ed51b3 7872 }
1de06328 7873 /* XXX could check well-formedness here */
a0ed51b3
LW
7874 }
7875 if (off <= 0)
3dab1dad 7876 return NULL;
a0ed51b3
LW
7877 }
7878 return s;
7879}
51371543 7880
a75351a1
DM
7881
7882/* when executing a regex that may have (?{}), extra stuff needs setting
7883 up that will be visible to the called code, even before the current
7884 match has finished. In particular:
7885
7886 * $_ is localised to the SV currently being matched;
7887 * pos($_) is created if necessary, ready to be updated on each call-out
7888 to code;
7889 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7890 isn't set until the current pattern is successfully finished), so that
7891 $1 etc of the match-so-far can be seen;
7892 * save the old values of subbeg etc of the current regex, and set then
7893 to the current string (again, this is normally only done at the end
7894 of execution)
a75351a1
DM
7895*/
7896
7897static void
7898S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7899{
7900 MAGIC *mg;
7901 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 7902 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 7903
8adc0f72 7904 eval_state->rex = rex;
a75351a1 7905
a75351a1
DM
7906 if (reginfo->sv) {
7907 /* Make $_ available to executed code. */
7908 if (reginfo->sv != DEFSV) {
7909 SAVE_DEFSV;
7910 DEFSV_set(reginfo->sv);
7911 }
7912
96c2a8ff 7913 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 7914 /* prepare for quick setting of pos */
96c2a8ff 7915 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
7916 mg->mg_len = -1;
7917 }
8adc0f72
DM
7918 eval_state->pos_magic = mg;
7919 eval_state->pos = mg->mg_len;
25fdce4a 7920 eval_state->pos_flags = mg->mg_flags;
a75351a1 7921 }
8adc0f72
DM
7922 else
7923 eval_state->pos_magic = NULL;
7924
a75351a1 7925 if (!PL_reg_curpm) {
f65e70f5
DM
7926 /* PL_reg_curpm is a fake PMOP that we can attach the current
7927 * regex to and point PL_curpm at, so that $1 et al are visible
7928 * within a /(?{})/. It's just allocated once per interpreter the
7929 * first time its needed */
a75351a1
DM
7930 Newxz(PL_reg_curpm, 1, PMOP);
7931#ifdef USE_ITHREADS
7932 {
7933 SV* const repointer = &PL_sv_undef;
7934 /* this regexp is also owned by the new PL_reg_curpm, which
7935 will try to free it. */
7936 av_push(PL_regex_padav, repointer);
b9f2b683 7937 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
a75351a1
DM
7938 PL_regex_pad = AvARRAY(PL_regex_padav);
7939 }
7940#endif
7941 }
7942 SET_reg_curpm(reginfo->prog);
8adc0f72 7943 eval_state->curpm = PL_curpm;
a75351a1
DM
7944 PL_curpm = PL_reg_curpm;
7945 if (RXp_MATCH_COPIED(rex)) {
7946 /* Here is a serious problem: we cannot rewrite subbeg,
7947 since it may be needed if this match fails. Thus
7948 $` inside (?{}) could fail... */
8adc0f72
DM
7949 eval_state->subbeg = rex->subbeg;
7950 eval_state->sublen = rex->sublen;
7951 eval_state->suboffset = rex->suboffset;
a8ee055f 7952 eval_state->subcoffset = rex->subcoffset;
a75351a1 7953#ifdef PERL_ANY_COW
8adc0f72 7954 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
7955#endif
7956 RXp_MATCH_COPIED_off(rex);
7957 }
7958 else
8adc0f72 7959 eval_state->subbeg = NULL;
a75351a1
DM
7960 rex->subbeg = (char *)reginfo->strbeg;
7961 rex->suboffset = 0;
7962 rex->subcoffset = 0;
7963 rex->sublen = reginfo->strend - reginfo->strbeg;
7964}
7965
bf2039a9
DM
7966
7967/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 7968
51371543 7969static void
bf2039a9 7970S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 7971{
bf2039a9
DM
7972 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7973 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 7974 regmatch_slab *s;
bf2039a9 7975
2ac8ff4b
DM
7976 Safefree(aux->poscache);
7977
331b2dcc 7978 if (eval_state) {
bf2039a9 7979
331b2dcc 7980 /* undo the effects of S_setup_eval_state() */
bf2039a9 7981
331b2dcc
DM
7982 if (eval_state->subbeg) {
7983 regexp * const rex = eval_state->rex;
7984 rex->subbeg = eval_state->subbeg;
7985 rex->sublen = eval_state->sublen;
7986 rex->suboffset = eval_state->suboffset;
7987 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 7988#ifdef PERL_ANY_COW
331b2dcc 7989 rex->saved_copy = eval_state->saved_copy;
ed252734 7990#endif
331b2dcc
DM
7991 RXp_MATCH_COPIED_on(rex);
7992 }
7993 if (eval_state->pos_magic)
25fdce4a 7994 {
331b2dcc 7995 eval_state->pos_magic->mg_len = eval_state->pos;
25fdce4a
FC
7996 eval_state->pos_magic->mg_flags =
7997 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
7998 | (eval_state->pos_flags & MGf_BYTES);
7999 }
331b2dcc
DM
8000
8001 PL_curpm = eval_state->curpm;
8adc0f72 8002 }
bf2039a9 8003
331b2dcc
DM
8004 PL_regmatch_state = aux->old_regmatch_state;
8005 PL_regmatch_slab = aux->old_regmatch_slab;
8006
8007 /* free all slabs above current one - this must be the last action
8008 * of this function, as aux and eval_state are allocated within
8009 * slabs and may be freed here */
8010
8011 s = PL_regmatch_slab->next;
8012 if (s) {
8013 PL_regmatch_slab->next = NULL;
8014 while (s) {
8015 regmatch_slab * const osl = s;
8016 s = s->next;
8017 Safefree(osl);
8018 }
8019 }
51371543 8020}
33b8afdf 8021
8adc0f72 8022
33b8afdf 8023STATIC void
5aaab254 8024S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 8025{
7e0d5ad7
KW
8026 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8027 * on the converted value */
8028
a1cac82e 8029 int i = 1;
7918f24d
NC
8030
8031 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8032
a1cac82e
NC
8033 do {
8034 if (prog->substrs->data[i].substr
8035 && !prog->substrs->data[i].utf8_substr) {
8036 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8037 prog->substrs->data[i].utf8_substr = sv;
8038 sv_utf8_upgrade(sv);
610460f9 8039 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 8040 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
8041 /* Trim the trailing \n that fbm_compile added last
8042 time. */
8043 SvCUR_set(sv, SvCUR(sv) - 1);
8044 /* Whilst this makes the SV technically "invalid" (as its
8045 buffer is no longer followed by "\0") when fbm_compile()
8046 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
8047 fbm_compile(sv, FBMcf_TAIL);
8048 } else
8049 fbm_compile(sv, 0);
610460f9 8050 }
a1cac82e
NC
8051 if (prog->substrs->data[i].substr == prog->check_substr)
8052 prog->check_utf8 = sv;
8053 }
8054 } while (i--);
33b8afdf
JH
8055}
8056
7e0d5ad7 8057STATIC bool
5aaab254 8058S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 8059{
7e0d5ad7
KW
8060 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8061 * on the converted value; returns FALSE if can't be converted. */
8062
a1cac82e 8063 int i = 1;
7918f24d
NC
8064
8065 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8066
a1cac82e
NC
8067 do {
8068 if (prog->substrs->data[i].utf8_substr
8069 && !prog->substrs->data[i].substr) {
8070 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
8071 if (! sv_utf8_downgrade(sv, TRUE)) {
8072 return FALSE;
8073 }
5400f398
KW
8074 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8075 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8076 /* Trim the trailing \n that fbm_compile added last
8077 time. */
8078 SvCUR_set(sv, SvCUR(sv) - 1);
8079 fbm_compile(sv, FBMcf_TAIL);
8080 } else
8081 fbm_compile(sv, 0);
8082 }
a1cac82e
NC
8083 prog->substrs->data[i].substr = sv;
8084 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8085 prog->check_substr = sv;
33b8afdf 8086 }
a1cac82e 8087 } while (i--);
7e0d5ad7
KW
8088
8089 return TRUE;
33b8afdf 8090}
66610fdd
RGS
8091
8092/*
8093 * Local variables:
8094 * c-indentation-style: bsd
8095 * c-basic-offset: 4
14d04a33 8096 * indent-tabs-mode: nil
66610fdd
RGS
8097 * End:
8098 *
14d04a33 8099 * ex: set ts=8 sts=4 sw=4 et:
37442d52 8100 */