This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re_intuit_start(): indent rest of check block
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 121 : (U8*)(pos + off))
557f47af 122
3dab1dad 123#define HOPBACKc(pos, off) \
ba44c216 124 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
126 : (pos - off >= reginfo->strbeg) \
8e11feef 127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
ba44c216 130#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 131#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 132
557f47af
DM
133/* lim must be +ve. Returns NULL on overshoot */
134#define HOPMAYBE3(pos,off,lim) \
135 (reginfo->is_utf8_target \
136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
137 : ((U8*)pos + off <= lim) \
138 ? (U8*)pos + off \
139 : NULL)
140
8e9f2289
DM
141/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
142 * off must be >=0; args should be vars rather than expressions */
143#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
145 : (U8*)((pos + off) > lim ? lim : (pos + off)))
146
2974eaec
DM
147#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
149 : (U8*)(pos + off))
150#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
151
152#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
153#define NEXTCHR_IS_EOS (nextchr < 0)
154
155#define SET_nextchr \
220db18a 156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
157
158#define SET_locinput(p) \
159 locinput = (p); \
160 SET_nextchr
161
162
2a16ac92 163#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
164 if (!swash_ptr) { \
165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 167 1, 0, invlist, &flags); \
c7304fe2
KW
168 assert(swash_ptr); \
169 } \
170 } STMT_END
171
172/* If in debug mode, we test that a known character properly matches */
173#ifdef DEBUGGING
174# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
175 property_name, \
2a16ac92 176 invlist, \
c7304fe2 177 utf8_char_in_property) \
2a16ac92 178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
180#else
181# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
182 property_name, \
2a16ac92 183 invlist, \
c7304fe2 184 utf8_char_in_property) \
2a16ac92 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 186#endif
d1eb3177 187
c7304fe2
KW
188#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
190 "", \
191 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
193
194#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
195 STMT_START { \
196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
197 "_X_regular_begin", \
2a16ac92 198 NULL, \
df38da56 199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
201 "_X_extend", \
2a16ac92 202 NULL, \
c7304fe2
KW
203 COMBINING_GRAVE_ACCENT_UTF8); \
204 } STMT_END
d1eb3177 205
c7304fe2 206#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
207/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
208
5f80c4cf 209/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
210/* it would be nice to rework regcomp.sym to generate this stuff. sigh
211 *
212 * NOTE that *nothing* that affects backtracking should be in here, specifically
213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
214 * node that is in between two EXACT like nodes when ascertaining what the required
215 * "follow" character is. This should probably be moved to regex compile time
216 * although it may be done at run time beause of the REF possibility - more
217 * investigation required. -- demerphq
218*/
baa60164
KW
219#define JUMPABLE(rn) ( \
220 OP(rn) == OPEN || \
3e901dc0 221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
222 OP(rn) == EVAL || \
223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
224 OP(rn) == PLUS || OP(rn) == MINMOD || \
225 OP(rn) == KEEPS || \
226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 227)
ee9b8eae 228#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 229
ee9b8eae
YO
230#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
231
232#if 0
233/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
234 we don't need this definition. */
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 236#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
237#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
238
239#else
240/* ... so we use this as its faster. */
241#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 242#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
243#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
244#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245
246#endif
e2d8ce26 247
a84d97b6
HS
248/*
249 Search for mandatory following text node; for lookahead, the text must
250 follow but for lookbehind (rn->flags != 0) we skip to the next step.
251*/
baa60164 252#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
253 while (JUMPABLE(rn)) { \
254 const OPCODE type = OP(rn); \
255 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 256 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 257 else if (type == PLUS) \
cca55fe3 258 rn = NEXTOPER(rn); \
3dab1dad 259 else if (type == IFMATCH) \
a84d97b6 260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 261 else rn += NEXT_OFF(rn); \
3dab1dad 262 } \
5f80c4cf 263} STMT_END
74750237 264
22913b96
KW
265/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
266 * These are for the pre-composed Hangul syllables, which are all in a
267 * contiguous block and arranged there in such a way so as to facilitate
268 * alorithmic determination of their characteristics. As such, they don't need
269 * a swash, but can be determined by simple arithmetic. Almost all are
270 * GCB=LVT, but every 28th one is a GCB=LV */
271#define SBASE 0xAC00 /* Start of block */
272#define SCount 11172 /* Length of block */
273#define TCount 28
c476f425 274
006f26b2
DM
275#define SLAB_FIRST(s) (&(s)->states[0])
276#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
277
a75351a1 278static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 279static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 280static regmatch_state * S_push_slab(pTHX);
51371543 281
87c0511b 282#define REGCP_PAREN_ELEMS 3
f067efbf 283#define REGCP_OTHER_ELEMS 3
e0fa7e2b 284#define REGCP_FRAME_ELEMS 1
620d5b66
NC
285/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
286 * are needed for the regexp context stack bookkeeping. */
287
76e3520e 288STATIC CHECKPOINT
92da3157 289S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 290{
97aff369 291 dVAR;
a3b680e6 292 const int retval = PL_savestack_ix;
92da3157
DM
293 const int paren_elems_to_push =
294 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
295 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
296 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 297 I32 p;
40a82448 298 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 299
b93070ed
DM
300 PERL_ARGS_ASSERT_REGCPPUSH;
301
e49a9654 302 if (paren_elems_to_push < 0)
d1b2014a
YO
303 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
304 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
e49a9654 305
e0fa7e2b
NC
306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 308 " out of range (%lu-%ld)",
92da3157
DM
309 total_elems,
310 (unsigned long)maxopenparen,
311 (long)parenfloor);
e0fa7e2b 312
620d5b66 313 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 314
495f47a5 315 DEBUG_BUFFERS_r(
92da3157 316 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
317 PerlIO_printf(Perl_debug_log,
318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
319 PTR2UV(rex),
320 PTR2UV(rex->offs)
321 );
322 );
92da3157 323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 324/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
325 SSPUSHIV(rex->offs[p].end);
326 SSPUSHIV(rex->offs[p].start);
1ca2007e 327 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
330 (UV)p,
331 (IV)rex->offs[p].start,
332 (IV)rex->offs[p].start_tmp,
333 (IV)rex->offs[p].end
40a82448 334 ));
a0d0e21e 335 }
b1ce53c5 336/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 337 SSPUSHINT(maxopenparen);
b93070ed
DM
338 SSPUSHINT(rex->lastparen);
339 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 341
a0d0e21e
LW
342 return retval;
343}
344
c277df42 345/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
346#define REGCP_SET(cp) \
347 DEBUG_STATE_r( \
ab3bbdeb 348 PerlIO_printf(Perl_debug_log, \
e4f74956 349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
350 (IV)PL_savestack_ix)); \
351 cp = PL_savestack_ix
c3464db5 352
ab3bbdeb 353#define REGCP_UNWIND(cp) \
e4f74956 354 DEBUG_STATE_r( \
ab3bbdeb 355 if (cp != PL_savestack_ix) \
e4f74956
YO
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
358 (IV)(cp), (IV)PL_savestack_ix)); \
359 regcpblow(cp)
c277df42 360
a8d1f4b4
DM
361#define UNWIND_PAREN(lp, lcp) \
362 for (n = rex->lastparen; n > lp; n--) \
363 rex->offs[n].end = -1; \
364 rex->lastparen = n; \
365 rex->lastcloseparen = lcp;
366
367
f067efbf 368STATIC void
92da3157 369S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 370{
97aff369 371 dVAR;
e0fa7e2b 372 UV i;
87c0511b 373 U32 paren;
a3621e74
YO
374 GET_RE_DEBUG_FLAGS_DECL;
375
7918f24d
NC
376 PERL_ARGS_ASSERT_REGCPPOP;
377
b1ce53c5 378 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 379 i = SSPOPUV;
e0fa7e2b
NC
380 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
381 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
382 rex->lastcloseparen = SSPOPINT;
383 rex->lastparen = SSPOPINT;
92da3157 384 *maxopenparen_p = SSPOPINT;
b1ce53c5 385
620d5b66 386 i -= REGCP_OTHER_ELEMS;
b1ce53c5 387 /* Now restore the parentheses context. */
495f47a5
DM
388 DEBUG_BUFFERS_r(
389 if (i || rex->lastparen + 1 <= rex->nparens)
390 PerlIO_printf(Perl_debug_log,
391 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
392 PTR2UV(rex),
393 PTR2UV(rex->offs)
394 );
395 );
92da3157 396 paren = *maxopenparen_p;
620d5b66 397 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 398 SSize_t tmps;
1ca2007e 399 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
400 rex->offs[paren].start = SSPOPIV;
401 tmps = SSPOPIV;
b93070ed
DM
402 if (paren <= rex->lastparen)
403 rex->offs[paren].end = tmps;
495f47a5
DM
404 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
405 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
406 (UV)paren,
407 (IV)rex->offs[paren].start,
408 (IV)rex->offs[paren].start_tmp,
409 (IV)rex->offs[paren].end,
410 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 411 );
87c0511b 412 paren--;
a0d0e21e 413 }
daf18116 414#if 1
dafc8851
JH
415 /* It would seem that the similar code in regtry()
416 * already takes care of this, and in fact it is in
417 * a better location to since this code can #if 0-ed out
418 * but the code in regtry() is needed or otherwise tests
419 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
420 * (as of patchlevel 7877) will fail. Then again,
421 * this code seems to be necessary or otherwise
225593e1
DM
422 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
423 * --jhi updated by dapm */
b93070ed 424 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 425 if (i > *maxopenparen_p)
b93070ed
DM
426 rex->offs[i].start = -1;
427 rex->offs[i].end = -1;
495f47a5
DM
428 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
429 " \\%"UVuf": %s ..-1 undeffing\n",
430 (UV)i,
92da3157 431 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 432 ));
a0d0e21e 433 }
dafc8851 434#endif
a0d0e21e
LW
435}
436
74088413
DM
437/* restore the parens and associated vars at savestack position ix,
438 * but without popping the stack */
439
440STATIC void
92da3157 441S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
442{
443 I32 tmpix = PL_savestack_ix;
444 PL_savestack_ix = ix;
92da3157 445 regcppop(rex, maxopenparen_p);
74088413
DM
446 PL_savestack_ix = tmpix;
447}
448
02db2b7b 449#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 450
31c7f561
KW
451STATIC bool
452S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
453{
454 /* Returns a boolean as to whether or not 'character' is a member of the
455 * Posix character class given by 'classnum' that should be equivalent to a
456 * value in the typedef '_char_class_number'.
457 *
458 * Ideally this could be replaced by a just an array of function pointers
459 * to the C library functions that implement the macros this calls.
460 * However, to compile, the precise function signatures are required, and
461 * these may vary from platform to to platform. To avoid having to figure
462 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
463 * which adds an extra layer of function call overhead (unless the C
464 * optimizer strips it away). But we don't particularly care about
465 * performance with locales anyway. */
31c7f561
KW
466
467 switch ((_char_class_number) classnum) {
15861f94 468 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 469 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
470 case _CC_ENUM_ASCII: return isASCII_LC(character);
471 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
472 case _CC_ENUM_CASED: return isLOWER_LC(character)
473 || isUPPER_LC(character);
e8d596e0 474 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
475 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
476 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
477 case _CC_ENUM_LOWER: return isLOWER_LC(character);
478 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 479 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 480 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 481 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
482 case _CC_ENUM_UPPER: return isUPPER_LC(character);
483 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 484 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
485 default: /* VERTSPACE should never occur in locales */
486 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
487 }
488
489 assert(0); /* NOTREACHED */
490 return FALSE;
491}
492
3018b823
KW
493STATIC bool
494S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
495{
496 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
497 * 'character' is a member of the Posix character class given by 'classnum'
498 * that should be equivalent to a value in the typedef
499 * '_char_class_number'.
500 *
501 * This just calls isFOO_lc on the code point for the character if it is in
502 * the range 0-255. Outside that range, all characters avoid Unicode
503 * rules, ignoring any locale. So use the Unicode function if this class
504 * requires a swash, and use the Unicode macro otherwise. */
505
506 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
507
508 if (UTF8_IS_INVARIANT(*character)) {
509 return isFOO_lc(classnum, *character);
510 }
511 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
512 return isFOO_lc(classnum,
94bb8c36 513 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
514 }
515
516 if (classnum < _FIRST_NON_SWASH_CC) {
517
518 /* Initialize the swash unless done already */
519 if (! PL_utf8_swash_ptrs[classnum]) {
520 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
521 PL_utf8_swash_ptrs[classnum] =
522 _core_swash_init("utf8",
523 "",
524 &PL_sv_undef, 1, 0,
525 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
526 }
527
92a2046b
KW
528 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
529 character,
530 TRUE /* is UTF */ ));
3018b823
KW
531 }
532
533 switch ((_char_class_number) classnum) {
534 case _CC_ENUM_SPACE:
535 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
536
537 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
538 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
539 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
540 default: return 0; /* Things like CNTRL are always
541 below 256 */
542 }
543
544 assert(0); /* NOTREACHED */
545 return FALSE;
546}
547
a687059c 548/*
e50aee73 549 * pregexec and friends
a687059c
LW
550 */
551
76234dfb 552#ifndef PERL_IN_XSUB_RE
a687059c 553/*
c277df42 554 - pregexec - match a regexp against a string
a687059c 555 */
c277df42 556I32
5aaab254 557Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 558 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
559/* stringarg: the point in the string at which to begin matching */
560/* strend: pointer to null at end of string */
561/* strbeg: real beginning of string */
562/* minend: end of match must be >= minend bytes after stringarg. */
563/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
564 * itself is accessed via the pointers above */
565/* nosave: For optimizations. */
c277df42 566{
7918f24d
NC
567 PERL_ARGS_ASSERT_PREGEXEC;
568
c277df42 569 return
9041c2e3 570 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
571 nosave ? 0 : REXEC_COPY_STR);
572}
76234dfb 573#endif
22e551b9 574
9041c2e3 575/*
cad2e5aa
JH
576 * Need to implement the following flags for reg_anch:
577 *
578 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
579 * USE_INTUIT_ML
580 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
581 * INTUIT_AUTORITATIVE_ML
582 * INTUIT_ONCE_NOML - Intuit can match in one location only.
583 * INTUIT_ONCE_ML
584 *
585 * Another flag for this function: SECOND_TIME (so that float substrs
586 * with giant delta may be not rechecked).
587 */
588
3f7c398e 589/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
590 Otherwise, only SvCUR(sv) is used to get strbeg. */
591
6eb5f6b9
JH
592/* XXXX Some places assume that there is a fixed substring.
593 An update may be needed if optimizer marks as "INTUITable"
594 RExen without fixed substrings. Similarly, it is assumed that
595 lengths of all the strings are no more than minlen, thus they
596 cannot come from lookahead.
40d049e4
YO
597 (Or minlen should take into account lookahead.)
598 NOTE: Some of this comment is not correct. minlen does now take account
599 of lookahead/behind. Further research is required. -- demerphq
600
601*/
6eb5f6b9 602
2c2d71f5
JH
603/* A failure to find a constant substring means that there is no need to make
604 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 605 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
606 regtry() should be needed.
607
608 REx compiler's optimizer found 4 possible hints:
609 a) Anchored substring;
610 b) Fixed substring;
611 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 612 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 613 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
614 string which does not contradict any of them.
615 */
2c2d71f5 616
6eb5f6b9
JH
617/* Most of decisions we do here should have been done at compile time.
618 The nodes of the REx which we used for the search should have been
619 deleted from the finite automaton. */
620
52a21eb3
DM
621/* args:
622 * rx: the regex to match against
623 * sv: the SV being matched: only used for utf8 flag; the string
624 * itself is accessed via the pointers below. Note that on
625 * something like an overloaded SV, SvPOK(sv) may be false
626 * and the string pointers may point to something unrelated to
627 * the SV itself.
628 * strbeg: real beginning of string
629 * strpos: the point in the string at which to begin matching
630 * strend: pointer to the byte following the last char of the string
631 * flags currently unused; set to 0
632 * data: currently unused; set to NULL
633 */
634
cad2e5aa 635char *
52a21eb3
DM
636Perl_re_intuit_start(pTHX_
637 REGEXP * const rx,
638 SV *sv,
639 const char * const strbeg,
640 char *strpos,
641 char *strend,
642 const U32 flags,
643 re_scream_pos_data *data)
cad2e5aa 644{
97aff369 645 dVAR;
8d919b0a 646 struct regexp *const prog = ReANY(rx);
6b071d16 647 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 648 /* Should be nonnegative! */
ea3daa5d 649 SSize_t end_shift = 0;
0fc004dd
DM
650 /* current lowest pos in string where the regex can start matching */
651 char *rx_origin = strpos;
eb578fdb 652 SV *check;
f2ed9b32 653 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 654 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 655 bool ml_anch = 0;
8f4bf5fc 656 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 657 char *check_at = NULL; /* check substr found at this pos */
bbe252da 658 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 659 RXi_GET_DECL(prog,progi);
02d5137b
DM
660 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
661 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
662 GET_RE_DEBUG_FLAGS_DECL;
663
7918f24d 664 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
665 PERL_UNUSED_ARG(flags);
666 PERL_UNUSED_ARG(data);
7918f24d 667
1dc475d0
DM
668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
669 "Intuit: trying to determine minimum start position...\n"));
670
fb9bbddb
DM
671 /* for now, assume that all substr offsets are positive. If at some point
672 * in the future someone wants to do clever things with look-behind and
673 * -ve offsets, they'll need to fix up any code in this function
674 * which uses these offsets. See the thread beginning
675 * <20140113145929.GF27210@iabyn.com>
676 */
677 assert(prog->substrs->data[0].min_offset >= 0);
678 assert(prog->substrs->data[0].max_offset >= 0);
679 assert(prog->substrs->data[1].min_offset >= 0);
680 assert(prog->substrs->data[1].max_offset >= 0);
681 assert(prog->substrs->data[2].min_offset >= 0);
682 assert(prog->substrs->data[2].max_offset >= 0);
683
f7022b5a 684 /* for now, assume that if both present, that the floating substring
83f2232d 685 * doesn't start before the anchored substring.
f7022b5a
DM
686 * If you break this assumption (e.g. doing better optimisations
687 * with lookahead/behind), then you'll need to audit the code in this
688 * function carefully first
689 */
690 assert(
691 ! ( (prog->anchored_utf8 || prog->anchored_substr)
692 && (prog->float_utf8 || prog->float_substr))
693 || (prog->float_min_offset >= prog->anchored_offset));
694
c344f387
JH
695 /* CHR_DIST() would be more correct here but it makes things slow. */
696 if (prog->minlen > strend - strpos) {
a3621e74 697 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 698 " String too short...\n"));
cad2e5aa 699 goto fail;
2c2d71f5 700 }
d8da0584 701
6c3fea77 702 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 703 reginfo->info_aux = NULL;
9d9163fb 704 reginfo->strbeg = strbeg;
220db18a 705 reginfo->strend = strend;
aed7b151 706 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 707 reginfo->intuit = 1;
1cb48e53
DM
708 /* not actually used within intuit, but zero for safety anyway */
709 reginfo->poscache_maxiter = 0;
02d5137b 710
f2ed9b32 711 if (utf8_target) {
33b8afdf
JH
712 if (!prog->check_utf8 && prog->check_substr)
713 to_utf8_substr(prog);
714 check = prog->check_utf8;
715 } else {
7e0d5ad7
KW
716 if (!prog->check_substr && prog->check_utf8) {
717 if (! to_byte_substr(prog)) {
6b54ddc5 718 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
719 }
720 }
33b8afdf
JH
721 check = prog->check_substr;
722 }
274cd312 723
1dc475d0
DM
724 /* dump the various substring data */
725 DEBUG_OPTIMISE_MORE_r({
726 int i;
727 for (i=0; i<=2; i++) {
728 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
729 : prog->substrs->data[i].substr);
730 if (!sv)
731 continue;
732
733 PerlIO_printf(Perl_debug_log,
734 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
735 " useful=%"IVdf" utf8=%d [%s]\n",
736 i,
737 (IV)prog->substrs->data[i].min_offset,
738 (IV)prog->substrs->data[i].max_offset,
739 (IV)prog->substrs->data[i].end_shift,
740 BmUSEFUL(sv),
741 utf8_target ? 1 : 0,
742 SvPEEK(sv));
743 }
744 });
745
8e1490ee 746 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
274cd312
DM
747 /* Check after \n? */
748 ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
749 || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
cad2e5aa 750
7e25d62c 751 if (!ml_anch) {
c889ccc8
DM
752 /* we are only allowed to match at BOS or \G */
753
57fcbfa7
DM
754 /* trivially reject if there's a BOS anchor and we're not at BOS.
755 * In the case of \G, we hope(!) that the caller has already
756 * set strpos to pos()-gofs, and will already have checked
757 * that this anchor position is legal. So we can skip it here.
758 */
759 if ( !(prog->intflags & PREGf_ANCH_GPOS)
760 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
761 && (strpos != strbeg))
c889ccc8
DM
762 {
763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 764 " Not at start...\n"));
c889ccc8
DM
765 goto fail;
766 }
767
a5d12a4b
DM
768 /* in the presence of an anchor, the anchored (relative to the
769 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
770 * to strpos. So quickly reject if substr isn't found there.
771 * This works for \G too, because the caller will already have
772 * subtracted gofs from pos, and gofs is the offset from the
773 * \G to the start of the regex. For example, in /.abc\Gdef/,
774 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
775 * caller will have set strpos=pos()-4; we look for the substr
776 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 777
c889ccc8
DM
778 if (prog->check_offset_min == prog->check_offset_max
779 && !(prog->intflags & PREGf_CANY_SEEN)
780 && ! multiline) /* /m can cause \n's to match that aren't
781 accounted for in the string max length.
782 See [perl #115242] */
783 {
784 /* Substring at constant offset from beg-of-str... */
d307bf57 785 SSize_t slen = SvCUR(check);
7ef63983 786 char *s;
c889ccc8
DM
787
788 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 789
1dc475d0
DM
790 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
791 " Looking for check substr at fixed offset %"IVdf"...\n",
792 (IV)prog->check_offset_min));
793
c889ccc8 794 if (SvTAIL(check)) {
d307bf57
DM
795 /* In this case, the regex is anchored at the end too,
796 * so the lengths must match exactly, give or take a \n.
797 * NB: slen >= 1 since the last char of check is \n */
c889ccc8
DM
798 if ( strend - s > slen || strend - s < slen - 1
799 || (strend - s == slen && strend[-1] != '\n'))
800 {
801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 802 " String too long...\n"));
c889ccc8
DM
803 goto fail_finish;
804 }
805 /* Now should match s[0..slen-2] */
806 slen--;
c889ccc8 807 }
d307bf57
DM
808 if (slen && (*SvPVX_const(check) != *s
809 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
810 {
811 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 812 " String not equal...\n"));
d307bf57
DM
813 goto fail_finish;
814 }
c889ccc8
DM
815
816 check_at = s;
817 goto success_at_start;
cad2e5aa 818 }
cad2e5aa 819 }
cad2e5aa 820 }
0fc004dd 821
c0e0ec46 822 end_shift = prog->check_end_shift;
cad2e5aa 823
19188028 824#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 825 if (end_shift < 0)
1de06328 826 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 827 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
828#endif
829
2c2d71f5 830 restart:
1de06328 831
66b7ec5c
DM
832 /* This is the (re)entry point of the main loop in this function.
833 * The goal of this loop is to:
834 * 1) find the "check" substring in the region rx_origin..strend
835 * (adjusted by start_shift / end_shift). If not found, reject
836 * immediately.
837 * 2) If it exists, look for the "other" substr too if defined; for
838 * example, if the check substr maps to the anchored substr, then
839 * check the floating substr, and vice-versa. If not found, go
840 * back to (1) with rx_origin suitably incremented.
841 * 3) If we find an rx_origin position that doesn't contradict
842 * either of the substrings, then check the possible additional
843 * constraints on rx_origin of /^.../m or a known start class.
844 * If these fail, then depending on which constraints fail, jump
845 * back to here, or to various other re-entry points further along
846 * that skip some of the first steps.
847 * 4) If we pass all those tests, update the BmUSEFUL() count on the
848 * substring. If the start position was determined to be at the
849 * beginning of the string - so, not rejected, but not optimised,
850 * since we have to run regmatch from position 0 - decrement the
851 * BmUSEFUL() count. Otherwise increment it.
852 */
853
1de06328 854 {
c33e64f0
FC
855 U8* start_point;
856 U8* end_point;
c889ccc8 857
c889ccc8 858 DEBUG_OPTIMISE_MORE_r({
1dc475d0 859 PerlIO_printf(Perl_debug_log,
ae5d4331 860 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 861 " Start shift: %"IVdf" End shift %"IVdf
4d281893 862 " Real end Shift: %"IVdf"\n",
197641b6 863 (IV)(rx_origin - strpos),
c889ccc8 864 (IV)prog->check_offset_min,
12fbc530
DM
865 (IV)start_shift,
866 (IV)end_shift,
c889ccc8
DM
867 (IV)prog->check_end_shift);
868 });
1de06328 869
0d331aaf 870 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 871 start_point= (U8*)(rx_origin + start_shift);
12fbc530 872 end_point= (U8*)(strend - end_shift);
557f47af
DM
873 if (start_point > end_point)
874 goto fail_finish;
1de06328 875 } else {
557f47af
DM
876 end_point = HOP3(strend, -end_shift, strbeg);
877 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
878 if (!start_point)
879 goto fail_finish;
1de06328 880 }
c889ccc8 881
557f47af 882
77656d5b
DM
883 /* if the regex is absolutely anchored to the start of the string,
884 * then check_offset_max represents an upper bound on the string
885 * where the substr could start */
c19c836a
DM
886 if (!ml_anch
887 && prog->intflags & PREGf_ANCH
77656d5b
DM
888 && prog->check_offset_max != SSize_t_MAX
889 && start_shift < prog->check_offset_max)
c19c836a 890 {
1a08ba3a
DM
891 SSize_t len = SvCUR(check) - !!SvTAIL(check);
892 end_point = HOP3lim(start_point,
893 prog->check_offset_max - start_shift,
894 end_point -len)
895 + len;
d6ef1678
DM
896 }
897
6bda09f9 898 DEBUG_OPTIMISE_MORE_r({
1dc475d0 899 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 900 (int)(end_point - start_point),
fc8cd66c 901 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
902 start_point);
903 });
904
ae5d4331 905 check_at = fbm_instr( start_point, end_point,
7fba1cd6 906 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 907
8fd34720
DM
908 /* Update the count-of-usability, remove useless subpatterns,
909 unshift s. */
910
911 DEBUG_EXECUTE_r({
912 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
913 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
914 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
915 (check_at ? "Found" : "Did not find"),
916 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
917 ? "anchored" : "floating"),
918 quoted,
919 RE_SV_TAIL(check),
920 (check_at ? " at offset " : "...\n") );
921 });
2c2d71f5 922
8fd34720
DM
923 if (!check_at)
924 goto fail_finish;
925 /* Finish the diagnostic message */
926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 927
8fd34720
DM
928 /* set rx_origin to the minimum position where the regex could start
929 * matching, given the constraint of the just-matched check substring.
930 * But don't set it lower than previously.
931 */
fdc003fd 932
8fd34720
DM
933 if (check_at - rx_origin > prog->check_offset_max)
934 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
935 }
fdc003fd
DM
936
937
1de06328
YO
938 /* XXX dmq: first branch is for positive lookbehind...
939 Our check string is offset from the beginning of the pattern.
940 So we need to do any stclass tests offset forward from that
941 point. I think. :-(
942 */
943
2c2d71f5
JH
944 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
945 Start with the other substr.
946 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 947 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
948 *always* match. Probably should be marked during compile...
949 Probably it is right to do no SCREAM here...
950 */
951
6480a6c4
DM
952 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
953 : prog->substrs->data[other_ix].substr)
1de06328 954 {
30944b6d 955 /* Take into account the "other" substring. */
6c3343a6
DM
956 char *last, *last1;
957 char *s;
958 SV* must;
959 struct reg_substr_datum *other;
960
961 do_other_substr:
962 other = &prog->substrs->data[other_ix];
963
964 /* if "other" is anchored:
965 * we've previously found a floating substr starting at check_at.
966 * This means that the regex origin must lie somewhere
967 * between min (rx_origin): HOP3(check_at, -check_offset_max)
968 * and max: HOP3(check_at, -check_offset_min)
969 * (except that min will be >= strpos)
970 * So the fixed substr must lie somewhere between
971 * HOP3(min, anchored_offset)
972 * HOP3(max, anchored_offset) + SvCUR(substr)
973 */
974
975 /* if "other" is floating
976 * Calculate last1, the absolute latest point where the
977 * floating substr could start in the string, ignoring any
978 * constraints from the earlier fixed match. It is calculated
979 * as follows:
980 *
981 * strend - prog->minlen (in chars) is the absolute latest
982 * position within the string where the origin of the regex
983 * could appear. The latest start point for the floating
984 * substr is float_min_offset(*) on from the start of the
985 * regex. last1 simply combines thee two offsets.
986 *
987 * (*) You might think the latest start point should be
988 * float_max_offset from the regex origin, and technically
989 * you'd be correct. However, consider
990 * /a\d{2,4}bcd\w/
991 * Here, float min, max are 3,5 and minlen is 7.
992 * This can match either
993 * /a\d\dbcd\w/
994 * /a\d\d\dbcd\w/
995 * /a\d\d\d\dbcd\w/
996 * In the first case, the regex matches minlen chars; in the
997 * second, minlen+1, in the third, minlen+2.
998 * In the first case, the floating offset is 3 (which equals
999 * float_min), in the second, 4, and in the third, 5 (which
1000 * equals float_max). In all cases, the floating string bcd
1001 * can never start more than 4 chars from the end of the
1002 * string, which equals minlen - float_min. As the substring
1003 * starts to match more than float_min from the start of the
1004 * regex, it makes the regex match more than minlen chars,
1005 * and the two cancel each other out. So we can always use
1006 * float_min - minlen, rather than float_max - minlen for the
1007 * latest position in the string.
1008 *
1009 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1010 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1011 */
1012
e7a14a9c 1013 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1014 last1 = HOP3c(strend,
1015 other->min_offset - prog->minlen, strbeg);
1016
4d006249 1017 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1018 /* last is the latest point where the floating substr could
1019 * start, *given* any constraints from the earlier fixed
1020 * match. This constraint is that the floating string starts
1021 * <= float_max_offset chars from the regex origin (rx_origin).
1022 * If this value is less than last1, use it instead.
eb3831ce 1023 */
6c3343a6
DM
1024 assert(rx_origin <= last1);
1025 last =
1026 /* this condition handles the offset==infinity case, and
1027 * is a short-cut otherwise. Although it's comparing a
1028 * byte offset to a char length, it does so in a safe way,
1029 * since 1 char always occupies 1 or more bytes,
1030 * so if a string range is (last1 - rx_origin) bytes,
1031 * it will be less than or equal to (last1 - rx_origin)
1032 * chars; meaning it errs towards doing the accurate HOP3
1033 * rather than just using last1 as a short-cut */
1034 (last1 - rx_origin) < other->max_offset
1035 ? last1
1036 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1037 }
1038 else {
1039 assert(strpos + start_shift <= check_at);
1040 last = HOP4c(check_at, other->min_offset - start_shift,
1041 strbeg, strend);
1042 }
ead917d0 1043
6c3343a6
DM
1044 s = HOP3c(rx_origin, other->min_offset, strend);
1045 if (s < other_last) /* These positions already checked */
1046 s = other_last;
1047
1048 must = utf8_target ? other->utf8_substr : other->substr;
1049 assert(SvPOK(must));
1050 s = fbm_instr(
1051 (unsigned char*)s,
1052 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1053 must,
1054 multiline ? FBMrf_MULTILINE : 0
1055 );
1056 DEBUG_EXECUTE_r({
1057 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1058 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1059 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1060 s ? "Found" : "Contradicts",
1061 other_ix ? "floating" : "anchored",
1062 quoted, RE_SV_TAIL(must));
1063 });
ead917d0 1064
ead917d0 1065
6c3343a6
DM
1066 if (!s) {
1067 /* last1 is latest possible substr location. If we didn't
1068 * find it before there, we never will */
1069 if (last >= last1) {
1070 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1071 ", giving up...\n"));
1072 goto fail_finish;
ead917d0
DM
1073 }
1074
6c3343a6
DM
1075 /* try to find the check substr again at a later
1076 * position. Maybe next time we'll find the "other" substr
1077 * in range too */
1078 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1079 ", trying %s at offset %ld...\n",
1080 (other_ix ? "floating" : "anchored"),
197641b6 1081 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1082
1083 other_last = HOP3c(last, 1, strend) /* highest failure */;
1084 rx_origin =
4d006249 1085 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1086 ? HOP3c(rx_origin, 1, strend)
1087 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1088 goto restart;
1089 }
1090 else {
1091 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1092 (long)(s - strpos)));
ead917d0 1093
4d006249 1094 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1095 /* other_last is set to s, not s+1, since its possible for
1096 * a floating substr to fail first time, then succeed
1097 * second time at the same floating position; e.g.:
1098 * "-AB--AABZ" =~ /\wAB\d*Z/
1099 * The first time round, anchored and float match at
1100 * "-(AB)--AAB(Z)" then fail on the initial \w character
1101 * class. Second time round, they match at "-AB--A(AB)(Z)".
1102 */
1103 other_last = s;
ead917d0
DM
1104 }
1105 else {
6c3343a6
DM
1106 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1107 other_last = HOP3c(s, 1, strend);
ead917d0 1108 }
6c3343a6 1109 }
cad2e5aa 1110 }
acba93e8
DM
1111 else {
1112 DEBUG_OPTIMISE_MORE_r(
1113 PerlIO_printf(Perl_debug_log,
1114 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1115 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1116 " strend-strpos:%"IVdf"\n",
1117 (IV)prog->check_offset_min,
1118 (IV)prog->check_offset_max,
1c1c599d 1119 (IV)(check_at-strpos),
acba93e8 1120 (IV)(rx_origin-strpos),
1c1c599d 1121 (IV)(rx_origin-check_at),
acba93e8
DM
1122 (IV)(strend-strpos)
1123 )
1124 );
1125 }
2c2d71f5 1126
acba93e8 1127 postprocess_substr_matches:
0991020e 1128
e3c6feb0
DM
1129 /* handle the extra constraint of /^/m */
1130
a71a26b9
DM
1131 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
1132 /* May be due to an implicit anchor of m{.*foo} */
1133 && !(prog->intflags & PREGf_IMPLICIT))
e3c6feb0 1134 {
4620cb61
DM
1135 char *s;
1136
a62659bd
DM
1137 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1138 " looking for /^/m anchor"));
d0880ea7
DM
1139
1140 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1141 * Find the next \n, if any, even if it's beyond the current
1142 * anchored and/or floating substrings. Whether we should be
1143 * scanning ahead for the next \n or the next substr is debatable.
1144 * On the one hand you'd expect rare substrings to appear less
1145 * often than \n's. On the other hand, searching for \n means
1146 * we're effectively flipping been check_substr and "\n" on each
1147 * iteration as the current "rarest" string candidate, which
1148 * means for example that we'll quickly reject the whole string if
1149 * hasn't got a \n, rather than trying every substr position
1150 * first
1151 */
d0880ea7 1152
4620cb61
DM
1153 s = HOP3c(strend, - prog->minlen, strpos);
1154 if (s <= rx_origin ||
1155 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1156 {
d0880ea7
DM
1157 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1158 " Did not find /%s^%s/m...\n",
1159 PL_colors[0], PL_colors[1]));
a62659bd
DM
1160 goto fail_finish;
1161 }
d0880ea7 1162
4ada1233
DM
1163 /* earliest possible origin is 1 char after the \n.
1164 * (since *rx_origin == '\n', it's safe to ++ here rather than
1165 * HOP(rx_origin, 1)) */
1166 rx_origin++;
d0880ea7 1167
f4f115de 1168 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1169 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1170 {
d0880ea7
DM
1171 /* Position contradicts check-string; either because
1172 * check was anchored (and thus has no wiggle room),
4ada1233 1173 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1176 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1177 goto restart;
1178 }
1179
1180 /* if we get here, the check substr must have been float,
2e759faa 1181 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1182 * "other" substr which still contradicts */
1183 assert(prog->substrs->check_ix); /* check is float */
1184
1185 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1186 /* whoops, the anchored "other" substr exists, so we still
1187 * contradict. On the other hand, the float "check" substr
1188 * didn't contradict, so just retry the anchored "other"
1189 * substr */
1190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1191 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1192 PL_colors[0], PL_colors[1],
1193 (long)(rx_origin - strpos),
1194 (long)(rx_origin - strpos + prog->anchored_offset)));
1195 goto do_other_substr;
1196 }
1197
1198 /* success: we don't contradict the found floating substring
1199 * (and there's no anchored substr). */
d0880ea7
DM
1200 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1201 " Found /%s^%s/m at offset %ld...\n",
1202 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1203 }
1204 else {
2e759faa
DM
1205 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1206 " Starting position does not contradict /%s^%s/m...\n",
1207 PL_colors[0], PL_colors[1]));
e3c6feb0
DM
1208 }
1209
ffad1e6a 1210 success_at_start:
e3c6feb0 1211
cad2e5aa 1212
dd170ff5
DM
1213 /* if we have a starting character class, then test that extra constraint.
1214 * (trie stclasses are too expensive to use here, we are better off to
1215 * leave it to regmatch itself) */
1216
f8fc2ecf 1217 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1218 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1219
2c75e362 1220 /* XXX this value could be pre-computed */
f8fc2ecf 1221 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1222 ? (reginfo->is_utf8_pat
1223 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1224 : STR_LEN(progi->regstclass))
66e933ab 1225 : 1);
1de06328 1226 char * endpos;
fa3bb21d 1227 char *s;
000dfd2d
DM
1228 /* latest pos that a matching float substr constrains rx start to */
1229 char *rx_max_float = NULL;
1230
c75a3985
DM
1231 /* if the current rx_origin is anchored, either by satisfying an
1232 * anchored substring constraint, or a /^.../m constraint, then we
1233 * can reject the current origin if the start class isn't found
1234 * at the current position. If we have a float-only match, then
1235 * rx_origin is constrained to a range; so look for the start class
1236 * in that range. if neither, then look for the start class in the
1237 * whole rest of the string */
1238
dd170ff5
DM
1239 /* XXX DAPM it's not clear what the minlen test is for, and why
1240 * it's not used in the floating case. Nothing in the test suite
1241 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1242 * Here are some old comments, which may or may not be correct:
1243 *
1244 * minlen == 0 is possible if regstclass is \b or \B,
1245 * and the fixed substr is ''$.
1246 * Since minlen is already taken into account, rx_origin+1 is
1247 * before strend; accidentally, minlen >= 1 guaranties no false
1248 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1249 * 0) below assumes that regstclass does not come from lookahead...
1250 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1251 * This leaves EXACTF-ish only, which are dealt with in
1252 * find_byclass().
1253 */
1254
1de06328 1255 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1256 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1257 else if (prog->float_substr || prog->float_utf8) {
1258 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1259 endpos= HOP3c(rx_max_float, cl_l, strend);
1260 }
1de06328
YO
1261 else
1262 endpos= strend;
1263
1dc475d0
DM
1264 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1265 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1266 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1267 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1268 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1269
c43b5520 1270 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1271 reginfo);
be778b1a 1272 if (!s) {
6eb5f6b9 1273 if (endpos == strend) {
a3621e74 1274 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1275 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1276 goto fail;
1277 }
a3621e74 1278 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1279 " This position contradicts STCLASS...\n") );
8e1490ee 1280 if ((prog->intflags & PREGf_ANCH) && !ml_anch)
653099ff 1281 goto fail;
9fed8d02 1282
6eb5f6b9 1283 /* Contradict one of substrings */
97136c8a
DM
1284 if (prog->anchored_substr || prog->anchored_utf8) {
1285 if (prog->substrs->check_ix == 1) { /* check is float */
1286 /* Have both, check_string is floating */
1287 assert(rx_origin + start_shift <= check_at);
1288 if (rx_origin + start_shift != check_at) {
1289 /* not at latest position float substr could match:
c75a3985
DM
1290 * Recheck anchored substring, but not floating.
1291 * The condition above is in bytes rather than
1292 * chars for efficiency. It's conservative, in
1293 * that it errs on the side of doing 'goto
1294 * do_other_substr', where a more accurate
1295 * char-based calculation will be done */
97136c8a
DM
1296 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1297 " Looking for anchored substr starting at offset %ld...\n",
1298 (long)(other_last - strpos)) );
1299 goto do_other_substr;
3369914b 1300 }
3369914b
DM
1301 }
1302 }
97136c8a 1303 else {
9fed8d02
DM
1304 /* float-only */
1305
9fed8d02 1306 if (ml_anch) {
c75a3985
DM
1307 /* In the presence of ml_anch, we might be able to
1308 * find another \n without breaking the current float
1309 * constraint. */
1310
1311 /* strictly speaking this should be HOP3c(..., 1, ...),
1312 * but since we goto a block of code that's going to
1313 * search for the next \n if any, its safe here */
9fed8d02 1314 rx_origin++;
9fed8d02
DM
1315 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1316 " Looking for /%s^%s/m starting at offset %ld...\n",
1317 PL_colors[0], PL_colors[1],
1318 (long)(rx_origin - strpos)) );
9fed8d02 1319 goto postprocess_substr_matches;
ab60c45a 1320 }
c75a3985
DM
1321
1322 /* strictly speaking this can never be true; but might
1323 * be if we ever allow intuit without substrings */
1324 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1325 goto fail;
c75a3985 1326
000dfd2d 1327 rx_origin = rx_max_float;
9fed8d02
DM
1328 }
1329
c75a3985
DM
1330 /* at this point, any matching substrings have been
1331 * contradicted. Start again... */
1332
9fed8d02 1333 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1334
1335 /* uses bytes rather than char calculations for efficiency.
1336 * It's conservative: it errs on the side of doing 'goto restart',
1337 * where there is code that does a proper char-based test */
9fed8d02 1338 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1339 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1340 " Could not match STCLASS...\n") );
1341 goto fail;
1342 }
1343 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1344 " Looking for %s substr starting at offset %ld...\n",
1345 (prog->substrs->check_ix ? "floating" : "anchored"),
1346 (long)(rx_origin + start_shift - strpos)) );
1347 goto restart;
6eb5f6b9 1348 }
9fed8d02 1349
c75a3985
DM
1350 /* Success !!! */
1351
5f9c6575 1352 if (rx_origin != s) {
a3621e74 1353 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1354 " By STCLASS: moving %ld --> %ld\n",
5f9c6575 1355 (long)(rx_origin - strpos), (long)(s - strpos))
b7953727
JH
1356 );
1357 }
1358 else {
a3621e74 1359 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1360 " Does not contradict STCLASS...\n");
b7953727
JH
1361 );
1362 }
6eb5f6b9 1363 }
ffad1e6a
DM
1364
1365 /* Decide whether using the substrings helped */
1366
1367 if (rx_origin != strpos) {
1368 /* Fixed substring is found far enough so that the match
1369 cannot start at strpos. */
1370
1371 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1372 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1373 }
1374 else {
70563e16
DM
1375 /* The found rx_origin position does not prohibit matching at
1376 * strpos, so calling intuit didn't gain us anything. Decrement
1377 * the BmUSEFUL() count on the check substring, and if we reach
1378 * zero, free it. */
1379 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1380 && (utf8_target ? (
1381 prog->check_utf8 /* Could be deleted already */
1382 && --BmUSEFUL(prog->check_utf8) < 0
1383 && (prog->check_utf8 == prog->float_utf8)
1384 ) : (
1385 prog->check_substr /* Could be deleted already */
1386 && --BmUSEFUL(prog->check_substr) < 0
1387 && (prog->check_substr == prog->float_substr)
1388 )))
1389 {
1390 /* If flags & SOMETHING - do not do it many times on the same match */
1391 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1392 /* XXX Does the destruction order has to change with utf8_target? */
1393 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1394 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1395 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1396 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1397 check = NULL; /* abort */
1398 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1399 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1400 if (prog->intflags & PREGf_IMPLICIT) {
1401 prog->intflags &= ~PREGf_ANCH_MBOL;
1402 /* maybe we have no anchors left after this... */
1403 if (!(prog->intflags & PREGf_ANCH))
1404 prog->extflags &= ~RXf_IS_ANCHORED;
1405 }
1406 /* XXXX This is a remnant of the old implementation. It
1407 looks wasteful, since now INTUIT can use many
1408 other heuristics. */
1409 prog->extflags &= ~RXf_USE_INTUIT;
1410 /* XXXX What other flags might need to be cleared in this branch? */
1411 }
1412 }
1413
1414 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1415 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1416 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1417
c765d6e0 1418 return rx_origin;
2c2d71f5
JH
1419
1420 fail_finish: /* Substring not found */
33b8afdf 1421 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1422 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1423 fail:
a3621e74 1424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1425 PL_colors[4], PL_colors[5]));
bd61b366 1426 return NULL;
cad2e5aa 1427}
9661b544 1428
70563e16 1429
a0a388a1 1430#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1431 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1432 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1433 trie_type = ((scan->flags == EXACT) \
1434 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1435 : (scan->flags == EXACTFA) \
1436 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1437 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1438
fd3249ee 1439#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1440STMT_START { \
fab2782b 1441 STRLEN skiplen; \
baa60164 1442 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1443 switch (trie_type) { \
31f05a37 1444 case trie_utf8_exactfa_fold: \
baa60164
KW
1445 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1446 /* FALL THROUGH */ \
fab2782b
YO
1447 case trie_utf8_fold: \
1448 if ( foldlen>0 ) { \
c80e42f3 1449 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1450 foldlen -= len; \
1451 uscan += len; \
1452 len=0; \
1453 } else { \
445bf929 1454 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1455 len = UTF8SKIP(uc); \
1456 skiplen = UNISKIP( uvc ); \
1457 foldlen -= skiplen; \
1458 uscan = foldbuf + skiplen; \
1459 } \
1460 break; \
baa60164
KW
1461 case trie_latin_utf8_exactfa_fold: \
1462 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1463 /* FALL THROUGH */ \
fab2782b
YO
1464 case trie_latin_utf8_fold: \
1465 if ( foldlen>0 ) { \
c80e42f3 1466 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1467 foldlen -= len; \
1468 uscan += len; \
1469 len=0; \
1470 } else { \
1471 len = 1; \
31f05a37 1472 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1473 skiplen = UNISKIP( uvc ); \
1474 foldlen -= skiplen; \
1475 uscan = foldbuf + skiplen; \
1476 } \
1477 break; \
1478 case trie_utf8: \
c80e42f3 1479 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1480 break; \
1481 case trie_plain: \
1482 uvc = (UV)*uc; \
1483 len = 1; \
1484 } \
1485 if (uvc < 256) { \
1486 charid = trie->charmap[ uvc ]; \
1487 } \
1488 else { \
1489 charid = 0; \
1490 if (widecharmap) { \
1491 SV** const svpp = hv_fetch(widecharmap, \
1492 (char*)&uvc, sizeof(UV), 0); \
1493 if (svpp) \
1494 charid = (U16)SvIV(*svpp); \
1495 } \
1496 } \
4cadc6a9
YO
1497} STMT_END
1498
4cadc6a9
YO
1499#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1500STMT_START { \
1501 while (s <= e) { \
1502 if ( (CoNd) \
fac1af77 1503 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1504 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1505 goto got_it; \
1506 s++; \
1507 } \
1508} STMT_END
1509
1510#define REXEC_FBC_UTF8_SCAN(CoDe) \
1511STMT_START { \
9a902117 1512 while (s < strend) { \
4cadc6a9 1513 CoDe \
9a902117 1514 s += UTF8SKIP(s); \
4cadc6a9
YO
1515 } \
1516} STMT_END
1517
1518#define REXEC_FBC_SCAN(CoDe) \
1519STMT_START { \
1520 while (s < strend) { \
1521 CoDe \
1522 s++; \
1523 } \
1524} STMT_END
1525
1526#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1527REXEC_FBC_UTF8_SCAN( \
1528 if (CoNd) { \
02d5137b 1529 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1530 goto got_it; \
1531 else \
1532 tmp = doevery; \
1533 } \
1534 else \
1535 tmp = 1; \
1536)
1537
1538#define REXEC_FBC_CLASS_SCAN(CoNd) \
1539REXEC_FBC_SCAN( \
1540 if (CoNd) { \
02d5137b 1541 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1542 goto got_it; \
1543 else \
1544 tmp = doevery; \
1545 } \
1546 else \
1547 tmp = 1; \
1548)
1549
baa60164 1550#define REXEC_FBC_TRYIT \
02d5137b 1551if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1552 goto got_it
1553
e1d1eefb 1554#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
baa60164 1555 if (utf8_target) { \
e1d1eefb
YO
1556 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1557 } \
1558 else { \
1559 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1560 }
e1d1eefb 1561
baa60164 1562#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1563 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1564 startpos, doutf8)
786e8c11 1565
cfaf538b 1566
baa60164 1567#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1568 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1569 tmp = TEST_NON_UTF8(tmp); \
1570 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1571 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1572 tmp = !tmp; \
1573 IF_SUCCESS; \
1574 } \
1575 else { \
1576 IF_FAIL; \
1577 } \
1578 ); \
1579
baa60164 1580#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1581 if (s == reginfo->strbeg) { \
cfaf538b
KW
1582 tmp = '\n'; \
1583 } \
1584 else { \
9d9163fb 1585 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1586 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1587 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1588 } \
1589 tmp = TeSt1_UtF8; \
3db24e1e 1590 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1591 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1592 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1593 tmp = !tmp; \
1594 IF_SUCCESS; \
1595 } \
1596 else { \
1597 IF_FAIL; \
1598 } \
1599 ); \
1600
63ac0dad
KW
1601/* The only difference between the BOUND and NBOUND cases is that
1602 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1603 * NBOUND. This is accomplished by passing it in either the if or else clause,
1604 * with the other one being empty */
1605#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1606 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1607
1608#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1609 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1610
1611#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1612 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1613
1614#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1615 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1616
63ac0dad
KW
1617
1618/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1619 * be passed in completely with the variable name being tested, which isn't
1620 * such a clean interface, but this is easier to read than it was before. We
1621 * are looking for the boundary (or non-boundary between a word and non-word
1622 * character. The utf8 and non-utf8 cases have the same logic, but the details
1623 * must be different. Find the "wordness" of the character just prior to this
1624 * one, and compare it with the wordness of this one. If they differ, we have
1625 * a boundary. At the beginning of the string, pretend that the previous
1626 * character was a new-line */
baa60164 1627#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1628 if (utf8_target) { \
baa60164 1629 UTF8_CODE \
63ac0dad
KW
1630 } \
1631 else { /* Not utf8 */ \
9d9163fb 1632 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1633 tmp = TEST_NON_UTF8(tmp); \
1634 REXEC_FBC_SCAN( \
1635 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1636 tmp = !tmp; \
1637 IF_SUCCESS; \
1638 } \
1639 else { \
1640 IF_FAIL; \
1641 } \
1642 ); \
1643 } \
baa60164 1644 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1645 goto got_it;
1646
786e8c11 1647/* We know what class REx starts with. Try to find this position... */
02d5137b 1648/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1649/* annoyingly all the vars in this routine have different names from their counterparts
1650 in regmatch. /grrr */
1651
3c3eec57 1652STATIC char *
07be1b83 1653S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1654 const char *strend, regmatch_info *reginfo)
a687059c 1655{
73104a1b
KW
1656 dVAR;
1657 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1658 char *pat_string; /* The pattern's exactish string */
1659 char *pat_end; /* ptr to end char of pat_string */
1660 re_fold_t folder; /* Function for computing non-utf8 folds */
1661 const U8 *fold_array; /* array for folding ords < 256 */
1662 STRLEN ln;
1663 STRLEN lnc;
73104a1b
KW
1664 U8 c1;
1665 U8 c2;
1666 char *e;
1667 I32 tmp = 1; /* Scratch variable? */
ba44c216 1668 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1669 UV utf8_fold_flags = 0;
f9176b44 1670 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1671 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1672 with a result inverts that result, as 0^1 =
1673 1 and 1^1 = 0 */
1674 _char_class_number classnum;
1675
73104a1b 1676 RXi_GET_DECL(prog,progi);
2f7f8cb1 1677
73104a1b 1678 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1679
73104a1b
KW
1680 /* We know what class it must start with. */
1681 switch (OP(c)) {
1682 case ANYOF:
1683 if (utf8_target) {
1684 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1685 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1686 }
1687 else {
1688 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1689 }
1690 break;
1691 case CANY:
1692 REXEC_FBC_SCAN(
02d5137b 1693 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1694 goto got_it;
1695 else
1696 tmp = doevery;
1697 );
1698 break;
1699
098b07d5
KW
1700 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1701 assert(! is_utf8_pat);
1702 /* FALL THROUGH */
73104a1b 1703 case EXACTFA:
984e6dd1 1704 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1705 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1706 goto do_exactf_utf8;
1707 }
1708 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1709 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1710 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1711
2fdb7295
KW
1712 case EXACTF: /* This node only generated for non-utf8 patterns */
1713 assert(! is_utf8_pat);
73104a1b 1714 if (utf8_target) {
73104a1b
KW
1715 utf8_fold_flags = 0;
1716 goto do_exactf_utf8;
1717 }
1718 fold_array = PL_fold;
1719 folder = foldEQ;
1720 goto do_exactf_non_utf8;
1721
1722 case EXACTFL:
31f05a37 1723 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1724 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1725 goto do_exactf_utf8;
1726 }
1727 fold_array = PL_fold_locale;
1728 folder = foldEQ_locale;
1729 goto do_exactf_non_utf8;
3c760661 1730
73104a1b 1731 case EXACTFU_SS:
984e6dd1 1732 if (is_utf8_pat) {
73104a1b
KW
1733 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1734 }
1735 goto do_exactf_utf8;
16d951b7 1736
73104a1b 1737 case EXACTFU:
984e6dd1
DM
1738 if (is_utf8_pat || utf8_target) {
1739 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1740 goto do_exactf_utf8;
1741 }
fac1af77 1742
73104a1b
KW
1743 /* Any 'ss' in the pattern should have been replaced by regcomp,
1744 * so we don't have to worry here about this single special case
1745 * in the Latin1 range */
1746 fold_array = PL_fold_latin1;
1747 folder = foldEQ_latin1;
1748
1749 /* FALL THROUGH */
1750
1751 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1752 are no glitches with fold-length differences
1753 between the target string and pattern */
1754
1755 /* The idea in the non-utf8 EXACTF* cases is to first find the
1756 * first character of the EXACTF* node and then, if necessary,
1757 * case-insensitively compare the full text of the node. c1 is the
1758 * first character. c2 is its fold. This logic will not work for
1759 * Unicode semantics and the german sharp ss, which hence should
1760 * not be compiled into a node that gets here. */
1761 pat_string = STRING(c);
1762 ln = STR_LEN(c); /* length to match in octets/bytes */
1763
1764 /* We know that we have to match at least 'ln' bytes (which is the
1765 * same as characters, since not utf8). If we have to match 3
1766 * characters, and there are only 2 availabe, we know without
1767 * trying that it will fail; so don't start a match past the
1768 * required minimum number from the far end */
ea3daa5d 1769 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1770
02d5137b 1771 if (reginfo->intuit && e < s) {
73104a1b
KW
1772 e = s; /* Due to minlen logic of intuit() */
1773 }
fac1af77 1774
73104a1b
KW
1775 c1 = *pat_string;
1776 c2 = fold_array[c1];
1777 if (c1 == c2) { /* If char and fold are the same */
1778 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1779 }
1780 else {
1781 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1782 }
1783 break;
fac1af77 1784
73104a1b
KW
1785 do_exactf_utf8:
1786 {
1787 unsigned expansion;
1788
1789 /* If one of the operands is in utf8, we can't use the simpler folding
1790 * above, due to the fact that many different characters can have the
1791 * same fold, or portion of a fold, or different- length fold */
1792 pat_string = STRING(c);
1793 ln = STR_LEN(c); /* length to match in octets/bytes */
1794 pat_end = pat_string + ln;
984e6dd1 1795 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1796 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1797 : ln;
1798
1799 /* We have 'lnc' characters to match in the pattern, but because of
1800 * multi-character folding, each character in the target can match
1801 * up to 3 characters (Unicode guarantees it will never exceed
1802 * this) if it is utf8-encoded; and up to 2 if not (based on the
1803 * fact that the Latin 1 folds are already determined, and the
1804 * only multi-char fold in that range is the sharp-s folding to
1805 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1806 * string character. Adjust lnc accordingly, rounding up, so that
1807 * if we need to match at least 4+1/3 chars, that really is 5. */
1808 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1809 lnc = (lnc + expansion - 1) / expansion;
1810
1811 /* As in the non-UTF8 case, if we have to match 3 characters, and
1812 * only 2 are left, it's guaranteed to fail, so don't start a
1813 * match that would require us to go beyond the end of the string
1814 */
ea3daa5d 1815 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1816
02d5137b 1817 if (reginfo->intuit && e < s) {
73104a1b
KW
1818 e = s; /* Due to minlen logic of intuit() */
1819 }
0658cdde 1820
73104a1b
KW
1821 /* XXX Note that we could recalculate e to stop the loop earlier,
1822 * as the worst case expansion above will rarely be met, and as we
1823 * go along we would usually find that e moves further to the left.
1824 * This would happen only after we reached the point in the loop
1825 * where if there were no expansion we should fail. Unclear if
1826 * worth the expense */
1827
1828 while (s <= e) {
1829 char *my_strend= (char *)strend;
1830 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1831 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1832 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1833 {
1834 goto got_it;
1835 }
1836 s += (utf8_target) ? UTF8SKIP(s) : 1;
1837 }
1838 break;
1839 }
1840 case BOUNDL:
0eb30aeb 1841 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1842 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1843 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1844 break;
1845 case NBOUNDL:
0eb30aeb 1846 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1847 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1848 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1849 break;
1850 case BOUND:
1851 FBC_BOUND(isWORDCHAR,
0eb30aeb 1852 isWORDCHAR_uni(tmp),
03940dc2 1853 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1854 break;
1855 case BOUNDA:
1856 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1857 isWORDCHAR_A(tmp),
1858 isWORDCHAR_A((U8*)s));
1859 break;
1860 case NBOUND:
1861 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1862 isWORDCHAR_uni(tmp),
03940dc2 1863 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1864 break;
1865 case NBOUNDA:
1866 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1867 isWORDCHAR_A(tmp),
1868 isWORDCHAR_A((U8*)s));
1869 break;
1870 case BOUNDU:
1871 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1872 isWORDCHAR_uni(tmp),
03940dc2 1873 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1874 break;
1875 case NBOUNDU:
1876 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1877 isWORDCHAR_uni(tmp),
03940dc2 1878 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1879 break;
73104a1b
KW
1880 case LNBREAK:
1881 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1882 is_LNBREAK_latin1_safe(s, strend)
1883 );
1884 break;
3018b823
KW
1885
1886 /* The argument to all the POSIX node types is the class number to pass to
1887 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1888
1889 case NPOSIXL:
1890 to_complement = 1;
1891 /* FALLTHROUGH */
1892
1893 case POSIXL:
3018b823
KW
1894 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1895 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1896 break;
3018b823
KW
1897
1898 case NPOSIXD:
1899 to_complement = 1;
1900 /* FALLTHROUGH */
1901
1902 case POSIXD:
1903 if (utf8_target) {
1904 goto posix_utf8;
1905 }
1906 goto posixa;
1907
1908 case NPOSIXA:
1909 if (utf8_target) {
1910 /* The complement of something that matches only ASCII matches all
1911 * UTF-8 variant code points, plus everything in ASCII that isn't
1912 * in the class */
1913 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1914 || ! _generic_isCC_A(*s, FLAGS(c)));
1915 break;
1916 }
1917
1918 to_complement = 1;
1919 /* FALLTHROUGH */
1920
73104a1b 1921 case POSIXA:
3018b823 1922 posixa:
73104a1b 1923 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1924 * byte invariant character. */
1925 REXEC_FBC_CLASS_SCAN(
1926 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1927 break;
3018b823
KW
1928
1929 case NPOSIXU:
1930 to_complement = 1;
1931 /* FALLTHROUGH */
1932
1933 case POSIXU:
1934 if (! utf8_target) {
1935 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1936 FLAGS(c))));
1937 }
1938 else {
1939
1940 posix_utf8:
1941 classnum = (_char_class_number) FLAGS(c);
1942 if (classnum < _FIRST_NON_SWASH_CC) {
1943 while (s < strend) {
1944
1945 /* We avoid loading in the swash as long as possible, but
1946 * should we have to, we jump to a separate loop. This
1947 * extra 'if' statement is what keeps this code from being
1948 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1949 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1950 goto found_above_latin1;
1951 }
1952 if ((UTF8_IS_INVARIANT(*s)
1953 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1954 classnum)))
1955 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1956 && to_complement ^ cBOOL(
94bb8c36
KW
1957 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1958 *(s + 1)),
3018b823
KW
1959 classnum))))
1960 {
02d5137b 1961 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1962 goto got_it;
1963 else {
1964 tmp = doevery;
1965 }
1966 }
1967 else {
1968 tmp = 1;
1969 }
1970 s += UTF8SKIP(s);
1971 }
1972 }
1973 else switch (classnum) { /* These classes are implemented as
1974 macros */
1975 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1976 revert the change of \v matching this */
1977 /* FALL THROUGH */
1978
1979 case _CC_ENUM_PSXSPC:
1980 REXEC_FBC_UTF8_CLASS_SCAN(
1981 to_complement ^ cBOOL(isSPACE_utf8(s)));
1982 break;
1983
1984 case _CC_ENUM_BLANK:
1985 REXEC_FBC_UTF8_CLASS_SCAN(
1986 to_complement ^ cBOOL(isBLANK_utf8(s)));
1987 break;
1988
1989 case _CC_ENUM_XDIGIT:
1990 REXEC_FBC_UTF8_CLASS_SCAN(
1991 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1992 break;
1993
1994 case _CC_ENUM_VERTSPACE:
1995 REXEC_FBC_UTF8_CLASS_SCAN(
1996 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1997 break;
1998
1999 case _CC_ENUM_CNTRL:
2000 REXEC_FBC_UTF8_CLASS_SCAN(
2001 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2002 break;
2003
2004 default:
2005 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2006 assert(0); /* NOTREACHED */
2007 }
2008 }
2009 break;
2010
2011 found_above_latin1: /* Here we have to load a swash to get the result
2012 for the current code point */
2013 if (! PL_utf8_swash_ptrs[classnum]) {
2014 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2015 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2016 _core_swash_init("utf8",
2017 "",
2018 &PL_sv_undef, 1, 0,
2019 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2020 }
2021
2022 /* This is a copy of the loop above for swash classes, though using the
2023 * FBC macro instead of being expanded out. Since we've loaded the
2024 * swash, we don't have to check for that each time through the loop */
2025 REXEC_FBC_UTF8_CLASS_SCAN(
2026 to_complement ^ cBOOL(_generic_utf8(
2027 classnum,
2028 s,
2029 swash_fetch(PL_utf8_swash_ptrs[classnum],
2030 (U8 *) s, TRUE))));
73104a1b
KW
2031 break;
2032
2033 case AHOCORASICKC:
2034 case AHOCORASICK:
2035 {
2036 DECL_TRIE_TYPE(c);
2037 /* what trie are we using right now */
2038 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2039 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2040 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2041
2042 const char *last_start = strend - trie->minlen;
6148ee25 2043#ifdef DEBUGGING
73104a1b 2044 const char *real_start = s;
6148ee25 2045#endif
73104a1b
KW
2046 STRLEN maxlen = trie->maxlen;
2047 SV *sv_points;
2048 U8 **points; /* map of where we were in the input string
2049 when reading a given char. For ASCII this
2050 is unnecessary overhead as the relationship
2051 is always 1:1, but for Unicode, especially
2052 case folded Unicode this is not true. */
2053 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2054 U8 *bitmap=NULL;
2055
2056
2057 GET_RE_DEBUG_FLAGS_DECL;
2058
2059 /* We can't just allocate points here. We need to wrap it in
2060 * an SV so it gets freed properly if there is a croak while
2061 * running the match */
2062 ENTER;
2063 SAVETMPS;
2064 sv_points=newSV(maxlen * sizeof(U8 *));
2065 SvCUR_set(sv_points,
2066 maxlen * sizeof(U8 *));
2067 SvPOK_on(sv_points);
2068 sv_2mortal(sv_points);
2069 points=(U8**)SvPV_nolen(sv_points );
2070 if ( trie_type != trie_utf8_fold
2071 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2072 {
2073 if (trie->bitmap)
2074 bitmap=(U8*)trie->bitmap;
2075 else
2076 bitmap=(U8*)ANYOF_BITMAP(c);
2077 }
2078 /* this is the Aho-Corasick algorithm modified a touch
2079 to include special handling for long "unknown char" sequences.
2080 The basic idea being that we use AC as long as we are dealing
2081 with a possible matching char, when we encounter an unknown char
2082 (and we have not encountered an accepting state) we scan forward
2083 until we find a legal starting char.
2084 AC matching is basically that of trie matching, except that when
2085 we encounter a failing transition, we fall back to the current
2086 states "fail state", and try the current char again, a process
2087 we repeat until we reach the root state, state 1, or a legal
2088 transition. If we fail on the root state then we can either
2089 terminate if we have reached an accepting state previously, or
2090 restart the entire process from the beginning if we have not.
2091
2092 */
2093 while (s <= last_start) {
2094 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2095 U8 *uc = (U8*)s;
2096 U16 charid = 0;
2097 U32 base = 1;
2098 U32 state = 1;
2099 UV uvc = 0;
2100 STRLEN len = 0;
2101 STRLEN foldlen = 0;
2102 U8 *uscan = (U8*)NULL;
2103 U8 *leftmost = NULL;
2104#ifdef DEBUGGING
2105 U32 accepted_word= 0;
786e8c11 2106#endif
73104a1b
KW
2107 U32 pointpos = 0;
2108
2109 while ( state && uc <= (U8*)strend ) {
2110 int failed=0;
2111 U32 word = aho->states[ state ].wordnum;
2112
2113 if( state==1 ) {
2114 if ( bitmap ) {
2115 DEBUG_TRIE_EXECUTE_r(
2116 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2117 dump_exec_pos( (char *)uc, c, strend, real_start,
2118 (char *)uc, utf8_target );
2119 PerlIO_printf( Perl_debug_log,
2120 " Scanning for legal start char...\n");
2121 }
2122 );
2123 if (utf8_target) {
2124 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2125 uc += UTF8SKIP(uc);
2126 }
2127 } else {
2128 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2129 uc++;
2130 }
786e8c11 2131 }
73104a1b 2132 s= (char *)uc;
07be1b83 2133 }
73104a1b
KW
2134 if (uc >(U8*)last_start) break;
2135 }
2136
2137 if ( word ) {
2138 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2139 if (!leftmost || lpos < leftmost) {
2140 DEBUG_r(accepted_word=word);
2141 leftmost= lpos;
7016d6eb 2142 }
73104a1b 2143 if (base==0) break;
7016d6eb 2144
73104a1b
KW
2145 }
2146 points[pointpos++ % maxlen]= uc;
2147 if (foldlen || uc < (U8*)strend) {
2148 REXEC_TRIE_READ_CHAR(trie_type, trie,
2149 widecharmap, uc,
2150 uscan, len, uvc, charid, foldlen,
2151 foldbuf, uniflags);
2152 DEBUG_TRIE_EXECUTE_r({
2153 dump_exec_pos( (char *)uc, c, strend,
2154 real_start, s, utf8_target);
2155 PerlIO_printf(Perl_debug_log,
2156 " Charid:%3u CP:%4"UVxf" ",
2157 charid, uvc);
2158 });
2159 }
2160 else {
2161 len = 0;
2162 charid = 0;
2163 }
07be1b83 2164
73104a1b
KW
2165
2166 do {
6148ee25 2167#ifdef DEBUGGING
73104a1b 2168 word = aho->states[ state ].wordnum;
6148ee25 2169#endif
73104a1b
KW
2170 base = aho->states[ state ].trans.base;
2171
2172 DEBUG_TRIE_EXECUTE_r({
2173 if (failed)
2174 dump_exec_pos( (char *)uc, c, strend, real_start,
2175 s, utf8_target );
2176 PerlIO_printf( Perl_debug_log,
2177 "%sState: %4"UVxf", word=%"UVxf,
2178 failed ? " Fail transition to " : "",
2179 (UV)state, (UV)word);
2180 });
2181 if ( base ) {
2182 U32 tmp;
2183 I32 offset;
2184 if (charid &&
2185 ( ((offset = base + charid
2186 - 1 - trie->uniquecharcount)) >= 0)
2187 && ((U32)offset < trie->lasttrans)
2188 && trie->trans[offset].check == state
2189 && (tmp=trie->trans[offset].next))
2190 {
2191 DEBUG_TRIE_EXECUTE_r(
2192 PerlIO_printf( Perl_debug_log," - legal\n"));
2193 state = tmp;
2194 break;
07be1b83
YO
2195 }
2196 else {
786e8c11 2197 DEBUG_TRIE_EXECUTE_r(
73104a1b 2198 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2199 failed = 1;
73104a1b 2200 state = aho->fail[state];
07be1b83 2201 }
07be1b83 2202 }
73104a1b
KW
2203 else {
2204 /* we must be accepting here */
2205 DEBUG_TRIE_EXECUTE_r(
2206 PerlIO_printf( Perl_debug_log," - accepting\n"));
2207 failed = 1;
2208 break;
786e8c11 2209 }
73104a1b
KW
2210 } while(state);
2211 uc += len;
2212 if (failed) {
2213 if (leftmost)
2214 break;
2215 if (!state) state = 1;
07be1b83 2216 }
73104a1b
KW
2217 }
2218 if ( aho->states[ state ].wordnum ) {
2219 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2220 if (!leftmost || lpos < leftmost) {
2221 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2222 leftmost = lpos;
07be1b83
YO
2223 }
2224 }
73104a1b
KW
2225 if (leftmost) {
2226 s = (char*)leftmost;
2227 DEBUG_TRIE_EXECUTE_r({
2228 PerlIO_printf(
2229 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2230 (UV)accepted_word, (IV)(s - real_start)
2231 );
2232 });
02d5137b 2233 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2234 FREETMPS;
2235 LEAVE;
2236 goto got_it;
2237 }
2238 s = HOPc(s,1);
2239 DEBUG_TRIE_EXECUTE_r({
2240 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2241 });
2242 } else {
2243 DEBUG_TRIE_EXECUTE_r(
2244 PerlIO_printf( Perl_debug_log,"No match.\n"));
2245 break;
2246 }
2247 }
2248 FREETMPS;
2249 LEAVE;
2250 }
2251 break;
2252 default:
2253 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2254 break;
2255 }
2256 return 0;
2257 got_it:
2258 return s;
6eb5f6b9
JH
2259}
2260
60165aa4
DM
2261/* set RX_SAVED_COPY, RX_SUBBEG etc.
2262 * flags have same meanings as with regexec_flags() */
2263
749f4950
DM
2264static void
2265S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2266 char *strbeg,
2267 char *strend,
2268 SV *sv,
2269 U32 flags,
2270 bool utf8_target)
2271{
2272 struct regexp *const prog = ReANY(rx);
2273
60165aa4
DM
2274 if (flags & REXEC_COPY_STR) {
2275#ifdef PERL_ANY_COW
2276 if (SvCANCOW(sv)) {
2277 if (DEBUG_C_TEST) {
2278 PerlIO_printf(Perl_debug_log,
2279 "Copy on write: regexp capture, type %d\n",
2280 (int) SvTYPE(sv));
2281 }
5411a0e5
DM
2282 /* Create a new COW SV to share the match string and store
2283 * in saved_copy, unless the current COW SV in saved_copy
2284 * is valid and suitable for our purpose */
2285 if (( prog->saved_copy
2286 && SvIsCOW(prog->saved_copy)
2287 && SvPOKp(prog->saved_copy)
2288 && SvIsCOW(sv)
2289 && SvPOKp(sv)
2290 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2291 {
5411a0e5
DM
2292 /* just reuse saved_copy SV */
2293 if (RXp_MATCH_COPIED(prog)) {
2294 Safefree(prog->subbeg);
2295 RXp_MATCH_COPIED_off(prog);
2296 }
2297 }
2298 else {
2299 /* create new COW SV to share string */
a76b0e90
DM
2300 RX_MATCH_COPY_FREE(rx);
2301 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2302 }
5411a0e5
DM
2303 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2304 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2305 prog->sublen = strend - strbeg;
2306 prog->suboffset = 0;
2307 prog->subcoffset = 0;
2308 } else
2309#endif
2310 {
99a90e59
FC
2311 SSize_t min = 0;
2312 SSize_t max = strend - strbeg;
ea3daa5d 2313 SSize_t sublen;
60165aa4
DM
2314
2315 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2316 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2317 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2318 ) { /* don't copy $' part of string */
2319 U32 n = 0;
2320 max = -1;
2321 /* calculate the right-most part of the string covered
2322 * by a capture. Due to look-ahead, this may be to
2323 * the right of $&, so we have to scan all captures */
2324 while (n <= prog->lastparen) {
2325 if (prog->offs[n].end > max)
2326 max = prog->offs[n].end;
2327 n++;
2328 }
2329 if (max == -1)
2330 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2331 ? prog->offs[0].start
2332 : 0;
2333 assert(max >= 0 && max <= strend - strbeg);
2334 }
2335
2336 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2337 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2338 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2339 ) { /* don't copy $` part of string */
2340 U32 n = 0;
2341 min = max;
2342 /* calculate the left-most part of the string covered
2343 * by a capture. Due to look-behind, this may be to
2344 * the left of $&, so we have to scan all captures */
2345 while (min && n <= prog->lastparen) {
2346 if ( prog->offs[n].start != -1
2347 && prog->offs[n].start < min)
2348 {
2349 min = prog->offs[n].start;
2350 }
2351 n++;
2352 }
2353 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2354 && min > prog->offs[0].end
2355 )
2356 min = prog->offs[0].end;
2357
2358 }
2359
2360 assert(min >= 0 && min <= max && min <= strend - strbeg);
2361 sublen = max - min;
2362
2363 if (RX_MATCH_COPIED(rx)) {
2364 if (sublen > prog->sublen)
2365 prog->subbeg =
2366 (char*)saferealloc(prog->subbeg, sublen+1);
2367 }
2368 else
2369 prog->subbeg = (char*)safemalloc(sublen+1);
2370 Copy(strbeg + min, prog->subbeg, sublen, char);
2371 prog->subbeg[sublen] = '\0';
2372 prog->suboffset = min;
2373 prog->sublen = sublen;
2374 RX_MATCH_COPIED_on(rx);
2375 }
2376 prog->subcoffset = prog->suboffset;
2377 if (prog->suboffset && utf8_target) {
2378 /* Convert byte offset to chars.
2379 * XXX ideally should only compute this if @-/@+
2380 * has been seen, a la PL_sawampersand ??? */
2381
2382 /* If there's a direct correspondence between the
2383 * string which we're matching and the original SV,
2384 * then we can use the utf8 len cache associated with
2385 * the SV. In particular, it means that under //g,
2386 * sv_pos_b2u() will use the previously cached
2387 * position to speed up working out the new length of
2388 * subcoffset, rather than counting from the start of
2389 * the string each time. This stops
2390 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2391 * from going quadratic */
2392 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2393 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2394 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2395 else
2396 prog->subcoffset = utf8_length((U8*)strbeg,
2397 (U8*)(strbeg+prog->suboffset));
2398 }
2399 }
2400 else {
2401 RX_MATCH_COPY_FREE(rx);
2402 prog->subbeg = strbeg;
2403 prog->suboffset = 0;
2404 prog->subcoffset = 0;
2405 prog->sublen = strend - strbeg;
2406 }
2407}
2408
2409
2410
fae667d5 2411
6eb5f6b9
JH
2412/*
2413 - regexec_flags - match a regexp against a string
2414 */
2415I32
5aaab254 2416Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2417 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2418/* stringarg: the point in the string at which to begin matching */
2419/* strend: pointer to null at end of string */
2420/* strbeg: real beginning of string */
2421/* minend: end of match must be >= minend bytes after stringarg. */
2422/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2423 * itself is accessed via the pointers above */
2424/* data: May be used for some additional optimizations.
d058ec57 2425 Currently unused. */
a340edde 2426/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2427
6eb5f6b9 2428{
97aff369 2429 dVAR;
8d919b0a 2430 struct regexp *const prog = ReANY(rx);
5aaab254 2431 char *s;
eb578fdb 2432 regnode *c;
03c83e26 2433 char *startpos;
ea3daa5d
FC
2434 SSize_t minlen; /* must match at least this many chars */
2435 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2436 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2437 I32 multiline;
f8fc2ecf 2438 RXi_GET_DECL(prog,progi);
02d5137b
DM
2439 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2440 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2441 regexp_paren_pair *swap = NULL;
006f26b2 2442 I32 oldsave;
a3621e74
YO
2443 GET_RE_DEBUG_FLAGS_DECL;
2444
7918f24d 2445 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2446 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2447
2448 /* Be paranoid... */
03c83e26 2449 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2450 Perl_croak(aTHX_ "NULL regexp parameter");
2451 return 0;
2452 }
2453
6c3fea77 2454 DEBUG_EXECUTE_r(
03c83e26 2455 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2456 "Matching");
2457 );
8adc0f72 2458
b342a604
DM
2459 startpos = stringarg;
2460
58430ea8 2461 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2462 MAGIC *mg;
2463
fef7148b
DM
2464 /* set reginfo->ganch, the position where \G can match */
2465
2466 reginfo->ganch =
2467 (flags & REXEC_IGNOREPOS)
2468 ? stringarg /* use start pos rather than pos() */
2469 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2470 /* Defined pos(): */
2471 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2472 : strbeg; /* pos() not defined; use start of string */
2473
2474 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2475 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2476
03c83e26
DM
2477 /* in the presence of \G, we may need to start looking earlier in
2478 * the string than the suggested start point of stringarg:
0b2c2a84 2479 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2480 * offset, such as
2481 * /..\G/: gofs = 2
2482 * /ab|c\G/: gofs = 1
2483 * or if the minimum offset isn't known, then we have to go back
2484 * to the start of the string, e.g. /w+\G/
2485 */
2bfbe302 2486
8e1490ee 2487 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2488 startpos = reginfo->ganch - prog->gofs;
2489 if (startpos <
2490 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2491 {
2492 DEBUG_r(PerlIO_printf(Perl_debug_log,
2493 "fail: ganch-gofs before earliest possible start\n"));
2494 return 0;
2495 }
2496 }
2497 else if (prog->gofs) {
b342a604
DM
2498 if (startpos - prog->gofs < strbeg)
2499 startpos = strbeg;
2500 else
2501 startpos -= prog->gofs;
03c83e26 2502 }
58430ea8 2503 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2504 startpos = strbeg;
03c83e26
DM
2505 }
2506
2507 minlen = prog->minlen;
b342a604 2508 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2509 DEBUG_r(PerlIO_printf(Perl_debug_log,
2510 "Regex match can't succeed, so not even tried\n"));
2511 return 0;
2512 }
2513
63a3746a
DM
2514 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2515 * which will call destuctors to reset PL_regmatch_state, free higher
2516 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2517 * regmatch_info_aux_eval */
2518
2519 oldsave = PL_savestack_ix;
2520
dfa77d06
DM
2521 s = startpos;
2522
e322109a 2523 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2524 && !(flags & REXEC_CHECKED))
2525 {
dfa77d06 2526 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2527 flags, NULL);
dfa77d06 2528 if (!s)
7fadf4a7
DM
2529 return 0;
2530
e322109a 2531 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2532 /* we can match based purely on the result of INTUIT.
2533 * Set up captures etc just for $& and $-[0]
2534 * (an intuit-only match wont have $1,$2,..) */
2535 assert(!prog->nparens);
d5e7783a
DM
2536
2537 /* s/// doesn't like it if $& is earlier than where we asked it to
2538 * start searching (which can happen on something like /.\G/) */
2539 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2540 && (s < stringarg))
2541 {
2542 /* this should only be possible under \G */
58430ea8 2543 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2545 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2546 goto phooey;
2547 }
2548
7fadf4a7
DM
2549 /* match via INTUIT shouldn't have any captures.
2550 * Let @-, @+, $^N know */
2551 prog->lastparen = prog->lastcloseparen = 0;
2552 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2553 prog->offs[0].start = s - strbeg;
2554 prog->offs[0].end = utf8_target
2555 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2556 : s - strbeg + prog->minlenret;
7fadf4a7 2557 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2558 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2559 strbeg, strend,
2560 sv, flags, utf8_target);
2561
7fadf4a7
DM
2562 return 1;
2563 }
2564 }
2565
6c3fea77 2566 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2567
dfa77d06 2568 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2569 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2570 "String too short [regexec_flags]...\n"));
2571 goto phooey;
1aa99e6b 2572 }
1de06328 2573
6eb5f6b9 2574 /* Check validity of program. */
f8fc2ecf 2575 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2576 Perl_croak(aTHX_ "corrupted regexp program");
2577 }
2578
6c3fea77
DM
2579 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2580 reginfo->intuit = 0;
2581 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2582 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2583 reginfo->warned = FALSE;
9d9163fb 2584 reginfo->strbeg = strbeg;
02d5137b 2585 reginfo->sv = sv;
1cb48e53 2586 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2587 reginfo->strend = strend;
6eb5f6b9 2588 /* see how far we have to get to not match where we matched before */
fe3974be 2589 reginfo->till = stringarg + minend;
6eb5f6b9 2590
60779a30 2591 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2592 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2593 S_cleanup_regmatch_info_aux has executed (registered by
2594 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2595 magic belonging to this SV.
2596 Not newSVsv, either, as it does not COW.
2597 */
60779a30 2598 assert(!IS_PADGV(sv));
82c23608 2599 reginfo->sv = newSV(0);
4cba5ac0 2600 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2601 SAVEFREESV(reginfo->sv);
2602 }
2603
331b2dcc
DM
2604 /* reserve next 2 or 3 slots in PL_regmatch_state:
2605 * slot N+0: may currently be in use: skip it
2606 * slot N+1: use for regmatch_info_aux struct
2607 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2608 * slot N+3: ready for use by regmatch()
2609 */
bf2039a9 2610
331b2dcc
DM
2611 {
2612 regmatch_state *old_regmatch_state;
2613 regmatch_slab *old_regmatch_slab;
2614 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2615
2616 /* on first ever match, allocate first slab */
2617 if (!PL_regmatch_slab) {
2618 Newx(PL_regmatch_slab, 1, regmatch_slab);
2619 PL_regmatch_slab->prev = NULL;
2620 PL_regmatch_slab->next = NULL;
2621 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2622 }
bf2039a9 2623
331b2dcc
DM
2624 old_regmatch_state = PL_regmatch_state;
2625 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2626
331b2dcc
DM
2627 for (i=0; i <= max; i++) {
2628 if (i == 1)
2629 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2630 else if (i ==2)
2631 reginfo->info_aux_eval =
2632 reginfo->info_aux->info_aux_eval =
2633 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2634
331b2dcc
DM
2635 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2636 PL_regmatch_state = S_push_slab(aTHX);
2637 }
bf2039a9 2638
331b2dcc
DM
2639 /* note initial PL_regmatch_state position; at end of match we'll
2640 * pop back to there and free any higher slabs */
bf2039a9 2641
331b2dcc
DM
2642 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2643 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2644 reginfo->info_aux->poscache = NULL;
bf2039a9 2645
331b2dcc 2646 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2647
331b2dcc
DM
2648 if ((prog->extflags & RXf_EVAL_SEEN))
2649 S_setup_eval_state(aTHX_ reginfo);
2650 else
2651 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2652 }
d3aa529c 2653
6eb5f6b9 2654 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2655
288b8c02 2656 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2657 /* We have to be careful. If the previous successful match
2658 was from this regex we don't want a subsequent partially
2659 successful match to clobber the old results.
2660 So when we detect this possibility we add a swap buffer
d8da0584
KW
2661 to the re, and switch the buffer each match. If we fail,
2662 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2663 */
2664 swap = prog->offs;
2665 /* do we need a save destructor here for eval dies? */
2666 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2667 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2668 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2669 PTR2UV(prog),
2670 PTR2UV(swap),
2671 PTR2UV(prog->offs)
2672 ));
c74340f9 2673 }
6eb5f6b9
JH
2674
2675 /* Simplest case: anchored match need be tried only once. */
2676 /* [unless only anchor is BOL and multiline is set] */
8e1490ee 2677 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2678 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2679 goto got_it;
8e1490ee 2680 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2681 {
2682 char *end;
2683
2684 if (minlen)
2685 dontbother = minlen - 1;
1aa99e6b 2686 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2687 /* for multiline we only have to try after newlines */
33b8afdf 2688 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2689 /* because of the goto we can not easily reuse the macros for bifurcating the
2690 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2691 if (utf8_target) {
2692 if (s == startpos)
2693 goto after_try_utf8;
2694 while (1) {
02d5137b 2695 if (regtry(reginfo, &s)) {
92f3d482
YO
2696 goto got_it;
2697 }
2698 after_try_utf8:
2699 if (s > end) {
2700 goto phooey;
2701 }
2702 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2703 s = re_intuit_start(rx, sv, strbeg,
2704 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2705 if (!s) {
2706 goto phooey;
2707 }
2708 }
2709 else {
2710 s += UTF8SKIP(s);
2711 }
2712 }
2713 } /* end search for check string in unicode */
2714 else {
2715 if (s == startpos) {
2716 goto after_try_latin;
2717 }
2718 while (1) {
02d5137b 2719 if (regtry(reginfo, &s)) {
92f3d482
YO
2720 goto got_it;
2721 }
2722 after_try_latin:
2723 if (s > end) {
2724 goto phooey;
2725 }
2726 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2727 s = re_intuit_start(rx, sv, strbeg,
2728 s + 1, strend, flags, NULL);
92f3d482
YO
2729 if (!s) {
2730 goto phooey;
2731 }
2732 }
2733 else {
2734 s++;
2735 }
2736 }
2737 } /* end search for check string in latin*/
2738 } /* end search for check string */
2739 else { /* search for newline */
2740 if (s > startpos) {
2741 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2742 s--;
92f3d482 2743 }
21eede78
YO
2744 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2745 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2746 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2747 if (regtry(reginfo, &s))
6eb5f6b9
JH
2748 goto got_it;
2749 }
92f3d482
YO
2750 }
2751 } /* end search for newline */
2752 } /* end anchored/multiline check string search */
6eb5f6b9 2753 goto phooey;
a8430a8b 2754 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2755 {
a8430a8b
YO
2756 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2757 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2758 /* For anchored \G, the only position it can match from is
2759 * (ganch-gofs); we already set startpos to this above; if intuit
2760 * moved us on from there, we can't possibly succeed */
2761 assert(startpos == reginfo->ganch - prog->gofs);
2762 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2763 goto got_it;
2764 goto phooey;
2765 }
2766
2767 /* Messy cases: unanchored match. */
bbe252da 2768 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2769 /* we have /x+whatever/ */
984e6dd1 2770 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2771 char ch;
bf93d4cc
GS
2772#ifdef DEBUGGING
2773 int did_match = 0;
2774#endif
f2ed9b32 2775 if (utf8_target) {
7e0d5ad7
KW
2776 if (! prog->anchored_utf8) {
2777 to_utf8_substr(prog);
2778 }
2779 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2780 REXEC_FBC_SCAN(
6eb5f6b9 2781 if (*s == ch) {
a3621e74 2782 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2783 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2784 s += UTF8SKIP(s);
2785 while (s < strend && *s == ch)
2786 s += UTF8SKIP(s);
2787 }
4cadc6a9 2788 );
7e0d5ad7 2789
6eb5f6b9
JH
2790 }
2791 else {
7e0d5ad7
KW
2792 if (! prog->anchored_substr) {
2793 if (! to_byte_substr(prog)) {
6b54ddc5 2794 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2795 }
2796 }
2797 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2798 REXEC_FBC_SCAN(
6eb5f6b9 2799 if (*s == ch) {
a3621e74 2800 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2801 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2802 s++;
2803 while (s < strend && *s == ch)
2804 s++;
2805 }
4cadc6a9 2806 );
6eb5f6b9 2807 }
a3621e74 2808 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2809 PerlIO_printf(Perl_debug_log,
b7953727
JH
2810 "Did not find anchored character...\n")
2811 );
6eb5f6b9 2812 }
a0714e2c
SS
2813 else if (prog->anchored_substr != NULL
2814 || prog->anchored_utf8 != NULL
2815 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2816 && prog->float_max_offset < strend - s)) {
2817 SV *must;
ea3daa5d
FC
2818 SSize_t back_max;
2819 SSize_t back_min;
33b8afdf 2820 char *last;
6eb5f6b9 2821 char *last1; /* Last position checked before */
bf93d4cc
GS
2822#ifdef DEBUGGING
2823 int did_match = 0;
2824#endif
33b8afdf 2825 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2826 if (utf8_target) {
2827 if (! prog->anchored_utf8) {
2828 to_utf8_substr(prog);
2829 }
2830 must = prog->anchored_utf8;
2831 }
2832 else {
2833 if (! prog->anchored_substr) {
2834 if (! to_byte_substr(prog)) {
6b54ddc5 2835 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2836 }
2837 }
2838 must = prog->anchored_substr;
2839 }
33b8afdf
JH
2840 back_max = back_min = prog->anchored_offset;
2841 } else {
7e0d5ad7
KW
2842 if (utf8_target) {
2843 if (! prog->float_utf8) {
2844 to_utf8_substr(prog);
2845 }
2846 must = prog->float_utf8;
2847 }
2848 else {
2849 if (! prog->float_substr) {
2850 if (! to_byte_substr(prog)) {
6b54ddc5 2851 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2852 }
2853 }
2854 must = prog->float_substr;
2855 }
33b8afdf
JH
2856 back_max = prog->float_max_offset;
2857 back_min = prog->float_min_offset;
2858 }
1de06328 2859
1de06328
YO
2860 if (back_min<0) {
2861 last = strend;
2862 } else {
2863 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2864 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2865 - (SvTAIL(must) != 0) + back_min), strbeg);
2866 }
9d9163fb 2867 if (s > reginfo->strbeg)
6eb5f6b9
JH
2868 last1 = HOPc(s, -1);
2869 else
2870 last1 = s - 1; /* bogus */
2871
a0288114 2872 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2873 check_substr==must. */
bf05793b 2874 dontbother = 0;
6eb5f6b9
JH
2875 strend = HOPc(strend, -dontbother);
2876 while ( (s <= last) &&
e50d57d4 2877 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2878 (unsigned char*)strend, must,
c33e64f0 2879 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2880 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2881 if (HOPc(s, -back_max) > last1) {
2882 last1 = HOPc(s, -back_min);
2883 s = HOPc(s, -back_max);
2884 }
2885 else {
9d9163fb
DM
2886 char * const t = (last1 >= reginfo->strbeg)
2887 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2888
2889 last1 = HOPc(s, -back_min);
52657f30 2890 s = t;
6eb5f6b9 2891 }
f2ed9b32 2892 if (utf8_target) {
6eb5f6b9 2893 while (s <= last1) {
02d5137b 2894 if (regtry(reginfo, &s))
6eb5f6b9 2895 goto got_it;
7016d6eb
DM
2896 if (s >= last1) {
2897 s++; /* to break out of outer loop */
2898 break;
2899 }
2900 s += UTF8SKIP(s);
6eb5f6b9
JH
2901 }
2902 }
2903 else {
2904 while (s <= last1) {
02d5137b 2905 if (regtry(reginfo, &s))
6eb5f6b9
JH
2906 goto got_it;
2907 s++;
2908 }
2909 }
2910 }
ab3bbdeb 2911 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2912 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2913 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2914 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2915 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2916 ? "anchored" : "floating"),
ab3bbdeb
YO
2917 quoted, RE_SV_TAIL(must));
2918 });
6eb5f6b9
JH
2919 goto phooey;
2920 }
f8fc2ecf 2921 else if ( (c = progi->regstclass) ) {
f14c76ed 2922 if (minlen) {
f8fc2ecf 2923 const OPCODE op = OP(progi->regstclass);
66e933ab 2924 /* don't bother with what can't match */
786e8c11 2925 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2926 strend = HOPc(strend, -(minlen - 1));
2927 }
a3621e74 2928 DEBUG_EXECUTE_r({
be8e71aa 2929 SV * const prop = sv_newmortal();
2395827c 2930 regprop(prog, prop, c, reginfo);
0df25f3d 2931 {
f2ed9b32 2932 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2933 s,strend-s,60);
0df25f3d 2934 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2935 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2936 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2937 quoted, (int)(strend - s));
0df25f3d 2938 }
ffc61ed2 2939 });
f9176b44 2940 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2941 goto got_it;
07be1b83 2942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2943 }
2944 else {
2945 dontbother = 0;
a0714e2c 2946 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2947 /* Trim the end. */
6af40bd7 2948 char *last= NULL;
33b8afdf 2949 SV* float_real;
c33e64f0
FC
2950 STRLEN len;
2951 const char *little;
33b8afdf 2952
7e0d5ad7
KW
2953 if (utf8_target) {
2954 if (! prog->float_utf8) {
2955 to_utf8_substr(prog);
2956 }
2957 float_real = prog->float_utf8;
2958 }
2959 else {
2960 if (! prog->float_substr) {
2961 if (! to_byte_substr(prog)) {
6b54ddc5 2962 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2963 }
2964 }
2965 float_real = prog->float_substr;
2966 }
d6a28714 2967
c33e64f0
FC
2968 little = SvPV_const(float_real, len);
2969 if (SvTAIL(float_real)) {
7f18ad16
KW
2970 /* This means that float_real contains an artificial \n on
2971 * the end due to the presence of something like this:
2972 * /foo$/ where we can match both "foo" and "foo\n" at the
2973 * end of the string. So we have to compare the end of the
2974 * string first against the float_real without the \n and
2975 * then against the full float_real with the string. We
2976 * have to watch out for cases where the string might be
2977 * smaller than the float_real or the float_real without
2978 * the \n. */
1a13b075
YO
2979 char *checkpos= strend - len;
2980 DEBUG_OPTIMISE_r(
2981 PerlIO_printf(Perl_debug_log,
2982 "%sChecking for float_real.%s\n",
2983 PL_colors[4], PL_colors[5]));
2984 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2985 /* can't match, even if we remove the trailing \n
2986 * string is too short to match */
1a13b075
YO
2987 DEBUG_EXECUTE_r(
2988 PerlIO_printf(Perl_debug_log,
2989 "%sString shorter than required trailing substring, cannot match.%s\n",
2990 PL_colors[4], PL_colors[5]));
2991 goto phooey;
2992 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2993 /* can match, the end of the string matches without the
2994 * "\n" */
1a13b075
YO
2995 last = checkpos + 1;
2996 } else if (checkpos < strbeg) {
7f18ad16
KW
2997 /* cant match, string is too short when the "\n" is
2998 * included */
1a13b075
YO
2999 DEBUG_EXECUTE_r(
3000 PerlIO_printf(Perl_debug_log,
3001 "%sString does not contain required trailing substring, cannot match.%s\n",
3002 PL_colors[4], PL_colors[5]));
3003 goto phooey;
3004 } else if (!multiline) {
7f18ad16
KW
3005 /* non multiline match, so compare with the "\n" at the
3006 * end of the string */
1a13b075
YO
3007 if (memEQ(checkpos, little, len)) {
3008 last= checkpos;
3009 } else {
3010 DEBUG_EXECUTE_r(
3011 PerlIO_printf(Perl_debug_log,
3012 "%sString does not contain required trailing substring, cannot match.%s\n",
3013 PL_colors[4], PL_colors[5]));
3014 goto phooey;
3015 }
3016 } else {
7f18ad16
KW
3017 /* multiline match, so we have to search for a place
3018 * where the full string is located */
d6a28714 3019 goto find_last;
1a13b075 3020 }
c33e64f0 3021 } else {
d6a28714 3022 find_last:
9041c2e3 3023 if (len)
d6a28714 3024 last = rninstr(s, strend, little, little + len);
b8c5462f 3025 else
a0288114 3026 last = strend; /* matching "$" */
b8c5462f 3027 }
6af40bd7 3028 if (!last) {
7f18ad16
KW
3029 /* at one point this block contained a comment which was
3030 * probably incorrect, which said that this was a "should not
3031 * happen" case. Even if it was true when it was written I am
3032 * pretty sure it is not anymore, so I have removed the comment
3033 * and replaced it with this one. Yves */
6bda09f9
YO
3034 DEBUG_EXECUTE_r(
3035 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
3036 "String does not contain required substring, cannot match.\n"
3037 ));
3038 goto phooey;
bf93d4cc 3039 }
d6a28714
JH
3040 dontbother = strend - last + prog->float_min_offset;
3041 }
3042 if (minlen && (dontbother < minlen))
3043 dontbother = minlen - 1;
3044 strend -= dontbother; /* this one's always in bytes! */
3045 /* We don't know much -- general case. */
f2ed9b32 3046 if (utf8_target) {
d6a28714 3047 for (;;) {
02d5137b 3048 if (regtry(reginfo, &s))
d6a28714
JH
3049 goto got_it;
3050 if (s >= strend)
3051 break;
b8c5462f 3052 s += UTF8SKIP(s);
d6a28714
JH
3053 };
3054 }
3055 else {
3056 do {
02d5137b 3057 if (regtry(reginfo, &s))
d6a28714
JH
3058 goto got_it;
3059 } while (s++ < strend);
3060 }
3061 }
3062
3063 /* Failure. */
3064 goto phooey;
3065
3066got_it:
d5e7783a
DM
3067 /* s/// doesn't like it if $& is earlier than where we asked it to
3068 * start searching (which can happen on something like /.\G/) */
3069 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3070 && (prog->offs[0].start < stringarg - strbeg))
3071 {
3072 /* this should only be possible under \G */
58430ea8 3073 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3074 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3075 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3076 goto phooey;
3077 }
3078
495f47a5
DM
3079 DEBUG_BUFFERS_r(
3080 if (swap)
3081 PerlIO_printf(Perl_debug_log,
3082 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3083 PTR2UV(prog),
3084 PTR2UV(swap)
3085 );
3086 );
e9105d30 3087 Safefree(swap);
d6a28714 3088
bf2039a9
DM
3089 /* clean up; this will trigger destructors that will free all slabs
3090 * above the current one, and cleanup the regmatch_info_aux
3091 * and regmatch_info_aux_eval sructs */
8adc0f72 3092
006f26b2
DM
3093 LEAVE_SCOPE(oldsave);
3094
5daac39c
NC
3095 if (RXp_PAREN_NAMES(prog))
3096 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3097
0254aed9
DM
3098 RX_MATCH_UTF8_set(rx, utf8_target);
3099
d6a28714 3100 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3101 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3102 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3103 strbeg, reginfo->strend,
3104 sv, flags, utf8_target);
9041c2e3 3105
d6a28714
JH
3106 return 1;
3107
3108phooey:
a3621e74 3109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3110 PL_colors[4], PL_colors[5]));
8adc0f72 3111
bf2039a9
DM
3112 /* clean up; this will trigger destructors that will free all slabs
3113 * above the current one, and cleanup the regmatch_info_aux
3114 * and regmatch_info_aux_eval sructs */
8adc0f72 3115
006f26b2
DM
3116 LEAVE_SCOPE(oldsave);
3117
e9105d30 3118 if (swap) {
c74340f9 3119 /* we failed :-( roll it back */
495f47a5
DM
3120 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3121 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3122 PTR2UV(prog),
3123 PTR2UV(prog->offs),
3124 PTR2UV(swap)
3125 ));
e9105d30
GG
3126 Safefree(prog->offs);
3127 prog->offs = swap;
3128 }
d6a28714
JH
3129 return 0;
3130}
3131
6bda09f9 3132
b3d298be 3133/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3134 * Do inc before dec, in case old and new rex are the same */
baa60164 3135#define SET_reg_curpm(Re2) \
bf2039a9 3136 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3137 (void)ReREFCNT_inc(Re2); \
3138 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3139 PM_SETRE((PL_reg_curpm), (Re2)); \
3140 }
3141
3142
d6a28714
JH
3143/*
3144 - regtry - try match at specific point
3145 */
3146STATIC I32 /* 0 failure, 1 success */
f73aaa43 3147S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3148{
97aff369 3149 dVAR;
d6a28714 3150 CHECKPOINT lastcp;
288b8c02 3151 REGEXP *const rx = reginfo->prog;
8d919b0a 3152 regexp *const prog = ReANY(rx);
99a90e59 3153 SSize_t result;
f8fc2ecf 3154 RXi_GET_DECL(prog,progi);
a3621e74 3155 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3156
3157 PERL_ARGS_ASSERT_REGTRY;
3158
24b23f37 3159 reginfo->cutpoint=NULL;
d6a28714 3160
9d9163fb 3161 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3162 prog->lastparen = 0;
03994de8 3163 prog->lastcloseparen = 0;
d6a28714
JH
3164
3165 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3166 to do this again and again, prog->lastparen should take care of
3dd2943c 3167 this! --ilya*/
dafc8851
JH
3168
3169 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3170 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3171 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3172 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3173 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3174 * Meanwhile, this code *is* needed for the
daf18116
JH
3175 * above-mentioned test suite tests to succeed. The common theme
3176 * on those tests seems to be returning null fields from matches.
225593e1 3177 * --jhi updated by dapm */
dafc8851 3178#if 1
d6a28714 3179 if (prog->nparens) {
b93070ed 3180 regexp_paren_pair *pp = prog->offs;
eb578fdb 3181 I32 i;
b93070ed 3182 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3183 ++pp;
3184 pp->start = -1;
3185 pp->end = -1;
d6a28714
JH
3186 }
3187 }
dafc8851 3188#endif
02db2b7b 3189 REGCP_SET(lastcp);
f73aaa43
DM
3190 result = regmatch(reginfo, *startposp, progi->program + 1);
3191 if (result != -1) {
3192 prog->offs[0].end = result;
d6a28714
JH
3193 return 1;
3194 }
24b23f37 3195 if (reginfo->cutpoint)
f73aaa43 3196 *startposp= reginfo->cutpoint;
02db2b7b 3197 REGCP_UNWIND(lastcp);
d6a28714
JH
3198 return 0;
3199}
3200
02db2b7b 3201
8ba1375e
MJD
3202#define sayYES goto yes
3203#define sayNO goto no
262b90c4 3204#define sayNO_SILENT goto no_silent
8ba1375e 3205
f9f4320a
YO
3206/* we dont use STMT_START/END here because it leads to
3207 "unreachable code" warnings, which are bogus, but distracting. */
3208#define CACHEsayNO \
c476f425 3209 if (ST.cache_mask) \
2ac8ff4b 3210 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3211 sayNO
3298f257 3212
a3621e74 3213/* this is used to determine how far from the left messages like
265c4333
YO
3214 'failed...' are printed. It should be set such that messages
3215 are inline with the regop output that created them.
a3621e74 3216*/
265c4333 3217#define REPORT_CODE_OFF 32
a3621e74
YO
3218
3219
40a82448
DM
3220#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3221#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3222#define CHRTEST_NOT_A_CP_1 -999
3223#define CHRTEST_NOT_A_CP_2 -998
9e137952 3224
5d9a96ca
DM
3225/* grab a new slab and return the first slot in it */
3226
3227STATIC regmatch_state *
3228S_push_slab(pTHX)
3229{
a35a87e7 3230#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3231 dMY_CXT;
3232#endif
5d9a96ca
DM
3233 regmatch_slab *s = PL_regmatch_slab->next;
3234 if (!s) {
3235 Newx(s, 1, regmatch_slab);
3236 s->prev = PL_regmatch_slab;
3237 s->next = NULL;
3238 PL_regmatch_slab->next = s;
3239 }
3240 PL_regmatch_slab = s;
86545054 3241 return SLAB_FIRST(s);
5d9a96ca 3242}
5b47454d 3243
95b24440 3244
40a82448
DM
3245/* push a new state then goto it */
3246
4d5016e5
DM
3247#define PUSH_STATE_GOTO(state, node, input) \
3248 pushinput = input; \
40a82448
DM
3249 scan = node; \
3250 st->resume_state = state; \
3251 goto push_state;
3252
3253/* push a new state with success backtracking, then goto it */
3254
4d5016e5
DM
3255#define PUSH_YES_STATE_GOTO(state, node, input) \
3256 pushinput = input; \
40a82448
DM
3257 scan = node; \
3258 st->resume_state = state; \
3259 goto push_yes_state;
3260
aa283a38 3261
aa283a38 3262
4d5016e5 3263
d6a28714 3264/*
95b24440 3265
bf1f174e
DM
3266regmatch() - main matching routine
3267
3268This is basically one big switch statement in a loop. We execute an op,
3269set 'next' to point the next op, and continue. If we come to a point which
3270we may need to backtrack to on failure such as (A|B|C), we push a
3271backtrack state onto the backtrack stack. On failure, we pop the top
3272state, and re-enter the loop at the state indicated. If there are no more
3273states to pop, we return failure.
3274
3275Sometimes we also need to backtrack on success; for example /A+/, where
3276after successfully matching one A, we need to go back and try to
3277match another one; similarly for lookahead assertions: if the assertion
3278completes successfully, we backtrack to the state just before the assertion
3279and then carry on. In these cases, the pushed state is marked as
3280'backtrack on success too'. This marking is in fact done by a chain of
3281pointers, each pointing to the previous 'yes' state. On success, we pop to
3282the nearest yes state, discarding any intermediate failure-only states.
3283Sometimes a yes state is pushed just to force some cleanup code to be
3284called at the end of a successful match or submatch; e.g. (??{$re}) uses
3285it to free the inner regex.
3286
3287Note that failure backtracking rewinds the cursor position, while
3288success backtracking leaves it alone.
3289
3290A pattern is complete when the END op is executed, while a subpattern
3291such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3292ops trigger the "pop to last yes state if any, otherwise return true"
3293behaviour.
3294
3295A common convention in this function is to use A and B to refer to the two
3296subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3297the subpattern to be matched possibly multiple times, while B is the entire
3298rest of the pattern. Variable and state names reflect this convention.
3299
3300The states in the main switch are the union of ops and failure/success of
3301substates associated with with that op. For example, IFMATCH is the op
3302that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3303'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3304successfully matched A and IFMATCH_A_fail is a state saying that we have
3305just failed to match A. Resume states always come in pairs. The backtrack
3306state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3307at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3308on success or failure.
3309
3310The struct that holds a backtracking state is actually a big union, with
3311one variant for each major type of op. The variable st points to the
3312top-most backtrack struct. To make the code clearer, within each
3313block of code we #define ST to alias the relevant union.
3314
3315Here's a concrete example of a (vastly oversimplified) IFMATCH
3316implementation:
3317
3318 switch (state) {
3319 ....
3320
3321#define ST st->u.ifmatch
3322
3323 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3324 ST.foo = ...; // some state we wish to save
95b24440 3325 ...
bf1f174e
DM
3326 // push a yes backtrack state with a resume value of
3327 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3328 // first node of A:
4d5016e5 3329 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3330 // NOTREACHED
3331
3332 case IFMATCH_A: // we have successfully executed A; now continue with B
3333 next = B;
3334 bar = ST.foo; // do something with the preserved value
3335 break;
3336
3337 case IFMATCH_A_fail: // A failed, so the assertion failed
3338 ...; // do some housekeeping, then ...
3339 sayNO; // propagate the failure
3340
3341#undef ST
95b24440 3342
bf1f174e
DM
3343 ...
3344 }
95b24440 3345
bf1f174e
DM
3346For any old-timers reading this who are familiar with the old recursive
3347approach, the code above is equivalent to:
95b24440 3348
bf1f174e
DM
3349 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3350 {
3351 int foo = ...
95b24440 3352 ...
bf1f174e
DM
3353 if (regmatch(A)) {
3354 next = B;
3355 bar = foo;
3356 break;
95b24440 3357 }
bf1f174e
DM
3358 ...; // do some housekeeping, then ...
3359 sayNO; // propagate the failure
95b24440 3360 }
bf1f174e
DM
3361
3362The topmost backtrack state, pointed to by st, is usually free. If you
3363want to claim it, populate any ST.foo fields in it with values you wish to
3364save, then do one of
3365
4d5016e5
DM
3366 PUSH_STATE_GOTO(resume_state, node, newinput);
3367 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3368
3369which sets that backtrack state's resume value to 'resume_state', pushes a
3370new free entry to the top of the backtrack stack, then goes to 'node'.
3371On backtracking, the free slot is popped, and the saved state becomes the
3372new free state. An ST.foo field in this new top state can be temporarily
3373accessed to retrieve values, but once the main loop is re-entered, it
3374becomes available for reuse.
3375
3376Note that the depth of the backtrack stack constantly increases during the
3377left-to-right execution of the pattern, rather than going up and down with
3378the pattern nesting. For example the stack is at its maximum at Z at the
3379end of the pattern, rather than at X in the following:
3380
3381 /(((X)+)+)+....(Y)+....Z/
3382
3383The only exceptions to this are lookahead/behind assertions and the cut,
3384(?>A), which pop all the backtrack states associated with A before
3385continuing.
3386
486ec47a 3387Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3388PL_regmatch_state and st always point to the currently active state,
3389and PL_regmatch_slab points to the slab currently containing
3390PL_regmatch_state. The first time regmatch() is called, the first slab is
3391allocated, and is never freed until interpreter destruction. When the slab
3392is full, a new one is allocated and chained to the end. At exit from
3393regmatch(), slabs allocated since entry are freed.
3394
3395*/
95b24440 3396
40a82448 3397
5bc10b2c 3398#define DEBUG_STATE_pp(pp) \
265c4333 3399 DEBUG_STATE_r({ \
baa60164 3400 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3401 PerlIO_printf(Perl_debug_log, \
5d458dd8 3402 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3403 depth*2, "", \
baa60164 3404 PL_reg_name[st->resume_state], \
5d458dd8
YO
3405 ((st==yes_state||st==mark_state) ? "[" : ""), \
3406 ((st==yes_state) ? "Y" : ""), \
3407 ((st==mark_state) ? "M" : ""), \
3408 ((st==yes_state||st==mark_state) ? "]" : "") \
3409 ); \
265c4333 3410 });
5bc10b2c 3411
40a82448 3412
3dab1dad 3413#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3414
3df15adc 3415#ifdef DEBUGGING
5bc10b2c 3416
ab3bbdeb 3417STATIC void
f2ed9b32 3418S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3419 const char *start, const char *end, const char *blurb)
3420{
efd26800 3421 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3422
3423 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3424
ab3bbdeb
YO
3425 if (!PL_colorset)
3426 reginitcolors();
3427 {
3428 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3429 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3430
f2ed9b32 3431 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3432 start, end - start, 60);
3433
3434 PerlIO_printf(Perl_debug_log,
3435 "%s%s REx%s %s against %s\n",
3436 PL_colors[4], blurb, PL_colors[5], s0, s1);
3437
f2ed9b32 3438 if (utf8_target||utf8_pat)
1de06328
YO
3439 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3440 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3441 utf8_pat && utf8_target ? " and " : "",
3442 utf8_target ? "string" : ""
ab3bbdeb
YO
3443 );
3444 }
3445}
3df15adc
YO
3446
3447STATIC void
786e8c11
YO
3448S_dump_exec_pos(pTHX_ const char *locinput,
3449 const regnode *scan,
3450 const char *loc_regeol,
3451 const char *loc_bostr,
3452 const char *loc_reg_starttry,
f2ed9b32 3453 const bool utf8_target)
07be1b83 3454{
786e8c11 3455 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3456 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3457 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3458 /* The part of the string before starttry has one color
3459 (pref0_len chars), between starttry and current
3460 position another one (pref_len - pref0_len chars),
3461 after the current position the third one.
3462 We assume that pref0_len <= pref_len, otherwise we
3463 decrease pref0_len. */
786e8c11
YO
3464 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3465 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3466 int pref0_len;
3467
7918f24d
NC
3468 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3469
f2ed9b32 3470 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3471 pref_len++;
786e8c11
YO
3472 pref0_len = pref_len - (locinput - loc_reg_starttry);
3473 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3474 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3475 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3476 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3477 l--;
3478 if (pref0_len < 0)
3479 pref0_len = 0;
3480 if (pref0_len > pref_len)
3481 pref0_len = pref_len;
3482 {
f2ed9b32 3483 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3484
ab3bbdeb 3485 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3486 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3487
ab3bbdeb 3488 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3489 (locinput - pref_len + pref0_len),
1de06328 3490 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3491
ab3bbdeb 3492 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3493 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3494
1de06328 3495 const STRLEN tlen=len0+len1+len2;
3df15adc 3496 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3497 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3498 (IV)(locinput - loc_bostr),
07be1b83 3499 len0, s0,
07be1b83 3500 len1, s1,
07be1b83 3501 (docolor ? "" : "> <"),
07be1b83 3502 len2, s2,
f9f4320a 3503 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3504 "");
3505 }
3506}
3df15adc 3507
07be1b83
YO
3508#endif
3509
0a4db386
YO
3510/* reg_check_named_buff_matched()
3511 * Checks to see if a named buffer has matched. The data array of
3512 * buffer numbers corresponding to the buffer is expected to reside
3513 * in the regexp->data->data array in the slot stored in the ARG() of
3514 * node involved. Note that this routine doesn't actually care about the
3515 * name, that information is not preserved from compilation to execution.
3516 * Returns the index of the leftmost defined buffer with the given name
3517 * or 0 if non of the buffers matched.
3518 */
3519STATIC I32
7918f24d
NC
3520S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3521{
0a4db386 3522 I32 n;
f8fc2ecf 3523 RXi_GET_DECL(rex,rexi);
ad64d0ec 3524 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3525 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3526
3527 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3528
0a4db386 3529 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3530 if ((I32)rex->lastparen >= nums[n] &&
3531 rex->offs[nums[n]].end != -1)
0a4db386
YO
3532 {
3533 return nums[n];
3534 }
3535 }
3536 return 0;
3537}
3538
2f554ef7 3539
c74f6de9 3540static bool
984e6dd1 3541S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3542 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3543{
79a2a0e8
KW
3544 /* This function determines if there are one or two characters that match