This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Eliminate unused BACK regnode
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 121 : (U8*)(pos + off))
557f47af 122
3dab1dad 123#define HOPBACKc(pos, off) \
ba44c216 124 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
126 : (pos - off >= reginfo->strbeg) \
8e11feef 127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
ba44c216 130#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 131#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 132
557f47af
DM
133/* lim must be +ve. Returns NULL on overshoot */
134#define HOPMAYBE3(pos,off,lim) \
135 (reginfo->is_utf8_target \
136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
137 : ((U8*)pos + off <= lim) \
138 ? (U8*)pos + off \
139 : NULL)
140
8e9f2289
DM
141/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
142 * off must be >=0; args should be vars rather than expressions */
143#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
145 : (U8*)((pos + off) > lim ? lim : (pos + off)))
146
2974eaec
DM
147#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
149 : (U8*)(pos + off))
150#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
151
152#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
153#define NEXTCHR_IS_EOS (nextchr < 0)
154
155#define SET_nextchr \
220db18a 156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
157
158#define SET_locinput(p) \
159 locinput = (p); \
160 SET_nextchr
161
162
2a16ac92 163#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
164 if (!swash_ptr) { \
165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 167 1, 0, invlist, &flags); \
c7304fe2
KW
168 assert(swash_ptr); \
169 } \
170 } STMT_END
171
172/* If in debug mode, we test that a known character properly matches */
173#ifdef DEBUGGING
174# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
175 property_name, \
2a16ac92 176 invlist, \
c7304fe2 177 utf8_char_in_property) \
2a16ac92 178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
180#else
181# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
182 property_name, \
2a16ac92 183 invlist, \
c7304fe2 184 utf8_char_in_property) \
2a16ac92 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 186#endif
d1eb3177 187
c7304fe2
KW
188#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
190 "", \
191 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
193
194#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
195 STMT_START { \
196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
197 "_X_regular_begin", \
2a16ac92 198 NULL, \
df38da56 199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
201 "_X_extend", \
2a16ac92 202 NULL, \
c7304fe2
KW
203 COMBINING_GRAVE_ACCENT_UTF8); \
204 } STMT_END
d1eb3177 205
c7304fe2 206#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
207/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
208
5f80c4cf 209/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
210/* it would be nice to rework regcomp.sym to generate this stuff. sigh
211 *
212 * NOTE that *nothing* that affects backtracking should be in here, specifically
213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
214 * node that is in between two EXACT like nodes when ascertaining what the required
215 * "follow" character is. This should probably be moved to regex compile time
216 * although it may be done at run time beause of the REF possibility - more
217 * investigation required. -- demerphq
218*/
baa60164
KW
219#define JUMPABLE(rn) ( \
220 OP(rn) == OPEN || \
3e901dc0 221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
222 OP(rn) == EVAL || \
223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
224 OP(rn) == PLUS || OP(rn) == MINMOD || \
225 OP(rn) == KEEPS || \
226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 227)
ee9b8eae 228#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 229
ee9b8eae
YO
230#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
231
232#if 0
233/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
234 we don't need this definition. */
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 236#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
237#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
238
239#else
240/* ... so we use this as its faster. */
241#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 242#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
243#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
244#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245
246#endif
e2d8ce26 247
a84d97b6
HS
248/*
249 Search for mandatory following text node; for lookahead, the text must
250 follow but for lookbehind (rn->flags != 0) we skip to the next step.
251*/
baa60164 252#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
253 while (JUMPABLE(rn)) { \
254 const OPCODE type = OP(rn); \
255 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 256 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 257 else if (type == PLUS) \
cca55fe3 258 rn = NEXTOPER(rn); \
3dab1dad 259 else if (type == IFMATCH) \
a84d97b6 260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 261 else rn += NEXT_OFF(rn); \
3dab1dad 262 } \
5f80c4cf 263} STMT_END
74750237 264
22913b96
KW
265/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
266 * These are for the pre-composed Hangul syllables, which are all in a
267 * contiguous block and arranged there in such a way so as to facilitate
268 * alorithmic determination of their characteristics. As such, they don't need
269 * a swash, but can be determined by simple arithmetic. Almost all are
270 * GCB=LVT, but every 28th one is a GCB=LV */
271#define SBASE 0xAC00 /* Start of block */
272#define SCount 11172 /* Length of block */
273#define TCount 28
c476f425 274
006f26b2
DM
275#define SLAB_FIRST(s) (&(s)->states[0])
276#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
277
a75351a1 278static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 279static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 280static regmatch_state * S_push_slab(pTHX);
51371543 281
87c0511b 282#define REGCP_PAREN_ELEMS 3
f067efbf 283#define REGCP_OTHER_ELEMS 3
e0fa7e2b 284#define REGCP_FRAME_ELEMS 1
620d5b66
NC
285/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
286 * are needed for the regexp context stack bookkeeping. */
287
76e3520e 288STATIC CHECKPOINT
92da3157 289S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 290{
a3b680e6 291 const int retval = PL_savestack_ix;
92da3157
DM
292 const int paren_elems_to_push =
293 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
294 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
295 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 296 I32 p;
40a82448 297 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 298
b93070ed
DM
299 PERL_ARGS_ASSERT_REGCPPUSH;
300
e49a9654 301 if (paren_elems_to_push < 0)
e8a85d26
JH
302 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
303 (int)paren_elems_to_push, (int)maxopenparen,
304 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 305
e0fa7e2b
NC
306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 308 " out of range (%lu-%ld)",
92da3157
DM
309 total_elems,
310 (unsigned long)maxopenparen,
311 (long)parenfloor);
e0fa7e2b 312
620d5b66 313 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 314
495f47a5 315 DEBUG_BUFFERS_r(
92da3157 316 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
317 PerlIO_printf(Perl_debug_log,
318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
319 PTR2UV(rex),
320 PTR2UV(rex->offs)
321 );
322 );
92da3157 323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 324/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
325 SSPUSHIV(rex->offs[p].end);
326 SSPUSHIV(rex->offs[p].start);
1ca2007e 327 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
330 (UV)p,
331 (IV)rex->offs[p].start,
332 (IV)rex->offs[p].start_tmp,
333 (IV)rex->offs[p].end
40a82448 334 ));
a0d0e21e 335 }
b1ce53c5 336/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 337 SSPUSHINT(maxopenparen);
b93070ed
DM
338 SSPUSHINT(rex->lastparen);
339 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 341
a0d0e21e
LW
342 return retval;
343}
344
c277df42 345/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
346#define REGCP_SET(cp) \
347 DEBUG_STATE_r( \
ab3bbdeb 348 PerlIO_printf(Perl_debug_log, \
e4f74956 349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
350 (IV)PL_savestack_ix)); \
351 cp = PL_savestack_ix
c3464db5 352
ab3bbdeb 353#define REGCP_UNWIND(cp) \
e4f74956 354 DEBUG_STATE_r( \
ab3bbdeb 355 if (cp != PL_savestack_ix) \
e4f74956
YO
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
358 (IV)(cp), (IV)PL_savestack_ix)); \
359 regcpblow(cp)
c277df42 360
a8d1f4b4
DM
361#define UNWIND_PAREN(lp, lcp) \
362 for (n = rex->lastparen; n > lp; n--) \
363 rex->offs[n].end = -1; \
364 rex->lastparen = n; \
365 rex->lastcloseparen = lcp;
366
367
f067efbf 368STATIC void
92da3157 369S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 370{
e0fa7e2b 371 UV i;
87c0511b 372 U32 paren;
a3621e74
YO
373 GET_RE_DEBUG_FLAGS_DECL;
374
7918f24d
NC
375 PERL_ARGS_ASSERT_REGCPPOP;
376
b1ce53c5 377 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 378 i = SSPOPUV;
e0fa7e2b
NC
379 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
380 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
381 rex->lastcloseparen = SSPOPINT;
382 rex->lastparen = SSPOPINT;
92da3157 383 *maxopenparen_p = SSPOPINT;
b1ce53c5 384
620d5b66 385 i -= REGCP_OTHER_ELEMS;
b1ce53c5 386 /* Now restore the parentheses context. */
495f47a5
DM
387 DEBUG_BUFFERS_r(
388 if (i || rex->lastparen + 1 <= rex->nparens)
389 PerlIO_printf(Perl_debug_log,
390 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
391 PTR2UV(rex),
392 PTR2UV(rex->offs)
393 );
394 );
92da3157 395 paren = *maxopenparen_p;
620d5b66 396 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 397 SSize_t tmps;
1ca2007e 398 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
399 rex->offs[paren].start = SSPOPIV;
400 tmps = SSPOPIV;
b93070ed
DM
401 if (paren <= rex->lastparen)
402 rex->offs[paren].end = tmps;
495f47a5
DM
403 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
404 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
405 (UV)paren,
406 (IV)rex->offs[paren].start,
407 (IV)rex->offs[paren].start_tmp,
408 (IV)rex->offs[paren].end,
409 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 410 );
87c0511b 411 paren--;
a0d0e21e 412 }
daf18116 413#if 1
dafc8851
JH
414 /* It would seem that the similar code in regtry()
415 * already takes care of this, and in fact it is in
416 * a better location to since this code can #if 0-ed out
417 * but the code in regtry() is needed or otherwise tests
418 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
419 * (as of patchlevel 7877) will fail. Then again,
420 * this code seems to be necessary or otherwise
225593e1
DM
421 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
422 * --jhi updated by dapm */
b93070ed 423 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 424 if (i > *maxopenparen_p)
b93070ed
DM
425 rex->offs[i].start = -1;
426 rex->offs[i].end = -1;
495f47a5
DM
427 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
428 " \\%"UVuf": %s ..-1 undeffing\n",
429 (UV)i,
92da3157 430 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 431 ));
a0d0e21e 432 }
dafc8851 433#endif
a0d0e21e
LW
434}
435
74088413
DM
436/* restore the parens and associated vars at savestack position ix,
437 * but without popping the stack */
438
439STATIC void
92da3157 440S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
441{
442 I32 tmpix = PL_savestack_ix;
443 PL_savestack_ix = ix;
92da3157 444 regcppop(rex, maxopenparen_p);
74088413
DM
445 PL_savestack_ix = tmpix;
446}
447
02db2b7b 448#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 449
31c7f561
KW
450STATIC bool
451S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
452{
453 /* Returns a boolean as to whether or not 'character' is a member of the
454 * Posix character class given by 'classnum' that should be equivalent to a
455 * value in the typedef '_char_class_number'.
456 *
457 * Ideally this could be replaced by a just an array of function pointers
458 * to the C library functions that implement the macros this calls.
459 * However, to compile, the precise function signatures are required, and
460 * these may vary from platform to to platform. To avoid having to figure
461 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
462 * which adds an extra layer of function call overhead (unless the C
463 * optimizer strips it away). But we don't particularly care about
464 * performance with locales anyway. */
31c7f561
KW
465
466 switch ((_char_class_number) classnum) {
15861f94 467 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 468 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
469 case _CC_ENUM_ASCII: return isASCII_LC(character);
470 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
471 case _CC_ENUM_CASED: return isLOWER_LC(character)
472 || isUPPER_LC(character);
e8d596e0 473 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
474 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
475 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
476 case _CC_ENUM_LOWER: return isLOWER_LC(character);
477 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 478 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 479 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 480 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
481 case _CC_ENUM_UPPER: return isUPPER_LC(character);
482 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 483 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
484 default: /* VERTSPACE should never occur in locales */
485 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
486 }
487
a25b5927 488 assert(0); /* NOTREACHED */
31c7f561
KW
489 return FALSE;
490}
491
3018b823
KW
492STATIC bool
493S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
494{
495 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
496 * 'character' is a member of the Posix character class given by 'classnum'
497 * that should be equivalent to a value in the typedef
498 * '_char_class_number'.
499 *
500 * This just calls isFOO_lc on the code point for the character if it is in
501 * the range 0-255. Outside that range, all characters avoid Unicode
502 * rules, ignoring any locale. So use the Unicode function if this class
503 * requires a swash, and use the Unicode macro otherwise. */
504
505 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
506
507 if (UTF8_IS_INVARIANT(*character)) {
508 return isFOO_lc(classnum, *character);
509 }
510 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
511 return isFOO_lc(classnum,
94bb8c36 512 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
513 }
514
515 if (classnum < _FIRST_NON_SWASH_CC) {
516
517 /* Initialize the swash unless done already */
518 if (! PL_utf8_swash_ptrs[classnum]) {
519 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
520 PL_utf8_swash_ptrs[classnum] =
521 _core_swash_init("utf8",
522 "",
523 &PL_sv_undef, 1, 0,
524 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
525 }
526
92a2046b
KW
527 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
528 character,
529 TRUE /* is UTF */ ));
3018b823
KW
530 }
531
532 switch ((_char_class_number) classnum) {
533 case _CC_ENUM_SPACE:
534 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
535
536 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
537 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
538 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 539 default: break;
3018b823
KW
540 }
541
e1ee3960 542 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
543}
544
a687059c 545/*
e50aee73 546 * pregexec and friends
a687059c
LW
547 */
548
76234dfb 549#ifndef PERL_IN_XSUB_RE
a687059c 550/*
c277df42 551 - pregexec - match a regexp against a string
a687059c 552 */
c277df42 553I32
5aaab254 554Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 555 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
556/* stringarg: the point in the string at which to begin matching */
557/* strend: pointer to null at end of string */
558/* strbeg: real beginning of string */
559/* minend: end of match must be >= minend bytes after stringarg. */
560/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
561 * itself is accessed via the pointers above */
562/* nosave: For optimizations. */
c277df42 563{
7918f24d
NC
564 PERL_ARGS_ASSERT_PREGEXEC;
565
c277df42 566 return
9041c2e3 567 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
568 nosave ? 0 : REXEC_COPY_STR);
569}
76234dfb 570#endif
22e551b9 571
cad2e5aa 572
6eb5f6b9 573
1a4edc3c
DM
574/* re_intuit_start():
575 *
576 * Based on some optimiser hints, try to find the earliest position in the
577 * string where the regex could match.
578 *
579 * rx: the regex to match against
580 * sv: the SV being matched: only used for utf8 flag; the string
581 * itself is accessed via the pointers below. Note that on
582 * something like an overloaded SV, SvPOK(sv) may be false
583 * and the string pointers may point to something unrelated to
584 * the SV itself.
585 * strbeg: real beginning of string
586 * strpos: the point in the string at which to begin matching
587 * strend: pointer to the byte following the last char of the string
588 * flags currently unused; set to 0
589 * data: currently unused; set to NULL
590 *
591 * The basic idea of re_intuit_start() is to use some known information
592 * about the pattern, namely:
593 *
594 * a) the longest known anchored substring (i.e. one that's at a
595 * constant offset from the beginning of the pattern; but not
596 * necessarily at a fixed offset from the beginning of the
597 * string);
598 * b) the longest floating substring (i.e. one that's not at a constant
599 * offset from the beginning of the pattern);
600 * c) Whether the pattern is anchored to the string; either
601 * an absolute anchor: /^../, or anchored to \n: /^.../m,
602 * or anchored to pos(): /\G/;
603 * d) A start class: a real or synthetic character class which
604 * represents which characters are legal at the start of the pattern;
605 *
606 * to either quickly reject the match, or to find the earliest position
607 * within the string at which the pattern might match, thus avoiding
608 * running the full NFA engine at those earlier locations, only to
609 * eventually fail and retry further along.
610 *
611 * Returns NULL if the pattern can't match, or returns the address within
612 * the string which is the earliest place the match could occur.
613 *
614 * The longest of the anchored and floating substrings is called 'check'
615 * and is checked first. The other is called 'other' and is checked
616 * second. The 'other' substring may not be present. For example,
617 *
618 * /(abc|xyz)ABC\d{0,3}DEFG/
619 *
620 * will have
621 *
622 * check substr (float) = "DEFG", offset 6..9 chars
623 * other substr (anchored) = "ABC", offset 3..3 chars
624 * stclass = [ax]
625 *
626 * Be aware that during the course of this function, sometimes 'anchored'
627 * refers to a substring being anchored relative to the start of the
628 * pattern, and sometimes to the pattern itself being anchored relative to
629 * the string. For example:
630 *
631 * /\dabc/: "abc" is anchored to the pattern;
632 * /^\dabc/: "abc" is anchored to the pattern and the string;
633 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
634 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
635 * but the pattern is anchored to the string.
52a21eb3
DM
636 */
637
cad2e5aa 638char *
52a21eb3
DM
639Perl_re_intuit_start(pTHX_
640 REGEXP * const rx,
641 SV *sv,
642 const char * const strbeg,
643 char *strpos,
644 char *strend,
645 const U32 flags,
646 re_scream_pos_data *data)
cad2e5aa 647{
8d919b0a 648 struct regexp *const prog = ReANY(rx);
6b071d16 649 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 650 /* Should be nonnegative! */
ea3daa5d 651 SSize_t end_shift = 0;
0fc004dd
DM
652 /* current lowest pos in string where the regex can start matching */
653 char *rx_origin = strpos;
eb578fdb 654 SV *check;
f2ed9b32 655 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 656 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 657 bool ml_anch = 0;
8f4bf5fc 658 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 659 char *check_at = NULL; /* check substr found at this pos */
bbe252da 660 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 661 RXi_GET_DECL(prog,progi);
02d5137b
DM
662 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
663 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
664 GET_RE_DEBUG_FLAGS_DECL;
665
7918f24d 666 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
667 PERL_UNUSED_ARG(flags);
668 PERL_UNUSED_ARG(data);
7918f24d 669
1dc475d0
DM
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
671 "Intuit: trying to determine minimum start position...\n"));
672
fb9bbddb
DM
673 /* for now, assume that all substr offsets are positive. If at some point
674 * in the future someone wants to do clever things with look-behind and
675 * -ve offsets, they'll need to fix up any code in this function
676 * which uses these offsets. See the thread beginning
677 * <20140113145929.GF27210@iabyn.com>
678 */
679 assert(prog->substrs->data[0].min_offset >= 0);
680 assert(prog->substrs->data[0].max_offset >= 0);
681 assert(prog->substrs->data[1].min_offset >= 0);
682 assert(prog->substrs->data[1].max_offset >= 0);
683 assert(prog->substrs->data[2].min_offset >= 0);
684 assert(prog->substrs->data[2].max_offset >= 0);
685
f7022b5a 686 /* for now, assume that if both present, that the floating substring
83f2232d 687 * doesn't start before the anchored substring.
f7022b5a
DM
688 * If you break this assumption (e.g. doing better optimisations
689 * with lookahead/behind), then you'll need to audit the code in this
690 * function carefully first
691 */
692 assert(
693 ! ( (prog->anchored_utf8 || prog->anchored_substr)
694 && (prog->float_utf8 || prog->float_substr))
695 || (prog->float_min_offset >= prog->anchored_offset));
696
1a4edc3c
DM
697 /* byte rather than char calculation for efficiency. It fails
698 * to quickly reject some cases that can't match, but will reject
699 * them later after doing full char arithmetic */
c344f387 700 if (prog->minlen > strend - strpos) {
a3621e74 701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 702 " String too short...\n"));
cad2e5aa 703 goto fail;
2c2d71f5 704 }
d8da0584 705
6c3fea77 706 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 707 reginfo->info_aux = NULL;
9d9163fb 708 reginfo->strbeg = strbeg;
220db18a 709 reginfo->strend = strend;
aed7b151 710 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 711 reginfo->intuit = 1;
1cb48e53
DM
712 /* not actually used within intuit, but zero for safety anyway */
713 reginfo->poscache_maxiter = 0;
02d5137b 714
f2ed9b32 715 if (utf8_target) {
33b8afdf
JH
716 if (!prog->check_utf8 && prog->check_substr)
717 to_utf8_substr(prog);
718 check = prog->check_utf8;
719 } else {
7e0d5ad7
KW
720 if (!prog->check_substr && prog->check_utf8) {
721 if (! to_byte_substr(prog)) {
6b54ddc5 722 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
723 }
724 }
33b8afdf
JH
725 check = prog->check_substr;
726 }
274cd312 727
1dc475d0
DM
728 /* dump the various substring data */
729 DEBUG_OPTIMISE_MORE_r({
730 int i;
731 for (i=0; i<=2; i++) {
732 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
733 : prog->substrs->data[i].substr);
734 if (!sv)
735 continue;
736
737 PerlIO_printf(Perl_debug_log,
738 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
739 " useful=%"IVdf" utf8=%d [%s]\n",
740 i,
741 (IV)prog->substrs->data[i].min_offset,
742 (IV)prog->substrs->data[i].max_offset,
743 (IV)prog->substrs->data[i].end_shift,
744 BmUSEFUL(sv),
745 utf8_target ? 1 : 0,
746 SvPEEK(sv));
747 }
748 });
749
8e1490ee 750 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
751
752 /* ml_anch: check after \n?
753 *
754 * A note about IMPLICIT: on an un-anchored pattern beginning
755 * with /.*.../, these flags will have been added by the
756 * compiler:
757 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
758 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
759 */
7d2d37f5
DM
760 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
761 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 762
343c8a29 763 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
764 /* we are only allowed to match at BOS or \G */
765
57fcbfa7 766 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
767 *
768 * Note that we don't try to do a similar quick reject for
769 * \G, since generally the caller will have calculated strpos
770 * based on pos() and gofs, so the string is already correctly
771 * anchored by definition; and handling the exceptions would
772 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 773 */
7bb3b9eb 774 if ( strpos != strbeg
d3d47aac 775 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8
DM
776 {
777 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 778 " Not at start...\n"));
c889ccc8
DM
779 goto fail;
780 }
781
a5d12a4b
DM
782 /* in the presence of an anchor, the anchored (relative to the
783 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
784 * to strpos. So quickly reject if substr isn't found there.
785 * This works for \G too, because the caller will already have
786 * subtracted gofs from pos, and gofs is the offset from the
787 * \G to the start of the regex. For example, in /.abc\Gdef/,
788 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
789 * caller will have set strpos=pos()-4; we look for the substr
790 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 791
c889ccc8 792 if (prog->check_offset_min == prog->check_offset_max
d0d44648 793 && !(prog->intflags & PREGf_CANY_SEEN))
c889ccc8
DM
794 {
795 /* Substring at constant offset from beg-of-str... */
d307bf57 796 SSize_t slen = SvCUR(check);
343c8a29 797 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 798
1dc475d0
DM
799 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
800 " Looking for check substr at fixed offset %"IVdf"...\n",
801 (IV)prog->check_offset_min));
802
7742aa66
DM
803 if (SvTAIL(check)) {
804 /* In this case, the regex is anchored at the end too.
805 * Unless it's a multiline match, the lengths must match
806 * exactly, give or take a \n. NB: slen >= 1 since
807 * the last char of check is \n */
808 if (!multiline
809 && ( strend - s > slen
810 || strend - s < slen - 1
811 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
812 {
813 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 814 " String too long...\n"));
c889ccc8
DM
815 goto fail_finish;
816 }
817 /* Now should match s[0..slen-2] */
818 slen--;
c889ccc8 819 }
d307bf57
DM
820 if (slen && (*SvPVX_const(check) != *s
821 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
822 {
823 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 824 " String not equal...\n"));
d307bf57
DM
825 goto fail_finish;
826 }
c889ccc8
DM
827
828 check_at = s;
829 goto success_at_start;
cad2e5aa 830 }
cad2e5aa 831 }
cad2e5aa 832 }
0fc004dd 833
c0e0ec46 834 end_shift = prog->check_end_shift;
cad2e5aa 835
19188028 836#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 837 if (end_shift < 0)
1de06328 838 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 839 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
840#endif
841
2c2d71f5 842 restart:
1de06328 843
66b7ec5c
DM
844 /* This is the (re)entry point of the main loop in this function.
845 * The goal of this loop is to:
846 * 1) find the "check" substring in the region rx_origin..strend
847 * (adjusted by start_shift / end_shift). If not found, reject
848 * immediately.
849 * 2) If it exists, look for the "other" substr too if defined; for
850 * example, if the check substr maps to the anchored substr, then
851 * check the floating substr, and vice-versa. If not found, go
852 * back to (1) with rx_origin suitably incremented.
853 * 3) If we find an rx_origin position that doesn't contradict
854 * either of the substrings, then check the possible additional
855 * constraints on rx_origin of /^.../m or a known start class.
856 * If these fail, then depending on which constraints fail, jump
857 * back to here, or to various other re-entry points further along
858 * that skip some of the first steps.
859 * 4) If we pass all those tests, update the BmUSEFUL() count on the
860 * substring. If the start position was determined to be at the
861 * beginning of the string - so, not rejected, but not optimised,
862 * since we have to run regmatch from position 0 - decrement the
863 * BmUSEFUL() count. Otherwise increment it.
864 */
865
1a4edc3c
DM
866
867 /* first, look for the 'check' substring */
868
1de06328 869 {
c33e64f0
FC
870 U8* start_point;
871 U8* end_point;
c889ccc8 872
c889ccc8 873 DEBUG_OPTIMISE_MORE_r({
1dc475d0 874 PerlIO_printf(Perl_debug_log,
ae5d4331 875 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 876 " Start shift: %"IVdf" End shift %"IVdf
4d281893 877 " Real end Shift: %"IVdf"\n",
197641b6 878 (IV)(rx_origin - strpos),
c889ccc8 879 (IV)prog->check_offset_min,
12fbc530
DM
880 (IV)start_shift,
881 (IV)end_shift,
c889ccc8
DM
882 (IV)prog->check_end_shift);
883 });
1de06328 884
0d331aaf 885 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 886 start_point= (U8*)(rx_origin + start_shift);
12fbc530 887 end_point= (U8*)(strend - end_shift);
557f47af
DM
888 if (start_point > end_point)
889 goto fail_finish;
1de06328 890 } else {
557f47af
DM
891 end_point = HOP3(strend, -end_shift, strbeg);
892 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
893 if (!start_point)
894 goto fail_finish;
1de06328 895 }
c889ccc8 896
557f47af 897
e0362b86 898 /* If the regex is absolutely anchored to either the start of the
d3d47aac 899 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
900 * check_offset_max represents an upper bound on the string where
901 * the substr could start. For the ANCH_GPOS case, we assume that
902 * the caller of intuit will have already set strpos to
903 * pos()-gofs, so in this case strpos + offset_max will still be
904 * an upper bound on the substr.
905 */
c19c836a
DM
906 if (!ml_anch
907 && prog->intflags & PREGf_ANCH
e0362b86 908 && prog->check_offset_max != SSize_t_MAX)
c19c836a 909 {
1a08ba3a 910 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
911 const char * const anchor =
912 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
913
914 /* do a bytes rather than chars comparison. It's conservative;
915 * so it skips doing the HOP if the result can't possibly end
916 * up earlier than the old value of end_point.
917 */
918 if ((char*)end_point - anchor > prog->check_offset_max) {
919 end_point = HOP3lim((U8*)anchor,
920 prog->check_offset_max,
921 end_point -len)
922 + len;
923 }
d6ef1678
DM
924 }
925
6bda09f9 926 DEBUG_OPTIMISE_MORE_r({
1dc475d0 927 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 928 (int)(end_point - start_point),
fc8cd66c 929 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
930 start_point);
931 });
932
ae5d4331 933 check_at = fbm_instr( start_point, end_point,
7fba1cd6 934 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 935
8fd34720
DM
936 /* Update the count-of-usability, remove useless subpatterns,
937 unshift s. */
938
939 DEBUG_EXECUTE_r({
940 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
941 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
942 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
943 (check_at ? "Found" : "Did not find"),
944 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
945 ? "anchored" : "floating"),
946 quoted,
947 RE_SV_TAIL(check),
948 (check_at ? " at offset " : "...\n") );
949 });
2c2d71f5 950
8fd34720
DM
951 if (!check_at)
952 goto fail_finish;
953 /* Finish the diagnostic message */
954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 955
8fd34720
DM
956 /* set rx_origin to the minimum position where the regex could start
957 * matching, given the constraint of the just-matched check substring.
958 * But don't set it lower than previously.
959 */
fdc003fd 960
8fd34720
DM
961 if (check_at - rx_origin > prog->check_offset_max)
962 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
963 }
fdc003fd
DM
964
965
1a4edc3c 966 /* now look for the 'other' substring if defined */
2c2d71f5 967
6480a6c4
DM
968 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
969 : prog->substrs->data[other_ix].substr)
1de06328 970 {
30944b6d 971 /* Take into account the "other" substring. */
6c3343a6
DM
972 char *last, *last1;
973 char *s;
974 SV* must;
975 struct reg_substr_datum *other;
976
977 do_other_substr:
978 other = &prog->substrs->data[other_ix];
979
980 /* if "other" is anchored:
981 * we've previously found a floating substr starting at check_at.
982 * This means that the regex origin must lie somewhere
983 * between min (rx_origin): HOP3(check_at, -check_offset_max)
984 * and max: HOP3(check_at, -check_offset_min)
985 * (except that min will be >= strpos)
986 * So the fixed substr must lie somewhere between
987 * HOP3(min, anchored_offset)
988 * HOP3(max, anchored_offset) + SvCUR(substr)
989 */
990
991 /* if "other" is floating
992 * Calculate last1, the absolute latest point where the
993 * floating substr could start in the string, ignoring any
994 * constraints from the earlier fixed match. It is calculated
995 * as follows:
996 *
997 * strend - prog->minlen (in chars) is the absolute latest
998 * position within the string where the origin of the regex
999 * could appear. The latest start point for the floating
1000 * substr is float_min_offset(*) on from the start of the
1001 * regex. last1 simply combines thee two offsets.
1002 *
1003 * (*) You might think the latest start point should be
1004 * float_max_offset from the regex origin, and technically
1005 * you'd be correct. However, consider
1006 * /a\d{2,4}bcd\w/
1007 * Here, float min, max are 3,5 and minlen is 7.
1008 * This can match either
1009 * /a\d\dbcd\w/
1010 * /a\d\d\dbcd\w/
1011 * /a\d\d\d\dbcd\w/
1012 * In the first case, the regex matches minlen chars; in the
1013 * second, minlen+1, in the third, minlen+2.
1014 * In the first case, the floating offset is 3 (which equals
1015 * float_min), in the second, 4, and in the third, 5 (which
1016 * equals float_max). In all cases, the floating string bcd
1017 * can never start more than 4 chars from the end of the
1018 * string, which equals minlen - float_min. As the substring
1019 * starts to match more than float_min from the start of the
1020 * regex, it makes the regex match more than minlen chars,
1021 * and the two cancel each other out. So we can always use
1022 * float_min - minlen, rather than float_max - minlen for the
1023 * latest position in the string.
1024 *
1025 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1026 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1027 */
1028
e7a14a9c 1029 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1030 last1 = HOP3c(strend,
1031 other->min_offset - prog->minlen, strbeg);
1032
4d006249 1033 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1034 /* last is the latest point where the floating substr could
1035 * start, *given* any constraints from the earlier fixed
1036 * match. This constraint is that the floating string starts
1037 * <= float_max_offset chars from the regex origin (rx_origin).
1038 * If this value is less than last1, use it instead.
eb3831ce 1039 */
6c3343a6
DM
1040 assert(rx_origin <= last1);
1041 last =
1042 /* this condition handles the offset==infinity case, and
1043 * is a short-cut otherwise. Although it's comparing a
1044 * byte offset to a char length, it does so in a safe way,
1045 * since 1 char always occupies 1 or more bytes,
1046 * so if a string range is (last1 - rx_origin) bytes,
1047 * it will be less than or equal to (last1 - rx_origin)
1048 * chars; meaning it errs towards doing the accurate HOP3
1049 * rather than just using last1 as a short-cut */
1050 (last1 - rx_origin) < other->max_offset
1051 ? last1
1052 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1053 }
1054 else {
1055 assert(strpos + start_shift <= check_at);
1056 last = HOP4c(check_at, other->min_offset - start_shift,
1057 strbeg, strend);
1058 }
ead917d0 1059
6c3343a6
DM
1060 s = HOP3c(rx_origin, other->min_offset, strend);
1061 if (s < other_last) /* These positions already checked */
1062 s = other_last;
1063
1064 must = utf8_target ? other->utf8_substr : other->substr;
1065 assert(SvPOK(must));
1066 s = fbm_instr(
1067 (unsigned char*)s,
1068 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1069 must,
1070 multiline ? FBMrf_MULTILINE : 0
1071 );
1072 DEBUG_EXECUTE_r({
1073 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1076 s ? "Found" : "Contradicts",
1077 other_ix ? "floating" : "anchored",
1078 quoted, RE_SV_TAIL(must));
1079 });
ead917d0 1080
ead917d0 1081
6c3343a6
DM
1082 if (!s) {
1083 /* last1 is latest possible substr location. If we didn't
1084 * find it before there, we never will */
1085 if (last >= last1) {
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1087 ", giving up...\n"));
1088 goto fail_finish;
ead917d0
DM
1089 }
1090
6c3343a6
DM
1091 /* try to find the check substr again at a later
1092 * position. Maybe next time we'll find the "other" substr
1093 * in range too */
1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095 ", trying %s at offset %ld...\n",
1096 (other_ix ? "floating" : "anchored"),
197641b6 1097 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1098
1099 other_last = HOP3c(last, 1, strend) /* highest failure */;
1100 rx_origin =
4d006249 1101 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1102 ? HOP3c(rx_origin, 1, strend)
1103 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1104 goto restart;
1105 }
1106 else {
1107 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1108 (long)(s - strpos)));
ead917d0 1109
4d006249 1110 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1111 /* other_last is set to s, not s+1, since its possible for
1112 * a floating substr to fail first time, then succeed
1113 * second time at the same floating position; e.g.:
1114 * "-AB--AABZ" =~ /\wAB\d*Z/
1115 * The first time round, anchored and float match at
1116 * "-(AB)--AAB(Z)" then fail on the initial \w character
1117 * class. Second time round, they match at "-AB--A(AB)(Z)".
1118 */
1119 other_last = s;
ead917d0
DM
1120 }
1121 else {
6c3343a6
DM
1122 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1123 other_last = HOP3c(s, 1, strend);
ead917d0 1124 }
6c3343a6 1125 }
cad2e5aa 1126 }
acba93e8
DM
1127 else {
1128 DEBUG_OPTIMISE_MORE_r(
1129 PerlIO_printf(Perl_debug_log,
1130 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1131 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1132 " strend-strpos:%"IVdf"\n",
1133 (IV)prog->check_offset_min,
1134 (IV)prog->check_offset_max,
1c1c599d 1135 (IV)(check_at-strpos),
acba93e8 1136 (IV)(rx_origin-strpos),
1c1c599d 1137 (IV)(rx_origin-check_at),
acba93e8
DM
1138 (IV)(strend-strpos)
1139 )
1140 );
1141 }
2c2d71f5 1142
acba93e8 1143 postprocess_substr_matches:
0991020e 1144
1a4edc3c 1145 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1146
7d2d37f5 1147 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1148 char *s;
1149
a62659bd
DM
1150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1151 " looking for /^/m anchor"));
d0880ea7
DM
1152
1153 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1154 * Find the next \n, if any, even if it's beyond the current
1155 * anchored and/or floating substrings. Whether we should be
1156 * scanning ahead for the next \n or the next substr is debatable.
1157 * On the one hand you'd expect rare substrings to appear less
1158 * often than \n's. On the other hand, searching for \n means
1159 * we're effectively flipping been check_substr and "\n" on each
1160 * iteration as the current "rarest" string candidate, which
1161 * means for example that we'll quickly reject the whole string if
1162 * hasn't got a \n, rather than trying every substr position
1163 * first
1164 */
d0880ea7 1165
4620cb61
DM
1166 s = HOP3c(strend, - prog->minlen, strpos);
1167 if (s <= rx_origin ||
1168 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1169 {
d0880ea7
DM
1170 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1171 " Did not find /%s^%s/m...\n",
1172 PL_colors[0], PL_colors[1]));
a62659bd
DM
1173 goto fail_finish;
1174 }
d0880ea7 1175
4ada1233
DM
1176 /* earliest possible origin is 1 char after the \n.
1177 * (since *rx_origin == '\n', it's safe to ++ here rather than
1178 * HOP(rx_origin, 1)) */
1179 rx_origin++;
d0880ea7 1180
f4f115de 1181 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1182 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1183 {
d0880ea7
DM
1184 /* Position contradicts check-string; either because
1185 * check was anchored (and thus has no wiggle room),
4ada1233 1186 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1187 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1188 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1189 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1190 goto restart;
1191 }
1192
1193 /* if we get here, the check substr must have been float,
2e759faa 1194 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1195 * "other" substr which still contradicts */
1196 assert(prog->substrs->check_ix); /* check is float */
1197
1198 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1199 /* whoops, the anchored "other" substr exists, so we still
1200 * contradict. On the other hand, the float "check" substr
1201 * didn't contradict, so just retry the anchored "other"
1202 * substr */
1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1204 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1205 PL_colors[0], PL_colors[1],
1206 (long)(rx_origin - strpos),
1207 (long)(rx_origin - strpos + prog->anchored_offset)));
1208 goto do_other_substr;
1209 }
1210
1211 /* success: we don't contradict the found floating substring
1212 * (and there's no anchored substr). */
d0880ea7
DM
1213 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1214 " Found /%s^%s/m at offset %ld...\n",
1215 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1216 }
1217 else {
2e759faa 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1219 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1220 }
1221
ffad1e6a 1222 success_at_start:
e3c6feb0 1223
cad2e5aa 1224
dd170ff5
DM
1225 /* if we have a starting character class, then test that extra constraint.
1226 * (trie stclasses are too expensive to use here, we are better off to
1227 * leave it to regmatch itself) */
1228
f8fc2ecf 1229 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1230 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1231
2c75e362 1232 /* XXX this value could be pre-computed */
f8fc2ecf 1233 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1234 ? (reginfo->is_utf8_pat
1235 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1236 : STR_LEN(progi->regstclass))
66e933ab 1237 : 1);
1de06328 1238 char * endpos;
fa3bb21d 1239 char *s;
000dfd2d
DM
1240 /* latest pos that a matching float substr constrains rx start to */
1241 char *rx_max_float = NULL;
1242
c75a3985
DM
1243 /* if the current rx_origin is anchored, either by satisfying an
1244 * anchored substring constraint, or a /^.../m constraint, then we
1245 * can reject the current origin if the start class isn't found
1246 * at the current position. If we have a float-only match, then
1247 * rx_origin is constrained to a range; so look for the start class
1248 * in that range. if neither, then look for the start class in the
1249 * whole rest of the string */
1250
dd170ff5
DM
1251 /* XXX DAPM it's not clear what the minlen test is for, and why
1252 * it's not used in the floating case. Nothing in the test suite
1253 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1254 * Here are some old comments, which may or may not be correct:
1255 *
1256 * minlen == 0 is possible if regstclass is \b or \B,
1257 * and the fixed substr is ''$.
1258 * Since minlen is already taken into account, rx_origin+1 is
1259 * before strend; accidentally, minlen >= 1 guaranties no false
1260 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1261 * 0) below assumes that regstclass does not come from lookahead...
1262 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1263 * This leaves EXACTF-ish only, which are dealt with in
1264 * find_byclass().
1265 */
1266
7d2d37f5 1267 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1268 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1269 else if (prog->float_substr || prog->float_utf8) {
1270 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1271 endpos= HOP3c(rx_max_float, cl_l, strend);
1272 }
1de06328
YO
1273 else
1274 endpos= strend;
1275
1dc475d0
DM
1276 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1277 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1278 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1279 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1280 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1281
c43b5520 1282 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1283 reginfo);
be778b1a 1284 if (!s) {
6eb5f6b9 1285 if (endpos == strend) {
a3621e74 1286 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1287 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1288 goto fail;
1289 }
a3621e74 1290 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1291 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1292 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1293 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1294 goto fail;
9fed8d02 1295
6eb5f6b9 1296 /* Contradict one of substrings */
97136c8a
DM
1297 if (prog->anchored_substr || prog->anchored_utf8) {
1298 if (prog->substrs->check_ix == 1) { /* check is float */
1299 /* Have both, check_string is floating */
1300 assert(rx_origin + start_shift <= check_at);
1301 if (rx_origin + start_shift != check_at) {
1302 /* not at latest position float substr could match:
c75a3985
DM
1303 * Recheck anchored substring, but not floating.
1304 * The condition above is in bytes rather than
1305 * chars for efficiency. It's conservative, in
1306 * that it errs on the side of doing 'goto
1307 * do_other_substr', where a more accurate
1308 * char-based calculation will be done */
97136c8a
DM
1309 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1310 " Looking for anchored substr starting at offset %ld...\n",
1311 (long)(other_last - strpos)) );
1312 goto do_other_substr;
3369914b 1313 }
3369914b
DM
1314 }
1315 }
97136c8a 1316 else {
9fed8d02
DM
1317 /* float-only */
1318
7d2d37f5 1319 if (ml_anch) {
c75a3985
DM
1320 /* In the presence of ml_anch, we might be able to
1321 * find another \n without breaking the current float
1322 * constraint. */
1323
1324 /* strictly speaking this should be HOP3c(..., 1, ...),
1325 * but since we goto a block of code that's going to
1326 * search for the next \n if any, its safe here */
9fed8d02 1327 rx_origin++;
9fed8d02
DM
1328 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1329 " Looking for /%s^%s/m starting at offset %ld...\n",
1330 PL_colors[0], PL_colors[1],
1331 (long)(rx_origin - strpos)) );
9fed8d02 1332 goto postprocess_substr_matches;
ab60c45a 1333 }
c75a3985
DM
1334
1335 /* strictly speaking this can never be true; but might
1336 * be if we ever allow intuit without substrings */
1337 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1338 goto fail;
c75a3985 1339
000dfd2d 1340 rx_origin = rx_max_float;
9fed8d02
DM
1341 }
1342
c75a3985
DM
1343 /* at this point, any matching substrings have been
1344 * contradicted. Start again... */
1345
9fed8d02 1346 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1347
1348 /* uses bytes rather than char calculations for efficiency.
1349 * It's conservative: it errs on the side of doing 'goto restart',
1350 * where there is code that does a proper char-based test */
9fed8d02 1351 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1352 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1353 " Could not match STCLASS...\n") );
1354 goto fail;
1355 }
1356 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1357 " Looking for %s substr starting at offset %ld...\n",
1358 (prog->substrs->check_ix ? "floating" : "anchored"),
1359 (long)(rx_origin + start_shift - strpos)) );
1360 goto restart;
6eb5f6b9 1361 }
9fed8d02 1362
c75a3985
DM
1363 /* Success !!! */
1364
5f9c6575 1365 if (rx_origin != s) {
a3621e74 1366 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1367 " By STCLASS: moving %ld --> %ld\n",
5f9c6575 1368 (long)(rx_origin - strpos), (long)(s - strpos))
b7953727
JH
1369 );
1370 }
1371 else {
a3621e74 1372 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1373 " Does not contradict STCLASS...\n");
b7953727
JH
1374 );
1375 }
6eb5f6b9 1376 }
ffad1e6a
DM
1377
1378 /* Decide whether using the substrings helped */
1379
1380 if (rx_origin != strpos) {
1381 /* Fixed substring is found far enough so that the match
1382 cannot start at strpos. */
1383
1384 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1385 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1386 }
1387 else {
70563e16
DM
1388 /* The found rx_origin position does not prohibit matching at
1389 * strpos, so calling intuit didn't gain us anything. Decrement
1390 * the BmUSEFUL() count on the check substring, and if we reach
1391 * zero, free it. */
1392 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1393 && (utf8_target ? (
1394 prog->check_utf8 /* Could be deleted already */
1395 && --BmUSEFUL(prog->check_utf8) < 0
1396 && (prog->check_utf8 == prog->float_utf8)
1397 ) : (
1398 prog->check_substr /* Could be deleted already */
1399 && --BmUSEFUL(prog->check_substr) < 0
1400 && (prog->check_substr == prog->float_substr)
1401 )))
1402 {
1403 /* If flags & SOMETHING - do not do it many times on the same match */
1404 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1405 /* XXX Does the destruction order has to change with utf8_target? */
1406 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1407 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1408 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1409 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1410 check = NULL; /* abort */
ffad1e6a
DM
1411 /* XXXX This is a remnant of the old implementation. It
1412 looks wasteful, since now INTUIT can use many
1413 other heuristics. */
1414 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1415 }
1416 }
1417
1418 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1419 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1420 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1421
c765d6e0 1422 return rx_origin;
2c2d71f5
JH
1423
1424 fail_finish: /* Substring not found */
33b8afdf 1425 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1426 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1427 fail:
a3621e74 1428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1429 PL_colors[4], PL_colors[5]));
bd61b366 1430 return NULL;
cad2e5aa 1431}
9661b544 1432
70563e16 1433
a0a388a1 1434#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1435 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1436 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1437 trie_type = ((scan->flags == EXACT) \
1438 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1439 : (scan->flags == EXACTFA) \
1440 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1441 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1442
fd3249ee 1443#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1444STMT_START { \
fab2782b 1445 STRLEN skiplen; \
baa60164 1446 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1447 switch (trie_type) { \
31f05a37 1448 case trie_utf8_exactfa_fold: \
baa60164 1449 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1450 /* FALLTHROUGH */ \
fab2782b
YO
1451 case trie_utf8_fold: \
1452 if ( foldlen>0 ) { \
c80e42f3 1453 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1454 foldlen -= len; \
1455 uscan += len; \
1456 len=0; \
1457 } else { \
445bf929 1458 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1459 len = UTF8SKIP(uc); \
1460 skiplen = UNISKIP( uvc ); \
1461 foldlen -= skiplen; \
1462 uscan = foldbuf + skiplen; \
1463 } \
1464 break; \
baa60164
KW
1465 case trie_latin_utf8_exactfa_fold: \
1466 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1467 /* FALLTHROUGH */ \
fab2782b
YO
1468 case trie_latin_utf8_fold: \
1469 if ( foldlen>0 ) { \
c80e42f3 1470 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1471 foldlen -= len; \
1472 uscan += len; \
1473 len=0; \
1474 } else { \
1475 len = 1; \
31f05a37 1476 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1477 skiplen = UNISKIP( uvc ); \
1478 foldlen -= skiplen; \
1479 uscan = foldbuf + skiplen; \
1480 } \
1481 break; \
1482 case trie_utf8: \
c80e42f3 1483 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1484 break; \
1485 case trie_plain: \
1486 uvc = (UV)*uc; \
1487 len = 1; \
1488 } \
1489 if (uvc < 256) { \
1490 charid = trie->charmap[ uvc ]; \
1491 } \
1492 else { \
1493 charid = 0; \
1494 if (widecharmap) { \
1495 SV** const svpp = hv_fetch(widecharmap, \
1496 (char*)&uvc, sizeof(UV), 0); \
1497 if (svpp) \
1498 charid = (U16)SvIV(*svpp); \
1499 } \
1500 } \
4cadc6a9
YO
1501} STMT_END
1502
ae7c5b9b
KW
1503#define DUMP_EXEC_POS(li,s,doutf8) \
1504 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1505 startpos, doutf8)
1506
c84a03c5 1507#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1508STMT_START { \
1509 while (s <= e) { \
c84a03c5 1510 if ( (COND) \
fac1af77 1511 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1512 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1513 goto got_it; \
1514 s++; \
1515 } \
1516} STMT_END
1517
c84a03c5 1518#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1519STMT_START { \
9a902117 1520 while (s < strend) { \
c84a03c5 1521 CODE \
9a902117 1522 s += UTF8SKIP(s); \
4cadc6a9
YO
1523 } \
1524} STMT_END
1525
c84a03c5 1526#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1527STMT_START { \
1528 while (s < strend) { \
c84a03c5 1529 CODE \
4cadc6a9
YO
1530 s++; \
1531 } \
1532} STMT_END
1533
05bd126c
KW
1534#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1535REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1536 if (COND) { \
1537 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1538 goto got_it; \
1539 else \
1540 tmp = doevery; \
1541 } \
1542 else \
1543 tmp = 1; \
4cadc6a9
YO
1544)
1545
05bd126c
KW
1546#define REXEC_FBC_CLASS_SCAN(COND) \
1547REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1548 if (COND) { \
1549 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1550 goto got_it; \
1551 else \
1552 tmp = doevery; \
1553 } \
1554 else \
1555 tmp = 1; \
4cadc6a9
YO
1556)
1557
c84a03c5 1558#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1559 if (utf8_target) { \
c84a03c5 1560 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1561 } \
1562 else { \
c84a03c5 1563 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1564 }
05bd126c 1565
05bd126c
KW
1566/* The three macros below are slightly different versions of the same logic.
1567 *
1568 * The first is for /a and /aa when the target string is UTF-8. This can only
1569 * match ascii, but it must advance based on UTF-8. The other two handle the
1570 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1571 * for the boundary (or non-boundary) between a word and non-word character.
1572 * The utf8 and non-utf8 cases have the same logic, but the details must be
1573 * different. Find the "wordness" of the character just prior to this one, and
1574 * compare it with the wordness of this one. If they differ, we have a
1575 * boundary. At the beginning of the string, pretend that the previous
1576 * character was a new-line.
1577 *
1578 * All these macros uncleanly have side-effects with each other and outside
1579 * variables. So far it's been too much trouble to clean-up
1580 *
1581 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1582 * a word character or not.
1583 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1584 * word/non-word
1585 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1586 *
1587 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1588 * are looking for a boundary or for a non-boundary. If we are looking for a
1589 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1590 * see if this tentative match actually works, and if so, to quit the loop
1591 * here. And vice-versa if we are looking for a non-boundary.
1592 *
1593 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1594 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1595 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1596 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1597 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1598 * complement. But in that branch we complement tmp, meaning that at the
1599 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1600 * which means at the top of the loop in the next iteration, it is
1601 * TEST_NON_UTF8(s-1) */
b2f4e957 1602#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1603 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1604 tmp = TEST_NON_UTF8(tmp); \
1605 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1606 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1607 tmp = !tmp; \
1608 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1609 } \
1610 else { \
1611 IF_FAIL; \
1612 } \
1613 ); \
1614
1615/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1616 * TEST_UTF8 is a macro that for the same input code points returns identically
1617 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1618#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1619 if (s == reginfo->strbeg) { \
1620 tmp = '\n'; \
1621 } \
1622 else { /* Back-up to the start of the previous character */ \
1623 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1624 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1625 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1626 } \
1627 tmp = TEST_UV(tmp); \
1628 LOAD_UTF8_CHARCLASS_ALNUM(); \
1629 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1630 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1631 tmp = !tmp; \
1632 IF_SUCCESS; \
1633 } \
1634 else { \
1635 IF_FAIL; \
1636 } \
1637 );
cfaf538b 1638
05bd126c
KW
1639/* Like the above two macros. UTF8_CODE is the complete code for handling
1640 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1641 * macros below */
baa60164 1642#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1643 if (utf8_target) { \
05bd126c 1644 UTF8_CODE \
63ac0dad
KW
1645 } \
1646 else { /* Not utf8 */ \
9d9163fb 1647 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1648 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1649 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1650 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1651 IF_SUCCESS; \
760cfa8e 1652 tmp = !tmp; \
63ac0dad
KW
1653 } \
1654 else { \
1655 IF_FAIL; \
1656 } \
1657 ); \
1658 } \
c8519dc7
KW
1659 /* Here, things have been set up by the previous code so that tmp is the \
1660 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1661 * utf8ness of the target). We also have to check if this matches against \
1662 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1663 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1664 * string */ \
1665 if (tmp == ! TEST_NON_UTF8('\n')) { \
1666 IF_SUCCESS; \
1667 } \
1668 else { \
1669 IF_FAIL; \
1670 }
63ac0dad 1671
ae7c5b9b
KW
1672/* This is the macro to use when we want to see if something that looks like it
1673 * could match, actually does, and if so exits the loop */
1674#define REXEC_FBC_TRYIT \
1675 if ((reginfo->intuit || regtry(reginfo, &s))) \
1676 goto got_it
1677
1678/* The only difference between the BOUND and NBOUND cases is that
1679 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1680 * NBOUND. This is accomplished by passing it as either the if or else clause,
1681 * with the other one being empty (PLACEHOLDER is defined as empty).
1682 *
1683 * The TEST_FOO parameters are for operating on different forms of input, but
1684 * all should be ones that return identically for the same underlying code
1685 * points */
1686#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1687 FBC_BOUND_COMMON( \
1688 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1689 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1690
1691#define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1692 FBC_BOUND_COMMON( \
1693 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1694 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1695
1696#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1697 FBC_BOUND_COMMON( \
1698 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1699 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1700
1701#define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1702 FBC_BOUND_COMMON( \
1703 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1704 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1705
05bd126c 1706
786e8c11 1707/* We know what class REx starts with. Try to find this position... */
02d5137b 1708/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1709/* annoyingly all the vars in this routine have different names from their counterparts
1710 in regmatch. /grrr */
3c3eec57 1711STATIC char *
07be1b83 1712S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1713 const char *strend, regmatch_info *reginfo)
a687059c 1714{
73104a1b
KW
1715 dVAR;
1716 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1717 char *pat_string; /* The pattern's exactish string */
1718 char *pat_end; /* ptr to end char of pat_string */
1719 re_fold_t folder; /* Function for computing non-utf8 folds */
1720 const U8 *fold_array; /* array for folding ords < 256 */
1721 STRLEN ln;
1722 STRLEN lnc;
73104a1b
KW
1723 U8 c1;
1724 U8 c2;
1725 char *e;
1726 I32 tmp = 1; /* Scratch variable? */
ba44c216 1727 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1728 UV utf8_fold_flags = 0;
f9176b44 1729 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1730 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1731 with a result inverts that result, as 0^1 =
1732 1 and 1^1 = 0 */
1733 _char_class_number classnum;
1734
73104a1b 1735 RXi_GET_DECL(prog,progi);
2f7f8cb1 1736
73104a1b 1737 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1738
73104a1b
KW
1739 /* We know what class it must start with. */
1740 switch (OP(c)) {
1741 case ANYOF:
1742 if (utf8_target) {
1743 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1744 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1745 }
1746 else {
1747 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1748 }
1749 break;
1750 case CANY:
1751 REXEC_FBC_SCAN(
02d5137b 1752 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1753 goto got_it;
1754 else
1755 tmp = doevery;
1756 );
1757 break;
1758
098b07d5
KW
1759 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1760 assert(! is_utf8_pat);
924ba076 1761 /* FALLTHROUGH */
73104a1b 1762 case EXACTFA:
984e6dd1 1763 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1764 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1765 goto do_exactf_utf8;
1766 }
1767 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1768 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1769 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1770
2fdb7295
KW
1771 case EXACTF: /* This node only generated for non-utf8 patterns */
1772 assert(! is_utf8_pat);
73104a1b 1773 if (utf8_target) {
73104a1b
KW
1774 utf8_fold_flags = 0;
1775 goto do_exactf_utf8;
1776 }
1777 fold_array = PL_fold;
1778 folder = foldEQ;
1779 goto do_exactf_non_utf8;
1780
1781 case EXACTFL:
31f05a37 1782 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1783 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1784 goto do_exactf_utf8;
1785 }
1786 fold_array = PL_fold_locale;
1787 folder = foldEQ_locale;
1788 goto do_exactf_non_utf8;
3c760661 1789
73104a1b 1790 case EXACTFU_SS:
984e6dd1 1791 if (is_utf8_pat) {
73104a1b
KW
1792 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1793 }
1794 goto do_exactf_utf8;
16d951b7 1795
73104a1b 1796 case EXACTFU:
984e6dd1
DM
1797 if (is_utf8_pat || utf8_target) {
1798 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1799 goto do_exactf_utf8;
1800 }
fac1af77 1801
73104a1b
KW
1802 /* Any 'ss' in the pattern should have been replaced by regcomp,
1803 * so we don't have to worry here about this single special case
1804 * in the Latin1 range */
1805 fold_array = PL_fold_latin1;
1806 folder = foldEQ_latin1;
1807
924ba076 1808 /* FALLTHROUGH */
73104a1b
KW
1809
1810 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1811 are no glitches with fold-length differences
1812 between the target string and pattern */
1813
1814 /* The idea in the non-utf8 EXACTF* cases is to first find the
1815 * first character of the EXACTF* node and then, if necessary,
1816 * case-insensitively compare the full text of the node. c1 is the
1817 * first character. c2 is its fold. This logic will not work for
1818 * Unicode semantics and the german sharp ss, which hence should
1819 * not be compiled into a node that gets here. */
1820 pat_string = STRING(c);
1821 ln = STR_LEN(c); /* length to match in octets/bytes */
1822
1823 /* We know that we have to match at least 'ln' bytes (which is the
1824 * same as characters, since not utf8). If we have to match 3
1825 * characters, and there are only 2 availabe, we know without
1826 * trying that it will fail; so don't start a match past the
1827 * required minimum number from the far end */
ea3daa5d 1828 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1829
02d5137b 1830 if (reginfo->intuit && e < s) {
73104a1b
KW
1831 e = s; /* Due to minlen logic of intuit() */
1832 }
fac1af77 1833
73104a1b
KW
1834 c1 = *pat_string;
1835 c2 = fold_array[c1];
1836 if (c1 == c2) { /* If char and fold are the same */
1837 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1838 }
1839 else {
1840 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1841 }
1842 break;
fac1af77 1843
73104a1b
KW
1844 do_exactf_utf8:
1845 {
1846 unsigned expansion;
1847
1848 /* If one of the operands is in utf8, we can't use the simpler folding
1849 * above, due to the fact that many different characters can have the
1850 * same fold, or portion of a fold, or different- length fold */
1851 pat_string = STRING(c);
1852 ln = STR_LEN(c); /* length to match in octets/bytes */
1853 pat_end = pat_string + ln;
984e6dd1 1854 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1855 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1856 : ln;
1857
1858 /* We have 'lnc' characters to match in the pattern, but because of
1859 * multi-character folding, each character in the target can match
1860 * up to 3 characters (Unicode guarantees it will never exceed
1861 * this) if it is utf8-encoded; and up to 2 if not (based on the
1862 * fact that the Latin 1 folds are already determined, and the
1863 * only multi-char fold in that range is the sharp-s folding to
1864 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1865 * string character. Adjust lnc accordingly, rounding up, so that
1866 * if we need to match at least 4+1/3 chars, that really is 5. */
1867 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1868 lnc = (lnc + expansion - 1) / expansion;
1869
1870 /* As in the non-UTF8 case, if we have to match 3 characters, and
1871 * only 2 are left, it's guaranteed to fail, so don't start a
1872 * match that would require us to go beyond the end of the string
1873 */
ea3daa5d 1874 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1875
02d5137b 1876 if (reginfo->intuit && e < s) {
73104a1b
KW
1877 e = s; /* Due to minlen logic of intuit() */
1878 }
0658cdde 1879
73104a1b
KW
1880 /* XXX Note that we could recalculate e to stop the loop earlier,
1881 * as the worst case expansion above will rarely be met, and as we
1882 * go along we would usually find that e moves further to the left.
1883 * This would happen only after we reached the point in the loop
1884 * where if there were no expansion we should fail. Unclear if
1885 * worth the expense */
1886
1887 while (s <= e) {
1888 char *my_strend= (char *)strend;
1889 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1890 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1891 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1892 {
1893 goto got_it;
1894 }
1895 s += (utf8_target) ? UTF8SKIP(s) : 1;
1896 }
1897 break;
1898 }
236d82fd 1899
73104a1b 1900 case BOUNDL:
236d82fd 1901 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b
KW
1902 break;
1903 case NBOUNDL:
236d82fd 1904 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b
KW
1905 break;
1906 case BOUND:
236d82fd 1907 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1908 break;
1909 case BOUNDA:
236d82fd 1910 FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
73104a1b
KW
1911 break;
1912 case NBOUND:
236d82fd 1913 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1914 break;
1915 case NBOUNDA:
236d82fd 1916 FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
73104a1b
KW
1917 break;
1918 case BOUNDU:
236d82fd 1919 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b
KW
1920 break;
1921 case NBOUNDU:
236d82fd 1922 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 1923 break;
73104a1b
KW
1924 case LNBREAK:
1925 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1926 is_LNBREAK_latin1_safe(s, strend)
1927 );
1928 break;
3018b823
KW
1929
1930 /* The argument to all the POSIX node types is the class number to pass to
1931 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1932
1933 case NPOSIXL:
1934 to_complement = 1;
1935 /* FALLTHROUGH */
1936
1937 case POSIXL:
3018b823
KW
1938 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1939 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1940 break;
3018b823
KW
1941
1942 case NPOSIXD:
1943 to_complement = 1;
1944 /* FALLTHROUGH */
1945
1946 case POSIXD:
1947 if (utf8_target) {
1948 goto posix_utf8;
1949 }
1950 goto posixa;
1951
1952 case NPOSIXA:
1953 if (utf8_target) {
1954 /* The complement of something that matches only ASCII matches all
837226c8
KW
1955 * non-ASCII, plus everything in ASCII that isn't in the class. */
1956 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
1957 || ! _generic_isCC_A(*s, FLAGS(c)));
1958 break;
1959 }
1960
1961 to_complement = 1;
1962 /* FALLTHROUGH */
1963
73104a1b 1964 case POSIXA:
3018b823 1965 posixa:
73104a1b 1966 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1967 * byte invariant character. */
1968 REXEC_FBC_CLASS_SCAN(
1969 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1970 break;
3018b823
KW
1971
1972 case NPOSIXU:
1973 to_complement = 1;
1974 /* FALLTHROUGH */
1975
1976 case POSIXU:
1977 if (! utf8_target) {
1978 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1979 FLAGS(c))));
1980 }
1981 else {
1982
1983 posix_utf8:
1984 classnum = (_char_class_number) FLAGS(c);
1985 if (classnum < _FIRST_NON_SWASH_CC) {
1986 while (s < strend) {
1987
1988 /* We avoid loading in the swash as long as possible, but
1989 * should we have to, we jump to a separate loop. This
1990 * extra 'if' statement is what keeps this code from being
1991 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1992 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1993 goto found_above_latin1;
1994 }
1995 if ((UTF8_IS_INVARIANT(*s)
1996 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1997 classnum)))
1998 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1999 && to_complement ^ cBOOL(
94bb8c36
KW
2000 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2001 *(s + 1)),
3018b823
KW
2002 classnum))))
2003 {
02d5137b 2004 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2005 goto got_it;
2006 else {
2007 tmp = doevery;
2008 }
2009 }
2010 else {
2011 tmp = 1;
2012 }
2013 s += UTF8SKIP(s);
2014 }
2015 }
2016 else switch (classnum) { /* These classes are implemented as
2017 macros */
2018 case _CC_ENUM_SPACE: /* XXX would require separate code if we
2019 revert the change of \v matching this */
924ba076 2020 /* FALLTHROUGH */
3018b823
KW
2021
2022 case _CC_ENUM_PSXSPC:
2023 REXEC_FBC_UTF8_CLASS_SCAN(
2024 to_complement ^ cBOOL(isSPACE_utf8(s)));
2025 break;
2026
2027 case _CC_ENUM_BLANK:
2028 REXEC_FBC_UTF8_CLASS_SCAN(
2029 to_complement ^ cBOOL(isBLANK_utf8(s)));
2030 break;
2031
2032 case _CC_ENUM_XDIGIT:
2033 REXEC_FBC_UTF8_CLASS_SCAN(
2034 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2035 break;
2036
2037 case _CC_ENUM_VERTSPACE:
2038 REXEC_FBC_UTF8_CLASS_SCAN(
2039 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2040 break;
2041
2042 case _CC_ENUM_CNTRL:
2043 REXEC_FBC_UTF8_CLASS_SCAN(
2044 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2045 break;
2046
2047 default:
2048 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
a25b5927 2049 assert(0); /* NOTREACHED */
3018b823
KW
2050 }
2051 }
2052 break;
2053
2054 found_above_latin1: /* Here we have to load a swash to get the result
2055 for the current code point */
2056 if (! PL_utf8_swash_ptrs[classnum]) {
2057 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2058 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2059 _core_swash_init("utf8",
2060 "",
2061 &PL_sv_undef, 1, 0,
2062 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2063 }
2064
2065 /* This is a copy of the loop above for swash classes, though using the
2066 * FBC macro instead of being expanded out. Since we've loaded the
2067 * swash, we don't have to check for that each time through the loop */
2068 REXEC_FBC_UTF8_CLASS_SCAN(
2069 to_complement ^ cBOOL(_generic_utf8(
2070 classnum,
2071 s,
2072 swash_fetch(PL_utf8_swash_ptrs[classnum],
2073 (U8 *) s, TRUE))));
73104a1b
KW
2074 break;
2075
2076 case AHOCORASICKC:
2077 case AHOCORASICK:
2078 {
2079 DECL_TRIE_TYPE(c);
2080 /* what trie are we using right now */
2081 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2082 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2083 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2084
2085 const char *last_start = strend - trie->minlen;
6148ee25 2086#ifdef DEBUGGING
73104a1b 2087 const char *real_start = s;
6148ee25 2088#endif
73104a1b
KW
2089 STRLEN maxlen = trie->maxlen;
2090 SV *sv_points;
2091 U8 **points; /* map of where we were in the input string
2092 when reading a given char. For ASCII this
2093 is unnecessary overhead as the relationship
2094 is always 1:1, but for Unicode, especially
2095 case folded Unicode this is not true. */
2096 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2097 U8 *bitmap=NULL;
2098
2099
2100 GET_RE_DEBUG_FLAGS_DECL;
2101
2102 /* We can't just allocate points here. We need to wrap it in
2103 * an SV so it gets freed properly if there is a croak while
2104 * running the match */
2105 ENTER;
2106 SAVETMPS;
2107 sv_points=newSV(maxlen * sizeof(U8 *));
2108 SvCUR_set(sv_points,
2109 maxlen * sizeof(U8 *));
2110 SvPOK_on(sv_points);
2111 sv_2mortal(sv_points);
2112 points=(U8**)SvPV_nolen(sv_points );
2113 if ( trie_type != trie_utf8_fold
2114 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2115 {
2116 if (trie->bitmap)
2117 bitmap=(U8*)trie->bitmap;
2118 else
2119 bitmap=(U8*)ANYOF_BITMAP(c);
2120 }
2121 /* this is the Aho-Corasick algorithm modified a touch
2122 to include special handling for long "unknown char" sequences.
2123 The basic idea being that we use AC as long as we are dealing
2124 with a possible matching char, when we encounter an unknown char
2125 (and we have not encountered an accepting state) we scan forward
2126 until we find a legal starting char.
2127 AC matching is basically that of trie matching, except that when
2128 we encounter a failing transition, we fall back to the current
2129 states "fail state", and try the current char again, a process
2130 we repeat until we reach the root state, state 1, or a legal
2131 transition. If we fail on the root state then we can either
2132 terminate if we have reached an accepting state previously, or
2133 restart the entire process from the beginning if we have not.
2134
2135 */
2136 while (s <= last_start) {
2137 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2138 U8 *uc = (U8*)s;
2139 U16 charid = 0;
2140 U32 base = 1;
2141 U32 state = 1;
2142 UV uvc = 0;
2143 STRLEN len = 0;
2144 STRLEN foldlen = 0;
2145 U8 *uscan = (U8*)NULL;
2146 U8 *leftmost = NULL;
2147#ifdef DEBUGGING
2148 U32 accepted_word= 0;
786e8c11 2149#endif
73104a1b
KW
2150 U32 pointpos = 0;
2151
2152 while ( state && uc <= (U8*)strend ) {
2153 int failed=0;
2154 U32 word = aho->states[ state ].wordnum;
2155
2156 if( state==1 ) {
2157 if ( bitmap ) {
2158 DEBUG_TRIE_EXECUTE_r(
2159 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2160 dump_exec_pos( (char *)uc, c, strend, real_start,
2161 (char *)uc, utf8_target );
2162 PerlIO_printf( Perl_debug_log,
2163 " Scanning for legal start char...\n");
2164 }
2165 );
2166 if (utf8_target) {
2167 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2168 uc += UTF8SKIP(uc);
2169 }
2170 } else {
2171 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2172 uc++;
2173 }
786e8c11 2174 }
73104a1b 2175 s= (char *)uc;
07be1b83 2176 }
73104a1b
KW
2177 if (uc >(U8*)last_start) break;
2178 }
2179
2180 if ( word ) {
2181 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2182 if (!leftmost || lpos < leftmost) {
2183 DEBUG_r(accepted_word=word);
2184 leftmost= lpos;
7016d6eb 2185 }
73104a1b 2186 if (base==0) break;
7016d6eb 2187
73104a1b
KW
2188 }
2189 points[pointpos++ % maxlen]= uc;
2190 if (foldlen || uc < (U8*)strend) {
2191 REXEC_TRIE_READ_CHAR(trie_type, trie,
2192 widecharmap, uc,
2193 uscan, len, uvc, charid, foldlen,
2194 foldbuf, uniflags);
2195 DEBUG_TRIE_EXECUTE_r({
2196 dump_exec_pos( (char *)uc, c, strend,
2197 real_start, s, utf8_target);
2198 PerlIO_printf(Perl_debug_log,
2199 " Charid:%3u CP:%4"UVxf" ",
2200 charid, uvc);
2201 });
2202 }
2203 else {
2204 len = 0;
2205 charid = 0;
2206 }
07be1b83 2207
73104a1b
KW
2208
2209 do {
6148ee25 2210#ifdef DEBUGGING
73104a1b 2211 word = aho->states[ state ].wordnum;
6148ee25 2212#endif
73104a1b
KW
2213 base = aho->states[ state ].trans.base;
2214
2215 DEBUG_TRIE_EXECUTE_r({
2216 if (failed)
2217 dump_exec_pos( (char *)uc, c, strend, real_start,
2218 s, utf8_target );
2219 PerlIO_printf( Perl_debug_log,
2220 "%sState: %4"UVxf", word=%"UVxf,
2221 failed ? " Fail transition to " : "",
2222 (UV)state, (UV)word);
2223 });
2224 if ( base ) {
2225 U32 tmp;
2226 I32 offset;
2227 if (charid &&
2228 ( ((offset = base + charid
2229 - 1 - trie->uniquecharcount)) >= 0)
2230 && ((U32)offset < trie->lasttrans)
2231 && trie->trans[offset].check == state
2232 && (tmp=trie->trans[offset].next))
2233 {
2234 DEBUG_TRIE_EXECUTE_r(
2235 PerlIO_printf( Perl_debug_log," - legal\n"));
2236 state = tmp;
2237 break;
07be1b83
YO
2238 }
2239 else {
786e8c11 2240 DEBUG_TRIE_EXECUTE_r(
73104a1b 2241 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2242 failed = 1;
73104a1b 2243 state = aho->fail[state];
07be1b83 2244 }
07be1b83 2245 }
73104a1b
KW
2246 else {
2247 /* we must be accepting here */
2248 DEBUG_TRIE_EXECUTE_r(
2249 PerlIO_printf( Perl_debug_log," - accepting\n"));
2250 failed = 1;
2251 break;
786e8c11 2252 }
73104a1b
KW
2253 } while(state);
2254 uc += len;
2255 if (failed) {
2256 if (leftmost)
2257 break;
2258 if (!state) state = 1;
07be1b83 2259 }
73104a1b
KW
2260 }
2261 if ( aho->states[ state ].wordnum ) {
2262 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2263 if (!leftmost || lpos < leftmost) {
2264 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2265 leftmost = lpos;
07be1b83
YO
2266 }
2267 }
73104a1b
KW
2268 if (leftmost) {
2269 s = (char*)leftmost;
2270 DEBUG_TRIE_EXECUTE_r({
2271 PerlIO_printf(
2272 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2273 (UV)accepted_word, (IV)(s - real_start)
2274 );
2275 });
02d5137b 2276 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2277 FREETMPS;
2278 LEAVE;
2279 goto got_it;
2280 }
2281 s = HOPc(s,1);
2282 DEBUG_TRIE_EXECUTE_r({
2283 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2284 });
2285 } else {
2286 DEBUG_TRIE_EXECUTE_r(
2287 PerlIO_printf( Perl_debug_log,"No match.\n"));
2288 break;
2289 }
2290 }
2291 FREETMPS;
2292 LEAVE;
2293 }
2294 break;
2295 default:
2296 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2297 }
2298 return 0;
2299 got_it:
2300 return s;
6eb5f6b9
JH
2301}
2302
60165aa4
DM
2303/* set RX_SAVED_COPY, RX_SUBBEG etc.
2304 * flags have same meanings as with regexec_flags() */
2305
749f4950
DM
2306static void
2307S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2308 char *strbeg,
2309 char *strend,
2310 SV *sv,
2311 U32 flags,
2312 bool utf8_target)
2313{
2314 struct regexp *const prog = ReANY(rx);
2315
60165aa4
DM
2316 if (flags & REXEC_COPY_STR) {
2317#ifdef PERL_ANY_COW
2318 if (SvCANCOW(sv)) {
2319 if (DEBUG_C_TEST) {
2320 PerlIO_printf(Perl_debug_log,
2321 "Copy on write: regexp capture, type %d\n",
2322 (int) SvTYPE(sv));
2323 }
5411a0e5
DM
2324 /* Create a new COW SV to share the match string and store
2325 * in saved_copy, unless the current COW SV in saved_copy
2326 * is valid and suitable for our purpose */
2327 if (( prog->saved_copy
2328 && SvIsCOW(prog->saved_copy)
2329 && SvPOKp(prog->saved_copy)
2330 && SvIsCOW(sv)
2331 && SvPOKp(sv)
2332 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2333 {
5411a0e5
DM
2334 /* just reuse saved_copy SV */
2335 if (RXp_MATCH_COPIED(prog)) {
2336 Safefree(prog->subbeg);
2337 RXp_MATCH_COPIED_off(prog);
2338 }
2339 }
2340 else {
2341 /* create new COW SV to share string */
a76b0e90
DM
2342 RX_MATCH_COPY_FREE(rx);
2343 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2344 }
5411a0e5
DM
2345 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2346 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2347 prog->sublen = strend - strbeg;
2348 prog->suboffset = 0;
2349 prog->subcoffset = 0;
2350 } else
2351#endif
2352 {
99a90e59
FC
2353 SSize_t min = 0;
2354 SSize_t max = strend - strbeg;
ea3daa5d 2355 SSize_t sublen;
60165aa4
DM
2356
2357 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2358 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2359 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2360 ) { /* don't copy $' part of string */
2361 U32 n = 0;
2362 max = -1;
2363 /* calculate the right-most part of the string covered
2364 * by a capture. Due to look-ahead, this may be to
2365 * the right of $&, so we have to scan all captures */
2366 while (n <= prog->lastparen) {
2367 if (prog->offs[n].end > max)
2368 max = prog->offs[n].end;
2369 n++;
2370 }
2371 if (max == -1)
2372 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2373 ? prog->offs[0].start
2374 : 0;
2375 assert(max >= 0 && max <= strend - strbeg);
2376 }
2377
2378 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2379 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2380 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2381 ) { /* don't copy $` part of string */
2382 U32 n = 0;
2383 min = max;
2384 /* calculate the left-most part of the string covered
2385 * by a capture. Due to look-behind, this may be to
2386 * the left of $&, so we have to scan all captures */
2387 while (min && n <= prog->lastparen) {
2388 if ( prog->offs[n].start != -1
2389 && prog->offs[n].start < min)
2390 {
2391 min = prog->offs[n].start;
2392 }
2393 n++;
2394 }
2395 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2396 && min > prog->offs[0].end
2397 )
2398 min = prog->offs[0].end;
2399
2400 }
2401
2402 assert(min >= 0 && min <= max && min <= strend - strbeg);
2403 sublen = max - min;
2404
2405 if (RX_MATCH_COPIED(rx)) {
2406 if (sublen > prog->sublen)
2407 prog->subbeg =
2408 (char*)saferealloc(prog->subbeg, sublen+1);
2409 }
2410 else
2411 prog->subbeg = (char*)safemalloc(sublen+1);
2412 Copy(strbeg + min, prog->subbeg, sublen, char);
2413 prog->subbeg[sublen] = '\0';
2414 prog->suboffset = min;
2415 prog->sublen = sublen;
2416 RX_MATCH_COPIED_on(rx);
2417 }
2418 prog->subcoffset = prog->suboffset;
2419 if (prog->suboffset && utf8_target) {
2420 /* Convert byte offset to chars.
2421 * XXX ideally should only compute this if @-/@+
2422 * has been seen, a la PL_sawampersand ??? */
2423
2424 /* If there's a direct correspondence between the
2425 * string which we're matching and the original SV,
2426 * then we can use the utf8 len cache associated with
2427 * the SV. In particular, it means that under //g,
2428 * sv_pos_b2u() will use the previously cached
2429 * position to speed up working out the new length of
2430 * subcoffset, rather than counting from the start of
2431 * the string each time. This stops
2432 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2433 * from going quadratic */
2434 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2435 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2436 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2437 else
2438 prog->subcoffset = utf8_length((U8*)strbeg,
2439 (U8*)(strbeg+prog->suboffset));
2440 }
2441 }
2442 else {
2443 RX_MATCH_COPY_FREE(rx);
2444 prog->subbeg = strbeg;
2445 prog->suboffset = 0;
2446 prog->subcoffset = 0;
2447 prog->sublen = strend - strbeg;
2448 }
2449}
2450
2451
2452
fae667d5 2453
6eb5f6b9
JH
2454/*
2455 - regexec_flags - match a regexp against a string
2456 */
2457I32
5aaab254 2458Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2459 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2460/* stringarg: the point in the string at which to begin matching */
2461/* strend: pointer to null at end of string */
2462/* strbeg: real beginning of string */
2463/* minend: end of match must be >= minend bytes after stringarg. */
2464/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2465 * itself is accessed via the pointers above */
2466/* data: May be used for some additional optimizations.
d058ec57 2467 Currently unused. */
a340edde 2468/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2469
6eb5f6b9 2470{
8d919b0a 2471 struct regexp *const prog = ReANY(rx);
5aaab254 2472 char *s;
eb578fdb 2473 regnode *c;
03c83e26 2474 char *startpos;
ea3daa5d
FC
2475 SSize_t minlen; /* must match at least this many chars */
2476 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2477 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2478 I32 multiline;
f8fc2ecf 2479 RXi_GET_DECL(prog,progi);
02d5137b
DM
2480 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2481 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2482 regexp_paren_pair *swap = NULL;
006f26b2 2483 I32 oldsave;
a3621e74
YO
2484 GET_RE_DEBUG_FLAGS_DECL;
2485
7918f24d 2486 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2487 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2488
2489 /* Be paranoid... */
03c83e26 2490 if (prog == NULL || stringarg == NULL) {
6eb5f6b9 2491 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2492 }
2493
6c3fea77 2494 DEBUG_EXECUTE_r(
03c83e26 2495 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2496 "Matching");
2497 );
8adc0f72 2498
b342a604
DM
2499 startpos = stringarg;
2500
58430ea8 2501 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2502 MAGIC *mg;
2503
fef7148b
DM
2504 /* set reginfo->ganch, the position where \G can match */
2505
2506 reginfo->ganch =
2507 (flags & REXEC_IGNOREPOS)
2508 ? stringarg /* use start pos rather than pos() */
2509 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2510 /* Defined pos(): */
2511 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2512 : strbeg; /* pos() not defined; use start of string */
2513
2514 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2515 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2516
03c83e26
DM
2517 /* in the presence of \G, we may need to start looking earlier in
2518 * the string than the suggested start point of stringarg:
0b2c2a84 2519 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2520 * offset, such as
2521 * /..\G/: gofs = 2
2522 * /ab|c\G/: gofs = 1
2523 * or if the minimum offset isn't known, then we have to go back
2524 * to the start of the string, e.g. /w+\G/
2525 */
2bfbe302 2526
8e1490ee 2527 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2528 startpos = reginfo->ganch - prog->gofs;
2529 if (startpos <
2530 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2531 {
2532 DEBUG_r(PerlIO_printf(Perl_debug_log,
2533 "fail: ganch-gofs before earliest possible start\n"));
2534 return 0;
2535 }
2536 }
2537 else if (prog->gofs) {
b342a604
DM
2538 if (startpos - prog->gofs < strbeg)
2539 startpos = strbeg;
2540 else
2541 startpos -= prog->gofs;
03c83e26 2542 }
58430ea8 2543 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2544 startpos = strbeg;
03c83e26
DM
2545 }
2546
2547 minlen = prog->minlen;
b342a604 2548 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2549 DEBUG_r(PerlIO_printf(Perl_debug_log,
2550 "Regex match can't succeed, so not even tried\n"));
2551 return 0;
2552 }
2553
63a3746a
DM
2554 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2555 * which will call destuctors to reset PL_regmatch_state, free higher
2556 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2557 * regmatch_info_aux_eval */
2558
2559 oldsave = PL_savestack_ix;
2560
dfa77d06
DM
2561 s = startpos;
2562
e322109a 2563 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2564 && !(flags & REXEC_CHECKED))
2565 {
dfa77d06 2566 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2567 flags, NULL);
dfa77d06 2568 if (!s)
7fadf4a7
DM
2569 return 0;
2570
e322109a 2571 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2572 /* we can match based purely on the result of INTUIT.
2573 * Set up captures etc just for $& and $-[0]
2574 * (an intuit-only match wont have $1,$2,..) */
2575 assert(!prog->nparens);
d5e7783a
DM
2576
2577 /* s/// doesn't like it if $& is earlier than where we asked it to
2578 * start searching (which can happen on something like /.\G/) */
2579 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2580 && (s < stringarg))
2581 {
2582 /* this should only be possible under \G */
58430ea8 2583 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2584 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2585 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2586 goto phooey;
2587 }
2588
7fadf4a7
DM
2589 /* match via INTUIT shouldn't have any captures.
2590 * Let @-, @+, $^N know */
2591 prog->lastparen = prog->lastcloseparen = 0;
2592 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2593 prog->offs[0].start = s - strbeg;
2594 prog->offs[0].end = utf8_target
2595 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2596 : s - strbeg + prog->minlenret;
7fadf4a7 2597 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2598 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2599 strbeg, strend,
2600 sv, flags, utf8_target);
2601
7fadf4a7
DM
2602 return 1;
2603 }
2604 }
2605
6c3fea77 2606 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2607
dfa77d06 2608 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2609 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2610 "String too short [regexec_flags]...\n"));
2611 goto phooey;
1aa99e6b 2612 }
1de06328 2613
6eb5f6b9 2614 /* Check validity of program. */
f8fc2ecf 2615 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2616 Perl_croak(aTHX_ "corrupted regexp program");
2617 }
2618
1738e041
DM
2619 RX_MATCH_TAINTED_off(rx);
2620
6c3fea77
DM
2621 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2622 reginfo->intuit = 0;
2623 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2624 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2625 reginfo->warned = FALSE;
9d9163fb 2626 reginfo->strbeg = strbeg;
02d5137b 2627 reginfo->sv = sv;
1cb48e53 2628 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2629 reginfo->strend = strend;
6eb5f6b9 2630 /* see how far we have to get to not match where we matched before */
fe3974be 2631 reginfo->till = stringarg + minend;
6eb5f6b9 2632
60779a30 2633 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2634 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2635 S_cleanup_regmatch_info_aux has executed (registered by
2636 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2637 magic belonging to this SV.
2638 Not newSVsv, either, as it does not COW.
2639 */
2640 reginfo->sv = newSV(0);
4cba5ac0 2641 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2642 SAVEFREESV(reginfo->sv);
2643 }
2644
331b2dcc
DM
2645 /* reserve next 2 or 3 slots in PL_regmatch_state:
2646 * slot N+0: may currently be in use: skip it
2647 * slot N+1: use for regmatch_info_aux struct
2648 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2649 * slot N+3: ready for use by regmatch()
2650 */
bf2039a9 2651
331b2dcc
DM
2652 {
2653 regmatch_state *old_regmatch_state;
2654 regmatch_slab *old_regmatch_slab;
2655 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2656
2657 /* on first ever match, allocate first slab */
2658 if (!PL_regmatch_slab) {
2659 Newx(PL_regmatch_slab, 1, regmatch_slab);
2660 PL_regmatch_slab->prev = NULL;
2661 PL_regmatch_slab->next = NULL;
2662 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2663 }
bf2039a9 2664
331b2dcc
DM
2665 old_regmatch_state = PL_regmatch_state;
2666 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2667
331b2dcc
DM
2668 for (i=0; i <= max; i++) {
2669 if (i == 1)
2670 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2671 else if (i ==2)
2672 reginfo->info_aux_eval =
2673 reginfo->info_aux->info_aux_eval =
2674 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2675
331b2dcc
DM
2676 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2677 PL_regmatch_state = S_push_slab(aTHX);
2678 }
bf2039a9 2679
331b2dcc
DM
2680 /* note initial PL_regmatch_state position; at end of match we'll
2681 * pop back to there and free any higher slabs */
bf2039a9 2682
331b2dcc
DM
2683 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2684 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2685 reginfo->info_aux->poscache = NULL;
bf2039a9 2686
331b2dcc 2687 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2688
331b2dcc
DM
2689 if ((prog->extflags & RXf_EVAL_SEEN))
2690 S_setup_eval_state(aTHX_ reginfo);
2691 else
2692 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2693 }
d3aa529c 2694
6eb5f6b9 2695 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2696
288b8c02 2697 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2698 /* We have to be careful. If the previous successful match
2699 was from this regex we don't want a subsequent partially
2700 successful match to clobber the old results.
2701 So when we detect this possibility we add a swap buffer
d8da0584
KW
2702 to the re, and switch the buffer each match. If we fail,
2703 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2704 */
2705 swap = prog->offs;
2706 /* do we need a save destructor here for eval dies? */
2707 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2708 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2709 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2710 PTR2UV(prog),
2711 PTR2UV(swap),
2712 PTR2UV(prog->offs)
2713 ));
c74340f9 2714 }
6eb5f6b9
JH
2715
2716 /* Simplest case: anchored match need be tried only once. */
d3d47aac 2717 /* [unless only anchor is MBOL - implying multiline is set] */
8e1490ee 2718 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2719 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2720 goto got_it;
8e1490ee 2721 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2722 {
2723 char *end;
2724
2725 if (minlen)
2726 dontbother = minlen - 1;
1aa99e6b 2727 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2728 /* for multiline we only have to try after newlines */
33b8afdf 2729 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2730 /* because of the goto we can not easily reuse the macros for bifurcating the
2731 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2732 if (utf8_target) {
2733 if (s == startpos)
2734 goto after_try_utf8;
2735 while (1) {
02d5137b 2736 if (regtry(reginfo, &s)) {
92f3d482
YO
2737 goto got_it;
2738 }
2739 after_try_utf8:
2740 if (s > end) {
2741 goto phooey;
2742 }
2743 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2744 s = re_intuit_start(rx, sv, strbeg,
2745 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2746 if (!s) {
2747 goto phooey;
2748 }
2749 }
2750 else {
2751 s += UTF8SKIP(s);
2752 }
2753 }
2754 } /* end search for check string in unicode */
2755 else {
2756 if (s == startpos) {
2757 goto after_try_latin;
2758 }
2759 while (1) {
02d5137b 2760 if (regtry(reginfo, &s)) {
92f3d482
YO
2761 goto got_it;
2762 }
2763 after_try_latin:
2764 if (s > end) {
2765 goto phooey;
2766 }
2767 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2768 s = re_intuit_start(rx, sv, strbeg,
2769 s + 1, strend, flags, NULL);
92f3d482
YO
2770 if (!s) {
2771 goto phooey;
2772 }
2773 }
2774 else {
2775 s++;
2776 }
2777 }
2778 } /* end search for check string in latin*/
2779 } /* end search for check string */
2780 else { /* search for newline */
2781 if (s > startpos) {
2782 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2783 s--;
92f3d482 2784 }
21eede78
YO
2785 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2786 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2787 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2788 if (regtry(reginfo, &s))
6eb5f6b9
JH
2789 goto got_it;
2790 }
92f3d482
YO
2791 }
2792 } /* end search for newline */
2793 } /* end anchored/multiline check string search */
6eb5f6b9 2794 goto phooey;
a8430a8b 2795 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2796 {
a8430a8b
YO
2797 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2798 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2799 /* For anchored \G, the only position it can match from is
2800 * (ganch-gofs); we already set startpos to this above; if intuit
2801 * moved us on from there, we can't possibly succeed */
2802 assert(startpos == reginfo->ganch - prog->gofs);
2803 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2804 goto got_it;
2805 goto phooey;
2806 }
2807
2808 /* Messy cases: unanchored match. */
bbe252da 2809 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2810 /* we have /x+whatever/ */
984e6dd1 2811 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2812 char ch;
bf93d4cc
GS
2813#ifdef DEBUGGING
2814 int did_match = 0;
2815#endif
f2ed9b32 2816 if (utf8_target) {
7e0d5ad7
KW
2817 if (! prog->anchored_utf8) {
2818 to_utf8_substr(prog);
2819 }
2820 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2821 REXEC_FBC_SCAN(
6eb5f6b9 2822 if (*s == ch) {
a3621e74 2823 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2824 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2825 s += UTF8SKIP(s);
2826 while (s < strend && *s == ch)
2827 s += UTF8SKIP(s);
2828 }
4cadc6a9 2829 );
7e0d5ad7 2830
6eb5f6b9
JH
2831 }
2832 else {
7e0d5ad7
KW
2833 if (! prog->anchored_substr) {
2834 if (! to_byte_substr(prog)) {
6b54ddc5 2835 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2836 }
2837 }
2838 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2839 REXEC_FBC_SCAN(
6eb5f6b9 2840 if (*s == ch) {
a3621e74 2841 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2842 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2843 s++;
2844 while (s < strend && *s == ch)
2845 s++;
2846 }
4cadc6a9 2847 );
6eb5f6b9 2848 }
a3621e74 2849 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2850 PerlIO_printf(Perl_debug_log,
b7953727
JH
2851 "Did not find anchored character...\n")
2852 );
6eb5f6b9 2853 }
a0714e2c
SS
2854 else if (prog->anchored_substr != NULL
2855 || prog->anchored_utf8 != NULL
2856 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2857 && prog->float_max_offset < strend - s)) {
2858 SV *must;
ea3daa5d
FC
2859 SSize_t back_max;
2860 SSize_t back_min;
33b8afdf 2861 char *last;
6eb5f6b9 2862 char *last1; /* Last position checked before */
bf93d4cc
GS
2863#ifdef DEBUGGING
2864 int did_match = 0;
2865#endif
33b8afdf 2866 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2867 if (utf8_target) {
2868 if (! prog->anchored_utf8) {
2869 to_utf8_substr(prog);
2870 }
2871 must = prog->anchored_utf8;
2872 }
2873 else {
2874 if (! prog->anchored_substr) {
2875 if (! to_byte_substr(prog)) {
6b54ddc5 2876 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2877 }
2878 }
2879 must = prog->anchored_substr;
2880 }
33b8afdf
JH
2881 back_max = back_min = prog->anchored_offset;
2882 } else {
7e0d5ad7
KW
2883 if (utf8_target) {
2884 if (! prog->float_utf8) {
2885 to_utf8_substr(prog);
2886 }
2887 must = prog->float_utf8;
2888 }
2889 else {
2890 if (! prog->float_substr) {
2891 if (! to_byte_substr(prog)) {
6b54ddc5 2892 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2893 }
2894 }
2895 must = prog->float_substr;
2896 }
33b8afdf
JH
2897 back_max = prog->float_max_offset;
2898 back_min = prog->float_min_offset;
2899 }
1de06328 2900
1de06328
YO
2901 if (back_min<0) {
2902 last = strend;
2903 } else {
2904 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2905 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2906 - (SvTAIL(must) != 0) + back_min), strbeg);
2907 }
9d9163fb 2908 if (s > reginfo->strbeg)
6eb5f6b9
JH
2909 last1 = HOPc(s, -1);
2910 else
2911 last1 = s - 1; /* bogus */
2912
a0288114 2913 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2914 check_substr==must. */
bf05793b 2915 dontbother = 0;
6eb5f6b9
JH
2916 strend = HOPc(strend, -dontbother);
2917 while ( (s <= last) &&
e50d57d4 2918 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2919 (unsigned char*)strend, must,
c33e64f0 2920 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2921 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2922 if (HOPc(s, -back_max) > last1) {
2923 last1 = HOPc(s, -back_min);
2924 s = HOPc(s, -back_max);
2925 }
2926 else {
9d9163fb
DM
2927 char * const t = (last1 >= reginfo->strbeg)
2928 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2929
2930 last1 = HOPc(s, -back_min);
52657f30 2931 s = t;
6eb5f6b9 2932 }
f2ed9b32 2933 if (utf8_target) {
6eb5f6b9 2934 while (s <= last1) {
02d5137b 2935 if (regtry(reginfo, &s))
6eb5f6b9 2936 goto got_it;
7016d6eb
DM
2937 if (s >= last1) {
2938 s++; /* to break out of outer loop */
2939 break;
2940 }
2941 s += UTF8SKIP(s);
6eb5f6b9
JH
2942 }
2943 }
2944 else {
2945 while (s <= last1) {
02d5137b 2946 if (regtry(reginfo, &s))
6eb5f6b9
JH
2947 goto got_it;
2948 s++;
2949 }
2950 }
2951 }
ab3bbdeb 2952 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2953 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2954 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2955 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2956 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2957 ? "anchored" : "floating"),
ab3bbdeb
YO
2958 quoted, RE_SV_TAIL(must));
2959 });
6eb5f6b9
JH
2960 goto phooey;
2961 }
f8fc2ecf 2962 else if ( (c = progi->regstclass) ) {
f14c76ed 2963 if (minlen) {
f8fc2ecf 2964 const OPCODE op = OP(progi->regstclass);
66e933ab 2965 /* don't bother with what can't match */
786e8c11 2966 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2967 strend = HOPc(strend, -(minlen - 1));
2968 }
a3621e74 2969 DEBUG_EXECUTE_r({
be8e71aa 2970 SV * const prop = sv_newmortal();
2395827c 2971 regprop(prog, prop, c, reginfo);
0df25f3d 2972 {
f2ed9b32 2973 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2974 s,strend-s,60);
0df25f3d 2975 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2976 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2977 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2978 quoted, (int)(strend - s));
0df25f3d 2979 }
ffc61ed2 2980 });
f9176b44 2981 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2982 goto got_it;
07be1b83 2983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2984 }
2985 else {
2986 dontbother = 0;
a0714e2c 2987 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2988 /* Trim the end. */
6af40bd7 2989 char *last= NULL;
33b8afdf 2990 SV* float_real;
c33e64f0
FC
2991 STRLEN len;
2992 const char *little;
33b8afdf 2993
7e0d5ad7
KW
2994 if (utf8_target) {
2995 if (! prog->float_utf8) {
2996 to_utf8_substr(prog);
2997 }
2998 float_real = prog->float_utf8;
2999 }
3000 else {
3001 if (! prog->float_substr) {
3002 if (! to_byte_substr(prog)) {
6b54ddc5 3003 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3004 }
3005 }
3006 float_real = prog->float_substr;
3007 }
d6a28714 3008
c33e64f0
FC
3009 little = SvPV_const(float_real, len);
3010 if (SvTAIL(float_real)) {
7f18ad16
KW
3011 /* This means that float_real contains an artificial \n on
3012 * the end due to the presence of something like this:
3013 * /foo$/ where we can match both "foo" and "foo\n" at the
3014 * end of the string. So we have to compare the end of the
3015 * string first against the float_real without the \n and
3016 * then against the full float_real with the string. We
3017 * have to watch out for cases where the string might be
3018 * smaller than the float_real or the float_real without
3019 * the \n. */
1a13b075
YO
3020 char *checkpos= strend - len;
3021 DEBUG_OPTIMISE_r(
3022 PerlIO_printf(Perl_debug_log,
3023 "%sChecking for float_real.%s\n",
3024 PL_colors[4], PL_colors[5]));
3025 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3026 /* can't match, even if we remove the trailing \n
3027 * string is too short to match */
1a13b075
YO
3028 DEBUG_EXECUTE_r(
3029 PerlIO_printf(Perl_debug_log,
3030 "%sString shorter than required trailing substring, cannot match.%s\n",
3031 PL_colors[4], PL_colors[5]));
3032 goto phooey;
3033 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3034 /* can match, the end of the string matches without the
3035 * "\n" */
1a13b075
YO
3036 last = checkpos + 1;
3037 } else if (checkpos < strbeg) {
7f18ad16
KW
3038 /* cant match, string is too short when the "\n" is
3039 * included */
1a13b075
YO
3040 DEBUG_EXECUTE_r(
3041 PerlIO_printf(Perl_debug_log,
3042 "%sString does not contain required trailing substring, cannot match.%s\n",
3043 PL_colors[4], PL_colors[5]));
3044 goto phooey;
3045 } else if (!multiline) {
7f18ad16
KW
3046 /* non multiline match, so compare with the "\n" at the
3047 * end of the string */
1a13b075
YO
3048 if (memEQ(checkpos, little, len)) {
3049 last= checkpos;
3050 } else {
3051 DEBUG_EXECUTE_r(
3052 PerlIO_printf(Perl_debug_log,
3053 "%sString does not contain required trailing substring, cannot match.%s\n",
3054 PL_colors[4], PL_colors[5]));
3055 goto phooey;
3056 }
3057 } else {
7f18ad16
KW
3058 /* multiline match, so we have to search for a place
3059 * where the full string is located */
d6a28714 3060 goto find_last;
1a13b075 3061 }
c33e64f0 3062 } else {
d6a28714 3063 find_last:
9041c2e3 3064 if (len)
d6a28714 3065 last = rninstr(s, strend, little, little + len);
b8c5462f 3066 else
a0288114 3067 last = strend; /* matching "$" */
b8c5462f 3068 }
6af40bd7 3069 if (!last) {
7f18ad16
KW
3070 /* at one point this block contained a comment which was
3071 * probably incorrect, which said that this was a "should not
3072 * happen" case. Even if it was true when it was written I am
3073 * pretty sure it is not anymore, so I have removed the comment
3074 * and replaced it with this one. Yves */
6bda09f9
YO
3075 DEBUG_EXECUTE_r(
3076 PerlIO_printf(Perl_debug_log,
b729e729
YO
3077 "%sString does not contain required substring, cannot match.%s\n",
3078 PL_colors[4], PL_colors[5]
6af40bd7
YO
3079 ));
3080 goto phooey;
bf93d4cc 3081 }
d6a28714
JH
3082 dontbother = strend - last + prog->float_min_offset;
3083 }
3084 if (minlen && (dontbother < minlen))
3085 dontbother = minlen - 1;
3086 strend -= dontbother; /* this one's always in bytes! */
3087 /* We don't know much -- general case. */
f2ed9b32 3088 if (utf8_target) {
d6a28714 3089 for (;;) {
02d5137b 3090 if (regtry(reginfo, &s))
d6a28714
JH
3091 goto got_it;
3092 if (s >= strend)
3093 break;
b8c5462f 3094 s += UTF8SKIP(s);
d6a28714
JH
3095 };
3096 }
3097 else {
3098 do {
02d5137b 3099 if (regtry(reginfo, &s))
d6a28714
JH
3100 goto got_it;
3101 } while (s++ < strend);
3102 }
3103 }
3104
3105 /* Failure. */
3106 goto phooey;
3107
3108got_it:
d5e7783a
DM
3109 /* s/// doesn't like it if $& is earlier than where we asked it to
3110 * start searching (which can happen on something like /.\G/) */
3111 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3112 && (prog->offs[0].start < stringarg - strbeg))
3113 {
3114 /* this should only be possible under \G */
58430ea8 3115 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3116 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3117 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3118 goto phooey;
3119 }
3120
495f47a5
DM
3121 DEBUG_BUFFERS_r(
3122 if (swap)
3123 PerlIO_printf(Perl_debug_log,
3124 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3125 PTR2UV(prog),
3126 PTR2UV(swap)
3127 );
3128 );
e9105d30 3129 Safefree(swap);
d6a28714 3130
bf2039a9
DM
3131 /* clean up; this will trigger destructors that will free all slabs
3132 * above the current one, and cleanup the regmatch_info_aux
3133 * and regmatch_info_aux_eval sructs */
8adc0f72 3134
006f26b2
DM
3135 LEAVE_SCOPE(oldsave);
3136
5daac39c
NC
3137 if (RXp_PAREN_NAMES(prog))
3138 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3139
0254aed9
DM
3140 RX_MATCH_UTF8_set(rx, utf8_target);
3141
d6a28714 3142 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3143 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3144 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3145 strbeg, reginfo->strend,
3146 sv, flags, utf8_target);
9041c2e3 3147
d6a28714
JH
3148 return 1;
3149
3150phooey:
a3621e74 3151 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3152 PL_colors[4], PL_colors[5]));
8adc0f72 3153
bf2039a9
DM
3154 /* clean up; this will trigger destructors that will free all slabs
3155 * above the current one, and cleanup the regmatch_info_aux
3156 * and regmatch_info_aux_eval sructs */
8adc0f72 3157
006f26b2
DM
3158 LEAVE_SCOPE(oldsave);
3159
e9105d30 3160 if (swap) {
c74340f9 3161 /* we failed :-( roll it back */
495f47a5
DM
3162 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3163 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3164 PTR2UV(prog),
3165 PTR2UV(prog->offs),
3166 PTR2UV(swap)
3167 ));
e9105d30
GG
3168 Safefree(prog->offs);
3169 prog->offs = swap;
3170 }
d6a28714
JH
3171 return 0;
3172}
3173
6bda09f9 3174
b3d298be 3175/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3176 * Do inc before dec, in case old and new rex are the same */
baa60164 3177#define SET_reg_curpm(Re2) \
bf2039a9 3178 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3179 (void)ReREFCNT_inc(Re2); \
3180 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3181 PM_SETRE((PL_reg_curpm), (Re2)); \
3182 }
3183
3184
d6a28714
JH
3185/*
3186 - regtry - try match at specific point
3187 */
3188STATIC I32 /* 0 failure, 1 success */
f73aaa43 3189S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3190{
d6a28714 3191 CHECKPOINT lastcp;
288b8c02 3192 REGEXP *const rx = reginfo->prog;
8d919b0a 3193 regexp *const prog = ReANY(rx);
99a90e59 3194 SSize_t result;
f8fc2ecf 3195 RXi_GET_DECL(prog,progi);
a3621e74 3196 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3197
3198 PERL_ARGS_ASSERT_REGTRY;
3199
24b23f37 3200 reginfo->cutpoint=NULL;
d6a28714 3201
9d9163fb 3202 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3203 prog->lastparen = 0;
03994de8 3204 prog->lastcloseparen = 0;
d6a28714
JH
3205
3206 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3207 to do this again and again, prog->lastparen should take care of
3dd2943c 3208 this! --ilya*/
dafc8851
JH
3209
3210 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3211 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3212 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3213 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3214 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3215 * Meanwhile, this code *is* needed for the
daf18116
JH
3216 * above-mentioned test suite tests to succeed. The common theme
3217 * on those tests seems to be returning null fields from matches.
225593e1 3218 * --jhi updated by dapm */
dafc8851 3219#if 1
d6a28714 3220 if (prog->nparens) {
b93070ed 3221 regexp_paren_pair *pp = prog->offs;
eb578fdb 3222 I32 i;
b93070ed 3223 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3224 ++pp;
3225 pp->start = -1;
3226 pp->end = -1;
d6a28714
JH
3227 }
3228 }
dafc8851 3229#endif
02db2b7b 3230 REGCP_SET(lastcp);
f73aaa43
DM
3231 result = regmatch(reginfo, *startposp, progi->program + 1);
3232 if (result != -1) {
3233 prog->offs[0].end = result;
d6a28714
JH
3234 return 1;
3235 }
24b23f37 3236 if (reginfo->cutpoint)
f73aaa43 3237 *startposp= reginfo->cutpoint;
02db2b7b 3238 REGCP_UNWIND(lastcp);
d6a28714
JH
3239 return 0;
3240}
3241
02db2b7b 3242
8ba1375e
MJD
3243#define sayYES goto yes
3244#define sayNO goto no
262b90c4 3245#define sayNO_SILENT goto no_silent
8ba1375e 3246
f9f4320a
YO
3247/* we dont use STMT_START/END here because it leads to
3248 "unreachable code" warnings, which are bogus, but distracting. */
3249#define CACHEsayNO \
c476f425 3250 if (ST.cache_mask) \
2ac8ff4b 3251 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3252 sayNO
3298f257 3253
a3621e74 3254/* this is used to determine how far from the left messages like
265c4333
YO
3255 'failed...' are printed. It should be set such that messages
3256 are inline with the regop output that created them.
a3621e74 3257*/
265c4333 3258#define REPORT_CODE_OFF 32
a3621e74
YO
3259
3260
40a82448
DM
3261#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3262#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3263#define CHRTEST_NOT_A_CP_1 -999
3264#define CHRTEST_NOT_A_CP_2 -998
9e137952 3265
5d9a96ca
DM
3266/* grab a new slab and return the first slot in it */
3267
3268STATIC regmatch_state *
3269S_push_slab(pTHX)
3270{
a35a87e7 3271#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3272 dMY_CXT;
3273#endif
5d9a96ca
DM
3274 regmatch_slab *s = PL_regmatch_slab->next;
3275 if (!s) {
3276 Newx(s, 1, regmatch_slab);
3277 s->prev = PL_regmatch_slab;
3278 s->next = NULL;
3279 PL_regmatch_slab->next = s;
3280 }
3281 PL_regmatch_slab = s;
86545054 3282 return SLAB_FIRST(s);
5d9a96ca 3283}
5b47454d 3284
95b24440 3285
40a82448
DM
3286/* push a new state then goto it */
3287
4d5016e5
DM
3288#define PUSH_STATE_GOTO(state, node, input) \
3289 pushinput = input; \
40a82448
DM
3290 scan = node; \
3291 st->resume_state = state; \
3292 goto push_state;
3293
3294/* push a new state with success backtracking, then goto it */
3295
4d5016e5
DM
3296#define PUSH_YES_STATE_GOTO(state, node, input) \
3297 pushinput = input; \
40a82448
DM
3298 scan = node; \
3299 st->resume_state = state; \
3300 goto push_yes_state;
3301
aa283a38 3302
aa283a38 3303
4d5016e5 3304
d6a28714 3305/*
95b24440 3306
bf1f174e
DM
3307regmatch() - main matching routine
3308
3309This is basically one big switch statement in a loop. We execute an op,
3310set 'next' to point the next op, and continue. If we come to a point which
3311we may need to backtrack to on failure such as (A|B|C), we push a
3312backtrack state onto the backtrack stack. On failure, we pop the top
3313state, and re-enter the loop at the state indicated. If there are no more
3314states to pop, we return failure.
3315
3316Sometimes we also need to backtrack on success; for example /A+/, where
3317after successfully matching one A, we need to go back and try to
3318match another one; similarly for lookahead assertions: if the assertion
3319completes successfully, we backtrack to the state just before the assertion
3320and then carry on. In these cases, the pushed state is marked as
3321'backtrack on success too'. This marking is in fact done by a chain of
3322pointers, each pointing to the previous 'yes' state. On success, we pop to
3323the nearest yes state, discarding any intermediate failure-only states.
3324Sometimes a yes state is pushed just to force some cleanup code to be
3325called at the end of a successful match or submatch; e.g. (??{$re}) uses
3326it to free the inner regex.
3327
3328Note that failure backtracking rewinds the cursor position, while
3329success backtracking leaves it alone.
3330
3331A pattern is complete when the END op is executed, while a subpattern
3332such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3333ops trigger the "pop to last yes state if any, otherwise return true"
3334behaviour.
3335
3336A common convention in this function is to use A and B to refer to the two
3337subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3338the subpattern to be matched possibly multiple times, while B is the entire
3339rest of the pattern. Variable and state names reflect this convention.
3340
3341The states in the main switch are the union of ops and failure/success of
3342substates associated with with that op. For example, IFMATCH is the op
3343that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3344'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3345successfully matched A and IFMATCH_A_fail is a state saying that we have
3346just failed to match A. Resume states always come in pairs. The backtrack
3347state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3348at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3349on success or failure.
3350
3351The struct that holds a backtracking state is actually a big union, with
3352one variant for each major type of op. The variable st points to the
3353top-most backtrack struct. To make the code clearer, within each
3354block of code we #define ST to alias the relevant union.
3355
3356Here's a concrete example of a (vastly oversimplified) IFMATCH
3357implementation:
3358
3359 switch (state) {
3360 ....
3361
3362#define ST st->u.ifmatch
3363
3364 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3365 ST.foo = ...; // some state we wish to save
95b24440 3366 ...
bf1f174e
DM
3367 // push a yes backtrack state with a resume value of
3368 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3369 // first node of A:
4d5016e5 3370 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3371 // NOTREACHED
3372
3373 case IFMATCH_A: // we have successfully executed A; now continue with B
3374 next = B;
3375 bar = ST.foo; // do something with the preserved value
3376 break;
3377
3378 case IFMATCH_A_fail: // A failed, so the assertion failed
3379 ...; // do some housekeeping, then ...
3380 sayNO; // propagate the failure
3381
3382#undef ST
95b24440 3383
bf1f174e
DM
3384 ...
3385 }
95b24440 3386
bf1f174e
DM
3387For any old-timers reading this who are familiar with the old recursive
3388approach, the code above is equivalent to:
95b24440 3389
bf1f174e
DM
3390 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3391 {
3392 int foo = ...
95b24440 3393 ...
bf1f174e
DM
3394 if (regmatch(A)) {
3395 next = B;
3396 bar = foo;
3397 break;
95b24440 3398 }
bf1f174e
DM
3399 ...; // do some housekeeping, then ...
3400 sayNO; // propagate the failure
95b24440 3401 }
bf1f174e
DM
3402
3403The topmost backtrack state, pointed to by st, is usually free. If you
3404want to claim it, populate any ST.foo fields in it with values you wish to
3405save, then do one of
3406
4d5016e5
DM
3407 PUSH_STATE_GOTO(resume_state, node, newinput);
3408 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3409
3410which sets that backtrack state's resume value to 'resume_state', pushes a
3411new free entry to the top of the backtrack stack, then goes to 'node'.
3412On backtracking, the free slot is popped, and the saved state becomes the
3413new free state. An ST.foo field in this new top state can be temporarily
3414accessed to retrieve values, but once the main loop is re-entered, it
3415becomes available for reuse.
3416
3417Note that the depth of the backtrack stack constantly increases during the
3418left-to-right execution of the pattern, rather than going up and down with
3419the pattern nesting. For example the stack is at its maximum at Z at the
3420end of the pattern, rather than at X in the following:
3421
3422 /(((X)+)+)+....(Y)+....Z/
3423
3424The only exceptions to this are lookahead/behind assertions and the cut,
3425(?>A), which pop all the backtrack states associated with A before
3426continuing.
3427
486ec47a 3428Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3429PL_regmatch_state and st always point to the currently active state,
3430and PL_regmatch_slab points to the slab currently containing
3431PL_regmatch_state. The first time regmatch() is called, the first slab is
3432allocated, and is never freed until interpreter destruction. When the slab
3433is full, a new one is allocated and chained to the end. At exit from
3434regmatch(), slabs allocated since entry are freed.
3435
3436*/
95b24440 3437
40a82448 3438
5bc10b2c 3439#define DEBUG_STATE_pp(pp) \
265c4333 3440 DEBUG_STATE_r({ \
baa60164 3441 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3442 PerlIO_printf(Perl_debug_log, \
5d458dd8 3443 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3444 depth*2, "", \
baa60164 3445 PL_reg_name[st->resume_state], \
5d458dd8
YO
3446 ((st==yes_state||st==mark_state) ? "[" : ""), \
3447 ((st==yes_state) ? "Y" : ""), \
3448 ((st==mark_state) ? "M" : ""), \
3449 ((st==yes_state||st==mark_state) ? "]" : "") \
3450 ); \
265c4333 3451 });
5bc10b2c 3452
40a82448 3453
3dab1dad 3454#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3455
3df15adc 3456#ifdef DEBUGGING
5bc10b2c 3457
ab3bbdeb 3458STATIC void
f2ed9b32 3459S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3460 const char *start, const char *end, const char *blurb)
3461{
efd26800 3462 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3463
3464 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3465
ab3bbdeb
YO
3466 if (!PL_colorset)
3467 reginitcolors();
3468 {
3469 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3470 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3471
f2ed9b32 3472 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3473 start, end - start, 60);
3474
3475 PerlIO_printf(Perl_debug_log,
3476 "%s%s REx%s %s against %s\n",
3477 PL_colors[4], blurb, PL_colors[5], s0, s1);
3478
f2ed9b32 3479 if (utf8_target||utf8_pat)
1de06328
YO
3480 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3481 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3482 utf8_pat && utf8_target ? " and " : "",
3483 utf8_target ? "string" : ""
ab3bbdeb
YO
3484 );
3485 }
3486}
3df15adc
YO
3487
3488STATIC void
786e8c11
YO
3489S_dump_exec_pos(pTHX_ const char *locinput,
3490 const regnode *scan,
3491 const char *loc_regeol,
3492 const char *loc_bostr,
3493 const char *loc_reg_starttry,
f2ed9b32 3494 const bool utf8_target)
07be1b83 3495{
786e8c11 3496 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3497 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3498 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3499 /* The part of the string before starttry has one color
3500 (pref0_len chars), between starttry and current
3501 position another one (pref_len - pref0_len chars),
3502 after the current position the third one.
3503 We assume that pref0_len <= pref_len, otherwise we
3504 decrease pref0_len. */
786e8c11
YO
3505 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3506 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3507 int pref0_len;
3508
7918f24d
NC
3509 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3510
f2ed9b32 3511 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3512 pref_len++;
786e8c11
YO
3513 pref0_len = pref_len - (locinput - loc_reg_starttry);
3514 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3515 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3516 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3517 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3518 l--;
3519 if (pref0_len < 0)
3520 pref0_len = 0;
3521 if (pref0_len > pref_len)
3522 pref0_len = pref_len;
3523 {
f2ed9b32 3524 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3525
ab3bbdeb 3526 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3527 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3528
ab3bbdeb 3529 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3530 (locinput - pref_len + pref0_len),
1de06328 3531 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3532
ab3bbdeb 3533 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3534 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3535
1de06328 3536 const STRLEN tlen=len0+len1+len2;
3df15adc 3537 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3538 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3539 (IV)(locinput - loc_bostr),
07be1b83 3540 len0, s0,
07be1b83 3541 len1, s1,
07be1b83 3542 (docolor ? "" : "> <"),
07be1b83 3543 len2, s2,
f9f4320a 3544 (int)(tlen > 19 ? 0 : 19 - tlen),