This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
unify regmatch_info data
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 113#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad 115#define HOPc(pos,off) \
ba44c216 116 (char *)(reginfo->is_utf8_target \
220db18a 117 ? reghop3((U8*)pos, off, \
9d9163fb 118 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
119 : (U8*)(pos + off))
120#define HOPBACKc(pos, off) \
ba44c216 121 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
122 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
123 : (pos - off >= reginfo->strbeg) \
8e11feef 124 ? (U8*)pos - off \
3dab1dad 125 : NULL)
efb30f32 126
ba44c216 127#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 129
7016d6eb
DM
130
131#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
132#define NEXTCHR_IS_EOS (nextchr < 0)
133
134#define SET_nextchr \
220db18a 135 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
136
137#define SET_locinput(p) \
138 locinput = (p); \
139 SET_nextchr
140
141
c7304fe2
KW
142#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
143 if (!swash_ptr) { \
144 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2
KW
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
147 assert(swash_ptr); \
148 } \
149 } STMT_END
150
151/* If in debug mode, we test that a known character properly matches */
152#ifdef DEBUGGING
153# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
154 property_name, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
158#else
159# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
160 property_name, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
163#endif
d1eb3177 164
c7304fe2
KW
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
169
170#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171 STMT_START { \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
176 "_X_extend", \
177 COMBINING_GRAVE_ACCENT_UTF8); \
178 } STMT_END
d1eb3177 179
c7304fe2 180#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
181/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
182
5f80c4cf 183/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
184/* it would be nice to rework regcomp.sym to generate this stuff. sigh
185 *
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
192*/
3e901dc0
YO
193#define JUMPABLE(rn) ( \
194 OP(rn) == OPEN || \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
196 OP(rn) == EVAL || \
cca55fe3
JP
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 199 OP(rn) == KEEPS || \
3dab1dad 200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 201)
ee9b8eae 202#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 203
ee9b8eae
YO
204#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
205
206#if 0
207/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 210#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
211#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
212
213#else
214/* ... so we use this as its faster. */
215#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 216#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
217#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
219
220#endif
e2d8ce26 221
a84d97b6
HS
222/*
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
225*/
cca55fe3 226#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 230 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 231 else if (type == PLUS) \
cca55fe3 232 rn = NEXTOPER(rn); \
3dab1dad 233 else if (type == IFMATCH) \
a84d97b6 234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 235 else rn += NEXT_OFF(rn); \
3dab1dad 236 } \
5f80c4cf 237} STMT_END
74750237 238
22913b96
KW
239/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245#define SBASE 0xAC00 /* Start of block */
246#define SCount 11172 /* Length of block */
247#define TCount 28
c476f425 248
006f26b2
DM
249#define SLAB_FIRST(s) (&(s)->states[0])
250#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
251
a75351a1 252static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 253static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
006f26b2 254static void S_clear_backtrack_stack(pTHX_ void *p);
bf2039a9 255static regmatch_state * S_push_slab(pTHX);
51371543 256
87c0511b 257#define REGCP_PAREN_ELEMS 3
f067efbf 258#define REGCP_OTHER_ELEMS 3
e0fa7e2b 259#define REGCP_FRAME_ELEMS 1
620d5b66
NC
260/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
261 * are needed for the regexp context stack bookkeeping. */
262
76e3520e 263STATIC CHECKPOINT
92da3157 264S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 265{
97aff369 266 dVAR;
a3b680e6 267 const int retval = PL_savestack_ix;
92da3157
DM
268 const int paren_elems_to_push =
269 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
270 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
271 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 272 I32 p;
40a82448 273 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 274
b93070ed
DM
275 PERL_ARGS_ASSERT_REGCPPUSH;
276
e49a9654 277 if (paren_elems_to_push < 0)
5637ef5b
NC
278 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
279 paren_elems_to_push);
e49a9654 280
e0fa7e2b
NC
281 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
282 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 283 " out of range (%lu-%ld)",
92da3157
DM
284 total_elems,
285 (unsigned long)maxopenparen,
286 (long)parenfloor);
e0fa7e2b 287
620d5b66 288 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 289
495f47a5 290 DEBUG_BUFFERS_r(
92da3157 291 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
292 PerlIO_printf(Perl_debug_log,
293 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
294 PTR2UV(rex),
295 PTR2UV(rex->offs)
296 );
297 );
92da3157 298 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 299/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
300 SSPUSHINT(rex->offs[p].end);
301 SSPUSHINT(rex->offs[p].start);
1ca2007e 302 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 303 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
304 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
305 (UV)p,
306 (IV)rex->offs[p].start,
307 (IV)rex->offs[p].start_tmp,
308 (IV)rex->offs[p].end
40a82448 309 ));
a0d0e21e 310 }
b1ce53c5 311/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 312 SSPUSHINT(maxopenparen);
b93070ed
DM
313 SSPUSHINT(rex->lastparen);
314 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 315 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 316
a0d0e21e
LW
317 return retval;
318}
319
c277df42 320/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
321#define REGCP_SET(cp) \
322 DEBUG_STATE_r( \
ab3bbdeb 323 PerlIO_printf(Perl_debug_log, \
e4f74956 324 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
325 (IV)PL_savestack_ix)); \
326 cp = PL_savestack_ix
c3464db5 327
ab3bbdeb 328#define REGCP_UNWIND(cp) \
e4f74956 329 DEBUG_STATE_r( \
ab3bbdeb 330 if (cp != PL_savestack_ix) \
e4f74956
YO
331 PerlIO_printf(Perl_debug_log, \
332 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
333 (IV)(cp), (IV)PL_savestack_ix)); \
334 regcpblow(cp)
c277df42 335
a8d1f4b4
DM
336#define UNWIND_PAREN(lp, lcp) \
337 for (n = rex->lastparen; n > lp; n--) \
338 rex->offs[n].end = -1; \
339 rex->lastparen = n; \
340 rex->lastcloseparen = lcp;
341
342
f067efbf 343STATIC void
92da3157 344S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 345{
97aff369 346 dVAR;
e0fa7e2b 347 UV i;
87c0511b 348 U32 paren;
a3621e74
YO
349 GET_RE_DEBUG_FLAGS_DECL;
350
7918f24d
NC
351 PERL_ARGS_ASSERT_REGCPPOP;
352
b1ce53c5 353 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 354 i = SSPOPUV;
e0fa7e2b
NC
355 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
356 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
357 rex->lastcloseparen = SSPOPINT;
358 rex->lastparen = SSPOPINT;
92da3157 359 *maxopenparen_p = SSPOPINT;
b1ce53c5 360
620d5b66 361 i -= REGCP_OTHER_ELEMS;
b1ce53c5 362 /* Now restore the parentheses context. */
495f47a5
DM
363 DEBUG_BUFFERS_r(
364 if (i || rex->lastparen + 1 <= rex->nparens)
365 PerlIO_printf(Perl_debug_log,
366 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
367 PTR2UV(rex),
368 PTR2UV(rex->offs)
369 );
370 );
92da3157 371 paren = *maxopenparen_p;
620d5b66 372 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 373 I32 tmps;
1ca2007e 374 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 375 rex->offs[paren].start = SSPOPINT;
cf93c79d 376 tmps = SSPOPINT;
b93070ed
DM
377 if (paren <= rex->lastparen)
378 rex->offs[paren].end = tmps;
495f47a5
DM
379 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
380 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
381 (UV)paren,
382 (IV)rex->offs[paren].start,
383 (IV)rex->offs[paren].start_tmp,
384 (IV)rex->offs[paren].end,
385 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 386 );
87c0511b 387 paren--;
a0d0e21e 388 }
daf18116 389#if 1
dafc8851
JH
390 /* It would seem that the similar code in regtry()
391 * already takes care of this, and in fact it is in
392 * a better location to since this code can #if 0-ed out
393 * but the code in regtry() is needed or otherwise tests
394 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
395 * (as of patchlevel 7877) will fail. Then again,
396 * this code seems to be necessary or otherwise
225593e1
DM
397 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
398 * --jhi updated by dapm */
b93070ed 399 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 400 if (i > *maxopenparen_p)
b93070ed
DM
401 rex->offs[i].start = -1;
402 rex->offs[i].end = -1;
495f47a5
DM
403 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
404 " \\%"UVuf": %s ..-1 undeffing\n",
405 (UV)i,
92da3157 406 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 407 ));
a0d0e21e 408 }
dafc8851 409#endif
a0d0e21e
LW
410}
411
74088413
DM
412/* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
414
415STATIC void
92da3157 416S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
417{
418 I32 tmpix = PL_savestack_ix;
419 PL_savestack_ix = ix;
92da3157 420 regcppop(rex, maxopenparen_p);
74088413
DM
421 PL_savestack_ix = tmpix;
422}
423
02db2b7b 424#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 425
31c7f561
KW
426STATIC bool
427S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
428{
429 /* Returns a boolean as to whether or not 'character' is a member of the
430 * Posix character class given by 'classnum' that should be equivalent to a
431 * value in the typedef '_char_class_number'.
432 *
433 * Ideally this could be replaced by a just an array of function pointers
434 * to the C library functions that implement the macros this calls.
435 * However, to compile, the precise function signatures are required, and
436 * these may vary from platform to to platform. To avoid having to figure
437 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
438 * which adds an extra layer of function call overhead (unless the C
439 * optimizer strips it away). But we don't particularly care about
440 * performance with locales anyway. */
31c7f561
KW
441
442 switch ((_char_class_number) classnum) {
15861f94 443 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 444 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
445 case _CC_ENUM_ASCII: return isASCII_LC(character);
446 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
447 case _CC_ENUM_CASED: return isLOWER_LC(character)
448 || isUPPER_LC(character);
e8d596e0 449 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
450 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
451 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
452 case _CC_ENUM_LOWER: return isLOWER_LC(character);
453 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 454 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 455 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 456 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
457 case _CC_ENUM_UPPER: return isUPPER_LC(character);
458 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 459 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
460 default: /* VERTSPACE should never occur in locales */
461 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
462 }
463
464 assert(0); /* NOTREACHED */
465 return FALSE;
466}
467
3018b823
KW
468STATIC bool
469S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
470{
471 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
472 * 'character' is a member of the Posix character class given by 'classnum'
473 * that should be equivalent to a value in the typedef
474 * '_char_class_number'.
475 *
476 * This just calls isFOO_lc on the code point for the character if it is in
477 * the range 0-255. Outside that range, all characters avoid Unicode
478 * rules, ignoring any locale. So use the Unicode function if this class
479 * requires a swash, and use the Unicode macro otherwise. */
480
481 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
482
483 if (UTF8_IS_INVARIANT(*character)) {
484 return isFOO_lc(classnum, *character);
485 }
486 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
487 return isFOO_lc(classnum,
488 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
489 }
490
491 if (classnum < _FIRST_NON_SWASH_CC) {
492
493 /* Initialize the swash unless done already */
494 if (! PL_utf8_swash_ptrs[classnum]) {
495 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
496 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
497 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
498 }
499
92a2046b
KW
500 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
501 character,
502 TRUE /* is UTF */ ));
3018b823
KW
503 }
504
505 switch ((_char_class_number) classnum) {
506 case _CC_ENUM_SPACE:
507 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
508
509 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
510 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
511 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
512 default: return 0; /* Things like CNTRL are always
513 below 256 */
514 }
515
516 assert(0); /* NOTREACHED */
517 return FALSE;
518}
519
a687059c 520/*
e50aee73 521 * pregexec and friends
a687059c
LW
522 */
523
76234dfb 524#ifndef PERL_IN_XSUB_RE
a687059c 525/*
c277df42 526 - pregexec - match a regexp against a string
a687059c 527 */
c277df42 528I32
5aaab254 529Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 530 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
531/* stringarg: the point in the string at which to begin matching */
532/* strend: pointer to null at end of string */
533/* strbeg: real beginning of string */
534/* minend: end of match must be >= minend bytes after stringarg. */
535/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
536 * itself is accessed via the pointers above */
537/* nosave: For optimizations. */
c277df42 538{
7918f24d
NC
539 PERL_ARGS_ASSERT_PREGEXEC;
540
c277df42 541 return
9041c2e3 542 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
543 nosave ? 0 : REXEC_COPY_STR);
544}
76234dfb 545#endif
22e551b9 546
9041c2e3 547/*
cad2e5aa
JH
548 * Need to implement the following flags for reg_anch:
549 *
550 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
551 * USE_INTUIT_ML
552 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
553 * INTUIT_AUTORITATIVE_ML
554 * INTUIT_ONCE_NOML - Intuit can match in one location only.
555 * INTUIT_ONCE_ML
556 *
557 * Another flag for this function: SECOND_TIME (so that float substrs
558 * with giant delta may be not rechecked).
559 */
560
561/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
562
3f7c398e 563/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
564 Otherwise, only SvCUR(sv) is used to get strbeg. */
565
566/* XXXX We assume that strpos is strbeg unless sv. */
567
6eb5f6b9
JH
568/* XXXX Some places assume that there is a fixed substring.
569 An update may be needed if optimizer marks as "INTUITable"
570 RExen without fixed substrings. Similarly, it is assumed that
571 lengths of all the strings are no more than minlen, thus they
572 cannot come from lookahead.
40d049e4
YO
573 (Or minlen should take into account lookahead.)
574 NOTE: Some of this comment is not correct. minlen does now take account
575 of lookahead/behind. Further research is required. -- demerphq
576
577*/
6eb5f6b9 578
2c2d71f5
JH
579/* A failure to find a constant substring means that there is no need to make
580 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 581 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
582 regtry() should be needed.
583
584 REx compiler's optimizer found 4 possible hints:
585 a) Anchored substring;
586 b) Fixed substring;
587 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 588 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 589 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
590 string which does not contradict any of them.
591 */
2c2d71f5 592
6eb5f6b9
JH
593/* Most of decisions we do here should have been done at compile time.
594 The nodes of the REx which we used for the search should have been
595 deleted from the finite automaton. */
596
52a21eb3
DM
597/* args:
598 * rx: the regex to match against
599 * sv: the SV being matched: only used for utf8 flag; the string
600 * itself is accessed via the pointers below. Note that on
601 * something like an overloaded SV, SvPOK(sv) may be false
602 * and the string pointers may point to something unrelated to
603 * the SV itself.
604 * strbeg: real beginning of string
605 * strpos: the point in the string at which to begin matching
606 * strend: pointer to the byte following the last char of the string
607 * flags currently unused; set to 0
608 * data: currently unused; set to NULL
609 */
610
cad2e5aa 611char *
52a21eb3
DM
612Perl_re_intuit_start(pTHX_
613 REGEXP * const rx,
614 SV *sv,
615 const char * const strbeg,
616 char *strpos,
617 char *strend,
618 const U32 flags,
619 re_scream_pos_data *data)
cad2e5aa 620{
97aff369 621 dVAR;
8d919b0a 622 struct regexp *const prog = ReANY(rx);
eb578fdb 623 I32 start_shift = 0;
cad2e5aa 624 /* Should be nonnegative! */
eb578fdb
KW
625 I32 end_shift = 0;
626 char *s;
627 SV *check;
cad2e5aa 628 char *t;
f2ed9b32 629 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 630 I32 ml_anch;
eb578fdb 631 char *other_last = NULL; /* other substr checked before this */
bd61b366 632 char *check_at = NULL; /* check substr found at this pos */
d8080198 633 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 634 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 635 RXi_GET_DECL(prog,progi);
02d5137b
DM
636 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
637 regmatch_info *const reginfo = &reginfo_buf;
30944b6d 638#ifdef DEBUGGING
890ce7af 639 const char * const i_strpos = strpos;
30944b6d 640#endif
a3621e74
YO
641 GET_RE_DEBUG_FLAGS_DECL;
642
7918f24d 643 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
644 PERL_UNUSED_ARG(flags);
645 PERL_UNUSED_ARG(data);
7918f24d 646
ba44c216 647 reginfo->is_utf8_target = cBOOL(utf8_target);
cad2e5aa 648
c344f387
JH
649 /* CHR_DIST() would be more correct here but it makes things slow. */
650 if (prog->minlen > strend - strpos) {
a3621e74 651 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 652 "String too short... [re_intuit_start]\n"));
cad2e5aa 653 goto fail;
2c2d71f5 654 }
d8da0584 655
bf2039a9 656 reginfo->info_aux = NULL;
9d9163fb 657 reginfo->strbeg = strbeg;
220db18a 658 reginfo->strend = strend;
aed7b151 659 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 660 reginfo->intuit = 1;
1cb48e53
DM
661 /* not actually used within intuit, but zero for safety anyway */
662 reginfo->poscache_maxiter = 0;
02d5137b 663
f2ed9b32 664 if (utf8_target) {
33b8afdf
JH
665 if (!prog->check_utf8 && prog->check_substr)
666 to_utf8_substr(prog);
667 check = prog->check_utf8;
668 } else {
7e0d5ad7
KW
669 if (!prog->check_substr && prog->check_utf8) {
670 if (! to_byte_substr(prog)) {
6b54ddc5 671 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
672 }
673 }
33b8afdf
JH
674 check = prog->check_substr;
675 }
bbe252da
YO
676 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
677 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
678 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 679 && !multiline ) ); /* Check after \n? */
cad2e5aa 680
7e25d62c 681 if (!ml_anch) {
bbe252da
YO
682 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
683 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
7e25d62c 684 && (strpos != strbeg)) {
a3621e74 685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
686 goto fail;
687 }
d46b78c6
KW
688 if (prog->check_offset_min == prog->check_offset_max
689 && !(prog->extflags & RXf_CANY_SEEN)
690 && ! multiline) /* /m can cause \n's to match that aren't
691 accounted for in the string max length.
692 See [perl #115242] */
693 {
2c2d71f5 694 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
695 I32 slen;
696
1aa99e6b 697 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 698
653099ff
GS
699 if (SvTAIL(check)) {
700 slen = SvCUR(check); /* >= 1 */
cad2e5aa 701
9041c2e3 702 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 703 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 704 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 705 goto fail_finish;
cad2e5aa
JH
706 }
707 /* Now should match s[0..slen-2] */
708 slen--;
3f7c398e 709 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 710 || (slen > 1
3f7c398e 711 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 712 report_neq:
a3621e74 713 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
714 goto fail_finish;
715 }
cad2e5aa 716 }
3f7c398e 717 else if (*SvPVX_const(check) != *s
653099ff 718 || ((slen = SvCUR(check)) > 1
3f7c398e 719 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 720 goto report_neq;
c315bfe8 721 check_at = s;
2c2d71f5 722 goto success_at_start;
7e25d62c 723 }
cad2e5aa 724 }
2c2d71f5 725 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 726 s = strpos;
2c2d71f5 727 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
728 end_shift = prog->check_end_shift;
729
2c2d71f5 730 if (!ml_anch) {
a3b680e6 731 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 732 - (SvTAIL(check) != 0);
a3b680e6 733 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
734
735 if (end_shift < eshift)
736 end_shift = eshift;
737 }
cad2e5aa 738 }
2c2d71f5 739 else { /* Can match at random position */
cad2e5aa
JH
740 ml_anch = 0;
741 s = strpos;
1de06328
YO
742 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
743 end_shift = prog->check_end_shift;
744
745 /* end shift should be non negative here */
cad2e5aa
JH
746 }
747
bcdf7404 748#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 749 if (end_shift < 0)
1de06328 750 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 751 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
752#endif
753
2c2d71f5
JH
754 restart:
755 /* Find a possible match in the region s..strend by looking for
756 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
757
758 {
759 I32 srch_start_shift = start_shift;
760 I32 srch_end_shift = end_shift;
c33e64f0
FC
761 U8* start_point;
762 U8* end_point;
1de06328
YO
763 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
764 srch_end_shift -= ((strbeg - s) - srch_start_shift);
765 srch_start_shift = strbeg - s;
766 }
6bda09f9 767 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
768 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
769 (IV)prog->check_offset_min,
770 (IV)srch_start_shift,
771 (IV)srch_end_shift,
772 (IV)prog->check_end_shift);
773 });
774
bbe252da 775 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
776 start_point= (U8*)(s + srch_start_shift);
777 end_point= (U8*)(strend - srch_end_shift);
778 } else {
779 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
780 end_point= HOP3(strend, -srch_end_shift, strbeg);
781 }
6bda09f9 782 DEBUG_OPTIMISE_MORE_r({
56570a2c 783 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 784 (int)(end_point - start_point),
fc8cd66c 785 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
786 start_point);
787 });
788
789 s = fbm_instr( start_point, end_point,
7fba1cd6 790 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 791 }
cad2e5aa
JH
792 /* Update the count-of-usability, remove useless subpatterns,
793 unshift s. */
2c2d71f5 794
ab3bbdeb 795 DEBUG_EXECUTE_r({
f2ed9b32 796 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
797 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
798 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 799 (s ? "Found" : "Did not find"),
f2ed9b32 800 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
801 ? "anchored" : "floating"),
802 quoted,
803 RE_SV_TAIL(check),
804 (s ? " at offset " : "...\n") );
805 });
2c2d71f5
JH
806
807 if (!s)
808 goto fail_finish;
2c2d71f5 809 /* Finish the diagnostic message */
a3621e74 810 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 811
1de06328
YO
812 /* XXX dmq: first branch is for positive lookbehind...
813 Our check string is offset from the beginning of the pattern.
814 So we need to do any stclass tests offset forward from that
815 point. I think. :-(
816 */
817
818
819
820 check_at=s;
821
822
2c2d71f5
JH
823 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
824 Start with the other substr.
825 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 826 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
827 *always* match. Probably should be marked during compile...
828 Probably it is right to do no SCREAM here...
829 */
830
f2ed9b32 831 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
832 : (prog->float_substr && prog->anchored_substr))
833 {
30944b6d 834 /* Take into account the "other" substring. */
2c2d71f5
JH
835 /* XXXX May be hopelessly wrong for UTF... */
836 if (!other_last)
6eb5f6b9 837 other_last = strpos;
f2ed9b32 838 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
839 do_other_anchored:
840 {
890ce7af
AL
841 char * const last = HOP3c(s, -start_shift, strbeg);
842 char *last1, *last2;
be8e71aa 843 char * const saved_s = s;
33b8afdf 844 SV* must;
2c2d71f5 845
2c2d71f5
JH
846 t = s - prog->check_offset_max;
847 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 848 && (!utf8_target
0ce71af7 849 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 850 && t > strpos)))
6f207bd3 851 NOOP;
2c2d71f5
JH
852 else
853 t = strpos;
1aa99e6b 854 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
855 if (t < other_last) /* These positions already checked */
856 t = other_last;
1aa99e6b 857 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
858 if (last < last1)
859 last1 = last;
1de06328
YO
860 /* XXXX It is not documented what units *_offsets are in.
861 We assume bytes, but this is clearly wrong.
862 Meaning this code needs to be carefully reviewed for errors.
863 dmq.
864 */
865
2c2d71f5 866 /* On end-of-str: see comment below. */
f2ed9b32 867 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
868 if (must == &PL_sv_undef) {
869 s = (char*)NULL;
1de06328 870 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
871 }
872 else
873 s = fbm_instr(
874 (unsigned char*)t,
875 HOP3(HOP3(last1, prog->anchored_offset, strend)
876 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
877 must,
7fba1cd6 878 multiline ? FBMrf_MULTILINE : 0
33b8afdf 879 );
ab3bbdeb 880 DEBUG_EXECUTE_r({
f2ed9b32 881 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
882 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
883 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 884 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
885 quoted, RE_SV_TAIL(must));
886 });
887
888
2c2d71f5
JH
889 if (!s) {
890 if (last1 >= last2) {
a3621e74 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
892 ", giving up...\n"));
893 goto fail_finish;
894 }
a3621e74 895 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 896 ", trying floating at offset %ld...\n",
be8e71aa 897 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
898 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
899 s = HOP3c(last, 1, strend);
2c2d71f5
JH
900 goto restart;
901 }
902 else {
a3621e74 903 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 904 (long)(s - i_strpos)));
1aa99e6b
IH
905 t = HOP3c(s, -prog->anchored_offset, strbeg);
906 other_last = HOP3c(s, 1, strend);
be8e71aa 907 s = saved_s;
2c2d71f5
JH
908 if (t == strpos)
909 goto try_at_start;
2c2d71f5
JH
910 goto try_at_offset;
911 }
30944b6d 912 }
2c2d71f5
JH
913 }
914 else { /* Take into account the floating substring. */
33b8afdf 915 char *last, *last1;
be8e71aa 916 char * const saved_s = s;
33b8afdf
JH
917 SV* must;
918
919 t = HOP3c(s, -start_shift, strbeg);
920 last1 = last =
921 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
922 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
923 last = HOP3c(t, prog->float_max_offset, strend);
924 s = HOP3c(t, prog->float_min_offset, strend);
925 if (s < other_last)
926 s = other_last;
2c2d71f5 927 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 928 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
929 /* fbm_instr() takes into account exact value of end-of-str
930 if the check is SvTAIL(ed). Since false positives are OK,
931 and end-of-str is not later than strend we are OK. */
932 if (must == &PL_sv_undef) {
933 s = (char*)NULL;
1de06328 934 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
935 }
936 else
2c2d71f5 937 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
938 (unsigned char*)last + SvCUR(must)
939 - (SvTAIL(must)!=0),
7fba1cd6 940 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 941 DEBUG_EXECUTE_r({
f2ed9b32 942 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
943 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
944 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 945 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
946 quoted, RE_SV_TAIL(must));
947 });
33b8afdf
JH
948 if (!s) {
949 if (last1 == last) {
a3621e74 950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
951 ", giving up...\n"));
952 goto fail_finish;
2c2d71f5 953 }
a3621e74 954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 955 ", trying anchored starting at offset %ld...\n",
be8e71aa 956 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
957 other_last = last;
958 s = HOP3c(t, 1, strend);
959 goto restart;
960 }
961 else {
a3621e74 962 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
963 (long)(s - i_strpos)));
964 other_last = s; /* Fix this later. --Hugo */
be8e71aa 965 s = saved_s;
33b8afdf
JH
966 if (t == strpos)
967 goto try_at_start;
968 goto try_at_offset;
969 }
2c2d71f5 970 }
cad2e5aa 971 }
2c2d71f5 972
1de06328 973
9ef43ace 974 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 975
6bda09f9 976 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
977 PerlIO_printf(Perl_debug_log,
978 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
979 (IV)prog->check_offset_min,
980 (IV)prog->check_offset_max,
981 (IV)(s-strpos),
982 (IV)(t-strpos),
983 (IV)(t-s),
984 (IV)(strend-strpos)
985 )
986 );
987
2c2d71f5 988 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 989 && (!utf8_target
9ef43ace 990 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
991 && t > strpos)))
992 {
2c2d71f5
JH
993 /* Fixed substring is found far enough so that the match
994 cannot start at strpos. */
995 try_at_offset:
cad2e5aa 996 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
997 /* Eventually fbm_*() should handle this, but often
998 anchored_offset is not 0, so this check will not be wasted. */
999 /* XXXX In the code below we prefer to look for "^" even in
1000 presence of anchored substrings. And we search even
1001 beyond the found float position. These pessimizations
1002 are historical artefacts only. */
1003 find_anchor:
2c2d71f5 1004 while (t < strend - prog->minlen) {
cad2e5aa 1005 if (*t == '\n') {
4ee3650e 1006 if (t < check_at - prog->check_offset_min) {
f2ed9b32 1007 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
1008 /* Since we moved from the found position,
1009 we definitely contradict the found anchored
30944b6d
IZ
1010 substr. Due to the above check we do not
1011 contradict "check" substr.
1012 Thus we can arrive here only if check substr
1013 is float. Redo checking for "other"=="fixed".
1014 */
9041c2e3 1015 strpos = t + 1;
a3621e74 1016 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1017 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1018 goto do_other_anchored;
1019 }
4ee3650e
GS
1020 /* We don't contradict the found floating substring. */
1021 /* XXXX Why not check for STCLASS? */
cad2e5aa 1022 s = t + 1;
a3621e74 1023 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1024 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1025 goto set_useful;
1026 }
4ee3650e
GS
1027 /* Position contradicts check-string */
1028 /* XXXX probably better to look for check-string
1029 than for "\n", so one should lower the limit for t? */
a3621e74 1030 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1031 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1032 other_last = strpos = s = t + 1;
cad2e5aa
JH
1033 goto restart;
1034 }
1035 t++;
1036 }
a3621e74 1037 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1038 PL_colors[0], PL_colors[1]));
2c2d71f5 1039 goto fail_finish;
cad2e5aa 1040 }
f5952150 1041 else {
a3621e74 1042 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1043 PL_colors[0], PL_colors[1]));
f5952150 1044 }
cad2e5aa
JH
1045 s = t;
1046 set_useful:
f2ed9b32 1047 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1048 }
1049 else {
f5952150 1050 /* The found string does not prohibit matching at strpos,
2c2d71f5 1051 - no optimization of calling REx engine can be performed,
f5952150
GS
1052 unless it was an MBOL and we are not after MBOL,
1053 or a future STCLASS check will fail this. */
2c2d71f5
JH
1054 try_at_start:
1055 /* Even in this situation we may use MBOL flag if strpos is offset
1056 wrt the start of the string. */
52a21eb3 1057 if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1058 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1059 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1060 {
cad2e5aa
JH
1061 t = strpos;
1062 goto find_anchor;
1063 }
a3621e74 1064 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1065 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1066 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1067 );
2c2d71f5 1068 success_at_start:
bbe252da 1069 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1070 && (utf8_target ? (
33b8afdf
JH
1071 prog->check_utf8 /* Could be deleted already */
1072 && --BmUSEFUL(prog->check_utf8) < 0
1073 && (prog->check_utf8 == prog->float_utf8)
1074 ) : (
1075 prog->check_substr /* Could be deleted already */
1076 && --BmUSEFUL(prog->check_substr) < 0
1077 && (prog->check_substr == prog->float_substr)
1078 )))
66e933ab 1079 {
cad2e5aa 1080 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1081 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1082 /* XXX Does the destruction order has to change with utf8_target? */
1083 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1084 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1085 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1086 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1087 check = NULL; /* abort */
cad2e5aa 1088 s = strpos;
486ec47a 1089 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1090 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1091 if (prog->intflags & PREGf_IMPLICIT)
1092 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1093 /* XXXX This is a remnant of the old implementation. It
1094 looks wasteful, since now INTUIT can use many
6eb5f6b9 1095 other heuristics. */
bbe252da 1096 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1097 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1098 }
1099 else
1100 s = strpos;
1101 }
1102
6eb5f6b9
JH
1103 /* Last resort... */
1104 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1105 /* trie stclasses are too expensive to use here, we are better off to
1106 leave it to regmatch itself */
f8fc2ecf 1107 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1108 /* minlen == 0 is possible if regstclass is \b or \B,
1109 and the fixed substr is ''$.
1110 Since minlen is already taken into account, s+1 is before strend;
1111 accidentally, minlen >= 1 guaranties no false positives at s + 1
1112 even for \b or \B. But (minlen? 1 : 0) below assumes that
1113 regstclass does not come from lookahead... */
1114 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1115 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1116 const U8* const str = (U8*)STRING(progi->regstclass);
1117 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1118 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1119 : 1);
1de06328
YO
1120 char * endpos;
1121 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1122 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1123 else if (prog->float_substr || prog->float_utf8)
1124 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1125 else
1126 endpos= strend;
1127
d8080198
YO
1128 if (checked_upto < s)
1129 checked_upto = s;
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132
6eb5f6b9 1133 t = s;
984e6dd1 1134 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1135 reginfo);
d8080198
YO
1136 if (s) {
1137 checked_upto = s;
1138 } else {
6eb5f6b9 1139#ifdef DEBUGGING
cbbf8932 1140 const char *what = NULL;
6eb5f6b9
JH
1141#endif
1142 if (endpos == strend) {
a3621e74 1143 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1144 "Could not match STCLASS...\n") );
1145 goto fail;
1146 }
a3621e74 1147 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1148 "This position contradicts STCLASS...\n") );
bbe252da 1149 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1150 goto fail;
d8080198
YO
1151 checked_upto = HOPBACKc(endpos, start_shift);
1152 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1153 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1154 /* Contradict one of substrings */
33b8afdf 1155 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1156 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1157 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1158 hop_and_restart:
1aa99e6b 1159 s = HOP3c(t, 1, strend);
66e933ab
GS
1160 if (s + start_shift + end_shift > strend) {
1161 /* XXXX Should be taken into account earlier? */
a3621e74 1162 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1163 "Could not match STCLASS...\n") );
1164 goto fail;
1165 }
5e39e1e5
HS
1166 if (!check)
1167 goto giveup;
a3621e74 1168 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1169 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1170 what, (long)(s + start_shift - i_strpos)) );
1171 goto restart;
1172 }
66e933ab 1173 /* Have both, check_string is floating */
6eb5f6b9
JH
1174 if (t + start_shift >= check_at) /* Contradicts floating=check */
1175 goto retry_floating_check;
1176 /* Recheck anchored substring, but not floating... */
9041c2e3 1177 s = check_at;
5e39e1e5
HS
1178 if (!check)
1179 goto giveup;
a3621e74 1180 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1181 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1182 (long)(other_last - i_strpos)) );
1183 goto do_other_anchored;
1184 }
60e71179
GS
1185 /* Another way we could have checked stclass at the
1186 current position only: */
1187 if (ml_anch) {
1188 s = t = t + 1;
5e39e1e5
HS
1189 if (!check)
1190 goto giveup;
a3621e74 1191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1192 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1193 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1194 goto try_at_offset;
66e933ab 1195 }
f2ed9b32 1196 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1197 goto fail;
486ec47a 1198 /* Check is floating substring. */
6eb5f6b9
JH
1199 retry_floating_check:
1200 t = check_at - start_shift;
a3621e74 1201 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1202 goto hop_and_restart;
1203 }
b7953727 1204 if (t != s) {
a3621e74 1205 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1206 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1207 (long)(t - i_strpos), (long)(s - i_strpos))
1208 );
1209 }
1210 else {
a3621e74 1211 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1212 "Does not contradict STCLASS...\n");
1213 );
1214 }
6eb5f6b9 1215 }
5e39e1e5 1216 giveup:
a3621e74 1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1218 PL_colors[4], (check ? "Guessed" : "Giving up"),
1219 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1220 return s;
2c2d71f5
JH
1221
1222 fail_finish: /* Substring not found */
33b8afdf 1223 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1224 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1225 fail:
a3621e74 1226 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1227 PL_colors[4], PL_colors[5]));
bd61b366 1228 return NULL;
cad2e5aa 1229}
9661b544 1230
a0a388a1
YO
1231#define DECL_TRIE_TYPE(scan) \
1232 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1233 trie_type = ((scan->flags == EXACT) \
1234 ? (utf8_target ? trie_utf8 : trie_plain) \
1235 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1236
fd3249ee
YO
1237#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1238STMT_START { \
fab2782b
YO
1239 STRLEN skiplen; \
1240 switch (trie_type) { \
1241 case trie_utf8_fold: \
1242 if ( foldlen>0 ) { \
1243 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1244 foldlen -= len; \
1245 uscan += len; \
1246 len=0; \
1247 } else { \
1248 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1249 len = UTF8SKIP(uc); \
1250 skiplen = UNISKIP( uvc ); \
1251 foldlen -= skiplen; \
1252 uscan = foldbuf + skiplen; \
1253 } \
1254 break; \
1255 case trie_latin_utf8_fold: \
1256 if ( foldlen>0 ) { \
1257 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1258 foldlen -= len; \
1259 uscan += len; \
1260 len=0; \
1261 } else { \
1262 len = 1; \
51910141 1263 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \
fab2782b
YO
1264 skiplen = UNISKIP( uvc ); \
1265 foldlen -= skiplen; \
1266 uscan = foldbuf + skiplen; \
1267 } \
1268 break; \
1269 case trie_utf8: \
1270 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1271 break; \
1272 case trie_plain: \
1273 uvc = (UV)*uc; \
1274 len = 1; \
1275 } \
1276 if (uvc < 256) { \
1277 charid = trie->charmap[ uvc ]; \
1278 } \
1279 else { \
1280 charid = 0; \
1281 if (widecharmap) { \
1282 SV** const svpp = hv_fetch(widecharmap, \
1283 (char*)&uvc, sizeof(UV), 0); \
1284 if (svpp) \
1285 charid = (U16)SvIV(*svpp); \
1286 } \
1287 } \
4cadc6a9
YO
1288} STMT_END
1289
4cadc6a9
YO
1290#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1291STMT_START { \
1292 while (s <= e) { \
1293 if ( (CoNd) \
fac1af77 1294 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1295 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1296 goto got_it; \
1297 s++; \
1298 } \
1299} STMT_END
1300
1301#define REXEC_FBC_UTF8_SCAN(CoDe) \
1302STMT_START { \
9a902117 1303 while (s < strend) { \
4cadc6a9 1304 CoDe \
9a902117 1305 s += UTF8SKIP(s); \
4cadc6a9
YO
1306 } \
1307} STMT_END
1308
1309#define REXEC_FBC_SCAN(CoDe) \
1310STMT_START { \
1311 while (s < strend) { \
1312 CoDe \
1313 s++; \
1314 } \
1315} STMT_END
1316
1317#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1318REXEC_FBC_UTF8_SCAN( \
1319 if (CoNd) { \
02d5137b 1320 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1321 goto got_it; \
1322 else \
1323 tmp = doevery; \
1324 } \
1325 else \
1326 tmp = 1; \
1327)
1328
1329#define REXEC_FBC_CLASS_SCAN(CoNd) \
1330REXEC_FBC_SCAN( \
1331 if (CoNd) { \
02d5137b 1332 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1333 goto got_it; \
1334 else \
1335 tmp = doevery; \
1336 } \
1337 else \
1338 tmp = 1; \
1339)
1340
1341#define REXEC_FBC_TRYIT \
02d5137b 1342if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1343 goto got_it
1344
e1d1eefb 1345#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1346 if (utf8_target) { \
e1d1eefb
YO
1347 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1348 } \
1349 else { \
1350 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1351 }
e1d1eefb 1352
786e8c11 1353#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb
DM
1354 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1355 (PL_reg_starttry),doutf8)
786e8c11 1356
cfaf538b
KW
1357
1358#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1359 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1360 tmp = TEST_NON_UTF8(tmp); \
1361 REXEC_FBC_UTF8_SCAN( \
1362 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1363 tmp = !tmp; \
1364 IF_SUCCESS; \
1365 } \
1366 else { \
1367 IF_FAIL; \
1368 } \
1369 ); \
1370
1371#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1372 if (s == reginfo->strbeg) { \
cfaf538b
KW
1373 tmp = '\n'; \
1374 } \
1375 else { \
9d9163fb 1376 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
cfaf538b
KW
1377 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1378 } \
1379 tmp = TeSt1_UtF8; \
1380 LOAD_UTF8_CHARCLASS_ALNUM(); \
1381 REXEC_FBC_UTF8_SCAN( \
1382 if (tmp == ! (TeSt2_UtF8)) { \
1383 tmp = !tmp; \
1384 IF_SUCCESS; \
1385 } \
1386 else { \
1387 IF_FAIL; \
1388 } \
1389 ); \
1390
63ac0dad
KW
1391/* The only difference between the BOUND and NBOUND cases is that
1392 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1393 * NBOUND. This is accomplished by passing it in either the if or else clause,
1394 * with the other one being empty */
1395#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1396 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1397
1398#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1399 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1400
1401#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1402 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1403
1404#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1405 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1406
63ac0dad
KW
1407
1408/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1409 * be passed in completely with the variable name being tested, which isn't
1410 * such a clean interface, but this is easier to read than it was before. We
1411 * are looking for the boundary (or non-boundary between a word and non-word
1412 * character. The utf8 and non-utf8 cases have the same logic, but the details
1413 * must be different. Find the "wordness" of the character just prior to this
1414 * one, and compare it with the wordness of this one. If they differ, we have
1415 * a boundary. At the beginning of the string, pretend that the previous
1416 * character was a new-line */
cfaf538b 1417#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1418 if (utf8_target) { \
cfaf538b 1419 UTF8_CODE \
63ac0dad
KW
1420 } \
1421 else { /* Not utf8 */ \
9d9163fb 1422 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1423 tmp = TEST_NON_UTF8(tmp); \
1424 REXEC_FBC_SCAN( \
1425 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1426 tmp = !tmp; \
1427 IF_SUCCESS; \
1428 } \
1429 else { \
1430 IF_FAIL; \
1431 } \
1432 ); \
1433 } \
02d5137b 1434 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1435 goto got_it;
1436
786e8c11 1437/* We know what class REx starts with. Try to find this position... */
02d5137b 1438/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1439/* annoyingly all the vars in this routine have different names from their counterparts
1440 in regmatch. /grrr */
1441
3c3eec57 1442STATIC char *
07be1b83 1443S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1444 const char *strend, regmatch_info *reginfo)
a687059c 1445{
73104a1b
KW
1446 dVAR;
1447 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1448 char *pat_string; /* The pattern's exactish string */
1449 char *pat_end; /* ptr to end char of pat_string */
1450 re_fold_t folder; /* Function for computing non-utf8 folds */
1451 const U8 *fold_array; /* array for folding ords < 256 */
1452 STRLEN ln;
1453 STRLEN lnc;
73104a1b
KW
1454 U8 c1;
1455 U8 c2;
1456 char *e;
1457 I32 tmp = 1; /* Scratch variable? */
ba44c216 1458 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1459 UV utf8_fold_flags = 0;
f9176b44 1460 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1461 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1462 with a result inverts that result, as 0^1 =
1463 1 and 1^1 = 0 */
1464 _char_class_number classnum;
1465
73104a1b 1466 RXi_GET_DECL(prog,progi);
2f7f8cb1 1467
73104a1b 1468 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1469
73104a1b
KW
1470 /* We know what class it must start with. */
1471 switch (OP(c)) {
1472 case ANYOF:
9aa1e39f 1473 case ANYOF_SYNTHETIC:
954a2af6 1474 case ANYOF_WARN_SUPER:
73104a1b
KW
1475 if (utf8_target) {
1476 REXEC_FBC_UTF8_CLASS_SCAN(
1477 reginclass(prog, c, (U8*)s, utf8_target));
1478 }
1479 else {
1480 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1481 }
1482 break;
1483 case CANY:
1484 REXEC_FBC_SCAN(
02d5137b 1485 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1486 goto got_it;
1487 else
1488 tmp = doevery;
1489 );
1490 break;
1491
1492 case EXACTFA:
984e6dd1 1493 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1494 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1495 goto do_exactf_utf8;
1496 }
1497 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1498 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1499 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1500
73104a1b
KW
1501 case EXACTF:
1502 if (utf8_target) {
16d951b7 1503
73104a1b
KW
1504 /* regcomp.c already folded this if pattern is in UTF-8 */
1505 utf8_fold_flags = 0;
1506 goto do_exactf_utf8;
1507 }
1508 fold_array = PL_fold;
1509 folder = foldEQ;
1510 goto do_exactf_non_utf8;
1511
1512 case EXACTFL:
984e6dd1 1513 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1514 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1515 goto do_exactf_utf8;
1516 }
1517 fold_array = PL_fold_locale;
1518 folder = foldEQ_locale;
1519 goto do_exactf_non_utf8;
3c760661 1520
73104a1b 1521 case EXACTFU_SS:
984e6dd1 1522 if (is_utf8_pat) {
73104a1b
KW
1523 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1524 }
1525 goto do_exactf_utf8;
16d951b7 1526
73104a1b
KW
1527 case EXACTFU_TRICKYFOLD:
1528 case EXACTFU:
984e6dd1
DM
1529 if (is_utf8_pat || utf8_target) {
1530 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1531 goto do_exactf_utf8;
1532 }
fac1af77 1533
73104a1b
KW
1534 /* Any 'ss' in the pattern should have been replaced by regcomp,
1535 * so we don't have to worry here about this single special case
1536 * in the Latin1 range */
1537 fold_array = PL_fold_latin1;
1538 folder = foldEQ_latin1;
1539
1540 /* FALL THROUGH */
1541
1542 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1543 are no glitches with fold-length differences
1544 between the target string and pattern */
1545
1546 /* The idea in the non-utf8 EXACTF* cases is to first find the
1547 * first character of the EXACTF* node and then, if necessary,
1548 * case-insensitively compare the full text of the node. c1 is the
1549 * first character. c2 is its fold. This logic will not work for
1550 * Unicode semantics and the german sharp ss, which hence should
1551 * not be compiled into a node that gets here. */
1552 pat_string = STRING(c);
1553 ln = STR_LEN(c); /* length to match in octets/bytes */
1554
1555 /* We know that we have to match at least 'ln' bytes (which is the
1556 * same as characters, since not utf8). If we have to match 3
1557 * characters, and there are only 2 availabe, we know without
1558 * trying that it will fail; so don't start a match past the
1559 * required minimum number from the far end */
1560 e = HOP3c(strend, -((I32)ln), s);
1561
02d5137b 1562 if (reginfo->intuit && e < s) {
73104a1b
KW
1563 e = s; /* Due to minlen logic of intuit() */
1564 }
fac1af77 1565
73104a1b
KW
1566 c1 = *pat_string;
1567 c2 = fold_array[c1];
1568 if (c1 == c2) { /* If char and fold are the same */
1569 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1570 }
1571 else {
1572 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1573 }
1574 break;
fac1af77 1575
73104a1b
KW
1576 do_exactf_utf8:
1577 {
1578 unsigned expansion;
1579
1580 /* If one of the operands is in utf8, we can't use the simpler folding
1581 * above, due to the fact that many different characters can have the
1582 * same fold, or portion of a fold, or different- length fold */
1583 pat_string = STRING(c);
1584 ln = STR_LEN(c); /* length to match in octets/bytes */
1585 pat_end = pat_string + ln;
984e6dd1 1586 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1587 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1588 : ln;
1589
1590 /* We have 'lnc' characters to match in the pattern, but because of
1591 * multi-character folding, each character in the target can match
1592 * up to 3 characters (Unicode guarantees it will never exceed
1593 * this) if it is utf8-encoded; and up to 2 if not (based on the
1594 * fact that the Latin 1 folds are already determined, and the
1595 * only multi-char fold in that range is the sharp-s folding to
1596 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1597 * string character. Adjust lnc accordingly, rounding up, so that
1598 * if we need to match at least 4+1/3 chars, that really is 5. */
1599 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1600 lnc = (lnc + expansion - 1) / expansion;
1601
1602 /* As in the non-UTF8 case, if we have to match 3 characters, and
1603 * only 2 are left, it's guaranteed to fail, so don't start a
1604 * match that would require us to go beyond the end of the string
1605 */
1606 e = HOP3c(strend, -((I32)lnc), s);
1607
02d5137b 1608 if (reginfo->intuit && e < s) {
73104a1b
KW
1609 e = s; /* Due to minlen logic of intuit() */
1610 }
0658cdde 1611
73104a1b
KW
1612 /* XXX Note that we could recalculate e to stop the loop earlier,
1613 * as the worst case expansion above will rarely be met, and as we
1614 * go along we would usually find that e moves further to the left.
1615 * This would happen only after we reached the point in the loop
1616 * where if there were no expansion we should fail. Unclear if
1617 * worth the expense */
1618
1619 while (s <= e) {
1620 char *my_strend= (char *)strend;
1621 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1622 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1623 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1624 {
1625 goto got_it;
1626 }
1627 s += (utf8_target) ? UTF8SKIP(s) : 1;
1628 }
1629 break;
1630 }
1631 case BOUNDL:
272d35c9 1632 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1633 FBC_BOUND(isWORDCHAR_LC,
1634 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1635 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1636 break;
1637 case NBOUNDL:
272d35c9 1638 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1639 FBC_NBOUND(isWORDCHAR_LC,
1640 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1641 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1642 break;
1643 case BOUND:
1644 FBC_BOUND(isWORDCHAR,
0eb30aeb 1645 isWORDCHAR_uni(tmp),
03940dc2 1646 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1647 break;
1648 case BOUNDA:
1649 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
1653 case NBOUND:
1654 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1655 isWORDCHAR_uni(tmp),
03940dc2 1656 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1657 break;
1658 case NBOUNDA:
1659 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1660 isWORDCHAR_A(tmp),
1661 isWORDCHAR_A((U8*)s));
1662 break;
1663 case BOUNDU:
1664 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1665 isWORDCHAR_uni(tmp),
03940dc2 1666 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1667 break;
1668 case NBOUNDU:
1669 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1670 isWORDCHAR_uni(tmp),
03940dc2 1671 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1672 break;
73104a1b
KW
1673 case LNBREAK:
1674 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1675 is_LNBREAK_latin1_safe(s, strend)
1676 );
1677 break;
3018b823
KW
1678
1679 /* The argument to all the POSIX node types is the class number to pass to
1680 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1681
1682 case NPOSIXL:
1683 to_complement = 1;
1684 /* FALLTHROUGH */
1685
1686 case POSIXL:
272d35c9 1687 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1688 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1689 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1690 break;
3018b823
KW
1691
1692 case NPOSIXD:
1693 to_complement = 1;
1694 /* FALLTHROUGH */
1695
1696 case POSIXD:
1697 if (utf8_target) {
1698 goto posix_utf8;
1699 }
1700 goto posixa;
1701
1702 case NPOSIXA:
1703 if (utf8_target) {
1704 /* The complement of something that matches only ASCII matches all
1705 * UTF-8 variant code points, plus everything in ASCII that isn't
1706 * in the class */
1707 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1708 || ! _generic_isCC_A(*s, FLAGS(c)));
1709 break;
1710 }
1711
1712 to_complement = 1;
1713 /* FALLTHROUGH */
1714
73104a1b 1715 case POSIXA:
3018b823 1716 posixa:
73104a1b 1717 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1718 * byte invariant character. */
1719 REXEC_FBC_CLASS_SCAN(
1720 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1721 break;
3018b823
KW
1722
1723 case NPOSIXU:
1724 to_complement = 1;
1725 /* FALLTHROUGH */
1726
1727 case POSIXU:
1728 if (! utf8_target) {
1729 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1730 FLAGS(c))));
1731 }
1732 else {
1733
1734 posix_utf8:
1735 classnum = (_char_class_number) FLAGS(c);
1736 if (classnum < _FIRST_NON_SWASH_CC) {
1737 while (s < strend) {
1738
1739 /* We avoid loading in the swash as long as possible, but
1740 * should we have to, we jump to a separate loop. This
1741 * extra 'if' statement is what keeps this code from being
1742 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1743 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1744 goto found_above_latin1;
1745 }
1746 if ((UTF8_IS_INVARIANT(*s)
1747 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1748 classnum)))
1749 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1750 && to_complement ^ cBOOL(
1751 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1752 classnum))))
1753 {
02d5137b 1754 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1755 goto got_it;
1756 else {
1757 tmp = doevery;
1758 }
1759 }
1760 else {
1761 tmp = 1;
1762 }
1763 s += UTF8SKIP(s);
1764 }
1765 }
1766 else switch (classnum) { /* These classes are implemented as
1767 macros */
1768 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1769 revert the change of \v matching this */
1770 /* FALL THROUGH */
1771
1772 case _CC_ENUM_PSXSPC:
1773 REXEC_FBC_UTF8_CLASS_SCAN(
1774 to_complement ^ cBOOL(isSPACE_utf8(s)));
1775 break;
1776
1777 case _CC_ENUM_BLANK:
1778 REXEC_FBC_UTF8_CLASS_SCAN(
1779 to_complement ^ cBOOL(isBLANK_utf8(s)));
1780 break;
1781
1782 case _CC_ENUM_XDIGIT:
1783 REXEC_FBC_UTF8_CLASS_SCAN(
1784 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1785 break;
1786
1787 case _CC_ENUM_VERTSPACE:
1788 REXEC_FBC_UTF8_CLASS_SCAN(
1789 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1790 break;
1791
1792 case _CC_ENUM_CNTRL:
1793 REXEC_FBC_UTF8_CLASS_SCAN(
1794 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1795 break;
1796
1797 default:
1798 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1799 assert(0); /* NOTREACHED */
1800 }
1801 }
1802 break;
1803
1804 found_above_latin1: /* Here we have to load a swash to get the result
1805 for the current code point */
1806 if (! PL_utf8_swash_ptrs[classnum]) {
1807 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1808 PL_utf8_swash_ptrs[classnum] =
1809 _core_swash_init("utf8", swash_property_names[classnum],
1810 &PL_sv_undef, 1, 0, NULL, &flags);
1811 }
1812
1813 /* This is a copy of the loop above for swash classes, though using the
1814 * FBC macro instead of being expanded out. Since we've loaded the
1815 * swash, we don't have to check for that each time through the loop */
1816 REXEC_FBC_UTF8_CLASS_SCAN(
1817 to_complement ^ cBOOL(_generic_utf8(
1818 classnum,
1819 s,
1820 swash_fetch(PL_utf8_swash_ptrs[classnum],
1821 (U8 *) s, TRUE))));
73104a1b
KW
1822 break;
1823
1824 case AHOCORASICKC:
1825 case AHOCORASICK:
1826 {
1827 DECL_TRIE_TYPE(c);
1828 /* what trie are we using right now */
1829 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1830 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1831 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1832
1833 const char *last_start = strend - trie->minlen;
6148ee25 1834#ifdef DEBUGGING
73104a1b 1835 const char *real_start = s;
6148ee25 1836#endif
73104a1b
KW
1837 STRLEN maxlen = trie->maxlen;
1838 SV *sv_points;
1839 U8 **points; /* map of where we were in the input string
1840 when reading a given char. For ASCII this
1841 is unnecessary overhead as the relationship
1842 is always 1:1, but for Unicode, especially
1843 case folded Unicode this is not true. */
1844 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1845 U8 *bitmap=NULL;
1846
1847
1848 GET_RE_DEBUG_FLAGS_DECL;
1849
1850 /* We can't just allocate points here. We need to wrap it in
1851 * an SV so it gets freed properly if there is a croak while
1852 * running the match */
1853 ENTER;
1854 SAVETMPS;
1855 sv_points=newSV(maxlen * sizeof(U8 *));
1856 SvCUR_set(sv_points,
1857 maxlen * sizeof(U8 *));
1858 SvPOK_on(sv_points);
1859 sv_2mortal(sv_points);
1860 points=(U8**)SvPV_nolen(sv_points );
1861 if ( trie_type != trie_utf8_fold
1862 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1863 {
1864 if (trie->bitmap)
1865 bitmap=(U8*)trie->bitmap;
1866 else
1867 bitmap=(U8*)ANYOF_BITMAP(c);
1868 }
1869 /* this is the Aho-Corasick algorithm modified a touch
1870 to include special handling for long "unknown char" sequences.
1871 The basic idea being that we use AC as long as we are dealing
1872 with a possible matching char, when we encounter an unknown char
1873 (and we have not encountered an accepting state) we scan forward
1874 until we find a legal starting char.
1875 AC matching is basically that of trie matching, except that when
1876 we encounter a failing transition, we fall back to the current
1877 states "fail state", and try the current char again, a process
1878 we repeat until we reach the root state, state 1, or a legal
1879 transition. If we fail on the root state then we can either
1880 terminate if we have reached an accepting state previously, or
1881 restart the entire process from the beginning if we have not.
1882
1883 */
1884 while (s <= last_start) {
1885 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1886 U8 *uc = (U8*)s;
1887 U16 charid = 0;
1888 U32 base = 1;
1889 U32 state = 1;
1890 UV uvc = 0;
1891 STRLEN len = 0;
1892 STRLEN foldlen = 0;
1893 U8 *uscan = (U8*)NULL;
1894 U8 *leftmost = NULL;
1895#ifdef DEBUGGING
1896 U32 accepted_word= 0;
786e8c11 1897#endif
73104a1b
KW
1898 U32 pointpos = 0;
1899
1900 while ( state && uc <= (U8*)strend ) {
1901 int failed=0;
1902 U32 word = aho->states[ state ].wordnum;
1903
1904 if( state==1 ) {
1905 if ( bitmap ) {
1906 DEBUG_TRIE_EXECUTE_r(
1907 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1908 dump_exec_pos( (char *)uc, c, strend, real_start,
1909 (char *)uc, utf8_target );
1910 PerlIO_printf( Perl_debug_log,
1911 " Scanning for legal start char...\n");
1912 }
1913 );
1914 if (utf8_target) {
1915 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1916 uc += UTF8SKIP(uc);
1917 }
1918 } else {
1919 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1920 uc++;
1921 }
786e8c11 1922 }
73104a1b 1923 s= (char *)uc;
07be1b83 1924 }
73104a1b
KW
1925 if (uc >(U8*)last_start) break;
1926 }
1927
1928 if ( word ) {
1929 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1930 if (!leftmost || lpos < leftmost) {
1931 DEBUG_r(accepted_word=word);
1932 leftmost= lpos;
7016d6eb 1933 }
73104a1b 1934 if (base==0) break;
7016d6eb 1935
73104a1b
KW
1936 }
1937 points[pointpos++ % maxlen]= uc;
1938 if (foldlen || uc < (U8*)strend) {
1939 REXEC_TRIE_READ_CHAR(trie_type, trie,
1940 widecharmap, uc,
1941 uscan, len, uvc, charid, foldlen,
1942 foldbuf, uniflags);
1943 DEBUG_TRIE_EXECUTE_r({
1944 dump_exec_pos( (char *)uc, c, strend,
1945 real_start, s, utf8_target);
1946 PerlIO_printf(Perl_debug_log,
1947 " Charid:%3u CP:%4"UVxf" ",
1948 charid, uvc);
1949 });
1950 }
1951 else {
1952 len = 0;
1953 charid = 0;
1954 }
07be1b83 1955
73104a1b
KW
1956
1957 do {
6148ee25 1958#ifdef DEBUGGING
73104a1b 1959 word = aho->states[ state ].wordnum;
6148ee25 1960#endif
73104a1b
KW
1961 base = aho->states[ state ].trans.base;
1962
1963 DEBUG_TRIE_EXECUTE_r({
1964 if (failed)
1965 dump_exec_pos( (char *)uc, c, strend, real_start,
1966 s, utf8_target );
1967 PerlIO_printf( Perl_debug_log,
1968 "%sState: %4"UVxf", word=%"UVxf,
1969 failed ? " Fail transition to " : "",
1970 (UV)state, (UV)word);
1971 });
1972 if ( base ) {
1973 U32 tmp;
1974 I32 offset;
1975 if (charid &&
1976 ( ((offset = base + charid
1977 - 1 - trie->uniquecharcount)) >= 0)
1978 && ((U32)offset < trie->lasttrans)
1979 && trie->trans[offset].check == state
1980 && (tmp=trie->trans[offset].next))
1981 {
1982 DEBUG_TRIE_EXECUTE_r(
1983 PerlIO_printf( Perl_debug_log," - legal\n"));
1984 state = tmp;
1985 break;
07be1b83
YO
1986 }
1987 else {
786e8c11 1988 DEBUG_TRIE_EXECUTE_r(
73104a1b 1989 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1990 failed = 1;
73104a1b 1991 state = aho->fail[state];
07be1b83 1992 }
07be1b83 1993 }
73104a1b
KW
1994 else {
1995 /* we must be accepting here */
1996 DEBUG_TRIE_EXECUTE_r(
1997 PerlIO_printf( Perl_debug_log," - accepting\n"));
1998 failed = 1;
1999 break;
786e8c11 2000 }
73104a1b
KW
2001 } while(state);
2002 uc += len;
2003 if (failed) {
2004 if (leftmost)
2005 break;
2006 if (!state) state = 1;
07be1b83 2007 }
73104a1b
KW
2008 }
2009 if ( aho->states[ state ].wordnum ) {
2010 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2011 if (!leftmost || lpos < leftmost) {
2012 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2013 leftmost = lpos;
07be1b83
YO
2014 }
2015 }
73104a1b
KW
2016 if (leftmost) {
2017 s = (char*)leftmost;
2018 DEBUG_TRIE_EXECUTE_r({
2019 PerlIO_printf(
2020 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2021 (UV)accepted_word, (IV)(s - real_start)
2022 );
2023 });
02d5137b 2024 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2025 FREETMPS;
2026 LEAVE;
2027 goto got_it;
2028 }
2029 s = HOPc(s,1);
2030 DEBUG_TRIE_EXECUTE_r({
2031 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2032 });
2033 } else {
2034 DEBUG_TRIE_EXECUTE_r(
2035 PerlIO_printf( Perl_debug_log,"No match.\n"));
2036 break;
2037 }
2038 }
2039 FREETMPS;
2040 LEAVE;
2041 }
2042 break;
2043 default:
2044 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2045 break;
2046 }
2047 return 0;
2048 got_it:
2049 return s;
6eb5f6b9
JH
2050}
2051
fae667d5 2052
6eb5f6b9
JH
2053/*
2054 - regexec_flags - match a regexp against a string
2055 */
2056I32
5aaab254 2057Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2058 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2059/* stringarg: the point in the string at which to begin matching */
2060/* strend: pointer to null at end of string */
2061/* strbeg: real beginning of string */
2062/* minend: end of match must be >= minend bytes after stringarg. */
2063/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2064 * itself is accessed via the pointers above */
2065/* data: May be used for some additional optimizations.
2066 Currently its only used, with a U32 cast, for transmitting
2067 the ganch offset when doing a /g match. This will change */
2068/* nosave: For optimizations. */
2069
6eb5f6b9 2070{
97aff369 2071 dVAR;
8d919b0a 2072 struct regexp *const prog = ReANY(rx);
5aaab254 2073 char *s;
eb578fdb 2074 regnode *c;
5aaab254 2075 char *startpos = stringarg;
6eb5f6b9
JH
2076 I32 minlen; /* must match at least this many chars */
2077 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2078 I32 end_shift = 0; /* Same for the end. */ /* CC */
2079 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2080 char *scream_olds = NULL;
f2ed9b32 2081 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2082 I32 multiline;
f8fc2ecf 2083 RXi_GET_DECL(prog,progi);
02d5137b
DM
2084 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2085 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2086 regexp_paren_pair *swap = NULL;
006f26b2 2087 I32 oldsave;
a3621e74
YO
2088 GET_RE_DEBUG_FLAGS_DECL;
2089
7918f24d 2090 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2091 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2092
2093 /* Be paranoid... */
2094 if (prog == NULL || startpos == NULL) {
2095 Perl_croak(aTHX_ "NULL regexp parameter");
2096 return 0;
2097 }
2098
bbe252da 2099 multiline = prog->extflags & RXf_PMf_MULTILINE;
8adc0f72 2100
02d5137b
DM
2101 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2102 reginfo->intuit = 0;
ba44c216 2103 reginfo->is_utf8_target = cBOOL(utf8_target);
2757e526 2104
006f26b2
DM
2105 /* on first ever match, allocate first slab */
2106 if (!PL_regmatch_slab) {
2107 Newx(PL_regmatch_slab, 1, regmatch_slab);
2108 PL_regmatch_slab->prev = NULL;
2109 PL_regmatch_slab->next = NULL;
2110 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2111 }
2112
2113 /* note current PL_regmatch_state position; at end of match we'll
2114 * pop back to there and free any higher slabs */
2115 oldsave = PL_savestack_ix;
006f26b2 2116
1de06328 2117 DEBUG_EXECUTE_r(
f2ed9b32 2118 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2119 "Matching");
2120 );
bac06658 2121
6eb5f6b9 2122 minlen = prog->minlen;
1de06328
YO
2123
2124 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2125 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2126 "String too short [regexec_flags]...\n"));
2127 goto phooey;
1aa99e6b 2128 }
6eb5f6b9 2129
1de06328 2130
6eb5f6b9 2131 /* Check validity of program. */
f8fc2ecf 2132 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2133 Perl_croak(aTHX_ "corrupted regexp program");
2134 }
2135
272d35c9 2136 RX_MATCH_TAINTED_off(rx);
6eb5f6b9 2137
02d5137b
DM
2138 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2139 reginfo->warned = FALSE;
9d9163fb 2140 reginfo->strbeg = strbeg;
02d5137b 2141 reginfo->sv = sv;
1cb48e53 2142 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 2143
220db18a
DM
2144 /* Mark end of string for $ (and such) */
2145 reginfo->strend = strend;
6eb5f6b9
JH
2146
2147 /* see how far we have to get to not match where we matched before */
02d5137b 2148 reginfo->till = startpos+minend;
6eb5f6b9 2149
bf2039a9
DM
2150 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2151 SAVEVPTR(PL_regmatch_slab);
2152 SAVEVPTR(PL_regmatch_state);
2153
2154 /* grab next slot in regmatch_state stack to store regmatch_info_aux
2155 * struct */
2156
2157 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2158 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2159 PL_regmatch_state = S_push_slab(aTHX);
2160
2161 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2162
2163
2164 if ((prog->extflags & RXf_EVAL_SEEN)) {
2165 /* grab next slot in regmatch_state stack to store
2166 * regmatch_info_aux_eval struct */
2167
2168 reginfo->info_aux_eval =
2169 reginfo->info_aux->info_aux_eval =
2170 &(PL_regmatch_state->u.info_aux_eval);
2171
2172 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2173 PL_regmatch_state = S_push_slab(aTHX);
2174
d3aa529c 2175 S_setup_eval_state(aTHX_ reginfo);
bf2039a9
DM
2176 }
2177 else
2178 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
d3aa529c
DM
2179
2180
6eb5f6b9
JH
2181 /* If there is a "must appear" string, look for it. */
2182 s = startpos;
2183
bbe252da 2184 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2185 MAGIC *mg;
2c296965 2186 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
02d5137b 2187 reginfo->ganch = startpos + prog->gofs;
2c296965 2188 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
02d5137b 2189 "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2190 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2191 && SvMAGIC(sv)
14befaf4
DM
2192 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2193 && mg->mg_len >= 0) {
02d5137b 2194 reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2195 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
02d5137b 2196 "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2197
bbe252da 2198 if (prog->extflags & RXf_ANCH_GPOS) {
02d5137b 2199 if (s > reginfo->ganch)
6eb5f6b9 2200 goto phooey;
02d5137b 2201 s = reginfo->ganch - prog->gofs;
2c296965 2202 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2203 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2204 if (s < strbeg)
2205 goto phooey;
6eb5f6b9
JH
2206 }
2207 }
58e23c8d 2208 else if (data) {
02d5137b 2209 reginfo->ganch = strbeg + PTR2UV(data);
2c296965 2210 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
02d5137b 2211 "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2c296965
YO
2212
2213 } else { /* pos() not defined */
02d5137b 2214 reginfo->ganch = strbeg;
2c296965 2215 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
02d5137b 2216 "GPOS: reginfo->ganch = strbeg\n"));
2c296965 2217 }
6eb5f6b9 2218 }
288b8c02 2219 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2220 /* We have to be careful. If the previous successful match
2221 was from this regex we don't want a subsequent partially
2222 successful match to clobber the old results.
2223 So when we detect this possibility we add a swap buffer
d8da0584
KW
2224 to the re, and switch the buffer each match. If we fail,
2225 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2226 */
2227 swap = prog->offs;
2228 /* do we need a save destructor here for eval dies? */
2229 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2230 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2231 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2232 PTR2UV(prog),
2233 PTR2UV(swap),
2234 PTR2UV(prog->offs)
2235 ));
c74340f9 2236 }
a0714e2c 2237 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2238 re_scream_pos_data d;
2239
2240 d.scream_olds = &scream_olds;
2241 d.scream_pos = &scream_pos;
52a21eb3 2242 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
3fa9c3d7 2243 if (!s) {
a3621e74 2244 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2245 goto phooey; /* not present */
3fa9c3d7 2246 }
6eb5f6b9
JH
2247 }
2248
1de06328 2249
6eb5f6b9
JH
2250
2251 /* Simplest case: anchored match need be tried only once. */
2252 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2253 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
02d5137b 2254 if (s == startpos && regtry(reginfo, &startpos))
6eb5f6b9 2255 goto got_it;
bbe252da
YO
2256 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2257 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2258 {
2259 char *end;
2260
2261 if (minlen)
2262 dontbother = minlen - 1;
1aa99e6b 2263 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2264 /* for multiline we only have to try after newlines */
33b8afdf 2265 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2266 /* because of the goto we can not easily reuse the macros for bifurcating the
2267 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2268 if (utf8_target) {
2269 if (s == startpos)
2270 goto after_try_utf8;
2271 while (1) {
02d5137b 2272 if (regtry(reginfo, &s)) {
92f3d482
YO
2273 goto got_it;
2274 }
2275 after_try_utf8:
2276 if (s > end) {
2277 goto phooey;
2278 }
2279 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2280 s = re_intuit_start(rx, sv, strbeg,
2281 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2282 if (!s) {
2283 goto phooey;
2284 }
2285 }
2286 else {
2287 s += UTF8SKIP(s);
2288 }
2289 }
2290 } /* end search for check string in unicode */
2291 else {
2292 if (s == startpos) {
2293 goto after_try_latin;
2294 }
2295 while (1) {
02d5137b 2296 if (regtry(reginfo, &s)) {
92f3d482
YO
2297 goto got_it;
2298 }
2299 after_try_latin:
2300 if (s > end) {
2301 goto phooey;
2302 }
2303 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2304 s = re_intuit_start(rx, sv, strbeg,
2305 s + 1, strend, flags, NULL);
92f3d482
YO
2306 if (!s) {
2307 goto phooey;
2308 }
2309 }
2310 else {
2311 s++;
2312 }
2313 }
2314 } /* end search for check string in latin*/
2315 } /* end search for check string */
2316 else { /* search for newline */
2317 if (s > startpos) {
2318 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2319 s--;
92f3d482 2320 }
21eede78
YO
2321 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2322 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2323 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2324 if (regtry(reginfo, &s))
6eb5f6b9
JH
2325 goto got_it;
2326 }
92f3d482
YO
2327 }
2328 } /* end search for newline */
2329 } /* end anchored/multiline check string search */
6eb5f6b9 2330 goto phooey;
bbe252da 2331 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2332 {
02d5137b 2333 /* the warning about reginfo->ganch being used without initialization
bbe252da 2334 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2335 and we only enter this block when the same bit is set. */
02d5137b 2336 char *tmp_s = reginfo->ganch - prog->gofs;
c584a96e 2337
02d5137b 2338 if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
6eb5f6b9
JH
2339 goto got_it;
2340 goto phooey;
2341 }
2342
2343 /* Messy cases: unanchored match. */
bbe252da 2344 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2345 /* we have /x+whatever/ */
984e6dd1 2346 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2347 char ch;
bf93d4cc
GS
2348#ifdef DEBUGGING
2349 int did_match = 0;
2350#endif
f2ed9b32 2351 if (utf8_target) {
7e0d5ad7
KW
2352 if (! prog->anchored_utf8) {
2353 to_utf8_substr(prog);
2354 }
2355 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2356 REXEC_FBC_SCAN(
6eb5f6b9 2357 if (*s == ch) {
a3621e74 2358 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2359 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2360 s += UTF8SKIP(s);
2361 while (s < strend && *s == ch)
2362 s += UTF8SKIP(s);
2363 }
4cadc6a9 2364 );
7e0d5ad7 2365
6eb5f6b9
JH
2366 }
2367 else {
7e0d5ad7
KW
2368 if (! prog->anchored_substr) {
2369 if (! to_byte_substr(prog)) {
6b54ddc5 2370 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2371 }
2372 }
2373 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2374 REXEC_FBC_SCAN(
6eb5f6b9 2375 if (*s == ch) {
a3621e74 2376 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2377 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2378 s++;
2379 while (s < strend && *s == ch)
2380 s++;
2381 }
4cadc6a9 2382 );
6eb5f6b9 2383 }
a3621e74 2384 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2385 PerlIO_printf(Perl_debug_log,
b7953727
JH
2386 "Did not find anchored character...\n")
2387 );
6eb5f6b9 2388 }
a0714e2c
SS
2389 else if (prog->anchored_substr != NULL
2390 || prog->anchored_utf8 != NULL
2391 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2392 && prog->float_max_offset < strend - s)) {
2393 SV *must;
2394 I32 back_max;
2395 I32 back_min;
2396 char *last;
6eb5f6b9 2397 char *last1; /* Last position checked before */
bf93d4cc
GS
2398#ifdef DEBUGGING
2399 int did_match = 0;
2400#endif
33b8afdf 2401 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2402 if (utf8_target) {
2403 if (! prog->anchored_utf8) {
2404 to_utf8_substr(prog);
2405 }
2406 must = prog->anchored_utf8;
2407 }
2408 else {
2409 if (! prog->anchored_substr) {
2410 if (! to_byte_substr(prog)) {
6b54ddc5 2411 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2412 }
2413 }
2414 must = prog->anchored_substr;
2415 }
33b8afdf
JH
2416 back_max = back_min = prog->anchored_offset;
2417 } else {
7e0d5ad7
KW
2418 if (utf8_target) {
2419 if (! prog->float_utf8) {
2420 to_utf8_substr(prog);
2421 }
2422 must = prog->float_utf8;
2423 }
2424 else {
2425 if (! prog->float_substr) {
2426 if (! to_byte_substr(prog)) {
6b54ddc5 2427 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2428 }
2429 }
2430 must = prog->float_substr;
2431 }
33b8afdf
JH
2432 back_max = prog->float_max_offset;
2433 back_min = prog->float_min_offset;
2434 }
1de06328 2435
1de06328
YO
2436 if (back_min<0) {
2437 last = strend;
2438 } else {
2439 last = HOP3c(strend, /* Cannot start after this */
2440 -(I32)(CHR_SVLEN(must)
2441 - (SvTAIL(must) != 0) + back_min), strbeg);
2442 }
9d9163fb 2443 if (s > reginfo->strbeg)
6eb5f6b9
JH
2444 last1 = HOPc(s, -1);
2445 else
2446 last1 = s - 1; /* bogus */
2447
a0288114 2448 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2449 check_substr==must. */
2450 scream_pos = -1;
2451 dontbother = end_shift;
2452 strend = HOPc(strend, -dontbother);
2453 while ( (s <= last) &&
c33e64f0 2454 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2455 (unsigned char*)strend, must,
c33e64f0 2456 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2457 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2458 if (HOPc(s, -back_max) > last1) {
2459 last1 = HOPc(s, -back_min);
2460 s = HOPc(s, -back_max);
2461 }
2462 else {
9d9163fb
DM
2463 char * const t = (last1 >= reginfo->strbeg)
2464 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2465
2466 last1 = HOPc(s, -back_min);
52657f30 2467 s = t;
6eb5f6b9 2468 }
f2ed9b32 2469 if (utf8_target) {
6eb5f6b9 2470 while (s <= last1) {
02d5137b 2471 if (regtry(reginfo, &s))
6eb5f6b9 2472 goto got_it;
7016d6eb
DM
2473 if (s >= last1) {
2474 s++; /* to break out of outer loop */
2475 break;
2476 }
2477 s += UTF8SKIP(s);
6eb5f6b9
JH
2478 }
2479 }
2480 else {
2481 while (s <= last1) {
02d5137b 2482 if (regtry(reginfo, &s))
6eb5f6b9
JH
2483 goto got_it;
2484 s++;
2485 }
2486 }
2487 }
ab3bbdeb 2488 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2489 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2490 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2491 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2492 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2493 ? "anchored" : "floating"),
ab3bbdeb
YO
2494 quoted, RE_SV_TAIL(must));
2495 });
6eb5f6b9
JH
2496 goto phooey;
2497 }
f8fc2ecf 2498 else if ( (c = progi->regstclass) ) {
f14c76ed 2499 if (minlen) {
f8fc2ecf 2500 const OPCODE op = OP(progi->regstclass);
66e933ab 2501 /* don't bother with what can't match */
786e8c11 2502 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2503 strend = HOPc(strend, -(minlen - 1));
2504 }
a3621e74 2505 DEBUG_EXECUTE_r({
be8e71aa 2506 SV * const prop = sv_newmortal();
32fc9b6a 2507 regprop(prog, prop, c);
0df25f3d 2508 {
f2ed9b32 2509 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2510 s,strend-s,60);
0df25f3d 2511 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2512 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2513 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2514 quoted, (int)(strend - s));
0df25f3d 2515 }
ffc61ed2 2516 });
f9176b44 2517 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2518 goto got_it;
07be1b83 2519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2520 }
2521 else {
2522 dontbother = 0;
a0714e2c 2523 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2524 /* Trim the end. */
6af40bd7 2525 char *last= NULL;
33b8afdf 2526 SV* float_real;
c33e64f0
FC
2527 STRLEN len;
2528 const char *little;
33b8afdf 2529
7e0d5ad7
KW
2530 if (utf8_target) {
2531 if (! prog->float_utf8) {
2532 to_utf8_substr(prog);
2533 }
2534 float_real = prog->float_utf8;
2535 }
2536 else {
2537 if (! prog->float_substr) {
2538 if (! to_byte_substr(prog)) {
6b54ddc5 2539 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2540 }
2541 }
2542 float_real = prog->float_substr;
2543 }
d6a28714 2544
c33e64f0
FC
2545 little = SvPV_const(float_real, len);
2546 if (SvTAIL(float_real)) {
7f18ad16
KW
2547 /* This means that float_real contains an artificial \n on
2548 * the end due to the presence of something like this:
2549 * /foo$/ where we can match both "foo" and "foo\n" at the
2550 * end of the string. So we have to compare the end of the
2551 * string first against the float_real without the \n and
2552 * then against the full float_real with the string. We
2553 * have to watch out for cases where the string might be
2554 * smaller than the float_real or the float_real without
2555 * the \n. */
1a13b075
YO
2556 char *checkpos= strend - len;
2557 DEBUG_OPTIMISE_r(
2558 PerlIO_printf(Perl_debug_log,
2559 "%sChecking for float_real.%s\n",
2560 PL_colors[4], PL_colors[5]));
2561 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2562 /* can't match, even if we remove the trailing \n
2563 * string is too short to match */
1a13b075
YO
2564 DEBUG_EXECUTE_r(
2565 PerlIO_printf(Perl_debug_log,
2566 "%sString shorter than required trailing substring, cannot match.%s\n",
2567 PL_colors[4], PL_colors[5]));
2568 goto phooey;
2569 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2570 /* can match, the end of the string matches without the
2571 * "\n" */
1a13b075
YO
2572 last = checkpos + 1;
2573 } else if (checkpos < strbeg) {
7f18ad16
KW
2574 /* cant match, string is too short when the "\n" is
2575 * included */
1a13b075
YO
2576 DEBUG_EXECUTE_r(
2577 PerlIO_printf(Perl_debug_log,
2578 "%sString does not contain required trailing substring, cannot match.%s\n",
2579 PL_colors[4], PL_colors[5]));
2580 goto phooey;
2581 } else if (!multiline) {
7f18ad16
KW
2582 /* non multiline match, so compare with the "\n" at the
2583 * end of the string */
1a13b075
YO
2584 if (memEQ(checkpos, little, len)) {
2585 last= checkpos;
2586 } else {
2587 DEBUG_EXECUTE_r(
2588 PerlIO_printf(Perl_debug_log,
2589 "%sString does not contain required trailing substring, cannot match.%s\n",
2590 PL_colors[4], PL_colors[5]));
2591 goto phooey;
2592 }
2593 } else {
7f18ad16
KW
2594 /* multiline match, so we have to search for a place
2595 * where the full string is located */
d6a28714 2596 goto find_last;
1a13b075 2597 }
c33e64f0 2598 } else {
d6a28714 2599 find_last:
9041c2e3 2600 if (len)
d6a28714 2601 last = rninstr(s, strend, little, little + len);
b8c5462f 2602 else
a0288114 2603 last = strend; /* matching "$" */
b8c5462f 2604 }
6af40bd7 2605 if (!last) {
7f18ad16
KW
2606 /* at one point this block contained a comment which was
2607 * probably incorrect, which said that this was a "should not
2608 * happen" case. Even if it was true when it was written I am
2609 * pretty sure it is not anymore, so I have removed the comment
2610 * and replaced it with this one. Yves */
6bda09f9
YO
2611 DEBUG_EXECUTE_r(
2612 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2613 "String does not contain required substring, cannot match.\n"
2614 ));
2615 goto phooey;
bf93d4cc 2616 }
d6a28714
JH
2617 dontbother = strend - last + prog->float_min_offset;
2618 }
2619 if (minlen && (dontbother < minlen))
2620 dontbother = minlen - 1;
2621 strend -= dontbother; /* this one's always in bytes! */
2622 /* We don't know much -- general case. */
f2ed9b32 2623 if (utf8_target) {
d6a28714 2624 for (;;) {
02d5137b 2625 if (regtry(reginfo, &s))
d6a28714
JH
2626 goto got_it;
2627 if (s >= strend)
2628 break;
b8c5462f 2629 s += UTF8SKIP(s);
d6a28714
JH
2630 };
2631 }
2632 else {
2633 do {
02d5137b 2634 if (regtry(reginfo, &s))
d6a28714
JH
2635 goto got_it;
2636 } while (s++ < strend);
2637 }
2638 }
2639
2640 /* Failure. */
2641 goto phooey;
2642
2643got_it:
495f47a5
DM
2644 DEBUG_BUFFERS_r(
2645 if (swap)
2646 PerlIO_printf(Perl_debug_log,
2647 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2648 PTR2UV(prog),
2649 PTR2UV(swap)
2650 );
2651 );
e9105d30 2652 Safefree(swap);
d6a28714 2653
bf2039a9
DM
2654 /* clean up; this will trigger destructors that will free all slabs
2655 * above the current one, and cleanup the regmatch_info_aux
2656 * and regmatch_info_aux_eval sructs */
8adc0f72 2657
006f26b2
DM
2658 LEAVE_SCOPE(oldsave);
2659
5daac39c
NC
2660 if (RXp_PAREN_NAMES(prog))
2661 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 2662
0254aed9
DM
2663 RX_MATCH_UTF8_set(rx, utf8_target);
2664
d6a28714
JH
2665 /* make sure $`, $&, $', and $digit will work later */
2666 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2667 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2668#ifdef PERL_ANY_COW
2669 if (SvCANCOW(sv)) {
ed252734
NC
2670 if (DEBUG_C_TEST) {
2671 PerlIO_printf(Perl_debug_log,
2672 "Copy on write: regexp capture, type %d\n",
2673 (int) SvTYPE(sv));
2674 }
77f8f7c1 2675 RX_MATCH_COPY_FREE(rx);
ed252734 2676 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2677 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2678 assert (SvPOKp(prog->saved_copy));
220db18a 2679 prog->sublen = reginfo->strend - strbeg;
6502e081
DM
2680 prog->suboffset = 0;
2681 prog->subcoffset = 0;
ed252734
NC
2682 } else
2683#endif
2684 {
6502e081 2685 I32 min = 0;
220db18a 2686 I32 max = reginfo->strend - strbeg;
6502e081
DM
2687 I32 sublen;
2688
2689 if ( (flags & REXEC_COPY_SKIP_POST)
2690 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2692 ) { /* don't copy $' part of string */
3de645a8 2693 U32 n = 0;
6502e081
DM
2694 max = -1;
2695 /* calculate the right-most part of the string covered
2696 * by a capture. Due to look-ahead, this may be to
2697 * the right of $&, so we have to scan all captures */
2698 while (n <= prog->lastparen) {
2699 if (prog->offs[n].end > max)
2700 max = prog->offs[n].end;
2701 n++;
2702 }
2703 if (max == -1)
2704 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2705 ? prog->offs[0].start
2706 : 0;
220db18a 2707 assert(max >= 0 && max <= reginfo->strend - strbeg);
6502e081
DM
2708 }
2709
2710 if ( (flags & REXEC_COPY_SKIP_PRE)
2711 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2712 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2713 ) { /* don't copy $` part of string */
3de645a8 2714 U32 n = 0;
6502e081
DM
2715 min = max;
2716 /* calculate the left-most part of the string covered
2717 * by a capture. Due to look-behind, this may be to
2718 * the left of $&, so we have to scan all captures */
2719 while (min && n <= prog->lastparen) {
2720 if ( prog->offs[n].start != -1
2721 && prog->offs[n].start < min)
2722 {
2723 min = prog->offs[n].start;
2724 }
2725 n++;
2726 }
2727 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2728 && min > prog->offs[0].end
2729 )
2730 min = prog->offs[0].end;
2731
2732 }
2733
220db18a
DM
2734 assert(min >= 0 && min <= max
2735 && min <= reginfo->strend - strbeg);
6502e081
DM
2736 sublen = max - min;
2737
2738 if (RX_MATCH_COPIED(rx)) {
2739 if (sublen > prog->sublen)
2740 prog->subbeg =
2741 (char*)saferealloc(prog->subbeg, sublen+1);
2742 }
2743 else
2744 prog->subbeg = (char*)safemalloc(sublen+1);
2745 Copy(strbeg + min, prog->subbeg, sublen, char);
2746 prog->subbeg[sublen] = '\0';
2747 prog->suboffset = min;
2748 prog->sublen = sublen;
77f8f7c1 2749 RX_MATCH_COPIED_on(rx);
6502e081 2750 }
6502e081
DM
2751 prog->subcoffset = prog->suboffset;
2752 if (prog->suboffset && utf8_target) {
2753 /* Convert byte offset to chars.
2754 * XXX ideally should only compute this if @-/@+
2755 * has been seen, a la PL_sawampersand ??? */
2756
2757 /* If there's a direct correspondence between the
2758 * string which we're matching and the original SV,
2759 * then we can use the utf8 len cache associated with
2760 * the SV. In particular, it means that under //g,
2761 * sv_pos_b2u() will use the previously cached
2762 * position to speed up working out the new length of
2763 * subcoffset, rather than counting from the start of
2764 * the string each time. This stops
2765 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2766 * from going quadratic */
2767 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2768 sv_pos_b2u(sv, &(prog->subcoffset));
2769 else
2770 prog->subcoffset = utf8_length((U8*)strbeg,
2771 (U8*)(strbeg+prog->suboffset));
2772 }
d6a28714
JH
2773 }
2774 else {
6502e081 2775 RX_MATCH_COPY_FREE(rx);
d6a28714 2776 prog->subbeg = strbeg;
6502e081
DM
2777 prog->suboffset = 0;
2778 prog->subcoffset = 0;
220db18a
DM
2779 /* use reginfo->strend, as strend may have been modified */
2780 prog->sublen = reginfo->strend - strbeg;
d6a28714
JH
2781 }
2782 }
9041c2e3 2783
d6a28714
JH
2784 return 1;
2785
2786phooey:
a3621e74 2787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2788 PL_colors[4], PL_colors[5]));
8adc0f72 2789
bf2039a9
DM
2790 /* clean up; this will trigger destructors that will free all slabs
2791 * above the current one, and cleanup the regmatch_info_aux
2792 * and regmatch_info_aux_eval sructs */
8adc0f72 2793
006f26b2
DM
2794 LEAVE_SCOPE(oldsave);
2795
e9105d30 2796 if (swap) {
c74340f9 2797 /* we failed :-( roll it back */
495f47a5
DM
2798 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2799 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2800 PTR2UV(prog),
2801 PTR2UV(prog->offs),
2802 PTR2UV(swap)
2803 ));
e9105d30
GG
2804 Safefree(prog->offs);
2805 prog->offs = swap;
2806 }
d6a28714
JH
2807 return 0;
2808}
2809
6bda09f9 2810
ec43f78b
DM
2811/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2812 * Do inc before dec, in case old and new rex are the same */
2813#define SET_reg_curpm(Re2) \
bf2039a9 2814 if (reginfo->info_aux_eval) { \
ec43f78b
DM
2815 (void)ReREFCNT_inc(Re2); \
2816 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2817 PM_SETRE((PL_reg_curpm), (Re2)); \
2818 }
2819
2820
d6a28714
JH
2821/*
2822 - regtry - try match at specific point
2823 */
2824STATIC I32 /* 0 failure, 1 success */
f73aaa43 2825S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2826{
97aff369 2827 dVAR;
d6a28714 2828 CHECKPOINT lastcp;
288b8c02 2829 REGEXP *const rx = reginfo->prog;
8d919b0a 2830 regexp *const prog = ReANY(rx);
f73aaa43 2831 I32 result;
f8fc2ecf 2832 RXi_GET_DECL(prog,progi);
a3621e74 2833 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2834
2835 PERL_ARGS_ASSERT_REGTRY;
2836
24b23f37 2837 reginfo->cutpoint=NULL;
d6a28714 2838
97ca13b7 2839#ifdef DEBUGGING
f73aaa43 2840 PL_reg_starttry = *startposp;
97ca13b7 2841#endif
9d9163fb 2842 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 2843 prog->lastparen = 0;
03994de8 2844 prog->lastcloseparen = 0;
d6a28714
JH
2845
2846 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2847 to do this again and again, prog->lastparen should take care of
3dd2943c 2848 this! --ilya*/
dafc8851
JH
2849
2850 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2851 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2852 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2853 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2854 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2855 * Meanwhile, this code *is* needed for the
daf18116
JH
2856 * above-mentioned test suite tests to succeed. The common theme
2857 * on those tests seems to be returning null fields from matches.
225593e1 2858 * --jhi updated by dapm */
dafc8851 2859#if 1
d6a28714 2860 if (prog->nparens) {
b93070ed 2861 regexp_paren_pair *pp = prog->offs;
eb578fdb 2862 I32 i;
b93070ed 2863 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2864 ++pp;
2865 pp->start = -1;
2866 pp->end = -1;
d6a28714
JH
2867 }
2868 }
dafc8851 2869#endif
02db2b7b 2870 REGCP_SET(lastcp);
f73aaa43
DM
2871 result = regmatch(reginfo, *startposp, progi->program + 1);
2872 if (result != -1) {
2873 prog->offs[0].end = result;
d6a28714
JH
2874 return 1;
2875 }
24b23f37 2876 if (reginfo->cutpoint)
f73aaa43 2877 *startposp= reginfo->cutpoint;
02db2b7b 2878 REGCP_UNWIND(lastcp);
d6a28714
JH
2879 return 0;
2880}
2881
02db2b7b 2882
8ba1375e
MJD
2883#define sayYES goto yes
2884#define sayNO goto no
262b90c4 2885#define sayNO_SILENT goto no_silent
8ba1375e 2886
f9f4320a
YO
2887/* we dont use STMT_START/END here because it leads to
2888 "unreachable code" warnings, which are bogus, but distracting. */
2889#define CACHEsayNO \
c476f425
DM
2890 if (ST.cache_mask) \
2891 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2892 sayNO
3298f257 2893
a3621e74 2894/* this is used to determine how far from the left messages like
265c4333
YO
2895 'failed...' are printed. It should be set such that messages
2896 are inline with the regop output that created them.
a3621e74 2897*/
265c4333 2898#define REPORT_CODE_OFF 32
a3621e74
YO
2899
2900
40a82448
DM
2901#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2902#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2903#define CHRTEST_NOT_A_CP_1 -999
2904#define CHRTEST_NOT_A_CP_2 -998
9e137952 2905
5d9a96ca
DM
2906/* grab a new slab and return the first slot in it */
2907
2908STATIC regmatch_state *
2909S_push_slab(pTHX)
2910{
a35a87e7 2911#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2912 dMY_CXT;
2913#endif
5d9a96ca
DM
2914 regmatch_slab *s = PL_regmatch_slab->next;
2915 if (!s) {
2916 Newx(s, 1, regmatch_slab);
2917 s->prev = PL_regmatch_slab;
2918 s->next = NULL;
2919 PL_regmatch_slab->next = s;
2920 }
2921 PL_regmatch_slab = s;
86545054 2922 return SLAB_FIRST(s);
5d9a96ca 2923}
5b47454d 2924
95b24440 2925
40a82448
DM
2926/* push a new state then goto it */
2927
4d5016e5
DM
2928#define PUSH_STATE_GOTO(state, node, input) \
2929 pushinput = input; \
40a82448
DM
2930 scan = node; \
2931 st->resume_state = state; \
2932 goto push_state;
2933
2934/* push a new state with success backtracking, then goto it */
2935
4d5016e5
DM
2936#define PUSH_YES_STATE_GOTO(state, node, input) \
2937 pushinput = input; \
40a82448
DM
2938 scan = node; \
2939 st->resume_state = state; \
2940 goto push_yes_state;
2941
aa283a38 2942
aa283a38 2943
4d5016e5 2944
d6a28714 2945/*
95b24440 2946
bf1f174e
DM
2947regmatch() - main matching routine
2948
2949This is basically one big switch statement in a loop. We execute an op,
2950set 'next' to point the next op, and continue. If we come to a point which
2951we may need to backtrack to on failure such as (A|B|C), we push a
2952backtrack state onto the backtrack stack. On failure, we pop the top
2953state, and re-enter the loop at the state indicated. If there are no more
2954states to pop, we return failure.
2955
2956Sometimes we also need to backtrack on success; for example /A+/, where
2957after successfully matching one A, we need to go back and try to
2958match another one; similarly for lookahead assertions: if the assertion
2959completes successfully, we backtrack to the state just before the assertion
2960and then carry on. In these cases, the pushed state is marked as
2961'backtrack on success too'. This marking is in fact done by a chain of
2962pointers, each pointing to the previous 'yes' state. On success, we pop to
2963the nearest yes state, discarding any intermediate failure-only states.
2964Sometimes a yes state is pushed just to force some cleanup code to be
2965called at the end of a successful match or submatch; e.g. (??{$re}) uses
2966it to free the inner regex.
2967
2968Note that failure backtracking rewinds the cursor position, while
2969success backtracking leaves it alone.
2970
2971A pattern is complete when the END op is executed, while a subpattern
2972such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2973ops trigger the "pop to last yes state if any, otherwise return true"
2974behaviour.
2975
2976A common convention in this function is to use A and B to refer to the two
2977subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2978the subpattern to be matched possibly multiple times, while B is the entire
2979rest of the pattern. Variable and state names reflect this convention.
2980
2981The states in the main switch are the union of ops and failure/success of
2982substates associated with with that op. For example, IFMATCH is the op
2983that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2984'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2985successfully matched A and IFMATCH_A_fail is a state saying that we have
2986just failed to match A. Resume states always come in pairs. The backtrack
2987state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2988at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2989on success or failure.
2990
2991The struct that holds a backtracking state is actually a big union, with
2992one variant for each major type of op. The variable st points to the
2993top-most backtrack struct. To make the code clearer, within each
2994block of code we #define ST to alias the relevant union.
2995
2996Here's a concrete example of a (vastly oversimplified) IFMATCH
2997implementation:
2998
2999 switch (state) {
3000 ....
3001
3002#define ST st->u.ifmatch
3003
3004 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3005 ST.foo = ...; // some state we wish to save
95b24440 3006 ...
bf1f174e
DM
3007 // push a yes backtrack state with a resume value of
3008 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3009 // first node of A:
4d5016e5 3010 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3011 // NOTREACHED
3012
3013 case IFMATCH_A: // we have successfully executed A; now continue with B
3014 next = B;
3015 bar = ST.foo; // do something with the preserved value
3016 break;
3017
3018 case IFMATCH_A_fail: // A failed, so the assertion failed
3019 ...; // do some housekeeping, then ...
3020 sayNO; // propagate the failure
3021
3022#undef ST
95b24440 3023
bf1f174e
DM
3024 ...
3025 }
95b24440 3026
bf1f174e
DM
3027For any old-timers reading this who are familiar with the old recursive
3028approach, the code above is equivalent to:
95b24440 3029
bf1f174e
DM
3030 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3031 {
3032 int foo = ...
95b24440 3033 ...
bf1f174e
DM
3034 if (regmatch(A)) {
3035 next = B;
3036 bar = foo;
3037 break;
95b24440 3038 }
bf1f174e
DM
3039 ...; // do some housekeeping, then ...
3040 sayNO; // propagate the failure
95b24440 3041 }
bf1f174e
DM
3042
3043The topmost backtrack state, pointed to by st, is usually free. If you
3044want to claim it, populate any ST.foo fields in it with values you wish to
3045save, then do one of
3046
4d5016e5
DM
3047 PUSH_STATE_GOTO(resume_state, node, newinput);
3048 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3049
3050which sets that backtrack state's resume value to 'resume_state', pushes a
3051new free entry to the top of the backtrack stack, then goes to 'node'.
3052On backtracking, the free slot is popped, and the saved state becomes the
3053new free state. An ST.foo field in this new top state can be temporarily
3054accessed to retrieve values, but once the main loop is re-entered, it
3055becomes available for reuse.
3056
3057Note that the depth of the backtrack stack constantly increases during the
3058left-to-right execution of the pattern, rather than going up and down with
3059the pattern nesting. For example the stack is at its maximum at Z at the
3060end of the pattern, rather than at X in the following:
3061
3062 /(((X)+)+)+....(Y)+....Z/
3063
3064The only exceptions to this are lookahead/behind assertions and the cut,
3065(?>A), which pop all the backtrack states associated with A before
3066continuing.
3067
486ec47a 3068Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3069PL_regmatch_state and st always point to the currently active state,
3070and PL_regmatch_slab points to the slab currently containing
3071PL_regmatch_state. The first time regmatch() is called, the first slab is
3072allocated, and is never freed until interpreter destruction. When the slab
3073is full, a new one is allocated and chained to the end. At exit from
3074regmatch(), slabs allocated since entry are freed.
3075
3076*/
95b24440 3077
40a82448 3078
5bc10b2c 3079#define DEBUG_STATE_pp(pp) \
265c4333 3080 DEBUG_STATE_r({ \
f2ed9b32 3081 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3082 PerlIO_printf(Perl_debug_log, \
5d458dd8 3083 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3084 depth*2, "", \
13d6edb4 3085 PL_reg_name[st->resume_state], \
5d458dd8
YO
3086 ((st==yes_state||st==mark_state) ? "[" : ""), \
3087 ((st==yes_state) ? "Y" : ""), \
3088 ((st==mark_state) ? "M" : ""), \
3089 ((st==yes_state||st==mark_state) ? "]" : "") \
3090 ); \
265c4333 3091 });
5bc10b2c 3092
40a82448 3093
3dab1dad 3094#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3095
3df15adc 3096#ifdef DEBUGGING
5bc10b2c 3097
ab3bbdeb 3098STATIC void
f2ed9b32 3099S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3100 const char *start, const char *end, const char *blurb)
3101{
efd26800 3102 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3103
3104 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3105
ab3bbdeb
YO
3106 if (!PL_colorset)
3107 reginitcolors();
3108 {
3109 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3110 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3111
f2ed9b32 3112 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3113 start, end - start, 60);
3114
3115 PerlIO_printf(Perl_debug_log,
3116 "%s%s REx%s %s against %s\n",
3117 PL_colors[4], blurb, PL_colors[5], s0, s1);
3118
f2ed9b32 3119 if (utf8_target||utf8_pat)
1de06328
YO
3120 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3121 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3122 utf8_pat && utf8_target ? " and " : "",
3123 utf8_target ? "string" : ""
ab3bbdeb
YO
3124 );
3125 }
3126}
3df15adc
YO
3127
3128STATIC void
786e8c11
YO
3129S_dump_exec_pos(pTHX_ const char *locinput,
3130 const regnode *scan,
3131 const char *loc_regeol,
3132 const char *loc_bostr,
3133 const char *loc_reg_starttry,
f2ed9b32 3134 const bool utf8_target)
07be1b83 3135{
786e8c11 3136 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3137 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3138 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3139 /* The part of the string before starttry has one color
3140 (pref0_len chars), between starttry and current
3141 position another one (pref_len - pref0_len chars),
3142 after the current position the third one.
3143 We assume that pref0_len <= pref_len, otherwise we
3144 decrease pref0_len. */
786e8c11
YO
3145 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3146 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3147 int pref0_len;
3148
7918f24d
NC
3149 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3150
f2ed9b32 3151 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3152 pref_len++;
786e8c11
YO
3153 pref0_len = pref_len - (locinput - loc_reg_starttry);
3154 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3155 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3156 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3157 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3158 l--;
3159 if (pref0_len < 0)
3160 pref0_len = 0;
3161 if (pref0_len > pref_len)
3162 pref0_len = pref_len;
3163 {
f2ed9b32 3164 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3165
ab3bbdeb 3166 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3167 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3168
ab3bbdeb 3169 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3170 (locinput - pref_len + pref0_len),
1de06328 3171 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3172
ab3bbdeb 3173 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3174 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3175
1de06328 3176 const STRLEN tlen=len0+len1+len2;
3df15adc 3177 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3178 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3179 (IV)(locinput - loc_bostr),
07be1b83 3180 len0, s0,
07be1b83 3181 len1, s1,
07be1b83 3182 (docolor ? "" : "> <"),
07be1b83 3183 len2, s2,
f9f4320a 3184 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3185 "");
3186 }
3187}
3df15adc 3188
07be1b83
YO
3189#endif
3190
0a4db386
YO
3191/* reg_check_named_buff_matched()
3192 * Checks to see if a named buffer has matched. The data array of
3193 * buffer numbers corresponding to the buffer is expected to reside
3194 * in the regexp->data->data array in the slot stored in the ARG() of
3195 * node involved. Note that this routine doesn't actually care about the
3196 * name, that information is not preserved from compilation to execution.
3197 * Returns the index of the leftmost defined buffer with the given name
3198 * or 0 if non of the buffers matched.
3199 */
3200STATIC I32
7918f24d
NC
3201S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3202{
0a4db386 3203 I32 n;
f8fc2ecf 3204 RXi_GET_DECL(rex,rexi);
ad64d0ec 3205 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3206 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3207
3208 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3209
0a4db386 3210 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3211 if ((I32)rex->lastparen >= nums[n] &&
3212 rex->offs[nums[n]].end != -1)
0a4db386
YO
3213 {
3214 return nums[n];
3215 }
3216 }
3217 return 0;
3218}
3219
2f554ef7
DM
3220
3221/* free all slabs above current one - called during LEAVE_SCOPE */
3222
3223STATIC void
3224S_clear_backtrack_stack(pTHX_ void *p)
3225{
3226 regmatch_slab *s = PL_regmatch_slab->next;
3227 PERL_UNUSED_ARG(p);
3228
3229 if (!s)
3230 return;
3231 PL_regmatch_slab->next = NULL;
3232 while (s) {
3233 regmatch_slab * const osl = s;
3234 s = s->next;
3235 Safefree(osl);
3236 }
3237}
006f26b2
DM
3238
3239
c74f6de9 3240static bool
984e6dd1 3241S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3242 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3243{
79a2a0e8
KW
3244 /* This function determines if there are one or two characters that match
3245 * the first character of the passed-in EXACTish node <text_node>, and if
3246 * so, returns them in the passed-in pointers.
c74f6de9 3247 *
79a2a0e8
KW
3248 * If it determines that no possible character in the target string can
3249 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3250 * the first character in <text_node> requires UTF-8 to represent, and the
3251 * target string isn't in UTF-8.)
c74f6de9 3252 *
79a2a0e8
KW
3253 * If there are more than two characters that could match the beginning of
3254 * <text_node>, or if more context is required to determine a match or not,
3255 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3256 *
3257 * The motiviation behind this function is to allow the caller to set up
3258 * tight loops for matching. If <text_node> is of type EXACT, there is
3259 * only one possible character that can match its first character, and so
3260 * the situation is quite simple. But things get much more complicated if
3261 * folding is involved. It may be that the first character of an EXACTFish
3262 * node doesn't participate in any possible fold, e.g., punctuation, so it
3263 * can be matched only by itself. The vast majority of characters that are
3264 * in folds match just two things, their lower and upper-case equivalents.
3265 * But not all are like that; some have multiple possible matches, or match
3266 * sequences of more than one character. This function sorts all that out.
3267 *
3268 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3269 * loop of trying to match A*, we know we can't exit where the thing
3270 * following it isn't a B. And something can't be a B unless it is the
3271 * beginning of B. By putting a quick test for that beginning in a tight
3272 * loop, we can rule out things that can't possibly be B without having to
3273 * break out of the loop, thus avoiding work. Similarly, if A is a single
3274 * character, we can make a tight loop matching A*, using the outputs of
3275 * this function.
3276 *
3277 * If the target string to match isn't in UTF-8, and there aren't
3278 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3279 * the one or two possible octets (which are characters in this situation)
3280 * that can match. In all cases, if there is only one character that can
3281 * match, *<c1p> and *<c2p> will be identical.
3282 *
3283 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3284 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3285 * can match the beginning of <text_node>. They should be declared with at
3286 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3287 * undefined what these contain.) If one or both of the buffers are
3288 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3289 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3290 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3291 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3292 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3293
ba44c216 3294 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3295
ddb0d839
KW
3296 UV c1 = CHRTEST_NOT_A_CP_1;
3297 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8 3298 bool use_chrtest_void = FALSE;
aed7b151 3299 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3300
3301 /* Used when we have both utf8 input and utf8 output, to avoid converting
3302 * to/from code points */
3303 bool utf8_has_been_setup = FALSE;
3304
c74f6de9
KW
3305 dVAR;
3306
b4291290 3307 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3308
79a2a0e8
KW
3309 if (OP(text_node) == EXACT) {
3310
3311 /* In an exact node, only one thing can be matched, that first
3312 * character. If both the pat and the target are UTF-8, we can just
3313 * copy the input to the output, avoiding finding the code point of
3314 * that character */
984e6dd1 3315 if (!is_utf8_pat) {
79a2a0e8
KW
3316 c2 = c1 = *pat;
3317 }
3318 else if (utf8_target) {
3319 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3320 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3321 utf8_has_been_setup = TRUE;
3322 }
3323 else {
3324 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3325 }
79a2a0e8
KW
3326 }
3327 else /* an EXACTFish node */
984e6dd1 3328 if ((is_utf8_pat
79a2a0e8
KW
3329 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3330 pat + STR_LEN(text_node)))
984e6dd1 3331 || (!is_utf8_pat
79a2a0e8
KW
3332 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3333 pat + STR_LEN(text_node))))
3334 {
3335 /* Multi-character folds require more context to sort out. Also
3336 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3337 * handled outside this routine */
3338 use_chrtest_void = TRUE;
3339 }
3340 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3341 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3342 if (c1 > 256) {
3343 /* Load the folds hash, if not already done */
3344 SV** listp;
3345 if (! PL_utf8_foldclosures) {
3346 if (! PL_utf8_tofold) {
3347 U8 dummy[UTF8_MAXBYTES+1];
3348
3349 /* Force loading this by folding an above-Latin1 char */
3350 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3351 assert(PL_utf8_tofold); /* Verify that worked */
3352 }
3353 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3354 }
3355
3356 /* The fold closures data structure is a hash with the keys being
3357 * the UTF-8 of every character that is folded to, like 'k', and
3358 * the values each an array of all code points that fold to its
3359 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3360 * not included */
3361 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3362 (char *) pat,
3363 UTF8SKIP(pat),
3364 FALSE))))
3365 {
3366 /* Not found in the hash, therefore there are no folds
3367 * containing it, so there is only a single character that
3368 * could match */
3369 c2 = c1;
3370 }
3371 else { /* Does participate in folds */
3372 AV* list = (AV*) *listp;
3373 if (av_len(list) != 1) {
3374
3375 /* If there aren't exactly two folds to this, it is outside
3376 * the scope of this function */
3377 use_chrtest_void = TRUE;
3378 }
3379 else { /* There are two. Get them */
3380 SV** c_p = av_fetch(list, 0, FALSE);
3381 if (c_p == NULL) {
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3383 }
3384 c1 = SvUV(*c_p);
3385
3386 c_p = av_fetch(list, 1, FALSE);
3387 if (c_p == NULL) {
3388 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3389 }
3390 c2 = SvUV(*c_p);
3391
3392 /* Folds that cross the 255/256 boundary are forbidden if
3393 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3394 * pattern character is above 256, and its only other match
3395 * is below 256, the only legal match will be to itself.
3396 * We have thrown away the original, so have to compute
3397 * which is the one above 255 */
3398 if ((c1 < 256) != (c2 < 256)) {
3399 if (OP(text_node) == EXACTFL
3400 || (OP(text_node) == EXACTFA
3401 && (isASCII(c1) || isASCII(c2))))
3402 {
3403 if (c1 < 256) {
3404 c1 = c2;
3405 }
3406 else {
3407 c2 = c1;
3408 }
3409 }
3410 }
3411 }
3412 }
3413 }
3414 else /* Here, c1 is < 255 */
3415 if (utf8_target
3416 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3417 && OP(text_node) != EXACTFL
3418 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3419 {
3420 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3421 * folds to this character in the pattern. All such cases except
3422 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3423 * involved in their folds, so are outside the scope of this
3424 * function */
3425 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3426 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3427 }
3428 else {
3429 use_chrtest_void = TRUE;
3430 }
c74f6de9
KW
3431 }
3432 else { /* Here nothing above Latin1 can fold to the pattern character */
3433 switch (OP(text_node)) {
3434
3435 case EXACTFL: /* /l rules */
79a2a0e8 3436 c2 = PL_fold_locale[c1];
c74f6de9
KW
3437 break;
3438
3439 case EXACTF:
3440 if (! utf8_target) { /* /d rules */
79a2a0e8 3441 c2 = PL_fold[c1];
c74f6de9
KW
3442 break;
3443 }
3444 /* FALLTHROUGH */
3445 /* /u rules for all these. This happens to work for
79a2a0e8 3446 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3447 case EXACTFA:
3448 case EXACTFU_TRICKYFOLD:
79a2a0e8 3449 case EXACTFU_SS:
c74f6de9 3450 case EXACTFU:
79a2a0e8 3451 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3452 break;
3453
878531d3
KW
3454 default:
3455 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3456 assert(0); /* NOTREACHED */
c74f6de9
KW
3457 }
3458 }
3459 }
79a2a0e8
KW
3460
3461 /* Here have figured things out. Set up the returns */
3462 if (use_chrtest_void) {
3463 *c2p = *c1p = CHRTEST_VOID;
3464 }
3465 else if (utf8_target) {
3466 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3467 uvchr_to_utf8(c1_utf8, c1);
3468 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3469 }
c74f6de9 3470
79a2a0e8
KW
3471 /* Invariants are stored in both the utf8 and byte outputs; Use
3472 * negative numbers otherwise for the byte ones. Make sure that the
3473 * byte ones are the same iff the utf8 ones are the same */
3474 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3475 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3476 ? *c2_utf8
3477 : (c1 == c2)
3478 ? CHRTEST_NOT_A_CP_1
3479 : CHRTEST_NOT_A_CP_2;
3480 }
3481 else if (c1 > 255) {
3482 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3483 can represent */
3484 return FALSE;
3485 }
c74f6de9 3486
79a2a0e8
KW
3487 *c1p = *c2p = c2; /* c2 is the only representable value */
3488 }
3489 else { /* c1 is representable; see about c2 */
3490 *c1p = c1;
3491 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3492 }
2f554ef7 3493
c74f6de9
KW
3494 return TRUE;
3495}
2f554ef7 3496
f73aaa43
DM
3497/* returns -1 on failure, $+[0] on success */
3498STATIC I32
3499S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3500{
a35a87e7 3501#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3502 dMY_CXT;
3503#endif
27da23d5 3504 dVAR;
ba44c216 3505 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 3506 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3507 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3508 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3509 RXi_GET_DECL(rex,rexi);
5d9a96ca 3510 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3511 regmatch_state *st;
5d9a96ca 3512 /* cache heavy used fields of st in registers */
eb578fdb
KW
3513 regnode *scan;
3514 regnode *next;
3515 U32 n = 0; /* general value; init to avoid compiler warning */
3516 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3517 char *locinput = startpos;
4d5016e5 3518 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3519 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3520
b69b0499 3521 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3522 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3523 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3524 const U32 max_nochange_depth =
3525 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3526 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;