This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Use Perl_croak_nocontext()
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
101 goto target; \
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
2d66f61e 110/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
111 * call if there are no complications: i.e., if everything matchable is
112 * straight forward in the bitmap */
3db24e1e 113#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 114 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 115
c277df42
IZ
116/*
117 * Forwards.
118 */
119
f2ed9b32 120#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 121#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 122
3dab1dad 123#define HOPc(pos,off) \
ba44c216 124 (char *)(reginfo->is_utf8_target \
220db18a 125 ? reghop3((U8*)pos, off, \
9d9163fb 126 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 127 : (U8*)(pos + off))
557f47af 128
3dab1dad 129#define HOPBACKc(pos, off) \
ba44c216 130 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
131 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
132 : (pos - off >= reginfo->strbeg) \
8e11feef 133 ? (U8*)pos - off \
3dab1dad 134 : NULL)
efb30f32 135
ba44c216 136#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 137#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 138
557f47af
DM
139/* lim must be +ve. Returns NULL on overshoot */
140#define HOPMAYBE3(pos,off,lim) \
141 (reginfo->is_utf8_target \
142 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
143 : ((U8*)pos + off <= lim) \
144 ? (U8*)pos + off \
145 : NULL)
146
8e9f2289
DM
147/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
148 * off must be >=0; args should be vars rather than expressions */
149#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
150 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
151 : (U8*)((pos + off) > lim ? lim : (pos + off)))
152
2974eaec
DM
153#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
154 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
155 : (U8*)(pos + off))
156#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
157
158#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
159#define NEXTCHR_IS_EOS (nextchr < 0)
160
161#define SET_nextchr \
220db18a 162 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
163
164#define SET_locinput(p) \
165 locinput = (p); \
166 SET_nextchr
167
168
2a16ac92 169#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
170 if (!swash_ptr) { \
171 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 172 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 173 1, 0, invlist, &flags); \
c7304fe2
KW
174 assert(swash_ptr); \
175 } \
176 } STMT_END
177
178/* If in debug mode, we test that a known character properly matches */
179#ifdef DEBUGGING
180# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
181 property_name, \
2a16ac92 182 invlist, \
c7304fe2 183 utf8_char_in_property) \
2a16ac92 184 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
185 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
186#else
187# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
188 property_name, \
2a16ac92 189 invlist, \
c7304fe2 190 utf8_char_in_property) \
2a16ac92 191 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 192#endif
d1eb3177 193
c7304fe2
KW
194#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
195 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
196 "", \
197 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 198 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 199
c7304fe2 200#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
201/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
202
5f80c4cf 203/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
204/* it would be nice to rework regcomp.sym to generate this stuff. sigh
205 *
206 * NOTE that *nothing* that affects backtracking should be in here, specifically
207 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
208 * node that is in between two EXACT like nodes when ascertaining what the required
209 * "follow" character is. This should probably be moved to regex compile time
210 * although it may be done at run time beause of the REF possibility - more
211 * investigation required. -- demerphq
212*/
baa60164
KW
213#define JUMPABLE(rn) ( \
214 OP(rn) == OPEN || \
3e901dc0 215 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
216 OP(rn) == EVAL || \
217 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
218 OP(rn) == PLUS || OP(rn) == MINMOD || \
219 OP(rn) == KEEPS || \
220 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 221)
ee9b8eae 222#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 223
ee9b8eae
YO
224#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
225
226#if 0
227/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 228 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 229#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 230#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
231#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
232
233#else
234/* ... so we use this as its faster. */
a4525e78
KW
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
236#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
237#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
238#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
239
240#endif
e2d8ce26 241
a84d97b6
HS
242/*
243 Search for mandatory following text node; for lookahead, the text must
244 follow but for lookbehind (rn->flags != 0) we skip to the next step.
245*/
baa60164 246#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
247 while (JUMPABLE(rn)) { \
248 const OPCODE type = OP(rn); \
249 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 250 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 251 else if (type == PLUS) \
cca55fe3 252 rn = NEXTOPER(rn); \
3dab1dad 253 else if (type == IFMATCH) \
a84d97b6 254 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 255 else rn += NEXT_OFF(rn); \
3dab1dad 256 } \
5f80c4cf 257} STMT_END
74750237 258
006f26b2
DM
259#define SLAB_FIRST(s) (&(s)->states[0])
260#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
261
a75351a1 262static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 263static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 264static regmatch_state * S_push_slab(pTHX);
51371543 265
87c0511b 266#define REGCP_PAREN_ELEMS 3
f067efbf 267#define REGCP_OTHER_ELEMS 3
e0fa7e2b 268#define REGCP_FRAME_ELEMS 1
620d5b66
NC
269/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
270 * are needed for the regexp context stack bookkeeping. */
271
76e3520e 272STATIC CHECKPOINT
92da3157 273S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 274{
a3b680e6 275 const int retval = PL_savestack_ix;
92da3157
DM
276 const int paren_elems_to_push =
277 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
278 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
279 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 280 I32 p;
40a82448 281 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 282
b93070ed
DM
283 PERL_ARGS_ASSERT_REGCPPUSH;
284
e49a9654 285 if (paren_elems_to_push < 0)
e8a85d26
JH
286 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
287 (int)paren_elems_to_push, (int)maxopenparen,
288 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 289
e0fa7e2b
NC
290 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
291 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 292 " out of range (%lu-%ld)",
92da3157
DM
293 total_elems,
294 (unsigned long)maxopenparen,
295 (long)parenfloor);
e0fa7e2b 296
620d5b66 297 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 298
495f47a5 299 DEBUG_BUFFERS_r(
92da3157 300 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
301 PerlIO_printf(Perl_debug_log,
302 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
303 PTR2UV(rex),
304 PTR2UV(rex->offs)
305 );
306 );
92da3157 307 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 308/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
309 SSPUSHIV(rex->offs[p].end);
310 SSPUSHIV(rex->offs[p].start);
1ca2007e 311 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 312 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
313 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
314 (UV)p,
315 (IV)rex->offs[p].start,
316 (IV)rex->offs[p].start_tmp,
317 (IV)rex->offs[p].end
40a82448 318 ));
a0d0e21e 319 }
b1ce53c5 320/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 321 SSPUSHINT(maxopenparen);
b93070ed
DM
322 SSPUSHINT(rex->lastparen);
323 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 324 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 325
a0d0e21e
LW
326 return retval;
327}
328
c277df42 329/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
330#define REGCP_SET(cp) \
331 DEBUG_STATE_r( \
ab3bbdeb 332 PerlIO_printf(Perl_debug_log, \
e4f74956 333 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
334 (IV)PL_savestack_ix)); \
335 cp = PL_savestack_ix
c3464db5 336
ab3bbdeb 337#define REGCP_UNWIND(cp) \
e4f74956 338 DEBUG_STATE_r( \
ab3bbdeb 339 if (cp != PL_savestack_ix) \
e4f74956
YO
340 PerlIO_printf(Perl_debug_log, \
341 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
342 (IV)(cp), (IV)PL_savestack_ix)); \
343 regcpblow(cp)
c277df42 344
a8d1f4b4
DM
345#define UNWIND_PAREN(lp, lcp) \
346 for (n = rex->lastparen; n > lp; n--) \
347 rex->offs[n].end = -1; \
348 rex->lastparen = n; \
349 rex->lastcloseparen = lcp;
350
351
f067efbf 352STATIC void
92da3157 353S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 354{
e0fa7e2b 355 UV i;
87c0511b 356 U32 paren;
a3621e74
YO
357 GET_RE_DEBUG_FLAGS_DECL;
358
7918f24d
NC
359 PERL_ARGS_ASSERT_REGCPPOP;
360
b1ce53c5 361 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 362 i = SSPOPUV;
e0fa7e2b
NC
363 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
364 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
365 rex->lastcloseparen = SSPOPINT;
366 rex->lastparen = SSPOPINT;
92da3157 367 *maxopenparen_p = SSPOPINT;
b1ce53c5 368
620d5b66 369 i -= REGCP_OTHER_ELEMS;
b1ce53c5 370 /* Now restore the parentheses context. */
495f47a5
DM
371 DEBUG_BUFFERS_r(
372 if (i || rex->lastparen + 1 <= rex->nparens)
373 PerlIO_printf(Perl_debug_log,
374 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
375 PTR2UV(rex),
376 PTR2UV(rex->offs)
377 );
378 );
92da3157 379 paren = *maxopenparen_p;
620d5b66 380 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 381 SSize_t tmps;
1ca2007e 382 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
383 rex->offs[paren].start = SSPOPIV;
384 tmps = SSPOPIV;
b93070ed
DM
385 if (paren <= rex->lastparen)
386 rex->offs[paren].end = tmps;
495f47a5
DM
387 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
388 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
389 (UV)paren,
390 (IV)rex->offs[paren].start,
391 (IV)rex->offs[paren].start_tmp,
392 (IV)rex->offs[paren].end,
393 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 394 );
87c0511b 395 paren--;
a0d0e21e 396 }
daf18116 397#if 1
dafc8851
JH
398 /* It would seem that the similar code in regtry()
399 * already takes care of this, and in fact it is in
400 * a better location to since this code can #if 0-ed out
401 * but the code in regtry() is needed or otherwise tests
402 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
403 * (as of patchlevel 7877) will fail. Then again,
404 * this code seems to be necessary or otherwise
225593e1
DM
405 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
406 * --jhi updated by dapm */
b93070ed 407 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 408 if (i > *maxopenparen_p)
b93070ed
DM
409 rex->offs[i].start = -1;
410 rex->offs[i].end = -1;
495f47a5
DM
411 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
412 " \\%"UVuf": %s ..-1 undeffing\n",
413 (UV)i,
92da3157 414 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 415 ));
a0d0e21e 416 }
dafc8851 417#endif
a0d0e21e
LW
418}
419
74088413
DM
420/* restore the parens and associated vars at savestack position ix,
421 * but without popping the stack */
422
423STATIC void
92da3157 424S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
425{
426 I32 tmpix = PL_savestack_ix;
427 PL_savestack_ix = ix;
92da3157 428 regcppop(rex, maxopenparen_p);
74088413
DM
429 PL_savestack_ix = tmpix;
430}
431
02db2b7b 432#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 433
31c7f561
KW
434STATIC bool
435S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
436{
437 /* Returns a boolean as to whether or not 'character' is a member of the
438 * Posix character class given by 'classnum' that should be equivalent to a
439 * value in the typedef '_char_class_number'.
440 *
441 * Ideally this could be replaced by a just an array of function pointers
442 * to the C library functions that implement the macros this calls.
443 * However, to compile, the precise function signatures are required, and
444 * these may vary from platform to to platform. To avoid having to figure
445 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
446 * which adds an extra layer of function call overhead (unless the C
447 * optimizer strips it away). But we don't particularly care about
448 * performance with locales anyway. */
31c7f561
KW
449
450 switch ((_char_class_number) classnum) {
15861f94 451 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 452 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
453 case _CC_ENUM_ASCII: return isASCII_LC(character);
454 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
455 case _CC_ENUM_CASED: return isLOWER_LC(character)
456 || isUPPER_LC(character);
e8d596e0 457 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
458 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
459 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
460 case _CC_ENUM_LOWER: return isLOWER_LC(character);
461 case _CC_ENUM_PRINT: return isPRINT_LC(character);
462 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 463 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
464 case _CC_ENUM_UPPER: return isUPPER_LC(character);
465 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 466 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
467 default: /* VERTSPACE should never occur in locales */
468 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
469 }
470
e5964223 471 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
472 return FALSE;
473}
474
3018b823
KW
475STATIC bool
476S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
477{
478 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
479 * 'character' is a member of the Posix character class given by 'classnum'
480 * that should be equivalent to a value in the typedef
481 * '_char_class_number'.
482 *
483 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 484 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
485 * rules, ignoring any locale. So use the Unicode function if this class
486 * requires a swash, and use the Unicode macro otherwise. */
487
488 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
489
490 if (UTF8_IS_INVARIANT(*character)) {
491 return isFOO_lc(classnum, *character);
492 }
493 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
494 return isFOO_lc(classnum,
a62b247b 495 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
496 }
497
613abc6d
KW
498 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
499
3018b823
KW
500 if (classnum < _FIRST_NON_SWASH_CC) {
501
502 /* Initialize the swash unless done already */
503 if (! PL_utf8_swash_ptrs[classnum]) {
504 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
505 PL_utf8_swash_ptrs[classnum] =
506 _core_swash_init("utf8",
507 "",
508 &PL_sv_undef, 1, 0,
509 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
510 }
511
92a2046b
KW
512 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
513 character,
514 TRUE /* is UTF */ ));
3018b823
KW
515 }
516
517 switch ((_char_class_number) classnum) {
779cf272 518 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
519 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
520 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
521 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 522 default: break;
3018b823
KW
523 }
524
e1ee3960 525 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
526}
527
a687059c 528/*
e50aee73 529 * pregexec and friends
a687059c
LW
530 */
531
76234dfb 532#ifndef PERL_IN_XSUB_RE
a687059c 533/*
c277df42 534 - pregexec - match a regexp against a string
a687059c 535 */
c277df42 536I32
5aaab254 537Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 538 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
539/* stringarg: the point in the string at which to begin matching */
540/* strend: pointer to null at end of string */
541/* strbeg: real beginning of string */
542/* minend: end of match must be >= minend bytes after stringarg. */
543/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
544 * itself is accessed via the pointers above */
545/* nosave: For optimizations. */
c277df42 546{
7918f24d
NC
547 PERL_ARGS_ASSERT_PREGEXEC;
548
c277df42 549 return
9041c2e3 550 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
551 nosave ? 0 : REXEC_COPY_STR);
552}
76234dfb 553#endif
22e551b9 554
cad2e5aa 555
6eb5f6b9 556
1a4edc3c
DM
557/* re_intuit_start():
558 *
559 * Based on some optimiser hints, try to find the earliest position in the
560 * string where the regex could match.
561 *
562 * rx: the regex to match against
563 * sv: the SV being matched: only used for utf8 flag; the string
564 * itself is accessed via the pointers below. Note that on
565 * something like an overloaded SV, SvPOK(sv) may be false
566 * and the string pointers may point to something unrelated to
567 * the SV itself.
568 * strbeg: real beginning of string
569 * strpos: the point in the string at which to begin matching
570 * strend: pointer to the byte following the last char of the string
571 * flags currently unused; set to 0
572 * data: currently unused; set to NULL
573 *
574 * The basic idea of re_intuit_start() is to use some known information
575 * about the pattern, namely:
576 *
577 * a) the longest known anchored substring (i.e. one that's at a
578 * constant offset from the beginning of the pattern; but not
579 * necessarily at a fixed offset from the beginning of the
580 * string);
581 * b) the longest floating substring (i.e. one that's not at a constant
582 * offset from the beginning of the pattern);
583 * c) Whether the pattern is anchored to the string; either
584 * an absolute anchor: /^../, or anchored to \n: /^.../m,
585 * or anchored to pos(): /\G/;
586 * d) A start class: a real or synthetic character class which
587 * represents which characters are legal at the start of the pattern;
588 *
589 * to either quickly reject the match, or to find the earliest position
590 * within the string at which the pattern might match, thus avoiding
591 * running the full NFA engine at those earlier locations, only to
592 * eventually fail and retry further along.
593 *
594 * Returns NULL if the pattern can't match, or returns the address within
595 * the string which is the earliest place the match could occur.
596 *
597 * The longest of the anchored and floating substrings is called 'check'
598 * and is checked first. The other is called 'other' and is checked
599 * second. The 'other' substring may not be present. For example,
600 *
601 * /(abc|xyz)ABC\d{0,3}DEFG/
602 *
603 * will have
604 *
605 * check substr (float) = "DEFG", offset 6..9 chars
606 * other substr (anchored) = "ABC", offset 3..3 chars
607 * stclass = [ax]
608 *
609 * Be aware that during the course of this function, sometimes 'anchored'
610 * refers to a substring being anchored relative to the start of the
611 * pattern, and sometimes to the pattern itself being anchored relative to
612 * the string. For example:
613 *
614 * /\dabc/: "abc" is anchored to the pattern;
615 * /^\dabc/: "abc" is anchored to the pattern and the string;
616 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
617 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
618 * but the pattern is anchored to the string.
52a21eb3
DM
619 */
620
cad2e5aa 621char *
52a21eb3
DM
622Perl_re_intuit_start(pTHX_
623 REGEXP * const rx,
624 SV *sv,
625 const char * const strbeg,
626 char *strpos,
627 char *strend,
628 const U32 flags,
629 re_scream_pos_data *data)
cad2e5aa 630{
8d919b0a 631 struct regexp *const prog = ReANY(rx);
6b071d16 632 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 633 /* Should be nonnegative! */
ea3daa5d 634 SSize_t end_shift = 0;
0fc004dd
DM
635 /* current lowest pos in string where the regex can start matching */
636 char *rx_origin = strpos;
eb578fdb 637 SV *check;
f2ed9b32 638 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 639 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 640 bool ml_anch = 0;
8f4bf5fc 641 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 642 char *check_at = NULL; /* check substr found at this pos */
bbe252da 643 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 644 RXi_GET_DECL(prog,progi);
02d5137b
DM
645 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
646 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
647 GET_RE_DEBUG_FLAGS_DECL;
648
7918f24d 649 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
650 PERL_UNUSED_ARG(flags);
651 PERL_UNUSED_ARG(data);
7918f24d 652
1dc475d0
DM
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 "Intuit: trying to determine minimum start position...\n"));
655
fb9bbddb
DM
656 /* for now, assume that all substr offsets are positive. If at some point
657 * in the future someone wants to do clever things with look-behind and
658 * -ve offsets, they'll need to fix up any code in this function
659 * which uses these offsets. See the thread beginning
660 * <20140113145929.GF27210@iabyn.com>
661 */
662 assert(prog->substrs->data[0].min_offset >= 0);
663 assert(prog->substrs->data[0].max_offset >= 0);
664 assert(prog->substrs->data[1].min_offset >= 0);
665 assert(prog->substrs->data[1].max_offset >= 0);
666 assert(prog->substrs->data[2].min_offset >= 0);
667 assert(prog->substrs->data[2].max_offset >= 0);
668
f7022b5a 669 /* for now, assume that if both present, that the floating substring
83f2232d 670 * doesn't start before the anchored substring.
f7022b5a
DM
671 * If you break this assumption (e.g. doing better optimisations
672 * with lookahead/behind), then you'll need to audit the code in this
673 * function carefully first
674 */
675 assert(
676 ! ( (prog->anchored_utf8 || prog->anchored_substr)
677 && (prog->float_utf8 || prog->float_substr))
678 || (prog->float_min_offset >= prog->anchored_offset));
679
1a4edc3c
DM
680 /* byte rather than char calculation for efficiency. It fails
681 * to quickly reject some cases that can't match, but will reject
682 * them later after doing full char arithmetic */
c344f387 683 if (prog->minlen > strend - strpos) {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 685 " String too short...\n"));
cad2e5aa 686 goto fail;
2c2d71f5 687 }
d8da0584 688
ab4e48c1 689 RX_MATCH_UTF8_set(rx,utf8_target);
6c3fea77 690 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 691 reginfo->info_aux = NULL;
9d9163fb 692 reginfo->strbeg = strbeg;
220db18a 693 reginfo->strend = strend;
aed7b151 694 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 695 reginfo->intuit = 1;
1cb48e53
DM
696 /* not actually used within intuit, but zero for safety anyway */
697 reginfo->poscache_maxiter = 0;
02d5137b 698
f2ed9b32 699 if (utf8_target) {
33b8afdf
JH
700 if (!prog->check_utf8 && prog->check_substr)
701 to_utf8_substr(prog);
702 check = prog->check_utf8;
703 } else {
7e0d5ad7
KW
704 if (!prog->check_substr && prog->check_utf8) {
705 if (! to_byte_substr(prog)) {
6b54ddc5 706 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
707 }
708 }
33b8afdf
JH
709 check = prog->check_substr;
710 }
274cd312 711
1dc475d0
DM
712 /* dump the various substring data */
713 DEBUG_OPTIMISE_MORE_r({
714 int i;
715 for (i=0; i<=2; i++) {
716 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
717 : prog->substrs->data[i].substr);
718 if (!sv)
719 continue;
720
721 PerlIO_printf(Perl_debug_log,
722 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
723 " useful=%"IVdf" utf8=%d [%s]\n",
724 i,
725 (IV)prog->substrs->data[i].min_offset,
726 (IV)prog->substrs->data[i].max_offset,
727 (IV)prog->substrs->data[i].end_shift,
728 BmUSEFUL(sv),
729 utf8_target ? 1 : 0,
730 SvPEEK(sv));
731 }
732 });
733
8e1490ee 734 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
735
736 /* ml_anch: check after \n?
737 *
0fa70a06 738 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
739 * with /.*.../, these flags will have been added by the
740 * compiler:
741 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
742 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
743 */
7d2d37f5
DM
744 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
745 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 746
343c8a29 747 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
748 /* we are only allowed to match at BOS or \G */
749
57fcbfa7 750 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
751 *
752 * Note that we don't try to do a similar quick reject for
753 * \G, since generally the caller will have calculated strpos
754 * based on pos() and gofs, so the string is already correctly
755 * anchored by definition; and handling the exceptions would
756 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 757 */
7bb3b9eb 758 if ( strpos != strbeg
d3d47aac 759 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8
DM
760 {
761 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 762 " Not at start...\n"));
c889ccc8
DM
763 goto fail;
764 }
765
a5d12a4b
DM
766 /* in the presence of an anchor, the anchored (relative to the
767 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
768 * to strpos. So quickly reject if substr isn't found there.
769 * This works for \G too, because the caller will already have
770 * subtracted gofs from pos, and gofs is the offset from the
771 * \G to the start of the regex. For example, in /.abc\Gdef/,
772 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
773 * caller will have set strpos=pos()-4; we look for the substr
774 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 775
33c28ab2 776 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 777 /* Substring at constant offset from beg-of-str... */
d307bf57 778 SSize_t slen = SvCUR(check);
343c8a29 779 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 780
1dc475d0
DM
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
782 " Looking for check substr at fixed offset %"IVdf"...\n",
783 (IV)prog->check_offset_min));
784
7742aa66
DM
785 if (SvTAIL(check)) {
786 /* In this case, the regex is anchored at the end too.
787 * Unless it's a multiline match, the lengths must match
788 * exactly, give or take a \n. NB: slen >= 1 since
789 * the last char of check is \n */
790 if (!multiline
791 && ( strend - s > slen
792 || strend - s < slen - 1
793 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
794 {
795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 796 " String too long...\n"));
c889ccc8
DM
797 goto fail_finish;
798 }
799 /* Now should match s[0..slen-2] */
800 slen--;
c889ccc8 801 }
d307bf57
DM
802 if (slen && (*SvPVX_const(check) != *s
803 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
804 {
805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 806 " String not equal...\n"));
d307bf57
DM
807 goto fail_finish;
808 }
c889ccc8
DM
809
810 check_at = s;
811 goto success_at_start;
cad2e5aa 812 }
cad2e5aa 813 }
cad2e5aa 814 }
0fc004dd 815
c0e0ec46 816 end_shift = prog->check_end_shift;
cad2e5aa 817
19188028 818#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 819 if (end_shift < 0)
1de06328 820 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 821 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
822#endif
823
2c2d71f5 824 restart:
1de06328 825
66b7ec5c
DM
826 /* This is the (re)entry point of the main loop in this function.
827 * The goal of this loop is to:
828 * 1) find the "check" substring in the region rx_origin..strend
829 * (adjusted by start_shift / end_shift). If not found, reject
830 * immediately.
831 * 2) If it exists, look for the "other" substr too if defined; for
832 * example, if the check substr maps to the anchored substr, then
833 * check the floating substr, and vice-versa. If not found, go
834 * back to (1) with rx_origin suitably incremented.
835 * 3) If we find an rx_origin position that doesn't contradict
836 * either of the substrings, then check the possible additional
837 * constraints on rx_origin of /^.../m or a known start class.
838 * If these fail, then depending on which constraints fail, jump
839 * back to here, or to various other re-entry points further along
840 * that skip some of the first steps.
841 * 4) If we pass all those tests, update the BmUSEFUL() count on the
842 * substring. If the start position was determined to be at the
843 * beginning of the string - so, not rejected, but not optimised,
844 * since we have to run regmatch from position 0 - decrement the
845 * BmUSEFUL() count. Otherwise increment it.
846 */
847
1a4edc3c
DM
848
849 /* first, look for the 'check' substring */
850
1de06328 851 {
c33e64f0
FC
852 U8* start_point;
853 U8* end_point;
c889ccc8 854
c889ccc8 855 DEBUG_OPTIMISE_MORE_r({
1dc475d0 856 PerlIO_printf(Perl_debug_log,
ae5d4331 857 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 858 " Start shift: %"IVdf" End shift %"IVdf
4d281893 859 " Real end Shift: %"IVdf"\n",
675e93ee 860 (IV)(rx_origin - strbeg),
c889ccc8 861 (IV)prog->check_offset_min,
12fbc530
DM
862 (IV)start_shift,
863 (IV)end_shift,
c889ccc8
DM
864 (IV)prog->check_end_shift);
865 });
1de06328 866
33c28ab2
DM
867 end_point = HOP3(strend, -end_shift, strbeg);
868 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
869 if (!start_point)
870 goto fail_finish;
c889ccc8 871
557f47af 872
e0362b86 873 /* If the regex is absolutely anchored to either the start of the
d3d47aac 874 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
875 * check_offset_max represents an upper bound on the string where
876 * the substr could start. For the ANCH_GPOS case, we assume that
877 * the caller of intuit will have already set strpos to
878 * pos()-gofs, so in this case strpos + offset_max will still be
879 * an upper bound on the substr.
880 */
c19c836a
DM
881 if (!ml_anch
882 && prog->intflags & PREGf_ANCH
e0362b86 883 && prog->check_offset_max != SSize_t_MAX)
c19c836a 884 {
1a08ba3a 885 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
886 const char * const anchor =
887 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
888
889 /* do a bytes rather than chars comparison. It's conservative;
890 * so it skips doing the HOP if the result can't possibly end
891 * up earlier than the old value of end_point.
892 */
893 if ((char*)end_point - anchor > prog->check_offset_max) {
894 end_point = HOP3lim((U8*)anchor,
895 prog->check_offset_max,
896 end_point -len)
897 + len;
898 }
d6ef1678
DM
899 }
900
ae5d4331 901 check_at = fbm_instr( start_point, end_point,
7fba1cd6 902 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 903
675e93ee
DM
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
905 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
906 (IV)((char*)start_point - strbeg),
907 (IV)((char*)end_point - strbeg),
908 (IV)(check_at ? check_at - strbeg : -1)
909 ));
910
8fd34720
DM
911 /* Update the count-of-usability, remove useless subpatterns,
912 unshift s. */
913
914 DEBUG_EXECUTE_r({
915 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
916 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
917 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
918 (check_at ? "Found" : "Did not find"),
919 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
920 ? "anchored" : "floating"),
921 quoted,
922 RE_SV_TAIL(check),
923 (check_at ? " at offset " : "...\n") );
924 });
2c2d71f5 925
8fd34720
DM
926 if (!check_at)
927 goto fail_finish;
8fd34720
DM
928 /* set rx_origin to the minimum position where the regex could start
929 * matching, given the constraint of the just-matched check substring.
930 * But don't set it lower than previously.
931 */
fdc003fd 932
8fd34720
DM
933 if (check_at - rx_origin > prog->check_offset_max)
934 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee
DM
935 /* Finish the diagnostic message */
936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
937 "%ld (rx_origin now %"IVdf")...\n",
938 (long)(check_at - strbeg),
939 (IV)(rx_origin - strbeg)
940 ));
8fd34720 941 }
fdc003fd
DM
942
943
1a4edc3c 944 /* now look for the 'other' substring if defined */
2c2d71f5 945
6480a6c4
DM
946 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
947 : prog->substrs->data[other_ix].substr)
1de06328 948 {
30944b6d 949 /* Take into account the "other" substring. */
6c3343a6
DM
950 char *last, *last1;
951 char *s;
952 SV* must;
953 struct reg_substr_datum *other;
954
955 do_other_substr:
956 other = &prog->substrs->data[other_ix];
957
958 /* if "other" is anchored:
959 * we've previously found a floating substr starting at check_at.
960 * This means that the regex origin must lie somewhere
961 * between min (rx_origin): HOP3(check_at, -check_offset_max)
962 * and max: HOP3(check_at, -check_offset_min)
963 * (except that min will be >= strpos)
964 * So the fixed substr must lie somewhere between
965 * HOP3(min, anchored_offset)
966 * HOP3(max, anchored_offset) + SvCUR(substr)
967 */
968
969 /* if "other" is floating
970 * Calculate last1, the absolute latest point where the
971 * floating substr could start in the string, ignoring any
972 * constraints from the earlier fixed match. It is calculated
973 * as follows:
974 *
975 * strend - prog->minlen (in chars) is the absolute latest
976 * position within the string where the origin of the regex
977 * could appear. The latest start point for the floating
978 * substr is float_min_offset(*) on from the start of the
979 * regex. last1 simply combines thee two offsets.
980 *
981 * (*) You might think the latest start point should be
982 * float_max_offset from the regex origin, and technically
983 * you'd be correct. However, consider
984 * /a\d{2,4}bcd\w/
985 * Here, float min, max are 3,5 and minlen is 7.
986 * This can match either
987 * /a\d\dbcd\w/
988 * /a\d\d\dbcd\w/
989 * /a\d\d\d\dbcd\w/
990 * In the first case, the regex matches minlen chars; in the
991 * second, minlen+1, in the third, minlen+2.
992 * In the first case, the floating offset is 3 (which equals
993 * float_min), in the second, 4, and in the third, 5 (which
994 * equals float_max). In all cases, the floating string bcd
995 * can never start more than 4 chars from the end of the
996 * string, which equals minlen - float_min. As the substring
997 * starts to match more than float_min from the start of the
998 * regex, it makes the regex match more than minlen chars,
999 * and the two cancel each other out. So we can always use
1000 * float_min - minlen, rather than float_max - minlen for the
1001 * latest position in the string.
1002 *
1003 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1004 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1005 */
1006
e7a14a9c 1007 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1008 last1 = HOP3c(strend,
1009 other->min_offset - prog->minlen, strbeg);
1010
4d006249 1011 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1012 /* last is the latest point where the floating substr could
1013 * start, *given* any constraints from the earlier fixed
1014 * match. This constraint is that the floating string starts
1015 * <= float_max_offset chars from the regex origin (rx_origin).
1016 * If this value is less than last1, use it instead.
eb3831ce 1017 */
6c3343a6
DM
1018 assert(rx_origin <= last1);
1019 last =
1020 /* this condition handles the offset==infinity case, and
1021 * is a short-cut otherwise. Although it's comparing a
1022 * byte offset to a char length, it does so in a safe way,
1023 * since 1 char always occupies 1 or more bytes,
1024 * so if a string range is (last1 - rx_origin) bytes,
1025 * it will be less than or equal to (last1 - rx_origin)
1026 * chars; meaning it errs towards doing the accurate HOP3
1027 * rather than just using last1 as a short-cut */
1028 (last1 - rx_origin) < other->max_offset
1029 ? last1
1030 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1031 }
1032 else {
1033 assert(strpos + start_shift <= check_at);
1034 last = HOP4c(check_at, other->min_offset - start_shift,
1035 strbeg, strend);
1036 }
ead917d0 1037
6c3343a6
DM
1038 s = HOP3c(rx_origin, other->min_offset, strend);
1039 if (s < other_last) /* These positions already checked */
1040 s = other_last;
1041
1042 must = utf8_target ? other->utf8_substr : other->substr;
1043 assert(SvPOK(must));
675e93ee
DM
1044 {
1045 char *from = s;
1046 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1047
88203927
DM
1048 if (from > to) {
1049 s = NULL;
1050 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1051 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1052 (IV)(from - strbeg),
1053 (IV)(to - strbeg)
1054 ));
1055 }
1056 else {
1057 s = fbm_instr(
1058 (unsigned char*)from,
1059 (unsigned char*)to,
1060 must,
1061 multiline ? FBMrf_MULTILINE : 0
1062 );
1063 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1064 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1065 (IV)(from - strbeg),
1066 (IV)(to - strbeg),
1067 (IV)(s ? s - strbeg : -1)
1068 ));
1069 }
675e93ee
DM
1070 }
1071
6c3343a6
DM
1072 DEBUG_EXECUTE_r({
1073 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1076 s ? "Found" : "Contradicts",
1077 other_ix ? "floating" : "anchored",
1078 quoted, RE_SV_TAIL(must));
1079 });
ead917d0 1080
ead917d0 1081
6c3343a6
DM
1082 if (!s) {
1083 /* last1 is latest possible substr location. If we didn't
1084 * find it before there, we never will */
1085 if (last >= last1) {
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee 1087 "; giving up...\n"));
6c3343a6 1088 goto fail_finish;
ead917d0
DM
1089 }
1090
6c3343a6
DM
1091 /* try to find the check substr again at a later
1092 * position. Maybe next time we'll find the "other" substr
1093 * in range too */
6c3343a6
DM
1094 other_last = HOP3c(last, 1, strend) /* highest failure */;
1095 rx_origin =
4d006249 1096 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1097 ? HOP3c(rx_origin, 1, strend)
1098 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
675e93ee
DM
1099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1100 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1101 (other_ix ? "floating" : "anchored"),
1102 (long)(HOP3c(check_at, 1, strend) - strbeg),
1103 (IV)(rx_origin - strbeg)
1104 ));
6c3343a6
DM
1105 goto restart;
1106 }
1107 else {
4d006249 1108 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1109 /* other_last is set to s, not s+1, since its possible for
1110 * a floating substr to fail first time, then succeed
1111 * second time at the same floating position; e.g.:
1112 * "-AB--AABZ" =~ /\wAB\d*Z/
1113 * The first time round, anchored and float match at
1114 * "-(AB)--AAB(Z)" then fail on the initial \w character
1115 * class. Second time round, they match at "-AB--A(AB)(Z)".
1116 */
1117 other_last = s;
ead917d0
DM
1118 }
1119 else {
6c3343a6
DM
1120 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1121 other_last = HOP3c(s, 1, strend);
ead917d0 1122 }
675e93ee
DM
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1124 " at offset %ld (rx_origin now %"IVdf")...\n",
1125 (long)(s - strbeg),
1126 (IV)(rx_origin - strbeg)
1127 ));
1128
6c3343a6 1129 }
cad2e5aa 1130 }
acba93e8
DM
1131 else {
1132 DEBUG_OPTIMISE_MORE_r(
1133 PerlIO_printf(Perl_debug_log,
1134 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1135 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
675e93ee 1136 " strend:%"IVdf"\n",
acba93e8
DM
1137 (IV)prog->check_offset_min,
1138 (IV)prog->check_offset_max,
675e93ee
DM
1139 (IV)(check_at-strbeg),
1140 (IV)(rx_origin-strbeg),
1c1c599d 1141 (IV)(rx_origin-check_at),
675e93ee 1142 (IV)(strend-strbeg)
acba93e8
DM
1143 )
1144 );
1145 }
2c2d71f5 1146
acba93e8 1147 postprocess_substr_matches:
0991020e 1148
1a4edc3c 1149 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1150
7d2d37f5 1151 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1152 char *s;
1153
a62659bd
DM
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " looking for /^/m anchor"));
d0880ea7
DM
1156
1157 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1158 * Find the next \n, if any, even if it's beyond the current
1159 * anchored and/or floating substrings. Whether we should be
1160 * scanning ahead for the next \n or the next substr is debatable.
1161 * On the one hand you'd expect rare substrings to appear less
1162 * often than \n's. On the other hand, searching for \n means
675e93ee 1163 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1164 * iteration as the current "rarest" string candidate, which
1165 * means for example that we'll quickly reject the whole string if
1166 * hasn't got a \n, rather than trying every substr position
1167 * first
1168 */
d0880ea7 1169
4620cb61
DM
1170 s = HOP3c(strend, - prog->minlen, strpos);
1171 if (s <= rx_origin ||
1172 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1173 {
d0880ea7
DM
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 " Did not find /%s^%s/m...\n",
1176 PL_colors[0], PL_colors[1]));
a62659bd
DM
1177 goto fail_finish;
1178 }
d0880ea7 1179
4ada1233
DM
1180 /* earliest possible origin is 1 char after the \n.
1181 * (since *rx_origin == '\n', it's safe to ++ here rather than
1182 * HOP(rx_origin, 1)) */
1183 rx_origin++;
d0880ea7 1184
f4f115de 1185 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1186 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1187 {
d0880ea7
DM
1188 /* Position contradicts check-string; either because
1189 * check was anchored (and thus has no wiggle room),
4ada1233 1190 * or check was float and rx_origin is above the float range */
d0880ea7 1191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1192 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1193 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1194 goto restart;
1195 }
1196
1197 /* if we get here, the check substr must have been float,
2e759faa 1198 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1199 * "other" substr which still contradicts */
1200 assert(prog->substrs->check_ix); /* check is float */
1201
1202 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1203 /* whoops, the anchored "other" substr exists, so we still
1204 * contradict. On the other hand, the float "check" substr
1205 * didn't contradict, so just retry the anchored "other"
1206 * substr */
1207 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee 1208 " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
d0880ea7 1209 PL_colors[0], PL_colors[1],
675e93ee
DM
1210 (long)(rx_origin - strbeg + prog->anchored_offset),
1211 (long)(rx_origin - strbeg)
1212 ));
d0880ea7
DM
1213 goto do_other_substr;
1214 }
1215
1216 /* success: we don't contradict the found floating substring
1217 * (and there's no anchored substr). */
d0880ea7 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1219 " Found /%s^%s/m with rx_origin %ld...\n",
1220 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1221 }
1222 else {
2e759faa 1223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1224 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1225 }
1226
ffad1e6a 1227 success_at_start:
e3c6feb0 1228
cad2e5aa 1229
dd170ff5
DM
1230 /* if we have a starting character class, then test that extra constraint.
1231 * (trie stclasses are too expensive to use here, we are better off to
1232 * leave it to regmatch itself) */
1233
f8fc2ecf 1234 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1235 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1236
2c75e362 1237 /* XXX this value could be pre-computed */
f8fc2ecf 1238 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1239 ? (reginfo->is_utf8_pat
1240 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1241 : STR_LEN(progi->regstclass))
66e933ab 1242 : 1);
1de06328 1243 char * endpos;
fa3bb21d 1244 char *s;
000dfd2d
DM
1245 /* latest pos that a matching float substr constrains rx start to */
1246 char *rx_max_float = NULL;
1247
c75a3985
DM
1248 /* if the current rx_origin is anchored, either by satisfying an
1249 * anchored substring constraint, or a /^.../m constraint, then we
1250 * can reject the current origin if the start class isn't found
1251 * at the current position. If we have a float-only match, then
1252 * rx_origin is constrained to a range; so look for the start class
1253 * in that range. if neither, then look for the start class in the
1254 * whole rest of the string */
1255
dd170ff5
DM
1256 /* XXX DAPM it's not clear what the minlen test is for, and why
1257 * it's not used in the floating case. Nothing in the test suite
1258 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1259 * Here are some old comments, which may or may not be correct:
1260 *
1261 * minlen == 0 is possible if regstclass is \b or \B,
1262 * and the fixed substr is ''$.
1263 * Since minlen is already taken into account, rx_origin+1 is
1264 * before strend; accidentally, minlen >= 1 guaranties no false
1265 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1266 * 0) below assumes that regstclass does not come from lookahead...
1267 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1268 * This leaves EXACTF-ish only, which are dealt with in
1269 * find_byclass().
1270 */
1271
7d2d37f5 1272 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1273 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1274 else if (prog->float_substr || prog->float_utf8) {
1275 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1276 endpos= HOP3c(rx_max_float, cl_l, strend);
1277 }
1de06328
YO
1278 else
1279 endpos= strend;
1280
1dc475d0
DM
1281 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1282 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1283 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1284 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1285 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1286
c43b5520 1287 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1288 reginfo);
be778b1a 1289 if (!s) {
6eb5f6b9 1290 if (endpos == strend) {
a3621e74 1291 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1292 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1293 goto fail;
1294 }
a3621e74 1295 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1296 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1297 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1298 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1299 goto fail;
9fed8d02 1300
6eb5f6b9 1301 /* Contradict one of substrings */
97136c8a
DM
1302 if (prog->anchored_substr || prog->anchored_utf8) {
1303 if (prog->substrs->check_ix == 1) { /* check is float */
1304 /* Have both, check_string is floating */
1305 assert(rx_origin + start_shift <= check_at);
1306 if (rx_origin + start_shift != check_at) {
1307 /* not at latest position float substr could match:
c75a3985
DM
1308 * Recheck anchored substring, but not floating.
1309 * The condition above is in bytes rather than
1310 * chars for efficiency. It's conservative, in
1311 * that it errs on the side of doing 'goto
88203927
DM
1312 * do_other_substr'. In this case, at worst,
1313 * an extra anchored search may get done, but in
1314 * practice the extra fbm_instr() is likely to
1315 * get skipped anyway. */
97136c8a 1316 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee
DM
1317 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1318 (long)(other_last - strbeg),
1319 (IV)(rx_origin - strbeg)
1320 ));
97136c8a 1321 goto do_other_substr;
3369914b 1322 }
3369914b
DM
1323 }
1324 }
97136c8a 1325 else {
9fed8d02
DM
1326 /* float-only */
1327
7d2d37f5 1328 if (ml_anch) {
c75a3985
DM
1329 /* In the presence of ml_anch, we might be able to
1330 * find another \n without breaking the current float
1331 * constraint. */
1332
1333 /* strictly speaking this should be HOP3c(..., 1, ...),
1334 * but since we goto a block of code that's going to
1335 * search for the next \n if any, its safe here */
9fed8d02 1336 rx_origin++;
9fed8d02 1337 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1338 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1339 PL_colors[0], PL_colors[1],
675e93ee 1340 (long)(rx_origin - strbeg)) );
9fed8d02 1341 goto postprocess_substr_matches;
ab60c45a 1342 }
c75a3985
DM
1343
1344 /* strictly speaking this can never be true; but might
1345 * be if we ever allow intuit without substrings */
1346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1347 goto fail;
c75a3985 1348
000dfd2d 1349 rx_origin = rx_max_float;
9fed8d02
DM
1350 }
1351
c75a3985
DM
1352 /* at this point, any matching substrings have been
1353 * contradicted. Start again... */
1354
9fed8d02 1355 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1356
1357 /* uses bytes rather than char calculations for efficiency.
1358 * It's conservative: it errs on the side of doing 'goto restart',
1359 * where there is code that does a proper char-based test */
9fed8d02 1360 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1361 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1362 " Could not match STCLASS...\n") );
1363 goto fail;
1364 }
1365 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1366 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
9fed8d02 1367 (prog->substrs->check_ix ? "floating" : "anchored"),
675e93ee
DM
1368 (long)(rx_origin + start_shift - strbeg),
1369 (IV)(rx_origin - strbeg)
1370 ));
9fed8d02 1371 goto restart;
6eb5f6b9 1372 }
9fed8d02 1373
c75a3985
DM
1374 /* Success !!! */
1375
5f9c6575 1376 if (rx_origin != s) {
a3621e74 1377 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1378 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1379 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1380 );
1381 }
1382 else {
a3621e74 1383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1384 " Does not contradict STCLASS...\n");
b7953727
JH
1385 );
1386 }
6eb5f6b9 1387 }
ffad1e6a
DM
1388
1389 /* Decide whether using the substrings helped */
1390
1391 if (rx_origin != strpos) {
1392 /* Fixed substring is found far enough so that the match
1393 cannot start at strpos. */
1394
1395 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1396 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1397 }
1398 else {
70563e16
DM
1399 /* The found rx_origin position does not prohibit matching at
1400 * strpos, so calling intuit didn't gain us anything. Decrement
1401 * the BmUSEFUL() count on the check substring, and if we reach
1402 * zero, free it. */
1403 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1404 && (utf8_target ? (
1405 prog->check_utf8 /* Could be deleted already */
1406 && --BmUSEFUL(prog->check_utf8) < 0
1407 && (prog->check_utf8 == prog->float_utf8)
1408 ) : (
1409 prog->check_substr /* Could be deleted already */
1410 && --BmUSEFUL(prog->check_substr) < 0
1411 && (prog->check_substr == prog->float_substr)
1412 )))
1413 {
1414 /* If flags & SOMETHING - do not do it many times on the same match */
1415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1416 /* XXX Does the destruction order has to change with utf8_target? */
1417 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1418 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1419 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1420 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1421 check = NULL; /* abort */
ffad1e6a
DM
1422 /* XXXX This is a remnant of the old implementation. It
1423 looks wasteful, since now INTUIT can use many
1424 other heuristics. */
1425 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1426 }
1427 }
1428
1429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1430 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1431 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1432
c765d6e0 1433 return rx_origin;
2c2d71f5
JH
1434
1435 fail_finish: /* Substring not found */
33b8afdf 1436 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1437 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1438 fail:
a3621e74 1439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1440 PL_colors[4], PL_colors[5]));
bd61b366 1441 return NULL;
cad2e5aa 1442}
9661b544 1443
70563e16 1444
a0a388a1 1445#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1446 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1447 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1448 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1449 trie_type = ((scan->flags == EXACT) \
1450 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1451 : (scan->flags == EXACTL) \
1452 ? (utf8_target ? trie_utf8l : trie_plain) \
1453 : (scan->flags == EXACTFA) \
1454 ? (utf8_target \
1455 ? trie_utf8_exactfa_fold \
1456 : trie_latin_utf8_exactfa_fold) \
1457 : (scan->flags == EXACTFLU8 \
1458 ? trie_flu8 \
1459 : (utf8_target \
1460 ? trie_utf8_fold \
1461 : trie_latin_utf8_fold)))
fab2782b 1462
fd3249ee 1463#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1464STMT_START { \
fab2782b 1465 STRLEN skiplen; \
baa60164 1466 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1467 switch (trie_type) { \
a4525e78 1468 case trie_flu8: \
780fcc9f 1469 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1470 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1471 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1472 } \
a4525e78 1473 goto do_trie_utf8_fold; \
31f05a37 1474 case trie_utf8_exactfa_fold: \
baa60164 1475 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1476 /* FALLTHROUGH */ \
fab2782b 1477 case trie_utf8_fold: \
a4525e78 1478 do_trie_utf8_fold: \
fab2782b 1479 if ( foldlen>0 ) { \
c80e42f3 1480 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1481 foldlen -= len; \
1482 uscan += len; \
1483 len=0; \
1484 } else { \
445bf929 1485 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b 1486 len = UTF8SKIP(uc); \
5f560d8a 1487 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1488 foldlen -= skiplen; \
1489 uscan = foldbuf + skiplen; \
1490 } \
1491 break; \
baa60164
KW
1492 case trie_latin_utf8_exactfa_fold: \
1493 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1494 /* FALLTHROUGH */ \
fab2782b
YO
1495 case trie_latin_utf8_fold: \
1496 if ( foldlen>0 ) { \
c80e42f3 1497 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1498 foldlen -= len; \
1499 uscan += len; \
1500 len=0; \
1501 } else { \
1502 len = 1; \
31f05a37 1503 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1504 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1505 foldlen -= skiplen; \
1506 uscan = foldbuf + skiplen; \
1507 } \
1508 break; \
a4525e78 1509 case trie_utf8l: \
780fcc9f 1510 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1511 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1512 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1513 } \
780fcc9f 1514 /* FALLTHROUGH */ \
fab2782b 1515 case trie_utf8: \
c80e42f3 1516 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1517 break; \
1518 case trie_plain: \
1519 uvc = (UV)*uc; \
1520 len = 1; \
1521 } \
1522 if (uvc < 256) { \
1523 charid = trie->charmap[ uvc ]; \
1524 } \
1525 else { \
1526 charid = 0; \
1527 if (widecharmap) { \
1528 SV** const svpp = hv_fetch(widecharmap, \
1529 (char*)&uvc, sizeof(UV), 0); \
1530 if (svpp) \
1531 charid = (U16)SvIV(*svpp); \
1532 } \
1533 } \
4cadc6a9
YO
1534} STMT_END
1535
ae7c5b9b
KW
1536#define DUMP_EXEC_POS(li,s,doutf8) \
1537 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1538 startpos, doutf8)
1539
c84a03c5 1540#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1541STMT_START { \
1542 while (s <= e) { \
c84a03c5 1543 if ( (COND) \
fac1af77 1544 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1545 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1546 goto got_it; \
1547 s++; \
1548 } \
1549} STMT_END
1550
c84a03c5 1551#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1552STMT_START { \
9a902117 1553 while (s < strend) { \
c84a03c5 1554 CODE \
9a902117 1555 s += UTF8SKIP(s); \
4cadc6a9
YO
1556 } \
1557} STMT_END
1558
c84a03c5 1559#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1560STMT_START { \
1561 while (s < strend) { \
c84a03c5 1562 CODE \
4cadc6a9
YO
1563 s++; \
1564 } \
1565} STMT_END
1566
05bd126c
KW
1567#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1568REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1569 if (COND) { \
1570 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1571 goto got_it; \
1572 else \
1573 tmp = doevery; \
1574 } \
1575 else \
1576 tmp = 1; \
4cadc6a9
YO
1577)
1578
05bd126c
KW
1579#define REXEC_FBC_CLASS_SCAN(COND) \
1580REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1581 if (COND) { \
1582 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1583 goto got_it; \
1584 else \
1585 tmp = doevery; \
1586 } \
1587 else \
1588 tmp = 1; \
4cadc6a9
YO
1589)
1590
c84a03c5 1591#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1592 if (utf8_target) { \
c84a03c5 1593 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1594 } \
1595 else { \
c84a03c5 1596 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1597 }
05bd126c 1598
05bd126c
KW
1599/* The three macros below are slightly different versions of the same logic.
1600 *
1601 * The first is for /a and /aa when the target string is UTF-8. This can only
1602 * match ascii, but it must advance based on UTF-8. The other two handle the
1603 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1604 * for the boundary (or non-boundary) between a word and non-word character.
1605 * The utf8 and non-utf8 cases have the same logic, but the details must be
1606 * different. Find the "wordness" of the character just prior to this one, and
1607 * compare it with the wordness of this one. If they differ, we have a
1608 * boundary. At the beginning of the string, pretend that the previous
1609 * character was a new-line.
1610 *
1611 * All these macros uncleanly have side-effects with each other and outside
1612 * variables. So far it's been too much trouble to clean-up
1613 *
1614 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1615 * a word character or not.
1616 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1617 * word/non-word
1618 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1619 *
1620 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1621 * are looking for a boundary or for a non-boundary. If we are looking for a
1622 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1623 * see if this tentative match actually works, and if so, to quit the loop
1624 * here. And vice-versa if we are looking for a non-boundary.
1625 *
1626 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1627 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1628 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1629 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1630 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1631 * complement. But in that branch we complement tmp, meaning that at the
1632 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1633 * which means at the top of the loop in the next iteration, it is
1634 * TEST_NON_UTF8(s-1) */
b2f4e957 1635#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1636 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1637 tmp = TEST_NON_UTF8(tmp); \
1638 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1639 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1640 tmp = !tmp; \
1641 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1642 } \
1643 else { \
1644 IF_FAIL; \
1645 } \
1646 ); \
1647
1648/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1649 * TEST_UTF8 is a macro that for the same input code points returns identically
1650 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1651#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1652 if (s == reginfo->strbeg) { \
1653 tmp = '\n'; \
1654 } \
1655 else { /* Back-up to the start of the previous character */ \
1656 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1657 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1658 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1659 } \
1660 tmp = TEST_UV(tmp); \
1661 LOAD_UTF8_CHARCLASS_ALNUM(); \
1662 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1663 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1664 tmp = !tmp; \
1665 IF_SUCCESS; \
1666 } \
1667 else { \
1668 IF_FAIL; \
1669 } \
1670 );
cfaf538b 1671
05bd126c
KW
1672/* Like the above two macros. UTF8_CODE is the complete code for handling
1673 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1674 * macros below */
baa60164 1675#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1676 if (utf8_target) { \
05bd126c 1677 UTF8_CODE \
63ac0dad
KW
1678 } \
1679 else { /* Not utf8 */ \
9d9163fb 1680 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1681 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1682 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1683 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1684 IF_SUCCESS; \
760cfa8e 1685 tmp = !tmp; \
63ac0dad
KW
1686 } \
1687 else { \
1688 IF_FAIL; \
1689 } \
1690 ); \
1691 } \
c8519dc7
KW
1692 /* Here, things have been set up by the previous code so that tmp is the \
1693 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1694 * utf8ness of the target). We also have to check if this matches against \
1695 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1696 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1697 * string */ \
1698 if (tmp == ! TEST_NON_UTF8('\n')) { \
1699 IF_SUCCESS; \
1700 } \
1701 else { \
1702 IF_FAIL; \
1703 }
63ac0dad 1704
ae7c5b9b
KW
1705/* This is the macro to use when we want to see if something that looks like it
1706 * could match, actually does, and if so exits the loop */
1707#define REXEC_FBC_TRYIT \
1708 if ((reginfo->intuit || regtry(reginfo, &s))) \
1709 goto got_it
1710
1711/* The only difference between the BOUND and NBOUND cases is that
1712 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1713 * NBOUND. This is accomplished by passing it as either the if or else clause,
1714 * with the other one being empty (PLACEHOLDER is defined as empty).
1715 *
1716 * The TEST_FOO parameters are for operating on different forms of input, but
1717 * all should be ones that return identically for the same underlying code
1718 * points */
1719#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1720 FBC_BOUND_COMMON( \
1721 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1722 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1723
44129e46 1724#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1725 FBC_BOUND_COMMON( \
1726 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1727 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1728
1729#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1730 FBC_BOUND_COMMON( \
1731 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1732 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1733
44129e46 1734#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1735 FBC_BOUND_COMMON( \
1736 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1737 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1738
64935bc6
KW
1739/* Takes a pointer to an inversion list, a pointer to its corresponding
1740 * inversion map, and a code point, and returns the code point's value
1741 * according to the two arrays. It assumes that all code points have a value.
1742 * This is used as the base macro for macros for particular properties */
1743#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1744 invmap[_invlist_search(invlist, cp)]
1745
1746/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1747 * of a code point, returning the value for the first code point in the string.
1748 * And it takes the particular macro name that finds the desired value given a
1749 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1750#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1751 (__ASSERT_(pos < strend) \
1752 /* Note assumes is valid UTF-8 */ \
1753 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1754
1755/* Returns the GCB value for the input code point */
1756#define getGCB_VAL_CP(cp) \
1757 _generic_GET_BREAK_VAL_CP( \
1758 PL_GCB_invlist, \
02f811dd 1759 _Perl_GCB_invmap, \
64935bc6
KW
1760 (cp))
1761
1762/* Returns the GCB value for the first code point in the UTF-8 encoded string
1763 * bounded by pos and strend */
1764#define getGCB_VAL_UTF8(pos, strend) \
1765 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1766
06ae2722
KW
1767
1768/* Returns the SB value for the input code point */
1769#define getSB_VAL_CP(cp) \
1770 _generic_GET_BREAK_VAL_CP( \
1771 PL_SB_invlist, \
bf4268fa 1772 _Perl_SB_invmap, \
06ae2722
KW
1773 (cp))
1774
1775/* Returns the SB value for the first code point in the UTF-8 encoded string
1776 * bounded by pos and strend */
1777#define getSB_VAL_UTF8(pos, strend) \
1778 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1779
ae3bb8ea
KW
1780/* Returns the WB value for the input code point */
1781#define getWB_VAL_CP(cp) \
1782 _generic_GET_BREAK_VAL_CP( \
1783 PL_WB_invlist, \
bf4268fa 1784 _Perl_WB_invmap, \
ae3bb8ea
KW
1785 (cp))
1786
1787/* Returns the WB value for the first code point in the UTF-8 encoded string
1788 * bounded by pos and strend */
1789#define getWB_VAL_UTF8(pos, strend) \
1790 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1791
786e8c11 1792/* We know what class REx starts with. Try to find this position... */
02d5137b 1793/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1794/* annoyingly all the vars in this routine have different names from their counterparts
1795 in regmatch. /grrr */
3c3eec57 1796STATIC char *
07be1b83 1797S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1798 const char *strend, regmatch_info *reginfo)
a687059c 1799{
73104a1b
KW
1800 dVAR;
1801 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1802 char *pat_string; /* The pattern's exactish string */
1803 char *pat_end; /* ptr to end char of pat_string */
1804 re_fold_t folder; /* Function for computing non-utf8 folds */
1805 const U8 *fold_array; /* array for folding ords < 256 */
1806 STRLEN ln;
1807 STRLEN lnc;
73104a1b
KW
1808 U8 c1;
1809 U8 c2;
1810 char *e;
1811 I32 tmp = 1; /* Scratch variable? */
ba44c216 1812 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1813 UV utf8_fold_flags = 0;
f9176b44 1814 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1815 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1816 with a result inverts that result, as 0^1 =
1817 1 and 1^1 = 0 */
1818 _char_class_number classnum;
1819
73104a1b 1820 RXi_GET_DECL(prog,progi);
2f7f8cb1 1821
73104a1b 1822 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1823
73104a1b
KW
1824 /* We know what class it must start with. */
1825 switch (OP(c)) {
a4525e78 1826 case ANYOFL:
780fcc9f 1827 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30
KW
1828
1829 if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
1830 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1831 }
1832
780fcc9f 1833 /* FALLTHROUGH */
ac44c12e 1834 case ANYOFD:
73104a1b
KW
1835 case ANYOF:
1836 if (utf8_target) {
1837 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1838 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1839 }
1840 else {
1841 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1842 }
1843 break;
73104a1b 1844
098b07d5
KW
1845 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1846 assert(! is_utf8_pat);
924ba076 1847 /* FALLTHROUGH */
73104a1b 1848 case EXACTFA:
984e6dd1 1849 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1850 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1851 goto do_exactf_utf8;
1852 }
1853 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1854 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1855 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1856
2fdb7295
KW
1857 case EXACTF: /* This node only generated for non-utf8 patterns */
1858 assert(! is_utf8_pat);
73104a1b 1859 if (utf8_target) {
73104a1b
KW
1860 utf8_fold_flags = 0;
1861 goto do_exactf_utf8;
1862 }
1863 fold_array = PL_fold;
1864 folder = foldEQ;
1865 goto do_exactf_non_utf8;
1866
1867 case EXACTFL:
780fcc9f 1868 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 1869 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1870 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1871 goto do_exactf_utf8;
1872 }
1873 fold_array = PL_fold_locale;
1874 folder = foldEQ_locale;
1875 goto do_exactf_non_utf8;
3c760661 1876
73104a1b 1877 case EXACTFU_SS:
984e6dd1 1878 if (is_utf8_pat) {
73104a1b
KW
1879 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1880 }
1881 goto do_exactf_utf8;
16d951b7 1882
a4525e78
KW
1883 case EXACTFLU8:
1884 if (! utf8_target) { /* All code points in this node require
1885 UTF-8 to express. */
1886 break;
1887 }
613abc6d
KW
1888 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1889 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
1890 goto do_exactf_utf8;
1891
73104a1b 1892 case EXACTFU:
984e6dd1
DM
1893 if (is_utf8_pat || utf8_target) {
1894 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1895 goto do_exactf_utf8;
1896 }
fac1af77 1897
73104a1b
KW
1898 /* Any 'ss' in the pattern should have been replaced by regcomp,
1899 * so we don't have to worry here about this single special case
1900 * in the Latin1 range */
1901 fold_array = PL_fold_latin1;
1902 folder = foldEQ_latin1;
1903
924ba076 1904 /* FALLTHROUGH */
73104a1b 1905
c52b8b12 1906 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
1907 are no glitches with fold-length differences
1908 between the target string and pattern */
1909
1910 /* The idea in the non-utf8 EXACTF* cases is to first find the
1911 * first character of the EXACTF* node and then, if necessary,
1912 * case-insensitively compare the full text of the node. c1 is the
1913 * first character. c2 is its fold. This logic will not work for
1914 * Unicode semantics and the german sharp ss, which hence should
1915 * not be compiled into a node that gets here. */
1916 pat_string = STRING(c);
1917 ln = STR_LEN(c); /* length to match in octets/bytes */
1918
1919 /* We know that we have to match at least 'ln' bytes (which is the
1920 * same as characters, since not utf8). If we have to match 3
1921 * characters, and there are only 2 availabe, we know without
1922 * trying that it will fail; so don't start a match past the
1923 * required minimum number from the far end */
ea3daa5d 1924 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1925
02d5137b 1926 if (reginfo->intuit && e < s) {
73104a1b
KW
1927 e = s; /* Due to minlen logic of intuit() */
1928 }
fac1af77 1929
73104a1b
KW
1930 c1 = *pat_string;
1931 c2 = fold_array[c1];
1932 if (c1 == c2) { /* If char and fold are the same */
1933 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1934 }
1935 else {
1936 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1937 }
1938 break;
fac1af77 1939
c52b8b12
KW
1940 do_exactf_utf8:
1941 {
73104a1b
KW
1942 unsigned expansion;
1943
1944 /* If one of the operands is in utf8, we can't use the simpler folding
1945 * above, due to the fact that many different characters can have the
1946 * same fold, or portion of a fold, or different- length fold */
1947 pat_string = STRING(c);
1948 ln = STR_LEN(c); /* length to match in octets/bytes */
1949 pat_end = pat_string + ln;
984e6dd1 1950 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1951 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1952 : ln;
1953
1954 /* We have 'lnc' characters to match in the pattern, but because of
1955 * multi-character folding, each character in the target can match
1956 * up to 3 characters (Unicode guarantees it will never exceed
1957 * this) if it is utf8-encoded; and up to 2 if not (based on the
1958 * fact that the Latin 1 folds are already determined, and the
1959 * only multi-char fold in that range is the sharp-s folding to
1960 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1961 * string character. Adjust lnc accordingly, rounding up, so that
1962 * if we need to match at least 4+1/3 chars, that really is 5. */
1963 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1964 lnc = (lnc + expansion - 1) / expansion;
1965
1966 /* As in the non-UTF8 case, if we have to match 3 characters, and
1967 * only 2 are left, it's guaranteed to fail, so don't start a
1968 * match that would require us to go beyond the end of the string
1969 */
ea3daa5d 1970 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1971
02d5137b 1972 if (reginfo->intuit && e < s) {
73104a1b
KW
1973 e = s; /* Due to minlen logic of intuit() */
1974 }
0658cdde 1975
73104a1b
KW
1976 /* XXX Note that we could recalculate e to stop the loop earlier,
1977 * as the worst case expansion above will rarely be met, and as we
1978 * go along we would usually find that e moves further to the left.
1979 * This would happen only after we reached the point in the loop
1980 * where if there were no expansion we should fail. Unclear if
1981 * worth the expense */
1982
1983 while (s <= e) {
1984 char *my_strend= (char *)strend;
1985 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1986 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1987 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1988 {
1989 goto got_it;
1990 }
1991 s += (utf8_target) ? UTF8SKIP(s) : 1;
1992 }
1993 break;
1994 }
236d82fd 1995
73104a1b 1996 case BOUNDL:
780fcc9f 1997 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 1998 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
1999 if (! IN_UTF8_CTYPE_LOCALE) {
2000 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2001 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2002 }
64935bc6
KW
2003 goto do_boundu;
2004 }
2005
236d82fd 2006 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2007 break;
64935bc6 2008
73104a1b 2009 case NBOUNDL:
780fcc9f 2010 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2011 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2012 if (! IN_UTF8_CTYPE_LOCALE) {
2013 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2014 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2015 }
64935bc6
KW
2016 goto do_nboundu;
2017 }
2018
236d82fd 2019 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2020 break;
64935bc6
KW
2021
2022 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2023 meaning */
2024 assert(FLAGS(c) == TRADITIONAL_BOUND);
2025
236d82fd 2026 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2027 break;
64935bc6
KW
2028
2029 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2030 meaning */
2031 assert(FLAGS(c) == TRADITIONAL_BOUND);
2032
44129e46 2033 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2034 break;
64935bc6
KW
2035
2036 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2037 meaning */
2038 assert(FLAGS(c) == TRADITIONAL_BOUND);
2039
236d82fd 2040 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2041 break;
64935bc6
KW
2042
2043 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2044 meaning */
2045 assert(FLAGS(c) == TRADITIONAL_BOUND);
2046
44129e46 2047 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2048 break;
64935bc6 2049
73104a1b 2050 case NBOUNDU:
64935bc6
KW
2051 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2052 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2053 break;
2054 }
2055
2056 do_nboundu:
2057
2058 to_complement = 1;
2059 /* FALLTHROUGH */
2060
2061 case BOUNDU:
2062 do_boundu:
2063 switch((bound_type) FLAGS(c)) {
2064 case TRADITIONAL_BOUND:
2065 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2066 break;
2067 case GCB_BOUND:
2068 if (s == reginfo->strbeg) { /* GCB always matches at begin and
2069 end */
2070 if (to_complement ^ cBOOL(reginfo->intuit
2071 || regtry(reginfo, &s)))
2072 {
2073 goto got_it;
2074 }
2075 s += (utf8_target) ? UTF8SKIP(s) : 1;
2076 }
2077
2078 if (utf8_target) {
85e5f08b 2079 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2080 reghop3((U8*)s, -1,
2081 (U8*)(reginfo->strbeg)),
2082 (U8*) reginfo->strend);
2083 while (s < strend) {
85e5f08b 2084 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6
KW
2085 (U8*) reginfo->strend);
2086 if (to_complement ^ isGCB(before, after)) {
2087 if (reginfo->intuit || regtry(reginfo, &s)) {
2088 goto got_it;
2089 }
2090 before = after;
2091 }
2092 s += UTF8SKIP(s);
2093 }
2094 }
2095 else { /* Not utf8. Everything is a GCB except between CR and
2096 LF */
2097 while (s < strend) {
2098 if (to_complement ^ (UCHARAT(s - 1) != '\r'
2099 || UCHARAT(s) != '\n'))
2100 {
2101 if (reginfo->intuit || regtry(reginfo, &s)) {
2102 goto got_it;
2103 }
2104 s++;
2105 }
2106 }
2107 }
2108
2109 if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2110 goto got_it;
2111 }
2112 break;
ae3bb8ea 2113
06ae2722
KW
2114 case SB_BOUND:
2115 if (s == reginfo->strbeg) { /* SB always matches at beginning */
2116 if (to_complement
2117 ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2118 {
2119 goto got_it;
2120 }
2121
2122 /* Didn't match. Go try at the next position */
2123 s += (utf8_target) ? UTF8SKIP(s) : 1;
2124 }
2125
2126 if (utf8_target) {
85e5f08b 2127 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2128 -1,
2129 (U8*)(reginfo->strbeg)),
2130 (U8*) reginfo->strend);
2131 while (s < strend) {
85e5f08b 2132 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722
KW
2133 (U8*) reginfo->strend);
2134 if (to_complement ^ isSB(before,
2135 after,
2136 (U8*) reginfo->strbeg,
2137 (U8*) s,
2138 (U8*) reginfo->strend,
2139 utf8_target))
2140 {
2141 if (reginfo->intuit || regtry(reginfo, &s)) {
2142 goto got_it;
2143 }
2144 before = after;
2145 }
2146 s += UTF8SKIP(s);
2147 }
2148 }
2149 else { /* Not utf8. */
85e5f08b 2150 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2151 while (s < strend) {
85e5f08b 2152 SB_enum after = getSB_VAL_CP((U8) *s);
06ae2722
KW
2153 if (to_complement ^ isSB(before,
2154 after,
2155 (U8*) reginfo->strbeg,
2156 (U8*) s,
2157 (U8*) reginfo->strend,
2158 utf8_target))
2159 {
2160 if (reginfo->intuit || regtry(reginfo, &s)) {
2161 goto got_it;
2162 }
2163 before = after;
2164 }
2165 s++;
2166 }
2167 }
2168
2169 /* Here are at the final position in the target string. The SB
2170 * value is always true here, so matches, depending on other
2171 * constraints */
2172 if (to_complement ^ cBOOL(reginfo->intuit
2173 || regtry(reginfo, &s)))
2174 {
2175 goto got_it;
2176 }
2177
2178 break;
2179
ae3bb8ea
KW
2180 case WB_BOUND:
2181 if (s == reginfo->strbeg) {
2182 if (to_complement ^ cBOOL(reginfo->intuit
2183 || regtry(reginfo, &s)))
2184 {
2185 goto got_it;
2186 }
2187 s += (utf8_target) ? UTF8SKIP(s) : 1;
2188 }
2189
2190 if (utf8_target) {
2191 /* We are at a boundary between char_sub_0 and char_sub_1.
2192 * We also keep track of the value for char_sub_-1 as we
2193 * loop through the line. Context may be needed to make a
2194 * determination, and if so, this can save having to
2195 * recalculate it */
85e5f08b
KW
2196 WB_enum previous = WB_UNKNOWN;
2197 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2198 reghop3((U8*)s,
2199 -1,
2200 (U8*)(reginfo->strbeg)),
2201 (U8*) reginfo->strend);
2202 while (s < strend) {
85e5f08b 2203 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea
KW
2204 (U8*) reginfo->strend);
2205 if (to_complement ^ isWB(previous,
2206 before,
2207 after,
2208 (U8*) reginfo->strbeg,
2209 (U8*) s,
2210 (U8*) reginfo->strend,
2211 utf8_target))
2212 {
2213 if (reginfo->intuit || regtry(reginfo, &s)) {
2214 goto got_it;
2215 }
2216 previous = before;
2217 before = after;
2218 }
2219 s += UTF8SKIP(s);
2220 }
2221 }
2222 else { /* Not utf8. */
85e5f08b
KW
2223 WB_enum previous = WB_UNKNOWN;
2224 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2225 while (s < strend) {
85e5f08b 2226 WB_enum after = getWB_VAL_CP((U8) *s);
ae3bb8ea
KW
2227 if (to_complement ^ isWB(previous,
2228 before,
2229 after,
2230 (U8*) reginfo->strbeg,
2231 (U8*) s,
2232 (U8*) reginfo->strend,
2233 utf8_target))
2234 {
2235 if (reginfo->intuit || regtry(reginfo, &s)) {
2236 goto got_it;
2237 }
2238 previous = before;
2239 before = after;
2240 }
2241 s++;
2242 }
2243 }
2244
2245 if (to_complement ^ cBOOL(reginfo->intuit
2246 || regtry(reginfo, &s)))
2247 {
2248 goto got_it;
2249 }
2250
2251 break;
64935bc6 2252 }
73104a1b 2253 break;
64935bc6 2254
73104a1b
KW
2255 case LNBREAK:
2256 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2257 is_LNBREAK_latin1_safe(s, strend)
2258 );
2259 break;
3018b823
KW
2260
2261 /* The argument to all the POSIX node types is the class number to pass to
2262 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2263
2264 case NPOSIXL:
2265 to_complement = 1;
2266 /* FALLTHROUGH */
2267
2268 case POSIXL:
780fcc9f 2269 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2270 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2271 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2272 break;
3018b823
KW
2273
2274 case NPOSIXD:
2275 to_complement = 1;
2276 /* FALLTHROUGH */
2277
2278 case POSIXD:
2279 if (utf8_target) {
2280 goto posix_utf8;
2281 }
2282 goto posixa;
2283
2284 case NPOSIXA:
2285 if (utf8_target) {
2286 /* The complement of something that matches only ASCII matches all
837226c8
KW
2287 * non-ASCII, plus everything in ASCII that isn't in the class. */
2288 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
2289 || ! _generic_isCC_A(*s, FLAGS(c)));
2290 break;
2291 }
2292
2293 to_complement = 1;
2294 /* FALLTHROUGH */
2295
73104a1b 2296 case POSIXA:
3018b823 2297 posixa:
73104a1b 2298 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
2299 * byte invariant character. */
2300 REXEC_FBC_CLASS_SCAN(
2301 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2302 break;
3018b823
KW
2303
2304 case NPOSIXU:
2305 to_complement = 1;
2306 /* FALLTHROUGH */
2307
2308 case POSIXU:
2309 if (! utf8_target) {
2310 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2311 FLAGS(c))));
2312 }
2313 else {
2314
c52b8b12 2315 posix_utf8:
3018b823
KW
2316 classnum = (_char_class_number) FLAGS(c);
2317 if (classnum < _FIRST_NON_SWASH_CC) {
2318 while (s < strend) {
2319
2320 /* We avoid loading in the swash as long as possible, but
2321 * should we have to, we jump to a separate loop. This
2322 * extra 'if' statement is what keeps this code from being
2323 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2324 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2325 goto found_above_latin1;
2326 }
2327 if ((UTF8_IS_INVARIANT(*s)
2328 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2329 classnum)))
2330 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2331 && to_complement ^ cBOOL(
a62b247b 2332 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2333 *(s + 1)),
3018b823
KW
2334 classnum))))
2335 {
02d5137b 2336 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2337 goto got_it;
2338 else {
2339 tmp = doevery;
2340 }
2341 }
2342 else {
2343 tmp = 1;
2344 }
2345 s += UTF8SKIP(s);
2346 }
2347 }
2348 else switch (classnum) { /* These classes are implemented as
2349 macros */
779cf272 2350 case _CC_ENUM_SPACE:
3018b823
KW
2351 REXEC_FBC_UTF8_CLASS_SCAN(
2352 to_complement ^ cBOOL(isSPACE_utf8(s)));
2353 break;
2354
2355 case _CC_ENUM_BLANK:
2356 REXEC_FBC_UTF8_CLASS_SCAN(
2357 to_complement ^ cBOOL(isBLANK_utf8(s)));
2358 break;
2359
2360 case _CC_ENUM_XDIGIT:
2361 REXEC_FBC_UTF8_CLASS_SCAN(
2362 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2363 break;
2364
2365 case _CC_ENUM_VERTSPACE:
2366 REXEC_FBC_UTF8_CLASS_SCAN(
2367 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2368 break;
2369
2370 case _CC_ENUM_CNTRL:
2371 REXEC_FBC_UTF8_CLASS_SCAN(
2372 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2373 break;
2374
2375 default:
2376 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2377 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2378 }
2379 }
2380 break;
2381
2382 found_above_latin1: /* Here we have to load a swash to get the result
2383 for the current code point */
2384 if (! PL_utf8_swash_ptrs[classnum]) {
2385 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2386 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2387 _core_swash_init("utf8",
2388 "",
2389 &PL_sv_undef, 1, 0,
2390 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2391 }
2392
2393 /* This is a copy of the loop above for swash classes, though using the
2394 * FBC macro instead of being expanded out. Since we've loaded the
2395 * swash, we don't have to check for that each time through the loop */
2396 REXEC_FBC_UTF8_CLASS_SCAN(
2397 to_complement ^ cBOOL(_generic_utf8(
2398 classnum,
2399 s,
2400 swash_fetch(PL_utf8_swash_ptrs[classnum],
2401 (U8 *) s, TRUE))));
73104a1b
KW
2402 break;
2403
2404 case AHOCORASICKC:
2405 case AHOCORASICK:
2406 {
2407 DECL_TRIE_TYPE(c);
2408 /* what trie are we using right now */
2409 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2410 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2411 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2412
2413 const char *last_start = strend - trie->minlen;
6148ee25 2414#ifdef DEBUGGING
73104a1b 2415 const char *real_start = s;
6148ee25 2416#endif
73104a1b
KW
2417 STRLEN maxlen = trie->maxlen;
2418 SV *sv_points;
2419 U8 **points; /* map of where we were in the input string
2420 when reading a given char. For ASCII this
2421 is unnecessary overhead as the relationship
2422 is always 1:1, but for Unicode, especially
2423 case folded Unicode this is not true. */
2424 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2425 U8 *bitmap=NULL;
2426
2427
2428 GET_RE_DEBUG_FLAGS_DECL;
2429
2430 /* We can't just allocate points here. We need to wrap it in
2431 * an SV so it gets freed properly if there is a croak while
2432 * running the match */
2433 ENTER;
2434 SAVETMPS;
2435 sv_points=newSV(maxlen * sizeof(U8 *));
2436 SvCUR_set(sv_points,
2437 maxlen * sizeof(U8 *));
2438 SvPOK_on(sv_points);
2439 sv_2mortal(sv_points);
2440 points=(U8**)SvPV_nolen(sv_points );
2441 if ( trie_type != trie_utf8_fold
2442 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2443 {
2444 if (trie->bitmap)
2445 bitmap=(U8*)trie->bitmap;
2446 else
2447 bitmap=(U8*)ANYOF_BITMAP(c);
2448 }
2449 /* this is the Aho-Corasick algorithm modified a touch
2450 to include special handling for long "unknown char" sequences.
2451 The basic idea being that we use AC as long as we are dealing
2452 with a possible matching char, when we encounter an unknown char
2453 (and we have not encountered an accepting state) we scan forward
2454 until we find a legal starting char.
2455 AC matching is basically that of trie matching, except that when
2456 we encounter a failing transition, we fall back to the current
2457 states "fail state", and try the current char again, a process
2458 we repeat until we reach the root state, state 1, or a legal
2459 transition. If we fail on the root state then we can either
2460 terminate if we have reached an accepting state previously, or
2461 restart the entire process from the beginning if we have not.
2462
2463 */
2464 while (s <= last_start) {
2465 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2466 U8 *uc = (U8*)s;
2467 U16 charid = 0;
2468 U32 base = 1;
2469 U32 state = 1;
2470 UV uvc = 0;
2471 STRLEN len = 0;
2472 STRLEN foldlen = 0;
2473 U8 *uscan = (U8*)NULL;
2474 U8 *leftmost = NULL;
2475#ifdef DEBUGGING
2476 U32 accepted_word= 0;
786e8c11 2477#endif
73104a1b
KW
2478 U32 pointpos = 0;
2479
2480 while ( state && uc <= (U8*)strend ) {
2481 int failed=0;
2482 U32 word = aho->states[ state ].wordnum;
2483
2484 if( state==1 ) {
2485 if ( bitmap ) {
2486 DEBUG_TRIE_EXECUTE_r(
2487 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2488 dump_exec_pos( (char *)uc, c, strend, real_start,
2489 (char *)uc, utf8_target );
2490 PerlIO_printf( Perl_debug_log,
2491 " Scanning for legal start char...\n");
2492 }
2493 );
2494 if (utf8_target) {
2495 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2496 uc += UTF8SKIP(uc);
2497 }
2498 } else {
2499 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2500 uc++;
2501 }
786e8c11 2502 }
73104a1b 2503 s= (char *)uc;
07be1b83 2504 }
73104a1b
KW
2505 if (uc >(U8*)last_start) break;
2506 }
2507
2508 if ( word ) {
2509 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2510 if (!leftmost || lpos < leftmost) {
2511 DEBUG_r(accepted_word=word);
2512 leftmost= lpos;
7016d6eb 2513 }
73104a1b 2514 if (base==0) break;
7016d6eb 2515
73104a1b
KW
2516 }
2517 points[pointpos++ % maxlen]= uc;
2518 if (foldlen || uc < (U8*)strend) {
2519 REXEC_TRIE_READ_CHAR(trie_type, trie,
2520 widecharmap, uc,
2521 uscan, len, uvc, charid, foldlen,
2522 foldbuf, uniflags);
2523 DEBUG_TRIE_EXECUTE_r({
2524 dump_exec_pos( (char *)uc, c, strend,
2525 real_start, s, utf8_target);
2526 PerlIO_printf(Perl_debug_log,
2527 " Charid:%3u CP:%4"UVxf" ",
2528 charid, uvc);
2529 });
2530 }
2531 else {
2532 len = 0;
2533 charid = 0;
2534 }
07be1b83 2535
73104a1b
KW
2536
2537 do {
6148ee25 2538#ifdef DEBUGGING
73104a1b 2539 word = aho->states[ state ].wordnum;
6148ee25 2540#endif
73104a1b
KW
2541 base = aho->states[ state ].trans.base;
2542
2543 DEBUG_TRIE_EXECUTE_r({
2544 if (failed)
2545 dump_exec_pos( (char *)uc, c, strend, real_start,
2546 s, utf8_target );
2547 PerlIO_printf( Perl_debug_log,
2548 "%sState: %4"UVxf", word=%"UVxf,
2549 failed ? " Fail transition to " : "",
2550 (UV)state, (UV)word);
2551 });
2552 if ( base ) {
2553 U32 tmp;
2554 I32 offset;
2555 if (charid &&
2556 ( ((offset = base + charid
2557 - 1 - trie->uniquecharcount)) >= 0)
2558 && ((U32)offset < trie->lasttrans)
2559 && trie->trans[offset].check == state
2560 && (tmp=trie->trans[offset].next))
2561 {
2562 DEBUG_TRIE_EXECUTE_r(
2563 PerlIO_printf( Perl_debug_log," - legal\n"));
2564 state = tmp;
2565 break;
07be1b83
YO
2566 }
2567 else {
786e8c11 2568 DEBUG_TRIE_EXECUTE_r(
73104a1b 2569 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2570 failed = 1;
73104a1b 2571 state = aho->fail[state];
07be1b83 2572 }
07be1b83 2573 }
73104a1b
KW
2574 else {
2575 /* we must be accepting here */
2576 DEBUG_TRIE_EXECUTE_r(
2577 PerlIO_printf( Perl_debug_log," - accepting\n"));
2578 failed = 1;
2579 break;
786e8c11 2580 }
73104a1b
KW
2581 } while(state);
2582 uc += len;
2583 if (failed) {
2584 if (leftmost)
2585 break;
2586 if (!state) state = 1;
07be1b83 2587 }
73104a1b
KW
2588 }
2589 if ( aho->states[ state ].wordnum ) {
2590 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2591 if (!leftmost || lpos < leftmost) {
2592 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2593 leftmost = lpos;
07be1b83
YO
2594 }
2595 }
73104a1b
KW
2596 if (leftmost) {
2597 s = (char*)leftmost;
2598 DEBUG_TRIE_EXECUTE_r({
2599 PerlIO_printf(
2600 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2601 (UV)accepted_word, (IV)(s - real_start)
2602 );
2603 });
02d5137b 2604 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2605 FREETMPS;
2606 LEAVE;
2607 goto got_it;
2608 }
2609 s = HOPc(s,1);
2610 DEBUG_TRIE_EXECUTE_r({
2611 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2612 });
2613 } else {
2614 DEBUG_TRIE_EXECUTE_r(
2615 PerlIO_printf( Perl_debug_log,"No match.\n"));
2616 break;
2617 }
2618 }
2619 FREETMPS;
2620 LEAVE;
2621 }
2622 break;
2623 default:
2624 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2625 }
2626 return 0;
2627 got_it:
2628 return s;
6eb5f6b9
JH
2629}
2630
60165aa4
DM
2631/* set RX_SAVED_COPY, RX_SUBBEG etc.
2632 * flags have same meanings as with regexec_flags() */
2633
749f4950
DM
2634static void
2635S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2636 char *strbeg,
2637 char *strend,
2638 SV *sv,
2639 U32 flags,
2640 bool utf8_target)
2641{
2642 struct regexp *const prog = ReANY(rx);
2643
60165aa4
DM
2644 if (flags & REXEC_COPY_STR) {
2645#ifdef PERL_ANY_COW
2646 if (SvCANCOW(sv)) {
2647 if (DEBUG_C_TEST) {
2648 PerlIO_printf(Perl_debug_log,
2649 "Copy on write: regexp capture, type %d\n",
2650 (int) SvTYPE(sv));
2651 }
5411a0e5
DM
2652 /* Create a new COW SV to share the match string and store
2653 * in saved_copy, unless the current COW SV in saved_copy
2654 * is valid and suitable for our purpose */
2655 if (( prog->saved_copy
2656 && SvIsCOW(prog->saved_copy)
2657 && SvPOKp(prog->saved_copy)
2658 && SvIsCOW(sv)
2659 && SvPOKp(sv)
2660 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2661 {
5411a0e5
DM
2662 /* just reuse saved_copy SV */
2663 if (RXp_MATCH_COPIED(prog)) {
2664 Safefree(prog->subbeg);
2665 RXp_MATCH_COPIED_off(prog);
2666 }
2667 }
2668 else {
2669 /* create new COW SV to share string */
a76b0e90
DM
2670 RX_MATCH_COPY_FREE(rx);
2671 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2672 }
5411a0e5
DM
2673 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2674 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2675 prog->sublen = strend - strbeg;
2676 prog->suboffset = 0;
2677 prog->subcoffset = 0;
2678 } else
2679#endif
2680 {
99a90e59
FC
2681 SSize_t min = 0;
2682 SSize_t max = strend - strbeg;
ea3daa5d 2683 SSize_t sublen;
60165aa4
DM
2684
2685 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2686 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2687 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2688 ) { /* don't copy $' part of string */
2689 U32 n = 0;
2690 max = -1;
2691 /* calculate the right-most part of the string covered
2692 * by a capture. Due to look-ahead, this may be to
2693 * the right of $&, so we have to scan all captures */
2694 while (n <= prog->lastparen) {
2695 if (prog->offs[n].end > max)
2696 max = prog->offs[n].end;
2697 n++;
2698 }
2699 if (max == -1)
2700 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2701 ? prog->offs[0].start
2702 : 0;
2703 assert(max >= 0 && max <= strend - strbeg);
2704 }
2705
2706 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2707 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2708 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2709 ) { /* don't copy $` part of string */
2710 U32 n = 0;
2711 min = max;
2712 /* calculate the left-most part of the string covered
2713 * by a capture. Due to look-behind, this may be to
2714 * the left of $&, so we have to scan all captures */
2715 while (min && n <= prog->lastparen) {
2716 if ( prog->offs[n].start != -1
2717 && prog->offs[n].start < min)
2718 {
2719 min = prog->offs[n].start;
2720 }
2721 n++;
2722 }
2723 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2724 && min > prog->offs[0].end
2725 )
2726 min = prog->offs[0].end;
2727
2728 }
2729
2730 assert(min >= 0 && min <= max && min <= strend - strbeg);
2731 sublen = max - min;
2732
2733 if (RX_MATCH_COPIED(rx)) {
2734 if (sublen > prog->sublen)
2735 prog->subbeg =
2736 (char*)saferealloc(prog->subbeg, sublen+1);
2737 }
2738 else
2739 prog->subbeg = (char*)safemalloc(sublen+1);
2740 Copy(strbeg + min, prog->subbeg, sublen, char);
2741 prog->subbeg[sublen] = '\0';
2742 prog->suboffset = min;
2743 prog->sublen = sublen;
2744 RX_MATCH_COPIED_on(rx);
2745 }
2746 prog->subcoffset = prog->suboffset;
2747 if (prog->suboffset && utf8_target) {
2748 /* Convert byte offset to chars.
2749 * XXX ideally should only compute this if @-/@+
2750 * has been seen, a la PL_sawampersand ??? */
2751
2752 /* If there's a direct correspondence between the
2753 * string which we're matching and the original SV,
2754 * then we can use the utf8 len cache associated with
2755 * the SV. In particular, it means that under //g,
2756 * sv_pos_b2u() will use the previously cached
2757 * position to speed up working out the new length of
2758 * subcoffset, rather than counting from the start of
2759 * the string each time. This stops
2760 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2761 * from going quadratic */
2762 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2763 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2764 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2765 else
2766 prog->subcoffset = utf8_length((U8*)strbeg,
2767 (U8*)(strbeg+prog->suboffset));
2768 }
2769 }
2770 else {
2771 RX_MATCH_COPY_FREE(rx);
2772 prog->subbeg = strbeg;
2773 prog->suboffset = 0;
2774 prog->subcoffset = 0;
2775 prog->sublen = strend - strbeg;
2776 }
2777}
2778
2779
2780
fae667d5 2781
6eb5f6b9
JH
2782/*
2783 - regexec_flags - match a regexp against a string
2784 */
2785I32
5aaab254 2786Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2787 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2788/* stringarg: the point in the string at which to begin matching */
2789/* strend: pointer to null at end of string */
2790/* strbeg: real beginning of string */
2791/* minend: end of match must be >= minend bytes after stringarg. */
2792/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2793 * itself is accessed via the pointers above */
2794/* data: May be used for some additional optimizations.
d058ec57 2795 Currently unused. */
a340edde 2796/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2797
6eb5f6b9 2798{
8d919b0a 2799 struct regexp *const prog = ReANY(rx);
5aaab254 2800 char *s;
eb578fdb 2801 regnode *c;
03c83e26 2802 char *startpos;
ea3daa5d
FC
2803 SSize_t minlen; /* must match at least this many chars */
2804 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2805 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2806 I32 multiline;
f8fc2ecf 2807 RXi_GET_DECL(prog,progi);
02d5137b
DM
2808 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2809 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2810 regexp_paren_pair *swap = NULL;
006f26b2 2811 I32 oldsave;
a3621e74
YO
2812 GET_RE_DEBUG_FLAGS_DECL;
2813
7918f24d 2814 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2815 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2816
2817 /* Be paranoid... */
3dc78631 2818 if (prog == NULL) {
6eb5f6b9 2819 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2820 }
2821
6c3fea77 2822 DEBUG_EXECUTE_r(
03c83e26 2823 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2824 "Matching");
2825 );
8adc0f72 2826
b342a604
DM
2827 startpos = stringarg;
2828
58430ea8 2829 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2830 MAGIC *mg;
2831
fef7148b
DM
2832 /* set reginfo->ganch, the position where \G can match */
2833
2834 reginfo->ganch =
2835 (flags & REXEC_IGNOREPOS)
2836 ? stringarg /* use start pos rather than pos() */
3dc78631 2837 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2838 /* Defined pos(): */
2839 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2840 : strbeg; /* pos() not defined; use start of string */
2841
2842 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2843 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2844
03c83e26
DM
2845 /* in the presence of \G, we may need to start looking earlier in
2846 * the string than the suggested start point of stringarg:
0b2c2a84 2847 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2848 * offset, such as
2849 * /..\G/: gofs = 2
2850 * /ab|c\G/: gofs = 1
2851 * or if the minimum offset isn't known, then we have to go back
2852 * to the start of the string, e.g. /w+\G/
2853 */
2bfbe302 2854
8e1490ee 2855 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2856 startpos = reginfo->ganch - prog->gofs;
2857 if (startpos <
2858 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2859 {
2860 DEBUG_r(PerlIO_printf(Perl_debug_log,
2861 "fail: ganch-gofs before earliest possible start\n"));
2862 return 0;
2863 }
2864 }
2865 else if (prog->gofs) {
b342a604
DM
2866 if (startpos - prog->gofs < strbeg)
2867 startpos = strbeg;
2868 else
2869 startpos -= prog->gofs;
03c83e26 2870 }
58430ea8 2871 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2872 startpos = strbeg;
03c83e26
DM
2873 }
2874
2875 minlen = prog->minlen;
b342a604 2876 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2877 DEBUG_r(PerlIO_printf(Perl_debug_log,
2878 "Regex match can't succeed, so not even tried\n"));
2879 return 0;
2880 }
2881
63a3746a
DM
2882 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2883 * which will call destuctors to reset PL_regmatch_state, free higher
2884 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2885 * regmatch_info_aux_eval */
2886
2887 oldsave = PL_savestack_ix;
2888
dfa77d06
DM
2889 s = startpos;
2890
e322109a 2891 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2892 && !(flags & REXEC_CHECKED))
2893 {
dfa77d06 2894 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2895 flags, NULL);
dfa77d06 2896 if (!s)
7fadf4a7
DM
2897 return 0;
2898
e322109a 2899 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2900 /* we can match based purely on the result of INTUIT.
2901 * Set up captures etc just for $& and $-[0]
2902 * (an intuit-only match wont have $1,$2,..) */
2903 assert(!prog->nparens);
d5e7783a
DM
2904
2905 /* s/// doesn't like it if $& is earlier than where we asked it to
2906 * start searching (which can happen on something like /.\G/) */
2907 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2908 && (s < stringarg))
2909 {
2910 /* this should only be possible under \G */
58430ea8 2911 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2913 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2914 goto phooey;
2915 }
2916
7fadf4a7
DM
2917 /* match via INTUIT shouldn't have any captures.
2918 * Let @-, @+, $^N know */
2919 prog->lastparen = prog->lastcloseparen = 0;
2920 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2921 prog->offs[0].start = s - strbeg;
2922 prog->offs[0].end = utf8_target
2923 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2924 : s - strbeg + prog->minlenret;
7fadf4a7 2925 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2926 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2927 strbeg, strend,
2928 sv, flags, utf8_target);
2929
7fadf4a7
DM
2930 return 1;
2931 }
2932 }
2933
6c3fea77 2934 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2935
dfa77d06 2936 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2938 "String too short [regexec_flags]...\n"));
2939 goto phooey;
1aa99e6b 2940 }
1de06328 2941
6eb5f6b9 2942 /* Check validity of program. */
f8fc2ecf 2943 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2944 Perl_croak(aTHX_ "corrupted regexp program");
2945 }
2946
1738e041 2947 RX_MATCH_TAINTED_off(rx);
ab4e48c1 2948 RX_MATCH_UTF8_set(rx, utf8_target);
1738e041 2949
6c3fea77
DM
2950 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2951 reginfo->intuit = 0;
2952 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2953 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2954 reginfo->warned = FALSE;
9d9163fb 2955 reginfo->strbeg = strbeg;
02d5137b 2956 reginfo->sv = sv;
1cb48e53 2957 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2958 reginfo->strend = strend;
6eb5f6b9 2959 /* see how far we have to get to not match where we matched before */
fe3974be 2960 reginfo->till = stringarg + minend;
6eb5f6b9 2961
60779a30 2962 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2963 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2964 S_cleanup_regmatch_info_aux has executed (registered by
2965 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2966 magic belonging to this SV.
2967 Not newSVsv, either, as it does not COW.
2968 */
2969 reginfo->sv = newSV(0);
4cba5ac0 2970 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2971 SAVEFREESV(reginfo->sv);
2972 }
2973
331b2dcc
DM
2974 /* reserve next 2 or 3 slots in PL_regmatch_state:
2975 * slot N+0: may currently be in use: skip it
2976 * slot N+1: use for regmatch_info_aux struct
2977 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2978 * slot N+3: ready for use by regmatch()
2979 */
bf2039a9 2980
331b2dcc
DM
2981 {
2982 regmatch_state *old_regmatch_state;
2983 regmatch_slab *old_regmatch_slab;
2984 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2985
2986 /* on first ever match, allocate first slab */
2987 if (!PL_regmatch_slab) {
2988 Newx(PL_regmatch_slab, 1, regmatch_slab);
2989 PL_regmatch_slab->prev = NULL;
2990 PL_regmatch_slab->next = NULL;
2991 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2992 }
bf2039a9 2993
331b2dcc
DM
2994 old_regmatch_state = PL_regmatch_state;
2995 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2996
331b2dcc
DM
2997 for (i=0; i <= max; i++) {
2998 if (i == 1)
2999 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3000 else if (i ==2)
3001 reginfo->info_aux_eval =
3002 reginfo->info_aux->info_aux_eval =
3003 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3004
331b2dcc
DM
3005 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3006 PL_regmatch_state = S_push_slab(aTHX);
3007 }
bf2039a9 3008
331b2dcc
DM
3009 /* note initial PL_regmatch_state position; at end of match we'll
3010 * pop back to there and free any higher slabs */
bf2039a9 3011
331b2dcc
DM
3012 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3013 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3014 reginfo->info_aux->poscache = NULL;
bf2039a9 3015
331b2dcc 3016 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3017
331b2dcc
DM
3018 if ((prog->extflags & RXf_EVAL_SEEN))
3019 S_setup_eval_state(aTHX_ reginfo);
3020 else
3021 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3022 }
d3aa529c 3023
6eb5f6b9 3024 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3025
288b8c02 3026 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3027 /* We have to be careful. If the previous successful match
3028 was from this regex we don't want a subsequent partially
3029 successful match to clobber the old results.
3030 So when we detect this possibility we add a swap buffer
d8da0584
KW
3031 to the re, and switch the buffer each match. If we fail,
3032 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3033 */
3034 swap = prog->offs;
3035 /* do we need a save destructor here for eval dies? */
3036 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
3037 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3038 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3039 PTR2UV(prog),
3040 PTR2UV(swap),
3041 PTR2UV(prog->offs)
3042 ));
c74340f9 3043 }
6eb5f6b9 3044
0fa70a06
DM
3045 /* Simplest case: anchored match need be tried only once, or with
3046 * MBOL, only at the beginning of each line.
3047 *
3048 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3049 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3050 * match at the start of the string then it won't match anywhere else
3051 * either; while with /.*.../, if it doesn't match at the beginning,
3052 * the earliest it could match is at the start of the next line */
3053
8e1490ee 3054 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3055 char *end;
3056
3057 if (regtry(reginfo, &s))
6eb5f6b9 3058 goto got_it;
0fa70a06
DM
3059
3060 if (!(prog->intflags & PREGf_ANCH_MBOL))
3061 goto phooey;
3062
3063 /* didn't match at start, try at other newline positions */
3064
3065 if (minlen)
3066 dontbother = minlen - 1;
3067 end = HOP3c(strend, -dontbother, strbeg) - 1;
3068
3069 /* skip to next newline */
3070
3071 while (s <= end) { /* note it could be possible to match at the end of the string */
3072 /* NB: newlines are the same in unicode as they are in latin */
3073 if (*s++ != '\n')
3074 continue;
3075 if (prog->check_substr || prog->check_utf8) {
3076 /* note that with PREGf_IMPLICIT, intuit can only fail
3077 * or return the start position, so it's of limited utility.
3078 * Nevertheless, I made the decision that the potential for
3079 * quick fail was still worth it - DAPM */
3080 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3081 if (!s)
3082 goto phooey;
3083 }
3084 if (regtry(reginfo, &s))
3085 goto got_it;
3086 }
3087 goto phooey;
3088 } /* end anchored search */
3089
3090 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3091 {
a8430a8b
YO
3092 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3093 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3094 /* For anchored \G, the only position it can match from is
3095 * (ganch-gofs); we already set startpos to this above; if intuit
3096 * moved us on from there, we can't possibly succeed */
3097 assert(startpos == reginfo->ganch - prog->gofs);
3098 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3099 goto got_it;
3100 goto phooey;
3101 }
3102
3103 /* Messy cases: unanchored match. */
bbe252da 3104 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3105 /* we have /x+whatever/ */
984e6dd1 3106 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3107 char ch;
bf93d4cc
GS
3108#ifdef DEBUGGING
3109 int did_match = 0;
3110#endif
f2ed9b32 3111 if (utf8_target) {
7e0d5ad7
KW
3112 if (! prog->anchored_utf8) {
3113 to_utf8_substr(prog);
3114 }
3115 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3116 REXEC_FBC_SCAN(
6eb5f6b9 3117 if (*s == ch) {
a3621e74 3118 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3119 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3120 s += UTF8SKIP(s);
3121 while (s < strend && *s == ch)
3122 s += UTF8SKIP(s);
3123 }
4cadc6a9 3124 );
7e0d5ad7 3125
6eb5f6b9
JH
3126 }
3127 else {
7e0d5ad7
KW
3128 if (! prog->anchored_substr) {
3129 if (! to_byte_substr(prog)) {
6b54ddc5 3130 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3131 }
3132 }
3133 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3134 REXEC_FBC_SCAN(
6eb5f6b9 3135 if (*s == ch) {
a3621e74 3136 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3137 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3138 s++;
3139 while (s < strend && *s == ch)
3140 s++;
3141 }
4cadc6a9 3142 );
6eb5f6b9 3143 }
a3621e74 3144 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 3145 PerlIO_printf(Perl_debug_log,
b7953727
JH
3146 "Did not find anchored character...\n")
3147 );
6eb5f6b9 3148 }
a0714e2c
SS
3149 else if (prog->anchored_substr != NULL
3150 || prog->anchored_utf8 != NULL
3151 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3152 && prog->float_max_offset < strend - s)) {
3153 SV *must;
ea3daa5d
FC
3154 SSize_t back_max;
3155 SSize_t back_min;
33b8afdf 3156 char *last;
6eb5f6b9 3157 char *last1; /* Last position checked before */
bf93d4cc
GS
3158#ifdef DEBUGGING
3159 int did_match = 0;
3160#endif
33b8afdf 3161 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3162 if (utf8_target) {
3163 if (! prog->anchored_utf8) {
3164 to_utf8_substr(prog);
3165 }
3166 must = prog->anchored_utf8;
3167 }
3168 else {
3169 if (! prog->anchored_substr) {
3170 if (! to_byte_substr(prog)) {
6b54ddc5 3171 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3172 }
3173 }
3174 must = prog->anchored_substr;
3175 }
33b8afdf
JH
3176 back_max = back_min = prog->anchored_offset;
3177 } else {
7e0d5ad7
KW
3178 if (utf8_target) {
3179 if (! prog->float_utf8) {
3180 to_utf8_substr(prog);
3181 }
3182 must = prog->float_utf8;
3183 }
3184 else {
3185 if (! prog->float_substr) {
3186 if (! to_byte_substr(prog)) {
6b54ddc5 3187 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3188 }
3189 }
3190 must = prog->float_substr;
3191 }
33b8afdf
JH
3192 back_max = prog->float_max_offset;
3193 back_min = prog->float_min_offset;
3194 }
1de06328 3195
1de06328
YO
3196 if (back_min<0) {
3197 last = strend;
3198 } else {
3199 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3200 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3201 - (SvTAIL(must) != 0) + back_min), strbeg);
3202 }
9d9163fb 3203 if (s > reginfo->strbeg)
6eb5f6b9
JH
3204 last1 = HOPc(s, -1);
3205 else
3206 last1 = s - 1; /* bogus */
3207
a0288114 3208 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3209 check_substr==must. */
bf05793b 3210 dontbother = 0;
6eb5f6b9
JH
3211 strend = HOPc(strend, -dontbother);
3212 while ( (s <= last) &&
e50d57d4 3213 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3214 (unsigned char*)strend, must,
c33e64f0 3215 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3216 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
3217 if (HOPc(s, -back_max) > last1) {
3218 last1 = HOPc(s, -back_min);
3219 s = HOPc(s, -back_max);
3220 }
3221 else {
9d9163fb
DM
3222 char * const t = (last1 >= reginfo->strbeg)
3223 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
3224
3225 last1 = HOPc(s, -back_min);
52657f30 3226 s = t;
6eb5f6b9 3227 }
f2ed9b32 3228 if (utf8_target) {
6eb5f6b9 3229 while (s <= last1) {
02d5137b 3230 if (regtry(reginfo, &s))
6eb5f6b9 3231 goto got_it;
7016d6eb
DM
3232 if (s >= last1) {
3233 s++; /* to break out of outer loop */
3234 break;
3235 }
3236 s += UTF8SKIP(s);
6eb5f6b9
JH
3237 }
3238 }
3239 else {
3240 while (s <= last1) {
02d5137b 3241 if (regtry(reginfo, &s))
6eb5f6b9
JH
3242 goto got_it;
3243 s++;
3244 }
3245 }
3246 }
ab3bbdeb 3247 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 3248 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
3249 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3250 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 3251 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 3252 ? "anchored" : "floating"),
ab3bbdeb
YO
3253 quoted, RE_SV_TAIL(must));
3254 });
6eb5f6b9
JH
3255 goto phooey;
3256 }
f8fc2ecf 3257 else if ( (c = progi->regstclass) ) {
f14c76ed 3258 if (minlen) {
f8fc2ecf 3259 const OPCODE op = OP(progi->regstclass);
66e933ab 3260 /* don't bother with what can't match */
33c28ab2 3261 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
f14c76ed
RGS
3262 strend = HOPc(strend, -(minlen - 1));
3263 }
a3621e74 3264 DEBUG_EXECUTE_r({
be8e71aa 3265 SV * const prop = sv_newmortal();
8b9781c9 3266 regprop(prog, prop, c, reginfo, NULL);
0df25f3d 3267 {
f2ed9b32 3268 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 3269 s,strend-s,60);
0df25f3d 3270 PerlIO_printf(Perl_debug_log,
1c8f8eb1 3271 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 3272 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 3273 quoted, (int)(strend - s));
0df25f3d 3274 }
ffc61ed2 3275 });
f9176b44 3276 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 3277 goto got_it;
07be1b83 3278 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
3279 }
3280 else {
3281 dontbother = 0;
a0714e2c 3282 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 3283 /* Trim the end. */
6af40bd7 3284 char *last= NULL;
33b8afdf 3285 SV* float_real;
c33e64f0
FC
3286 STRLEN len;
3287 const char *little;
33b8afdf 3288
7e0d5ad7
KW
3289 if (utf8_target) {
3290 if (! prog->float_utf8) {
3291 to_utf8_substr(prog);
3292 }
3293 float_real = prog->float_utf8;
3294 }
3295 else {
3296 if (! prog->float_substr) {
3297 if (! to_byte_substr(prog)) {
6b54ddc5 3298 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3299 }
3300 }
3301 float_real = prog->float_substr;
3302 }
d6a28714 3303
c33e64f0
FC
3304 little = SvPV_const(float_real, len);
3305 if (SvTAIL(float_real)) {
7f18ad16
KW
3306 /* This means that float_real contains an artificial \n on
3307 * the end due to the presence of something like this:
3308 * /foo$/ where we can match both "foo" and "foo\n" at the
3309 * end of the string. So we have to compare the end of the
3310 * string first against the float_real without the \n and
3311 * then against the full float_real with the string. We
3312 * have to watch out for cases where the string might be
3313 * smaller than the float_real or the float_real without
3314 * the \n. */
1a13b075
YO
3315 char *checkpos= strend - len;
3316 DEBUG_OPTIMISE_r(
3317 PerlIO_printf(Perl_debug_log,
3318 "%sChecking for float_real.%s\n",
3319 PL_colors[4], PL_colors[5]));
3320 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3321 /* can't match, even if we remove the trailing \n
3322 * string is too short to match */
1a13b075
YO
3323 DEBUG_EXECUTE_r(
3324 PerlIO_printf(Perl_debug_log,
3325 "%sString shorter than required trailing substring, cannot match.%s\n",
3326 PL_colors[4], PL_colors[5]));
3327 goto phooey;
3328 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3329 /* can match, the end of the string matches without the
3330 * "\n" */
1a13b075
YO
3331 last = checkpos + 1;
3332 } else if (checkpos < strbeg) {
7f18ad16
KW
3333 /* cant match, string is too short when the "\n" is
3334 * included */
1a13b075
YO
3335 DEBUG_EXECUTE_r(
3336 PerlIO_printf(Perl_debug_log,
3337 "%sString does not contain required trailing substring, cannot match.%s\n",
3338 PL_colors[4], PL_colors[5]));
3339 goto phooey;
3340 } else if (!multiline) {
7f18ad16
KW
3341 /* non multiline match, so compare with the "\n" at the
3342 * end of the string */
1a13b075
YO
3343 if (memEQ(checkpos, little, len)) {
3344 last= checkpos;
3345 } else {
3346 DEBUG_EXECUTE_r(
3347 PerlIO_printf(Perl_debug_log,
3348 "%sString does not contain required trailing substring, cannot match.%s\n",
3349 PL_colors[4], PL_colors[5]));
3350 goto phooey;
3351 }
3352 } else {
7f18ad16
KW
3353 /* multiline match, so we have to search for a place
3354 * where the full string is located */
d6a28714 3355 goto find_last;
1a13b075 3356 }
c33e64f0 3357 } else {
d6a28714 3358 find_last:
9041c2e3 3359 if (len)
d6a28714 3360 last = rninstr(s, strend, little, little + len);
b8c5462f 3361 else
a0288114 3362 last = strend; /* matching "$" */
b8c5462f 3363 }
6af40bd7 3364 if (!last) {
7f18ad16
KW
3365 /* at one point this block contained a comment which was
3366 * probably incorrect, which said that this was a "should not
3367 * happen" case. Even if it was true when it was written I am
3368 * pretty sure it is not anymore, so I have removed the comment
3369 * and replaced it with this one. Yves */
6bda09f9
YO
3370 DEBUG_EXECUTE_r(
3371 PerlIO_printf(Perl_debug_log,
b729e729
YO
3372 "%sString does not contain required substring, cannot match.%s\n",
3373 PL_colors[4], PL_colors[5]
6af40bd7
YO
3374 ));
3375 goto phooey;
bf93d4cc 3376 }
d6a28714
JH
3377 dontbother = strend - last + prog->float_min_offset;
3378 }
3379 if (minlen && (dontbother < minlen))
3380 dontbother = minlen - 1;
3381 strend -= dontbother; /* this one's always in bytes! */
3382 /* We don't know much -- general case. */
f2ed9b32 3383 if (utf8_target) {
d6a28714 3384 for (;;) {
02d5137b 3385 if (regtry(reginfo, &s))
d6a28714
JH
3386 goto got_it;
3387 if (s >= strend)
3388 break;
b8c5462f 3389 s += UTF8SKIP(s);
d6a28714
JH
3390 };
3391 }
3392 else {
3393 do {
02d5137b 3394 if (regtry(reginfo, &s))
d6a28714
JH
3395 goto got_it;
3396 } while (s++ < strend);
3397 }
3398 }
3399
3400 /* Failure. */
3401 goto phooey;
3402
7b52d656 3403 got_it:
d5e7783a
DM
3404 /* s/// doesn't like it if $& is earlier than where we asked it to
3405 * start searching (which can happen on something like /.\G/) */
3406 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3407 && (prog->offs[0].start < stringarg - strbeg))
3408 {
3409 /* this should only be possible under \G */
58430ea8 3410 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3412 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3413 goto phooey;
3414 }
3415
495f47a5
DM
3416 DEBUG_BUFFERS_r(
3417 if (swap)
3418 PerlIO_printf(Perl_debug_log,
3419 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3420 PTR2UV(prog),
3421 PTR2UV(swap)
3422 );
3423 );
e9105d30 3424 Safefree(swap);
d6a28714 3425
bf2039a9
DM
3426 /* clean up; this will trigger destructors that will free all slabs
3427 * above the current one, and cleanup the regmatch_info_aux
3428 * and regmatch_info_aux_eval sructs */
8adc0f72 3429
006f26b2
DM
3430 LEAVE_SCOPE(oldsave);
3431
5daac39c
NC
3432 if (RXp_PAREN_NAMES(prog))
3433 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
3434
3435 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3436 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3437 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3438 strbeg, reginfo->strend,
3439 sv, flags, utf8_target);
9041c2e3 3440
d6a28714
JH
3441 return 1;
3442
7b52d656 3443 phooey:
a3621e74 3444 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3445 PL_colors[4], PL_colors[5]));
8adc0f72 3446
bf2039a9
DM
3447 /* clean up; this will trigger destructors that will free all slabs
3448 * above the current one, and cleanup the regmatch_info_aux
3449 * and regmatch_info_aux_eval sructs */
8adc0f72 3450
006f26b2
DM
3451 LEAVE_SCOPE(oldsave);
3452
e9105d30 3453 if (swap) {
c74340f9 3454 /* we failed :-( roll it back */
495f47a5
DM
3455 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3456 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3457 PTR2UV(prog),
3458 PTR2UV(prog->offs),
3459 PTR2UV(swap)
3460 ));
e9105d30
GG
3461 Safefree(prog->offs);
3462 prog->offs = swap;
3463 }
d6a28714
JH
3464 return 0;
3465}
3466
6bda09f9 3467
b3d298be 3468/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3469 * Do inc before dec, in case old and new rex are the same */
baa60164 3470#define SET_reg_curpm(Re2) \
bf2039a9 3471 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3472 (void)ReREFCNT_inc(Re2); \
3473 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3474 PM_SETRE((PL_reg_curpm), (Re2)); \
3475 }
3476
3477
d6a28714
JH
3478/*
3479 - regtry - try match at specific point
3480 */
3481STATIC I32 /* 0 failure, 1 success */
f73aaa43 3482S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3483{
d6a28714 3484 CHECKPOINT lastcp;
288b8c02 3485 REGEXP *const rx = reginfo->prog;
8d919b0a 3486 regexp *const prog = ReANY(rx);
99a90e59 3487 SSize_t result;
f8fc2ecf 3488 RXi_GET_DECL(prog,progi);
a3621e74 3489 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3490
3491 PERL_ARGS_ASSERT_REGTRY;
3492
24b23f37 3493 reginfo->cutpoint=NULL;
d6a28714 3494
9d9163fb 3495 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3496 prog->lastparen = 0;
03994de8 3497 prog->lastcloseparen = 0;
d6a28714
JH
3498
3499 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3500 to do this again and again, prog->lastparen should take care of
3dd2943c 3501 this! --ilya*/
dafc8851
JH
3502
3503 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3504 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3505 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3506 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3507 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3508 * Meanwhile, this code *is* needed for the
daf18116
JH
3509 * above-mentioned test suite tests to succeed. The common theme
3510 * on those tests seems to be returning null fields from matches.
225593e1 3511 * --jhi updated by dapm */
dafc8851 3512#if 1
d6a28714 3513 if (prog->nparens) {
b93070ed 3514 regexp_paren_pair *pp = prog->offs;
eb578fdb 3515 I32 i;
b93070ed 3516 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3517 ++pp;
3518 pp->start = -1;
3519 pp->end = -1;
d6a28714
JH
3520 }
3521 }
dafc8851 3522#endif
02db2b7b 3523 REGCP_SET(lastcp);
f73aaa43
DM
3524 result = regmatch(reginfo, *startposp, progi->program + 1);
3525 if (result != -1) {
3526 prog->offs[0].end = result;
d6a28714
JH
3527 return 1;
3528 }
24b23f37 3529 if (reginfo->cutpoint)
f73aaa43 3530 *startposp= reginfo->cutpoint;
02db2b7b 3531 REGCP_UNWIND(lastcp);
d6a28714
JH
3532 return 0;
3533}
3534
02db2b7b 3535
8ba1375e
MJD
3536#define sayYES goto yes
3537#define sayNO goto no
262b90c4 3538#define sayNO_SILENT goto no_silent
8ba1375e 3539
f9f4320a
YO
3540/* we dont use STMT_START/END here because it leads to
3541 "unreachable code" warnings, which are bogus, but distracting. */
3542#define CACHEsayNO \
c476f425 3543 if (ST.cache_mask) \
2ac8ff4b 3544 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3545 sayNO
3298f257 3546
a3621e74 3547/* this is used to determine how far from the left messages like
265c4333
YO
3548 'failed...' are printed. It should be set such that messages
3549